## Evaluate our flow

This notebook shows how to take the Amazon Bedrock Knowledge Bases we created in `rag-router.ipynb` and put them in a structured flow using [Amazon Bedrock Prompt Flows](https://aws.amazon.com/bedrock/prompt-flows/).

This will allow us to have a versioned flow where we can specify all of the sequential components, as well as any conditions we want to model. 

We will start with a description of a RAG framework with additional modules (e.g., current date and web search) to generate a prompt flow, as shown below.

In [None]:
import ast
import json
import logging
import os
import pprint
import time

import boto3

In [None]:
import pandas as pd
from botocore.client import Config
from langchain_aws.chat_models.bedrock import ChatBedrock
from langchain_aws.embeddings.bedrock import BedrockEmbeddings
from langchain_aws.retrievers.bedrock import AmazonKnowledgeBasesRetriever
from langchain.chains import RetrievalQA
from datasets import Dataset
from ragas import evaluate
from ragas.metrics import (
    context_precision,
    context_recall,
    answer_similarity,
    answer_correctness,
    answer_relevancy,
    faithfulness
    )

# Model ID of the LLM wrapped by `llm_for_evaluation` below.
model_id_eval = "anthropic.claude-3-haiku-20240307-v1:0"

# Generous connect/read timeouts for slow model invocations. Note that this
# config also disables botocore-level retries (max_attempts=0).
bedrock_config = Config(connect_timeout=120, read_timeout=120, retries={'max_attempts': 0})

# The runtime client is the one handed to ChatBedrock and BedrockEmbeddings,
# so it must carry the extended timeouts too; otherwise long-running
# invocations fall back to the client's default read timeout.
bedrock_client = boto3.client('bedrock-runtime', config=bedrock_config)
bedrock_agent_client = boto3.client("bedrock-agent-runtime",
                                    config=bedrock_config)

# LLM and embedding model wrappers, both backed by the runtime client above.
llm_for_evaluation = ChatBedrock(model_id=model_id_eval, client=bedrock_client)
bedrock_embeddings = BedrockEmbeddings(model_id="amazon.titan-embed-text-v1",
                                       client=bedrock_client)



In [None]:
def _parse_contexts(value):
    """Return one row's retrieved contexts as a list of strings.

    A CSV round-trip turns list-valued cells into their string form
    (e.g. "['chunk a', 'chunk b']"), while ragas expects the `contexts`
    column to hold a list of strings per row.

    Args:
        value: Raw cell value: a list/tuple, a serialized list, a plain
            string, or a missing value (NaN/None).

    Returns:
        list[str]: The parsed contexts. A plain (non-list) string is wrapped
        as a single-element list; a missing value becomes an empty list.
    """
    if isinstance(value, (list, tuple)):
        return [str(item) for item in value]
    if not isinstance(value, str):
        # Empty CSV cells are read back as NaN.
        return [] if pd.isna(value) else [str(value)]
    try:
        parsed = ast.literal_eval(value)
    except (ValueError, SyntaxError):
        # Not a Python literal: treat the text itself as the only context.
        return [value]
    if isinstance(parsed, (list, tuple)):
        return [str(item) for item in parsed]
    # Parsed to some other literal (number, dict, ...): keep the raw text.
    return [value]


# ragas metrics to compute for every row of the evaluation set.
metrics = [
    context_precision,
    # context_recall has been observed to fail with a Bedrock read timeout
    # ("ValueError: Error raised by bedrock service: Read timeout on endpoint URL ...").
    context_recall,
    answer_similarity,
    answer_correctness,
    answer_relevancy,
    faithfulness,
]

# Maps the column names ragas expects (keys) to the columns of the CSV (values).
# NOTE(review): newer ragas releases name the reference column `ground_truth`
# (a single string) rather than `ground_truths` -- confirm against the installed version.
column_map = {
    "question": "question",
    "contexts": "llm_contexts",
    "answer": "llm_answer",
    "ground_truths": "reference_answer",
}

result_df = pd.read_csv("10QA.csv")

# Build the evaluation frame without mutating `result_df`: the contexts column
# comes back from the CSV as plain strings and must be turned into lists.
contexts_column = column_map["contexts"]
eval_df = result_df.assign(
    **{contexts_column: result_df[contexts_column].map(_parse_contexts)}
)

# Evaluate. raise_exceptions=False asks ragas not to raise when a metric fails.
eval_result = evaluate(
    Dataset.from_pandas(eval_df),
    metrics=metrics,
    column_map=column_map,
    llm=llm_for_evaluation,
    embeddings=bedrock_embeddings,
    raise_exceptions=False,
)