### Setup

To run this notebook you need to install its dependencies: LangChain, LlamaIndex, and the pinned versions of boto3 and botocore.


In [2]:
%pip install --upgrade pip
%pip install boto3==1.33.2 --force-reinstall --quiet
%pip install botocore==1.33.2 --force-reinstall --quiet
%pip install langchain==0.0.342 --force-reinstall --quiet
%pip install llama-index==0.9.3.post1 --force-reinstall --quiet

[0mNote: you may need to restart the kernel to use updated packages.
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
spyder 5.3.3 requires pyqt5<5.16, which is not installed.
spyder 5.3.3 requires pyqtwebengine<5.16, which is not installed.
awscli 1.32.16 requires botocore==1.34.16, but you have botocore 1.33.13 which is incompatible.
awscli 1.32.16 requires s3transfer<0.11.0,>=0.10.0, but you have s3transfer 0.8.2 which is incompatible.
distributed 2022.7.0 requires tornado<6.2,>=6.0.3, but you have tornado 6.4 which is incompatible.
jupyterlab 3.4.4 requires jupyter-server~=1.16, but you have jupyter-server 2.12.1 which is incompatible.
jupyterlab-server 2.10.3 requires jupyter-server~=1.4, but you have jupyter-server 2.12.1 which is incompatible.
llama-index 0.9.3.post1 requires urllib3<2, but you have urllib3 2.0.7 which is incompatible.
notebook 6.5.


Restart the kernel so that the packages installed above are picked up.

In [None]:
# Restart the kernel so the freshly installed package versions get imported.
# NOTE(review): the injected JavaScript relies on a global `Jupyter.notebook`
# object in the front end; in a front end that does not expose it this
# presumably does nothing, so restart from the Kernel menu instead.
from IPython.display import HTML  # public import path for the HTML display class
HTML("<script>Jupyter.notebook.kernel.restart()</script>")

In [None]:
# Apply the nest_asyncio patch before any other work in this notebook.
# NOTE(review): presumably needed so that async code used by the evaluation
# libraries can run inside the notebook's already-running event loop - confirm.
import nest_asyncio
nest_asyncio.apply()

### Follow the steps below to initiate the bedrock client:

1. Import the necessary libraries: LangChain for Bedrock model selection, and LlamaIndex for the service context that holds the LLM and embedding model instances. We will use this service context later in the notebook to evaluate the responses from our Q&A application.

2. Initialize `amazon.titan-text-lite-v1` as the large language model that generates answers with the RAG pattern, using the text chunks retrieved from the given knowledge bases through the `retrieve` API.

3. For evaluating the response with LlamaIndex we will use `anthropic.claude-v2` model. 

In [None]:
import time 
import boto3
import pandas as pd 

from llama_index.evaluation import ( 
    RelevancyEvaluator, 
    FaithfulnessEvaluator, 
    CorrectnessEvaluator
) 

from llama_index import ( 
    SimpleDirectoryReader, 
    VectorStoreIndex, 
    ServiceContext 
) 

import boto3
import pprint
from botocore.client import Config
from langchain.llms.bedrock import Bedrock
from llama_index import (
    ServiceContext,
    set_global_service_context
)
from langchain.embeddings.bedrock import BedrockEmbeddings
from llama_index.embeddings import LangchainEmbedding

# Pretty-printer for inspecting nested responses.
pp = pprint.PrettyPrinter(indent=2)

# Botocore settings: 120 s connect/read timeouts and `max_attempts` of 0
# (presumably no automatic retries - confirm against the botocore docs).
# Only the agent-runtime client below receives this config; the runtime client
# is created with default settings.
bedrock_config = Config(
    connect_timeout=120,
    read_timeout=120,
    retries={'max_attempts': 0},
)
bedrock_client = boto3.client('bedrock-runtime')
bedrock_agent_client = boto3.client("bedrock-agent-runtime", config=bedrock_config)

# Inference parameters for the Titan text model.
parameters_llm = dict(
    maxTokenCount=2000,
    stopSequences=[],
    temperature=0,
    topP=0.9,
)

# Inference parameters for the Claude model.
model_kwargs_claude = dict(
    temperature=0,
    top_k=10,
    max_tokens_to_sample=3000,
)

# Titan embeddings, wrapped so LlamaIndex can use the LangChain implementation.
embed_model = LangchainEmbedding(BedrockEmbeddings(model_id="amazon.titan-embed-text-v1"))

# Titan Text Lite model.
llm = Bedrock(
    model_id="amazon.titan-text-lite-v1",
    model_kwargs=parameters_llm,
    client=bedrock_client,
)

# Claude v2 model; this is the LLM placed in the service context below.
llm_claude = Bedrock(
    model_id="anthropic.claude-v2",
    model_kwargs=model_kwargs_claude,
    client=bedrock_client,
)

# Register Claude + Titan embeddings as the global LlamaIndex service context.
service_context = ServiceContext.from_defaults(
    llm=llm_claude,
    embed_model=embed_model,
)
set_global_service_context(service_context)

### Load KB and OSS DB details

In [None]:
# Restore `kb_oss_index_dict`, previously persisted with `%store`.
# Later cells read it as a mapping: chunk size -> {'index_name': ..., 'kb_id': ...}.
%store -r kb_oss_index_dict

In [None]:
from botocore.client import Config

# Same client settings as the setup cell: 120 s timeouts, `max_attempts` of 0.
bedrock_config = Config(
    connect_timeout=120,
    read_timeout=120,
    retries={'max_attempts': 0},
)

# The runtime client uses default settings; only the agent-runtime client
# is given `bedrock_config`.
bedrock_client = boto3.client('bedrock-runtime')
bedrock_agent_client = boto3.client("bedrock-agent-runtime", config=bedrock_config)


In [None]:
# Agent-runtime client used by `retrieve` below (built with `bedrock_config`).
bedrock_agent_client = boto3.client("bedrock-agent-runtime", config=bedrock_config)

def retrieve(query, kbId, numberOfResults=5):
    """Query a knowledge base through the module-level `bedrock_agent_client`.

    Parameters:
    query: Text sent as the retrieval query.
    kbId: Identifier passed as `knowledgeBaseId`.
    numberOfResults: Number of results requested from the vector search
        (default 5).

    Returns:
    The response of `bedrock_agent_client.retrieve`, unmodified.
    """
    search_settings = {
        'vectorSearchConfiguration': {'numberOfResults': numberOfResults},
    }
    response = bedrock_agent_client.retrieve(
        knowledgeBaseId=kbId,
        retrievalQuery={'text': query},
        retrievalConfiguration=search_settings,
    )
    return response

In [None]:
from langchain.prompts import PromptTemplate

# Prompt sent to the answer-generating model. It has two placeholders:
# {context_str} receives the retrieved passages and {query_str} the question.
# The text is part of the model input, so edits to it change model behaviour.
PROMPT_TEMPLATE = """
You are a financial advisor AI system, and provides answers to questions by using fact based and statistical information when possible. 
Use the following pieces of information to provide a concise answer to the question enclosed in <question> tags. 
If you don't know the answer, just say that you don't know, don't try to make up an answer.
<context>
{context_str}
</context>

<question>
{query_str}
</question>

The response should be specific and use statistics or numbers when possible.
"""
# LangChain prompt object; filled in later with titan_prompt.format(...).
titan_prompt = PromptTemplate(template=PROMPT_TEMPLATE, 
                               input_variables=["context_str","query_str"])

In [None]:
# fetch context from the response
def get_contexts(retrievalResults):
    """Return the passage text of every retrieval result, in the given order.

    Each item of `retrievalResults` is read as `item['content']['text']`.
    """
    return [hit['content']['text'] for hit in retrievalResults]

In [None]:
# Evaluation set: a list of (question, reference_answer) tuples. The reference
# answer is the ground truth the correctness evaluator compares against.
# NOTE(review): the AWS revenue reference answer reads "29% year-over-year
# ('YoY') revenue" - the word "growth" looks to be missing; confirm against the
# source document before relying on that pair.
eval_question_answer_pair = [("How many days has Amazon asked employees to come to work in office?",
                          "Amazon has asked corporate employees to come back to office at least three days a week beginning May 2022."),
                         ("By what percentage did AWS revenue grow year-over-year in 2022?",
                          "AWS had a 29% year-over-year ('YoY') revenue in 2022 on $62B revenue base."),
                         ("Compared to Graviton2 processors, what performance improvement did Graviton3 chips deliver according to the passage?",
                          "In 2022, AWS delivered their Graviton3 chips, providing 25% better performance than the Graviton2 processors."),
                         ("Which was the first inference chip launched by AWS according to the passage?",
                          "AWS launched their first inference chips (“Inferentia”) in 2019, and they have saved companies like Amazon over a hundred million dollars in capital expense."),
                         ("According to the context, in what year did Amazon's annual revenue increase from $245B to $434B?",
                          "Amazon's annual revenue increased from $245B in 2019 to $434B in 2022."
                          )
                          ]

In [None]:
# Inference parameters for the Titan text model used to generate answers
# (same values as `parameters_llm` in the setup cell).
llm_parameters = dict(
    maxTokenCount=2000,
    stopSequences=[],
    temperature=0,
    topP=0.9,
)

# Inference parameters for the Claude model used to judge the answers.
model_kwargs_claude = dict(
    temperature=0,
    top_k=10,
    max_tokens_to_sample=3000,
)


In [None]:
from llama_index.llms import LangChainLLM
from langchain.llms import Bedrock
from llama_index.embeddings import LangchainEmbedding
from langchain.embeddings import BedrockEmbeddings
from llama_index import set_global_service_context


def evaluate_chunk_size_kb(chunk_size, qa_pairs, kb_id, vector_index):
    """
    Evaluate the answers generated from one knowledge base.

    For every (question, reference_answer) pair the function retrieves the top
    5 passages from the knowledge base, asks amazon.titan-text-lite-v1 to answer
    the question from those passages, and then uses anthropic.claude-v2 (through
    the LlamaIndex evaluators) to judge faithfulness, relevancy and correctness.
    'amazon.titan-embed-text-v1' is the embedding model in the service context.

    Parameters:
    chunk_size (int): Chunk size being evaluated. Not read by this function;
        kept so existing callers keep working.
    qa_pairs (sequence of (str, str)): Non-empty sequence of
        (question, reference_answer) pairs.
    kb_id: Knowledge base identifier passed to `retrieve`.
    vector_index: Not read by this function; kept so existing callers keep
        working.

    Returns:
    tuple: (average_response_time, average_faithfulness, average_relevancy,
        average_correctness, evals_df). The response time is in seconds and
        covers retrieval plus answer generation. The three quality averages are
        the fraction of questions whose evaluation passed. `evals_df` is a
        DataFrame with one row per question holding the generated answer and
        each evaluator's passing flag, feedback and score.

    Raises:
    ValueError: If `qa_pairs` is empty (the averages would be undefined).

    Side effects:
    Replaces the global LlamaIndex service context with one built from
    Claude v2 and the Titan embedding model.
    """
    num_pairs = len(qa_pairs)
    if num_pairs == 0:
        raise ValueError(
            "qa_pairs must contain at least one (question, reference_answer) pair"
        )

    bedrock_client = boto3.client('bedrock-runtime')

    # 1. Model that generates the answers.
    llm = Bedrock(model_id="amazon.titan-text-lite-v1",
                  model_kwargs=llm_parameters,
                  client=bedrock_client)

    # 2. Model that evaluates the answers.
    llm_claude = Bedrock(model_id="anthropic.claude-v2",
                         model_kwargs=model_kwargs_claude,
                         client=bedrock_client)

    # 3. Embedding model for the service context.
    embed_model = BedrockEmbeddings(model_id='amazon.titan-embed-text-v1')

    # One service context is shared by the global default and all evaluators.
    service_context = ServiceContext.from_defaults(llm=llm_claude, embed_model=embed_model)
    set_global_service_context(service_context)

    faithfulness_evaluator = FaithfulnessEvaluator(service_context=service_context)
    relevancy_evaluator = RelevancyEvaluator(service_context=service_context)
    correctness_evaluator = CorrectnessEvaluator(service_context=service_context)

    total_response_time = 0
    results_list = []

    # Iterate over each question to compute metrics.
    # While BatchEvalRunner can be used for faster evaluations (see: https://docs.llamaindex.ai/en/latest/examples/evaluation/batch_eval.html),
    # we're using a loop here to specifically measure response time for different chunk sizes.
    for question, reference_answer in qa_pairs:
        start_time = time.time()

        # Retrieve the matching passages.
        retrieval_results = retrieve(question, kb_id, 5)['retrievalResults']
        contexts = get_contexts(retrievalResults=retrieval_results)

        # Join the passages into plain text; formatting the list directly would
        # put its Python repr (brackets, quotes, escaped newlines) in the prompt.
        prompt = titan_prompt.format(context_str="\n\n".join(contexts),
                                     query_str=question)
        generated_answer = str(llm(prompt))

        # Timing stops here so the evaluator calls are not counted.
        total_response_time += time.time() - start_time

        faithfulness_result = faithfulness_evaluator.evaluate(query=question,
                                                              response=generated_answer,
                                                              contexts=contexts)

        relevancy_result = relevancy_evaluator.evaluate(query=question,
                                                        response=generated_answer,
                                                        contexts=contexts)

        correctness_result = correctness_evaluator.evaluate(query=question,
                                                            response=generated_answer,
                                                            reference=reference_answer)

        results_list.append({
            "query": question,
            "generated_answer": generated_answer,
            "correctness": correctness_result.passing,
            "correctness_feedback": correctness_result.feedback,
            "correctness_score": correctness_result.score,
            "faithfulness": faithfulness_result.passing,
            "faithfulness_feedback": faithfulness_result.feedback,
            "faithfulness_score": faithfulness_result.score,
            "relevancy": relevancy_result.passing,
            "relevancy_feedback": relevancy_result.feedback,
            "relevancy_score": relevancy_result.score
        })

    def _pass_rate(metric):
        # A missing (None) verdict counts as not passing.
        return sum(bool(row[metric]) for row in results_list) / num_pairs

    average_response_time = total_response_time / num_pairs
    average_faithfulness = _pass_rate("faithfulness")
    average_relevancy = _pass_rate("relevancy")
    average_correctness = _pass_rate("correctness")

    evals_df = pd.DataFrame(results_list)

    return average_response_time, average_faithfulness, average_relevancy, average_correctness, evals_df

In [None]:
# Re-apply the nest_asyncio patch right before the evaluation run (an earlier
# cell already applies it once).
# NOTE(review): presumably here so the evaluators' async code can run inside
# the notebook's already-running event loop - confirm.
import nest_asyncio
nest_asyncio.apply()

In [None]:
%%time

chunk_sizes = list(kb_oss_index_dict.keys())

data = []

base_kb_name = f"amazon-shareholder-letters-knowledge-base"
base_index_name = f"bedrock-sample-index"

for chunk_size in chunk_sizes[:3]:
    vector_index = kb_oss_index_dict[chunk_size]['index_name']
    kb_id = kb_oss_index_dict[chunk_size]['kb_id']
    
    avg_response_time, avg_faithfulness, avg_relevancy , avg_correctness, detail_eval_df= evaluate_chunk_size_kb(chunk_size, eval_question_answer_pair, kb_id, vector_index)
    print(f"Chunk size {chunk_size} - Average Response time: {avg_response_time:.2f}s, Average Faithfulness: {avg_faithfulness:.2f}, Average Relevancy: {avg_relevancy:.2f}, Average Correctness: {avg_correctness:.2f}")
    data.append({'Chunk Size': chunk_size, 'Average Response Time': avg_response_time, 'Average Faithfulness': avg_faithfulness, 'Average Relevancy': avg_relevancy, 'Average Correctness': avg_correctness} )
    
# Creating a DataFrame
df = pd.DataFrame(data)
df.head()

In [None]:
# Grouped bar chart: one group per chunk size, one bar per metric.
ax = df.plot(
    x='Chunk Size',
    y=['Average Response Time', 'Average Faithfulness', 'Average Relevancy', 'Average Correctness'],
    kind='bar',
    figsize=(9, 6),
)

# Label the bars of every plotted series. Four metrics are plotted, so
# labelling only the first three containers left 'Average Correctness' bare.
for container in ax.containers:
    ax.bar_label(container)

ax.legend(bbox_to_anchor=(1, 1))