In [None]:
#Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
#SPDX-License-Identifier: MIT-0

In [None]:
# Restore the three prompt identifiers from IPython's %store persistence
# (presumably saved by an earlier notebook in this series - confirm).
# They are used further down to fetch the extraction prompt and to create
# a new version of each prompt.
%store -r sentiment_categorisation_prompt_id
%store -r summarisation_prompt_id
%store -r extraction_prompt_id

In [None]:
#or set them manually
#sentiment_categorisation_prompt_id = ""
#summarisation_prompt_id = ""
#extraction_prompt_id = "" 

In [None]:
!pip install -q langchain==0.2.11
!pip install -q  evaluate==0.4.2
!pip install -q rapidfuzz==3.9.5 
!pip install -q bert-score==0.3.13

In [None]:
import boto3
import importlib

#adding our utils library to sys path
import sys
sys.path.append("../src/utils/")
import llm_utils

# Re-import the module so that edits made to llm_utils since the kernel
# started are picked up when this cell is re-run.
importlib.reload(llm_utils)

# Default boto3 session; its region is reused below when creating the
# Bedrock embeddings client.
session = boto3.Session()
region_name = session.region_name

# Evaluation

We will be using a range of different evaluation libraries to cover the needs of our 3 use cases.

Note that RAG related metrics (e.g. faithfulness, context precision, context recall) will be covered in the next notebook.

For our Sentiment Classification task, we'll use a straightforward string match to measure accuracy (correct answers / all answers), as well as a string distance measure.

For our summarisation task, we'll use the ROUGE metrics (ROUGE1 and ROUGEL) 

For our theme extraction task, we'll use various metrics including semantic similarity and Correctness.


## Evaluation libraries

There are various libraries out there that you can use. We're just listing a few well known ones that are covering our needs for our scenarios' evaluation.

### Langchain Evaluation

Langchain includes 3 types of Evaluators:

- String Evaluators: A string evaluator is a component within LangChain designed to assess the performance of a language model by comparing its generated outputs (predictions) to a reference string or an input. 
- Comparison Evaluators: Comparison evaluators in LangChain help compare the outputs of two different chains or LLMs.
- Trajectory Evaluators: Provide a more holistic approach to evaluating an agent. These evaluators assess the full sequence of actions taken by an agent and their corresponding responses, which is referred to as the "trajectory".

https://python.langchain.com/v0.1/docs/guides/productionization/evaluation/

For reference, see below the class storing Prompts used for Criteria evaluation: 
https://github.com/langchain-ai/langchain/blob/master/libs/langchain/langchain/evaluation/criteria/eval_chain.py

### HuggingFace Evaluate

The HF Evaluate library covers various metrics used to evaluate ML models and datasets, including traditional NLP metrics that are relevant for our scenarios' evaluation.

https://github.com/huggingface/evaluate
https://huggingface.co/docs/evaluate/en/base_evaluator

### Note on Amazon Bedrock Evaluation

Amazon Bedrock supports model evaluation jobs. The results of a model evaluation job allow you to compare model outputs, and then choose the model best suited for your downstream generative AI applications.

It does not yet support the prompt evaluation metrics required to optimise your prompt for production.

## Loading our groundtruth datasets generated in notebook 3

In [None]:
# Load the ground-truth file of each use case. Later cells index the loaded
# objects with the keys "answer" and "groundtruth" (and "question" for the
# sentiment and extraction tasks), each holding a parallel sequence.
sentiment_data = llm_utils.load_dict_from_json("../generated/groundtruth/sentiment_gt.json")
summary_data = llm_utils.load_dict_from_json("../generated/groundtruth/summary_gt.json")
extraction_data = llm_utils.load_dict_from_json("../generated/groundtruth/extraction_gt.json")

In [None]:
# Collected metrics: for each use case, a list of {metric_name: value} dicts
# that the evaluation cells below append to.
evaluation_scores = {
    "sentiment": [],
    "summarisation": [],
    "extraction": [],
}

## Sentiment Classification Evaluation

### Accuracy
Simply calculated as correct answers / all answers, using a string match evaluator.

In [None]:
from langchain.evaluation import ExactMatchStringEvaluator

def generic_evaluate(data, evaluator):
    """Return the mean evaluator score over a ground-truth dataset.

    Args:
        data: mapping holding parallel sequences under the keys "question",
            "answer" and "groundtruth". The number of entries under
            "question" determines how many answer/groundtruth pairs are scored.
        evaluator: object exposing
            ``evaluate_strings(prediction=..., reference=...)`` and returning
            a mapping with a "score" entry.

    Returns:
        The arithmetic mean of the per-item scores, as a float.

    Raises:
        ValueError: if the dataset contains no items (there is nothing to
            average, and a bare ZeroDivisionError would hide the cause).
    """
    #getting the size of our dataset
    data_length = len(data["question"])
    if data_length == 0:
        raise ValueError("Cannot evaluate an empty dataset: data['question'] has no items.")

    #one score per answer/groundtruth pair, coerced to float so they can be averaged
    scores = [
        float(
            evaluator.evaluate_strings(
                prediction=data["answer"][i],
                reference=data["groundtruth"][i],
            )["score"]
        )
        for i in range(data_length)
    ]

    #average across all scores
    return sum(scores) / len(scores)

In [None]:
#exact match evaluator
# Created with its default settings (no options are passed).
exact_match_evaluator = ExactMatchStringEvaluator()

# Mean of the per-item exact-match scores over the sentiment ground truth.
average_exact_match_score = generic_evaluate(sentiment_data, exact_match_evaluator)
print(f"Average accuracy score: {average_exact_match_score}")

#storing value aside
# The summary section reads this entry back when building the prompt tags.
evaluation_scores["sentiment"].append({"exact_match_accuracy":average_exact_match_score})

### Answer/Groundtruth distance

The previous exact match accuracy metric does not estimate the scale of the error when the strings do not match. Assuming that the groundtruth is "Very Negative", a "Negative" response would in theory be closer than "Positive", for example.

There are different ways to measure the distance between answers and groundtruth.


### Embedding distance / Semantic Similarity
To measure semantic similarity (or dissimilarity) between a prediction and a reference label string, you could use a vector distance metric between the two embedded representations using the embedding_distance evaluator.

In [None]:
from langchain.evaluation import EmbeddingDistance

# Display the members of EmbeddingDistance, i.e. the available distance
# metrics (COSINE is the one selected further down).
list(EmbeddingDistance)

In [None]:
from langchain_community.embeddings import BedrockEmbeddings

# Bedrock model id of the embedding model used by the embedding-distance metrics.
embedding_model_id = "cohere.embed-english-v3"

# Embeddings client created for the region of the boto3 session set up earlier.
bedrock_embeddings = BedrockEmbeddings(region_name=region_name,
                                       model_id = embedding_model_id)

Running the below cell should take 1-2min

In [None]:
from langchain.evaluation import load_evaluator

# configure evaluator with our embeddings model and picking the cosine distance.
# The same evaluator object is reused for the extraction task further down.
embedding_distance_evaluator = load_evaluator("embedding_distance", embeddings=bedrock_embeddings, distance_metric=EmbeddingDistance.COSINE)

# Mean embedding distance between sentiment answers and their ground truth.
# NOTE(review): as a distance, lower presumably means closer - confirm against the evaluator docs.
average_embeddings_distance_score = generic_evaluate(sentiment_data, embedding_distance_evaluator)
print(f"Embedding distance score: {average_embeddings_distance_score}")

#storing value aside
evaluation_scores["sentiment"].append({"embeddings_distance":average_embeddings_distance_score})

### String Distance

The "String distance" can be measured using techniques like the Levenshtein distance (Wikipedia). It is a string metric for measuring the difference between two sequences. The Levenshtein distance between two words is the minimum number of single-character edits (insertions, deletions or substitutions) required to change one word into the other. 

In [None]:
from langchain.evaluation import load_evaluator

# NOTE(review): no `distance` argument is passed, so the library's default
# string metric is used - confirm it is the one intended (the text above
# describes the Levenshtein distance).
string_distance_evaluator = load_evaluator("string_distance")

# Mean string distance between sentiment answers and their ground truth.
average_string_distance_score = generic_evaluate(sentiment_data, string_distance_evaluator)
print(f"Average string distance score: {average_string_distance_score}")

#storing value aside
evaluation_scores["sentiment"].append({"string_distance":average_string_distance_score})

## Summarisation Evaluation

### ROUGE

ROUGE, or Recall-Oriented Understudy for Gisting Evaluation, is a set of metrics and a software package used for evaluating automatic summarization and machine translation software in natural language processing. The metrics compare an automatically produced summary or translation against a reference or a set of references (human-produced) summary or translation.

The ROUGE-N (ROUGE-1, ROUGE-2) Score is calculated by taking the number of overlapping n-grams and dividing it by the total number of n-grams in the reference summary.

ROUGE-L - The Longest Common Subsequence (LCS) method identifies the longest sequence of words that appear in the same order in both the reference and machine-generated summaries.

ROUGE-LSum applies the ROUGE-L calculation method at the sentence level and then aggregates all the results for the final score.

In [None]:
import evaluate

# Load the ROUGE metric and score the generated summaries against the
# ground-truth summaries. The output is kept in `results`, which the next
# two cells print and store.
rouge = evaluate.load('rouge')
results = rouge.compute(predictions=summary_data["answer"], references=summary_data["groundtruth"])

In [None]:
# Show the ROUGE output. `results` is re-bound by the BERTScore cell further
# down, so this must run before that cell.
print(results)

In [None]:
# Store the ROUGE output as-is. `results` is re-bound by the BERTScore cell
# further down, so this must run before that cell.
evaluation_scores["summarisation"].append(results)

### Semantic similarity / BERTScore

Semantic similarity is also a good option to compare two summarised paragraphs and determine their distance using either cosine similarity or euclidean distance.

We could use the langchain library and calculate the embedding distance but instead we're using BERTScore which works similarly but adds summarisation specific features.

https://github.com/Tiiiger/bert_score

for model_type, see:
https://github.com/Tiiiger/bert_score/blob/19e7f551fe4fa43fdd07b8129ae947015b902b2d/bert_score/utils.py#L64

Note that the below code will download the model you select in model_type. The selected model below is about 3 GB and will take around 3min to download, then 1min to do the evaluation.

In [None]:
from evaluate import load

bertscore = load("bertscore")
# This cell belongs to the summarisation evaluation and its averages are
# stored under evaluation_scores["summarisation"], so the generated summaries
# must be compared with the summary ground truth (not the extraction data).
predictions = summary_data["answer"]
references = summary_data["groundtruth"]
# NOTE: this re-binds `results`; the ROUGE output previously held under that
# name must already have been stored by the cells above.
results = bertscore.compute(predictions=predictions, references=references, lang="en", nthreads=10, model_type="microsoft/deberta-xlarge-mnli")

In [None]:
#calculating averages
# `results` holds one precision / recall / f1 value per prediction (it is
# indexed and summed per key below); reduce each list to its mean.
total_len = len(results["precision"])
bert_precision_avg = sum(results["precision"])/total_len
bert_recall_avg = sum(results["recall"])/total_len
bert_f1_avg = sum(results["f1"])/total_len

#adding the result to our combined result dict.
evaluation_scores["summarisation"].append({"BERTScore-precision":bert_precision_avg, "BERTScore-recall":bert_recall_avg, "BERTScore-f1":bert_f1_avg})

print(f"Precision={bert_precision_avg}")
print(f"Recall={bert_recall_avg}")
print(f"F1={bert_f1_avg}")

## Extraction Evaluation

We can use different approaches to evaluate the alignment between our generated answers and the ground truth.
Semantic Similarity would make sense as well as Correctness which determines whether an LLM output is factually correct based on some ground truth.

### Correctness

We are using the langchain labeled_criteria evaluator with criteria=correctness. You will notice that the evaluator is using an LLM to extract and compare facts between prediction, reference and input (instruction + transcript).

In [None]:
from langchain_aws import ChatBedrockConverse

# Client for the Bedrock Agent API, used here to read the stored prompt.
bedrock_agent_client = boto3.client('bedrock-agent')

#retrieve prompt details from Bedrock as it needs to be passed along with the transcript as an input.
extraction_prompt = bedrock_agent_client.get_prompt(
        promptIdentifier=extraction_prompt_id
    )
#parse the response and retrieve the elements we need for later.
# The evaluation cell below reads the "prompt_text" entry of this result.
extraction_prompt_dict = llm_utils.get_elts_from_prompt_get_response(extraction_prompt)

# LLM that acts as the judge for the correctness criterion.
eval_llm = ChatBedrockConverse(
    model_id="anthropic.claude-3-sonnet-20240229-v1:0",
    max_tokens = 4096,
    temperature = 0,
    top_p = 0.6
)

#evaluator
# Notebook-level name: run_evaluator (defined in the next cell) uses it directly.
evaluator = load_evaluator("labeled_criteria", criteria="correctness", llm=eval_llm)


The cell below should take 2-3min to run.

In [None]:
import concurrent.futures

def run_evaluator(prompt_dict, question, answer, groundtruth, scores):
    """Grade one extraction answer and record its score.

    The ``{transcript}`` placeholder of ``prompt_dict["prompt_text"]`` is
    filled with ``question`` and the rendered text is handed to the
    notebook-level ``evaluator`` as the input, together with ``answer`` as
    the prediction and ``groundtruth`` as the reference. The returned score is
    appended to ``scores`` as a float; nothing is returned to the caller.
    """
    rendered_prompt = prompt_dict["prompt_text"].format(transcript=question)
    verdict = evaluator.evaluate_strings(
        input=rendered_prompt,
        prediction=answer,
        reference=groundtruth,
    )
    scores.append(float(verdict["score"]))


#to store the scores that we'll average after.
# Shared list: every worker call to run_evaluator appends one float to it.
scores = []

data_size = len(extraction_data["question"])

# Fan the per-item evaluations out over a pool of 10 worker threads.
with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
    
    futures = []

    # One task per (question, answer, groundtruth) triple of the extraction data.
    for i in range(data_size):

        futures.append(executor.submit(run_evaluator,
                                        extraction_prompt_dict, 
                                        extraction_data["question"][i], 
                                        extraction_data["answer"][i], 
                                        extraction_data["groundtruth"][i], scores))



    # run_evaluator returns nothing, so `result` is always None; calling
    # result() still matters because it re-raises any exception raised in a worker.
    for future in concurrent.futures.as_completed(futures):
        result = future.result()

    #calculate average across all scores
    # Reached only after every future above has completed.
    average_score = sum(scores) / len(scores)

In [None]:
# Record the mean correctness score computed by the previous cell.
evaluation_scores["extraction"].append({"correctness":average_score})

print(f"correctness average score:{average_score}")

### Semantic Similarity

Again here we use embedding distance to measure the semantic similarity between 2 lists of themes.

The below cell should take 1-2min to execute

In [None]:
#we reuse the same evaluator as previously

# Mean embedding distance between extracted themes and their ground truth.
# This re-binds average_embeddings_distance_score, which previously held the
# sentiment value (already stored in evaluation_scores["sentiment"]).
average_embeddings_distance_score = generic_evaluate(extraction_data, embedding_distance_evaluator)
print(f"Embedding distance score: {average_embeddings_distance_score}")

#storing value aside
evaluation_scores["extraction"].append({"embeddings_distance":average_embeddings_distance_score})

## Summary 

Printing our computed metrics. 

Note that in a normal evaluation workflow you'd compare those values with previous scores to check that the new version of the prompt is not introducing any regression.

For the sake of our example, we assume that it isn't, and we proceed with deploying our 3 prompts in production, storing the scores as tags/metadata.

In [None]:
def get_formated_metrics(key):
    """Flatten the metrics recorded for one use case into a string-to-string dict.

    Every metric found in the dicts listed under ``evaluation_scores[key]``
    becomes one entry named ``"<key>-<metric name>"`` whose value is the
    metric converted with ``str``. If the same metric name occurs in several
    dicts, the last occurrence wins.
    """
    return {
        "-".join((key, metric_name)): str(metric_value)
        for metrics in evaluation_scores[key]
        for metric_name, metric_value in metrics.items()
    }

In [None]:
import pprint

# Pretty-print the flattened metrics of each use case in turn.
for use_case in ("sentiment", "summarisation", "extraction"):
    pprint.pprint(get_formated_metrics(use_case))

In [None]:
from datetime import datetime

# Get the current date
# Formatted as day-month-two-digit-year; reused in the description of all
# three prompt versions created in this and the following cells.
today = datetime.now()
formatted_date = today.strftime("%d-%m-%y")

bedrock_agent_client = boto3.client('bedrock-agent')

# Create a new version of the sentiment prompt, attaching its evaluation
# metrics as tags (names and values are strings).
response_sentiment = bedrock_agent_client.create_prompt_version(
    description="version created on:" + formatted_date,
    promptIdentifier=sentiment_categorisation_prompt_id,
    tags=get_formated_metrics("sentiment")
)

In [None]:
# Create a new version of the summarisation prompt, tagged with its metrics.
# Relies on bedrock_agent_client and formatted_date from the previous cell.
response_summarisation = bedrock_agent_client.create_prompt_version(
    description="version created on:" + formatted_date,
    promptIdentifier=summarisation_prompt_id,
    tags=get_formated_metrics("summarisation")
)

In [None]:
# Create a new version of the extraction prompt, tagged with its metrics.
# Relies on bedrock_agent_client and formatted_date from an earlier cell.
response_extraction = bedrock_agent_client.create_prompt_version(
    description="version created on:" + formatted_date,
    promptIdentifier=extraction_prompt_id,
    tags=get_formated_metrics("extraction")
)