# Text generation models evaluation

#### This notebook evaluates several LLMs from Bedrock, HuggingFace, Jumpstart, Bedrock finetuned models
#### Instance type used for the evaluation - ml.g4dn.2xlarge or m5.2xlarge, python 3.10
#### The metrics evaluated are N-gram matching-based ([ROUGE](https://en.wikipedia.org/wiki/ROUGE_(metric)), [METEOR](https://en.wikipedia.org/wiki/METEOR)) and semantic-based ([BERTScore](https://arxiv.org/abs/1904.09675)) from the [FMEval](https://github.com/aws/fmeval/) library (can be further customized), and [BARTScore](https://arxiv.org/abs/2106.11520) using an encoder-decoder architecture
#### The dataset used is [TweetSumm](https://github.com/guyfe/Tweetsumm) (A Dialog Summarization Dataset for Customer Service, published in EMNLP 21)

In [1]:
%load_ext autoreload
%autoreload 2

# Optional S3 path to upload the results to (e.g. s3://yourbucket/results/ ). Handy as a way to download the results and open the HTML report on a local machine.
S3_OUTPUT_PATH = None 

# Keys of the models to evaluate. An empty list evaluates every model defined in `models_to_test` below.
# To evaluate specific models only, list their keys from that dictionary, for example ["anthropic.claude-3-haiku", "amazon.titan-text-lite-v1"]
MODELS_TO_EVAL = []

In [2]:
!pip install --upgrade pip --quiet
!pip install -r requirements.txt --quiet

### OPEN AI API key
This is relevant if you'll be using models from OpenAI

- Create a new file called `utils/key.py` in your project directory to store your API key.
- Go to your OpenAI account and navigate to "[View API keys](https://platform.openai.com/account/api-keys)."
- Select "Create new secret key."
- Copy the key and insert it into your file `utils/key.py` like this:
```
OPENAI_API_KEY = 'sk-actualLongKeyGoesHere123'
```
- Save the changes
- IMPORTANT: Do **not** commit `key.py` to source control, as it will contain your private key. (It should already be listed in `.gitignore`.) Review [this information about API safety](https://help.openai.com/en/articles/5112595-best-practices-for-api-key-safety).
- Uncomment `from utils.key import OPENAI_API_KEY` below.

In [3]:
# While OPENAI_API_KEY stays None, the models whose platform is "openai" are skipped when the model runners are created.
OPENAI_API_KEY = None # uncommenting the line below will override this
#from utils.key import OPENAI_API_KEY

## Define bucket config

In [4]:
import json
from pathlib import Path
import boto3
import glob
import shutil
import os
from os import listdir

from fmeval.model_runners.bedrock_model_runner import BedrockModelRunner
from fmeval.model_runners.sm_jumpstart_model_runner import JumpStartModelRunner

from utils.model_runners.gpt_model_runner import GPTModelConfig, GPTModelRunner
from utils.tweetsumm_data_creator import create_train_test_files
from utils.model_ranker import create_model_ranking
from utils.dashboard_creators.output_viewer_creator import create_response_output_view
from utils.dashboard_creators.comparative_dashboard_creator import create_comparive_dashboard
from utils.dashboard_creators.data_stats_viewer_creator import create_data_stats_view
from utils.dashboard_creators.data_preview_viewer import create_data_preview_view
from utils.dashboard_creators.main_html_creator import create_main_html
from utils.metrics.bart_score import calculate_bartscore

def reset_dir(path):
    """Delete `path` (if it exists) and recreate it as an empty directory.

    Args:
        path: Directory to reset. Everything under it is removed.

    Returns:
        The same `path`, so the call can be used directly in an assignment.
    """
    if os.path.exists(path):
        shutil.rmtree(path)
    # makedirs (rather than mkdir) so a missing parent folder does not fail the cell
    os.makedirs(path)
    return path


# Root folder of the final report (archived and optionally uploaded at the end of the notebook)
RESULT_FOLDER = reset_dir("/tmp/final_result")

# Per-model jsonl files produced during the evaluation
TMP_JSON_FILES = reset_dir("/tmp/jsonl_model_files")

# Train/test dataset files
TMP_DATASET_FILES = reset_dir("/tmp/dataset_files")

# Sub-folders of the report; they must be created after RESULT_FOLDER was reset
RESULT_HTML_FOLDER = reset_dir(RESULT_FOLDER + "/html_files")
RESULT_IMG_FOLDER = reset_dir(RESULT_FOLDER + "/imgs")

from utils.tweetsumm_data_creator import create_train_test_files
# Create the train and test files under TMP_DATASET_FILES. The returned path is
# passed to the evaluation later in the notebook as the dataset location.
TEST_FILE_PATH = create_train_test_files(TMP_DATASET_FILES) # creating train and test files
print(TEST_FILE_PATH)



## List the models to benchmark

In [5]:
# Registry of the model configurations available for benchmarking, keyed by model name.
# It starts with a Bedrock model that is asked to produce random, unrelated text;
# this entry serves as a baseline calibration for the various metrics.
models_to_test = {
    "random": {
        "model_id": "amazon.titan-text-lite-v1",
        "platform": "bedrock",
        "output": "results[0].outputText",
        "content_template": "{\"inputText\": $prompt, \"textGenerationConfig\":  {\"maxTokenCount\": 100, \"stopSequences\": [], \"temperature\": 1.0, \"topP\": 1.0}}",
        "prompt_template": "Please ignore the following blob of text and create an unrelated text of around 2 sentences\n $model_input\n",
    },
}

# Bedrock Anthropic models, zero-shot.
# Both entries use the same request body and prompt; only the key and the Bedrock model id differ.
models_to_test.update({
    model_key: {
        "model_id": bedrock_model_id,
        "platform": "bedrock",
        "output": "content[0].text",
        "content_template": "{\"messages\": [{\"role\": \"user\", \"content\": $prompt}], \"max_tokens\": 100, \"anthropic_version\": \"bedrock-2023-05-31\"}",
        "prompt_template": "Below is a dialog between a customer and an agent. Please provide a short and concise summary of the conversation. The summary should be short and include a single sentence describing the customer's complaint or request, and single sentence of the agent's response or action. Please write the summary in a human readable format. Start you answer directly with the summary without any additional prefix.\n Specify important and relevant amounts, dates and locations inside the summary. Here is the dialog: <dialog>$model_input</dialog>",
    }
    for model_key, bedrock_model_id in (
        ("anthropic.claude-3-sonnet", "anthropic.claude-3-sonnet-20240229-v1:0"),
        ("anthropic.claude-3-haiku", "anthropic.claude-3-haiku-20240307-v1:0"),
    )
})

# Bedrock Amazon Titan models, zero-shot (each model has its own prompt wording)
models_to_test["amazon.titan-text-lite-v1"] = {
    "model_id": "amazon.titan-text-lite-v1",
    "platform": "bedrock",
    "output": "results[0].outputText",
    "content_template": "{\"inputText\": $prompt, \"textGenerationConfig\":  {\"maxTokenCount\": 100, \"stopSequences\": [], \"temperature\": 1.0, \"topP\": 1.0}}",
    "prompt_template": "Please provide a short and concise summary of the conversation below. The summary should be short and include a single sentence describing the customer's complaint or request, and single sentence of the agent's response or action. Do not include any additional information that does not appear in the dialog.  Specify important and relevant amounts, dates and locations inside the sentences of the summary. Here is the dialog:\n$model_input\n\nsummary:\n",
}
models_to_test["amazon.titan-text-express-v1"] = {
    "model_id": "amazon.titan-text-express-v1",
    "platform": "bedrock",
    "output": "results[0].outputText",
    "content_template": "{\"inputText\": $prompt, \"textGenerationConfig\": {\"maxTokenCount\": 100, \"stopSequences\": [], \"temperature\": 1.0, \"topP\": 1.0}}",
    "prompt_template": "Please provide a short and concise summary of the conversation below that includes a summary of both the user and the agent.  Specify important and relevant amounts, dates and locations inside the sentences of the summary. Here is the dialog:\n $model_input\n\nsummary:\n",
}

# Cohere and Llama2 Bedrock models, zero-shot
models_to_test["cohere.command-light-text-v14"] = {
    "model_id": "cohere.command-light-text-v14",
    "platform": "bedrock",
    "output": "generations[0].text",
    "content_template": "{\"prompt\": $prompt, \"max_tokens\": 100}",
    "prompt_template": "Please provide a short and concise summary of the conversation below that includes a summary of both the user and the agent.  Specify important and relevant amounts, dates and locations inside the sentences of the summary. Here is the dialog:\n $model_input\n\nsummary:\n",
}
models_to_test["meta.llama2-13b-chat-v1"] = {
    "model_id": "meta.llama2-13b-chat-v1",
    "platform": "bedrock",
    "output": "generation",
    "content_template": "{\"prompt\": $prompt, \"max_gen_len\": 100, \"top_p\": 1, \"temperature\": 1.0}",
    "prompt_template": "[INST]Please provide a short and concise summary of the conversation below that includes a summary of both the user and the agent.  Specify important and relevant amounts, dates and locations inside the sentences of the summary. Here is the dialog:[/INST]\n Transcript\n $model_input \n\n Summary:\n",
}

# Various Bedrock models in one-shot mode: every prompt shows the model one worked
# example (a transcript and its summary) before the transcript to summarize.

# Instruction shared by all one-shot prompts (the double space before "Specify" is part of the prompt text).
ONE_SHOT_INSTRUCTION = "Please provide a short and concise summary of the conversation below that includes a summary of both the user and the agent.  Specify important and relevant amounts, dates and locations inside the sentences of the summary."

# The single worked example shared by all one-shot prompts.
ONE_SHOT_EXAMPLE = " Example Transcript:\n user: bought a celcus tv from your Finchley store last year in December and it stopped working yesterday - can you repair it or change Your cctv recording from the date we bought it - agent: Can you confirm did you pay cash or card for the telvision? We accept credit/debit card statements as a proof of purchase. Steven user: Yes. I paid by card, I think there were other things I bought with the tv as well , but I remember the price of the television was 175 Actually, I just checked my bank statements and I bought the tv in January 2017 and not dec 2016 and paid for it by card - 175 agent: We would use the bank statements transaction ID to match our till receipts. If you return the television with your credit/debit card...1/2 ...statement our in store colleagues will advise you further. Steven 2/2 user: Great! Thank you. One last question, Ive recycled the Tvs box - is it rwqur Required** agent: As long as you've got proof of purchase you'll be fine Dimitar! Ewan.\n Summary: Customer is asking to repair or change the television which is not working. Agent updated to return the television with their credit/debit card.\n\n"

# Prompt without any chat markup, used as-is by the Cohere and Titan Express entries.
ONE_SHOT_PLAIN_PROMPT = ONE_SHOT_INSTRUCTION + "\n\n" + ONE_SHOT_EXAMPLE + "Transcript:\n $model_input\n Summary:"

models_to_test.update({
    "amazon.titan-text-lite-v1-one-shot" : {
        "model_id" : "amazon.titan-text-lite-v1",
        "platform" : "bedrock",
        "output" : "results[0].outputText",
        "content_template" : "{\"inputText\": $prompt, \"textGenerationConfig\":  {\"maxTokenCount\": 100, \"stopSequences\": [], \"temperature\": 1.0, \"topP\": 1.0}}",
        # NOTE(review): this Titan prompt carries Llama-style [INST] / </s><s> markers - confirm this is intended.
        "prompt_template" : "[INST]" + ONE_SHOT_INSTRUCTION + " \n" + ONE_SHOT_EXAMPLE + " [/INST] </s><s>[INST]\n Transcript:\n $model_input [/INST]\n Summary:"
    },
    "meta.llama2-13b-chat-v1-one-shot" :{
        "model_id" : "meta.llama2-13b-chat-v1",
        "platform" : "bedrock",
        "output" : "generation",
        "content_template" : "{\"prompt\": $prompt, \"max_gen_len\": 100, \"top_p\": 1, \"temperature\": 1.0}",
        # The system block is closed with "<</SYS>>" (the previous "<</SYS>" was missing the final ">").
        "prompt_template" : "[INST] <<SYS>> " + ONE_SHOT_INSTRUCTION + "<</SYS>> \n" + ONE_SHOT_EXAMPLE + " [/INST] </s><s>[INST]\n Transcript:\n $model_input [/INST] Summary:"
    },
    "cohere.command-light-text-v14-one-shot" :{
        "model_id" : "cohere.command-light-text-v14",
        "platform" : "bedrock",
        "output" : "generations[0].text",
        "content_template" : "{\"prompt\": $prompt, \"max_tokens\": 100}",
        "prompt_template" : ONE_SHOT_PLAIN_PROMPT
    },
    "amazon.titan-text-express-v1-one-shot" :{
        "model_id" : "amazon.titan-text-express-v1",
        "platform" : "bedrock",
        "output" : "results[0].outputText",
        "content_template" : "{\"inputText\": $prompt, \"textGenerationConfig\": {\"maxTokenCount\": 100, \"stopSequences\": [], \"temperature\": 1.0, \"topP\": 1.0}}",
        "prompt_template" : ONE_SHOT_PLAIN_PROMPT
    },
})

# OpenAI models, zero-shot. The entry carries the generation parameters itself,
# because it is served by the custom GPT runner rather than by a Bedrock content template.
models_to_test["gpt.3.5-turbu-0125"] = dict(
    model_id="gpt-3.5-turbo-0125",
    api_key=OPENAI_API_KEY,
    platform="openai",
    temperature=1,
    top_p=1,
    max_tokens=100,
    prompt_template="Please provide a short and concise summary of the conversation below that includes a summary of both the user and the agent.  Specify important and relevant amounts, dates and locations inside the sentences of the summary.\n Transcript:\n $model_input \n Summary:\n",
)


## Adding your own custom models
If you wish to add a custom model, simply create a custom model runner. For an example, see the custom model runner that wraps GPT-3.5 in utils/model_runners/gpt_model_runner.py 


## Adding finetuned models
In case you wish to add Bedrock finetuned model: 
1. First finetune a model (for details on finetuning on Bedrock visit https://aws.amazon.com/blogs/aws/customize-models-in-amazon-bedrock-with-your-own-data-using-fine-tuning-and-continued-pre-training/). A finetuning training set is provided.
2. Once training has completed, copy the model ARN from the Bedrock 'provisioned throughput' dashboard and use it as the `model_id`. For more details see the documentation.
3. Add the configuration of your finetuned model to the `models_to_test` dictionary in the cell above, as follows:

<code>
{
    "finetuned_amazon.titan-text-lite-v1" : {
    "platform":"bedrock",
    "model_id": "arn:aws:bedrock:us-east-1:333333333:provisioned-model/879asd6s75",
    "output": "results[0].outputText",
    "content_template": "{\"inputText\": $prompt, \"textGenerationConfig\": {\"maxTokenCount\": 100, \"stopSequences\": [], \"temperature\": 1.0, \"topP\": 1.0}}",
    "prompt_template": "YOUR PROMPT HERE"
    }
}
</code>


## Adding Jumpstart models
Example of evaluating Mistral-7B-Instruct from Jumpstart:
1. Go to Jumpstart (press home button -> Jumpstart)
2. Search in the bar for Mistral-7B-Instruct
3. Click deploy from the model card (don't forget to delete the endpoint once you are done, from SageMaker->inference endpoints)
4. Add the following to the models list
<code>
{
    "platform":"jumpstart",
    "model_id": "huggingface-llm-mistral-7b-instruct",
    "endpoint_name": "jumpstart-dft-hf-llm-mistral-7b-instruct",
    "model_version": "*",
    "output": "[0].generated_text",
    "content_template":"{\"inputs\": $prompt, \"parameters\": {\"do_sample\": false, \"max_new_tokens\": 100}}",
    "prompt_template": "YOUR PROMPT HERE"
}
</code>


## Creating ModelRunner

In [6]:
from utils.model_runners.bedrock_counting_runner import CountingBedrockModelRunner


def get_models_to_eval():
    """Return the keys of the models that should be evaluated.

    Returns `MODELS_TO_EVAL` itself when it holds at least one entry;
    otherwise returns a new list with every key of `models_to_test`.
    """
    if len(MODELS_TO_EVAL) > 0:
        return MODELS_TO_EVAL
    return list(models_to_test)

# Build one model runner per requested model.
# `models` maps the model key to its runner and to the prompt template used with it.
# Entries that cannot be built are reported and skipped, so one bad entry does not
# stop the run and a runner left over from a previous iteration is never reused.
models = dict()
for fm in get_models_to_eval():

    data = models_to_test.get(fm)
    if data is None:
        # e.g. a key listed in MODELS_TO_EVAL that has no configuration in models_to_test
        print(f"Skipping {fm} - no configuration with this key in models_to_test")
        continue
    platform = data['platform']

    if platform == "bedrock":
        runner = CountingBedrockModelRunner(model_id=data["model_id"], output=data["output"], content_template=data["content_template"].replace("'","\""),metrics_folder = TMP_JSON_FILES, model_key = fm)
    elif platform == "jumpstart":
        runner = JumpStartModelRunner(endpoint_name=data["endpoint_name"], model_id=data["model_id"], model_version=data["model_version"], output=data["output"].replace("'","\""), content_template=data["content_template"].replace("'","\""))
    elif platform == "openai":
        if not OPENAI_API_KEY:
            print("Skipping OpenAI models - Cannot run without an API key")
            continue
        runner = GPTModelRunner(GPTModelConfig(model_id=data["model_id"], api_key=data["api_key"], temperature=data["temperature"], top_p=data["top_p"], max_tokens=data["max_tokens"]),metrics_folder = TMP_JSON_FILES, model_key = fm)
    else:
        print(f"Skipping {fm} - unsupported platform '{platform}'")
        continue

    models[fm] = { "model_runner": runner, "prompt_template": data["prompt_template"]}


## Evaluation run
Evaluating METEOR, ROUGE, and BERTscore using FMEval library (https://github.com/aws/fmeval). This library is also used by Bedrock when finetuning or evaluating models.

#### Note - if, while running this cell, you encounter the message "Error displaying widget: model not found" during the evaluation phase, simply ignore it. It relates to the UI and does not affect the evaluation.

In [7]:
from fmeval.data_loaders.data_config import DataConfig
from fmeval.constants import MIME_TYPE_JSONLINES
from fmeval.eval_algorithms.summarization_accuracy import SummarizationAccuracy, SummarizationAccuracyConfig
from utils.model_runners.pricing_calculator import PricingCalculator
import pandas as pd
import os

os.environ["PARALLELIZATION_FACTOR"] = "1" # will use a single worker for FMEval

# Recreate the per-model jsonl folder empty before the evaluation starts
TMP_JSON_FILES = "/tmp/jsonl_model_files"
if os.path.exists(TMP_JSON_FILES):
    shutil.rmtree(TMP_JSON_FILES)
os.mkdir(TMP_JSON_FILES)

# Number of dataset records evaluated per model
NUM_RECORDS = 10

# File written by each evaluation run (save=True); it is moved into TMP_JSON_FILES after every model.
# NOTE(review): the "data" suffix appears to follow `dataset_name` below - confirm before renaming the dataset.
EVAL_RESULTS_FILE = '/tmp/eval_results/summarization_accuracy_data.jsonl'

# The dataset configuration is the same for every model, so it is built once
config = DataConfig(
    dataset_name="data",
    dataset_uri=TEST_FILE_PATH,
    dataset_mime_type=MIME_TYPE_JSONLINES,
    model_input_location="document",
    target_output_location="summary"
)

models_scores = dict()  # model key -> {score name: dataset-level value}
models_usage = dict()   # model key -> aggregate returned by PricingCalculator
models_to_eval = get_models_to_eval()
for model_id in models_to_eval:
    print(f"### Starting model {model_id} evaluation")
    if model_id not in models:
        print(f"###model {model_id} doesn't have a valid/complete entry in the model list")
        continue
    model = models[model_id]

    model_runner = model['model_runner']
    eval_algo = SummarizationAccuracy(SummarizationAccuracyConfig())
    eval_output = eval_algo.evaluate(model=model_runner,
                                     dataset_config=config,
                                     prompt_template=model["prompt_template"],
                                     num_records=NUM_RECORDS,
                                     save=True)

    # A single dataset is evaluated, so only the first output is read
    scores = {score.name: score.value for score in eval_output[0].dataset_scores}

    models_scores[model_id] = scores
    models_usage[model_id] = PricingCalculator.read_model_score_aggregate(model_id, TMP_JSON_FILES)
    # Keep the per-record results of this model under its own name before the next run overwrites them
    shutil.move(EVAL_RESULTS_FILE, f'{TMP_JSON_FILES}/{model_id}_metrics.jsonl')


## Calculate BARTscore

In [8]:
### Metrics to calc
# BARTscore - for more details https://github.com/neulab/BARTScore/blob/main/README.md

# Path to a finetuned BART checkpoint. If left empty, vanilla BART is used.
# To load the finetuned BART, go to BARTscore's github, download the bart_score.pth
# (it appears in the README) and provide its path here.
PATH_TO_FINETUNED_BART = ""

# Switch for the BARTscore calculation
CALC_BARTSCORE = True

if CALC_BARTSCORE:
    calculate_bartscore(TMP_JSON_FILES, models_scores, PATH_TO_FINETUNED_BART)

## Create Leaderboard Report HTML

In [9]:
from utils.model_ranker import create_model_ranking
from datetime import datetime

# Generate the individual report pages and images, then the main page that links them
create_response_output_view(RESULT_HTML_FOLDER, TMP_JSON_FILES, models_scores)
create_comparive_dashboard(RESULT_HTML_FOLDER, TMP_JSON_FILES)
create_data_stats_view(TEST_FILE_PATH, RESULT_IMG_FOLDER)
create_data_preview_view(TEST_FILE_PATH, RESULT_HTML_FOLDER)
main_html_filename = create_main_html(RESULT_FOLDER, models_scores, models_usage)

print(f"Created leaderboard in: {main_html_filename}")

# Archive the entire report folder into /tmp/<timestamp>.zip.
# RESULT_FOLDER is used (instead of repeating the literal path) so the archive
# always contains the folder the report was actually written to.
today = datetime.now()
my_datetime = today.strftime("%d-%m-%Y_%H-%M-%S")
zip_filename_fullpath = shutil.make_archive(f"/tmp/{my_datetime}", 'zip', RESULT_FOLDER)
zip_filename = os.path.basename(zip_filename_fullpath) # filename without folders
print(f"Archived report in: {zip_filename_fullpath}")

## Upload Report to S3

In [None]:
if S3_OUTPUT_PATH: # if defined S3
    s3_key = f"{S3_OUTPUT_PATH}/{zip_filename}"
    !aws s3 cp {zip_filename_fullpath} {s3_key}
    print(f"Uploaded to: {s3_key}")
else:
    print(f"No S3_OUTPUT_PATH set, not uploading {zip_filename}")

## Viewing results

In [10]:
# Print copy-paste terminal commands for opening the report.
if S3_OUTPUT_PATH:
    # Folder the archive is extracted into: the zip file name without its extension
    report_dir = zip_filename.replace(".zip","")
    remote_steps = [
        'If running on a *remote* machine to view the results on your local computer copy-paste these commands in your terminal:',
        f'    aws s3 cp {s3_key} /tmp/{zip_filename}',
        '    cd /tmp',
        f'    unzip -d {report_dir} {zip_filename}',
        f'    open /tmp/{report_dir}/index.html',
        '',  # keeps the trailing newline of the message
    ]
    print("\n".join(remote_steps))

local_steps = [
    'If running on a *local* machine copy-paste these commands in your terminal:',
    f'    open {main_html_filename}',
]
print("\n".join(local_steps))