# Summarization Scoring using AWS Bedrock

In this example notebook, we will be comparing how different models in AWS Bedrock perform at summarization tasks using **Arthur Bench**. 

The overall summarization comparison is set up as a **Bench TestSuite**, and each model is compared head-to-head against every other model
in a **Bench TestRun**. Bench provides the ability to view these comparisons in the provided User Interface, as well as access statistics
from the TestSuite and TestRuns themselves for further analysis. For example, in the notebook below we use the Elo rating
algorithm to determine which model performs best at this summarization task. 
                                                                                                                 
The task is to summarize 49 news articles, and the summaries are compared using OpenAI's gpt-3.5-turbo (see summary_quality.py).

## 1. Set Up the AWS Bedrock Client

In [None]:
# Authentication is handled using the AWS_PROFILE environment variable. Check
# the AWS Boto3 documentation and the provided utility library for connecting
# to Bedrock for additional information.
from bedrock_client import client

# Runtime client that the summary helpers below call invoke_model on.
bedrock_runtime = client.get_bedrock_client(
    region="us-east-1",
)

## 2. Load the data and prepare it for inferencing

In [None]:
import csv

# Read the article bodies that every model will be asked to summarize.
# newline="" hands newline handling to the csv module, so line breaks embedded
# inside quoted article text are parsed as part of the field.
with open("data/news_summary/example_summaries.csv", "r", newline="") as f:
    dr = csv.DictReader(f)
    # Only the "input_text" column is needed here.
    articles = [row["input_text"] for row in dr]

# Displayed as the cell output: the number of articles loaded.
len(articles)

## 3. Generate inferences (summaries) for the articles

In [None]:
import json

# Instruction template shared by all models; the article text is substituted
# for the "{}" placeholder with str.format.
prompt = (
    "Summarize the following news document down to its most important points"
    " in less than 250 words.\n"
    "{}\n"
)

def generate_summary_from_llama(model_id, article):
    """Summarize ``article`` with the Bedrock model ``model_id`` (Llama format).

    The shared summarization prompt, filled in with the article, is sent as
    the ``prompt`` field of a JSON request; the ``generation`` field of the
    JSON response is returned.
    """
    request = {"prompt": prompt.format(article)}
    response = bedrock_runtime.invoke_model(
        body=json.dumps(request),
        modelId=model_id,
        accept="application/json",
        contentType="application/json",
    )
    # The response body is a stream; read it fully before decoding the JSON.
    payload = json.loads(response.get("body").read())
    return payload.get("generation")


def generate_summary_from_titan(model_id, article):
    """Summarize ``article`` with the Bedrock model ``model_id`` (Titan format).

    The shared summarization prompt, filled in with the article, is sent as
    the ``inputText`` field of a JSON request; the ``outputText`` of the first
    entry in the response's ``results`` list is returned.
    """
    request = {"inputText": prompt.format(article)}
    response = bedrock_runtime.invoke_model(
        body=json.dumps(request),
        modelId=model_id,
        accept="application/json",
        contentType="application/json",
    )
    # The response body is a stream; read it fully before decoding the JSON.
    payload = json.loads(response.get("body").read())
    first_result = payload.get("results")[0]
    return first_result.get("outputText")


def generate_summary_from_claude(model_id, article):
    """Summarize ``article`` with the Bedrock model ``model_id`` (Claude format).

    The shared summarization prompt, filled in with the article, is wrapped in
    a single Human/Assistant exchange and sent through the text-completions
    request format. The ``completion`` field of the JSON response is returned.
    """
    # The text-completions format expects turns delimited by "\n\nHuman:" and
    # "\n\nAssistant:", with nothing after the final "Assistant:".
    claude_prompt = f"\n\nHuman: {prompt.format(article)}\n\nAssistant:"
    body = json.dumps({
        "prompt": claude_prompt,
        "max_tokens_to_sample": 4000,  # https://docs.aws.amazon.com/bedrock/latest/userguide/model-parameters-claude.html
    })

    response = bedrock_runtime.invoke_model(
        body=body,
        modelId=model_id,
        accept="application/json",
        contentType="application/json",
    )
    # The response body is a stream; read it fully before decoding the JSON.
    response_body = json.loads(response.get("body").read())
    return response_body.get("completion")

In [None]:
# Bedrock model IDs entered into the comparison. Substrings of each ID
# ("meta", "amazon", "claude") later select which summary helper is called.
models = (
    "meta.llama2-13b-chat-v1",
    "meta.llama2-70b-chat-v1",
    "amazon.titan-text-lite-v1",
    "anthropic.claude-v2",
    "anthropic.claude-v1",
)

In [None]:
from collections import defaultdict
# "input" holds the source articles; each model ID is later mapped to the
# list of summaries generated by that model.
summaries = defaultdict(list, input=articles)

In [None]:
# Only models without an entry in `summaries` are processed, so re-running
# this cell does not regenerate summaries for models that already have one.
for model in [m for m in models if m not in summaries]:
    summaries.setdefault(model, [])
    for i, article in enumerate(articles):
        print(f"Generating summary for article {i} using model {model}")

        # Pick the helper that speaks this model family's request format.
        if "meta" in model:
            generate_summary = generate_summary_from_llama
        elif "amazon" in model:
            generate_summary = generate_summary_from_titan
        elif "claude" in model:
            generate_summary = generate_summary_from_claude
        else:
            print(f"Unable to determine what {model} is")
            continue

        try:
            summary = generate_summary(model, article)
        except Exception as exc:
            # Best-effort: one failed call must not abort the whole run, and a
            # placeholder keeps this model's list aligned with the articles.
            # Catching Exception (not a bare except) lets Ctrl-C / kernel
            # interrupts stop the loop.
            print(
                f"Couldn't generate summary for article {i} using model {model}: {exc!r}"
            )
            summary = "Unable to summarize"

        summaries[model].append(summary)

### 3.5 Save (and load) inferences to a pickle file

In [None]:
import pickle

# Persist the generated summaries so they can be reloaded later without
# calling the models again.
with open("summaries.pkl", "wb") as pickle_file:
    pickle.dump(summaries, pickle_file)

In [None]:
import pickle
# Restore previously saved summaries. Unpickling can execute arbitrary code,
# so only load a summaries.pkl file that you produced yourself.
with open("summaries.pkl", "rb") as pickle_file:
    summaries = pickle.load(pickle_file)

## 4. Set Up Bench TestSuites for each model and run Bench TestRuns

The cells below set up a TestSuite per reference model and a TestRun for each unique pair of models, so that the models can be ranked in a round-robin tournament using Elo ratings. 

See the [Quickstart Guide](https://bench.readthedocs.io/en/latest/quickstart.html#view-results-in-local-ui) for additional information.

In [None]:
from arthur_bench.run.testsuite import TestSuite
from itertools import combinations
from collections import defaultdict


# Prerequisites:
#   * BENCH_FILE_DIR must be set in the environment.
#   * OPENAI_API_KEY must be set as well, because the summary_quality scorer
#     uses gpt-3.5-turbo to score the summaries.

# Every unordered pair of models, in the order the models are listed.
combos = list(combinations(models, 2))

# Group the pairs by their first member: reference model -> candidate models.
d_combos = defaultdict(list)
for reference, candidate in combos:
    d_combos[reference].append(candidate)

print(d_combos)

In [None]:
def create_test_suite(model):
    """Build a summary_quality TestSuite that uses ``model`` as the reference.

    The suite's inputs are the source articles and its reference outputs are
    the summaries stored for ``model``.
    """
    print(f"Creating test suite for model {model}")
    suite_name = f"News Summarization using {model} as reference"
    return TestSuite(
        suite_name,
        "summary_quality",
        input_text_list=summaries["input"],
        reference_output_list=summaries[model],
    )

# One suite for every model that acts as the reference in at least one pairing.
suites = {
    reference: create_test_suite(reference)
    for reference in d_combos
}

In [None]:
def run_test_suite(suite, ref_model, cand_model):
    """Run ``suite`` against the summaries of ``cand_model`` and return the run.

    The run is named ``"<ref_model>_vs_<cand_model>"``.
    """
    return suite.run(
        run_name=f"{ref_model}_vs_{cand_model}",
        candidate_output_list=summaries[cand_model],
    )

# runs[reference model][candidate model] -> the run comparing the two.
runs = defaultdict(dict)
for ref_model in suites:
    reference_suite = suites[ref_model]
    for cand_model in d_combos[ref_model]:
        runs[ref_model][cand_model] = run_test_suite(
            reference_suite, ref_model, cand_model
        )

In [16]:
from elo import Implementation

# Register every model as a player in the rating table.
i = Implementation()
for model in models:
    i.addPlayer(model)

# Each scored test case is recorded as one match between reference and candidate.
for ref_model, candidate_runs in runs.items():
    for cand_model, run in candidate_runs.items():
        for test_case in run.test_cases:
            score = test_case.score
            if score == 1.0:
                i.recordMatch(ref_model, cand_model, winner=cand_model)
            elif score == 0.5:
                i.recordMatch(ref_model, cand_model, draw=True)
            elif score == 0.0:
                i.recordMatch(ref_model, cand_model, winner=ref_model)
            # Any other score value records no match.

# Highest rating first.
sorted_list = sorted(i.getRatingList(), key=lambda entry: entry[1], reverse=True)

# Output the sorted list
for model, score in sorted_list:
    print(f"Model {model}: Rating {score}")


Model anthropic.claude-v2: Rating 1345.4153155340496
Model meta.llama2-70b-chat-v1: Rating 1079.6979024278733
Model amazon.titan-text-lite-v1: Rating 898.4211374051577
Model anthropic.claude-v1: Rating 864.5103513782595
Model meta.llama2-13b-chat-v1: Rating 811.9552932546615
