# Advanced Summary Evaluation
This notebook shows examples of how to perform an evaluation of the summary output

>Tested on SageMaker Studio with instance type ml.m5.8xlarge

### Initial Setup

In [None]:
!rm -Rf ~/.cache/pip/*
!pip3 install fmeval --upgrade-strategy only-if-needed --force-reinstall

!pip install -U ipywidgets

In [None]:
from fmeval.data_loaders.data_config import DataConfig
from fmeval.model_runners.bedrock_model_runner import BedrockModelRunner
from fmeval.constants import MIME_TYPE_JSONLINES
from fmeval.eval_algorithms.summarization_accuracy import SummarizationAccuracy

### Data Config Setup

Below, we create a DataConfig for the local dataset file, xsum_sample.jsonl.
- `dataset_name` is just an identifier for your own reference
- `dataset_uri` is either a local path to a file or an S3 URI
- `dataset_mime_type` is the MIME type of the dataset. Currently, JSON and JSON Lines are supported.
- `model_input_location` and `target_output_location` are JMESPath queries used to find the model inputs and target outputs within the dataset. The values that you specify here depend on the structure of the dataset itself. Take a look at xsum_sample.jsonl to see where "document" and "summary" show up.

In [None]:
config = DataConfig(
    dataset_name="xsum_sample",
    dataset_uri="xsum_sample.jsonl",
    dataset_mime_type=MIME_TYPE_JSONLINES,
    model_input_location="document",
    target_output_location="summary"
)

### Run Evaluation with Anthropic Claude

The model runner we create below will be used to perform inference on every sample in the dataset

In [None]:
model_id = 'anthropic.claude-v2'

bedrock_model_runner = BedrockModelRunner(
    model_id=model_id,
    output='completion',
    content_template='{"prompt": $prompt, "max_tokens_to_sample": 500}'
)

eval_algo = SummarizationAccuracy()
eval_output = eval_algo.evaluate(model=bedrock_model_runner, dataset_config=config, 
                                 prompt_template="Human: $feature\n\nAssistant:\n", save=True)

### Parse Evaluation Results

In [None]:
# Pretty-print the evaluation output (notice the scores).
# Create a Pandas DataFrame to visualize the results
import pandas as pd
import json

print(json.dumps(eval_output, default=vars, indent=4))
data = []

# We obtain the path to the results file from "output_path" in the cell above 
with open("/tmp/eval_results/summarization_accuracy_xsum_sample.jsonl", "r") as file:
    for line in file:
        data.append(json.loads(line))
        
df = pd.DataFrame(data)
df['eval_algo'] = df['scores'].apply(lambda x: x[0]['name'])
df['eval_score'] = df['scores'].apply(lambda x: x[0]['value'])
df

## Get Dependencies

In [14]:
!pip install ipynb -q
!pip install langchain -q
!pip install anthropic -q
!pip install tiktoken -q
!pip install nltk -q
!pip install rouge-score -q
!pip install evaluate -q
!pip3 install fmeval --upgrade-strategy only-if-needed --force-reinstall -q
!pip install transformers -q
!pip install detoxify -q

[0m[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
transformers 4.22.1 requires tokenizers!=0.11.3,<0.13,>=0.11.1, but you have tokenizers 0.15.0 which is incompatible.[0m[31m
[0m[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
daal4py 2021.6.0 requires daal==2021.4.0, which is not installed.
spyder 5.3.3 requires pyqt5<5.16, which is not installed.
spyder 5.3.3 requires pyqtwebengine<5.16, which is not installed.
anthropic 0.8.1 requires tokenizers>=0.13.0, but you have tokenizers 0.12.1 which is incompatible.
awscli 1.31.9 requires botocore==1.33.9, but you have botocore 1.34.6 which is incompatible.
awscli 1.31.9 requires s3transfer<0.9.0,>=0.8.0, but you have s3transfer 0.10.0 which is incompatible.
distributed 2022.7.0

In [3]:
from ipynb.fs.full.simple_summarize import stuff_it_summary, map_reduce_summary
from ipynb.fs.full.advanced_summarize import generate_single_doc_summary, generate_multiple_docs_summary

## Get Model Output

Get summary output of all the different types of summarization

### Define helper functions

In [6]:
import json

def get_summary(dataset, sum_type="stuff_it_summary", func=stuff_it_summary):
    
    #set up some basic prompt options for the advanced summary functions.
    prompt_options = {}
    prompt_options['prompt_type'] = "summary"
    prompt_options['format_type'] = "narrative"
    prompt_options['manual_guidance'] = ""
    prompt_options['style_guide'] = ""
    
    with open(dataset) as f:
        data_w_model_summary = [json.loads(line) for line in f]
    
    for doc in data_w_model_summary:
        
        if sum_type=="multi_doc":
            
            #create a list of questions for the muti-doc guided process
            questions = [ "What is a brief, concise summary of this news report"]

            #create a discription of this set of documents, for the multi-doc guided process.
            doc_description = "The text is a news report of events that occured"
            
            answers = func({"input": doc['document']}, questions, doc_description, DEBUG=False)
            question = questions[0]
            model_output= (answers[question].replace("\n\n","\n"))
            
        elif sum_type=="auto_refine":
            model_output = func(doc['document'], prompt_options, AUTO_REFINE=True, DEBUG=False)
            
        elif sum_type=="map_reduce":
            model_output = func(doc['document'], DEBUG=False)
            
        else:
            model_output = func(doc['document'])
            
        doc["model_output"] = model_output
        
    return data_w_model_summary

### Stuff it Summary

In [7]:
dataset = "xsum_sample.jsonl"
model_output_stuff_it = get_summary(dataset, sum_type="stuff_it", func=stuff_it_summary)
print(model_output_stuff_it)



### Map Reduce Summary

In [8]:
dataset = "xsum_sample.jsonl"
model_output_map_reduce = get_summary(dataset, sum_type="map_reduce", func=map_reduce_summary)
print(model_output_map_reduce)



### Auto Refine Summary

In [9]:
dataset = "xsum_sample.jsonl"
model_output_auto_refine = get_summary(dataset, sum_type="auto_refine", func=generate_single_doc_summary)
print(model_output_auto_refine)



### Multi-Doc Summary

In [10]:
dataset = "xsum_sample.jsonl"
model_output_multi_doc = get_summary(dataset, sum_type="multi_doc", func=generate_multiple_docs_summary)
print(model_output_multi_doc)



## Summarization Accuracy

Accuracy evaluation with METEOR, ROUGE and BERTscore metrics

### Define helper functions

In [15]:
import json
from nltk.translate import meteor_score
from nltk import word_tokenize
import evaluate as hf_evaluate
import ray
from fmeval.eval_algorithms.helper_models.helper_model import BertscoreHelperModel


def get_meteor_score(target_output: str, model_output: str, **kwargs) -> float:
    """
    METEOR is a metric for text similarity between the machine-produced summary and human-produced reference summaries.
    Unigrams can be matched based on their surface forms, stemmed forms,
    and meanings; furthermore, METEOR can be easily extended to include more
    advanced matching strategies. Once all generalized unigram matches
    between the two strings have been found, METEOR computes a score for
    this matching using a combination of unigram-precision, unigram-recall, and
    a measure of fragmentation that is designed to directly capture how
    well-ordered the matched words in the machine translation are in relation
    to the reference.

    :param target_output: The expected responses from the model
    :param model_output: The output of a model that we want to evaluate.
    :returns: meteor score
    """
    return meteor_score.single_meteor_score(
        reference=word_tokenize(target_output), hypothesis=word_tokenize(model_output)
    )


def get_rouge_score(target_output: str, model_output: str, **kwargs) -> float:
    
    """
    The ROUGE-N, where N=[1,2,L], score is a standard metric for summarization quality.
    It computes the word overlap between the reference and model summary. Given that this metric is based on simple
    word overlap statistics, it works best for extractive summaries.
    Note that if we rephrase the summary without changing its meaning the ROUGE-N score will drop.

    Reference: https://huggingface.co/spaces/evaluate-metric/rouge

    :param target_output: The expected responses from the model
    :param model_output: The output of a model that we want to evaluate.
    :returns: rouge score
    """
    rouge_type = "rouge2"
    rouge = hf_evaluate.load("rouge")
    return rouge.compute(
        predictions=[model_output],
        references=[target_output],
        use_stemmer=True,
        rouge_types=[rouge_type],
    )[rouge_type]


def get_bert_score(target_output: str, model_output: str, **kwargs) -> float:
    """
    BERTscore is a similarity-based metric that compares the embedding of the prediction and target sentences
    under a learned model, typically, from the BERT family.
    This score may lead to increased flexibility compared to ROUGE and METEOR in terms of rephrasing since
    semantically similar sentences are (typically) embedded similarly.

    https://huggingface.co/spaces/evaluate-metric/bertscore

    :param target_output: The expected responses from the model
    :param model_output: The output of a model that we want to evaluate.
    :returns: bert score
    """
#     bert_score_model = "microsoft/deberta-xlarge-mnli"
    
#     # Initialize the shared BertscoreHelperModel actor that will be shared
#     # by every get_bert_score task.
#     bertscore_helper_model = BertscoreHelperModel.remote(
#         model_type=bert_score_model
#     )
    
#     return ray.get(bertscore_helper_model.get_helper_scores.remote(target_output, model_output))

    bertscore = hf_evaluate.load("bertscore")
    predictions=model_output,
    references=target_output,
    return bertscore.compute(
        predictions=predictions,
        references=references,
        lang="en"
    )["f1"][0]

def get_accuracy_evaluation(dataset):
    
    eval_scores = []
    
    meteor_scores = [get_meteor_score(data["summary"], data["model_output"]) for data in dataset]
    m_score = sum(meteor_scores) / len(meteor_scores)
    eval_scores.append({"name": "meteor", "value": m_score})
        
    rouge_scores = [get_rouge_score(data["summary"], data["model_output"]) for data in dataset]
    r_score = sum(rouge_scores) / len(rouge_scores)
    eval_scores.append({"name": "rouge", "value": r_score})
    
    bert_scores = [get_bert_score(data["summary"], data["model_output"]) for data in dataset]
    b_score = sum(bert_scores) / len(bert_scores)
    eval_scores.append({"name": "bertscore", "value": b_score})
    
    return eval_scores


2023-12-22 17:53:50,142	INFO util.py:159 -- Outdated packages:
  ipywidgets==7.6.5 found, needs ipywidgets>=8
Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


### Stuff it Evaluation

In [16]:
eval_scores = get_accuracy_evaluation(model_output_stuff_it)
print(json.dumps(eval_scores, default=vars, indent=4))

[
    {
        "name": "meteor",
        "value": 0.23104046732059233
    },
    {
        "name": "rouge",
        "value": 0.04466204671334081
    },
    {
        "name": "bertscore",
        "value": 0.8580165474038375
    }
]


### Map Reduce Evaluation

In [17]:
eval_scores = get_accuracy_evaluation(model_output_map_reduce)
print(json.dumps(eval_scores, default=vars, indent=4))

[
    {
        "name": "meteor",
        "value": 0.24072320988539928
    },
    {
        "name": "rouge",
        "value": 0.04206181292209505
    },
    {
        "name": "bertscore",
        "value": 0.8594577406582079
    }
]


### Auto Refine Evaluation

In [18]:
eval_scores = get_accuracy_evaluation(model_output_auto_refine)
print(json.dumps(eval_scores, default=vars, indent=4))

[
    {
        "name": "meteor",
        "value": 0.19527542772687698
    },
    {
        "name": "rouge",
        "value": 0.026000498700004277
    },
    {
        "name": "bertscore",
        "value": 0.8429720276280454
    }
]


### Multi-Doc Evaluation

In [19]:
eval_scores = get_accuracy_evaluation(model_output_multi_doc)
print(json.dumps(eval_scores, default=vars, indent=4))

[
    {
        "name": "meteor",
        "value": 0.21402161569957817
    },
    {
        "name": "rouge",
        "value": 0.03569174735570144
    },
    {
        "name": "bertscore",
        "value": 0.8546279731549715
    }
]


## Summarization Toxicity

Evaluate Toxicity of Summary generated, It uses the [Toxigen](https://huggingface.co/tomh/toxigen_roberta/tree/main) and [detotify](https://github.com/unitaryai/detoxify) to evaluate and score model responses in various toxicity categories

### Define helper functions

In [20]:
import json
from typing import Any, Dict, List
from detoxify import Detoxify
from transformers import pipeline


def get_toxicity(text_input: List[str], **kwargs) -> List[float]:
    
    """
    Helper model for toxigen model: https://huggingface.co/tomh/toxigen_roberta/tree/main
    """
    
    model_name = "tomh/toxigen_roberta"
    model = pipeline("text-classification", model=model_name)
    
    inference_output = model(text_input)
    result = [x["score"] if x["label"] == "LABEL_1" else 1.0 - x["score"] for x in inference_output]
    
    return result

def get_detoxify(text_input: List[str], **kwargs) -> Dict[str, List[float]]:
    
    """
    Helper model for Detoxify: https://github.com/unitaryai/detoxify
    returns dict with keys as score name and value being list of scores for text inputs
    """

    model = Detoxify(model_type="unbiased").predict
    
    return model(text_input)

def get_toxicity_evaluation(dataset):
    
    eval_scores = []
    model_outputs = [data["model_output"] for data in dataset]
    
    t_scores = get_toxicity(model_outputs)
    toxicity_score = sum(t_scores) / len(t_scores)
    eval_scores.append({"type": "toxigen", "name": "toxicity", "value": toxicity_score})
    
    dt_scores = get_detoxify(model_outputs)
    for k, v in dt_scores.items():
        avg_score = sum(v) / len(v)
        eval_scores.append({"type": "detoxify", "name": k, "value": avg_score})
    
    return eval_scores


### Stuff it Evaluation

In [21]:
eval_scores = get_toxicity_evaluation(model_output_stuff_it)
print(json.dumps(eval_scores, default=vars, indent=4))

[
    {
        "type": "toxigen",
        "name": "toxicity",
        "value": 0.0010530885897184674
    },
    {
        "type": "detoxify",
        "name": "toxicity",
        "value": 0.0040379522297849975
    },
    {
        "type": "detoxify",
        "name": "severe_toxicity",
        "value": 1.3265852939376169e-05
    },
    {
        "type": "detoxify",
        "name": "obscene",
        "value": 0.00020571946238066486
    },
    {
        "type": "detoxify",
        "name": "identity_attack",
        "value": 0.00020279706261231025
    },
    {
        "type": "detoxify",
        "name": "insult",
        "value": 0.000507087315016083
    },
    {
        "type": "detoxify",
        "name": "threat",
        "value": 8.523149800136085e-05
    },
    {
        "type": "detoxify",
        "name": "sexual_explicit",
        "value": 0.006912611168110361
    }
]


### Map Reduce Evaluation

In [22]:
eval_scores = get_toxicity_evaluation(model_output_map_reduce)
print(json.dumps(eval_scores, default=vars, indent=4))

[
    {
        "type": "toxigen",
        "name": "toxicity",
        "value": 0.0009589038397136488
    },
    {
        "type": "detoxify",
        "name": "toxicity",
        "value": 0.006883584137540311
    },
    {
        "type": "detoxify",
        "name": "severe_toxicity",
        "value": 1.5051876438950498e-05
    },
    {
        "type": "detoxify",
        "name": "obscene",
        "value": 0.0003344472857670074
    },
    {
        "type": "detoxify",
        "name": "identity_attack",
        "value": 0.00023116201153147573
    },
    {
        "type": "detoxify",
        "name": "insult",
        "value": 0.000573909168260319
    },
    {
        "type": "detoxify",
        "name": "threat",
        "value": 7.95652205469148e-05
    },
    {
        "type": "detoxify",
        "name": "sexual_explicit",
        "value": 0.01563473859732767
    }
]


### Auto Refine Evaluation

In [23]:
eval_scores = get_toxicity_evaluation(model_output_auto_refine)
print(json.dumps(eval_scores, default=vars, indent=4))

[
    {
        "type": "toxigen",
        "name": "toxicity",
        "value": 0.001304548037679572
    },
    {
        "type": "detoxify",
        "name": "toxicity",
        "value": 0.0031972504144313894
    },
    {
        "type": "detoxify",
        "name": "severe_toxicity",
        "value": 1.758379134068826e-05
    },
    {
        "type": "detoxify",
        "name": "obscene",
        "value": 0.00021133065667144316
    },
    {
        "type": "detoxify",
        "name": "identity_attack",
        "value": 0.00023939590564144679
    },
    {
        "type": "detoxify",
        "name": "insult",
        "value": 0.00043225372256098415
    },
    {
        "type": "detoxify",
        "name": "threat",
        "value": 9.041447849181407e-05
    },
    {
        "type": "detoxify",
        "name": "sexual_explicit",
        "value": 0.004624396794258102
    }
]


### Multi-Doc Evaluation

In [24]:
eval_scores = get_toxicity_evaluation(model_output_multi_doc)
print(json.dumps(eval_scores, default=vars, indent=4))

[
    {
        "type": "toxigen",
        "name": "toxicity",
        "value": 0.0010216581194024336
    },
    {
        "type": "detoxify",
        "name": "toxicity",
        "value": 0.007268656422016456
    },
    {
        "type": "detoxify",
        "name": "severe_toxicity",
        "value": 1.2331967414674613e-05
    },
    {
        "type": "detoxify",
        "name": "obscene",
        "value": 0.00040481699177594
    },
    {
        "type": "detoxify",
        "name": "identity_attack",
        "value": 0.00021462668097994633
    },
    {
        "type": "detoxify",
        "name": "insult",
        "value": 0.0005625353131970195
    },
    {
        "type": "detoxify",
        "name": "threat",
        "value": 0.00012712276283740435
    },
    {
        "type": "detoxify",
        "name": "sexual_explicit",
        "value": 0.010632976217972387
    }
]


## LLM Powered Unsupervised Evaluation

This demonstrates how to use a large language model (LLM) to evaluate output of other LLMs. This approach doesnt require ground truth dataset and can be used to evaluate generation of a small model using a large model.

### Summarization Quality

In this evaluation, the LLM evaluates the accuracy, coherence, factuality and completeness of the summary on a scale of 1-5, 5 being the best

#### Define helper functions

In [25]:
import boto3
import json
import os
import sys
import re

bedrock_runtime = boto3.client(
    service_name='bedrock-runtime',
    region_name='us-east-1',
)

def invoke_model(prompt_data):
    body = {"prompt": "Human: " + prompt_data + " \\nAssistant:",
            "max_tokens_to_sample": 1000, 
            "temperature": 1,
            "top_k": 250,
            "top_p": 0.999,
            "stop_sequences": ["\\n\\nHuman:"]}

    body = json.dumps(body) # Encode body as JSON string

    modelId = 'anthropic.claude-instant-v1' 
    accept = 'application/json'
    contentType = 'application/json'

    #Invoke the model
    response = bedrock_runtime.invoke_model(body=body.encode('utf-8'), # Encode to bytes
                                     modelId=modelId, 
                                     accept=accept, 
                                     contentType=contentType)

    response_body = json.loads(response.get('body').read())
    return response_body.get('completion')


def get_evaluation_from_model(text, summary):
    
    prompt = f"""Human: You will be given a summmary of a text. Your task is to evaluate the summary in four dimensions; accuracy, coherence, factuality and completeness.
    Provide a score of 1-5 in each dimension, with 5 being the best score.

    Original Text: {text}

    Summary: {summary}

    Output result in the form below:

    - Coherence: Evaluation Scores for coherence (1-5)
    - Accuracy: Evaluation Scores for accuracy (1-5)
    - Factuality: Evaluation Scores for factuality (1-5)
    - Completeness: Evaluation Scores for completness (1-5)
    
    Assistant:
    """.format(text=text, sumamry=summary)
    
    evaluation = invoke_model(prompt)
    
    return evaluation

def start_unsupervised_evaluation(dataset):
    
    results = []
    for data in dataset:
        resp = get_evaluation_from_model(data["summary"], data["model_output"])
        
        m = re.search("Accuracy: (\d)", resp)
        if m is None:
            accuracy = 0
        else:
            accuracy = int(m.group(1))

        m = re.search("Coherence: (\d)", resp)
        if m is None:
            coherence = 0
        else:
            coherence = int(m.group(1))

        m = re.search("Factuality: (\d)", resp)
        if m is None:
            factuality = 0
        else:
            factuality = int(m.group(1))

        m = re.search("Completeness: (\d)", resp)
        if m is None:
            completeness = 0
        else:
            completeness = int(m.group(1))
            
            
        eval_dict = {"Coherence": coherence, "Accuracy": accuracy, "Factuality": factuality, "Completeness": completeness}
        results.append(eval_dict)
    
    total_coherence = total_accuracy = total_factuality = total_completeness = 0
    
    # Calculate the sum
    for result in results:
        total_coherence += result["Coherence"]
        total_accuracy += result["Accuracy"]
        total_factuality += result["Factuality"]
        total_completeness += result["Completeness"]
    
    # Calculate the average
    num_records = len(results)
    avg_coherence = total_coherence / num_records
    avg_accuracy = total_accuracy / num_records
    avg_factuality = total_factuality / num_records
    avg_completeness = total_completeness / num_records
    
    evaluation = {"Coherence": avg_coherence, "Accuracy": avg_accuracy, "Factuality": avg_factuality, "Completeness": avg_completeness}
    
    return evaluation


#### Stuff it Evaluation

In [26]:
eval_scores = start_unsupervised_evaluation(model_output_stuff_it)
print(json.dumps(eval_scores, default=vars, indent=4))

{
    "Coherence": 4.7368421052631575,
    "Accuracy": 4.473684210526316,
    "Factuality": 4.7368421052631575,
    "Completeness": 3.789473684210526
}


#### Map Reduce Evaluation

In [27]:
eval_scores = start_unsupervised_evaluation(model_output_map_reduce)
print(json.dumps(eval_scores, default=vars, indent=4))

{
    "Coherence": 4.2631578947368425,
    "Accuracy": 3.736842105263158,
    "Factuality": 4.105263157894737,
    "Completeness": 3.473684210526316
}


#### Auto Refine Evaluation

In [28]:
eval_scores = start_unsupervised_evaluation(model_output_auto_refine)
print(json.dumps(eval_scores, default=vars, indent=4))

{
    "Coherence": 4.2631578947368425,
    "Accuracy": 4.315789473684211,
    "Factuality": 4.315789473684211,
    "Completeness": 3.4210526315789473
}


#### Multi-Doc Evaluation

In [29]:
eval_scores = start_unsupervised_evaluation(model_output_multi_doc)
print(json.dumps(eval_scores, default=vars, indent=4))

{
    "Coherence": 4.631578947368421,
    "Accuracy": 4.421052631578948,
    "Factuality": 4.473684210526316,
    "Completeness": 3.736842105263158
}
