# Evaluation Metrics
## ROUGE

In [6]:
from datasets import load_dataset,load_metric
from transformers import pipeline
import torch

#### load any dataset

In [3]:
huggingface_dataset_name = "knkarthick/dialogsum"
dataset = load_dataset(huggingface_dataset_name, split="test[:10%]")

In [16]:
from transformers import AutoModelForSeq2SeqLM
from transformers import AutoTokenizer

model_name='google/flan-t5-base'
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

summarizer = pipeline("summarization",model=model,tokenizer=tokenizer)

In [4]:
dataset[0]

{'id': 'test_0_1',
 'summary': 'Ms. Dawson helps #Person1# to write a memo to inform every employee that they have to change the communication method and should not use Instant Messaging anymore.',
 'topic': 'communication method'}

In [20]:
rouge = load_metric("rouge")

# Calculate ROUGE scores for each example in the dataset
for example in dataset.select(range(3)) :
    reference_summary = example["summary"]
    generated_summary = summarizer(example["dialogue"])[0]["summary_text"]
    
    # Calculate ROUGE scores
    rouge_scores = rouge.compute(predictions=[generated_summary], references=[[reference_summary]])
    print(f"reference_summary :{reference_summary}")
    print(f"generated_summary :{generated_summary}")
    print(f"\n")
    print(f"ROUGE-1: {rouge_scores['rouge1'].mid.fmeasure}")
    print(f"ROUGE-2: {rouge_scores['rouge2'].mid.fmeasure}")
    print(f"ROUGE-L: {rouge_scores['rougeL'].mid.fmeasure}")

    # Calculate and print ROUGE LSUM
    rouge_lsum = (rouge_scores['rouge1'].mid.fmeasure + rouge_scores['rouge2'].mid.fmeasure + rouge_scores['rougeL'].mid.fmeasure) / 3
    print(f"ROUGE-LSUM: {rouge_lsum}")
    print("---------------------")

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


reference_summary :Ms. Dawson helps #Person1# to write a memo to inform every employee that they have to change the communication method and should not use Instant Messaging anymore.
generated_summary :#Person1#: Ms. Dawson, I need you to take a dictation for me. Is it OK to use instant messaging in this office?


ROUGE-1: 0.36734693877551017
ROUGE-2: 0.12765957446808512
ROUGE-L: 0.32653061224489793
ROUGE-LSUM: 0.2738457084961644
---------------------
reference_summary :In order to prevent employees from wasting time on Instant Message programs, #Person1# decides to terminate the use of those programs and asks Ms. Dawson to send out a memo to all employees by the afternoon.
generated_summary :#Person1#: Ms. Dawson, I need you to take a dictation for me. Is it OK to use instant messaging in this office?


ROUGE-1: 0.31034482758620685
ROUGE-2: 0.03571428571428571
ROUGE-L: 0.20689655172413793
ROUGE-LSUM: 0.18431855500821015
---------------------
reference_summary :Ms. Dawson takes a dicta

## BLEU 

In [23]:
import numpy as np
bleu = load_metric("sacrebleu")

# Calculate BLEU scores for each example in the dataset
for example in dataset.select(range(3)):
    reference_summary = example["summary"]
    generated_summary = summarizer(example["dialogue"])[0]["summary_text"]
    
    # Calculate BLEU scores
    bleu_output = bleu.compute(predictions=[generated_summary], references=[[reference_summary]])
    
    print(f"reference_summary :{reference_summary}")
    print(f"generated_summary :{generated_summary}")
    # Print BLEU scores
    print(f"BLEU: {bleu_output['score']}")

    # Calculate brevity penalty
    reference_length = len(reference_summary.split())
    generated_length = len(generated_summary.split())
    brevity_penalty = min(1, np.exp(1 - reference_length / generated_length))
    
    # Calculate BLEU score with brevity penalty
    bleu_score_with_bp = bleu_output['score'] * brevity_penalty
    
    # Print BLEU score with brevity penalty
    print(f"BLEU (with brevity penalty): {bleu_score_with_bp}")

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


reference_summary :Ms. Dawson helps #Person1# to write a memo to inform every employee that they have to change the communication method and should not use Instant Messaging anymore.
generated_summary :#Person1#: Ms. Dawson, I need you to take a dictation for me. Is it OK to use instant messaging in this office?
BLEU: 8.748599594163373
BLEU (with brevity penalty): 6.970039653378489
reference_summary :In order to prevent employees from wasting time on Instant Message programs, #Person1# decides to terminate the use of those programs and asks Ms. Dawson to send out a memo to all employees by the afternoon.
generated_summary :#Person1#: Ms. Dawson, I need you to take a dictation for me. Is it OK to use instant messaging in this office?
BLEU: 6.33328398876227
BLEU (with brevity penalty): 3.3516583823616477
reference_summary :Ms. Dawson takes a dictation for #Person1# about prohibiting the use of Instant Message programs in the office. They argue about its reasonability but #Person1# still 