In [1]:
# %pip installs into the environment of the running kernel (plain !pip may hit a
# different interpreter). Pinned to 0.1.2 so re-runs score with the same release.
%pip install rouge-score==0.1.2

Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24933 sha256=90a0cb512e8aab170cac039898dd1f704db787733a270bc79f2bffadf39976e5
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2


In [2]:
from rouge_score import rouge_scorer

In [3]:
# One scorer for unigram (rouge1), bigram (rouge2) and longest-common-subsequence
# (rougeL) overlap; tokens are stemmed before matching.
scorer = rouge_scorer.RougeScorer(
    ['rouge1', 'rouge2', 'rougeL'],
    use_stemmer=True,
)

In [4]:
# Toy pair for a quick sanity check: the candidate only adds the word "found".
reference_summary = "the cat was under the bed"
candidate_summary = "the cat was found under the bed"

# score() takes the reference (target) first and the candidate (prediction) second.
scores = scorer.score(reference_summary, candidate_summary)
for metric, result in scores.items():
    print(f'{metric}: {result}')

rouge1: Score(precision=0.8571428571428571, recall=1.0, fmeasure=0.923076923076923)
rouge2: Score(precision=0.6666666666666666, recall=0.8, fmeasure=0.7272727272727272)
rougeL: Score(precision=0.8571428571428571, recall=1.0, fmeasure=0.923076923076923)


# ROUGE scores after fine-tuning

In [5]:
import pandas as pd
from rouge_score import rouge_scorer

# ROUGE variant -> heading used when printing; dict order fixes the report order.
ROUGE_LABELS = {'rouge1': 'ROUGE-1', 'rouge2': 'ROUGE-2', 'rougeL': 'ROUGE-L'}


def rouge_f1_summary(frame, reference_col, candidate_col):
    """Summarise per-row ROUGE F1 scores of one column against another.

    Every row is scored with ``reference_col`` as the target text and
    ``candidate_col`` as the prediction, using stemmed ROUGE-1/2/L.

    Args:
        frame: DataFrame holding one reference/candidate pair per row.
        reference_col: Name of the column with the reference summaries.
        candidate_col: Name of the column with the generated summaries.

    Returns:
        dict mapping each key of ``ROUGE_LABELS`` to ``(mean, stddev)`` of the
        F1 scores, where ``stddev`` is the population standard deviation
        (divides by the number of scored rows).

    Raises:
        KeyError: if either column is not present in ``frame``.
        ValueError: if no row has both a reference and a candidate.
    """
    # The scorer only accepts text, so a missing cell (NaN) would crash it
    # mid-loop; drop incomplete pairs up front and say so.
    pairs = frame[[reference_col, candidate_col]].dropna()
    skipped = len(frame) - len(pairs)
    if skipped:
        print(f"Skipped {skipped} row(s) with a missing summary\n")
    if pairs.empty:
        raise ValueError(
            f"no rows with both {reference_col!r} and {candidate_col!r} to score"
        )

    scorer = rouge_scorer.RougeScorer(list(ROUGE_LABELS), use_stemmer=True)

    f1_by_metric = {metric: [] for metric in ROUGE_LABELS}
    for reference, candidate in zip(pairs[reference_col], pairs[candidate_col]):
        result = scorer.score(reference, candidate)
        for metric, values in f1_by_metric.items():
            values.append(result[metric].fmeasure)

    summary = {}
    for metric, values in f1_by_metric.items():
        mean = sum(values) / len(values)
        variance = sum((value - mean) ** 2 for value in values) / len(values)
        summary[metric] = (mean, variance ** 0.5)
    return summary


# Load the fine-tuned model's outputs and score them against the ground truth
finetuned_df = pd.read_csv('finetuned-summaries-1.csv')
finetuned_summary = rouge_f1_summary(finetuned_df, 'Ground_Truth', 'Predicted_Summary')

# Print the results
for metric, (mean, stddev) in finetuned_summary.items():
    print(f"{ROUGE_LABELS[metric]}:")
    print(f"Mean: {mean}")
    print(f"Standard Deviation: {stddev}\n")


ROUGE-1:
Mean: 0.4707764798272591
Standard Deviation: 0.2709681156861192

ROUGE-2:
Mean: 0.29445298834031414
Standard Deviation: 0.2847002323914824

ROUGE-L:
Mean: 0.44088446398783226
Standard Deviation: 0.2736027004423736



In [7]:
import pandas as pd
from rouge_score import rouge_scorer

# Load the dataset
df = pd.read_csv('zero-shot-summaries-1.csv')

# The scorer only accepts text, so a missing cell (NaN) would crash it mid-loop.
# Score only the rows where both the reference and the candidate are present.
scored_rows = df[['Summary', 'ExtractedSummary']].dropna()
n_missing = len(df) - len(scored_rows)
if n_missing:
    print(f"Skipped {n_missing} row(s) with a missing summary\n")
if scored_rows.empty:
    # Fail with a clear message instead of a ZeroDivisionError in the means below.
    raise ValueError(
        "zero-shot-summaries-1.csv has no rows with both 'Summary' and 'ExtractedSummary'"
    )

# Initialize ROUGE scorer
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

# Per-row F1 scores, one list per ROUGE variant
rouge1_scores = []
rouge2_scores = []
rougeL_scores = []

# score() takes the reference (target) first and the candidate (prediction) second
for reference_summary, candidate_summary in zip(
    scored_rows['Summary'], scored_rows['ExtractedSummary']
):
    scores = scorer.score(reference_summary, candidate_summary)

    rouge1_scores.append(scores['rouge1'].fmeasure)
    rouge2_scores.append(scores['rouge2'].fmeasure)
    rougeL_scores.append(scores['rougeL'].fmeasure)

# Mean and population standard deviation (divides by the number of scored rows)
n_scored = len(scored_rows)

rouge1_mean = sum(rouge1_scores) / n_scored
rouge2_mean = sum(rouge2_scores) / n_scored
rougeL_mean = sum(rougeL_scores) / n_scored

rouge1_stddev = (sum((x - rouge1_mean) ** 2 for x in rouge1_scores) / n_scored) ** 0.5
rouge2_stddev = (sum((x - rouge2_mean) ** 2 for x in rouge2_scores) / n_scored) ** 0.5
rougeL_stddev = (sum((x - rougeL_mean) ** 2 for x in rougeL_scores) / n_scored) ** 0.5

# Print the results
for label, mean, stddev in [
    ("ROUGE-1", rouge1_mean, rouge1_stddev),
    ("ROUGE-2", rouge2_mean, rouge2_stddev),
    ("ROUGE-L", rougeL_mean, rougeL_stddev),
]:
    print(f"{label}:")
    print(f"Mean: {mean}")
    print(f"Standard Deviation: {stddev}\n")


ROUGE-1:
Mean: 0.2945120223357154
Standard Deviation: 0.16896329158638548

ROUGE-2:
Mean: 0.1085033862575723
Standard Deviation: 0.12796306892280102

ROUGE-L:
Mean: 0.255755703156373
Standard Deviation: 0.1597617451682135

