# Import Libraries

In [1]:
!pip install evaluate --quiet
!pip install rouge_score --quiet
!pip install git+https://github.com/google-research/bleurt.git --quiet

import os

import evaluate
import numpy as np
import pandas as pd

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m18.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m14.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m17.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for BLEURT (setup.py) ... [?25l[?25hdone


# Load Metrics

In [2]:
# ROUGE and BLEU are loaded with their default configuration, in that order.
rouge, bleu = (evaluate.load(metric_id) for metric_id in ("rouge", "bleu"))
# BLEURT is loaded with the "bleurt-20" configuration.
bleurt = evaluate.load("bleurt", config_name="bleurt-20")

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/5.20k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.14G [00:00<?, ?B/s]

Computing checksums: 100%|##########| 1/1 [00:05<00:00,  5.86s/it]

# Define Function to Calculate Scores

In [3]:
def calcScores(filename, models=("sm_baseline", "sm_finetuned", "lg_baseline", "lg_finetuned")):
    """Score every model's outputs in a results CSV and save the scores as CSV.

    The results file must contain a ``reference`` column and one
    ``<model>_output`` column per entry in ``models``. For each model, ROUGE,
    BLEU and BLEURT are computed against the references using the module-level
    ``rouge``, ``bleu`` and ``bleurt`` metric objects. One row per model
    (``model``, ``bleu``, ``bleurt``, ``rouge1``, ``rouge2``, ``rougeL``) is
    written next to the input file, with "results" replaced by "scores" in the
    file name only.

    Args:
        filename: Path to the results CSV.
        models: Model name prefixes to score; the default reproduces the
            original four hard-coded models, in the original order.

    Returns:
        None. The scores are written to disk.

    Raises:
        KeyError: If ``reference`` or any ``<model>_output`` column is missing.
    """
    print("Now processing:", filename)

    results_df = pd.read_csv(filename)

    # Fail before any (slow) metric computation if a column is missing.
    # Unused columns such as a saved index ("Unnamed: 0") are simply ignored,
    # so files written with or without an index both work.
    required_columns = ["reference"] + [f"{model}_output" for model in models]
    missing_columns = [col for col in required_columns if col not in results_df.columns]
    if missing_columns:
        raise KeyError(f"{filename} is missing required column(s): {missing_columns}")

    references = _as_text_list(results_df["reference"])

    score_rows = []
    for model in models:
        predictions = _as_text_list(results_df[f"{model}_output"])
        score_rows.append({"model": model, **_score_model(predictions, references)})

    score_df = pd.DataFrame(score_rows)
    # The default index column is kept so the output layout is unchanged from
    # what this function wrote before.
    score_df.to_csv(_scores_path(filename))


def _as_text_list(column):
    """Return the column's values as a list of strings, mapping missing values to ""."""
    # An empty CSV field is read back by pandas as NaN (a float), which is not
    # valid text input for the metrics.
    # NOTE(review): treating a missing reference as "" is an assumption -- confirm.
    return ["" if pd.isna(value) else str(value) for value in column]


def _score_model(predictions, references):
    """Compute BLEU, mean BLEURT and ROUGE-1/2/L for one model's predictions."""
    rouge_scores = rouge.compute(predictions=predictions, references=references)
    bleu_scores = bleu.compute(predictions=predictions, references=references)
    bleurt_scores = bleurt.compute(predictions=predictions, references=references)
    return {
        "bleu": bleu_scores["bleu"],
        # BLEURT returns a list under "scores"; report its mean.
        "bleurt": np.mean(bleurt_scores["scores"]),
        "rouge1": rouge_scores["rouge1"],
        "rouge2": rouge_scores["rouge2"],
        "rougeL": rouge_scores["rougeL"],
    }


def _scores_path(filename):
    """Build the output path by renaming only the file name, never the directories."""
    directory, basename = os.path.split(os.fspath(filename))
    scores_name = basename.replace("results", "scores")
    if scores_name == basename:
        # No "results" in the name: add a suffix rather than overwrite the input.
        stem, extension = os.path.splitext(basename)
        scores_name = f"{stem}_scores{extension}"
    return os.path.join(directory, scores_name)

# Calculate Scores for Each Results File

In [4]:
# Colab-specific step: this import requires the google.colab package.
from google.colab import drive
# Mount Google Drive at /content/drive; the results CSVs scored below are read
# from paths under /content/drive/MyDrive.
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
# Results files to score, processed one after another in this order.
results_files = [
    "/content/drive/MyDrive/W266 Final Project/t5_results_2_1.0.csv",
    "/content/drive/MyDrive/W266 Final Project/t5_results_5_1.0.csv",
    "/content/drive/MyDrive/W266 Final Project/t5_results_2_1.5.csv",
    "/content/drive/MyDrive/W266 Final Project/t5_results_5_1.5.csv",
]

for results_file in results_files:
    calcScores(results_file)

Now processing: /content/drive/MyDrive/W266 Final Project/t5_results_2_1.0.csv
Now processing: /content/drive/MyDrive/W266 Final Project/t5_results_5_1.0.csv
Now processing: /content/drive/MyDrive/W266 Final Project/t5_results_2_1.5.csv
Now processing: /content/drive/MyDrive/W266 Final Project/t5_results_5_1.5.csv
