## Install / import required libraries to facilitate model evaluation

In [None]:
!pip install evaluate
!pip install rouge-score
!pip install pandas
!pip install google-colab

Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24933 sha256=d376afb9a676a3e1f69ae8e909c891a4ae81bf98f278b8a2e4e2380c51ee7c4e
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2


In [None]:
from evaluate import load
from google.colab import drive
import pandas as pd

Mount Google Drive so the notebook can read the saved model outputs.

In [None]:
# Attach Google Drive to the runtime's filesystem; the result CSVs read
# further down live under this mount point.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

## Evaluate Models

Function to perform evaluations using the following metrics:
- BLEU (BiLingual Evaluation Understudy)
- ROUGE (Recall-Oriented Understudy for Gisting Evaluation)
- METEOR (Metric for Evaluation of Translation with Explicit ORdering)


In [None]:
def perform_eval(predictions, references, sources):
  """Print BLEU, ROUGE and METEOR scores for a set of translations.

  Args:
    predictions: Iterable of model output strings, one per example.
    references: Iterable of reference translations aligned one-to-one with
      `predictions`.
    sources: Source-language inputs. Accepted so call sites can pass them,
      but not used by any of the metrics computed here.

  Returns:
    None. Each metric's result is printed as soon as it is computed.

  Raises:
    ValueError: If `predictions` is empty, or if `predictions` and
      `references` differ in length.
  """
  # Materialise plain lists so any iterable behaves the same way: NumPy
  # arrays, generators, or a pandas Series whose index has gaps after dropna()
  # (where positional access such as series[0] could otherwise fail).
  predictions = list(predictions)
  references = list(references)

  # Validate up front so bad input fails with a clear message before any
  # metric is loaded.
  if not predictions:
    raise ValueError("predictions must not be empty")
  if len(predictions) != len(references):
    raise ValueError(
        "predictions and references must have the same length "
        f"(got {len(predictions)} and {len(references)})"
    )

  # (label printed, metric id passed to evaluate.load), in reporting order.
  metrics = (("BLEU", "bleu"), ("ROUGE", "rouge"), ("METEOR", "meteor"))
  for label, metric_id in metrics:
    metric = load(metric_id)
    results = metric.compute(predictions=predictions, references=references)
    print(f"{label}:", results)

Load the translations saved by the models trained in other notebooks, then evaluate each model.

In [None]:
# Directory on the mounted Drive that holds each model's saved translations.
RESULTS_DIR = "/content/drive/MyDrive/UNSW/COMP6713 Group Assignment/misc"

# Statistical model output.
# NOTE(review): unlike the three frames below, this one is not passed through
# dropna() — confirm that is intentional.
ibm_one_data = pd.read_csv(f"{RESULTS_DIR}/stat-translation-out.csv")

# NOTE(review): dropna() is applied to each file independently, so the models
# may be scored on different row sets (the recorded BLEU reference_length
# values differ between models) — confirm the comparison is like-for-like.
t5_data = pd.read_csv(f"{RESULTS_DIR}/en-fr-test-results-t5-model.csv").dropna()

helsinki_data = pd.read_csv(f"{RESULTS_DIR}/helsinki-translation-out.csv").dropna()

nllb_data = pd.read_csv(f"{RESULTS_DIR}/nllb-translation-out.csv").dropna()


In [None]:
# Score the statistical model: 'stat_translation' holds its output, 'en' the
# reference text and 'fr' the source text.
print("Statistical Model:")
perform_eval(
  predictions=ibm_one_data['stat_translation'].values,
  references=ibm_one_data['en'].values,
  sources=ibm_one_data['fr'].values,
)

Statistical Model:
BLEU: {'bleu': 0.02916412173023624, 'precisions': [0.38353896953881755, 0.0708208803900306, 0.015780692617548647, 0.004661847172488345], 'brevity_penalty': 0.7756846093763311, 'length_ratio': 0.7974422692284381, 'translation_length': 118413, 'reference_length': 148491}
ROUGE: {'rouge1': 0.32375128178543056, 'rouge2': 0.056588294305022546, 'rougeL': 0.27423169271737213, 'rougeLsum': 0.2742542408755668}


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


METEOR: {'meteor': 0.1931899360856956}


In [None]:
# Score the fine-tuned T5 model: 'translation' holds its output, 'en' the
# reference text and 'fr' the source text.
print("Fine-tuned T5 Model:")
perform_eval(
  predictions=t5_data['translation'].values,
  references=t5_data['en'].values,
  sources=t5_data['fr'].values,
)

Fine-tuned T5 Model:
BLEU: {'bleu': 0.2963954925590674, 'precisions': [0.5834296122113333, 0.3422585547050878, 0.23440211131038305, 0.16971498593083417], 'brevity_penalty': 0.9928084659591848, 'length_ratio': 0.9928342015060322, 'translation_length': 147142, 'reference_length': 148204}
ROUGE: {'rouge1': 0.5465061808642152, 'rouge2': 0.3150646265845904, 'rougeL': 0.4995183223027745, 'rougeLsum': 0.4995875553086412}


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


METEOR: {'meteor': 0.5281406234032721}


In [None]:
# Score the Helsinki-NLP Opus-MT model: 'helsinki' holds its output, 'en' the
# reference text and 'fr' the source text.
print("Helsinki-NLP Opus-MT Model:")
perform_eval(
  predictions=helsinki_data['helsinki'].values,
  references=helsinki_data['en'].values,
  sources=helsinki_data['fr'].values,
)

Helsinki-NLP Opus-MT Model:
BLEU: {'bleu': 0.28850070005561196, 'precisions': [0.5557266413314114, 0.3301631815318336, 0.2282812138545982, 0.16539693375843362], 'brevity_penalty': 1.0, 'length_ratio': 1.0473726469801052, 'translation_length': 152620, 'reference_length': 145717}
ROUGE: {'rouge1': 0.5965933286440637, 'rouge2': 0.3841863629809863, 'rougeL': 0.5526891472078707, 'rougeLsum': 0.5525795953645746}


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


METEOR: {'meteor': 0.587212337002679}


In [None]:
# Score the Facebook NLLB-200 model: 'nllb' holds its output, 'en' the
# reference text and 'fr' the source text.
# NOTE(review): the output recorded for this cell matches the Helsinki cell's
# output to every digit (BLEU, ROUGE and METEOR). Verify that the NLLB CSV
# really contains NLLB translations and that this cell was re-run after
# loading it.
print("Facebook NLLB-200 Model:")
perform_eval(
  predictions=nllb_data['nllb'].values,
  references=nllb_data['en'].values,
  sources=nllb_data['fr'].values,
)

Facebook NLLB-200 Model:
BLEU: {'bleu': 0.28850070005561196, 'precisions': [0.5557266413314114, 0.3301631815318336, 0.2282812138545982, 0.16539693375843362], 'brevity_penalty': 1.0, 'length_ratio': 1.0473726469801052, 'translation_length': 152620, 'reference_length': 145717}
ROUGE: {'rouge1': 0.5965933286440637, 'rouge2': 0.3841863629809863, 'rougeL': 0.5526891472078707, 'rougeLsum': 0.5525795953645746}


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


METEOR: {'meteor': 0.587212337002679}
