# Translation Evaluation Notebook

This notebook evaluates machine translation (MT) quality using various NLP metrics:
- **BLEU**: Measures precision of n-grams.
- **METEOR**: Considers synonymy and stemming.
- **ROUGE**: Measures overlap with the reference translation (this notebook reports ROUGE-L).
- **COMET**: Neural-based evaluation metric.
- **BERTScore**: Uses contextual embeddings.

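Ensure that `multilingual_labelled_translated.csv` is available before running the notebook. The file is expected to contain the columns `body` (source text), `translation` (reference translation), and one column per MT system: `gpt_translation`, `deepL_translation`, and `aws_translation`.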


In [None]:
import json
from bert_score import score
import sacrebleu
from nltk.translate.meteor_score import meteor_score
from rouge_score import rouge_scorer
from comet import download_model, load_from_checkpoint
import csv


In [None]:
def eval_bert_score(references, hypothesis):
    """Score hypotheses against references with BERTScore.

    Args:
        references: Reference translations (passed to ``score`` as the
            second positional argument).
        hypothesis: Candidate translations (passed to ``score`` as the
            first positional argument).

    Returns:
        A dict with the keys ``'P'``, ``'R'`` and ``'F'`` holding the
        precision, recall and F1 results as plain Python lists.
    """
    precision, recall, f1 = score(
        hypothesis, references, model_type="microsoft/deberta-xlarge-mnli"
    )
    labelled = (('P', precision), ('R', recall), ('F', f1))
    # .numpy().tolist() yields plain lists so the result can be JSON-dumped.
    return {label: values.numpy().tolist() for label, values in labelled}


In [None]:
def eval_meteor(references, hypothesis):
    """Compute one METEOR score per hypothesis.

    Both strings of a pair are tokenized by splitting on single spaces, and
    the hypothesis at position ``i`` is compared with ``references[i]`` as
    its only reference.

    Args:
        references: Reference translations, indexed in step with
            ``hypothesis``.
        hypothesis: Candidate translations.

    Returns:
        A list with one METEOR score per entry of ``hypothesis``.
    """
    return [
        meteor_score([references[idx].split(' ')], candidate.split(' '))
        for idx, candidate in enumerate(hypothesis)
    ]


In [None]:
def eval_bleu(references, hypothesis):
    """Compute one BLEU score per hypothesis with sacrebleu.

    Each hypothesis/reference pair is scored on its own by handing it to
    ``sacrebleu.corpus_bleu`` as a one-sentence corpus with a single
    reference stream.

    Args:
        references: Reference translations, indexed in step with
            ``hypothesis``.
        hypothesis: Candidate translations.

    Returns:
        A list with the ``.score`` value of each pair's BLEU result.
    """
    return [
        sacrebleu.corpus_bleu([candidate], [[references[idx]]]).score
        for idx, candidate in enumerate(hypothesis)
    ]


In [None]:
def eval_comet(references, hypothesis, sources):
    """Run the ``wmt20-comet-da`` COMET model over source/MT/reference triples.

    Args:
        references: Reference translations.
        hypothesis: Machine-translated outputs.
        sources: Source-language texts.

    Returns:
        Whatever ``model.predict`` returns for the assembled samples.
    """
    model = load_from_checkpoint(download_model("wmt20-comet-da"))

    # One record per segment; zip stops at the shortest of the three inputs.
    samples = []
    for source_text, mt_text, reference_text in zip(sources, hypothesis, references):
        samples.append({"src": source_text, "mt": mt_text, "ref": reference_text})

    # gpus=0 requests CPU inference.
    return model.predict(samples, batch_size=2, gpus=0, num_workers=1)


In [None]:
def eval_rouge(references, hypothesis):
    """Compute ROUGE-L (with stemming enabled) for every reference/hypothesis pair.

    Args:
        references: Reference translations; one result is produced per entry.
        hypothesis: Candidate translations, indexed in step with
            ``references``.

    Returns:
        A list with the ``scorer.score`` result for each pair, the reference
        being passed first and the hypothesis second.
    """
    scorer = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=True)
    return [
        scorer.score(reference, hypothesis[idx])
        for idx, reference in enumerate(references)
    ]


In [None]:
# Load the evaluation data: every CSV row becomes a dict keyed by the header.
file_path = "multilingual_labelled_translated.csv"  # Replace with your file path
with open(file_path, mode="r", newline="", encoding="utf-8") as file:
    reader = csv.DictReader(file)  # Iterates over the rows as dictionaries
    data = list(reader)

# CSV columns holding the output of each MT system under evaluation.
mt_apis = [
    'gpt_translation',
    'deepL_translation',
    'aws_translation',
]


## BLEU Evaluation

In [None]:
import os

# Score each MT system's output with BLEU and write the per-segment scores to
# <system>_evaluation/BLEU_evaluation.json.

# The references are the same for every MT system, so collect them once.
ref = [item['translation'] for item in data]

for mt in mt_apis:
    # e.g. 'gpt_translation' -> 'gpt_evaluation' (output directory name)
    drctr = mt.replace('translation', 'evaluation')
    print("Evaluating BLEU for", mt)

    # Create the output directory up front; open() below fails if it is missing.
    os.makedirs(drctr, exist_ok=True)

    # A missing column, or a short CSV row that DictReader filled with None,
    # is scored as an empty hypothesis.
    hypo = [item.get(mt) or '' for item in data]

    res = eval_bleu(ref, hypo)
    with open(f'{drctr}/BLEU_evaluation.json', 'w', encoding='utf-8') as f:
        json.dump(res, f, indent=4)

print("BLEU evaluation complete.")


## METEOR Evaluation

In [None]:
import os

# Score each MT system's output with METEOR and write the per-segment scores
# to <system>_evaluation/METEOR_evaluation.json.

# The references are the same for every MT system, so collect them once.
ref = [item['translation'] for item in data]

for mt in mt_apis:
    # e.g. 'gpt_translation' -> 'gpt_evaluation' (output directory name)
    drctr = mt.replace('translation', 'evaluation')
    print("Evaluating METEOR for", mt)

    # Create the output directory up front; open() below fails if it is missing.
    os.makedirs(drctr, exist_ok=True)

    # A missing column, or a short CSV row that DictReader filled with None,
    # is scored as an empty hypothesis (None would crash on .split()).
    hypo = [item.get(mt) or '' for item in data]

    res = eval_meteor(ref, hypo)

    with open(f'{drctr}/METEOR_evaluation.json', 'w', encoding='utf-8') as f:
        json.dump(res, f, indent=4)

print("METEOR evaluation complete.")


## ROUGE Evaluation

In [None]:
import os

# Score each MT system's output with ROUGE and write the per-segment results
# to <system>_evaluation/ROUGE_evaluation.json.

# The references are the same for every MT system, so collect them once.
ref = [item['translation'] for item in data]

for mt in mt_apis:
    # e.g. 'gpt_translation' -> 'gpt_evaluation' (output directory name)
    drctr = mt.replace('translation', 'evaluation')
    print("Evaluating ROUGE for", mt)

    # Create the output directory up front; open() below fails if it is missing.
    os.makedirs(drctr, exist_ok=True)

    # A missing column, or a short CSV row that DictReader filled with None,
    # is scored as an empty hypothesis.
    hypo = [item.get(mt) or '' for item in data]

    res = eval_rouge(ref, hypo)

    with open(f'{drctr}/ROUGE_evaluation.json', 'w', encoding='utf-8') as f:
        json.dump(res, f, indent=4)

print("ROUGE evaluation complete.")


## BERTScore Evaluation


In [None]:
import os

# Score each MT system's output with BERTScore and write the results to
# <system>_evaluation/BERTScore_evaluation.json.

# The references are the same for every MT system, so collect them once.
ref = [item['translation'] for item in data]

for mt in mt_apis:
    # e.g. 'gpt_translation' -> 'gpt_evaluation' (output directory name)
    drctr = mt.replace('translation', 'evaluation')
    print("Evaluating BERTScore for", mt)

    # Create the output directory up front; open() below fails if it is missing.
    os.makedirs(drctr, exist_ok=True)

    # A missing column, or a short CSV row that DictReader filled with None,
    # is scored as an empty hypothesis.
    hypo = [item.get(mt) or '' for item in data]

    res = eval_bert_score(ref, hypo)

    with open(f'{drctr}/BERTScore_evaluation.json', 'w', encoding='utf-8') as f:
        json.dump(res, f, indent=4)

print("BERTScore evaluation complete.")


## COMET Evaluation
Run COMET on its own: comment out the other metric imports in cell #1 and keep only the COMET-specific import (plus whatever the data-loading and saving code still needs, such as `csv` and `json`).
A method override causes an error when COMET is run together with all the other imports from above.

In [None]:
import os

# Score each MT system's output with COMET and write the predictions to
# <system>_evaluation/COMET_evaluation.json.

# References and source texts are the same for every MT system, so collect
# them once instead of on every iteration.
ref = [item['translation'] for item in data]
sources = [item['body'] for item in data]

for mt in mt_apis:
    # e.g. 'gpt_translation' -> 'gpt_evaluation' (output directory name)
    drctr = mt.replace('translation', 'evaluation')
    print("Evaluating COMET for", mt)

    # Create the output directory up front; open() below fails if it is missing.
    os.makedirs(drctr, exist_ok=True)

    # A missing column, or a short CSV row that DictReader filled with None,
    # is scored as an empty hypothesis.
    hypo = [item.get(mt) or '' for item in data]

    res = eval_comet(ref, hypo, sources)

    with open(f'{drctr}/COMET_evaluation.json', 'w', encoding='utf-8') as f:
        json.dump(res, f, indent=4)

print("COMET evaluation complete.")