In [17]:
# ! pip install -q transformers datasets evaluate sentencepiece huggingface_hub

In [18]:
# ! pip install rouge_score

In [19]:
# ---------- Imports ----------
from transformers import MT5ForConditionalGeneration, MT5Tokenizer
from datasets import load_dataset, concatenate_datasets
from nltk.translate.bleu_score import corpus_bleu
from rouge_score import rouge_scorer
import torch

In [20]:
# ---------- Config ----------
# Hugging Face Hub repo id of the fine-tuned checkpoint; it is passed to
# `from_pretrained` for both the fine-tuned model and its tokenizer.
HF_FINE_TUNED_REPO = "Eshan210352R/mt5-small-denoising-en-fr-final-2"
# Hub id of the checkpoint loaded as the base model for comparison.
BASE_MODEL = "google/mt5-small"

In [21]:
# Run on the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [22]:
# ---------- Load Models ----------
# Fine-tuned checkpoint: fetch the weights, then move them onto the target device.
fine_tuned_model = MT5ForConditionalGeneration.from_pretrained(HF_FINE_TUNED_REPO)
fine_tuned_model = fine_tuned_model.to(device)
# The tokenizer is loaded from the same repo id as the fine-tuned weights.
fine_tuned_tokenizer = MT5Tokenizer.from_pretrained(HF_FINE_TUNED_REPO)

In [23]:
# Base checkpoint, loaded the same way so both models can be compared side by side.
base_model = MT5ForConditionalGeneration.from_pretrained(BASE_MODEL)
base_model = base_model.to(device)
# NOTE: this load logs a tokenizer-class mismatch warning (the checkpoint
# declares 'T5Tokenizer' while the calling class is 'MT5Tokenizer').
base_tokenizer = MT5Tokenizer.from_pretrained(BASE_MODEL)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'T5Tokenizer'. 
The class this function is called from is 'MT5Tokenizer'.


In [24]:
# ---------- Load Evaluation Dataset ----------
language_pair = "en-fr"
subset_size = 100  # maximum number of validation examples to evaluate
opus100_dataset = load_dataset("opus100", language_pair)
validation_split = opus100_dataset["validation"]
# Clamp the request to the split size so that an oversized `subset_size`
# cannot make `select` fail on out-of-range indices.
validation_subset = validation_split.select(range(min(subset_size, len(validation_split))))

In [25]:
# Apply noise (same as training)
import random
def apply_noise(text, word_drop_prob=0.1, char_drop_prob=0.1, swap_prob=0.05, rng=None):
    """Return a randomly corrupted copy of ``text``.

    The text is split on whitespace and each word is processed in order:

    1. with probability ``word_drop_prob`` the word is dropped entirely;
    2. otherwise, with probability ``char_drop_prob``, one randomly chosen
       character is removed (only from words longer than one character).

    Afterwards, with probability ``swap_prob``, one random pair of adjacent
    surviving words is swapped (only when at least two words remain).

    Args:
        text: Input string.
        word_drop_prob: Per-word probability of deleting the word.
        char_drop_prob: Per-word probability of deleting one character.
        swap_prob: Probability of swapping one adjacent word pair.
        rng: Optional object exposing ``random()`` and ``randint(a, b)``
            (e.g. ``random.Random(seed)``). When ``None`` the global
            ``random`` module is used, so ``random.seed`` still applies.

    Returns:
        The surviving words joined by single spaces; ``""`` when no word
        survives or ``text`` contains no words.
    """
    rand = random if rng is None else rng
    noisy_words = []
    for word in text.split():
        if rand.random() < word_drop_prob:
            continue
        # The random draw happens before the length check on purpose: it keeps
        # the number of draws per word independent of the word's length.
        if rand.random() < char_drop_prob and len(word) > 1:
            drop_at = rand.randint(0, len(word) - 1)
            word = word[:drop_at] + word[drop_at + 1:]
        noisy_words.append(word)
    if len(noisy_words) > 1 and rand.random() < swap_prob:
        left = rand.randint(0, len(noisy_words) - 2)
        noisy_words[left], noisy_words[left + 1] = noisy_words[left + 1], noisy_words[left]
    return " ".join(noisy_words)

In [26]:
# Seed the global RNG that `apply_noise` draws from so the noisy inputs (and
# every metric computed from them) can be reproduced on a fresh kernel.
NOISE_SEED = 42
random.seed(NOISE_SEED)
validation_subset = validation_subset.map(lambda ex: {
    "en_noisy": apply_noise(ex["translation"]["en"]),
    "fr_noisy": apply_noise(ex["translation"]["fr"]),
})

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [27]:
# ---------- Evaluation Function ----------
def evaluate_model(model, tokenizer, src_texts, tgt_texts, max_length=512):
    """Generate a prediction for every source text and score it with BLEU and ROUGE.

    Args:
        model: Model exposing ``eval()``, ``generate(...)`` and ``device``.
        tokenizer: Tokenizer used to encode sources and decode generated ids.
        src_texts: Iterable of source strings.
        tgt_texts: Iterable of reference strings, aligned with ``src_texts``.
        max_length: Truncation limit for the encoded source and ``max_length``
            passed to ``generate``.

    Returns:
        ``(bleu_score, rouge_scores, predictions, references)`` where
        ``bleu_score`` is the corpus BLEU over whitespace tokens,
        ``rouge_scores`` maps ``'rouge1'``/``'rouge2'``/``'rougeL'`` to the
        mean F-measure, and the last two items are lists of strings.
        For empty input all scores are ``0.0`` and both lists are empty.

    Raises:
        ValueError: If ``src_texts`` and ``tgt_texts`` differ in length
            (``zip`` would otherwise silently drop the surplus examples).
    """
    src_texts = list(src_texts)
    references = list(tgt_texts)
    if len(src_texts) != len(references):
        raise ValueError(
            f"src_texts and tgt_texts must have the same length "
            f"(got {len(src_texts)} and {len(references)})"
        )

    model.eval()
    rouge_keys = ('rouge1', 'rouge2', 'rougeL')
    if not src_texts:
        # Nothing to score: avoid dividing by zero when averaging ROUGE.
        return 0.0, {k: 0.0 for k in rouge_keys}, [], []

    # Send inputs to wherever the model lives instead of relying on a
    # notebook-level `device` variable.
    model_device = model.device
    predictions = []
    with torch.no_grad():
        for src in src_texts:
            inputs = tokenizer(src, return_tensors="pt", truncation=True, max_length=max_length).to(model_device)
            output_ids = model.generate(**inputs, max_length=max_length)
            predictions.append(tokenizer.decode(output_ids[0], skip_special_tokens=True))

    # Compute BLEU on whitespace tokens, one reference per hypothesis.
    # No smoothing is applied, so the score is 0 whenever some n-gram order
    # has no overlap at all.
    references_tokenized = [[ref.split()] for ref in references]
    predictions_tokenized = [pred.split() for pred in predictions]
    bleu_score = corpus_bleu(references_tokenized, predictions_tokenized)

    # Compute ROUGE: mean F-measure over all (reference, prediction) pairs.
    scorer = rouge_scorer.RougeScorer(list(rouge_keys), use_stemmer=True)
    rouge_scores = {k: 0 for k in rouge_keys}
    for ref, pred in zip(references, predictions):
        score = scorer.score(ref, pred)
        for k in rouge_scores:
            rouge_scores[k] += score[k].fmeasure
    for k in rouge_scores:
        rouge_scores[k] /= len(predictions)

    return bleu_score, rouge_scores, predictions, references

In [28]:
# ---------- Prepare Source and Target Texts ----------

# Sources are the noised columns added by the map step.
src_texts_en_fr = validation_subset["en_noisy"]
src_texts_fr_en = validation_subset["fr_noisy"]

# Targets are the clean sentences of the opposite language, pulled out of
# each "translation" record.
tgt_texts_en_fr = [pair["fr"] for pair in validation_subset["translation"]]
tgt_texts_fr_en = [pair["en"] for pair in validation_subset["translation"]]


In [29]:
# ---------- Evaluate Fine-Tuned Model ----------
print("Evaluating Fine-Tuned MT5...")
# Noisy English sources scored against the French references.
bleu_en_fr_ft, rouge_en_fr_ft, _, _ = evaluate_model(
    model=fine_tuned_model,
    tokenizer=fine_tuned_tokenizer,
    src_texts=src_texts_en_fr,
    tgt_texts=tgt_texts_en_fr,
)
# Noisy French sources scored against the English references.
bleu_fr_en_ft, rouge_fr_en_ft, _, _ = evaluate_model(
    model=fine_tuned_model,
    tokenizer=fine_tuned_tokenizer,
    src_texts=src_texts_fr_en,
    tgt_texts=tgt_texts_fr_en,
)

Evaluating Fine-Tuned MT5...


In [30]:
# ---------- Evaluate Base Model ----------
print("Evaluating Base MT5...")
# Same two directions as for the fine-tuned model, using the base checkpoint.
bleu_en_fr_base, rouge_en_fr_base, _, _ = evaluate_model(
    model=base_model,
    tokenizer=base_tokenizer,
    src_texts=src_texts_en_fr,
    tgt_texts=tgt_texts_en_fr,
)
bleu_fr_en_base, rouge_fr_en_base, _, _ = evaluate_model(
    model=base_model,
    tokenizer=base_tokenizer,
    src_texts=src_texts_fr_en,
    tgt_texts=tgt_texts_fr_en,
)

Evaluating Base MT5...


The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


In [31]:
# ---------- Print Comparison ----------
# One print call per direction; `sep` supplies the line breaks between rows.
print(
    "\n==== EN -> FR ====",
    f"Fine-Tuned BLEU: {bleu_en_fr_ft:.4f}, ROUGE1: {rouge_en_fr_ft['rouge1']:.4f}",
    f"Base BLEU: {bleu_en_fr_base:.4f}, ROUGE1: {rouge_en_fr_base['rouge1']:.4f}",
    sep="\n",
)

print(
    "\n==== FR -> EN ====",
    f"Fine-Tuned BLEU: {bleu_fr_en_ft:.4f}, ROUGE1: {rouge_fr_en_ft['rouge1']:.4f}",
    f"Base BLEU: {bleu_fr_en_base:.4f}, ROUGE1: {rouge_fr_en_base['rouge1']:.4f}",
    sep="\n",
)


==== EN -> FR ====
Fine-Tuned BLEU: 0.0087, ROUGE1: 0.1920
Base BLEU: 0.0000, ROUGE1: 0.0013

==== FR -> EN ====
Fine-Tuned BLEU: 0.0122, ROUGE1: 0.2997
Base BLEU: 0.0000, ROUGE1: 0.0019


In [32]:
# ---------- Perplexity Evaluation Function ----------
from torch.nn import CrossEntropyLoss
import math

In [33]:
def evaluate_perplexity(model, tokenizer, src_texts, tgt_texts, max_length=512):
    """Compute the token-level perplexity of the targets given the sources.

    Each (source, target) pair is run through the model one at a time. The
    summed cross-entropy over all non-padding target tokens is divided by the
    number of those tokens and exponentiated.

    Args:
        model: Callable model exposing ``eval()`` and ``device`` and returning
            an object with ``.logits`` when called with ``input_ids``,
            ``attention_mask`` and ``labels``.
        tokenizer: Tokenizer with a ``pad_token_id`` attribute; used for both
            sources and targets.
        src_texts: Iterable of source strings.
        tgt_texts: Iterable of target strings, aligned with ``src_texts``.
        max_length: Truncation limit applied to both sources and targets.

    Returns:
        The perplexity as a float. ``float('nan')`` when no target token was
        scored (e.g. empty input), ``float('inf')`` when the mean loss is too
        large for ``math.exp``.

    Raises:
        ValueError: If ``src_texts`` and ``tgt_texts`` differ in length
            (``zip`` would otherwise silently drop the surplus examples).
    """
    src_texts = list(src_texts)
    tgt_texts = list(tgt_texts)
    if len(src_texts) != len(tgt_texts):
        raise ValueError(
            f"src_texts and tgt_texts must have the same length "
            f"(got {len(src_texts)} and {len(tgt_texts)})"
        )

    model.eval()
    # Send inputs to wherever the model lives instead of relying on a
    # notebook-level `device` variable.
    model_device = model.device
    pad_id = tokenizer.pad_token_id
    loss_fct = CrossEntropyLoss(ignore_index=pad_id, reduction='sum')
    total_loss = 0.0
    total_tokens = 0

    with torch.no_grad():
        for src, tgt in zip(src_texts, tgt_texts):
            # Encode source and target
            inputs = tokenizer(src, return_tensors="pt", truncation=True, max_length=max_length).to(model_device)
            labels = tokenizer(tgt, return_tensors="pt", truncation=True, max_length=max_length).to(model_device)["input_ids"]

            outputs = model(input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"], labels=labels)
            # outputs.loss is an average; recompute the loss as a sum so it
            # can be normalised by the global token count below.
            logits = outputs.logits
            loss = loss_fct(logits.view(-1, logits.size(-1)), labels.view(-1))

            total_loss += loss.item()
            total_tokens += (labels != pad_id).sum().item()

    if total_tokens == 0:
        # Perplexity is undefined without any scored token.
        return float("nan")
    try:
        return math.exp(total_loss / total_tokens)
    except OverflowError:
        return float("inf")


In [34]:
# ---------- Evaluate Perplexity ----------
# Progress banner printed before the perplexity computations in the next cells.
print("\nEvaluating Perplexity...")


Evaluating Perplexity...


In [35]:
# EN -> FR: perplexity of the French references given the noisy English sources.
ppl_en_fr_ft = evaluate_perplexity(
    model=fine_tuned_model,
    tokenizer=fine_tuned_tokenizer,
    src_texts=src_texts_en_fr,
    tgt_texts=tgt_texts_en_fr,
)
ppl_en_fr_base = evaluate_perplexity(
    model=base_model,
    tokenizer=base_tokenizer,
    src_texts=src_texts_en_fr,
    tgt_texts=tgt_texts_en_fr,
)

In [36]:
# FR -> EN: perplexity of the English references given the noisy French sources.
ppl_fr_en_ft = evaluate_perplexity(
    model=fine_tuned_model,
    tokenizer=fine_tuned_tokenizer,
    src_texts=src_texts_fr_en,
    tgt_texts=tgt_texts_fr_en,
)
ppl_fr_en_base = evaluate_perplexity(
    model=base_model,
    tokenizer=base_tokenizer,
    src_texts=src_texts_fr_en,
    tgt_texts=tgt_texts_fr_en,
)

In [37]:
# ---------- Print Comparison ----------
# Single print call; `sep` supplies the line breaks between rows.
print(
    "\n==== Perplexity Comparison ====",
    f"EN -> FR | Fine-Tuned: {ppl_en_fr_ft:.4f}, Base: {ppl_en_fr_base:.4f}",
    f"FR -> EN | Fine-Tuned: {ppl_fr_en_ft:.4f}, Base: {ppl_fr_en_base:.4f}",
    sep="\n",
)


==== Perplexity Comparison ====
EN -> FR | Fine-Tuned: 13.5817, Base: 72017926346.2707
FR -> EN | Fine-Tuned: 20.2271, Base: 49482446559.1583
