In [1]:
! pip install transformers datasets nltk rouge-score



In [2]:
# ---------- Imports ----------
from transformers import MT5ForConditionalGeneration, MT5Tokenizer
from datasets import load_dataset
from nltk.translate.bleu_score import corpus_bleu
from rouge_score import rouge_scorer
import torch
import random

In [3]:
# ---------- Config ----------
HF_FINE_TUNED_REPO = "Eshan210352R/mt5-span-denoising-en-it-final"
BASE_MODEL_REPO = "google/mt5-small"
device = "cuda" if torch.cuda.is_available() else "cpu"

In [4]:
# ---------- Load Models ----------
tokenizer = MT5Tokenizer.from_pretrained(HF_FINE_TUNED_REPO)

fine_tuned_model = MT5ForConditionalGeneration.from_pretrained(HF_FINE_TUNED_REPO).to(device)
base_model = MT5ForConditionalGeneration.from_pretrained(BASE_MODEL_REPO).to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/893 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/416 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/757 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/142 [00:00<?, ?B/s]

In [5]:
# ---------- Load Dataset ----------
language_pair = "en-it"
subset_size = 200
opus100_dataset = load_dataset("opus100", language_pair)
validation_subset = opus100_dataset["validation"].select(range(subset_size))

In [6]:
# ---------- Apply Noise ----------
def apply_noise(text):
    words = text.split()
    noisy_words = []
    for word in words:
        if random.random() < 0.1:  # 10% deletion
            continue
        if random.random() < 0.1 and len(word) > 1:  # 10% char drop
            char_list = list(word)
            del char_list[random.randint(0, len(char_list) - 1)]
            word = "".join(char_list)
        noisy_words.append(word)
    if len(noisy_words) > 1 and random.random() < 0.05:  # occasional swap
        swap_index = random.randint(0, len(noisy_words) - 2)
        noisy_words[swap_index], noisy_words[swap_index + 1] = noisy_words[swap_index + 1], noisy_words[swap_index]
    return " ".join(noisy_words)

validation_subset = validation_subset.map(lambda ex: {
    "en_noisy": apply_noise(ex["translation"]["en"]),
    "it_noisy": apply_noise(ex["translation"]["it"]),
})


In [7]:
# ---------- Evaluation Function ----------
def evaluate_denoising(model, tokenizer, noisy_texts, clean_texts, max_length=512):
    model.eval()
    predictions, references = [], []

    for noisy, clean in zip(noisy_texts, clean_texts):
        inputs = tokenizer(noisy, return_tensors="pt", truncation=True, max_length=max_length).to(device)
        with torch.no_grad():
            output_ids = model.generate(**inputs, max_length=max_length)
        pred = tokenizer.decode(output_ids[0], skip_special_tokens=True)
        predictions.append(pred)
        references.append(clean)

    # BLEU
    references_tokenized = [[ref.split()] for ref in references]
    predictions_tokenized = [pred.split() for pred in predictions]
    bleu_score = corpus_bleu(references_tokenized, predictions_tokenized)

    # ROUGE
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    rouge_scores = {'rouge1':0, 'rouge2':0, 'rougeL':0}
    for ref, pred in zip(references, predictions):
        score = scorer.score(ref, pred)
        for k in rouge_scores:
            rouge_scores[k] += score[k].fmeasure
    for k in rouge_scores:
        rouge_scores[k] /= len(predictions)

    return bleu_score, rouge_scores

In [8]:
# ---------- Run Evaluation ----------
print("Evaluating denoising task...")

results = {}
for model_name, model in [("Base mT5", base_model), ("Fine-tuned mT5", fine_tuned_model)]:
    print(f"\n==== {model_name} ====")

    bleu_en, rouge_en = evaluate_denoising(
        model, tokenizer,
        validation_subset["en_noisy"],
        [x["en"] for x in validation_subset["translation"]]
    )

    bleu_it, rouge_it = evaluate_denoising(
        model, tokenizer,
        validation_subset["it_noisy"],
        [x["it"] for x in validation_subset["translation"]]
    )

    results[model_name] = {
        "EN": {"BLEU": bleu_en, "ROUGE1": rouge_en['rouge1'], "ROUGE-L": rouge_en['rougeL']},
        "IT": {"BLEU": bleu_it, "ROUGE1": rouge_it['rouge1'], "ROUGE-L": rouge_it['rougeL']},
    }

    print(f"EN -> BLEU: {bleu_en:.4f}, ROUGE1: {rouge_en['rouge1']:.4f}, ROUGE-L: {rouge_en['rougeL']:.4f}")
    print(f"IT -> BLEU: {bleu_it:.4f}, ROUGE1: {rouge_it['rouge1']:.4f}, ROUGE-L: {rouge_it['rougeL']:.4f}")

Evaluating denoising task...

==== Base mT5 ====


The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


EN -> BLEU: 0.0000, ROUGE1: 0.0009, ROUGE-L: 0.0009
IT -> BLEU: 0.0000, ROUGE1: 0.0020, ROUGE-L: 0.0020

==== Fine-tuned mT5 ====
EN -> BLEU: 0.0000, ROUGE1: 0.0045, ROUGE-L: 0.0045
IT -> BLEU: 0.0000, ROUGE1: 0.0016, ROUGE-L: 0.0016


In [9]:
# ---------- Compare Models ----------
print("\n==== Summary Comparison ====")
for lang in ["EN", "IT"]:
    print(f"\n{lang} Denoising:")
    for model_name in results:
        scores = results[model_name][lang]
        print(f"{model_name:15s} | BLEU: {scores['BLEU']:.4f}, ROUGE1: {scores['ROUGE1']:.4f}, ROUGE-L: {scores['ROUGE-L']:.4f}")


==== Summary Comparison ====

EN Denoising:
Base mT5        | BLEU: 0.0000, ROUGE1: 0.0009, ROUGE-L: 0.0009
Fine-tuned mT5  | BLEU: 0.0000, ROUGE1: 0.0045, ROUGE-L: 0.0045

IT Denoising:
Base mT5        | BLEU: 0.0000, ROUGE1: 0.0020, ROUGE-L: 0.0020
Fine-tuned mT5  | BLEU: 0.0000, ROUGE1: 0.0016, ROUGE-L: 0.0016
