In [None]:
!pip -q uninstall -y datasets
!pip -q install "datasets<4.0.0"


In [None]:
!pip uninstall -y datasets -q


In [None]:
!pip install -q \
  datasets==2.19.2 \
  transformers sentencepiece accelerate \
  sacrebleu nltk bert-score


In [None]:
from datasets import load_dataset

# Load the "eng-spa" configuration of Tatoeba MT, caching files under cache_dir.
ds = load_dataset(
    path="Helsinki-NLP/tatoeba_mt",
    name="eng-spa",
    cache_dir="/content/hf_cache",
    trust_remote_code=True,
    # Turn off verification checks; works around a split size mismatch.
    verification_mode="no_checks",
)

# Show the available split names, then the full dataset summary.
print("Splits:", list(ds))
print(ds)


In [None]:
# Prefer the training split; fall back to validation when there is no "train".
base_split = "train" if "train" in ds else "validation"
base = ds[base_split]

# Requested size of the evaluation subset.
N = 1000

# Clamp to the split length so select() never receives out-of-range indices
# when the chosen split holds fewer than N rows.
subset_size = min(N, len(base))

# Fixed seed keeps the sampled subset identical across runs.
subset = base.shuffle(seed=42).select(range(subset_size))

print("Using split:", base_split)
print("Base size:", len(base))
print("Subset size:", len(subset))
print("Example raw row:", subset[0])


In [None]:
def _to_src_tgt(ex):
    """Turn one raw row into a whitespace-stripped src/tgt pair."""
    return {
        "src": ex["targetString"].strip(),  # Spanish
        "tgt": ex["sourceString"].strip(),  # English
    }


# Keep only the two derived columns; every original column is dropped.
processed = subset.map(_to_src_tgt, remove_columns=subset.column_names)

print(processed[0])


In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

model_name = "facebook/m2m100_418M"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Use the GPU when torch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
model = model.to(device)
print("Device:", device)

# Language codes: the tokenizer encodes Spanish input, and generation is
# forced to start with the English language token.
tokenizer.src_lang = "es"
forced_bos_token_id = tokenizer.get_lang_id("en")
print("forced_bos_token_id:", forced_bos_token_id)


In [None]:
def m2m_translate_one(text, max_len=128, num_beams=4):
    """Translate a single text with the module-level `model` and `tokenizer`.

    The translation direction comes from the module-level `tokenizer.src_lang`
    and `forced_bos_token_id`.

    Args:
        text: Source text to translate.
        max_len: Token limit used both to truncate the input and to cap the
            generated output.
        num_beams: Value forwarded to `model.generate` as `num_beams`.

    Returns:
        The first generated sequence decoded to a string, special tokens removed.
    """
    encoded = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=max_len,
    )
    encoded = encoded.to(device)

    generation_args = {
        "max_length": max_len,
        "num_beams": num_beams,
        "forced_bos_token_id": forced_bos_token_id,
    }
    # Inference only: no gradients are needed.
    with torch.no_grad():
        generated = model.generate(**encoded, **generation_args)

    first_sequence = generated[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)

# Spot-check the first five examples: source, model output, and reference.
for idx in range(5):
    example = processed[idx]
    src, ref = example["src"], example["tgt"]
    pred = m2m_translate_one(src)
    print("\nSRC:", src)
    print("PRED:", pred)
    print("REF:", ref)


In [None]:
from tqdm.auto import tqdm

def m2m_translate_batch(texts, batch_size=16, max_len=128, num_beams=4):
    """Translate a sequence of texts in fixed-size batches.

    Uses the module-level `model`, `tokenizer`, `device` and
    `forced_bos_token_id`; progress is reported through tqdm.

    Args:
        texts: Sliceable sequence of source strings.
        batch_size: Number of texts encoded and generated per step.
        max_len: Token limit used both to truncate inputs and to cap outputs.
        num_beams: Value forwarded to `model.generate` as `num_beams`.

    Returns:
        A list of decoded translations, one per input text, in input order.
    """
    translations = []
    batch_starts = range(0, len(texts), batch_size)

    for start in tqdm(batch_starts):
        chunk = texts[start:start + batch_size]

        # Pad to the longest item in the chunk so it forms one tensor batch.
        encoded = tokenizer(
            chunk,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=max_len,
        )
        encoded = encoded.to(device)

        # Inference only: no gradients are needed.
        with torch.no_grad():
            generated = model.generate(
                **encoded,
                max_length=max_len,
                num_beams=num_beams,
                forced_bos_token_id=forced_bos_token_id,
            )

        decoded = tokenizer.batch_decode(generated, skip_special_tokens=True)
        translations += decoded

    return translations

# Read each column once instead of materialising a full row dict per index.
# list() guarantees plain Python lists whatever container the column access returns.
src_texts = list(processed["src"])
refs = list(processed["tgt"])

# Translate the whole subset; predictions stay aligned with src_texts/refs.
preds_m2m = m2m_translate_batch(src_texts, batch_size=16)
print("Pred count:", len(preds_m2m))
print("Example:", src_texts[0], "=>", preds_m2m[0])


In [None]:
import pandas as pd

# One row per example: Spanish source, English reference, model prediction.
df = pd.DataFrame(
    data=dict(
        src_es=src_texts,
        ref_en=refs,
        pred_m2m100_es_en=preds_m2m,
    )
)

# Save the predictions to CSV without the index column.
df.to_csv(path_or_buf="m2m100_es_en_preds.csv", encoding="utf-8", index=False)
df.head()


In [None]:
import sacrebleu
# Corpus-level BLEU with a single reference stream (one reference per hypothesis).
bleu_result = sacrebleu.corpus_bleu(preds_m2m, [refs])
bleu = bleu_result.score
print("M2M-100 BLEU:", bleu)


In [None]:
import nltk
# Download the NLTK resources requested for METEOR scoring.
for resource in ("wordnet", "omw-1.4"):
    nltk.download(resource)

from nltk.translate.meteor_score import meteor_score

# Score each whitespace-tokenised (reference, prediction) pair, then average
# over the number of references.
sentence_scores = [
    meteor_score([reference.split()], hypothesis.split())
    for reference, hypothesis in zip(refs, preds_m2m)
]
meteor = sum(sentence_scores) / len(refs)

print("M2M-100 METEOR (%):", meteor * 100)


In [None]:
from bert_score import score

# Per-sentence precision, recall and F1 for predictions against references.
P, R, F1 = score(cands=preds_m2m, refs=refs, lang="en", verbose=True)
print("M2M-100 BERTScore F1:", float(F1.mean()))


In [None]:
# One-line recap of all three metrics for the M2M-100 run.
summary_line = f"M2M-100 | BLEU: {bleu:.2f} | METEOR(%): {meteor*100:.2f} | BERTScore F1: {F1.mean().item():.4f}"
print(summary_line)
