In [2]:
# Install Transformers and Metrics
!pip install transformers datasets evaluate sacrebleu bert_score accelerate -q

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/104.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/61.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [3]:
import nltk
# Download the NLTK data used by the METEOR metric up front.
# 'punkt_tab' is included alongside the legacy 'punkt' package: in this
# environment the metric loader fetched 'punkt_tab' on its own, which newer NLTK
# releases use in place of 'punkt'. Fetching both covers old and new versions.
for resource in ("wordnet", "punkt", "punkt_tab", "omw-1.4"):
    nltk.download(resource)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [4]:
import torch
import evaluate
from tqdm import tqdm
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

Using device: cuda


In [6]:
import csv

import pandas as pd
from datasets import Dataset

# 1. Download the raw data file directly
data_url = "https://huggingface.co/datasets/Helsinki-NLP/tatoeba_mt/resolve/main/test/tatoeba-test.eng-spa.tsv"

# The file is a plain tab-separated dump of sentences, so it is parsed literally:
#   - quoting=csv.QUOTE_NONE: a '"' inside a sentence is text, not a CSV quote.
#     With the default quoting, sentences starting with '"' lose their quotes and
#     an unbalanced quote can merge several rows into one.
#   - dtype=str + keep_default_na=False: sentences such as "NA" or "null" stay
#     strings instead of turning into NaN (which would break .strip() below).
df = pd.read_csv(
    data_url,
    sep="\t",
    header=None,
    names=["src_lang", "tgt_lang", "sourceString", "targetString"],
    quoting=csv.QUOTE_NONE,
    dtype=str,
    keep_default_na=False,
)

# 2. Convert to Hugging Face Dataset
base = Dataset.from_pandas(df)

# 3. Shuffle with seed 42 so every run evaluates the EXACT same 1000 sentences
N = 1000
if len(base) < N:
    raise ValueError(f"Expected at least {N} sentence pairs, found {len(base)}")
subset = base.shuffle(seed=42).select(range(N))

# 4. Preprocess: Swap columns to match the Spanish -> English task
# Note: In 'eng-spa', sourceString is English, targetString is Spanish.
# They are swapped because the model translates Spanish (input) -> English (target).
dataset = subset.map(
    lambda ex: {
        "src": ex["targetString"].strip(),  # Spanish
        "tgt": ex["sourceString"].strip()   # English
    },
    remove_columns=subset.column_names
)

print(f"Test Set Size: {len(dataset)}")
print("Example Input (Spa):", dataset[0]["src"])
print("Example Target (Eng):", dataset[0]["tgt"])

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Test Set Size: 1000
Example Input (Spa): El niño está bebiendo leche.
Example Target (Eng): The boy is drinking milk.


In [7]:
model_checkpoint = "facebook/nllb-200-distilled-600M"

# Tokenizer configured with the NLLB language codes for the
# Spanish (source) -> English (target) direction.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, src_lang="spa_Latn", tgt_lang="eng_Latn")

# Load the seq2seq model, then place it on the selected device.
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
model = model.to(device)

tokenizer_config.json:   0%|          | 0.00/564 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/4.85M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.3M [00:00<?, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/846 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.46G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.46G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

In [9]:
# Parameters for generation
batch_size = 16
predictions = []
references = []

# The id of the target-language token is the same for every batch, so resolve it
# once. It is looked up with convert_tokens_to_ids because the older
# tokenizer.lang_code_to_id mapping is not used here (it is unavailable in newer
# transformers releases).
forced_bos_token_id = tokenizer.convert_tokens_to_ids("eng_Latn")
if forced_bos_token_id is None or forced_bos_token_id == tokenizer.unk_token_id:
    # An unknown code would silently map to <unk> and generate in the wrong language.
    raise ValueError("Language code 'eng_Latn' is not in the tokenizer vocabulary")

print("Starting inference...")

# Loop through dataset in batches
for start in tqdm(range(0, len(dataset), batch_size)):
    # Slice once; the slice is a dict of columns holding the batch's rows
    batch = dataset[start : start + batch_size]
    batch_src = batch["src"]  # Spanish inputs
    batch_tgt = batch["tgt"]  # English references

    # Tokenize inputs
    inputs = tokenizer(batch_src, return_tensors="pt", padding=True, truncation=True).to(device)

    # Generate translations; forcing the first generated token to the English
    # language code selects the output language.
    with torch.no_grad():
        generated_ids = model.generate(
            **inputs,
            forced_bos_token_id=forced_bos_token_id,
            max_new_tokens=128
        )

    # Decode generated IDs to text
    batch_preds = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

    predictions.extend(batch_preds)
    references.extend(batch_tgt)

# Every source sentence must have exactly one prediction and one reference,
# otherwise the metrics computed afterwards would be misaligned.
assert len(predictions) == len(references) == len(dataset)

print("\nInference complete!")
print("Prediction 0:", predictions[0])
print("Reference 0:", references[0])

Starting inference...


100%|██████████| 63/63 [00:32<00:00,  1.93it/s]


Inference complete!
Prediction 0: The child is drinking milk.
Reference 0: The boy is drinking milk.





In [10]:
# Load the three evaluation metrics
metric_bleu = evaluate.load("sacrebleu")
metric_meteor = evaluate.load("meteor")
metric_bert = evaluate.load("bertscore")

print("Calculating metrics... (this might take a moment)")

# 1. BLEU
# Each prediction gets its own list of references: [['ref1'], ['ref2']]
bleu_refs = [[ref] for ref in references]
results_bleu = metric_bleu.compute(
    predictions=predictions,
    references=bleu_refs,
)

# 2. METEOR
results_meteor = metric_meteor.compute(
    predictions=predictions,
    references=references,
)

# 3. BERTScore (with lang="en" the run log shows 'roberta-large' being loaded)
results_bert = metric_bert.compute(
    predictions=predictions,
    references=references,
    lang="en",
)

# BERTScore returns one F1 value per sentence; report their mean.
bert_f1_mean = torch.tensor(results_bert["f1"]).mean()

# Print Final Results
rule = "-" * 30
print(rule)
print("NLLB-200 Model Results:")
print(f"BLEU Score:      {results_bleu['score']:.2f}")
print(f"METEOR Score:    {results_meteor['meteor']:.4f}")
print(f"BERTScore (F1):  {bert_f1_mean:.4f}")
print(rule)

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Downloading builder script: 0.00B [00:00, ?B/s]

Calculating metrics... (this might take a moment)


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


------------------------------
NLLB-200 Model Results:
BLEU Score:      58.96
METEOR Score:    0.8107
BERTScore (F1):  0.9763
------------------------------
