# Multilingual Translator (mBART-50)
A compact notebook to fine-tune a single multilingual model that can translate between all 10 languages.

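This setup is CPU-friendly because it keeps training tiny. If you have more time or a GPU, raise `MAX_STEPS` (training stops at this step count regardless of `NUM_EPOCHS`) and raise `SAMPLE_ROWS` together with `MAX_TRAIN_PAIRS` (the pair list is truncated to `MAX_TRAIN_PAIRS`).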

## Step 1: Install dependencies
Run this once if needed.

In [4]:
# Installs the notebook's dependencies; skip this cell if they are already present.
# %pip (rather than !pip) is used so the install targets the kernel's environment.
# NOTE(review): versions are unpinned, so a later re-run may pull different
# releases -- consider pinning (pkg==X.Y.Z) for reproducibility.
%pip install -q transformers datasets sentencepiece accelerate sacrebleu protobuf tiktoken

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.2 -> 26.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


## Step 2: Imports and configuration

In [5]:
import random
import os
import pandas as pd
import torch
from datasets import load_dataset, Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    set_seed,
    )

# Checkpoint name used to load the tokenizer and the model.
MODEL_NAME = "facebook/mbart-large-50-many-to-many-mmt"
# Number of dataset rows sampled before language pairs are built.
SAMPLE_ROWS = 500  # smaller subset for CPU
# Upper bound on how many built pairs are kept for training.
MAX_TRAIN_PAIRS = 500
# Token limit (with truncation) applied to source and target text when tokenizing.
MAX_LEN = 48
# Passed to the trainer as num_train_epochs.
# NOTE(review): a positive max_steps takes precedence over num_train_epochs in
# the Hugging Face Trainer, so MAX_STEPS is the effective limit -- confirm.
NUM_EPOCHS = 1
MAX_STEPS = 100  # hard cap on steps
# Per-device training batch size and gradient accumulation steps.
BATCH_SIZE = 1
GRAD_ACCUM_STEPS = 1
# Seed shared by set_seed, the row sampling and the language-pair selection.
SEED = 42

# Prefer a GPU when one is visible to torch; otherwise run on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
set_seed(SEED)
print(f"Using device: {device}")

Using device: cpu


## Step 3: Load dataset and build multilingual pairs
Each example randomly picks a source language and a different target language. This gives coverage across all languages.

In [6]:
# Dataset language columns mapped to their mBART-50 language codes.
# Key order is significant: it defines the order of `languages`, which the
# seeded random pair selection indexes into.
mbart_lang_map = {
    "en_US": "en_XX",
    "de_DE": "de_DE",
    "hi_IN": "hi_IN",
    "es_ES": "es_XX",
    "fr_FR": "fr_XX",
    "it_IT": "it_IT",
    "ar_SA": "ar_AR",
    "nl_NL": "nl_XX",
    "ja_JP": "ja_XX",
    "pt_PT": "pt_XX",
}

# The dataset-side codes, in the same order as the mapping above.
languages = list(mbart_lang_map)

ds = load_dataset("Amani27/massive_translation_dataset")
train_df = pd.DataFrame(ds["train"])

# Down-sample only when the split holds more rows than requested.
if len(train_df) > SAMPLE_ROWS:
    train_df = train_df.sample(n=SAMPLE_ROWS, random_state=SEED)

def _is_missing_text(value):
    """Return True when a cell holds no usable text (None/NaN/NA or only whitespace)."""
    # Test for missing values before stringifying: str(NaN) is "nan" and
    # str(None) is "None", both of which would look like real text.
    if pd.api.types.is_scalar(value) and pd.isna(value):
        return True
    return not str(value).strip()


def build_pairs(df, seed=SEED):
    """Build one random (source, target) translation pair per row of ``df``.

    For each row a source language is drawn from ``languages`` and a different
    target language is drawn from the remaining ones, using a private
    ``random.Random(seed)`` so the selection is reproducible. Both draws happen
    for every row, including skipped ones, so the sequence of language choices
    does not depend on the data.

    Rows whose selected source or target cell is missing or blank are skipped.

    Args:
        df: DataFrame with one text column per entry in ``languages``.
        seed: Seed for the language selection.

    Returns:
        A list of dicts with keys ``src_text``, ``tgt_text``, ``src_lang`` and
        ``tgt_lang``; the language values are taken from ``mbart_lang_map``.
    """
    rnd = random.Random(seed)
    pairs = []
    for _, row in df.iterrows():
        src_lang = rnd.choice(languages)
        tgt_lang = rnd.choice([l for l in languages if l != src_lang])
        src_value = row[src_lang]
        tgt_value = row[tgt_lang]
        if _is_missing_text(src_value) or _is_missing_text(tgt_value):
            continue
        pairs.append({
            "src_text": str(src_value),
            "tgt_text": str(tgt_value),
            "src_lang": mbart_lang_map[src_lang],
            "tgt_lang": mbart_lang_map[tgt_lang],
        })
    return pairs

# Build the pairs and keep at most MAX_TRAIN_PAIRS of them; slicing a shorter
# list simply returns all of its elements.
train_pairs = build_pairs(train_df)[:MAX_TRAIN_PAIRS]
train_dataset = Dataset.from_list(train_pairs)
print(f"Training pairs: {len(train_dataset)}")

Training pairs: 500


## Step 4: Tokenization

In [7]:
# Load the tokenizer that belongs to MODEL_NAME; use_fast=False requests the
# non-"fast" tokenizer implementation.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=False)

def preprocess(example):
    """Tokenize a single translation pair into model inputs plus ``labels``.

    Args:
        example: Mapping with ``src_text``, ``tgt_text``, ``src_lang`` and
            ``tgt_lang`` entries.

    Returns:
        The tokenizer output for the source text, with the target token ids
        stored under ``"labels"``. Both sides are truncated to ``MAX_LEN``.
    """
    # Point the shared tokenizer at this example's language pair before encoding.
    tokenizer.src_lang, tokenizer.tgt_lang = example["src_lang"], example["tgt_lang"]

    encode_opts = {"max_length": MAX_LEN, "truncation": True}
    features = tokenizer(example["src_text"], **encode_opts)
    target_encoding = tokenizer(text_target=example["tgt_text"], **encode_opts)

    features["labels"] = target_encoding["input_ids"]
    return features

# Map one example at a time (no batched=True): preprocess sets the tokenizer's
# source/target language per example.
tokenized_train = train_dataset.map(preprocess, remove_columns=train_dataset.column_names)
# The collator's `model` argument expects a loaded model object, not a
# checkpoint name. A string was silently ignored there, so omitting it keeps
# the collation unchanged while removing the misleading argument.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer)
print("Tokenization complete")

Map: 100%|██████████| 500/500 [00:00<00:00, 2051.29 examples/s]

Tokenization complete





## Step 5: Train (lightweight fine-tune)

In [8]:
# Load the pretrained weights and move the model to the selected device.
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME).to(device)

# Directory that receives the fine-tuned model and tokenizer.
output_dir = "models/mbart_multilingual"
os.makedirs(output_dir, exist_ok=True)

training_args = Seq2SeqTrainingArguments(
    output_dir=output_dir,
    num_train_epochs=NUM_EPOCHS,
    # A positive max_steps takes precedence over num_train_epochs.
    max_steps=MAX_STEPS,
    per_device_train_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRAD_ACCUM_STEPS,
    learning_rate=5e-5,
    logging_steps=20,
    # No intermediate checkpoints; the final model is saved explicitly below.
    save_strategy="no",
    report_to="none",
    # Mixed precision only when a CUDA device is available.
    fp16=torch.cuda.is_available(),
    predict_with_generate=False,
    # Use the notebook-wide seed. Without this the Trainer applies its own
    # default seed, so changing SEED would not change the training run.
    seed=SEED,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    data_collator=data_collator,
)

trainer.train()

# Persist the model and tokenizer together so they can be reloaded from output_dir.
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)
print(f"Model saved to: {output_dir}")

Loading weights: 100%|██████████| 516/516 [00:01<00:00, 515.55it/s, Materializing param=model.shared.weight]                                   
  super().__init__(loader)


Step,Training Loss
20,3.569272
40,3.356655
60,2.476489
80,2.810697
100,2.864709


Writing model shards: 100%|██████████| 1/1 [00:20<00:00, 20.91s/it]


Model saved to: models/mbart_multilingual


## Step 6: Translate between any languages

In [10]:
from transformers import MBart50Tokenizer

# Load the saved model/tokenizer if a checkpoint exists in output_dir;
# otherwise fall back to the base model.
saved_weight_files = (
    "model.safetensors",
    "pytorch_model.bin",
    # A sharded checkpoint has no single weights file; it is identified by its
    # index file instead.
    "model.safetensors.index.json",
    "pytorch_model.bin.index.json",
)
has_saved_model = any(
    os.path.isfile(os.path.join(output_dir, file_name))
    for file_name in saved_weight_files
)
load_path = output_dir if has_saved_model else MODEL_NAME

model = AutoModelForSeq2SeqLM.from_pretrained(load_path).to(device)
tokenizer = MBart50Tokenizer.from_pretrained(load_path)

# Switch to evaluation mode for inference.
model.eval()

def translate(text, src_lang, tgt_lang, max_length=64):
    """Translate ``text`` from ``src_lang`` to ``tgt_lang`` with the loaded model.

    Args:
        text: Source text to translate.
        src_lang: Source language, given as a key of ``mbart_lang_map``.
        tgt_lang: Target language, given as a key of ``mbart_lang_map``.
        max_length: Used both as the truncation limit for the input and as the
            ``max_length`` passed to generation.

    Returns:
        The first decoded sequence, with special tokens removed.

    Raises:
        KeyError: If ``src_lang`` or ``tgt_lang`` is not in ``mbart_lang_map``.
    """
    source_code = mbart_lang_map[src_lang]
    target_code = mbart_lang_map[tgt_lang]

    # Set the source language on the tokenizer before encoding the input.
    tokenizer.src_lang = source_code
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=max_length,
    ).to(device)

    # The target language token is forced as the first generated token.
    generation_kwargs = {
        "forced_bos_token_id": tokenizer.convert_tokens_to_ids(target_code),
        "max_length": max_length,
        "num_beams": 4,
    }

    with torch.no_grad():
        output_ids = model.generate(**inputs, **generation_kwargs)

    decoded = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
    return decoded[0]

# Demo inputs as (text, source language, target language) tuples.
examples = [
    ("Hello, how are you?", "en_US", "es_ES"),
    ("I love programming.", "en_US", "hi_IN"),
    ("The weather is beautiful today.", "en_US", "fr_FR"),
    ("What time is it?", "en_US", "ja_JP"),
    ("Thank you very much.", "en_US", "de_DE"),
]

# Translate each demo sentence and print it next to its language pair.
for text, src, tgt in examples:
    translation = translate(text, src, tgt)
    print(f"{src} -> {tgt}: {translation}")

Loading weights: 100%|██████████| 516/516 [00:01<00:00, 486.46it/s, Materializing param=model.shared.weight]                                   


en_US -> es_ES: Hola, como está?
en_US -> hi_IN: मैं प्रोग्रामिंग को पसंद करता हूँ
en_US -> fr_FR: Le temps est beau aujourd'hui.
en_US -> ja_JP: 何時ですか?
en_US -> de_DE: Vielen Dank.
