Step 1 — Install and import libraries

In [None]:
!pip install transformers datasets sentencepiece sacrebleu --quiet



[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[?25h

Step 2 — Import libraries and configure languages

In [None]:
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
from datasets import Dataset, DatasetDict
import torch
import os
import random

# Short language tags mapped to the language codes handed to the tokenizer.
# NOTE(review): "pa_IN" does not seem to be in the mBART-50 language list —
# confirm against the model card before relying on it.
LANGS = dict(
    hi="hi_IN",
    mr="mr_IN",
    ta="ta_IN",
    te="te_IN",
    gu="gu_IN",
    pa="pa_IN",
    bn="bn_IN",
    en="en_XX",
)

# Translation direction used by the cells below (both are keys of LANGS)
SRC_LANG = "hi"  # example: Hindi
TGT_LANG = "mr"  # example: Marathi

# Prefer the GPU when one is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print("Using device:", device)


Using device: cuda


Step 2b — Load model and tokenizer

In [None]:
# Checkpoint identifier on the Hugging Face Hub
model_name = "facebook/mbart-large-50-many-to-many-mmt"

# Tokenizer that matches the checkpoint
tokenizer = MBart50TokenizerFast.from_pretrained(model_name)

# Load the weights, then move them to the device selected earlier
model = MBartForConditionalGeneration.from_pretrained(model_name)
model = model.to(device)

# Tell the tokenizer which language the input sentences are written in
tokenizer.src_lang = LANGS[SRC_LANG]


tokenizer_config.json:   0%|          | 0.00/529 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/649 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/261 [00:00<?, ?B/s]

Step 3 — Prepare tiny Indic→Indic dataset

In [None]:
# Example small dataset of (source, target) sentence pairs.
# Sources are Hindi and targets are Marathi, matching SRC_LANG = "hi" and
# TGT_LANG = "mr". A source that is already Marathi (or a pair whose two sides
# are identical) teaches the model nothing about the hi → mr direction.
train_pairs = [
    ("आप कैसे हैं?", "तुम्ही कसे आहात?"),
    ("मेरा नाम अंकिता है।", "माझं नाव अंकिता आहे."),
    ("यह एक सुंदर फूल है।", "हे एक सुंदर फूल आहे."),
]

dev_pairs = [
    ("आप कहाँ जाते हैं?", "तुम्ही कुठे जाता?"),
]

test_pairs = [
    ("मैं स्कूल जा रहा हूँ।", "मी शाळेत जात आहे."),
]

# Convert to Hugging Face Dataset
def make_hf_dataset(pairs):
    """Wrap (source, target) text pairs in a Hugging Face ``Dataset``.

    Each pair becomes one row with a ``translation`` field: a dict keyed by
    the module-level ``SRC_LANG`` and ``TGT_LANG`` values.
    """
    rows = []
    for src_text, tgt_text in pairs:
        rows.append({"translation": {SRC_LANG: src_text, TGT_LANG: tgt_text}})
    return Dataset.from_list(rows)

# Build one Dataset per split, in train / dev / test order
train_ds, dev_ds, test_ds = (
    make_hf_dataset(split_pairs)
    for split_pairs in (train_pairs, dev_pairs, test_pairs)
)

# Bundle the splits under named keys and print a summary
hf_ds = DatasetDict({"train": train_ds, "validation": dev_ds, "test": test_ds})
print(hf_ds)



DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 3
    })
    validation: Dataset({
        features: ['translation'],
        num_rows: 1
    })
    test: Dataset({
        features: ['translation'],
        num_rows: 1
    })
})


Step 4 — Tokenize dataset

In [None]:
# Length (in tokens) that the tokenizer call in `tokenize` truncates and pads to
MAX_LEN = 128

# The tokenizer expects full mBART-50 language codes such as "hi_IN", not the
# short "hi" / "mr" keys, so map SRC_LANG and TGT_LANG through LANGS first.
tokenizer.src_lang = LANGS[SRC_LANG]
tokenizer.tgt_lang = LANGS[TGT_LANG]

def tokenize(batch):
    """Tokenize one batch of translation records into model inputs and labels.

    Source and target texts are read from ``batch["translation"]`` using the
    module-level ``SRC_LANG`` / ``TGT_LANG`` keys. Falsy entries (``None``,
    ``""``) are replaced by a single space rather than dropped, so the row
    count is unchanged.

    Returns:
        The tokenizer output, with every padding position in ``labels``
        replaced by -100.
    """
    def as_text(value):
        # Falsy values become a single space; everything else is stringified
        return str(value) if value else " "

    records = batch["translation"]
    sources = [as_text(rec[SRC_LANG]) for rec in records]
    targets = [as_text(rec[TGT_LANG]) for rec in records]

    # Encode sources and targets in one call; the targets land under "labels"
    encoded = tokenizer(
        sources,
        text_target=targets,
        max_length=MAX_LEN,
        truncation=True,
        padding="max_length",
    )

    # Swap padding token ids in the labels for -100
    pad_id = tokenizer.pad_token_id
    masked_labels = []
    for seq in encoded["labels"]:
        masked_labels.append([-100 if tok == pad_id else tok for tok in seq])
    encoded["labels"] = masked_labels

    return encoded

# Tokenize every split in batched mode (train, validation, test — in that order)
tokenized_train, tokenized_valid, tokenized_test = (
    split.map(tokenize, batched=True) for split in (train_ds, dev_ds, test_ds)
)

print("Tokenization complete!")


Map:   0%|          | 0/3 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Tokenization complete!


Step 5 — Metrics (BLEU)

In [None]:
# First, install evaluate (if not already)
!pip install evaluate

# Import the library
import evaluate

# Load BLEU metric
bleu = evaluate.load("sacrebleu")

# Compute metrics function
def compute_metrics(eval_preds):
    """Score generated translations against the references with sacreBLEU.

    Args:
        eval_preds: ``(predictions, labels)`` pair of token-id sequences.
            ``predictions`` may itself be a tuple, in which case its first
            item is used.

    Returns:
        dict: ``{"bleu": score}`` where ``score`` is the ``"score"`` field of
        the sacreBLEU result.
    """
    preds, labels = eval_preds

    # Keep only the first element when predictions arrive as a tuple
    if isinstance(preds, tuple):
        preds = preds[0]

    pad_id = tokenizer.pad_token_id

    def unmask(sequences):
        # -100 is a padding/ignore marker, not a vocabulary id, so it cannot be
        # decoded; map it back to the pad token, which decoding then skips.
        return [[pad_id if tok == -100 else tok for tok in seq] for seq in sequences]

    # Predictions can be padded with -100 as well as labels, so unmask both
    decoded_preds = tokenizer.batch_decode(unmask(preds), skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(unmask(labels), skip_special_tokens=True)

    # Stray leading/trailing whitespace should not affect the score
    decoded_preds = [text.strip() for text in decoded_preds]

    # BLEU expects a list of reference lists (one list per prediction)
    references = [[text.strip()] for text in decoded_labels]

    result = bleu.compute(predictions=decoded_preds, references=references)

    return {"bleu": result["score"]}


Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.6


Downloading builder script: 0.00B [00:00, ?B/s]

Step 6 — Fine-tune model

In [None]:
!pip install --upgrade transformers datasets




In [None]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

# Hyperparameters for a short demonstration run on the toy dataset.
# NOTE(review): the cells shown here never build a Seq2SeqTrainer, so these
# arguments have no effect until one is created with them and trained.
training_args = Seq2SeqTrainingArguments(
    output_dir="./mbart50-indic-indic-small",
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    num_train_epochs=3,
    learning_rate=5e-5,
    weight_decay=0.01,
    save_total_limit=1,
    predict_with_generate=True,  # needed for metrics
    logging_steps=1,
    # eval_steps is only honoured when evaluation runs on a step schedule.
    # (The argument was named `evaluation_strategy` before transformers 4.41.)
    eval_strategy="steps",
    eval_steps=1,
)


Step 7 — Test translations

In [None]:
# Multilingual Indian Languages Translation using mBART50
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
import torch

# 1. Device setup
device = "cuda" if torch.cuda.is_available() else "cpu"

# 2. Load model and tokenizer
# This loads the checkpoint from the Hub again, so the translations below come
# from the published weights rather than from any model object created earlier.
model_name = "facebook/mbart-large-50-many-to-many-mmt"
tokenizer = MBart50TokenizerFast.from_pretrained(model_name)
model = MBartForConditionalGeneration.from_pretrained(model_name).to(device)

# 3. Language mapping (short tag -> language code handed to the tokenizer)
# NOTE(review): "pa_IN" and "kn_IN" do not seem to be in the mBART-50 language
# list — confirm against the model card; pairs using them cannot be expected
# to translate correctly.
LANGS = {
    "hi": "hi_IN",
    "mr": "mr_IN",
    "ta": "ta_IN",
    "te": "te_IN",
    "gu": "gu_IN",
    "pa": "pa_IN",
    "bn": "bn_IN",
    "kn": "kn_IN",
    "ml": "ml_IN"
}

# 4. List of test cases (SRC_LANG, TGT_LANG, sentences)
# Every sentence list must be written in the language named by its source tag;
# the "hi" sentences are Hindi, not Marathi.
test_cases = [
    ("hi", "mr", ["आप कैसे हैं?", "यह एक सुंदर फूल है।", "मेरा नाम अंकिता है।"]),
    ("ta", "te", ["நீங்கள் எப்படி இருக்கிறீர்கள்?", "இது ஒரு அழகான பூவாகும்.", "என் பெயர் அன்கிதா."]),
    # Disabled: presumably because "pa_IN" is not a language code this checkpoint knows.
    #("gu", "pa", ["તમે કેમ છો?", "આ એક સુંદર ફૂલ છે.", "મારું નામ અંકિતા છે."]),
    ("bn", "mr", ["আপনি কেমন আছেন?", "এটি একটি সুন্দর ফুল।", "আমার নাম অঙ্কিতা।"]),
    ("kn", "ml", ["ನೀವು ಹೇಗಿದ್ದೀರಾ?", "ಇದು ಒಂದು ಸುಂದರ ಹೂವು.", "ನನ್ನ ಹೆಸರು ಅಂಕಿತಾ."])
]

# 5. Iterate over test cases
# Explicit cap so that truncation=True has a length to enforce (without it the
# tokenizer warns and does not truncate at all).
MAX_INPUT_TOKENS = 128

# Lower-case loop names are used on purpose: looping over SRC_LANG / TGT_LANG
# would overwrite the settings the fine-tuning cells depend on.
for src_tag, tgt_tag, sentences in test_cases:
    print(f"Translating {src_tag} → {tgt_tag}:\n")

    src_code = LANGS.get(src_tag, src_tag)
    tgt_code = LANGS.get(tgt_tag, tgt_tag)

    # Skip pairs whose codes the tokenizer does not know: an unknown target
    # code fails the lookup below, and an unknown source code gives the model a
    # meaningless language prefix and therefore unusable output.
    unsupported = [
        code for code in (src_code, tgt_code)
        if code not in tokenizer.lang_code_to_id
    ]
    if unsupported:
        print(f"Skipped: {', '.join(unsupported)} not supported by {model_name}.\n")
        print("--------------------------------------------------\n")
        continue

    # Set source language
    tokenizer.src_lang = src_code

    # Tokenize input and move to device
    inputs = tokenizer(
        sentences,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=MAX_INPUT_TOKENS
    )
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Generate translations; forcing the first generated token to the target
    # language code selects the output language
    generated_tokens = model.generate(
        **inputs,
        forced_bos_token_id=tokenizer.lang_code_to_id[tgt_code]
    )

    # Decode output
    translated_texts = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)

    # Print results
    for src, tgt in zip(sentences, translated_texts):
        print(f"Source ({src_tag}): {src}")
        print(f"Translated ({tgt_tag}): {tgt}\n")

    print("--------------------------------------------------\n")


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Translating hi → mr:

Source (hi): तुम्ही कसे आहात?
Translated (mr): कसे आहात?

Source (hi): ही एक सुंदर फुलं आहे.
Translated (mr): हा एक सुंदर फुल आहे.

Source (hi): माझं नाव अंकिता आहे.
Translated (mr): माझं नाव अंकिता आहे.

--------------------------------------------------

Translating ta → te:

Source (ta): நீங்கள் எப்படி இருக்கிறீர்கள்?
Translated (te): ఎలా మీరు

Source (ta): இது ஒரு அழகான பூவாகும்.
Translated (te): ఇది ఒక అందమైన பூ.

Source (ta): என் பெயர் அன்கிதா.
Translated (te): నా పేరు అంకిதா.

--------------------------------------------------

Translating bn → mr:

Source (bn): আপনি কেমন আছেন?
Translated (mr): How are you?

Source (bn): এটি একটি সুন্দর ফুল।
Translated (mr): this is a beautiful flower.

Source (bn): আমার নাম অঙ্কিতা।
Translated (mr): My name is अहीता.

--------------------------------------------------

Translating kn → ml:

Source (kn): ನೀವು ಹೇಗಿದ್ದೀರಾ?
Translated (ml): Articles, Articles, Articles?

Source (kn): ಇದು ಒಂದು ಸುಂದರ ಹೂವು.
Translated (ml): Artic