<a href="https://colab.research.google.com/github/bodadineshreddy/indictrans2/blob/main/test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
# !rm -rf /content/

In [1]:
%%capture
# !git clone https://github.com/AI4Bharat/IndicTrans2.git
# %cd /content/IndicTransToolkit
# !git clone https://github.com/VarunGumma/IndicTransToolkit.git
# !pip install git+https://github.com/VarunGumma/IndicTransToolkit.git
# # !python3 -m pip install --editable ./
# !python3 -c "import nltk; nltk.download('punkt')"

!pip install transformers datasets torch sentencepiece sacrebleu bitsandbytes scipy accelerate
!pip install nltk sacremoses pandas regex mock transformers>=4.33.2 mosestokenizer

In [None]:
import torch
import sacrebleu
from nltk.translate.meteor_score import meteor_score
from sacrebleu.metrics import TER
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast, TrainingArguments, Trainer
from datasets import load_dataset, Dataset, concatenate_datasets

# ==============================
# Configuration
# ==============================

BATCH_SIZE = 4
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
MODEL_NAME = "facebook/mbart-large-50-many-to-many-mmt"

# Load tokenizer and model
tokenizer = MBart50TokenizerFast.from_pretrained(MODEL_NAME)
model = MBartForConditionalGeneration.from_pretrained(MODEL_NAME).to(DEVICE)

# ==============================
# Define Language Pairs (Unidirectional)
# ==============================

lang_pairs = [
    "eng_Latn-tel_Telu",
    "eng_Latn-tam_Taml",
    "eng_Latn-hin_Deva",
    "hin_Deva-tam_Taml",
    "hin_Deva-tel_Telu",
    "tam_Taml-tel_Telu",
]

# Convert to mBART language codes
nllb_to_mbart = {
    "eng_Latn": "en_XX",
    "tel_Telu": "te_IN",
    "tam_Taml": "ta_IN",
    "hin_Deva": "hi_IN"
}

# ==============================
# Load and Merge Datasets with Reversal
# ==============================

datasets_list = []
samples_per_pair = 2  # Reduce dataset size for efficiency

for pair in lang_pairs:
    dataset = load_dataset("allenai/nllb", pair, split="train", streaming=True, trust_remote_code=True)

    # Collect only 2000 samples per pair from the streamed dataset (for memory efficiency)
    dataset_iter = iter(dataset)
    batch_samples = [next(dataset_iter) for _ in range(samples_per_pair)]
    dataset = Dataset.from_list(batch_samples)  # Convert streamed samples to dataset

    # Store original dataset (Forward direction)
    datasets_list.append(dataset)

    # ======= Create Reversed Dataset Manually =======
    # Create reversed dataset while preserving metadata
    reversed_samples = []
    src_lang, tgt_lang = pair.split("-")

    for example in dataset:
        if src_lang in example["translation"] and tgt_lang in example["translation"]:
            reversed_example = example.copy()  # Copy all metadata
            reversed_example["translation"] = {
                tgt_lang: example["translation"][tgt_lang],  # Now source
                src_lang: example["translation"][src_lang]   # Now target
            }
            reversed_samples.append(reversed_example)

    # Convert reversed dataset to Hugging Face format
    reversed_dataset = Dataset.from_list(reversed_samples)
    datasets_list.append(reversed_dataset)

# Combine all datasets
combined_dataset = concatenate_datasets(datasets_list)

In [2]:


# ==============================
# Preprocessing for Fine-Tuning
# ==============================

def preprocess_function(examples):
    """
    Tokenizes a dataset example for training.
    """
    src_texts = [ex["translation"][list(ex["translation"].keys())[0]] for ex in examples]
    tgt_texts = [ex["translation"][list(ex["translation"].keys())[1]] for ex in examples]

    tokenizer.src_lang = nllb_to_mbart[list(examples[0]["translation"].keys())[0]]
    tokenizer.tgt_lang = nllb_to_mbart[list(examples[0]["translation"].keys())[1]]

    model_inputs = tokenizer(src_texts, truncation=True, padding="max_length", max_length=128)
    labels = tokenizer(tgt_texts, truncation=True, padding="max_length", max_length=128)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Apply preprocessing (batched for efficiency)
tokenized_dataset = combined_dataset.map(preprocess_function, remove_columns=["translation"], batched=True)

# ==============================
# Fine-Tuning Step
# ==============================

# Check if bf16 is supported by GPU
bf16_supported = torch.cuda.is_available() and torch.cuda.get_device_capability(0)[0] >= 8

training_args = TrainingArguments(
    output_dir="./fine_tuned_mbart",
    per_device_train_batch_size=BATCH_SIZE,
    num_train_epochs=3,
    save_steps=500,
    logging_steps=1000,
    evaluation_strategy="no",
    report_to=None,  # Disable external logging
    bf16=bf16_supported if bf16_supported else False,  # Use bf16 only if supported
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
)

# Fine-tune the model
trainer.train()

# Save fine-tuned model & tokenizer
trainer.save_model("./fine_tuned_mbart")
tokenizer.save_pretrained("./fine_tuned_mbart")

# ==============================
# Inference (Using Fine-Tuned Model)
# ==============================

def batch_translate(input_sentences, model, tokenizer, src_lang, tgt_lang):
    """
    Translates a batch of input sentences using the fine-tuned model.
    """
    tokenizer.src_lang = src_lang  # Ensure correct source language
    inputs = tokenizer(input_sentences, truncation=True, padding="longest", return_tensors="pt").to(DEVICE)

    forced_bos_token_id = tokenizer.lang_code_to_id.get(tgt_lang, None)
    if forced_bos_token_id is None:
        raise ValueError(f"Target language {tgt_lang} not found in tokenizer.")

    with torch.no_grad():
        generated_tokens = model.generate(
            **inputs,
            forced_bos_token_id=forced_bos_token_id,  # Target language for mBART
            max_length=256,
            num_beams=8,  # Increased num_beams for better quality
        )

    return tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)

# ==============================
# Test Translation & Evaluation
# ==============================

test_pairs = [
    ("en_XX", "te_IN"),
    ("en_XX", "ta_IN"),
    ("en_XX", "hi_IN"),
    ("te_IN", "en_XX"),
    ("te_IN", "ta_IN"),
    ("ta_IN", "hi_IN"),
    ("hi_IN", "te_IN"),
]

# SacreBLEU & METEOR evaluation
ter = TER()

for src_lang, tgt_lang in test_pairs:
    print(f"\n Translating from {src_lang} → {tgt_lang}")

    # Sample sentences
    test_sentences = {
        "en_XX": ["Hello, how are you?", "This is a beautiful day.", "I love learning new languages."],
        "te_IN": ["హలో, మీరు ఎలా ఉన్నారు?", "ఇది ఒక అందమైన రోజు.", "నేను కొత్త భాషలు నేర్చుకోవాలని ఇష్టపడుతున్నాను."],
        "ta_IN": ["வணக்கம், நீங்கள் எப்படி இருக்கிறீர்கள்?", "இது ஒரு அழகான நாள்.", "எனக்கு புதிய மொழிகளை கற்க விருப்பம்."],
        "hi_IN": ["नमस्ते, आप कैसे हैं?", "यह एक सुंदर दिन है।", "मुझे नई भाषाएँ सीखना पसंद है।"]
    }

    source_sentences = test_sentences[src_lang]
    reference_sentences = test_sentences[tgt_lang]

    translations = batch_translate(source_sentences, model, tokenizer, src_lang, tgt_lang)

    for src, tgt in zip(source_sentences, translations):
        print(f"Source ({src_lang}): {src}")
        print(f"Translation ({tgt_lang}): {tgt}")
        print("-" * 50)

    # ==============================
    # Evaluation Metrics
    # ==============================

    # Compute BLEU Score
    bleu_score = sacrebleu.corpus_bleu(translations, [reference_sentences]).score

    # Compute METEOR Score (Average across sentences)
    meteor_scores = [meteor_score([ref], pred) for ref, pred in zip(reference_sentences, translations)]
    avg_meteor_score = sum(meteor_scores) / len(meteor_scores)

    # Compute TER Score (Translation Edit Rate)
    ter_scores = [ter.sentence_score(pred, [ref]).score for ref, pred in zip(reference_sentences, translations)]
    avg_ter_score = sum(ter_scores) / len(ter_scores)

    # Print Scores
    print(f"\nEvaluation for {src_lang} → {tgt_lang}")
    print(f"\t\tBLEU Score: {bleu_score:.2f}")
    print(f"\t\tMETEOR Score: {avg_meteor_score:.2f}")
    print(f"\t\tTER Score: {avg_ter_score:.2f}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/529 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/649 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.43k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/261 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/38.6k [00:00<?, ?B/s]

nllb.py:   0%|          | 0.00/9.49k [00:00<?, ?B/s]

dataset_infos.json:   0%|          | 0.00/5.05M [00:00<?, ?B/s]

nllb_lang_pairs.py:   0%|          | 0.00/81.9k [00:00<?, ?B/s]

Repo card metadata block was not found. Setting CardData to empty.
Repo card metadata block was not found. Setting CardData to empty.
Repo card metadata block was not found. Setting CardData to empty.
Repo card metadata block was not found. Setting CardData to empty.
Repo card metadata block was not found. Setting CardData to empty.
Repo card metadata block was not found. Setting CardData to empty.


ArrowTypeError: struct fields don't match or are in the wrong order: Input fields: struct<eng_Latn: string, tel_Telu: string> output fields: struct<eng_Latn: string, tel_Telu: string, tam_Taml: string, hin_Deva: string>

In [11]:
# Merge datasets into one
print(f"Number of datasets: {len(datasets_list)}, {dataset}")
# combined_dataset = concatenate_datasets(datasets_list)

Number of datasets: 12, Dataset({
    features: ['translation', 'laser_score', 'source_sentence_lid', 'target_sentence_lid', 'source_sentence_source', 'source_sentence_url', 'target_sentence_source', 'target_sentence_url'],
    num_rows: 2000
})


In [6]:
import torch
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast, TrainingArguments, Trainer
from datasets import load_dataset, Dataset

# ==============================
# Configuration
# ==============================

BATCH_SIZE = 4
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
MODEL_NAME = "facebook/mbart-large-50-many-to-many-mmt"

# ==============================
# Model Initialization
# ==============================

# Load tokenizer and model
tokenizer = MBart50TokenizerFast.from_pretrained(MODEL_NAME)
model = MBartForConditionalGeneration.from_pretrained(MODEL_NAME).to(DEVICE)

# ==============================
# Dataset Loading
# ==============================

# Define source and target languages (Convert from NLLB format to mBART format)
nllb_to_mbart = {
    "eng_Latn": "en_XX",
    "tel_Telu": "te_IN"
}
src_lang, tgt_lang = nllb_to_mbart["eng_Latn"], nllb_to_mbart["tel_Telu"]

# Load dataset (Streaming mode for efficiency)
dataset = load_dataset("allenai/nllb", "eng_Latn-tel_Telu", split="train", streaming=True, trust_remote_code=True)

# Collect 10,000 samples efficiently
dataset_iter = iter(dataset)
batch_samples = [next(dataset_iter) for _ in range(10000)]
dataset = Dataset.from_list(batch_samples)

print(dataset[0])  # Debug: Show sample data

# ==============================
# Preprocessing for Fine-Tuning
# ==============================

def preprocess_function(example):
    """
    Tokenizes a dataset example for training.
    """
    src_text = example["translation"].get("eng_Latn", None)
    tgt_text = example["translation"].get("tel_Telu", None)

    # Skip bad data (some samples may not have a target translation)
    if not src_text or not tgt_text:
        return None  # This will automatically drop bad samples

    # Set language codes for mBART
    tokenizer.src_lang = "en_XX"
    tokenizer.tgt_lang = "te_IN"

    # Tokenize input text
    model_inputs = tokenizer(src_text, truncation=True, padding="max_length", max_length=128)

    # Ensure target language is valid
    if tgt_lang not in tokenizer.lang_code_to_id:
        raise ValueError(f"Target language {tgt_lang} is not recognized by tokenizer. Available: {tokenizer.lang_code_to_id.keys()}")

    # Tokenize target text
    labels = tokenizer(text_target=tgt_text, truncation=True, padding="max_length", max_length=128)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Tokenize dataset with filtering (removes None values)
tokenized_dataset = dataset.map(preprocess_function, remove_columns=["translation", "laser_score"], batched=False)


# ==============================
# Fine-Tuning Step
# ==============================

# Check if bf16 is supported by GPU
bf16_supported = torch.cuda.is_available() and torch.cuda.get_device_capability(0)[0] >= 8

training_args = TrainingArguments(
    output_dir="./fine_tuned_mbart",
    per_device_train_batch_size=BATCH_SIZE,
    num_train_epochs=3,
    save_steps=500,
    logging_steps=1000,
    evaluation_strategy="no",
    report_to=None,  # Disable external logging
    bf16=bf16_supported,  # Use bf16 only if supported
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
)

# Fine-tune the model
trainer.train()

# Save fine-tuned model & tokenizer
trainer.save_model("./fine_tuned_mbart")
tokenizer.save_pretrained("./fine_tuned_mbart")

# ==============================
# Inference (Using Fine-Tuned Model)
# ==============================

def batch_translate(input_sentences, model, tokenizer):
    """
    Translates a batch of input sentences using the fine-tuned model.
    """
    tokenizer.src_lang = src_lang  # Ensure correct source language
    inputs = tokenizer(input_sentences, truncation=True, padding="longest", return_tensors="pt").to(DEVICE)

    with torch.no_grad():
        generated_tokens = model.generate(
            **inputs,
            forced_bos_token_id=tokenizer.lang_code_to_id[tgt_lang],  # Target language for mBART
            max_length=256,
            num_beams=5,
        )

    return tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)

# Sample test sentences
en_sents = [
    "When I was young, I used to go to the park every day.",
    "He has many old books, which he inherited from his ancestors.",
    "I can't figure out how to solve my problem.",
    "She is very hardworking and intelligent, which is why she got all the good marks.",
]

# Translate sentences
translations = batch_translate(en_sents, model, tokenizer)

# Print translations
print(f"\n{src_lang} → {tgt_lang}")
for src, tgt in zip(en_sents, translations):
    print(f"English: {src}")
    print(f"Telugu: {tgt}")


Repo card metadata block was not found. Setting CardData to empty.


{'translation': {'eng_Latn': 'I fear for you a day on which will be a great outcry!"', 'tel_Telu': 'వాస్తవానికి నేను మీపై రాబోయే ఆ గొప్ప దినపు శిక్షను గురించి భయపడుతున్నాను\'అని అన్నాడు" (7:59)'}, 'laser_score': 1.2498809, 'source_sentence_lid': 1.00001, 'target_sentence_lid': 0.99998, 'source_sentence_source': 'crawl-data/CC-MAIN-2018-17/segments/1524125946165.56/wet/CC-MAIN-20180423184427-20180423204427-00442.warc.wet.gz', 'source_sentence_url': 'http://ahlesunnat.biz/kanz/holyquran/surah-al-momin/', 'target_sentence_source': 'paracrawl9_philipp', 'target_sentence_url': '_'}


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

  trainer = Trainer(
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mbodareddy143[0m ([33mbodareddy143-bits-pilani[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
1000,0.7219
2000,0.4191
3000,0.321
4000,0.2428
5000,0.2329
6000,0.1444
7000,0.1442


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.



en_XX → te_IN
English: When I was young, I used to go to the park every day.
Telugu: నేను యువ ఉన్నప్పుడు, నేను ప్రతి రోజు పార్క్ వెళ్ళేది.
English: He has many old books, which he inherited from his ancestors.
Telugu: అతనికి చాలా పాత పుస్తకాలు ఉన్నాయి, అతను తన పూర్వీకులు నుండి వారసత్వంగా చేసిన.
English: I can't figure out how to solve my problem.
Telugu: నా సమస్యను ఎలా పరిష్కరించాలి అని నేను అర్థం చెప్పలేను.
English: She is very hardworking and intelligent, which is why she got all the good marks.
Telugu: ఆమె చాలా హార్డ్ పని మరియు తెలివైనది, అందువల్ల ఆమె అన్ని మంచి మార్క్లను పొందింది.


In [2]:
# import torch
# from transformers import AutoModelForSeq2SeqLM, BitsAndBytesConfig, AutoTokenizer, TrainingArguments, Trainer
# from datasets import load_dataset, Dataset
# from IndicTransToolkit import IndicProcessor

# BATCH_SIZE = 4
# DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# quantization = None

# def initialize_model_and_tokenizer(ckpt_dir, quantization):
#     if quantization == "4-bit":
#         qconfig = BitsAndBytesConfig(
#             load_in_4bit=True,
#             bnb_4bit_use_double_quant=True,
#             bnb_4bit_compute_dtype=torch.bfloat16,
#         )
#     elif quantization == "8-bit":
#         qconfig = BitsAndBytesConfig(
#             load_in_8bit=True,
#             bnb_8bit_use_double_quant=True,
#             bnb_8bit_compute_dtype=torch.bfloat16,
#         )
#     else:
#         qconfig = None

#     tokenizer = AutoTokenizer.from_pretrained(ckpt_dir, trust_remote_code=True)
#     model = AutoModelForSeq2SeqLM.from_pretrained(
#         ckpt_dir,
#         trust_remote_code=True,
#         low_cpu_mem_usage=True,
#         quantization_config=qconfig,
#     )

#     if qconfig == None:
#         model = model.to(DEVICE)
#         if DEVICE == "cuda":
#             model.bfloat16()

#     return tokenizer, model


# def batch_translate(input_sentences, src_lang, tgt_lang, model, tokenizer, ip):
#     translations = []
#     for i in range(0, len(input_sentences), BATCH_SIZE):
#         batch = input_sentences[i : i + BATCH_SIZE]

#         # Preprocess the batch and extract entity mappings
#         batch = ip.preprocess_batch(batch, src_lang=src_lang, tgt_lang=tgt_lang)

#         # Tokenize the batch and generate input encodings
#         inputs = tokenizer(
#             batch,
#             truncation=True,
#             padding="longest",
#             return_tensors="pt",
#             return_attention_mask=True,
#         ).to(DEVICE)

#         # Generate translations using the model
#         with torch.no_grad():
#             generated_tokens = model.generate(
#                 **inputs,
#                 use_cache=True,
#                 min_length=0,
#                 max_length=256,
#                 num_beams=5,
#                 num_return_sequences=1,
#             )

#         # Decode the generated tokens into text

#         with tokenizer.as_target_tokenizer():
#             generated_tokens = tokenizer.batch_decode(
#                 generated_tokens.detach().cpu().tolist(),
#                 skip_special_tokens=True,
#                 clean_up_tokenization_spaces=True,
#             )

#         # Postprocess the translations, including entity replacement
#         translations += ip.postprocess_batch(generated_tokens, lang=tgt_lang)

#         del inputs
#         torch.cuda.empty_cache()

#     return translations

# # ==============================
# # Dataset Loading (Using allenai/nllb)
# # ==============================

# # Define source and target languages (must match both dataset & model)
# src_lang, tgt_lang = "eng_Latn", "tel_Telu"

# # Load dataset specifying the language pair (STREAMING mode enabled)
# dataset = load_dataset("allenai/nllb", f"{src_lang}-{tgt_lang}", split="train", streaming=True, trust_remote_code=True)

# # Collect 10,000 samples into a list (since streaming datasets do not support .map() directly)
# batch_samples = [x for _, x in zip(range(10000), dataset)]

# # Convert the list into a Hugging Face Dataset
# dataset = Dataset.from_list(batch_samples)

# print(dataset[0])

# # ==============================
# # Model Initialization (Uses Existing Functions)
# # ==============================

# # Load the pretrained English-to-Indic model
# en_indic_ckpt_dir = "ai4bharat/indictrans2-en-indic-dist-200M"
# en_indic_tokenizer, en_indic_model = initialize_model_and_tokenizer(en_indic_ckpt_dir, quantization)

# # Create an instance of IndicProcessor
# ip = IndicProcessor(inference=True)

# # ==============================
# # Preprocessing for Fine-Tuning
# # ==============================

# # Define the preprocessing function (handling single dictionary input)
# def preprocess_function(example):
#     """
#     Preprocesses a single example using IndicProcessor & Tokenizer.
#     """
#     src_text = example["translation"][src_lang]
#     tgt_text = example["translation"][tgt_lang]

#     # Use IndicProcessor for source text preprocessing
#     processed_src = ip.preprocess_batch([src_text], src_lang=src_lang, tgt_lang=tgt_lang)[0]

#     # Tokenize source text
#     model_inputs = en_indic_tokenizer(processed_src, truncation=True, padding="max_length", max_length=128)

#     # Tokenize target text (labels for training)
#     with en_indic_tokenizer.as_target_tokenizer():
#         labels = en_indic_tokenizer(tgt_text, truncation=True, padding="max_length", max_length=128)

#     model_inputs["labels"] = labels["input_ids"]
#     return model_inputs

# # Apply preprocessing using .map()
# tokenized_dataset = dataset.map(preprocess_function)

# # ==============================
# # Fine-Tuning Step
# # ==============================

# # Set up fine-tuning parameters
# training_args = TrainingArguments(
#     output_dir="./fine_tuned_model",
#     per_device_train_batch_size=4,  # Adjust based on GPU memory
#     num_train_epochs=3,
#     save_steps=500,
#     logging_steps=1000,
#     evaluation_strategy="no",
#     report_to=None,  # Disable logging to external tools
#     bf16=True,
#     # fp16_full_eval=True
# )

# # Fine-tune using Hugging Face Trainer
# trainer = Trainer(
#     model=en_indic_model,
#     args=training_args,
#     train_dataset=tokenized_dataset,
#     tokenizer=en_indic_tokenizer,
# )

# # en_indic_model.train()
# trainer.train()

# # Save fine-tuned model & tokenizer
# trainer.save_model("./fine_tuned_model")
# en_indic_tokenizer.save_pretrained("./fine_tuned_model")

# # Set model back to evaluation mode after fine-tuning
# en_indic_model.eval()

# # ==============================
# # Inference (Using Fine-Tuned Model)
# # ==============================

# # Sample test sentences for inference
# en_sents = [
#     "When I was young, I used to go to the park every day.",
#     "He has many old books, which he inherited from his ancestors.",
#     "I can't figure out how to solve my problem.",
#     "She is very hardworking and intelligent, which is why she got all the good marks.",
# ]

# # Translate using the fine-tuned model
# translations = batch_translate(en_sents, src_lang, tgt_lang, en_indic_model, en_indic_tokenizer, ip)

# # Print translations
# print(f"\n{src_lang} - {tgt_lang}")
# for src, tgt in zip(en_sents, translations):
#     print(f"{src_lang}: {src}")
#     print(f"{tgt_lang}: {tgt}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Repo card metadata block was not found. Setting CardData to empty.


{'translation': {'eng_Latn': 'I fear for you a day on which will be a great outcry!"', 'tel_Telu': 'వాస్తవానికి నేను మీపై రాబోయే ఆ గొప్ప దినపు శిక్షను గురించి భయపడుతున్నాను\'అని అన్నాడు" (7:59)'}, 'laser_score': 1.2498809, 'source_sentence_lid': 1.00001, 'target_sentence_lid': 0.99998, 'source_sentence_source': 'crawl-data/CC-MAIN-2018-17/segments/1524125946165.56/wet/CC-MAIN-20180423184427-20180423204427-00442.warc.wet.gz', 'source_sentence_url': 'http://ahlesunnat.biz/kanz/holyquran/surah-al-momin/', 'target_sentence_source': 'paracrawl9_philipp', 'target_sentence_url': '_'}


tokenizer_config.json:   0%|          | 0.00/1.11k [00:00<?, ?B/s]

tokenization_indictrans.py:   0%|          | 0.00/8.13k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/ai4bharat/indictrans2-en-indic-dist-200M:
- tokenization_indictrans.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


dict.SRC.json:   0%|          | 0.00/645k [00:00<?, ?B/s]

dict.TGT.json:   0%|          | 0.00/3.39M [00:00<?, ?B/s]

model.SRC:   0%|          | 0.00/759k [00:00<?, ?B/s]

model.TGT:   0%|          | 0.00/3.26M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/96.0 [00:00<?, ?B/s]

Sample Tokens from IndicBERT: ['!', '!!', '!!!', '!!!!', '!!!!!', '!"', '!)', '!’', '!”', '"', '#', '$', '%', '&', "'", "''", '(', '(1', '(1)', '(2']


The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


NotImplementedError: 

In [None]:
# import torch
# torch.cuda.empty_cache()

In [3]:
import os
import json
from datasets import load_dataset, Dataset
from itertools import islice

# Define source and target languages (must match both dataset & model)
src_lang, tgt_lang = "eng_Latn", "tel_Telu"

# Load dataset specifying the language pair (STREAMING mode enabled)
print(f"Loading dataset: {src_lang} → {tgt_lang}")
dataset_train = load_dataset("allenai/nllb", f"{src_lang}-{tgt_lang}", split="train", streaming=True, trust_remote_code=True)

# Reduce dataset size (e.g., 2000 samples instead of full dataset)
SAMPLE_SIZE = 2000  # Adjust as needed
batch_samples = list(islice(dataset_train, SAMPLE_SIZE))  # Efficiently fetch only needed samples

# Convert lists into Hugging Face Datasets
dataset = Dataset.from_list(batch_samples)['translation']

print(f"Loaded {len(dataset)}, {dataset[:2]}")

# Convert dataset to list format
en_to_te = [{"src": ex[src_lang], "tgt": ex[tgt_lang]} for ex in dataset]

# Generate bilingual pairs
te_to_en = [{"src": ex[tgt_lang], "tgt": ex[src_lang]} for ex in dataset]

# Convert generators to lists
json_data = {
    "en-indic": list(en_to_te),
    "indic-en": list(te_to_en)
}

# Save to a single JSON file
with open("nllb_en_te.json", "w", encoding="utf-8") as f:
    json.dump(json_data, f, ensure_ascii=False, indent=2)

print("Dataset saved as nllb_en_te.json")

Loading dataset: eng_Latn → tel_Telu


Repo card metadata block was not found. Setting CardData to empty.


Loaded 2000, [{'eng_Latn': 'I fear for you a day on which will be a great outcry!"', 'tel_Telu': 'వాస్తవానికి నేను మీపై రాబోయే ఆ గొప్ప దినపు శిక్షను గురించి భయపడుతున్నాను\'అని అన్నాడు" (7:59)'}, {'eng_Latn': 'Which is indeed a great oath if only you knew it.', 'tel_Telu': 'మీరు గ్రహించగలిగితే ఇది గొప్ప ప్రమాణం.'}]
Dataset saved as nllb_en_te.json


In [8]:
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, Trainer, TrainingArguments, DataCollatorForSeq2Seq
from datasets import load_dataset, Dataset

# Load Pre-trained Model and Tokenizer
model_name = "ai4bharat/indictrans2-indic-en-dist-200M"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

# Set padding token explicitly
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForSeq2SeqLM.from_pretrained(model_name, trust_remote_code=True)
model.config.pad_token_id = tokenizer.pad_token_id

# Load Dataset
dataset = load_dataset("json", data_files="nllb_en_te.json", split="train")

# Convert Dataset to Required Format
dataset = Dataset.from_list(dataset["indic-en"])

# Display a sample to verify structure
print("Sample Data:", dataset[0])

# Split dataset into train and validation sets
split_dataset = dataset.train_test_split(test_size=0.1, seed=42)

train_dataset = split_dataset["train"]
valid_dataset = split_dataset["test"]

# Function to Tokenize Data with Fixes
def tokenize(batch):
    source_texts = batch["src"]
    target_texts = batch["tgt"]

    # Tokenize inputs without forcing max_length
    source_encodings = tokenizer(source_texts, padding="longest", truncation=True, max_length=128)

    # Tokenize targets without forcing max_length
    target_encodings = tokenizer(target_texts, padding="longest", truncation=True, max_length=128)
    labels = target_encodings["input_ids"]

    # Ensure `labels` is always a list of lists
    if isinstance(labels[0], int):
        labels = [labels]

    # Replace **only padding tokens** in labels with -100
    labels = [
        [token if token != tokenizer.pad_token_id else -100 for token in sequence]
        for sequence in labels
    ]

    return {
        "input_ids": source_encodings["input_ids"],
        "attention_mask": source_encodings["attention_mask"],
        "labels": labels
    }


# Apply tokenization separately to train and validation datasets
tokenized_train_dataset = train_dataset.map(tokenize, batched=True, remove_columns=train_dataset.column_names)
tokenized_valid_dataset = valid_dataset.map(tokenize, batched=True, remove_columns=valid_dataset.column_names)

# Remove empty or corrupt samples
tokenized_train_dataset = tokenized_train_dataset.filter(lambda x: len(x["input_ids"]) > 0 and len(x["labels"]) > 0)
tokenized_valid_dataset = tokenized_valid_dataset.filter(lambda x: len(x["input_ids"]) > 0 and len(x["labels"]) > 0)

# Set Training Parameters
training_args = TrainingArguments(
    output_dir="fine_tuned_model",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=0.00003,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    save_total_limit=2,
    fp16=torch.cuda.is_available(),
    logging_dir="logs",
    logging_steps=50
)

# Initialize Data Collator
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_valid_dataset,
    data_collator=data_collator
)

# Train Model
trainer.train()

# Save Model and Tokenizer
model.save_pretrained("fine_tuned_model")
tokenizer.save_pretrained("fine_tuned_model")

# Test Translation
input_text = "मुझे स्कूल जाना है।"
inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True)

# Ensure model uses the correct pad token during inference
output_ids = model.generate(**inputs, pad_token_id=tokenizer.pad_token_id)

output_text = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0]

# Print Translated Output
print("Translated Output:", output_text)


Sample Data: {'src': 'వాస్తవానికి నేను మీపై రాబోయే ఆ గొప్ప దినపు శిక్షను గురించి భయపడుతున్నాను\'అని అన్నాడు" (7:59)', 'tgt': 'I fear for you a day on which will be a great outcry!"'}


Map:   0%|          | 0/1800 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1800 [00:00<?, ? examples/s]

Filter:   0%|          | 0/200 [00:00<?, ? examples/s]



IndexError: index out of range in self