<a href="https://colab.research.google.com/github/bodadineshreddy/indictrans2/blob/main/test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install transformers datasets torch sentencepiece sacrebleu

Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==1

In [None]:
# import torch
# torch.cuda.empty_cache()

In [10]:
import os
import json
from datasets import load_dataset, Dataset
from itertools import islice

# Define source and target languages (must match both dataset & model)
src_lang, tgt_lang = "eng_Latn", "tel_Telu"

# Load dataset specifying the language pair (STREAMING mode enabled)
print(f"Loading dataset: {src_lang} → {tgt_lang}")
dataset_train = load_dataset("allenai/nllb", f"{src_lang}-{tgt_lang}", split="train", streaming=True, trust_remote_code=True)

# Reduce dataset size (e.g., 2000 samples instead of full dataset)
SAMPLE_SIZE = 2000  # Adjust as needed
batch_samples = list(islice(dataset_train, SAMPLE_SIZE))  # Efficiently fetch only needed samples

# Convert lists into Hugging Face Datasets
dataset = Dataset.from_list(batch_samples)['translation']

print(f"Loaded {len(dataset)}, {dataset[:2]}")

# Convert dataset to list format
en_to_te = [{"src": ex[src_lang], "tgt": ex[tgt_lang]} for ex in dataset]

# Generate bilingual pairs
te_to_en = [{"src": ex[tgt_lang], "tgt": ex[src_lang]} for ex in dataset]

# Convert generators to lists
json_data = {
    "en-indic": list(en_to_te),
    "indic-en": list(te_to_en)
}

# Save to a single JSON file
with open("nllb_en_te.json", "w", encoding="utf-8") as f:
    json.dump(json_data, f, ensure_ascii=False, indent=2)

print("Dataset saved as nllb_en_te.json")

Loading dataset: eng_Latn → tel_Telu


Repo card metadata block was not found. Setting CardData to empty.


Loaded 2000, [{'eng_Latn': 'I fear for you a day on which will be a great outcry!"', 'tel_Telu': 'వాస్తవానికి నేను మీపై రాబోయే ఆ గొప్ప దినపు శిక్షను గురించి భయపడుతున్నాను\'అని అన్నాడు" (7:59)'}, {'eng_Latn': 'Which is indeed a great oath if only you knew it.', 'tel_Telu': 'మీరు గ్రహించగలిగితే ఇది గొప్ప ప్రమాణం.'}]
Dataset saved as nllb_en_te.json


In [14]:


import torch
from transformers import MarianMTModel, MarianTokenizer, Trainer, TrainingArguments
from datasets import load_dataset

# Load Pre-trained Model and Tokenizer
model_name = "ai4bharat/indictrans2-indic-en-dist-200M"
print(model_name)
tokenizer = MarianTokenizer.from_pretrained("ai4bharat/indictrans2-indic-en-dist-200M")
model = MarianMTModel.from_pretrained("ai4bharat/indictrans2-indic-en-dist-200M")

# Load and Prepare Dataset
dataset = load_dataset("json", data_files="/content/nllb_en_te.json", split="train")
dataset = dataset['indic-en']

print(dataset[:2])

# Tokenization function
def tokenize(batch):
    src_texts = batch["src"]
    tgt_texts = batch["tgt"]

    src_encodings = tokenizer(src_texts, padding="max_length", truncation=True, max_length=128, return_tensors="pt")
    tgt_encodings = tokenizer(tgt_texts, padding="max_length", truncation=True, max_length=128, return_tensors="pt")

    return {
        "input_ids": src_encodings["input_ids"],
        "attention_mask": src_encodings["attention_mask"],
        "labels": tgt_encodings["input_ids"]
    }

# Tokenize dataset
tokenized_dataset = dataset.map(tokenize, batched=True)

# Training Arguments
training_args = TrainingArguments(
    output_dir="./fine_tuned_model",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    save_total_limit=2,
    fp16=True if torch.cuda.is_available() else False,  # Enable mixed precision if GPU is available
    logging_dir="./logs",
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
)

# Train the Model
trainer.train()

# Save Model and Tokenizer
model.save_pretrained("fine_tuned_model")
tokenizer.save_pretrained("fine_tuned_model")

# Test Translation
input_text = "मुझे स्कूल जाना है।"
inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True)
output_ids = model.generate(**inputs)
output_text = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0]

print("Translated Output:", output_text)  # Expected output: "I have to go to school."


The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'IndicTransTokenizer'. 
The class this function is called from is 'MarianTokenizer'.


ai4bharat/indictrans2-indic-en-dist-200M


TypeError: expected str, bytes or os.PathLike object, not NoneType