<a href="https://colab.research.google.com/github/bodadineshreddy/indictrans2/blob/main/test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import json
from datasets import load_dataset

# Open the "eng_Latn" configuration of the NLLB corpus in streaming mode,
# i.e. as an iterable dataset instead of a full up-front download.
dataset = load_dataset(
    path="facebook/nllb",
    name="eng_Latn",
    streaming=True,
)

# Lazily take at most the first N items of an iterable
def take_n_samples(iterable, n):
    """Return a lazy iterator over at most the first ``n`` items of ``iterable``.

    Iteration stops as soon as ``n`` items have been produced, so the source
    is never read past that point. Filtering on ``enumerate(...)`` instead
    would keep pulling items from the source until it is exhausted, which
    never finishes on an unbounded stream and wastes a full scan on a large
    one.

    Args:
        iterable: Any iterable, including one-shot iterators and generators.
        n: Maximum number of items to produce. Zero or a negative value
            produces nothing.

    Returns:
        An iterator yielding the leading items of ``iterable`` in order,
        fewer than ``n`` if the source runs out first.
    """
    # Local import keeps this helper self-contained within its cell.
    from itertools import islice

    # islice rejects negative stops; clamp so n <= 0 simply yields nothing.
    return islice(iterable, max(n, 0))

# Collect up to 10,000 English/Telugu sentence pairs in a single pass.
# Both translation directions are built from the same pairs, so reading
# dataset["train"] a second time would only repeat the same scan.
NUM_SAMPLES = 10_000
pairs = list(
    take_n_samples(
        (
            x["translation"]
            for x in dataset["train"]
            if "tel_Telu" in x["translation"]
        ),
        NUM_SAMPLES,
    )
)

# English -> Telugu
en_to_te = [{"src": p["eng_Latn"], "tgt": p["tel_Telu"]} for p in pairs]

# Telugu -> English (same pairs, direction swapped)
te_to_en = [{"src": p["tel_Telu"], "tgt": p["eng_Latn"]} for p in pairs]

# One JSON document, one key per translation direction
json_data = {
    "en-indic": en_to_te,
    "indic-en": te_to_en,
}

# ensure_ascii=False keeps the Telugu text readable instead of \u escapes
with open("nllb_en_te.json", "w", encoding="utf-8") as f:
    json.dump(json_data, f, ensure_ascii=False, indent=2)

print("Dataset saved as nllb_en_te.json")


In [None]:
!pip install transformers datasets torch sentencepiece sacrebleu

import torch
from transformers import MarianMTModel, MarianTokenizer, Trainer, TrainingArguments
from datasets import load_dataset

# Load Pre-trained Model and Tokenizer
# IndicTrans2 checkpoints ship their own model/tokenizer code on the Hub and
# are not Marian models, so they are loaded through the Auto* classes with
# trust_remote_code=True rather than MarianMTModel / MarianTokenizer.
# trust_remote_code executes Python from the model repository: only use it
# with repositories you trust.
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# NOTE(review): the published distilled Indic->En checkpoint appears to be
# "ai4bharat/indictrans2-indic-en-dist-200M" -- confirm this id exists.
model_name = "ai4bharat/indictrans2-indic-en-dist-300m"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name, trust_remote_code=True)

# Load and Prepare Dataset
# The export step earlier in this notebook writes "nllb_en_te.json" as one
# JSON object with two keys ("en-indic" and "indic-en"), each holding a list
# of {"src", "tgt"} records. `field` selects the list that matches the
# Indic -> English model loaded above, giving a dataset with "src"/"tgt"
# columns.
dataset = load_dataset(
    "json",
    data_files="nllb_en_te.json",
    field="indic-en",
    split="train",
)

# Tokenization function
def tokenize(batch):
    """Tokenize a batch of parallel sentences for seq2seq fine-tuning.

    Args:
        batch: Mapping with "src" and "tgt" keys, each a list of strings
            (what ``Dataset.map(..., batched=True)`` passes in).

    Returns:
        A dict with "input_ids" and "attention_mask" for the source texts
        and "labels" for the target texts. Every sequence is padded or
        truncated to 128 tokens, and padding positions in "labels" are
        replaced by -100.
    """
    max_length = 128

    # Plain Python lists are returned (no return_tensors): Dataset.map stores
    # the result in Arrow either way, so building tensors here is wasted work.
    src_encodings = tokenizer(
        batch["src"],
        padding="max_length",
        truncation=True,
        max_length=max_length,
    )
    # text_target= encodes the sentences as *target* text, which matters for
    # tokenizers that keep separate source and target vocabularies.
    tgt_encodings = tokenizer(
        text_target=batch["tgt"],
        padding="max_length",
        truncation=True,
        max_length=max_length,
    )

    # -100 is the label value the loss ignores; without this the model would
    # also be trained to predict the padding that fills each sequence.
    pad_id = tokenizer.pad_token_id
    labels = [
        [-100 if token_id == pad_id else token_id for token_id in sequence]
        for sequence in tgt_encodings["input_ids"]
    ]

    return {
        "input_ids": src_encodings["input_ids"],
        "attention_mask": src_encodings["attention_mask"],
        "labels": labels,
    }

# Run `tokenize` over the dataset; batched=True hands it many rows per call
tokenized_dataset = dataset.map(
    function=tokenize,
    batched=True,
)

# Hyper-parameters and bookkeeping options for the fine-tuning run
training_args = TrainingArguments(
    # Where checkpoints and logs are written
    output_dir="./fine_tuned_model",
    logging_dir="./logs",
    # Evaluate and checkpoint once per epoch, keeping only the latest two
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    # Optimisation settings
    num_train_epochs=5,
    learning_rate=3e-5,
    weight_decay=0.01,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Mixed precision only when a CUDA device is present
    fp16=torch.cuda.is_available(),
)

# Trainer
# The training arguments request an evaluation pass every epoch, which needs
# an eval_dataset. Hold out 10% of the tokenized data for that purpose; the
# fixed seed keeps the split identical across runs.
split_dataset = tokenized_dataset.train_test_split(test_size=0.1, seed=42)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=split_dataset["train"],
    eval_dataset=split_dataset["test"],
    tokenizer=tokenizer,
)

# Run the fine-tuning loop
trainer.train()

# Persist the model weights and the tokenizer files side by side
for artifact in (model, tokenizer):
    artifact.save_pretrained("fine_tuned_model")

# Test Translation
input_text = "मुझे स्कूल जाना है।"

# Training leaves the model in train mode; switch to eval so dropout does
# not perturb the generated output.
model.eval()

# The tokenizer returns CPU tensors, while training may have moved the model
# to a GPU. Move the inputs to the model's device to avoid a device-mismatch
# error in generate().
inputs = tokenizer(
    input_text, return_tensors="pt", padding=True, truncation=True
).to(model.device)
output_ids = model.generate(**inputs)
output_text = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0]

print("Translated Output:", output_text)  # Expected output: "I have to go to school."
