In [4]:
import torch
from torch.utils.data import Dataset, DataLoader, random_split
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, get_scheduler
from torch.optim import AdamW
from torch.nn.utils import clip_grad_norm_
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import os

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Dataset class
class ParallelCorpusDataset(Dataset):
    def __init__(self, english_sentences, urdu_sentences, tokenizer, max_length=128):
        self.english_sentences = english_sentences
        self.urdu_sentences = urdu_sentences
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.english_sentences)

    def __getitem__(self, idx):
        english_text = self.english_sentences[idx].strip()
        urdu_text = self.urdu_sentences[idx].strip()

        encodings = self.tokenizer(english_text, truncation=True, padding="max_length", max_length=self.max_length)
        decodings = self.tokenizer(urdu_text, truncation=True, padding="max_length", max_length=self.max_length)

        input_ids = torch.tensor(encodings['input_ids'])
        attention_mask = torch.tensor(encodings['attention_mask'])
        labels = torch.tensor(decodings['input_ids'])

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': labels,
            'target_text': urdu_text
        }

# Load data
with open('/kaggle/input/parallel-corpus-for-english-urdu-language/Dataset/english-corpus.txt', 'r', encoding='utf-8') as f:
    english_sentences = f.readlines()
with open('/kaggle/input/parallel-corpus-for-english-urdu-language/Dataset/urdu-corpus.txt', 'r', encoding='utf-8') as f:
    urdu_sentences = f.readlines()

# Tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-ur")
model = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-en-ur").to(device)

# Tokenizer uses SentencePiece (BPE-based)
#print(f"Tokenizer backend: {type(tokenizer.backend_tokenizer)}")

# Train-validation split
train_eng, val_eng, train_urd, val_urd = train_test_split(english_sentences, urdu_sentences, test_size=0.1, random_state=42)

# Dataset & Dataloaders
train_dataset = ParallelCorpusDataset(train_eng, train_urd, tokenizer)
val_dataset = ParallelCorpusDataset(val_eng, val_urd, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)

# Optimizer & scheduler
optimizer = AdamW(model.parameters(), lr=5e-5)
num_training_steps = len(train_loader) * 3  # 3 epochs
scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

# BLEU helper
smoothie = SmoothingFunction().method4

def calculate_bleu(pred, ref):
    return sentence_bleu([ref.split()], pred.split(), smoothing_function=smoothie)

# Training Loop
epochs = 3
best_val_bleu = 0
patience, patience_counter = 2, 0  # early stopping

for epoch in range(epochs):
    model.train()
    total_loss = 0

    for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        optimizer.zero_grad()
        loss.backward()
        clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()

        total_loss += loss.item()

    avg_train_loss = total_loss / len(train_loader)
    print(f"Train Loss: {avg_train_loss:.4f}")

    # Validation + BLEU
    model.eval()
    total_bleu = 0
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            target_texts = batch['target_text']

            generated_ids = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=128)
            predictions = [tokenizer.decode(g, skip_special_tokens=True) for g in generated_ids]

            for pred, ref in zip(predictions, target_texts):
                total_bleu += calculate_bleu(pred, ref)

    avg_bleu = total_bleu / len(val_dataset)
    print(f"Validation BLEU: {avg_bleu:.4f}")

    # Early stopping
    if avg_bleu > best_val_bleu:
        best_val_bleu = avg_bleu
        patience_counter = 0
        # Save best model
        model.save_pretrained("/kaggle/working/urdu_translation_model")
        tokenizer.save_pretrained("/kaggle/working/urdu_translation_model")
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print("Early stopping triggered.")
            break

# Sample Translation
model.eval()
sample_text = "Where is the library?"
encoded = tokenizer(sample_text, return_tensors="pt").to(device)
generated = model.generate(**encoded, max_length=128)
translation = tokenizer.decode(generated[0], skip_special_tokens=True)
print(f"\nSample Translation:\nEN: {sample_text}\nUR: {translation}")


Epoch 1/3: 100%|██████████| 1380/1380 [04:23<00:00,  5.23it/s]


Train Loss: 0.2486
Validation BLEU: 0.1772


Epoch 2/3: 100%|██████████| 1380/1380 [04:24<00:00,  5.22it/s]


Train Loss: 0.1434
Validation BLEU: 0.2522


Epoch 3/3: 100%|██████████| 1380/1380 [04:23<00:00,  5.23it/s]


Train Loss: 0.1145
Validation BLEU: 0.2877

Sample Translation:
EN: Where is the library?
UR: جاپنی کیسی کہاں


In [5]:
sample_text = "Who am I?"
encoded = tokenizer(sample_text, return_tensors="pt").to(device)
generated = model.generate(**encoded, max_length=128)
translation = tokenizer.decode(generated[0], skip_special_tokens=True)
print(f"\nSample Translation:\nEN: {sample_text}\nUR: {translation}")


Sample Translation:
EN: Who am I?
UR: میں کون کون ہوں
