In [8]:
# Cell 1: imports, seeds, paths, device
import os
import re
import time
import json
import math
import unicodedata
import random
from collections import Counter, defaultdict

import numpy as np
import pandas as pd
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

# Reproducibility
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

# Paths (adjust as needed)
INPUT_DATA_ROOT = "/kaggle/input/urdu2roman/dataset"  # where your urdu/ and roman/ folders live
WORKDIR = "/kaggle/working"
DATA_DIR = os.path.join(WORKDIR, "data")
MODELS_DIR = os.path.join(WORKDIR, "models")
CHECKPOINT_DIR = os.path.join(MODELS_DIR, "seq2seq_checkpoints")
os.makedirs(DATA_DIR, exist_ok=True)
os.makedirs(MODELS_DIR, exist_ok=True)
os.makedirs(CHECKPOINT_DIR, exist_ok=True)

# Device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)
print("Working data dir:", DATA_DIR)


Device: cuda
Working data dir: /kaggle/working/data


In [7]:
# Cell 2: load Urdu/Roman files (line-wise) and build pairs.csv
import os

URDU_DIR = os.path.join(INPUT_DATA_ROOT, "urdu")
ROMAN_DIR = os.path.join(INPUT_DATA_ROOT, "roman")

if not (os.path.isdir(URDU_DIR) and os.path.isdir(ROMAN_DIR)):
    raise FileNotFoundError(f"Expected folders 'urdu' and 'roman' inside {INPUT_DATA_ROOT}. Found: {os.listdir(INPUT_DATA_ROOT)}")

pairs = []
for fname in sorted(os.listdir(URDU_DIR)):
    urdu_path = os.path.join(URDU_DIR, fname)
    roman_path = os.path.join(ROMAN_DIR, fname)
    if not os.path.isfile(roman_path):
        print(f"Skipping {fname}: missing roman file")
        continue

    # Read both files; split into non-empty lines; pair by line index
    with open(urdu_path, "r", encoding="utf-8") as f1:
        urdu_lines = [ln.strip() for ln in f1.read().splitlines() if ln.strip()]
    with open(roman_path, "r", encoding="utf-8") as f2:
        roman_lines = [ln.strip() for ln in f2.read().splitlines() if ln.strip()]

    # If lengths differ, pair up to min length. Usually poetry lines align.
    n = min(len(urdu_lines), len(roman_lines))
    if n == 0:
        continue
    for i in range(n):
        pairs.append({"urdu": urdu_lines[i], "roman": roman_lines[i]})

df = pd.DataFrame(pairs)
pairs_csv = os.path.join(DATA_DIR, "pairs.csv")
df.to_csv(pairs_csv, index=False, encoding="utf-8-sig")
print(f"Built pairs.csv with {len(df)} pairs -> {pairs_csv}")
df.head()


Built pairs.csv with 21003 pairs -> /kaggle/working/data/pairs.csv


Unnamed: 0,urdu,roman
0,آ گیا پھر رمضاں کیا ہوگا,aa gayā phir ramazāñ kyā hogā
1,ہائے اے پیر مغاں کیا ہوگا,haa.e ai pīr-e-muġhāñ kyā hogā
2,باغ جنت میں سماں کیا ہوگا,bāġh-e-jannat meñ samāñ kyā hogā
3,تو نہیں جب تو وہاں کیا ہوگا,tū nahīñ jab to vahāñ kyā hogā
4,خوش وہ ہوتا ہے مرے نالوں سے,ḳhush vo hotā hai mire nāloñ se


In [8]:
# Cell 3: normalization / cleaning
import unicodedata, re
from tqdm import tqdm
tqdm.pandas()

def normalize_urdu(text):
    text = "" if pd.isna(text) else str(text)
    text = unicodedata.normalize("NFKC", text)
    text = text.replace("\u0640", "")  # tatweel
    text = re.sub(r"[\u064B-\u0652]", "", text)  # tashkeel
    text = re.sub("[\u0622\u0623\u0625]", "\u0627", text)  # normalize alef
    text = text.replace("ي", "ی").replace("ك", "ک")
    text = re.sub(r"[^\S\r\n]+", " ", text).strip()
    return text

def normalize_roman(text):
    text = "" if pd.isna(text) else str(text)
    text = unicodedata.normalize("NFKC", text).lower()
    text = text.replace("\u2013", "-").replace("\u2014", "-")
    text = text.replace("’", "'").replace("‘", "'")
    allow_punct = set(" .,!?\-\'\":;()[]{}")
    out = []
    for ch in text:
        cat = unicodedata.category(ch)
        if cat[0] in ("L", "M", "N") or ch in allow_punct:
            out.append(ch)
    text = "".join(out)
    text = re.sub(r"\s+", " ", text).strip()
    return text

pairs_csv = os.path.join(DATA_DIR, "pairs.csv")
if not os.path.exists(pairs_csv):
    raise FileNotFoundError(f"Missing {pairs_csv}; run the previous cell to build it.")

df = pd.read_csv(pairs_csv, encoding="utf-8")
df["urdu_clean"] = df["urdu"].progress_apply(normalize_urdu)
df["roman_clean"] = df["roman"].progress_apply(normalize_roman)

clean_csv = os.path.join(DATA_DIR, "pairs_clean.csv")
df.to_csv(clean_csv, index=False, encoding="utf-8-sig")
print("Saved cleaned pairs to", clean_csv)
df.head()


100%|██████████| 21003/21003 [00:00<00:00, 130979.04it/s]
100%|██████████| 21003/21003 [00:00<00:00, 75382.11it/s]


Saved cleaned pairs to /kaggle/working/data/pairs_clean.csv


Unnamed: 0,urdu,roman,urdu_clean,roman_clean
0,آ گیا پھر رمضاں کیا ہوگا,aa gayā phir ramazāñ kyā hogā,ا گیا پھر رمضاں کیا ہوگا,aa gayā phir ramazāñ kyā hogā
1,ہائے اے پیر مغاں کیا ہوگا,haa.e ai pīr-e-muġhāñ kyā hogā,ہائے اے پیر مغاں کیا ہوگا,haa.e ai pīr-e-muġhāñ kyā hogā
2,باغ جنت میں سماں کیا ہوگا,bāġh-e-jannat meñ samāñ kyā hogā,باغ جنت میں سماں کیا ہوگا,bāġh-e-jannat meñ samāñ kyā hogā
3,تو نہیں جب تو وہاں کیا ہوگا,tū nahīñ jab to vahāñ kyā hogā,تو نہیں جب تو وہاں کیا ہوگا,tū nahīñ jab to vahāñ kyā hogā
4,خوش وہ ہوتا ہے مرے نالوں سے,ḳhush vo hotā hai mire nāloñ se,خوش وہ ہوتا ہے مرے نالوں سے,ḳhush vo hotā hai mire nāloñ se


In [18]:
# Cell 4: split dataset into train/val/test (50/25/25)
from sklearn.model_selection import train_test_split
import numpy as np, os

clean_csv = os.path.join(DATA_DIR, "pairs_clean.csv")
df = pd.read_csv(clean_csv, encoding="utf-8")

# Optional: stratify by length buckets to ensure splits have similar length distribution
lengths = df["urdu_clean"].astype(str).apply(len)
bins = np.minimum((lengths // 20).astype(int), 9)  # coarse buckets
df["len_bucket"] = bins

train_df, rest = train_test_split(df, test_size=0.5, random_state=SEED, stratify=df["len_bucket"])
val_df, test_df = train_test_split(rest, test_size=0.5, random_state=SEED, stratify=rest["len_bucket"])

for name, d in [("train", train_df), ("val", val_df), ("test", test_df)]:
    path = os.path.join(DATA_DIR, f"{name}.csv")
    d.to_csv(path, index=False, encoding="utf-8-sig")
    print(f"Saved {name}.csv -> {path} ({len(d)} rows)")

# clean up temporary column
for d in (train_df, val_df, test_df):
    if "len_bucket" in d.columns:
        d.drop(columns=["len_bucket"], inplace=True)


Saved train.csv -> /kaggle/working/data/train.csv (10501 rows)
Saved val.csv -> /kaggle/working/data/val.csv (5251 rows)
Saved test.csv -> /kaggle/working/data/test.csv (5251 rows)


In [13]:
# Cell 5: character-level tokenizer (recommended for transliteration)
import json, os

class CharTokenizer:
    def __init__(self, specials=["<pad>","<sos>","<eos>","<unk>"]):
        self.specials = specials
        self.vocab = None
        self.inv_vocab = None

    def build(self, texts):
        chars = set()
        for t in texts:
            for ch in str(t):
                chars.add(ch)
        chars = sorted(chars)
        tokens = list(self.specials) + chars
        self.vocab = {tok:i for i,tok in enumerate(tokens)}
        self.inv_vocab = {i:tok for tok,i in self.vocab.items()}
        return self.vocab

    def save(self, prefix):
        with open(prefix + "_vocab.json","w",encoding="utf-8") as f:
            json.dump(self.vocab, f, ensure_ascii=False, indent=2)

    def load(self, prefix):
        with open(prefix + "_vocab.json","r",encoding="utf-8") as f:
            self.vocab = json.load(f)
            self.inv_vocab = {int(i):tok for tok,i in enumerate(self.vocab)} if False else {v:int(k) for k,v in self.vocab.items()} 
            # fix: build inv_vocab properly
            self.inv_vocab = {int(v):k for k,v in self.vocab.items()}

    def encode(self, text):
        return [self.vocab.get(ch, self.vocab["<unk>"]) for ch in str(text)]

    def decode(self, ids, stop_at_eos=True):
        out = []
        for i in ids:
            tok = self.inv_vocab.get(int(i), "<unk>")
            if tok in ("<pad>","<sos>"):
                continue
            if tok == "<eos>" and stop_at_eos:
                break
            out.append(tok)
        return "".join(out)

# Build tokenizers from training data
train_df = pd.read_csv(os.path.join(DATA_DIR, "train.csv"), encoding="utf-8")
char_tok_src = CharTokenizer()  # Urdu
char_tok_tgt = CharTokenizer()  # Roman

print("Building src (Urdu) tokenizer...")
char_tok_src.build(train_df["urdu_clean"].astype(str).tolist())
print("Building tgt (Roman) tokenizer...")
char_tok_tgt.build(train_df["roman_clean"].astype(str).tolist())

char_tok_src.save(os.path.join(MODELS_DIR, "char_urdu"))
char_tok_tgt.save(os.path.join(MODELS_DIR, "char_roman"))
print("Saved char tokenizers to", MODELS_DIR)

# Expose vocab sizes & pad/sos/eos ids
SRC_VOCAB_SIZE = len(char_tok_src.vocab)
TGT_VOCAB_SIZE = len(char_tok_tgt.vocab)
SRC_PAD = int(char_tok_src.vocab["<pad>"])
TGT_PAD = int(char_tok_tgt.vocab["<pad>"])
TGT_SOS = int(char_tok_tgt.vocab["<sos>"])
TGT_EOS = int(char_tok_tgt.vocab["<eos>"])

print("SRC_VOCAB_SIZE", SRC_VOCAB_SIZE, "TGT_VOCAB_SIZE", TGT_VOCAB_SIZE)


Building src (Urdu) tokenizer...
Building tgt (Roman) tokenizer...
Saved char tokenizers to /kaggle/working/models
SRC_VOCAB_SIZE 56 TGT_VOCAB_SIZE 44


In [20]:
# Cell 6: Dataset and DataLoader (char tokens)
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

class UrduRomanCharDataset(Dataset):
    def __init__(self, df, src_tok, tgt_tok, max_src_len=200, max_tgt_len=250):
        self.df = df.reset_index(drop=True)
        self.src_tok = src_tok
        self.tgt_tok = tgt_tok
        self.max_src_len = max_src_len
        self.max_tgt_len = max_tgt_len
        self.src_pad = int(src_tok.vocab["<pad>"])
        self.tgt_pad = int(tgt_tok.vocab["<pad>"])
        self.tgt_sos = int(tgt_tok.vocab["<sos>"])
        self.tgt_eos = int(tgt_tok.vocab["<eos>"])

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        src = str(row["urdu_clean"])
        tgt = str(row["roman_clean"])
        src_ids = self.src_tok.encode(src)[:self.max_src_len]
        tgt_ids = self.tgt_tok.encode(tgt)[:(self.max_tgt_len-1)]  # leave space for eos
        tgt_in = [self.tgt_sos] + tgt_ids
        tgt_out = tgt_ids + [self.tgt_eos]
        return torch.tensor(src_ids, dtype=torch.long), torch.tensor(tgt_in, dtype=torch.long), torch.tensor(tgt_out, dtype=torch.long)

def collate_fn(batch):
    srcs, tgts_in, tgts_out = zip(*batch)
    src_pad = SRC_PAD
    tgt_pad = TGT_PAD
    src_lens = torch.tensor([s.size(0) for s in srcs], dtype=torch.long)
    tgt_lens = torch.tensor([t.size(0) for t in tgts_in], dtype=torch.long)
    src_padded = pad_sequence(srcs, batch_first=True, padding_value=src_pad)
    tgt_in_padded = pad_sequence(tgts_in, batch_first=True, padding_value=tgt_pad)
    tgt_out_padded = pad_sequence(tgts_out, batch_first=True, padding_value=tgt_pad)
    src_mask = (src_padded != src_pad)
    tgt_mask = (tgt_in_padded != tgt_pad)
    return src_padded, src_lens, src_mask, tgt_in_padded, tgt_out_padded, tgt_lens

# build datasets & loaders
train_df = pd.read_csv(os.path.join(DATA_DIR, "train.csv"), encoding="utf-8")
val_df   = pd.read_csv(os.path.join(DATA_DIR, "val.csv"), encoding="utf-8")
test_df  = pd.read_csv(os.path.join(DATA_DIR, "test.csv"), encoding="utf-8")

# compute reasonable max lens (optional)
def compute_stats(tok, texts, n_samples=2000):
    lens = [len(tok.encode(str(t))) for t in texts[:n_samples]]
    arr = np.array(lens)
    return int(np.percentile(arr, 99))

MAX_SRC_LEN = min(200, compute_stats(char_tok_src, train_df["urdu_clean"].astype(str).tolist(), n_samples=2000) + 5)
MAX_TGT_LEN = min(250, compute_stats(char_tok_tgt, train_df["roman_clean"].astype(str).tolist(), n_samples=2000) + 5)

train_dataset = UrduRomanCharDataset(train_df, char_tok_src, char_tok_tgt, max_src_len=MAX_SRC_LEN, max_tgt_len=MAX_TGT_LEN)
val_dataset   = UrduRomanCharDataset(val_df,   char_tok_src, char_tok_tgt, max_src_len=MAX_SRC_LEN, max_tgt_len=MAX_TGT_LEN)
test_dataset  = UrduRomanCharDataset(test_df,  char_tok_src, char_tok_tgt, max_src_len=MAX_SRC_LEN, max_tgt_len=MAX_TGT_LEN)

BATCH_SIZE = 32
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn, num_workers=2)
val_loader   = DataLoader(val_dataset,   batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn, num_workers=2)
test_loader  = DataLoader(test_dataset,  batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn, num_workers=2)

print("DataLoaders ready. Example batch shapes:")
b = next(iter(train_loader))
print("src:", b[0].shape, "tgt_in:", b[3].shape, "tgt_out:", b[4].shape)


DataLoaders ready. Example batch shapes:
src: torch.Size([32, 53]) tgt_in: torch.Size([32, 64]) tgt_out: torch.Size([32, 64])


In [11]:
# Cell 7: model definitions
EMB_DIM = 128
HID_DIM = 256
ENC_LAYERS = 2
DEC_LAYERS = 2
DROPOUT = 0.2

class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, hid_dim, n_layers=2, dropout=0.2, pad_idx=0):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, emb_dim, padding_idx=pad_idx)
        self.rnn = nn.LSTM(input_size=emb_dim, hidden_size=hid_dim, num_layers=n_layers,
                           dropout=dropout if n_layers>1 else 0.0, bidirectional=True, batch_first=True)
        self.dropout = nn.Dropout(dropout)
        self.n_layers = n_layers
        self.hid_dim = hid_dim

    def forward(self, src, src_lengths):
        embedded = self.embedding(src)
        embedded = self.dropout(embedded)
        packed = nn.utils.rnn.pack_padded_sequence(embedded, src_lengths.cpu(), batch_first=True, enforce_sorted=False)
        packed_outputs, (h_n, c_n) = self.rnn(packed)
        outputs, _ = nn.utils.rnn.pad_packed_sequence(packed_outputs, batch_first=True)
        return outputs, h_n, c_n

class LuongAttention(nn.Module):
    def __init__(self, enc_dim, dec_dim):
        super().__init__()
        self.W = nn.Linear(dec_dim, enc_dim, bias=False)

    def forward(self, dec_hidden, enc_outputs, mask=None):
        proj = self.W(dec_hidden).unsqueeze(2)
        scores = torch.bmm(enc_outputs, proj).squeeze(2)
        if mask is not None:
            scores = scores.masked_fill(~mask, -1e9)
        attn_weights = F.softmax(scores, dim=1)
        context = torch.bmm(attn_weights.unsqueeze(1), enc_outputs).squeeze(1)
        return attn_weights, context

class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, enc_hid_dim, dec_hid_dim, n_layers=2, dropout=0.2, pad_idx=0):
        super().__init__()
        self.embedding = nn.Embedding(output_dim, emb_dim, padding_idx=pad_idx)
        self.attn = LuongAttention(enc_dim=enc_hid_dim*2, dec_dim=dec_hid_dim)
        self.rnn = nn.LSTM(input_size=emb_dim + enc_hid_dim*2, hidden_size=dec_hid_dim,
                           num_layers=n_layers, batch_first=True, dropout=dropout if n_layers>1 else 0.0)
        self.fc_out = nn.Linear(dec_hid_dim + enc_hid_dim*2 + emb_dim, output_dim)
        self.dropout = nn.Dropout(dropout)
        self.n_layers = n_layers
        self.dec_hid_dim = dec_hid_dim

    def forward_step(self, input_tok, last_hidden, last_cell, enc_outputs, enc_mask):
        emb = self.embedding(input_tok).unsqueeze(1)
        emb = self.dropout(emb)
        dec_top_hidden = last_hidden[-1]
        attn_weights, context = self.attn(dec_top_hidden, enc_outputs, enc_mask)
        rnn_input = torch.cat([emb, context.unsqueeze(1)], dim=2)
        output, (hidden, cell) = self.rnn(rnn_input, (last_hidden, last_cell))
        output = output.squeeze(1)
        emb_s = emb.squeeze(1)
        concat = torch.cat([output, context, emb_s], dim=1)
        logits = self.fc_out(concat)
        return logits, hidden, cell, attn_weights

class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, enc_hid_dim, dec_hid_dim, dec_n_layers):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.enc_hid_dim = enc_hid_dim
        self.dec_hid_dim = dec_hid_dim
        self.dec_n_layers = dec_n_layers
        enc_total = encoder.n_layers * 2 * enc_hid_dim
        self.enc2dec_h = nn.Linear(enc_total, dec_n_layers * dec_hid_dim)
        self.enc2dec_c = nn.Linear(enc_total, dec_n_layers * dec_hid_dim)

    def forward(self, src, src_lens, tgt_in, teacher_forcing_ratio=0.5):
        B = src.size(0)
        max_tgt_len = tgt_in.size(1)
        enc_outputs, enc_h, enc_c = self.encoder(src, src_lens)
        enc_mask = (src != SRC_PAD)
        enc_h_flat = enc_h.permute(1,0,2).contiguous().view(B, -1)
        enc_c_flat = enc_c.permute(1,0,2).contiguous().view(B, -1)
        dec_h_flat = self.enc2dec_h(enc_h_flat)
        dec_c_flat = self.enc2dec_c(enc_c_flat)
        dec_h = dec_h_flat.view(self.dec_n_layers, B, self.dec_hid_dim).contiguous()
        dec_c = dec_c_flat.view(self.dec_n_layers, B, self.dec_hid_dim).contiguous()
        outputs = torch.zeros(B, max_tgt_len, TGT_VOCAB_SIZE, device=src.device)
        input_tok = tgt_in[:, 0]
        for t in range(1, max_tgt_len):
            logits, dec_h, dec_c, attn = self.decoder.forward_step(input_tok, dec_h, dec_c, enc_outputs, enc_mask)
            outputs[:, t, :] = logits
            teacher_force = (random.random() < teacher_forcing_ratio)
            if teacher_force:
                input_tok = tgt_in[:, t]
            else:
                input_tok = logits.argmax(dim=1)
        return outputs

    def greedy_decode(self, src, src_lens, max_tgt_len=120):
        B = src.size(0)
        enc_outputs, enc_h, enc_c = self.encoder(src, src_lens)
        enc_mask = (src != SRC_PAD)
        enc_h_flat = enc_h.permute(1,0,2).contiguous().view(B, -1)
        enc_c_flat = enc_c.permute(1,0,2).contiguous().view(B, -1)
        dec_h_flat = self.enc2dec_h(enc_h_flat)
        dec_c_flat = self.enc2dec_c(enc_c_flat)
        dec_h = dec_h_flat.view(self.dec_n_layers, B, self.dec_hid_dim).contiguous()
        dec_c = dec_c_flat.view(self.dec_n_layers, B, self.dec_hid_dim).contiguous()
        preds = torch.full((B, max_tgt_len), TGT_PAD, dtype=torch.long, device=src.device)
        input_tok = torch.full((B,), TGT_SOS, dtype=torch.long, device=src.device)
        finished = torch.zeros(B, dtype=torch.bool, device=src.device)
        for t in range(max_tgt_len):
            logits, dec_h, dec_c, attn = self.decoder.forward_step(input_tok, dec_h, dec_c, enc_outputs, enc_mask)
            next_tok = logits.argmax(dim=1)
            preds[:, t] = next_tok
            finished = finished | (next_tok == TGT_EOS)
            if finished.all():
                break
            input_tok = next_tok
        return preds


In [14]:
# Cell 8: construct model, optimizer, criterion
enc = Encoder(input_dim=SRC_VOCAB_SIZE, emb_dim=EMB_DIM, hid_dim=HID_DIM, n_layers=ENC_LAYERS, dropout=DROPOUT, pad_idx=SRC_PAD)
dec = Decoder(output_dim=TGT_VOCAB_SIZE, emb_dim=EMB_DIM, enc_hid_dim=HID_DIM, dec_hid_dim=HID_DIM, n_layers=DEC_LAYERS, dropout=DROPOUT, pad_idx=TGT_PAD)
model = Seq2Seq(enc, dec, enc_hid_dim=HID_DIM, dec_hid_dim=HID_DIM, dec_n_layers=DEC_LAYERS).to(device)
print("Model parameters:", sum(p.numel() for p in model.parameters() if p.requires_grad))

criterion = nn.CrossEntropyLoss(ignore_index=TGT_PAD, reduction="sum")
optimizer = torch.optim.Adam(model.parameters(), lr=3e-4, weight_decay=0.0)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=2)
print("Current LR:", scheduler.get_last_lr())


Model parameters: 5046316
Current LR: [0.0003]


In [23]:
# Cell 9: helpers for training & evaluation
import math

def compute_loss_and_tokens(preds, tgt_out, pad_idx=TGT_PAD):
    B, T, V = preds.size()
    preds_flat = preds.reshape(-1, V)
    tgt_flat = tgt_out.reshape(-1)
    loss = criterion(preds_flat, tgt_flat)
    n_tokens = (tgt_out != pad_idx).sum().item()
    return loss, n_tokens

def _ngrams(seq, n):
    return [tuple(seq[i:i+n]) for i in range(len(seq)-n+1)] if len(seq) >= n else []

def corpus_bleu(references, hypotheses, max_n=4):
    matches_by_order = [0]*max_n
    possible_by_order = [0]*max_n
    ref_len = 0
    hyp_len = 0
    for ref, hyp in zip(references, hypotheses):
        ref_len += len(ref)
        hyp_len += len(hyp)
        for n in range(1, max_n+1):
            ref_ngrams = Counter(_ngrams(ref, n))
            hyp_ngrams = Counter(_ngrams(hyp, n))
            overlap = sum((hyp_ngrams & ref_ngrams).values())
            matches_by_order[n-1] += overlap
            possible_by_order[n-1] += max(0, len(hyp)-n+1)
    precisions = []
    for i in range(max_n):
        if possible_by_order[i] == 0:
            precisions.append(0.0)
        else:
            precisions.append(matches_by_order[i] / possible_by_order[i])
    smooth = 1e-9
    log_prec = sum((1.0/max_n) * math.log(max(p, smooth)) for p in precisions)
    bp = math.exp(1 - ref_len/hyp_len) if hyp_len < ref_len and hyp_len>0 else 1.0
    bleu = bp * math.exp(log_prec)
    return bleu * 100

def cer(refs, hyps):
    total = 0
    edits = 0
    for r, h in zip(refs, hyps):
        a = list(r)
        b = list(h)
        n, m = len(a), len(b)
        dp = np.zeros((n+1, m+1), dtype=int)
        for i in range(n+1):
            dp[i,0] = i
        for j in range(m+1):
            dp[0,j] = j
        for i in range(1,n+1):
            for j in range(1,m+1):
                if a[i-1] == b[j-1]:
                    dp[i,j] = dp[i-1,j-1]
                else:
                    dp[i,j] = 1 + min(dp[i-1,j], dp[i,j-1], dp[i-1,j-1])
        edits += dp[n,m]
        total += n
    return edits / max(1, total)

# decode helpers using char_tok_tgt.inv_vocab
def ids_to_string_tokenwise(id_seq):
    return char_tok_tgt.decode(id_seq, stop_at_eos=True)


In [24]:
# Cell 10: training loop
NUM_EPOCHS = 30
best_val_bleu = -1.0
patience = 5
no_improve = 0

def train_epoch(model, loader, optimizer, epoch, teacher_forcing_ratio=0.5, clip=1.0):
    model.train()
    total_loss = 0.0
    total_tokens = 0
    t0 = time.time()
    for i, batch in enumerate(loader):
        src_padded, src_lens, src_mask, tgt_in_padded, tgt_out_padded, tgt_lens = batch
        src_padded = src_padded.to(device)
        src_lens = src_lens.to(device)
        tgt_in_padded = tgt_in_padded.to(device)
        tgt_out_padded = tgt_out_padded.to(device)
        optimizer.zero_grad()
        outputs = model(src_padded, src_lens, tgt_in_padded, teacher_forcing_ratio=teacher_forcing_ratio)
        preds = outputs[:,1:,:]
        targets = tgt_out_padded[:, :-1]
        loss, n_tokens = compute_loss_and_tokens(preds, targets, pad_idx=TGT_PAD)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        total_loss += loss.item()
        total_tokens += n_tokens
    avg_loss = total_loss / max(1, total_tokens)
    return avg_loss

def evaluate(model, loader, n_samples=200, max_gen_len=MAX_TGT_LEN):
    model.eval()
    references = []
    hypotheses = []
    total_loss = 0.0
    total_tokens = 0
    with torch.no_grad():
        for i, batch in enumerate(loader):
            src_padded, src_lens, src_mask, tgt_in_padded, tgt_out_padded, tgt_lens = batch
            src_padded = src_padded.to(device)
            src_lens = src_lens.to(device)
            tgt_in_padded = tgt_in_padded.to(device)
            tgt_out_padded = tgt_out_padded.to(device)
            outputs = model(src_padded, src_lens, tgt_in_padded, teacher_forcing_ratio=0.0)
            preds = outputs[:,1:,:]
            targets = tgt_out_padded[:, :-1]
            loss, n_tokens = compute_loss_and_tokens(preds, targets, pad_idx=TGT_PAD)
            total_loss += loss.item()
            total_tokens += n_tokens
            # sample generated outputs
            if len(hypotheses) < n_samples:
                gen = model.greedy_decode(src_padded, src_lens, max_tgt_len=max_gen_len)
                B = gen.size(0)
                for b in range(B):
                    ref_ids = tgt_out_padded[b,:tgt_lens[b]].tolist()
                    hyp_ids = gen[b].tolist()
                    ref_str = char_tok_tgt.decode(ref_ids)
                    hyp_str = char_tok_tgt.decode(hyp_ids)
                    references.append(list(ref_str))
                    hypotheses.append(list(hyp_str))
    avg_loss = total_loss / max(1, total_tokens)
    bleu_score = corpus_bleu(references, hypotheses) if len(hypotheses)>0 else 0.0
    cer_score = cer(["".join(r) for r in references], ["".join(h) for h in hypotheses]) if len(hypotheses)>0 else 1.0
    return avg_loss, bleu_score, cer_score

# teacher forcing schedule (linear decay)
tf_start = 1.0
tf_end = 0.5

os.makedirs(CHECKPOINT_DIR, exist_ok=True)
for epoch in range(1, NUM_EPOCHS+1):
    tf_ratio = tf_start + (tf_end - tf_start) * (epoch-1)/(NUM_EPOCHS-1)
    t0 = time.time()
    train_loss = train_epoch(model, train_loader, optimizer, epoch, teacher_forcing_ratio=tf_ratio, clip=1.0)
    val_loss, val_bleu, val_cer = evaluate(model, val_loader, n_samples=200, max_gen_len=MAX_TGT_LEN)
    scheduler.step(val_loss)
    print(f"Epoch {epoch} | train_loss {train_loss:.4f} | val_loss {val_loss:.4f} | val_bleu {val_bleu:.2f} | val_cer {val_cer:.4f} | tf {tf_ratio:.3f} | time {time.time()-t0:.1f}s")

    # save checkpoint if improved
    if val_bleu > best_val_bleu:
        best_val_bleu = val_bleu
        no_improve = 0
        ckpt_path = os.path.join(CHECKPOINT_DIR, f"best_model_epoch{epoch}_bleu{val_bleu:.2f}.pt")
        torch.save({
            "epoch": epoch,
            "model_state": model.state_dict(),
            "optimizer_state": optimizer.state_dict(),
            "val_bleu": val_bleu,
            "val_cer": val_cer,
            "char_src_vocab": char_tok_src.vocab,
            "char_tgt_vocab": char_tok_tgt.vocab
        }, ckpt_path)
        print("Saved checkpoint:", ckpt_path)
    else:
        no_improve += 1
    if no_improve >= patience:
        print(f"No improvement for {patience} epochs. Early stopping.")
        break


Epoch 1 | train_loss 1.9648 | val_loss 3.3297 | val_bleu 53.75 | val_cer 0.3340 | tf 1.000 | time 60.9s
Saved checkpoint: /kaggle/working/models/seq2seq_checkpoints/best_model_epoch1_bleu53.75.pt
Epoch 2 | train_loss 0.6860 | val_loss 3.4103 | val_bleu 69.85 | val_cer 0.1814 | tf 0.983 | time 60.5s
Saved checkpoint: /kaggle/working/models/seq2seq_checkpoints/best_model_epoch2_bleu69.85.pt
Epoch 3 | train_loss 0.4288 | val_loss 3.2232 | val_bleu 79.76 | val_cer 0.1206 | tf 0.966 | time 60.6s
Saved checkpoint: /kaggle/working/models/seq2seq_checkpoints/best_model_epoch3_bleu79.76.pt
Epoch 4 | train_loss 0.3231 | val_loss 3.0013 | val_bleu 85.13 | val_cer 0.0918 | tf 0.948 | time 59.7s
Saved checkpoint: /kaggle/working/models/seq2seq_checkpoints/best_model_epoch4_bleu85.13.pt
Epoch 5 | train_loss 0.2637 | val_loss 2.8016 | val_bleu 87.12 | val_cer 0.0804 | tf 0.931 | time 60.0s
Saved checkpoint: /kaggle/working/models/seq2seq_checkpoints/best_model_epoch5_bleu87.12.pt
Epoch 6 | train_loss

In [15]:
# === Rebuild model and load best checkpoint ===
import torch
import os
import glob
import torch.serialization
from numpy.core.multiarray import scalar

# Allow safe numpy unpickling (needed for old checkpoints)
torch.serialization.add_safe_globals([scalar])

# Correct paths based on your actual structure
WORKDIR = "/kaggle/working"
MODELS_DIR = os.path.join(WORKDIR, "models")
CHECKPOINT_DIR = os.path.join(MODELS_DIR, "seq2seq_checkpoints")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Find best checkpoint
ckpts = sorted(glob.glob(os.path.join(CHECKPOINT_DIR, "best_model_epoch*_bleu*.pt")))
if not ckpts:
    raise FileNotFoundError(f"No checkpoints found in {CHECKPOINT_DIR}")
best_ckpt = ckpts[-1]
print("Loading checkpoint:", best_ckpt)

# Load safely (PyTorch ≥ 2.6 fix)
checkpoint = torch.load(best_ckpt, map_location=device, weights_only=False)

# --- Rebuild model (must match training settings) ---
enc = Encoder(
    input_dim=SRC_VOCAB_SIZE, emb_dim=EMB_DIM, hid_dim=HID_DIM,
    n_layers=ENC_LAYERS, dropout=DROPOUT, pad_idx=SRC_PAD
)
dec = Decoder(
    output_dim=TGT_VOCAB_SIZE, emb_dim=EMB_DIM, enc_hid_dim=HID_DIM,
    dec_hid_dim=HID_DIM, n_layers=DEC_LAYERS, dropout=DROPOUT, pad_idx=TGT_PAD
)
model = Seq2Seq(enc, dec, enc_hid_dim=HID_DIM, dec_hid_dim=HID_DIM,
                dec_n_layers=DEC_LAYERS).to(device)

model.load_state_dict(checkpoint["model_state"])
model.eval()
print(f"✅ Model loaded successfully from {best_ckpt}")


Loading checkpoint: /kaggle/working/models/seq2seq_checkpoints/best_model_epoch9_bleu91.57.pt
✅ Model loaded successfully from /kaggle/working/models/seq2seq_checkpoints/best_model_epoch9_bleu91.57.pt


In [21]:
# ============================
# ✅ Cell 11 — Final Evaluation & Inference
# ============================

import os
import torch
import pandas as pd

# Define data + model directories
BASE_DIR = "/kaggle/working"  # adjust if different
DATA_DIR = os.path.join(BASE_DIR, "dataset")
MODELS_DIR = os.path.join(BASE_DIR, "models")
CHECKPOINT_DIR = os.path.join(MODELS_DIR, "seq2seq_checkpoints")

# Device setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# --- decoding utils ---

def greedy_decode_sentence(urdu_sentence, model, src_tok, tgt_tok, max_len=80):
    """Greedy decoding for a single Urdu sentence."""
    model.eval()
    with torch.no_grad():
        sids = src_tok.encode(urdu_sentence)
        if len(sids) == 0:
            return ""
        src = torch.tensor([sids], dtype=torch.long, device=device)
        src_len = torch.tensor([len(sids)], dtype=torch.long, device=device)

        preds = model.greedy_decode(src, src_len, max_tgt_len=max_len)
        hyp_ids = preds[0].tolist()

        # stop at EOS if exists
        if TGT_EOS in hyp_ids:
            hyp_ids = hyp_ids[:hyp_ids.index(TGT_EOS)]

        out_str = tgt_tok.decode(hyp_ids)
        return out_str.replace("</w>", "").strip()


def beam_search_decode_sentence(urdu_sentence, model, src_tok, tgt_tok,
                                beam_width=4, max_len=80):
    """Optimized beam search decoding with EOS safety."""
    model.eval()
    with torch.no_grad():
        sids = src_tok.encode(urdu_sentence)
        if len(sids) == 0:
            return ""

        src_tensor = torch.tensor([sids], dtype=torch.long, device=device)
        src_len = torch.tensor([len(sids)], dtype=torch.long, device=device)
        enc_outputs, enc_h, enc_c = model.encoder(src_tensor, src_len)
        enc_mask = (src_tensor != SRC_PAD)
        B = 1
        enc_h_flat = enc_h.permute(1, 0, 2).contiguous().view(B, -1)
        enc_c_flat = enc_c.permute(1, 0, 2).contiguous().view(B, -1)
        dec_h_flat = model.enc2dec_h(enc_h_flat)
        dec_c_flat = model.enc2dec_c(enc_c_flat)
        dec_h = dec_h_flat.view(model.dec_n_layers, B, model.dec_hid_dim).contiguous()
        dec_c = dec_c_flat.view(model.dec_n_layers, B, model.dec_hid_dim).contiguous()

        beams = [([TGT_SOS], 0.0, dec_h, dec_c)]
        completed = []

        for step in range(max_len):
            new_beams = []
            for tokens, score, h, c in beams:
                if tokens[-1] == TGT_EOS:
                    completed.append((tokens, score))
                    continue

                inp = torch.tensor([tokens[-1]], dtype=torch.long, device=device)
                logits, h2, c2, _ = model.decoder.forward_step(inp, h, c, enc_outputs, enc_mask)
                logp = F.log_softmax(logits.squeeze(0), dim=-1)
                topk_vals, topk_ids = torch.topk(logp, beam_width)

                for kid, val in zip(topk_ids.tolist(), topk_vals.tolist()):
                    new_tokens = tokens + [kid]
                    new_score = score + val
                    new_beams.append((new_tokens, new_score, h2, c2))

            if not new_beams:
                break
            beams = sorted(new_beams, key=lambda x: x[1], reverse=True)[:beam_width]

            # stop early if enough EOS found
            if len(completed) >= beam_width:
                break

        if completed:
            completed = sorted(completed, key=lambda x: x[1], reverse=True)
            best_tokens = completed[0][0]
        else:
            best_tokens = beams[0][0]

        # strip special tokens
        if TGT_EOS in best_tokens:
            best_tokens = best_tokens[:best_tokens.index(TGT_EOS)]
        if TGT_SOS in best_tokens:
            best_tokens.remove(TGT_SOS)

        out_str = tgt_tok.decode(best_tokens)
        return out_str.replace("</w>", "").strip()


# ============================
# ✅ Test on sample(s)
# ============================

test_csv_path = os.path.join(DATA_DIR, "test.csv")

if os.path.exists(test_csv_path):
    test_df = pd.read_csv(test_csv_path, encoding="utf-8")
    print(f"Loaded test dataset with {len(test_df)} samples.\n")

    n_samples = min(5, len(test_df))  # test first 5
    for i in range(n_samples):
        urdu_text = str(test_df.iloc[i]["urdu_clean"])
        ref = str(test_df.iloc[i].get("roman_clean", ""))

        start_time = time.time()
        greedy_out = greedy_decode_sentence(urdu_text, model, char_tok_src, char_tok_tgt)
        beam_out = beam_search_decode_sentence(urdu_text, model, char_tok_src, char_tok_tgt)
        t = time.time() - start_time

        print(f"\n--- SAMPLE {i+1}/{n_samples} --- ({t:.1f}s)")
        print("🟩 URDU  :", urdu_text)
        print("🟨 REF   :", ref)
        print("🟦 GREEDY:", greedy_out)
        print("🟪 BEAM  :", beam_out)
else:
    print("No test.csv found — run manual test below instead.")


# ============================
# ✅ Manual sentence test
# ============================

sample_sentence = "میرے دل کی بات سنو"
print("\n--- Manual Test ---")
print("URDU :", sample_sentence)
print("GREEDY:", greedy_decode_sentence(sample_sentence, model, char_tok_src, char_tok_tgt))
print("BEAM  :", beam_search_decode_sentence(sample_sentence, model, char_tok_src, char_tok_tgt))


No test.csv found — run manual test below instead.

--- Manual Test ---
URDU : میرے دل کی بات سنو
GREEDY: mere dil kī baat sunū
BEAM  : mere dil kī baat sunū


In [24]:
# Cell 12: utilities for loading best checkpoint and inference
import glob

def load_best_checkpoint(checkpoint_dir=CHECKPOINT_DIR):
    ckpts = sorted(glob.glob(os.path.join(checkpoint_dir, "best_model_epoch*_bleu*.pt")))
    if not ckpts:
        raise FileNotFoundError(f"No checkpoints found in {checkpoint_dir}")
    best = ckpts[-1]
    data = torch.load(best, map_location=device, weights_only=False)
    enc = Encoder(input_dim=SRC_VOCAB_SIZE, emb_dim=EMB_DIM, hid_dim=HID_DIM, n_layers=ENC_LAYERS, dropout=DROPOUT, pad_idx=SRC_PAD)
    dec = Decoder(output_dim=TGT_VOCAB_SIZE, emb_dim=EMB_DIM, enc_hid_dim=HID_DIM, dec_hid_dim=HID_DIM, n_layers=DEC_LAYERS, dropout=DROPOUT, pad_idx=TGT_PAD)
    modelx = Seq2Seq(enc, dec, enc_hid_dim=HID_DIM, dec_hid_dim=HID_DIM, dec_n_layers=DEC_LAYERS).to(device)
    modelx.load_state_dict(data["model_state"])
    modelx.eval()
    print("Loaded checkpoint:", best, "val_bleu:", data.get("val_bleu"))
    return modelx

# example:
# best_model = load_best_checkpoint()
# print(greedy_decode_sentence("گلاب", best_model, char_tok_src, char_tok_tgt))


In [29]:
# ============================
# ✅ Cell 13 — Urdu → Roman Urdu Inference
# ============================

import random
import torch

# Ensure model and tokenizers are loaded
best_model = load_best_checkpoint()
assert "char_tok_src" in locals() and "char_tok_tgt" in locals(), "Tokenizers not loaded!"

def translate_urdu(text, model=best_model, method="beam", max_len=80):
    """
    Translate Urdu → Roman Urdu using greedy or beam decoding.
    """
    text = text.strip()
    if not text:
        return ""
    if method == "beam":
        out = beam_search_decode_sentence(text, model, char_tok_src, char_tok_tgt, beam_width=4, max_len=max_len)
    else:
        out = greedy_decode_sentence(text, model, char_tok_src, char_tok_tgt, max_len=max_len)
    return out.strip()


# --- ✅ Manual single test ---
#sample_text = "میرے خواب ادھورے ہیں"
#print("🟩 Urdu :", sample_text)
#print("🟦 Greedy:", translate_urdu(sample_text, method="greedy"))
#print("🟪 Beam  :", translate_urdu(sample_text, method="beam"))


# --- ✅ Random sample tests (built-in Urdu lines) ---
sample_lines = [
    "یہ کیا بیوقوف ہے"
]

print("\n============================")
print("🌙 Random Urdu → Roman Urdu Tests")
print("============================")
for line in random.sample(sample_lines, 1):
    print(f"\n🟩 Urdu: {line}")
    print("🟦 Greedy:", translate_urdu(line, method="greedy"))
    print("🟪 Beam  :", translate_urdu(line, method="beam"))


Loaded checkpoint: /kaggle/working/models/seq2seq_checkpoints/best_model_epoch9_bleu91.57.pt val_bleu: 91.57070189244088

🌙 Random Urdu → Roman Urdu Tests

🟩 Urdu: یہ کیا بیوقوف ہے
🟦 Greedy: ye kyā bavauqūf hai
🟪 Beam  : ye kyā bavauqūf hai


In [17]:
# Cell 13: tiny augmentation helpers (duplicate with roman variants)
import itertools

def augment_roman_variants(roman_text):
    # basic, low-rate augmentations: repeated vowels, alternate translit of long vowels
    variants = set([roman_text])
    # replace ā-like sequences with aa and vice versa
    variants.add(roman_text.replace("ā","aa"))
    variants.add(roman_text.replace("aa","a"))
    # common replacements (add more as needed)
    repls = [("kh","x"), ("sh","s h"), ("oo","u"), ("ou","u")]
    for a,b in repls:
        variants.add(roman_text.replace(a,b))
    return list(variants)

# Example: augment train set by adding 1 variant per sample with low prob
aug_rows = []
for _, r in train_df.sample(frac=0.05, random_state=SEED).iterrows():  # augment 5% of data
    v = augment_roman_variants(r["roman_clean"])
    if len(v) > 1:
        aug_rows.append({"urdu": r["urdu_clean"], "roman": v[1], "urdu_clean": r["urdu_clean"], "roman_clean": v[1]})

if aug_rows:
    aug_df = pd.DataFrame(aug_rows)
    aug_df.to_csv(os.path.join(DATA_DIR, "augmented_samples.csv"), index=False)
    print("Created augmented samples:", len(aug_df))


Created augmented samples: 512


In [18]:
!zip -r /kaggle/working/working_dir.zip /kaggle/working/ -x "/kaggle/working/state.db"


  adding: kaggle/working/ (stored 0%)
  adding: kaggle/working/.virtual_documents/ (stored 0%)
  adding: kaggle/working/models/ (stored 0%)
  adding: kaggle/working/models/roman_bpe_merges.json (deflated 74%)
  adding: kaggle/working/models/urdu_bpe_vocab.json (deflated 75%)
  adding: kaggle/working/models/roman_bpe_vocab.json (deflated 71%)
  adding: kaggle/working/models/char_urdu_vocab.json (deflated 59%)
  adding: kaggle/working/models/urdu_bpe_merges.json (deflated 77%)
  adding: kaggle/working/models/char_roman_vocab.json (deflated 59%)
  adding: kaggle/working/models/seq2seq_checkpoints/ (stored 0%)
  adding: kaggle/working/models/seq2seq_checkpoints/best_model_epoch15_bleu93.79.pt (deflated 8%)
  adding: kaggle/working/models/seq2seq_checkpoints/best_model_epoch9_bleu91.57.pt (deflated 8%)
  adding: kaggle/working/models/seq2seq_checkpoints/best_model_epoch20_bleu95.24.pt (deflated 8%)
  adding: kaggle/working/models/seq2seq_checkpoints/best_model_epoch16_bleu94.28.pt (deflated