<a href="https://colab.research.google.com/github/aneelabashir786/NLP/blob/main/22f8816_22f3414_NLP_A1_7D.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Loading


In [None]:
import zipfile, os

# Unzip dataset.zip into a folder named "data/dataset"
zip_path = "dataset.zip"          # your zip file is at the root
extract_path = "data/dataset"     # create this folder

os.makedirs("data", exist_ok=True)

with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)

print("Extracted folders:", os.listdir(extract_path))


Extracted folders: ['dataset', '__MACOSX']


In [None]:
import zipfile

with zipfile.ZipFile("dataset.zip", 'r') as zip_ref:
    zip_ref.extractall("data")


# Mapping Urdu (source) -> Roman Urdu ( Target )

In [None]:
import os

def load_dataset(dataset_path="data"):
    urdu_sentences = []
    roman_urdu_sentences = []

    # Walk through poet folders
    for poet in os.listdir(dataset_path):
        poet_path = os.path.join(dataset_path, poet)
        if not os.path.isdir(poet_path):
            continue

        ur_path = os.path.join(poet_path, "ur")
        en_path = os.path.join(poet_path, "en")

        # Skip poets that don't have both ur & en
        if not (os.path.isdir(ur_path) and os.path.isdir(en_path)):
            continue

        # Match files by name
        ur_files = sorted(os.listdir(ur_path))
        en_files = sorted(os.listdir(en_path))

        # Keep only common files
        common_files = set(ur_files).intersection(set(en_files))

        for fname in common_files:
            ur_file_path = os.path.join(ur_path, fname)
            en_file_path = os.path.join(en_path, fname)

            with open(ur_file_path, "r", encoding="utf-8") as f_ur, \
                 open(en_file_path, "r", encoding="utf-8") as f_en:

                ur_lines = f_ur.readlines()
                en_lines = f_en.readlines()

                # Pair line by line
                for ur, en in zip(ur_lines, en_lines):
                    ur = ur.strip()
                    en = en.strip()
                    if ur and en:  # skip empty lines
                        urdu_sentences.append(ur)
                        roman_urdu_sentences.append(en)

    print(f"Loaded {len(urdu_sentences)} sentence pairs.")
    return urdu_sentences, roman_urdu_sentences


# Example usage
urdu, roman = load_dataset("data/dataset")
print("Sample Urdu:", urdu[:5])
print("Sample Roman Urdu:", roman[:5])


Loaded 21003 sentence pairs.
Sample Urdu: ['غمزہ نہیں ہوتا کہ اشارا نہیں ہوتا', 'آنکھ ان سے جو ملتی ہے تو کیا کیا نہیں ہوتا', 'جلوہ نہ ہو معنی کا تو صورت کا اثر کیا', 'بلبل گل تصویر کا شیدا نہیں ہوتا', 'اللہ بچائے مرض عشق سے دل کو']
Sample Roman Urdu: ['ġhamza nahīñ hotā ki ishārā nahīñ hotā', 'aañkh un se jo miltī hai to kyā kyā nahīñ hotā', 'jalva na ho ma.anī kā to sūrat kā asar kyā', 'bulbul gul-e-tasvīr kā shaidā nahīñ hotā', 'allāh bachā.e maraz-e-ishq se dil ko']


# Preprocessing

In [None]:
import re, unicodedata

# --- Put this where your cleaning funcs are (above save_corpus) ---
# Extra-strong Urdu normalization & cleaning

# 1) Canonical Unicode normalization
def _nfc(s: str) -> str:
    return unicodedata.normalize("NFC", s)

# 2) Common letter unifications (expand as needed)
_URDU_MAP = {
    # Yeh/Kaf/Heh variants
    "ي": "ی", "ى": "ی", "ئ": "ی", "یٰ": "ی",
    "ك": "ک",
    "ھ": "ہ", "ۀ": "ہ",
    "ة": "ہ",  # taa marbuta → heh (common in loanwords)
    "ؤ": "و",  # optional: hamza-on-waw → waw
    "أ": "ا", "إ": "ا", "آ": "ا",  # alef variants → alef
    "ٱ": "ا",
    "ۃ": "ہ",

    # Heh-doachashmee handling (keep as "ہ" for simplicity)
    "ہٰ": "ہ",
}

# 3) Marks to remove: tashkeel/diacritics, tatweel, etc.
_DIACRITICS_RE = re.compile(r"[\u064B-\u0652\u0670\u0653-\u065F\u06D6-\u06ED]")  # fathatan..sukun, superscript alef, Quranic marks
_TATWEEL_RE   = re.compile(r"\u0640")  # tatweel

# 4) Allowed characters: Urdu block + space
_URDU_KEEP_RE = re.compile(r"[^\u0600-\u06FF\s]")

def clean_urdu_text(text: str) -> str:
    # Canonical normalize
    text = _nfc(text)

    # Map common variants
    for src, dst in _URDU_MAP.items():
        text = text.replace(src, dst)

    # Remove diacritics & tatweel
    text = _DIACRITICS_RE.sub("", text)
    text = _TATWEEL_RE.sub("", text)

    # Keep only Urdu letters + space
    text = _URDU_KEEP_RE.sub("", text)

    # Collapse spaces
    text = re.sub(r"\s+", " ", text).strip()
    return text


In [None]:
!pip install sentencepiece



In [None]:
import sentencepiece as spm
from sklearn.model_selection import train_test_split
import torch

# Train SentencePiece Models

def train_tokenizers(vocab_size=8000):
    spm.SentencePieceTrainer.Train(
        input="all_urdu.txt",
        model_prefix="urdu_spm",
        vocab_size=vocab_size,
        character_coverage=0.9995,
        model_type="bpe",
        pad_id=0,
        bos_id=1,
        eos_id=2,
        unk_id=3
    )
    spm.SentencePieceTrainer.Train(
        input="all_roman.txt",
        model_prefix="roman_spm",
        vocab_size=vocab_size,
        character_coverage=1.0,
        model_type="bpe",
        pad_id=0,
        bos_id=1,
        eos_id=2,
        unk_id=3
    )

# Load Tokenizers

def load_tokenizers():
    urdu_sp = spm.SentencePieceProcessor()
    urdu_sp.load("urdu_spm.model")
    roman_sp = spm.SentencePieceProcessor()
    roman_sp.load("roman_spm.model")
    return urdu_sp, roman_sp



# Encode Sentences



def encode_sentences(texts, sp, max_len=50):
    bos, eos, pad = sp.bos_id(), sp.eos_id(), sp.pad_id()
    out = []
    for s in texts:
        if not isinstance(s, str):
            s = str(s)
        ids = sp.encode(s, out_type=int)          # ← returns list[int]
        ids = [bos] + ids[:max_len-2] + [eos]     # keep room for BOS/EOS
        if len(ids) < max_len:
            ids += [pad] * (max_len - len(ids))
        else:
            # ensure last token is EOS after trim
            ids[-1] = eos
        out.append(ids)
    return torch.tensor(out, dtype=torch.long)


In [None]:
# ==== MASTER SETUP (run this after any Colab restart / GPU switch) ====
!pip -q install sentencepiece

import os, re, unicodedata
import torch
from sklearn.model_selection import train_test_split
import sentencepiece as spm

# ---------- Cleaning ----------
def clean_urdu_text(text):
    text = unicodedata.normalize("NFC", text)
    repl = {
        "ي":"ی","ى":"ی","ئ":"ی","یٰ":"ی",
        "ك":"ک",
        "ھ":"ہ","ۀ":"ہ","ة":"ہ","ۃ":"ہ",
        "ؤ":"و",
        "أ":"ا","إ":"ا","آ":"ا","ٱ":"ا",
        "ہٰ":"ہ",
    }
    for s,d in repl.items(): text = text.replace(s,d)
    text = re.sub(r"[\u064B-\u0652\u0670\u0653-\u065F\u06D6-\u06ED]", "", text)  # diacritics
    text = re.sub(r"\u0640", "", text)                                            # tatweel
    text = re.sub(r"[^\u0600-\u06FF\s]", "", text)                               # Urdu-only
    return re.sub(r"\s+", " ", text).strip()

def clean_roman_text(text):
    text = text.lower()
    text = re.sub(r"[^a-z\s]", "", text)
    return re.sub(r"\s+", " ", text).strip()

def save_corpus(urdu_lines, roman_lines):
    with open("all_urdu.txt","w",encoding="utf-8") as f:
        for s in urdu_lines: f.write(clean_urdu_text(s)+"\n")
    with open("all_roman.txt","w",encoding="utf-8") as f:
        for s in roman_lines: f.write(clean_roman_text(s)+"\n")

# ---------- SentencePiece train/load ----------
def train_tokenizers(vocab_size=8000):
    def _train_with_backoff(input_path, prefix, target_vs):
        vs = target_vs
        while vs >= 2000:  # don’t go too tiny
            try:
                spm.SentencePieceTrainer.train(
                    input=input_path, model_prefix=prefix, vocab_size=vs,
                    pad_id=0, bos_id=1, eos_id=2, unk_id=3, character_coverage=1.0
                )
                print(f"[OK] Trained {prefix} with vocab_size={vs}")
                return vs
            except RuntimeError as e:
                msg = str(e)
                if "Vocabulary size too high" in msg:
                    # back off by 10% (round to int)
                    new_vs = max(int(vs * 0.9), vs - 500)
                    print(f"[Retry] {prefix}: {msg.strip()} → trying vocab_size={new_vs}")
                    vs = new_vs
                else:
                    raise
        raise RuntimeError(f"Failed to train {prefix}: corpus too small.")

    # remove existing models if partially created
    for f in ["urdu.model","urdu.vocab","roman.model","roman.vocab"]:
        if os.path.exists(f): os.remove(f)

    ur_vs = _train_with_backoff("all_urdu.txt",  "urdu",  vocab_size)
    ro_vs = _train_with_backoff("all_roman.txt", "roman", vocab_size)
    return ur_vs, ro_vs

def load_tokenizers():
    ur_sp = spm.SentencePieceProcessor(model_file="urdu.model")
    ro_sp = spm.SentencePieceProcessor(model_file="roman.model")
    return ur_sp, ro_sp

# ---------- Encoding (signature matches your current calls) ----------
# NOTE: you are calling encode_sentences( sp, texts, max_len )
import torch
def encode_sentences(sp, texts, max_len=50):
    bos, eos, pad = sp.bos_id(), sp.eos_id(), sp.pad_id()
    out = []
    for s in texts:
        s = s if isinstance(s,str) else str(s)
        ids = sp.encode(s, out_type=int)
        ids = [bos] + ids[:max_len-2] + [eos]
        if len(ids) < max_len:
            ids += [pad]*(max_len - len(ids))
        else:
            ids[-1] = eos
        out.append(ids)
    return torch.tensor(out, dtype=torch.long)

# ---------- Main preprocessing pipeline ----------
def preprocess_pipeline(urdu, roman, vocab_size=8000, max_len=50):
    # clean
    urdu_clean  = [clean_urdu_text(s) for s in urdu]
    roman_clean = [clean_roman_text(s) for s in roman]
    # save
    save_corpus(urdu_clean, roman_clean)
    # tokenizers
    train_tokenizers(vocab_size=vocab_size)
    urdu_sp, roman_sp = load_tokenizers()
    # splits (50/25/25)
    train_ur, temp_ur, train_ro, temp_ro = train_test_split(urdu_clean, roman_clean, test_size=0.5, random_state=42)
    val_ur,   test_ur,  val_ro,  test_ro = train_test_split(temp_ur,  temp_ro,  test_size=0.5, random_state=42)
    # encode
    train_X = encode_sentences(urdu_sp,  train_ur, max_len)
    train_Y = encode_sentences(roman_sp, train_ro, max_len)
    val_X   = encode_sentences(urdu_sp,  val_ur,   max_len)
    val_Y   = encode_sentences(roman_sp, val_ro,   max_len)
    test_X  = encode_sentences(urdu_sp,  test_ur,  max_len)
    test_Y  = encode_sentences(roman_sp, test_ro,  max_len)
    return (train_X, train_Y), (val_X, val_Y), (test_X, test_Y), urdu_sp, roman_sp


In [None]:
# Prereq: you must already have python lists `urdu` and `roman` in memory (aligned).
(train_X, train_Y), (val_X, val_Y), (test_X, test_Y), ur_sp, ro_sp = preprocess_pipeline(
    urdu, roman, vocab_size=8000, max_len=50
)

print("Train shapes:", train_X.shape, train_Y.shape)
print("Validation shapes:", val_X.shape, val_Y.shape)
print("Test shapes:", test_X.shape, test_Y.shape)


[Retry] urdu: Internal: src/trainer_interface.cc(664) [(trainer_spec_.vocab_size()) == (model_proto->pieces_size())] Vocabulary size too high (8000). Please set it to a value <= 5927. → trying vocab_size=7500
[Retry] urdu: Internal: src/trainer_interface.cc(664) [(trainer_spec_.vocab_size()) == (model_proto->pieces_size())] Vocabulary size too high (7500). Please set it to a value <= 5927. → trying vocab_size=7000
[Retry] urdu: Internal: src/trainer_interface.cc(664) [(trainer_spec_.vocab_size()) == (model_proto->pieces_size())] Vocabulary size too high (7000). Please set it to a value <= 5927. → trying vocab_size=6500
[Retry] urdu: Internal: src/trainer_interface.cc(664) [(trainer_spec_.vocab_size()) == (model_proto->pieces_size())] Vocabulary size too high (6500). Please set it to a value <= 5927. → trying vocab_size=6000
[Retry] urdu: Internal: src/trainer_interface.cc(664) [(trainer_spec_.vocab_size()) == (model_proto->pieces_size())] Vocabulary size too high (6000). Please set it 

In [None]:
# ---------------- Preprocessing Verification ----------------

print("=== Vocabulary & Special Tokens ===")
print("Urdu vocab size:", ur_sp.get_piece_size())
print("Roman vocab size:", ro_sp.get_piece_size())
print("Special tokens Urdu:", ur_sp.pad_id(), ur_sp.bos_id(), ur_sp.eos_id())
print("Special tokens Roman:", ro_sp.pad_id(), ro_sp.bos_id(), ro_sp.eos_id())
assert ur_sp.pad_id()==0 and ur_sp.bos_id()==1 and ur_sp.eos_id()==2
assert ro_sp.pad_id()==0 and ro_sp.bos_id()==1 and ro_sp.eos_id()==2

print("\n=== Encode / Decode Sanity ===")
urdu_sample = "مجھے اردو بہت پسند ہے"
roman_sample = "mujhe urdu bohot pasand hai"

u_ids = ur_sp.encode(urdu_sample, out_type=int)
r_ids = ro_sp.encode(roman_sample, out_type=int)
print("Urdu sample:", urdu_sample)
print("Encoded:", u_ids)
print("Decoded:", ur_sp.decode(u_ids))

print("Roman sample:", roman_sample)
print("Encoded:", r_ids)
print("Decoded:", ro_sp.decode(r_ids))

print("\n=== With BOS/EOS & Padding ===")
u_ids_pad = encode_sentences(ur_sp, [urdu_sample], max_len=12)[0]
r_ids_pad = encode_sentences(ro_sp, [roman_sample], max_len=12)[0]


# convert tensors to Python lists before decoding
u_list = u_ids_pad.tolist()
r_list = r_ids_pad.tolist()

print("Urdu padded IDs:", u_list, "→", ur_sp.decode(u_list))
print("Roman padded IDs:", r_list, "→", ro_sp.decode(r_list))

assert u_list[0] == ur_sp.bos_id() and u_list[-1] in [ur_sp.eos_id(), ur_sp.pad_id()]
assert r_list[0] == ro_sp.bos_id() and r_list[-1] in [ro_sp.eos_id(), ro_sp.pad_id()]

print("\n=== Train/Val/Test Split Check ===")
print("Train size:", len(train_X), "Val size:", len(val_X), "Test size:", len(test_X))
print("Example pair:")
print("Urdu:", ur_sp.decode(train_X[0].tolist()))
print("Roman:", ro_sp.decode(train_Y[0].tolist()))

print("\n Preprocessing verified successfully if no assertion errors.")


=== Vocabulary & Special Tokens ===
Urdu vocab size: 5500
Roman vocab size: 8000
Special tokens Urdu: 0 1 2
Special tokens Roman: 0 1 2

=== Encode / Decode Sanity ===
Urdu sample: مجھے اردو بہت پسند ہے
Encoded: [1925, 3, 7, 33, 3998, 80, 1324, 4]
Decoded: مج ⁇ ے اردو بہت پسند ہے
Roman sample: mujhe urdu bohot pasand hai
Encoded: [53, 3802, 1094, 6098, 17, 70, 2811, 4]
Decoded: mujhe urdu bohot pasand hai

=== With BOS/EOS & Padding ===
Urdu padded IDs: [1, 1925, 3, 7, 33, 3998, 80, 1324, 4, 2, 0, 0] → مج ⁇ ے اردو بہت پسند ہے
Roman padded IDs: [1, 53, 3802, 1094, 6098, 17, 70, 2811, 4, 2, 0, 0] → mujhe urdu bohot pasand hai

=== Train/Val/Test Split Check ===
Train size: 10501 Val size: 5251 Test size: 5251
Example pair:
Urdu: عالم ہے فقط مومن جانباز کی میراث
Roman: aalam hai faqat mominejbz k mrs

 Preprocessing verified successfully if no assertion errors.


# Model Archetecture


In [None]:
import torch
import torch.nn as nn

# ================ Encoder: 2-layer BiLSTM ================
class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim=256, hid_dim=512, n_layers=2, dropout=0.3, pad_idx=0):
        super().__init__()
        self.embed   = nn.Embedding(input_dim, emb_dim, padding_idx=pad_idx)
        self.lstm    = nn.LSTM(emb_dim, hid_dim, num_layers=n_layers,
                               bidirectional=True, batch_first=True, dropout=dropout)
        self.dropout = nn.Dropout(dropout)
        self.hid_dim = hid_dim
        self.n_layers = n_layers

    def forward(self, src):                         # src: (B, T)
        x = self.dropout(self.embed(src))           # (B, T, E)
        outputs, (h, c) = self.lstm(x)              # outputs: (B,T,2H)  h,c: (2*n_layers,B,H)
        return outputs, h, c


# ================ Bridge: BiLSTM → 4-layer decoder ================
class Bridge(nn.Module):
    """Concat forward/backward (2H) -> project to H, then tile to dec_layers."""
    def __init__(self, enc_layers=2, dec_layers=4, hid_dim=512):
        super().__init__()
        self.h_proj = nn.Linear(2*hid_dim, hid_dim)
        self.c_proj = nn.Linear(2*hid_dim, hid_dim)
        self.dec_layers = dec_layers

    def forward(self, h, c):                        # h,c: (2*enc_layers, B, H)
        n2, B, H = h.size()
        enc_layers = n2 // 2
        h = h.view(enc_layers, 2, B, H)
        c = c.view(enc_layers, 2, B, H)
        h = torch.cat([h[:,0], h[:,1]], dim=-1)     # (enc_layers,B,2H)
        c = torch.cat([c[:,0], c[:,1]], dim=-1)     # (enc_layers,B,2H)
        h = torch.tanh(self.h_proj(h))              # (enc_layers,B,H)
        c = torch.tanh(self.c_proj(c))              # (enc_layers,B,H)
        reps = self.dec_layers // enc_layers
        rem  = self.dec_layers %  enc_layers
        h = h.repeat_interleave(reps, dim=0)
        c = c.repeat_interleave(reps, dim=0)
        if rem:
            h = torch.cat([h, h[-1:].repeat(rem,1,1)], dim=0)
            c = torch.cat([c, c[-1:].repeat(rem,1,1)], dim=0)
        return h, c                                  # (dec_layers,B,H)


# ================ Luong (general) Attention ================
class LuongAttention(nn.Module):
    def __init__(self, hid_dim):
        super().__init__()
        self.Wa = nn.Linear(hid_dim, hid_dim, bias=False)

    def forward(self, dec_h, enc_outs, enc_mask):
        proj = self.Wa(enc_outs)                                 # (B,T,H)
        scores = torch.bmm(proj, dec_h.unsqueeze(2)).squeeze(2)  # (B,T)
        scores = scores.masked_fill(enc_mask == 0, float('-inf'))
        attn = torch.softmax(scores, dim=-1)                     # (B,T)
        ctx = torch.bmm(attn.unsqueeze(1), enc_outs).squeeze(1)  # (B,H)
        return ctx, attn


# ================ Decoder with Attention ================
class AttnDecoder(nn.Module):
    def __init__(self, output_dim, emb_dim=256, hid_dim=512, n_layers=4, dropout=0.3, pad_idx=0):
        super().__init__()
        self.embed   = nn.Embedding(output_dim, emb_dim, padding_idx=pad_idx)
        self.lstm    = nn.LSTM(emb_dim, hid_dim, num_layers=n_layers, batch_first=True, dropout=dropout)
        self.attn    = LuongAttention(hid_dim)
        self.concat  = nn.Linear(hid_dim*2, hid_dim)
        self.dropout = nn.Dropout(dropout)
        self.n_layers = n_layers
        self.hid_dim  = hid_dim
        self.output_dim = output_dim

        # weight tying
        if hid_dim == emb_dim:
            self.fc_out = nn.Linear(hid_dim, output_dim, bias=False)
            self.fc_out.weight = self.embed.weight
            self.proj = None
        else:
            self.proj  = nn.Linear(hid_dim, emb_dim, bias=False)
            self.fc_out = nn.Linear(emb_dim, output_dim, bias=False)
            self.fc_out.weight = self.embed.weight

    def forward(self, input_t, hidden, cell, enc_outs, enc_mask):
        emb = self.dropout(self.embed(input_t.unsqueeze(1)))      # (B,1,E)
        lstm_out, (hidden, cell) = self.lstm(emb, (hidden, cell)) # (B,1,H)
        h_t = lstm_out.squeeze(1)                                 # (B,H)
        ctx, _ = self.attn(h_t, enc_outs, enc_mask)               # (B,H)
        cat = torch.tanh(self.concat(torch.cat([h_t, ctx], dim=-1)))
        if self.proj is not None:
            cat = self.proj(cat)
        logits = self.fc_out(cat)                                 # (B,V)
        return logits, hidden, cell


# ================ Seq2Seq wrapper (uses Bridge + Attention) ================
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, pad_idx=0, bos_idx=1, eos_idx=2, device=None):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.pad_idx, self.bos_idx, self.eos_idx = pad_idx, bos_idx, eos_idx
        self.device = device or torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.bridge = Bridge(enc_layers=encoder.n_layers, dec_layers=decoder.n_layers, hid_dim=encoder.hid_dim)

    def make_src_mask(self, src):
        return (src != self.pad_idx).long()

    def forward(self, src, trg=None, teacher_forcing_ratio=0.5, max_len=None):
        B = src.size(0)
        T_out = (trg.size(1) if trg is not None else max_len)
        assert T_out is not None, "Provide trg or max_len"

        enc_outs2H, h, c = self.encoder(src)        # (B,T,2H)
        Bsz, Tsrc, H2 = enc_outs2H.size()
        H = H2 // 2
        enc_outs = enc_outs2H.view(Bsz, Tsrc, 2, H).sum(dim=2)    # (B,T,H)

        h, c = self.bridge(h, c)                    # (n_dec,B,H)
        src_mask = self.make_src_mask(src)          # (B,T)

        V = self.decoder.output_dim
        outputs = torch.zeros(B, T_out, V, device=self.device, dtype=torch.float)

        input_t = (trg[:, 0] if trg is not None else
                   torch.full((B,), self.bos_idx, dtype=torch.long, device=self.device))

        hidden, cell = h, c
        for t in range(1, T_out):
            logits, hidden, cell = self.decoder(input_t, hidden, cell, enc_outs, src_mask)
            outputs[:, t, :] = logits
            use_tf = (trg is not None) and (torch.rand(1).item() < teacher_forcing_ratio)
            input_t = trg[:, t] if use_tf else logits.argmax(dim=-1)
        return outputs


In [None]:
import torch.nn as nn
class LabelSmoothingCE(nn.Module):
    def __init__(self, eps=0.1, ignore_index=0):
        super().__init__()
        self.eps = eps; self.ignore_index = ignore_index
    def forward(self, logits, target):
        V = logits.size(-1)
        logp = torch.log_softmax(logits, dim=-1)
        mask = target.ne(self.ignore_index)
        if mask.sum() == 0: return logits.new_tensor(0.)
        logp = logp[mask]; target = target[mask]
        nll = -logp.gather(1, target.unsqueeze(1)).squeeze(1)
        smooth = -logp.mean(dim=1)
        return ((1 - self.eps) * nll + self.eps * smooth).mean()

@torch.no_grad()
def token_accuracy(logits, target, ignore_index=0):
    preds = logits.argmax(dim=-1)
    mask = target.ne(ignore_index)
    if mask.sum() == 0: return 0.0
    return ((preds == target) & mask).sum().float().item() / mask.sum().float().item()


# 3. Training & Hyperparameters


In [None]:
import torch.nn as nn
def train_epoch(model, loader, optimizer, criterion, teacher_forcing=0.5, clip=1.0):
    model.train(); total_loss=0.0; total_acc=0.0
    for src,trg in loader:
        src, trg = src.to(model.device), trg.to(model.device)
        optimizer.zero_grad()
        logits = model(src, trg, teacher_forcing_ratio=teacher_forcing)
        loss = criterion(logits[:,1:].reshape(-1, logits.size(-1)), trg[:,1:].reshape(-1))
        loss.backward(); nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        total_loss += loss.item()
        total_acc  += token_accuracy(logits[:,1:], trg[:,1:])
    n = max(1, len(loader))
    return total_loss/n, total_acc/n

@torch.no_grad()
def evaluate(model, loader, criterion):
    model.eval(); total_loss=0.0; total_acc=0.0
    for src,trg in loader:
        src, trg = src.to(model.device), trg.to(model.device)
        logits = model(src, trg, teacher_forcing_ratio=0.0)
        loss = criterion(logits[:,1:].reshape(-1, logits.size(-1)), trg[:,1:].reshape(-1))
        total_loss += loss.item()
        total_acc  += token_accuracy(logits[:,1:], trg[:,1:])
    n = max(1, len(loader))
    avg = total_loss/n
    ppl = float(torch.exp(torch.tensor(avg)))
    return avg, ppl, total_acc/n


In [None]:
# build dataloaders (if not built)
from torch.utils.data import DataLoader, TensorDataset
BATCH_SIZE = 64
train_dl = DataLoader(TensorDataset(train_X, train_Y), batch_size=BATCH_SIZE, shuffle=True)
val_dl   = DataLoader(TensorDataset(val_X,   val_Y),   batch_size=BATCH_SIZE)

# build model
pad, bos, eos = 0, 1, 2
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

enc = Encoder(ur_sp.get_piece_size(), emb_dim=256, hid_dim=512, n_layers=2, dropout=0.3, pad_idx=pad)
dec = AttnDecoder(ro_sp.get_piece_size(), emb_dim=256, hid_dim=512, n_layers=4, dropout=0.3, pad_idx=pad)  # <- attention decoder

model = Seq2Seq(enc, dec, pad_idx=pad, bos_idx=bos, eos_idx=eos, device=device).to(device)

print("decoder class:", type(model.decoder))  # should show AttnDecoder

# quick forward check
x = train_X[:8].to(device)
y = train_Y[:8].to(device)
with torch.no_grad():
    logits = model(x, y, teacher_forcing_ratio=0.0)
print("OK →", logits.shape)  # expect (8, T, V)


decoder class: <class '__main__.AttnDecoder'>
OK → torch.Size([8, 50, 8000])


In [None]:
import math, torch

criterion = LabelSmoothingCE(eps=0.1, ignore_index=0)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-6)

best_val = float('inf'); patience, bad = 5, 0
for epoch in range(1, 50+1):        # cap at 50; early stopping will cut it earlier
    tr_loss, tr_acc = train_epoch(model, train_dl, optimizer, criterion, teacher_forcing=0.5)
    val_loss, val_ppl, val_acc = evaluate(model, val_dl, criterion)
    print(f"Epoch {epoch:02d} | train {tr_loss:.4f} acc {tr_acc:.2%} | "
          f"val {val_loss:.4f} acc {val_acc:.2%} | ppl {val_ppl:.2f}")
    # save best
    if val_loss < best_val - 1e-4:
        best_val, bad = val_loss, 0
        torch.save(model.state_dict(), "best_attn_seq2seq.pt")
    else:
        bad += 1
        if bad >= patience:
            print("Early stopping.")
            break

# load best before test
model.load_state_dict(torch.load("best_attn_seq2seq.pt", map_location=model.device))
test_loss, test_ppl, test_acc = evaluate(model, val_dl, criterion)  # or test_dl if you kept it aside
print(f"FINAL | val loss {test_loss:.4f} acc {test_acc:.2%} ppl {test_ppl:.2f}")


Epoch 01 | train 5.7636 acc 35.54% | val 4.6664 acc 48.11% | ppl 106.31
Epoch 02 | train 4.3164 acc 51.50% | val 4.0471 acc 54.97% | ppl 57.23
Epoch 03 | train 3.7560 acc 58.34% | val 3.8843 acc 57.18% | ppl 48.63
Epoch 04 | train 3.3604 acc 63.60% | val 3.8009 acc 58.37% | ppl 44.74
Epoch 05 | train 3.0326 acc 68.56% | val 3.5740 acc 63.90% | ppl 35.66
Epoch 06 | train 2.7653 acc 72.93% | val 3.5033 acc 65.58% | ppl 33.23
Epoch 07 | train 2.5575 acc 76.08% | val 3.4414 acc 66.03% | ppl 31.23
Epoch 08 | train 2.3819 acc 78.98% | val 3.4073 acc 67.34% | ppl 30.18
Epoch 09 | train 2.2443 acc 81.27% | val 3.3817 acc 68.12% | ppl 29.42
Epoch 10 | train 2.1253 acc 83.72% | val 3.4094 acc 68.60% | ppl 30.25
Epoch 11 | train 2.0378 acc 85.45% | val 3.4640 acc 68.22% | ppl 31.94
Epoch 12 | train 1.9499 acc 87.20% | val 3.3350 acc 69.47% | ppl 28.08
Epoch 13 | train 1.8881 acc 88.66% | val 3.3579 acc 69.53% | ppl 28.73
Epoch 14 | train 1.8291 acc 90.05% | val 3.3822 acc 70.06% | ppl 29.44
Epoch

In [None]:
!pip -q install sacrebleu
import sacrebleu, torch

@torch.no_grad()
def eval_bleu(model, pairs, ur_sp, ro_sp, max_len=50):
    hyps, refs = [], []
    for ur, ro in pairs:
        hyp = greedy_decode(model, ur_sp, ro_sp, ur, max_len)
        hyps.append(hyp); refs.append([ro])
    bleu = sacrebleu.corpus_bleu(hyps, list(zip(*refs))).score
    chrf = sacrebleu.corpus_chrf(hyps, list(zip(*refs))).score
    return bleu, chrf


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
@torch.no_grad()
def beam_decode(model, ur_sp, ro_sp, src_text, max_len=50, beam_size=4):
    BOS, EOS, PAD = 1,2,0
    src = encode_sentences(ur_sp, [src_text], max_len=max_len).to(model.device)
    enc_outs2H, h, c = model.encoder(src)
    B,T,H2 = enc_outs2H.size(); H=H2//2
    enc_outs = enc_outs2H.view(B,T,2,H).sum(2)
    h,c = model.bridge(h,c)
    src_mask = (src!=PAD).long()

    beams = [(0.0, [BOS], h, c)]  # (logprob, ids, h, c)
    for _ in range(1, max_len):
        new = []
        for lp, ids, hh, cc in beams:
            if ids[-1]==EOS: new.append((lp, ids, hh, cc)); continue
            inp = torch.tensor([ids[-1]], device=model.device)
            logits, hh2, cc2 = model.decoder(inp, hh, cc, enc_outs, src_mask)
            logp = torch.log_softmax(logits, -1).squeeze(0)  # (V,)
            topk = torch.topk(logp, beam_size).indices.tolist()
            for tid in topk:
                new.append((lp+float(logp[tid]), ids+[tid], hh2, cc2))
        beams = sorted(new, key=lambda x: x[0], reverse=True)[:beam_size]
        if all(b[1][-1]==EOS for b in beams): break

    best = max(beams, key=lambda x: x[0])[1]
    toks = [t for t in best if t not in (PAD,BOS,EOS)]
    return ro_sp.decode(toks)


# Random Testing

## Exp-A

In [None]:
# retrain tokenizers
(train_X, train_Y), (val_X, val_Y), (test_X, test_Y), ur_sp, ro_sp = preprocess_pipeline(
    urdu, roman, vocab_size=8000, max_len=50   # roman vocab smaller
)

# rebuild dataloaders
train_dl = DataLoader(TensorDataset(train_X, train_Y), batch_size=64, shuffle=True)
val_dl   = DataLoader(TensorDataset(val_X,   val_Y),   batch_size=64)

# rebuild model
enc = Encoder(ur_sp.get_piece_size(), emb_dim=256, hid_dim=512, n_layers=2, dropout=0.3, pad_idx=0)
dec = AttnDecoder(ro_sp.get_piece_size(), emb_dim=256, hid_dim=512, n_layers=4, dropout=0.3, pad_idx=0)
model = Seq2Seq(enc, dec, pad_idx=0, bos_idx=1, eos_idx=2).to(device)


[Retry] urdu: Internal: src/trainer_interface.cc(664) [(trainer_spec_.vocab_size()) == (model_proto->pieces_size())] Vocabulary size too high (8000). Please set it to a value <= 5927. → trying vocab_size=7500
[Retry] urdu: Internal: src/trainer_interface.cc(664) [(trainer_spec_.vocab_size()) == (model_proto->pieces_size())] Vocabulary size too high (7500). Please set it to a value <= 5927. → trying vocab_size=7000
[Retry] urdu: Internal: src/trainer_interface.cc(664) [(trainer_spec_.vocab_size()) == (model_proto->pieces_size())] Vocabulary size too high (7000). Please set it to a value <= 5927. → trying vocab_size=6500
[Retry] urdu: Internal: src/trainer_interface.cc(664) [(trainer_spec_.vocab_size()) == (model_proto->pieces_size())] Vocabulary size too high (6500). Please set it to a value <= 5927. → trying vocab_size=6000
[Retry] urdu: Internal: src/trainer_interface.cc(664) [(trainer_spec_.vocab_size()) == (model_proto->pieces_size())] Vocabulary size too high (6000). Please set it 

## Exp-B

In [None]:
enc = Encoder(ur_sp.get_piece_size(), emb_dim=256, hid_dim=512, n_layers=2, dropout=0.2, pad_idx=0)
dec = AttnDecoder(ro_sp.get_piece_size(), emb_dim=256, hid_dim=512, n_layers=4, dropout=0.2, pad_idx=0)
model = Seq2Seq(enc, dec, pad_idx=0, bos_idx=1, eos_idx=2).to(device)

criterion = LabelSmoothingCE(eps=0.05, ignore_index=0)   # smoother eps


## Exp-C

In [None]:
enc = Encoder(ur_sp.get_piece_size(), emb_dim=512, hid_dim=512, n_layers=2, dropout=0.3, pad_idx=0)
dec = AttnDecoder(ro_sp.get_piece_size(), emb_dim=512, hid_dim=512, n_layers=4, dropout=0.3, pad_idx=0)
model = Seq2Seq(enc, dec, pad_idx=0, bos_idx=1, eos_idx=2).to(device)


In [None]:
criterion = LabelSmoothingCE(eps=0.1, ignore_index=0)  # unless changed
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

best_val = float('inf'); bad = 0; patience = 3
for epoch in range(1, 8):   # ~5–7 epochs enough for experiments
    tr_loss, tr_acc = train_epoch(model, train_dl, optimizer, criterion)
    val_loss, val_ppl, val_acc = evaluate(model, val_dl, criterion)
    print(f"Exp | Epoch {epoch:02d} | train {tr_loss:.4f} acc {tr_acc:.2%} "
          f"| val {val_loss:.4f} acc {val_acc:.2%} | ppl {val_ppl:.2f}")
    if val_loss < best_val - 1e-4:
        best_val, bad = val_loss, 0
    else:
        bad += 1
        if bad >= patience:
            print("Early stopping.")
            break


Exp | Epoch 01 | train 5.3521 acc 36.63% | val 4.3471 acc 47.16% | ppl 77.25
Exp | Epoch 02 | train 3.7539 acc 55.33% | val 3.8367 acc 52.98% | ppl 46.37
Exp | Epoch 03 | train 3.1836 acc 63.21% | val 3.5159 acc 61.38% | ppl 33.65
Exp | Epoch 04 | train 2.7742 acc 69.72% | val 3.4031 acc 64.42% | ppl 30.06
Exp | Epoch 05 | train 2.4791 acc 74.93% | val 3.3568 acc 66.67% | ppl 28.70
Exp | Epoch 06 | train 2.2654 acc 79.09% | val 3.3463 acc 67.62% | ppl 28.40
Exp | Epoch 07 | train 2.1096 acc 82.56% | val 3.2615 acc 68.77% | ppl 26.09


## Testing Section

In [None]:
import os, random, numpy as np, torch

def set_seed(seed=42):
    random.seed(seed); np.random.seed(seed); torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed); torch.backends.cudnn.deterministic = True
set_seed(42)

def cfg_name(prefix, ur_v, ro_v, emb, hid, drop, lr):
    return f"{prefix}_ur{ur_v}_ro{ro_v}_emb{emb}_hid{hid}_drop{drop}_lr{lr}.pt"


In [None]:
# greedy decode with bounds/EOS checks
@torch.no_grad()
def greedy_decode(model, ur_sp, ro_sp, src_text, max_len=50):
    model.eval()
    PAD, BOS, EOS = 0, 1, 2
    src = encode_sentences(ur_sp, [src_text], max_len=max_len).to(model.device)
    logits = model(src, trg=None, teacher_forcing_ratio=0.0, max_len=max_len)  # (1,T,V)
    ids = logits.argmax(dim=-1).squeeze(0).tolist()
    V = ro_sp.get_piece_size()
    toks = []
    for t in ids:
        if t == EOS: break
        if t in (PAD, BOS): continue
        if 0 <= t < V: toks.append(t)
    return ro_sp.decode(toks)

# simple BLEU/chrF on a small list of (src, tgt) pairs
!pip -q install sacrebleu
import sacrebleu

@torch.no_grad()
def eval_bleu(model, sample_pairs, ur_sp, ro_sp, max_len=50):
    hyps, refs = [], []
    for ur, ro in sample_pairs:
        hyp = greedy_decode(model, ur_sp, ro_sp, ur, max_len=max_len)
        hyps.append(hyp); refs.append([ro])
    bleu = sacrebleu.corpus_bleu(hyps, list(zip(*refs))).score
    chrf = sacrebleu.corpus_chrf(hyps, list(zip(*refs))).score
    return bleu, chrf


### Exp-A

In [None]:
set_seed(42)

# 1) re-run preprocessing with target vocab ≈ 8000
(train_X, train_Y), (val_X, val_Y), (test_X, test_Y), ur_sp, ro_sp = preprocess_pipeline(
    urdu, roman, vocab_size=8000, max_len=50
)

# 2) dataloaders
from torch.utils.data import DataLoader, TensorDataset
BATCH = 64
train_dl = DataLoader(TensorDataset(train_X, train_Y), batch_size=BATCH, shuffle=True)
val_dl   = DataLoader(TensorDataset(val_X,   val_Y),   batch_size=BATCH)

# 3) build model
pad, bos, eos = 0, 1, 2
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
URV, ROV = ur_sp.get_piece_size(), ro_sp.get_piece_size()
EMB, HID, DROP = 256, 512, 0.3
LR = 1e-3

enc = Encoder(URV, emb_dim=EMB, hid_dim=HID, n_layers=2, dropout=DROP, pad_idx=pad)
dec = AttnDecoder(ROV, emb_dim=EMB, hid_dim=HID, n_layers=4, dropout=DROP, pad_idx=pad)
model = Seq2Seq(enc, dec, pad_idx=pad, bos_idx=bos, eos_idx=eos, device=device).to(device)

criterion = LabelSmoothingCE(eps=0.10, ignore_index=pad)
optimizer = torch.optim.Adam(model.parameters(), lr=LR, weight_decay=1e-6)

# 4) train (short with early stop)
best, bad, patience = float('inf'), 0, 3
ckpt_A = cfg_name("best_attn_A", URV, ROV, EMB, HID, DROP, LR)
for epoch in range(1, 9):
    tr_loss, tr_acc = train_epoch(model, train_dl, optimizer, criterion, teacher_forcing=0.5)
    val_loss, val_ppl, val_acc = evaluate(model, val_dl, criterion)
    print(f"[A] e{epoch:02d} | train {tr_loss:.4f} acc {tr_acc:.2%} | val {val_loss:.4f} acc {val_acc:.2%} | ppl {val_ppl:.2f}")
    if val_loss < best - 1e-4:
        best, bad = val_loss, 0
        torch.save(model.state_dict(), ckpt_A)
    else:
        bad += 1
        if bad >= patience:
            print("[A] early stop"); break

# 5) (optional) small BLEU on 50 val pairs
pairs_A = [(ur_sp.decode(train_X[i].tolist()), ro_sp.decode(train_Y[i].tolist())) for i in range(50)]
bleu_A, chrf_A = eval_bleu(model, pairs_A, ur_sp, ro_sp, max_len=50)
print(f"[A] BLEU {bleu_A:.2f}  chrF {chrf_A:.2f}")

res_A = dict(exp="A", urv=URV, rov=ROV, emb=EMB, hid=HID, drop=DROP, lr=LR,
             best_val=best, bleu=bleu_A, chrf=chrf_A)


[Retry] urdu: Internal: src/trainer_interface.cc(664) [(trainer_spec_.vocab_size()) == (model_proto->pieces_size())] Vocabulary size too high (8000). Please set it to a value <= 5927. → trying vocab_size=7500
[Retry] urdu: Internal: src/trainer_interface.cc(664) [(trainer_spec_.vocab_size()) == (model_proto->pieces_size())] Vocabulary size too high (7500). Please set it to a value <= 5927. → trying vocab_size=7000
[Retry] urdu: Internal: src/trainer_interface.cc(664) [(trainer_spec_.vocab_size()) == (model_proto->pieces_size())] Vocabulary size too high (7000). Please set it to a value <= 5927. → trying vocab_size=6500
[Retry] urdu: Internal: src/trainer_interface.cc(664) [(trainer_spec_.vocab_size()) == (model_proto->pieces_size())] Vocabulary size too high (6500). Please set it to a value <= 5927. → trying vocab_size=6000
[Retry] urdu: Internal: src/trainer_interface.cc(664) [(trainer_spec_.vocab_size()) == (model_proto->pieces_size())] Vocabulary size too high (6000). Please set it 

### Exp-B

In [None]:
set_seed(42)

# re-use train_dl / val_dl built above (same tokenizers)
URV, ROV = ur_sp.get_piece_size(), ro_sp.get_piece_size()
EMB, HID, DROP = 256, 512, 0.2
LR = 1e-3

enc = Encoder(URV, emb_dim=EMB, hid_dim=HID, n_layers=2, dropout=DROP, pad_idx=0)
dec = AttnDecoder(ROV, emb_dim=EMB, hid_dim=HID, n_layers=4, dropout=DROP, pad_idx=0)
model = Seq2Seq(enc, dec, pad_idx=0, bos_idx=1, eos_idx=2, device=device).to(device)

criterion = LabelSmoothingCE(eps=0.05, ignore_index=0)  # milder smoothing
optimizer = torch.optim.Adam(model.parameters(), lr=LR, weight_decay=1e-6)

best, bad, patience = float('inf'), 0, 3
ckpt_B = cfg_name("best_attn_B", URV, ROV, EMB, HID, DROP, LR)
for epoch in range(1, 9):
    tr_loss, tr_acc = train_epoch(model, train_dl, optimizer, criterion, teacher_forcing=0.5)
    val_loss, val_ppl, val_acc = evaluate(model, val_dl, criterion)
    print(f"[B] e{epoch:02d} | train {tr_loss:.4f} acc {tr_acc:.2%} | val {val_loss:.4f} acc {val_acc:.2%} | ppl {val_ppl:.2f}")
    if val_loss < best - 1e-4:
        best, bad = val_loss, 0
        torch.save(model.state_dict(), ckpt_B)
    else:
        bad += 1
        if bad >= patience:
            print("[B] early stop"); break

pairs_B = [(ur_sp.decode(train_X[i].tolist()), ro_sp.decode(train_Y[i].tolist())) for i in range(50)]
bleu_B, chrf_B = eval_bleu(model, pairs_B, ur_sp, ro_sp, max_len=50)
print(f"[B] BLEU {bleu_B:.2f}  chrF {chrf_B:.2f}")

res_B = dict(exp="B", urv=URV, rov=ROV, emb=EMB, hid=HID, drop=DROP, lr=LR,
             best_val=best, bleu=bleu_B, chrf=chrf_B)


[B] e01 | train 5.3976 acc 36.57% | val 4.1496 acc 49.61% | ppl 63.41
[B] e02 | train 3.7446 acc 53.07% | val 3.8512 acc 50.99% | ppl 47.05
[B] e03 | train 3.0888 acc 60.40% | val 3.3281 acc 60.33% | ppl 27.88
[B] e04 | train 2.6026 acc 66.94% | val 3.1171 acc 62.69% | ppl 22.58
[B] e05 | train 2.2139 acc 72.65% | val 3.0836 acc 64.50% | ppl 21.84
[B] e06 | train 1.9093 acc 77.30% | val 3.1003 acc 65.41% | ppl 22.20
[B] e07 | train 1.6749 acc 80.87% | val 3.0585 acc 66.04% | ppl 21.30
[B] e08 | train 1.4694 acc 84.74% | val 3.0984 acc 66.54% | ppl 22.16
[B] BLEU 81.59  chrF 89.64


### Exp-C

In [None]:
set_seed(42)

URV, ROV = ur_sp.get_piece_size(), ro_sp.get_piece_size()
EMB, HID, DROP = 512, 512, 0.3
LR = 5e-4  # slightly lower LR for bigger emb

enc = Encoder(URV, emb_dim=EMB, hid_dim=HID, n_layers=2, dropout=DROP, pad_idx=0)
dec = AttnDecoder(ROV, emb_dim=EMB, hid_dim=HID, n_layers=4, dropout=DROP, pad_idx=0)
model = Seq2Seq(enc, dec, pad_idx=0, bos_idx=1, eos_idx=2, device=device).to(device)

criterion = LabelSmoothingCE(eps=0.10, ignore_index=0)
optimizer = torch.optim.Adam(model.parameters(), lr=LR, weight_decay=1e-6)

best, bad, patience = float('inf'), 0, 3
ckpt_C = cfg_name("best_attn_C", URV, ROV, EMB, HID, DROP, LR)
for epoch in range(1, 9):
    tr_loss, tr_acc = train_epoch(model, train_dl, optimizer, criterion, teacher_forcing=0.5)
    val_loss, val_ppl, val_acc = evaluate(model, val_dl, criterion)
    print(f"[C] e{epoch:02d} | train {tr_loss:.4f} acc {tr_acc:.2%} | val {val_loss:.4f} acc {val_acc:.2%} | ppl {val_ppl:.2f}")
    if val_loss < best - 1e-4:
        best, bad = val_loss, 0
        torch.save(model.state_dict(), ckpt_C)
    else:
        bad += 1
        if bad >= patience:
            print("[C] early stop"); break

pairs_C = [(ur_sp.decode(train_X[i].tolist()), ro_sp.decode(train_Y[i].tolist())) for i in range(50)]
bleu_C, chrf_C = eval_bleu(model, pairs_C, ur_sp, ro_sp, max_len=50)
print(f"[C] BLEU {bleu_C:.2f}  chrF {chrf_C:.2f}")

res_C = dict(exp="C", urv=URV, rov=ROV, emb=EMB, hid=HID, drop=DROP, lr=LR,
             best_val=best, bleu=bleu_C, chrf=chrf_C)


[C] e01 | train 5.5824 acc 35.37% | val 4.4607 acc 48.34% | ppl 86.55
[C] e02 | train 3.9710 acc 53.70% | val 3.8195 acc 56.79% | ppl 45.58
[C] e03 | train 3.4276 acc 59.92% | val 3.5926 acc 59.09% | ppl 36.33
[C] e04 | train 3.0499 acc 65.15% | val 3.5070 acc 62.26% | ppl 33.35
[C] e05 | train 2.7586 acc 69.74% | val 3.3704 acc 64.37% | ppl 29.09
[C] e06 | train 2.5397 acc 73.29% | val 3.3972 acc 64.73% | ppl 29.88
[C] e07 | train 2.3426 acc 77.14% | val 3.3181 acc 66.81% | ppl 27.61
[C] e08 | train 2.1844 acc 80.38% | val 3.3695 acc 66.64% | ppl 29.06
[C] BLEU 75.80  chrF 86.36


# Final Results

In [None]:
from pandas import DataFrame
results = [res_A, res_B, res_C]
df = DataFrame(results, columns=["exp","urv","rov","emb","hid","drop","lr","best_val","bleu","chrf"])
display(df)
print("\nMarkdown table for your report:\n")
print(df.to_markdown(index=False))


Unnamed: 0,exp,urv,rov,emb,hid,drop,lr,best_val,bleu,chrf
0,A,5500,8000,256,512,0.3,0.001,3.429504,74.186658,86.25848
1,B,5500,8000,256,512,0.2,0.001,3.058476,81.588795,89.642438
2,C,5500,8000,512,512,0.3,0.0005,3.318058,75.795069,86.356333



Markdown table for your report:

| exp   |   urv |   rov |   emb |   hid |   drop |     lr |   best_val |    bleu |    chrf |
|:------|------:|------:|------:|------:|-------:|-------:|-----------:|--------:|--------:|
| A     |  5500 |  8000 |   256 |   512 |    0.3 | 0.001  |    3.4295  | 74.1867 | 86.2585 |
| B     |  5500 |  8000 |   256 |   512 |    0.2 | 0.001  |    3.05848 | 81.5888 | 89.6424 |
| C     |  5500 |  8000 |   512 |   512 |    0.3 | 0.0005 |    3.31806 | 75.7951 | 86.3563 |


In [38]:
samples = [
    # "مجھے اردو بہت پسند ہے"
   "وقت کا کیا ہے گزرتا ہے گزر جائے گا"
    # "ہر مشکل کے بعد آسانی ضرور آتی ہے۔"

]

for s in samples:
    hyp_greedy = greedy_decode(model, ur_sp, ro_sp, s, max_len=50)
    print(f"SRC: {s}\nROMAN: {hyp_greedy}\n")


SRC: وقت کا کیا ہے گزرتا ہے گزر جائے گا
ROMAN: vaqt k ky hai guzart hai guzar jgg

