### Install Libraries

In [3]:
!pip install -q gradio nltk rouge-score sentencepiece

### Import Libraries

In [4]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F

import pandas as pd
import numpy as np
import unicodedata
import re
import random
import math
import os
import time
from tqdm import tqdm

import sentencepiece as spm

import nltk
from nltk.translate.bleu_score import corpus_bleu
from rouge_score import rouge_scorer
from nltk.translate.chrf_score import corpus_chrf
nltk.download('punkt') 

import gradio as gr

SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# Check device
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f" Libraries imported. Using device: {DEVICE}")

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


 Libraries imported. Using device: cuda


### Load & Inspect Data

In [5]:
import pandas as pd

DATASET_PATH = "/kaggle/input/urdu-dataset-20000/final_main_dataset.tsv"

try:
    df_raw = pd.read_csv(DATASET_PATH, sep='\t', on_bad_lines='skip')
    
    if 'sentence' not in df_raw.columns:
        print("Error: 'sentence' column not found!")
    else:
        df = df_raw[['sentence']].copy()
        
        df = df.dropna(subset=['sentence'])
        df = df.drop_duplicates(subset=['sentence'])
        
        print(f"Data shape after cleaning (rows, cols): {df.shape}")
        print("--- Random Sample (Raw) ---")
        print(df.sample(3))

except Exception as e:
    print(f"Error loading dataset: {e}")

Data shape after cleaning (rows, cols): (10699, 1)
--- Random Sample (Raw) ---
                                               sentence
3704       بچوں پر تعلیم کے حوالے سے مضراثرات مرتب ہوئے
6427           اب میں مزیداِس بات کو برداشت نہیں کرسکتا
304   جب کوئی اختلافی صورت نمودار ہوتی، خلیفہ کے سام...


### Preprocessing (Cleaning Function)

In [6]:
import unicodedata
import re

def normalize_urdu_text(text: str) -> str:
    """
    Normalize Urdu text:
    1. Remove diacritics (zer, zabar, pesh).
    2. Standardize Alef and Yeh forms.
    3. Keep only Urdu characters, digits, and basic punctuation.
    4. Collapse multiple spaces.
    """
    if not isinstance(text, str):
        return ""
    
    text = ''.join(c for c in unicodedata.normalize('NFD', text) 
                   if unicodedata.category(c) != 'Mn')
    
    text = re.sub(r'[آأإ]', 'ا', text) # Standardize Alef
    text = re.sub(r'[ى]', 'ی', text)  # Standardize Yeh
    
    text = re.sub(r'[^\u0600-\u06FF\s\d\.!؟،]', '', text)
    
    text = re.sub(r'\s+', ' ', text).strip()
    
    return text

test_sentence = "یہ ایک ٹیسٹ ہے، اِس میں 123 نمبرز بھی ہیں!"
print(f"Original:  {test_sentence}")
print(f"Cleaned:   {normalize_urdu_text(test_sentence)}")
print("\n Normalization function created.")

Original:  یہ ایک ٹیسٹ ہے، اِس میں 123 نمبرز بھی ہیں!
Cleaned:   یہ ایک ٹیسٹ ہے، اس میں 123 نمبرز بھی ہیں!

 Normalization function created.


### Apply Cleaning & Create (Input, Response) Pairs

In [10]:
print(f"Applying normalization to {len(df)} sentences...")
df['cleaned_sentence'] = df['sentence'].apply(normalize_urdu_text)

inputs = []
responses = []

for text in tqdm(df['cleaned_sentence'], desc="Creating Pairs"):
    words = text.split()
    if len(words) >= 2:
        mid_point = len(words) // 2
        
        # Handle 1-word-each case
        if mid_point == 0: mid_point = 1 
            
        input_text = ' '.join(words[:mid_point])
        response_text = ' '.join(words[mid_point:])
        
        # Ensure neither side is empty
        if input_text and response_text:
            inputs.append(input_text)
            responses.append(response_text)

data_pairs = pd.DataFrame({
    'input': inputs,
    'response': responses
})
data_pairs = data_pairs.dropna()

print(f"\nTotal conversational pairs created: {len(data_pairs)}")
print("--- Sample Pairs (Split Logic) ---")
print(data_pairs.sample(3))
df = data_pairs 

Applying normalization to 10699 sentences...


Creating Pairs: 100%|██████████| 10699/10699 [00:00<00:00, 491827.78it/s]


Total conversational pairs created: 10519
--- Sample Pairs (Split Logic) ---
                  input            response
1056           ٹیسٹ میں   بیکٹیریا نہیں تھا
2982  اج موقف ایک ہی ہے  جو نواز شریف کا ہے
4795     عمرا ن خان میں  بہت سی خوبیاں ہوں۔





### Split Data (Train, Val, Test)

In [11]:
from sklearn.model_selection import train_test_split

# We will use the 'input' and 'response' columns
data_pairs = df[['input', 'response']]

train_df, temp_df = train_test_split(
    data_pairs,
    test_size=0.2, # 20% for validation and test
    random_state=SEED # Ensures reproducible split
)

val_df, test_df = train_test_split(
    temp_df,
    test_size=0.5, # 0.5 * 0.2 = 0.1 (10% of total)
    random_state=SEED # Ensures reproducible split
)

# Reset index just for clean looks (optional)
train_df = train_df.reset_index(drop=True)
val_df = val_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

print(f"Data splitting complete:")
print(f"  Train set:    {len(train_df)} pairs")
print(f"  Validation set: {len(val_df)} pairs")
print(f"  Test set:     {len(test_df)} pairs")

Data splitting complete:
  Train set:    8415 pairs
  Validation set: 1052 pairs
  Test set:     1052 pairs


### Create Corpus for Tokenizer

In [12]:

print("Creating corpus file for tokenizer...")

# Define the corpus file path
corpus_path = 'urdu_corpus.txt'

# Open the file in write mode (encoding='utf-8')
with open(corpus_path, 'w', encoding='utf-8') as f:
    # Add 'input' sentences
    for text in train_df['input']:
        f.write(text + '\n')
    # Add 'response' sentences
    for text in train_df['response']:
        f.write(text + '\n')

print(f" Corpus file created at: {corpus_path}")

Creating corpus file for tokenizer...
 Corpus file created at: urdu_corpus.txt


### Train SentencePiece Tokenizer

In [13]:
import sentencepiece as spm

print("Training SentencePiece tokenizer...")

# Define paths and parameters
corpus = 'urdu_corpus.txt'
model_prefix = 'urdu_spm' # Will create urdu_spm.model and urdu_spm.vocab


vocab_size = 6300 # Set to a value 

# Define special token IDs
PAD_ID = 0
UNK_ID = 1
SOS_ID = 2 # Start of Sentence
EOS_ID = 3 # End of Sentence

# Train the model
spm.SentencePieceTrainer.Train(
    input=corpus,
    model_prefix=model_prefix,
    vocab_size=vocab_size,
    model_type='unigram',
    character_coverage=1.0,
    pad_id=PAD_ID,
    unk_id=UNK_ID,
    bos_id=SOS_ID,
    eos_id=EOS_ID,
    pad_piece='<pad>',
    unk_piece='<unk>',
    bos_piece='<sos>',
    eos_piece='<eos>'
)

print(f" Tokenizer training complete. Vocab size: {vocab_size}")
print(f"Files created: {model_prefix}.model, {model_prefix}.vocab")

Training SentencePiece tokenizer...
 Tokenizer training complete. Vocab size: 6300
Files created: urdu_spm.model, urdu_spm.vocab


sentencepiece_trainer.cc(78) LOG(INFO) Starts training with : 
trainer_spec {
  input: urdu_corpus.txt
  input_format: 
  model_prefix: urdu_spm
  model_type: UNIGRAM
  vocab_size: 6300
  self_test_sample_size: 0
  character_coverage: 1
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  pretokenization_delimiter: 
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  seed_sentencepieces_file: 
  hard_vocab_limit: 1
  use_all_vocab: 0
  unk_id: 1
  bos_id: 2
  eos_id: 3
  pad_id: 0
  unk_piece: <unk>
  bos_piece: <sos>
  eos_piece: <eos>
  pad_piece: <pad>
  unk_surface:  ⁇ 
  enable_differential_privacy: 0
  diffe

### Load and Test Tokenizer

In [14]:
import sentencepiece as spm

# Load the trained model
sp = spm.SentencePieceProcessor(model_file='urdu_spm.model')

# --- Test ---
print(f"Vocabulary size: {sp.vocab_size()}")

# Check special token IDs
print(f"PAD ID: {sp.pad_id()}")
print(f"SOS ID: {sp.bos_id()}") # bos = <sos>
print(f"EOS ID: {sp.eos_id()}") # eos = <eos>
print(f"UNK ID: {sp.unk_id()}")

# Test encoding and decoding
test_sentence = "یہ ایک ٹیسٹ ہے"
print(f"\nOriginal: '{test_sentence}'")

encoded_pieces = sp.encode(test_sentence, out_type=str)
print(f"Encoded (pieces): {encoded_pieces}")

encoded_ids = sp.encode(test_sentence, out_type=int)
print(f"Encoded (IDs): {encoded_ids}")

decoded_text = sp.decode(encoded_ids)
print(f"Decoded: '{decoded_text}'")

print(f"\n Tokenizer loaded and tested.")

Vocabulary size: 6300
PAD ID: 0
SOS ID: 2
EOS ID: 3
UNK ID: 1

Original: 'یہ ایک ٹیسٹ ہے'
Encoded (pieces): ['▁یہ', '▁ایک', '▁ٹیسٹ', '▁ہے']
Encoded (IDs): [18, 28, 284, 8]
Decoded: 'یہ ایک ٹیسٹ ہے'

 Tokenizer loaded and tested.


### Create PyTorch Dataset Class

In [15]:
from torch.utils.data import Dataset, DataLoader
import random

# --- Constants ---
MAX_LEN = 60
VOCAB_SIZE = sp.vocab_size()
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
PAD_ID, UNK_ID, SOS_ID, EOS_ID = sp.pad_id(), sp.unk_id(), sp.bos_id(), sp.eos_id()
MASK_TOKEN = UNK_ID # Use <unk> (ID 1) as mask token

def corrupt_tokens(token_ids, mask_prob=0.15, max_span_len=3):
    corrupted_ids = list(token_ids)
    n_tokens = len(corrupted_ids)
    num_to_mask = int(n_tokens * mask_prob)
    masked_indices = set()
    if n_tokens < 2: return corrupted_ids # Cannot corrupt
    
    while len(masked_indices) < num_to_mask:
        span_len = random.randint(1, max_span_len)
        start_idx = random.randint(0, n_tokens - span_len)
        for i in range(span_len):
            idx = start_idx + i
            if idx < n_tokens and idx not in masked_indices:
                corrupted_ids[idx] = MASK_TOKEN
                masked_indices.add(idx)
    return corrupted_ids

class UrduChatDataset(Dataset):
    def __init__(self, dataframe, sp_tokenizer, max_len=MAX_LEN):
        self.dataframe = dataframe
        self.sp = sp_tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.dataframe)

    def tokenize_and_pad(self, text, max_len):
        token_ids = self.sp.encode(text, out_type=int)
        token_ids = token_ids[:max_len - 2] # Truncate
        final_ids = [SOS_ID] + token_ids + [EOS_ID]
        pad_len = max_len - len(final_ids)
        return torch.tensor(final_ids + ([PAD_ID] * pad_len), dtype=torch.long)

    def __getitem__(self, idx):
        input_text = self.dataframe.loc[idx, 'input']
        response_text = self.dataframe.loc[idx, 'response']
        
        tgt_tensor = self.tokenize_and_pad(response_text, self.max_len)
        
        src_token_ids = self.sp.encode(input_text, out_type=int)
        src_token_ids = src_token_ids[:self.max_len - 2]
        
        corrupted_ids = corrupt_tokens(src_token_ids) # Apply corruption
        
        final_src_ids = [SOS_ID] + corrupted_ids + [EOS_ID]
        pad_len = self.max_len - len(final_src_ids)
        src_tensor = torch.tensor(final_src_ids + ([PAD_ID] * pad_len), dtype=torch.long)
        
        return src_tensor, tgt_tensor

# --- Test the Dataset class ---
print("Testing Dataset class (Split + Corruption)...")
test_data = pd.DataFrame({'input': ['کیا ہم یہ'], 'response': ['کر سکتے ہیں؟']})
test_dataset = UrduChatDataset(test_data, sp, max_len=20)
src, tgt = test_dataset[0]

print(f"Source Text (Input): 'کیا ہم یہ'")
print(f"Target Text (Resp): 'کر سکتے ہیں؟'")
print(f"Source Tensor (Corrupted):\n {src}")
print(f"Target Tensor (Clean):\n {tgt}")
print(f"\n Dataset class (Split + Corruption) created and tested.")

Testing Dataset class (Split + Corruption)...
Source Text (Input): 'کیا ہم یہ'
Target Text (Resp): 'کر سکتے ہیں؟'
Source Tensor (Corrupted):
 tensor([ 2, 29, 44, 18,  3,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0])
Target Tensor (Clean):
 tensor([  2,  27, 250,  25,  36,   3,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0])

 Dataset class (Split + Corruption) created and tested.


### Create DataLoaders

In [16]:
from torch.utils.data import DataLoader

BATCH_SIZE = 30 # As suggested in the assignment

train_dataset = UrduChatDataset(train_df, sp, MAX_LEN)
val_dataset = UrduChatDataset(val_df, sp, MAX_LEN)
test_dataset = UrduChatDataset(test_df, sp, MAX_LEN)

# We shuffle the training data to improve model generalization
train_loader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True
)

# No need to shuffle validation or test data
val_loader = DataLoader(
    val_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False
)

test_loader = DataLoader(
    test_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False
)

print(f" DataLoaders created successfully.")
print(f"Total training batches: {len(train_loader)}")
print(f"Total validation batches: {len(val_loader)}")

# --- Test one batch ---
print("\nTesting one batch from train_loader...")
src_batch, tgt_batch = next(iter(train_loader))
print(f"Source batch shape: {src_batch.shape}") # [BATCH_SIZE, MAX_LEN]
print(f"Target batch shape: {tgt_batch.shape}") # [BATCH_SIZE, MAX_LEN]

 DataLoaders created successfully.
Total training batches: 281
Total validation batches: 36

Testing one batch from train_loader...
Source batch shape: torch.Size([30, 60])
Target batch shape: torch.Size([30, 60])


### Model Hyperparameters & Positional Encoding

In [17]:
import torch.nn as nn
import math

EMBED_DIM = 256
NUM_HEADS = 2
ENCODER_LAYERS = 2
DECODER_LAYERS = 2
FFN_DIM = 1024
DROPOUT = 0.1 # Start with 0.1

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=MAX_LEN):
        super(PositionalEncoding, self).__init__()
        pe = torch.zeros(max_len, d_model); position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1); div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)); pe[:, 0::2] = torch.sin(position * div_term); pe[:, 1::2] = torch.cos(position * div_term); pe = pe.unsqueeze(0); self.register_buffer('pe', pe)
    def forward(self, x):
        return x + self.pe[:, :x.size(1), :]
        
print(f" PositionalEncoding created. Model size reset to: 256 Dim, 2 Layers.")

 PositionalEncoding created. Model size reset to: 256 Dim, 2 Layers.


### Multi-Head Attention Class

In [18]:
# Cell 13: Multi-Head Attention Class
import torch.nn.functional as F

class MultiHeadAttention(nn.Module):
    """
    Implements the Multi-Head Attention mechanism from scratch.
    """
    def __init__(self, d_model, num_heads):
        super(MultiHeadAttention, self).__init__()
        
        # d_model must be divisible by num_heads
        assert d_model % num_heads == 0, "d_model must be divisible by num_heads"
        
        self.d_model = d_model       # Total embedding dimension (e.g., 256)
        self.num_heads = num_heads   # Number of heads (e.g., 2)
        self.d_k = d_model // num_heads # Dimension of each head (e.g., 256 // 2 = 128)
        
        # Linear layers for Query, Key, Value, and the final output
        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)
        self.W_o = nn.Linear(d_model, d_model)
        
    def scaled_dot_product_attention(self, Q, K, V, mask=None):
        # Q, K, V shape: [batch_size, num_heads, seq_len, d_k]
        
        # scores shape: [batch_size, num_heads, seq_len, seq_len]
        scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.d_k)
        
        if mask is not None:
            # Fill with a very small number where mask is 0
            scores = scores.masked_fill(mask == 0, -1e9)
            
        # attn shape: [batch_size, num_heads, seq_len, seq_len]
        attn = F.softmax(scores, dim=-1)
        
        # context shape: [batch_size, num_heads, seq_len, d_k]
        context = torch.matmul(attn, V)
        
        return context

    def forward(self, Q, K, V, mask=None):
        # Q, K, V input shape: [batch_size, seq_len, d_model]
        batch_size = Q.size(0)
        
        Q = self.W_q(Q)
        K = self.W_k(K)
        V = self.W_v(V)
        
        # [batch_size, seq_len, d_model] -> [batch_size, num_heads, seq_len, d_k]
        Q = Q.view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
        K = K.view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
        V = V.view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
        
        context = self.scaled_dot_product_attention(Q, K, V, mask)
        
        # [batch_size, num_heads, seq_len, d_k] -> [batch_size, seq_len, d_model]
        context = context.transpose(1, 2).contiguous().view(batch_size, -1, self.d_model)
        
        output = self.W_o(context)
        
        # output shape: [batch_size, seq_len, d_model]
        return output

# --- Test ---
print(f"Testing MultiHeadAttention...")
mha_layer = MultiHeadAttention(d_model=EMBED_DIM, num_heads=NUM_HEADS).to(DEVICE)
test_tensor = torch.rand(BATCH_SIZE, MAX_LEN, EMBED_DIM).to(DEVICE) # Dummy batch

# In self-attention (like in Encoder), Q, K, and V are the same tensor
output_tensor = mha_layer(test_tensor, test_tensor, test_tensor, mask=None)

print(f"Input shape:  {test_tensor.shape}")
print(f"Output shape: {output_tensor.shape}")
print(f" MultiHeadAttention class created and tested.")

Testing MultiHeadAttention...
Input shape:  torch.Size([30, 60, 256])
Output shape: torch.Size([30, 60, 256])
 MultiHeadAttention class created and tested.


### Feed-Forward Network Class

In [19]:
class FeedForward(nn.Module):

    def __init__(self, d_model, d_ff, dropout=DROPOUT):
        super(FeedForward, self).__init__()
        
        # FFN consists of two linear layers with a ReLU activation
        self.linear_1 = nn.Linear(d_model, d_ff) # e.g., 256 -> 1024
        self.dropout = nn.Dropout(dropout)
        self.linear_2 = nn.Linear(d_ff, d_model) # e.g., 1024 -> 256

    def forward(self, x):
        # x shape: [batch_size, seq_len, d_model]
        
        x = self.linear_1(x)
        x = F.relu(x)
        x = self.dropout(x)
        x = self.linear_2(x)
        
        # output shape: [batch_size, seq_len, d_model]
        return x

# --- Test ---
print(f"Testing FeedForward...")
ffn_layer = FeedForward(d_model=EMBED_DIM, d_ff=FFN_DIM).to(DEVICE)
test_tensor = torch.rand(BATCH_SIZE, MAX_LEN, EMBED_DIM).to(DEVICE) # Dummy batch

output_tensor = ffn_layer(test_tensor)

print(f"Input shape:  {test_tensor.shape}")
print(f"Output shape: {output_tensor.shape}")
print(f" FeedForward class created and tested.")

Testing FeedForward...
Input shape:  torch.Size([30, 60, 256])
Output shape: torch.Size([30, 60, 256])
 FeedForward class created and tested.


### Encoder Layer Class

In [20]:
class EncoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout=DROPOUT):
        super(EncoderLayer, self).__init__()
        
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.norm1 = nn.LayerNorm(d_model) # Layer normalization
        
        self.feed_forward = FeedForward(d_model, d_ff, dropout)
        self.norm2 = nn.LayerNorm(d_model) # Layer normalization
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x, mask):
        # Note: Q, K, and V are all 'x' in self-attention
        attn_output = self.self_attn(x, x, x, mask)
        # Add & Norm: Add the original input (residual)
        x = self.norm1(x + self.dropout(attn_output))
        
        ff_output = self.feed_forward(x)
        # Add & Norm
        x = self.norm2(x + self.dropout(ff_output))
        
        # output shape: [batch_size, seq_len, d_model]
        return x

# --- Test ---
print(f"Testing EncoderLayer...")
encoder_layer = EncoderLayer(EMBED_DIM, NUM_HEADS, FFN_DIM).to(DEVICE)
test_tensor = torch.rand(BATCH_SIZE, MAX_LEN, EMBED_DIM).to(DEVICE)


test_mask = (torch.ones(BATCH_SIZE, MAX_LEN) > 0.5).unsqueeze(1).unsqueeze(2).to(DEVICE) # Dummy mask

output_tensor = encoder_layer(test_tensor, test_mask)

print(f"Input shape:  {test_tensor.shape}")
print(f"Mask shape:   {test_mask.shape}")
print(f"Output shape: {output_tensor.shape}")
print(f" EncoderLayer class created and tested.")

Testing EncoderLayer...
Input shape:  torch.Size([30, 60, 256])
Mask shape:   torch.Size([30, 1, 1, 60])
Output shape: torch.Size([30, 60, 256])
 EncoderLayer class created and tested.


### Decoder Layer Class

In [21]:

class DecoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout=DROPOUT):
        super(DecoderLayer, self).__init__()
        
        # Sub-layer 1: Masked Multi-Head Attention
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.norm1 = nn.LayerNorm(d_model)
        
        # Sub-layer 2: Cross-Attention (with Encoder output)
        self.cross_attn = MultiHeadAttention(d_model, num_heads)
        self.norm2 = nn.LayerNorm(d_model)
        
        # Sub-layer 3: Feed-Forward Network
        self.feed_forward = FeedForward(d_model, d_ff, dropout)
        self.norm3 = nn.LayerNorm(d_model)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x, enc_output, src_mask, tgt_mask):

        # Q, K, V are all 'x' (decoder input)
        attn_output = self.self_attn(x, x, x, tgt_mask)
        x = self.norm1(x + self.dropout(attn_output))
        
        # Q is from decoder ('x'), K and V are from encoder ('enc_output')
        attn_output = self.cross_attn(x, enc_output, enc_output, src_mask)
        x = self.norm2(x + self.dropout(attn_output))
        
        ff_output = self.feed_forward(x)
        x = self.norm3(x + self.dropout(ff_output))
        
        # output shape: [batch_size, tgt_seq_len, d_model]
        return x

# --- Test ---
print(f"Testing DecoderLayer...")
decoder_layer = DecoderLayer(EMBED_DIM, NUM_HEADS, FFN_DIM).to(DEVICE)

# Dummy tensors
test_tgt_tensor = torch.rand(BATCH_SIZE, MAX_LEN, EMBED_DIM).to(DEVICE) # Decoder input
test_enc_output = torch.rand(BATCH_SIZE, MAX_LEN, EMBED_DIM).to(DEVICE) # Encoder output

# Dummy masks
test_src_mask = (torch.ones(BATCH_SIZE, MAX_LEN) > 0.5).unsqueeze(1).unsqueeze(2).to(DEVICE)
# Decoder target mask (look-ahead)
test_tgt_sub_mask = torch.tril(torch.ones(MAX_LEN, MAX_LEN)).bool().to(DEVICE)
test_tgt_mask = test_src_mask & test_tgt_sub_mask # Combine padding and look-ahead

output_tensor = decoder_layer(test_tgt_tensor, test_enc_output, test_src_mask, test_tgt_mask)

print(f"Decoder Input shape: {test_tgt_tensor.shape}")
print(f"Encoder Output shape: {test_enc_output.shape}")
print(f"Source Mask shape:   {test_src_mask.shape}")
print(f"Target Mask shape:   {test_tgt_mask.shape}")
print(f"Final Output shape:  {output_tensor.shape}")
print(f" DecoderLayer class created and tested.")

Testing DecoderLayer...
Decoder Input shape: torch.Size([30, 60, 256])
Encoder Output shape: torch.Size([30, 60, 256])
Source Mask shape:   torch.Size([30, 1, 1, 60])
Target Mask shape:   torch.Size([30, 1, 60, 60])
Final Output shape:  torch.Size([30, 60, 256])
 DecoderLayer class created and tested.


### Transformer Model (Main Class)

In [22]:

class Transformer(nn.Module):
    def __init__(self, 
                 src_vocab_size, 
                 tgt_vocab_size, 
                 d_model, 
                 num_heads, 
                 num_encoder_layers, 
                 num_decoder_layers, 
                 d_ff, 
                 max_len, 
                 dropout=DROPOUT,
                 pad_idx=PAD_ID):
        
        super(Transformer, self).__init__()
        
        # --- Embeddings ---
        self.src_embedding = nn.Embedding(src_vocab_size, d_model, padding_idx=pad_idx)
        self.tgt_embedding = nn.Embedding(tgt_vocab_size, d_model, padding_idx=pad_idx)
        self.pos_encoding = PositionalEncoding(d_model, max_len)
        
        # --- Encoder Stack ---
        self.encoder_layers = nn.ModuleList([
            EncoderLayer(d_model, num_heads, d_ff, dropout) 
            for _ in range(num_encoder_layers)
        ])
        
        # --- Decoder Stack ---
        self.decoder_layers = nn.ModuleList([
            DecoderLayer(d_model, num_heads, d_ff, dropout)
            for _ in range(num_decoder_layers)
        ])
        
        # Projects decoder output (d_model) to vocabulary size
        self.fc_out = nn.Linear(d_model, tgt_vocab_size)
        
        self.dropout = nn.Dropout(dropout)
        self.pad_idx = pad_idx

    def _generate_mask(self, src, tgt):
        # src shape: [batch_size, src_seq_len]
        # tgt shape: [batch_size, tgt_seq_len]
        # 1. Source Padding Mask
        # [batch_size, 1, 1, src_seq_len]
        src_mask = (src != self.pad_idx).unsqueeze(1).unsqueeze(2)

        # [batch_size, 1, tgt_seq_len, 1]
        tgt_pad_mask = (tgt != self.pad_idx).unsqueeze(1).unsqueeze(3)
        
        tgt_seq_len = tgt.size(1)
        # [1, 1, tgt_seq_len, tgt_seq_len]
        tgt_sub_mask = torch.tril(torch.ones((tgt_seq_len, tgt_seq_len), device=DEVICE)).bool()
        
        # [batch_size, 1, tgt_seq_len, tgt_seq_len]
        tgt_mask = tgt_pad_mask & tgt_sub_mask
        
        return src_mask, tgt_mask

    def encode(self, src, src_mask):
        src_emb = self.dropout(self.pos_encoding(self.src_embedding(src)))
        
        enc_output = src_emb
        for layer in self.encoder_layers:
            enc_output = layer(enc_output, src_mask)
            
        return enc_output

    def decode(self, tgt, enc_output, src_mask, tgt_mask):
        tgt_emb = self.dropout(self.pos_encoding(self.tgt_embedding(tgt)))
        
        dec_output = tgt_emb
        for layer in self.decoder_layers:
            dec_output = layer(dec_output, enc_output, src_mask, tgt_mask)
            
        return dec_output

    def forward(self, src, tgt):
        # src: [batch_size, src_seq_len]
        # tgt: [batch_size, tgt_seq_len]
        
        src_mask, tgt_mask = self._generate_mask(src, tgt)
        
        # enc_output: [batch_size, src_seq_len, d_model]
        enc_output = self.encode(src, src_mask)
        
        # dec_output: [batch_size, tgt_seq_len, d_model]
        dec_output = self.decode(tgt, enc_output, src_mask, tgt_mask)
        
        # output: [batch_size, tgt_seq_len, tgt_vocab_size]
        output = self.fc_out(dec_output)
        
        return output

# --- Test ---
print(f"Testing Transformer model...")
# Instantiate the model
transformer_model = Transformer(
    src_vocab_size=VOCAB_SIZE,
    tgt_vocab_size=VOCAB_SIZE,
    d_model=EMBED_DIM,
    num_heads=NUM_HEADS,
    num_encoder_layers=ENCODER_LAYERS,
    num_decoder_layers=DECODER_LAYERS,
    d_ff=FFN_DIM,
    max_len=MAX_LEN,
    dropout=DROPOUT,
    pad_idx=PAD_ID
).to(DEVICE)

# Get a test batch from our DataLoader
src_batch, tgt_batch = next(iter(train_loader))
src_batch, tgt_batch = src_batch.to(DEVICE), tgt_batch.to(DEVICE)


output_logits = transformer_model(src_batch, tgt_batch[:, :-1])

print(f"Source batch shape:  {src_batch.shape}")
print(f"Target batch shape (input): {tgt_batch[:, :-1].shape}")
print(f"Model output shape:  {output_logits.shape}")
print(f"  (Expected: [Batch, SeqLen-1, VocabSize])")
print(f"  (Actual:   [{output_logits.shape[0]}, {output_logits.shape[1]}, {output_logits.shape[2]}])")

print(f"\n Transformer class created and tested.")

Testing Transformer model...
Source batch shape:  torch.Size([30, 60])
Target batch shape (input): torch.Size([30, 59])
Model output shape:  torch.Size([30, 59, 6300])
  (Expected: [Batch, SeqLen-1, VocabSize])
  (Actual:   [30, 59, 6300])

 Transformer class created and tested.


### Initialize Model, Loss, and Optimizer

In [23]:
model = Transformer(
    src_vocab_size=VOCAB_SIZE, tgt_vocab_size=VOCAB_SIZE,
    d_model=EMBED_DIM, num_heads=NUM_HEADS,
    num_encoder_layers=ENCODER_LAYERS, num_decoder_layers=DECODER_LAYERS,
    d_ff=FFN_DIM, max_len=MAX_LEN, dropout=DROPOUT, pad_idx=PAD_ID
).to(DEVICE)

criterion = nn.CrossEntropyLoss(ignore_index=PAD_ID)
LEARNING_RATE = 1e-4 # (Aap ke dost wala LR)
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

def count_parameters(model): return sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f" LARGER Model (Dropout={DROPOUT}), Criterion, Optimizer initialized.")
print(f"   Model has {count_parameters(model):,} parameters.")

 LARGER Model (Dropout=0.1), Criterion, Optimizer initialized.
   Model has 8,531,100 parameters.


### Training Loop Function (One Epoch)

In [24]:
import time

def train_epoch(model, dataloader, optimizer, criterion, device, clip=1):
    """
    Trains the model for one epoch.
    """
    # Set the model to training mode (enables dropout)
    model.train()
    
    epoch_loss = 0.0
    
    # Use tqdm for a nice progress bar
    for src, tgt in tqdm(dataloader, desc="Training Epoch"):
        src = src.to(device)
        tgt = tgt.to(device)
        
        optimizer.zero_grad()
        
        # We feed the decoder all tokens except the last one
        # e.g., [<sos>, "token1", "token2"]
        output = model(src, tgt[:, :-1])
        
        # We compare the output against all tokens except the first one
        # e.g., ["token1", "token2", <eos>]
        
        # Reshape for CrossEntropyLoss
        # Output: [Batch * SeqLen-1, VocabSize]
        # Target: [Batch * SeqLen-1]
        output_dim = output.shape[-1]
        output_reshaped = output.contiguous().view(-1, output_dim)
        tgt_reshaped = tgt[:, 1:].contiguous().view(-1)
        
        loss = criterion(output_reshaped, tgt_reshaped)
        
        loss.backward()
        
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        
        optimizer.step()
        
        epoch_loss += loss.item()
        
    # Return the average loss for this epoch
    return epoch_loss / len(dataloader)

print(" train_epoch function defined.")

 train_epoch function defined.


### Evaluation Loop Function (One Epoch)

In [25]:
from nltk.translate.bleu_score import corpus_bleu

def evaluate_bleu(model, dataloader, tokenizer, device):
    """
    Evaluates the model on the validation dataset and returns the BLEU score.
    """
    model.eval() # Set model to evaluation mode
    
    all_references = [] # Ground truth
    all_hypotheses = [] # Predictions
    
    # We don't need to track gradients
    with torch.no_grad():
        for src, tgt in tqdm(dataloader, desc="Calculating Val BLEU"):
            src = src.to(device)
            tgt = tgt.to(device)
            
            # --- Perform REAL Inference (Greedy Search) ---
            # This is slow, but it's the CORRECT way to evaluate
            
            # 1. Run Encoder (once per batch)
            src_mask = (src != PAD_ID).unsqueeze(1).unsqueeze(2)
            enc_output = model.encode(src, src_mask)
            
            # 2. Run Decoder (token by token for each item in batch)
            batch_size = src.shape[0]
            
            # Start decoder input with <sos> for all items in batch
            tgt_tokens = torch.full((batch_size, 1), SOS_ID, dtype=torch.long, device=device)

            for _ in range(MAX_LEN - 1): # Loop up to max length
                tgt_len = tgt_tokens.shape[1]
                
                # Create decoder look-ahead mask
                tgt_sub_mask = torch.tril(torch.ones((tgt_len, tgt_len), device=device)).bool()
                tgt_mask = tgt_sub_mask.unsqueeze(0).unsqueeze(1) # [1, 1, len, len]
                
                # Decoder pass
                dec_output = model.decode(tgt_tokens, enc_output, src_mask, tgt_mask)
                
                # Get logits for the *last* token
                last_token_logits = model.fc_out(dec_output[:, -1, :])
                
                # Get the predicted token (Greedy)
                pred_token = last_token_logits.argmax(dim=-1).unsqueeze(1) # [B, 1]
                
                # Append the new token to our running target
                tgt_tokens = torch.cat((tgt_tokens, pred_token), dim=1)
            
            # --- End of Greedy Search ---
            
            # 'tgt_tokens' now contains the full predicted sentences
            # 'tgt' contains the ground truth
            
            # Decode predictions
            hyps = decode_batch(tgt_tokens, tokenizer) # decode_batch is from Cell 23
            all_hypotheses.extend(hyps)
            
            # Decode references (ground truth)
            refs = decode_batch(tgt[:, 1:], tokenizer) # Use tgt[:, 1:] (skip <sos>)
            all_references.extend([[r] for r in refs])

    # --- Calculate Final BLEU Score ---
    bleu_score = corpus_bleu(all_references, all_hypotheses)
    
    return bleu_score

# --- Test the new evaluate function ---
# (Assuming 'model' is loaded from Cell 18)
# (Assuming 'decode_batch' is defined in Cell 23... let's define it here just in case)

def decode_batch(batch_tensor, tokenizer):
    """Helper function to decode tensors (if not defined in Cell 23 yet)."""
    text_list = []
    for tensor in batch_tensor:
        ids = tensor.cpu().numpy()
        ids = [int(id) for id in ids if id not in (PAD_ID, SOS_ID, EOS_ID)]
        text = tokenizer.decode(ids)
        text_list.append(text)
    return text_list

print(" 'evaluate_bleu' function defined.")

 'evaluate_bleu' function defined.


### 2) Run the Main Training Loop

## 1) Load Best Model and Set to Eval Mode

In [26]:
N_EPOCHS = 30 
best_val_bleu = 0.0 
MODEL_SAVE_PATH = 'best_model.pth'

print(f"Starting training for {N_EPOCHS} epochs on {DEVICE}...")

for epoch in range(N_EPOCHS):
    
    start_time = time.time()
    
    # --- Training (calculates loss) ---
    train_loss = train_epoch(model, train_loader, optimizer, criterion, DEVICE, clip=1)
    
    # --- Evaluation (calculates BLEU) ---
    # We use the new function
    val_bleu = evaluate_bleu(model, val_loader, sp, DEVICE)
    
    end_time = time.time()
    
    epoch_mins = int((end_time - start_time) / 60)
    epoch_secs = int((end_time - start_time) % 60)
    
    # Calculate perplexity (PPL) just for printing
    train_ppl = math.exp(train_loss)
    
    print(f"\n--- Epoch {epoch+1:02} / {N_EPOCHS} | Time: {epoch_mins}m {epoch_secs}s ---")
    print(f"\tTrain Loss: {train_loss:.3f} | Train PPL: {train_ppl:7.3f}")
    # We now print Validation BLEU
    print(f"\tVal. BLEU:  {val_bleu:.4f}")
    
    # --- Save the best model (based on BLEU) ---
    if val_bleu > best_val_bleu:
        best_val_bleu = val_bleu
        torch.save(model.state_dict(), MODEL_SAVE_PATH)
        print(f"\t✨ New best model saved! (Val. BLEU: {best_val_bleu:.4f})")

print(f"\n Training complete. Best model (BLEU: {best_val_bleu:.4f}) saved to {MODEL_SAVE_PATH}")

Starting training for 30 epochs on cuda...


Training Epoch: 100%|██████████| 281/281 [00:08<00:00, 33.10it/s]
Calculating Val BLEU: 100%|██████████| 36/36 [00:06<00:00,  5.45it/s]



--- Epoch 01 / 30 | Time: 0m 15s ---
	Train Loss: 6.285 | Train PPL: 536.486
	Val. BLEU:  0.0629
	✨ New best model saved! (Val. BLEU: 0.0629)


Training Epoch: 100%|██████████| 281/281 [00:08<00:00, 34.84it/s]
Calculating Val BLEU: 100%|██████████| 36/36 [00:06<00:00,  5.60it/s]



--- Epoch 02 / 30 | Time: 0m 14s ---
	Train Loss: 5.634 | Train PPL: 279.800
	Val. BLEU:  0.0864
	✨ New best model saved! (Val. BLEU: 0.0864)


Training Epoch: 100%|██████████| 281/281 [00:08<00:00, 33.71it/s]
Calculating Val BLEU: 100%|██████████| 36/36 [00:06<00:00,  5.46it/s]



--- Epoch 03 / 30 | Time: 0m 15s ---
	Train Loss: 5.438 | Train PPL: 229.947
	Val. BLEU:  0.0986
	✨ New best model saved! (Val. BLEU: 0.0986)


Training Epoch: 100%|██████████| 281/281 [00:08<00:00, 33.67it/s]
Calculating Val BLEU: 100%|██████████| 36/36 [00:06<00:00,  5.62it/s]



--- Epoch 04 / 30 | Time: 0m 14s ---
	Train Loss: 5.266 | Train PPL: 193.734
	Val. BLEU:  0.0986
	✨ New best model saved! (Val. BLEU: 0.0986)


Training Epoch: 100%|██████████| 281/281 [00:08<00:00, 34.68it/s]
Calculating Val BLEU: 100%|██████████| 36/36 [00:06<00:00,  5.65it/s]



--- Epoch 05 / 30 | Time: 0m 14s ---
	Train Loss: 5.103 | Train PPL: 164.460
	Val. BLEU:  0.0932


Training Epoch: 100%|██████████| 281/281 [00:08<00:00, 35.12it/s]
Calculating Val BLEU: 100%|██████████| 36/36 [00:06<00:00,  5.69it/s]



--- Epoch 06 / 30 | Time: 0m 14s ---
	Train Loss: 4.948 | Train PPL: 140.867
	Val. BLEU:  0.1057
	✨ New best model saved! (Val. BLEU: 0.1057)


Training Epoch: 100%|██████████| 281/281 [00:08<00:00, 34.87it/s]
Calculating Val BLEU: 100%|██████████| 36/36 [00:06<00:00,  5.65it/s]



--- Epoch 07 / 30 | Time: 0m 14s ---
	Train Loss: 4.803 | Train PPL: 121.844
	Val. BLEU:  0.1123
	✨ New best model saved! (Val. BLEU: 0.1123)


Training Epoch: 100%|██████████| 281/281 [00:08<00:00, 34.56it/s]
Calculating Val BLEU: 100%|██████████| 36/36 [00:06<00:00,  5.64it/s]



--- Epoch 08 / 30 | Time: 0m 14s ---
	Train Loss: 4.664 | Train PPL: 106.065
	Val. BLEU:  0.1065


Training Epoch: 100%|██████████| 281/281 [00:08<00:00, 34.46it/s]
Calculating Val BLEU: 100%|██████████| 36/36 [00:06<00:00,  5.62it/s]



--- Epoch 09 / 30 | Time: 0m 14s ---
	Train Loss: 4.531 | Train PPL:  92.894
	Val. BLEU:  0.1076


Training Epoch: 100%|██████████| 281/281 [00:08<00:00, 34.52it/s]
Calculating Val BLEU: 100%|██████████| 36/36 [00:06<00:00,  5.63it/s]



--- Epoch 10 / 30 | Time: 0m 14s ---
	Train Loss: 4.399 | Train PPL:  81.394
	Val. BLEU:  0.1079


Training Epoch: 100%|██████████| 281/281 [00:08<00:00, 34.68it/s]
Calculating Val BLEU: 100%|██████████| 36/36 [00:06<00:00,  5.61it/s]



--- Epoch 11 / 30 | Time: 0m 14s ---
	Train Loss: 4.266 | Train PPL:  71.264
	Val. BLEU:  0.1189
	✨ New best model saved! (Val. BLEU: 0.1189)


Training Epoch: 100%|██████████| 281/281 [00:08<00:00, 34.71it/s]
Calculating Val BLEU: 100%|██████████| 36/36 [00:06<00:00,  5.69it/s]



--- Epoch 12 / 30 | Time: 0m 14s ---
	Train Loss: 4.142 | Train PPL:  62.955
	Val. BLEU:  0.1027


Training Epoch: 100%|██████████| 281/281 [00:08<00:00, 34.78it/s]
Calculating Val BLEU: 100%|██████████| 36/36 [00:06<00:00,  5.69it/s]



--- Epoch 13 / 30 | Time: 0m 14s ---
	Train Loss: 4.026 | Train PPL:  56.059
	Val. BLEU:  0.1103


Training Epoch: 100%|██████████| 281/281 [00:08<00:00, 34.79it/s]
Calculating Val BLEU: 100%|██████████| 36/36 [00:06<00:00,  5.67it/s]



--- Epoch 14 / 30 | Time: 0m 14s ---
	Train Loss: 3.904 | Train PPL:  49.579
	Val. BLEU:  0.1238
	✨ New best model saved! (Val. BLEU: 0.1238)


Training Epoch: 100%|██████████| 281/281 [00:08<00:00, 34.71it/s]
Calculating Val BLEU: 100%|██████████| 36/36 [00:06<00:00,  5.66it/s]



--- Epoch 15 / 30 | Time: 0m 14s ---
	Train Loss: 3.782 | Train PPL:  43.909
	Val. BLEU:  0.1171


Training Epoch: 100%|██████████| 281/281 [00:08<00:00, 34.71it/s]
Calculating Val BLEU: 100%|██████████| 36/36 [00:06<00:00,  5.68it/s]



--- Epoch 16 / 30 | Time: 0m 14s ---
	Train Loss: 3.662 | Train PPL:  38.932
	Val. BLEU:  0.1149


Training Epoch: 100%|██████████| 281/281 [00:08<00:00, 34.58it/s]
Calculating Val BLEU: 100%|██████████| 36/36 [00:06<00:00,  5.68it/s]



--- Epoch 17 / 30 | Time: 0m 14s ---
	Train Loss: 3.549 | Train PPL:  34.792
	Val. BLEU:  0.1163


Training Epoch: 100%|██████████| 281/281 [00:08<00:00, 34.69it/s]
Calculating Val BLEU: 100%|██████████| 36/36 [00:06<00:00,  5.57it/s]



--- Epoch 18 / 30 | Time: 0m 14s ---
	Train Loss: 3.433 | Train PPL:  30.969
	Val. BLEU:  0.1138


Training Epoch: 100%|██████████| 281/281 [00:08<00:00, 34.55it/s]
Calculating Val BLEU: 100%|██████████| 36/36 [00:06<00:00,  5.66it/s]



--- Epoch 19 / 30 | Time: 0m 14s ---
	Train Loss: 3.318 | Train PPL:  27.610
	Val. BLEU:  0.1165


Training Epoch: 100%|██████████| 281/281 [00:08<00:00, 34.62it/s]
Calculating Val BLEU: 100%|██████████| 36/36 [00:06<00:00,  5.60it/s]



--- Epoch 20 / 30 | Time: 0m 14s ---
	Train Loss: 3.207 | Train PPL:  24.697
	Val. BLEU:  0.1169


Training Epoch: 100%|██████████| 281/281 [00:08<00:00, 34.44it/s]
Calculating Val BLEU: 100%|██████████| 36/36 [00:06<00:00,  5.62it/s]



--- Epoch 21 / 30 | Time: 0m 14s ---
	Train Loss: 3.097 | Train PPL:  22.121
	Val. BLEU:  0.1158


Training Epoch: 100%|██████████| 281/281 [00:08<00:00, 34.21it/s]
Calculating Val BLEU: 100%|██████████| 36/36 [00:06<00:00,  5.61it/s]



--- Epoch 22 / 30 | Time: 0m 14s ---
	Train Loss: 2.997 | Train PPL:  20.022
	Val. BLEU:  0.1105


Training Epoch: 100%|██████████| 281/281 [00:08<00:00, 34.59it/s]
Calculating Val BLEU: 100%|██████████| 36/36 [00:06<00:00,  5.61it/s]



--- Epoch 23 / 30 | Time: 0m 14s ---
	Train Loss: 2.877 | Train PPL:  17.765
	Val. BLEU:  0.1186


Training Epoch: 100%|██████████| 281/281 [00:08<00:00, 34.53it/s]
Calculating Val BLEU: 100%|██████████| 36/36 [00:06<00:00,  5.58it/s]



--- Epoch 24 / 30 | Time: 0m 14s ---
	Train Loss: 2.776 | Train PPL:  16.049
	Val. BLEU:  0.1093


Training Epoch: 100%|██████████| 281/281 [00:08<00:00, 34.59it/s]
Calculating Val BLEU: 100%|██████████| 36/36 [00:06<00:00,  5.64it/s]



--- Epoch 25 / 30 | Time: 0m 14s ---
	Train Loss: 2.671 | Train PPL:  14.451
	Val. BLEU:  0.1196


Training Epoch: 100%|██████████| 281/281 [00:08<00:00, 34.69it/s]
Calculating Val BLEU: 100%|██████████| 36/36 [00:06<00:00,  5.69it/s]



--- Epoch 26 / 30 | Time: 0m 14s ---
	Train Loss: 2.563 | Train PPL:  12.976
	Val. BLEU:  0.1207


Training Epoch: 100%|██████████| 281/281 [00:08<00:00, 34.66it/s]
Calculating Val BLEU: 100%|██████████| 36/36 [00:06<00:00,  5.66it/s]



--- Epoch 27 / 30 | Time: 0m 14s ---
	Train Loss: 2.461 | Train PPL:  11.713
	Val. BLEU:  0.1247
	✨ New best model saved! (Val. BLEU: 0.1247)


Training Epoch: 100%|██████████| 281/281 [00:08<00:00, 34.61it/s]
Calculating Val BLEU: 100%|██████████| 36/36 [00:06<00:00,  5.67it/s]



--- Epoch 28 / 30 | Time: 0m 14s ---
	Train Loss: 2.360 | Train PPL:  10.590
	Val. BLEU:  0.1172


Training Epoch: 100%|██████████| 281/281 [00:08<00:00, 34.67it/s]
Calculating Val BLEU: 100%|██████████| 36/36 [00:06<00:00,  5.57it/s]



--- Epoch 29 / 30 | Time: 0m 14s ---
	Train Loss: 2.256 | Train PPL:   9.545
	Val. BLEU:  0.1232


Training Epoch: 100%|██████████| 281/281 [00:08<00:00, 34.67it/s]
Calculating Val BLEU: 100%|██████████| 36/36 [00:06<00:00,  5.69it/s]



--- Epoch 30 / 30 | Time: 0m 14s ---
	Train Loss: 2.166 | Train PPL:   8.721
	Val. BLEU:  0.1205

 Training complete. Best model (BLEU: 0.1247) saved to best_model.pth


In [27]:
model = Transformer(
    src_vocab_size=VOCAB_SIZE,
    tgt_vocab_size=VOCAB_SIZE,
    d_model=EMBED_DIM,
    num_heads=NUM_HEADS,
    num_encoder_layers=ENCODER_LAYERS,
    num_decoder_layers=DECODER_LAYERS,
    d_ff=FFN_DIM,
    max_len=MAX_LEN,
    dropout=DROPOUT,
    pad_idx=PAD_ID
).to(DEVICE)

MODEL_SAVE_PATH = 'best_model.pth'
model.load_state_dict(torch.load(MODEL_SAVE_PATH))

model.eval()

print(f" Best model loaded from '{MODEL_SAVE_PATH}' and set to eval mode.")

 Best model loaded from 'best_model.pth' and set to eval mode.


### 3) Calculate Final Evaluation Metrics

In [28]:
from nltk.translate.bleu_score import corpus_bleu
from nltk.translate.chrf_score import corpus_chrf
from rouge_score import rouge_scorer
import numpy as np

def decode_batch(batch_tensor, tokenizer):
    text_list = []
    for tensor in batch_tensor:
        ids = tensor.cpu().numpy()
        ids = [int(id) for id in ids if id not in (PAD_ID, SOS_ID, EOS_ID)]
        text = tokenizer.decode(ids)
        text_list.append(text)
    return text_list

def calculate_metrics_real(model, dataloader, criterion, tokenizer, device):
    
    model.eval()
    total_loss = 0.0
    all_references = []
    all_hypotheses = []
    scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=False)
    
    with torch.no_grad():
        for src, tgt in tqdm(dataloader, desc="Calculating Final Metrics"):
            src = src.to(device)
            tgt = tgt.to(device)
            
            output_loss = model(src, tgt[:, :-1])
            output_dim = output_loss.shape[-1]
            output_reshaped = output_loss.contiguous().view(-1, output_dim)
            tgt_reshaped = tgt[:, 1:].contiguous().view(-1)
            loss = criterion(output_reshaped, tgt_reshaped)
            total_loss += loss.item()
            
            # (This is the same logic from our new Cell 20)
            src_mask = (src != PAD_ID).unsqueeze(1).unsqueeze(2)
            enc_output = model.encode(src, src_mask)
            batch_size = src.shape[0]
            tgt_tokens = torch.full((batch_size, 1), SOS_ID, dtype=torch.long, device=device)

            for _ in range(MAX_LEN - 1):
                tgt_len = tgt_tokens.shape[1]
                tgt_sub_mask = torch.tril(torch.ones((tgt_len, tgt_len), device=device)).bool()
                tgt_mask = tgt_sub_mask.unsqueeze(0).unsqueeze(1)
                dec_output = model.decode(tgt_tokens, enc_output, src_mask, tgt_mask)
                last_token_logits = model.fc_out(dec_output[:, -1, :])
                pred_token = last_token_logits.argmax(dim=-1).unsqueeze(1)
                tgt_tokens = torch.cat((tgt_tokens, pred_token), dim=1)
            
            # Decode results
            hyps = decode_batch(tgt_tokens, tokenizer)
            refs = decode_batch(tgt[:, 1:], tokenizer)
            
            # ROUGE-L Fix: Only add non-empty pairs
            for r, h in zip(refs, hyps):
                if r.strip() and h.strip():
                    all_references.append([r])
                    all_hypotheses.append(h)

    avg_loss = total_loss / len(dataloader)
    perplexity = math.exp(avg_loss)
    
    if not all_hypotheses or not all_references:
        print("Warning: No valid metrics found.")
        return {'Perplexity': perplexity, 'BLEU': 0.0, 'ROUGE-L': 0.0, 'chrF': 0.0}

    bleu_score = corpus_bleu(all_references, all_hypotheses)
    rouge_l_scores = []
    for ref_list, hyp in zip(all_references, all_hypotheses):
        score = scorer.score(ref_list[0], hyp)
        rouge_l_scores.append(score['rougeL'].fmeasure)
    rouge_l_avg = np.mean(rouge_l_scores) if rouge_l_scores else 0.0
    chrf_score = corpus_chrf([ref[0] for ref in all_references], all_hypotheses)
    
    return {
        'Perplexity': perplexity,
        'BLEU': bleu_score,
        'ROUGE-L': rouge_l_avg,
        'chrF': chrf_score
    }

print("Calculating evaluation metrics on the test set...")
# (Assuming 'model' is loaded from Cell 22)
metrics = calculate_metrics_real(model, test_loader, criterion, sp, DEVICE)

print("\n--- Evaluation Results (Test Set) ---")
print(f"  Perplexity: {metrics['Perplexity']:.3f}")
print(f"  BLEU:       {metrics['BLEU']:.4f}")
print(f"  ROUGE-L:    {metrics['ROUGE-L']:.4f}")
print(f"  chrF:       {metrics['chrF']:.4f}")
print("---------------------------------------")

Calculating evaluation metrics on the test set...


Calculating Final Metrics: 100%|██████████| 36/36 [00:06<00:00,  5.43it/s]



--- Evaluation Results (Test Set) ---
  Perplexity: 274.712
  BLEU:       0.1192
  ROUGE-L:    0.0000
  chrF:       0.1057
---------------------------------------


### Inference Function (Greedy Search)

In [29]:
def predict_sentence(sentence, model, sp_tokenizer, device, max_len=MAX_LEN):

    
    model.eval()
    
    normalized_sentence = normalize_urdu_text(sentence)
    token_ids = sp_tokenizer.encode(normalized_sentence, out_type=int)
    
    src_tokens = [SOS_ID] + token_ids[:max_len - 2] + [EOS_ID]
    
    pad_len = max_len - len(src_tokens)
    src_tokens += [PAD_ID] * pad_len
    
    src_tensor = torch.tensor([src_tokens], dtype=torch.long).to(device)
    
    # --- Encoder Pass ---
    with torch.no_grad():
        src_mask = (src_tensor != PAD_ID).unsqueeze(1).unsqueeze(2)
        
        # enc_output shape: [1, src_seq_len, d_model]
        enc_output = model.encode(src_tensor, src_mask)
        
    # --- Decoder Loop (Greedy Search) ---
    
    # tgt_tokens shape: [1, 1]
    tgt_tokens = torch.tensor([[SOS_ID]], dtype=torch.long).to(device)
    
    # Loop for a maximum of max_len steps
    for i in range(max_len):
        with torch.no_grad():
            tgt_len = tgt_tokens.shape[1]
            # [tgt_len, tgt_len]
            tgt_sub_mask = torch.tril(torch.ones((tgt_len, tgt_len), device=device)).bool()
            # We don't need a padding mask for the target during inference
            tgt_mask = tgt_sub_mask.unsqueeze(0).unsqueeze(1) # [1, 1, tgt_len, tgt_len]

            # dec_output shape: [1, tgt_len, d_model]
            dec_output = model.decode(tgt_tokens, enc_output, src_mask, tgt_mask)
            
            # 11. Get logits for the *very ast* token
            # last_token_logits shape: [1, d_model]
            last_token_logits = model.fc_out(dec_output[:, -1, :])
            
            # pred_token shape: [1]
            pred_token = last_token_logits.argmax(dim=-1)
            
            # This will be used as input in the next loop iteration
            # tgt_tokens shape: [1, tgt_len + 1]
            tgt_tokens = torch.cat((tgt_tokens, pred_token.unsqueeze(0)), dim=1)
            
            if pred_token.item() == EOS_ID:
                break

    output_ids = tgt_tokens.squeeze(0).cpu().numpy()[1:] # Remove <sos>
    # Filter out <eos> before decoding
    output_text = sp_tokenizer.decode([int(id) for id in output_ids if id != EOS_ID])
    
    return output_text

# --- Test the inference function ---
print("Testing inference function...")
# We use a sentence from our validation set for a fair test
test_input_sentence = val_df.iloc[10]['input']
predicted_response = predict_sentence(test_input_sentence, model, sp, DEVICE)

print(f"Input:    '{test_input_sentence}'")
print(f"Response: '{predicted_response}'")
print(f"Expected: '{val_df.iloc[10]['response']}'")
print("\n Inference function created and tested.")

Testing inference function...
Input:    'کہ نبی صلی اللہ'
Response: 'وسلم مسجد میں بھی ہیں۔'
Expected: 'علیہ وسلم نے فرمایا'

 Inference function created and tested.


### Gradio UI Interface

In [30]:
import gradio as gr
import time

print("Launching Improved Gradio Interface...")

def chatbot_response(input_text, history):
    """
    Main function called by Gradio.
    'history' is a list of [user_msg, bot_msg] pairs.
    """
    if not input_text:
        history.append((None, "برائے مہربانی، کچھ لکھ کر بھیجیں۔"))
        return "", history # Return empty string, updated history
    
    try:
        # Use the inference function
        response = predict_sentence(input_text, model, sp, DEVICE)
        
        # Add the new [user_msg, bot_msg] pair to the history
        history.append((input_text, response))
        
        # Return empty string to clear the textbox, and the updated history
        return "", history
    
    except Exception as e:
        history.append((input_text, f"ایک خرابی پیش آ گئی: {str(e)}"))
        return "", history

def clear_chat():
    """Clears the chat history."""
    return None 

css = """
.gradio-container { direction: rtl; }
textarea[data-testid="textbox"] { text-align: right; }
"""

with gr.Blocks(theme=gr.themes.Soft(), css=css) as demo:
    gr.Markdown("# 🤖 اردو ٹرانسفارmer چیٹ بوٹ (From Scratch)")
    gr.Markdown("یہ چیٹ بوٹ PyTorch Transformer (Encoder-Decoder) کا استعمال کرتے ہوئے بنایا گیا ہے۔")
    
    with gr.Row():
        # Chatbot component for history
        chatbot = gr.Chatbot(label="گفتگو", height=400, rtl=True, show_label=False)
        
    with gr.Row():
        # Textbox for user input
        txt_input = gr.Textbox(
            label="آپ کا پیغام",
            placeholder="آپ کا پیغام یہاں لکھیں...",
            lines=2,
            rtl=True,
            scale=4 # Make textbox bigger
        )
        
        # Submit button
        btn_submit = gr.Button("بھیجیں", variant="primary", scale=1) # "Send" button
        
    with gr.Row():
        # 'Clear' button
        btn_clear = gr.Button("گفتگو صاف کریں", variant="secondary") # "Clear Chat" button
        
    
    # Function to handle submission
    def submit_message(input_text, history):
        return chatbot_response(input_text, history)

    txt_input.submit(
        fn=submit_message,
        inputs=[txt_input, chatbot],
        outputs=[txt_input, chatbot]
    )
    
    btn_submit.click(
        fn=submit_message,
        inputs=[txt_input, chatbot],
        outputs=[txt_input, chatbot]
    )
    
    btn_clear.click(
        fn=clear_chat,
        inputs=None,
        outputs=[chatbot]
    )
    
demo.launch(share=True, debug=True)

Launching Improved Gradio Interface...


  chatbot = gr.Chatbot(label="گفتگو", height=400, rtl=True, show_label=False)


* Running on local URL:  http://127.0.0.1:7860
* Running on public URL: https://29a55c0201745833d7.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://29a55c0201745833d7.gradio.live




### Qualitative Testing

In [35]:

print("--- Final Qualitative Testing ---")

model.eval()

test_sentences_good = [
    "کیا ہم یہ",         # Aap ka test case
    "یہ ایک",             # Bohat aam jumla
    "پاکستان کا سب سے بڑا", # Dekhte hain kya kehta hai
    "وہ سکول",
    "رات کے وقت",
    "اس نے مجھے"
]

test_sentences_bad = [
    "السلام علیکم",
    "آپ کا نام کیا ہے؟",
    "لاہور"
]

print("\n--- Testing: Sentence Completion (Good Tests) ---")
for sentence in test_sentences_good:
    response = predict_sentence(sentence, model, sp, DEVICE)
    print(f"Input:    '{sentence}'")
    print(f"Response: '{response}'")
    print("-----")

print("\n--- Testing: Conversational (Bad Tests) ---")
for sentence in test_sentences_bad:
    response = predict_sentence(sentence, model, sp, DEVICE)
    print(f"Input:    '{sentence}'")
    print(f"Response: '{response}'")
    print("-----")

print(" Qualitative testing complete.")

--- Final Qualitative Testing ---

--- Testing: Sentence Completion (Good Tests) ---
Input:    'کیا ہم یہ'
Response: 'کر سکتے ہیں؟'
-----
Input:    'یہ ایک'
Response: 'بارڈر ہے۔'
-----
Input:    'پاکستان کا سب سے بڑا'
Response: 'بھارت میں بہت کم ہے'
-----
Input:    'وہ سکول'
Response: 'پیدا ہوتے ہیں۔'
-----
Input:    'رات کے وقت'
Response: 'کارواں نہیں۔'
-----
Input:    'اس نے مجھے'
Response: 'کیوں نکالا ہے'
-----

--- Testing: Conversational (Bad Tests) ---
Input:    'السلام علیکم'
Response: 'بھی کہا جاتا'
-----
Input:    'آپ کا نام کیا ہے؟'
Response: 'اس کے لیے نہیں کیا جايے؟'
-----
Input:    'لاہور'
Response: 'قلندرز کے'
-----
 Qualitative testing complete.
