In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/urdu-roman-dataset/parallel_clean.csv


In [4]:
import os

data_path = '/kaggle/input/urdu-roman-dataset'  # replace with your folder name
print(os.listdir(data_path))


['parallel_clean.csv']


In [5]:
import os

# List all datasets in /kaggle/input
print(os.listdir('/kaggle/input'))

['urdu-roman-dataset']


In [7]:
# Step 1: List all top-level dataset folders in /kaggle/input
datasets = os.listdir('/kaggle/input')
print("Available datasets:", datasets)

# Step 2: If you know part of your dataset name, find it automatically
dataset_name = None
for d in datasets:
    if 'urdu' in d.lower():  # change 'urdu' to a unique keyword in your dataset name
        dataset_name = d
        break

if dataset_name is None:
    print("Dataset not found. Please check that it is added via 'Add data'.")
else:
    data_path = f'/kaggle/input/{dataset_name}'
    print("Dataset folder found:", data_path)
    
    # Step 3: List all files in the dataset folder
    files = os.listdir(data_path)
    print("Files in dataset:", files)

Available datasets: ['urdu-roman-dataset']
Dataset folder found: /kaggle/input/urdu-roman-dataset
Files in dataset: ['parallel_clean.csv']


In [8]:
import pandas as pd

# Get all CSV files in the folder
csv_files = [f for f in os.listdir(data_path) if f.endswith('.csv')]

# Load each CSV into a dictionary
dataframes = {}
for f in csv_files:
    df_name = f.replace('.csv', '')  # key = file name without .csv
    dataframes[df_name] = pd.read_csv(f'{data_path}/{f}')

# Check what keys are in dataframes
print("Loaded CSVs:", list(dataframes.keys()))

Loaded CSVs: ['parallel_clean']


In [9]:
import os
import pandas as pd

# -------------------------------
# Step 1: List all datasets in Kaggle input
# -------------------------------
datasets = os.listdir('/kaggle/input')
print("Available datasets:", datasets)

# -------------------------------
# Step 2: Automatically find your dataset folder
# -------------------------------
dataset_name = None
for d in datasets:
    if 'urdu' in d.lower():  # change 'urdu' to a keyword unique to your dataset
        dataset_name = d
        break

if dataset_name is None:
    print("Dataset not found. Please add it via 'Add data'.")
else:
    data_path = f'/kaggle/input/{dataset_name}'
    print("Dataset folder found:", data_path)

    # -------------------------------
    # Step 3: List all files in the dataset folder
    # -------------------------------
    files = os.listdir(data_path)
    print("Files in dataset:", files)

    # -------------------------------
    # Step 4: Load all CSV files into a dictionary
    # -------------------------------
    csv_files = [f for f in files if f.endswith('.csv')]
    dataframes = {}
    for f in csv_files:
        df_name = f.replace('.csv', '')
        dataframes[df_name] = pd.read_csv(f'{data_path}/{f}')
        print(f"Loaded {f} with shape {dataframes[df_name].shape}")

    # -------------------------------
    # Step 5: Preview the first CSV
    # -------------------------------
    df = dataframes[list(dataframes.keys())[0]]
    print("\nPreview of first CSV:")
    print(df.head())

Available datasets: ['urdu-roman-dataset']
Dataset folder found: /kaggle/input/urdu-roman-dataset
Files in dataset: ['parallel_clean.csv']
Loaded parallel_clean.csv with shape (1314, 2)

Preview of first CSV:
                                                urdu  \
0  اہٹ سی کوئی ائے تو لگتا ہے کہ تم ہو سایہ کوئی ...   
1  موج گل موج صبا موج سحر لگتی ہے سر سے پا تک وہ ...   
2  طلوع صبح ہے نظریں اٹھا کے دیکھ ذرا شکست ظلمت ش...   
3  ہم سے بھاگا نہ کرو دور غزالوں کی طرح ہم نے چاہ...   
4  ہر ایک روح میں اک غم چھپا لگے ہے مجھے یہ زندگی...   

                                               roman  
0  aahat s ko aa.e to lagt hai ki tum ho saaya ko...  
1  mauj e gul mauj e sab mauj e sahar lagt hai sa...  
2  tul e sub.h hai nazre uth ke dekh zar shikast ...  
3  ham se bh g na karo duur haz lo k tarah ham ne...  
4  har ek ruuh me ik ham chhup lage hai mujhe ye ...  


In [10]:
import re
from sklearn.model_selection import train_test_split

# -------------------------------
# Step 1: Use the loaded DataFrame
# -------------------------------
# 'df' is already your first CSV loaded
df = df.dropna().drop_duplicates().reset_index(drop=True)

# -------------------------------
# Step 2: Rename columns for clarity
# -------------------------------
# Replace these with the actual column names from your dataset
df = df.rename(columns={'urdu_column_name': 'urdu', 'roman_column_name': 'roman'})

# -------------------------------
# Step 3: Text cleaning function
# -------------------------------
def clean_text(text):
    """
    Remove unwanted characters, punctuation, and extra spaces
    """
    text = str(text)
    text = text.strip()                  # Remove leading/trailing spaces
    text = re.sub(r'\s+', ' ', text)     # Replace multiple spaces with one
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    return text

# Apply cleaning to both columns
df['urdu'] = df['urdu'].apply(clean_text)
df['roman'] = df['roman'].apply(clean_text)

# -------------------------------
# Step 4: Inspect cleaned data
# -------------------------------
print("Sample data after cleaning:")
print(df.head())

# -------------------------------
# Step 5: Split into train and validation sets
# -------------------------------
train_df, val_df = train_test_split(df, test_size=0.1, random_state=42)

print("Training set shape:", train_df.shape)
print("Validation set shape:", val_df.shape)

# -------------------------------
# Step 6: Save preprocessed data (optional)
# -------------------------------
train_df.to_csv('/kaggle/working/train_data.csv', index=False)
val_df.to_csv('/kaggle/working/val_data.csv', index=False)

print("Preprocessed data saved to /kaggle/working/")


Sample data after cleaning:
                                                urdu  \
0  اہٹ سی کوئی ائے تو لگتا ہے کہ تم ہو سایہ کوئی ...   
1  موج گل موج صبا موج سحر لگتی ہے سر سے پا تک وہ ...   
2  طلوع صبح ہے نظریں اٹھا کے دیکھ ذرا شکست ظلمت ش...   
3  ہم سے بھاگا نہ کرو دور غزالوں کی طرح ہم نے چاہ...   
4  ہر ایک روح میں اک غم چھپا لگے ہے مجھے یہ زندگی...   

                                               roman  
0  aahat s ko aae to lagt hai ki tum ho saaya ko ...  
1  mauj e gul mauj e sab mauj e sahar lagt hai sa...  
2  tul e subh hai nazre uth ke dekh zar shikast e...  
3  ham se bh g na karo duur haz lo k tarah ham ne...  
4  har ek ruuh me ik ham chhup lage hai mujhe ye ...  
Training set shape: (1182, 2)
Validation set shape: (132, 2)
Preprocessed data saved to /kaggle/working/


In [11]:
from tokenizers import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing
from tokenizers import Tokenizer, trainers, pre_tokenizers
import os

# -------------------------------
# Step 1: Save text files for tokenizer training
# -------------------------------
train_urdu_path = '/kaggle/working/train_urdu.txt'
train_roman_path = '/kaggle/working/train_roman.txt'

# Save training sentences to text files
with open(train_urdu_path, 'w', encoding='utf-8') as f:
    for sent in train_df['urdu']:
        f.write(sent + '\n')

with open(train_roman_path, 'w', encoding='utf-8') as f:
    for sent in train_df['roman']:
        f.write(sent + '\n')

# -------------------------------
# Step 2: Initialize Byte-Pair Encoding (BPE) tokenizers
# -------------------------------
# Source tokenizer (Urdu)
tokenizer_urdu = ByteLevelBPETokenizer()
tokenizer_urdu.train(files=[train_urdu_path], vocab_size=30000, min_frequency=2, special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"])

# Target tokenizer (Roman Urdu)
tokenizer_roman = ByteLevelBPETokenizer()
tokenizer_roman.train(files=[train_roman_path], vocab_size=30000, min_frequency=2, special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"])

# -------------------------------
# Step 3: Encode sentences into token IDs
# -------------------------------
# Example: Encode first 5 sentences
encoded_urdu = [tokenizer_urdu.encode(s).ids for s in train_df['urdu'][:5]]
encoded_roman = [tokenizer_roman.encode(s).ids for s in train_df['roman'][:5]]

print("Example Urdu token IDs:", encoded_urdu)
print("Example Roman Urdu token IDs:", encoded_roman)

# -------------------------------
# Step 4: Prepare sequences for model training
# -------------------------------
# You can pad/truncate sequences for batch training
from tensorflow.keras.preprocessing.sequence import pad_sequences

max_len_src = 50  # max sequence length for Urdu
max_len_tgt = 50  # max sequence length for Roman Urdu

X_train = pad_sequences([tokenizer_urdu.encode(s).ids for s in train_df['urdu']], maxlen=max_len_src, padding='post', truncating='post')
y_train = pad_sequences([tokenizer_roman.encode(s).ids for s in train_df['roman']], maxlen=max_len_tgt, padding='post', truncating='post')

X_val = pad_sequences([tokenizer_urdu.encode(s).ids for s in val_df['urdu']], maxlen=max_len_src, padding='post', truncating='post')
y_val = pad_sequences([tokenizer_roman.encode(s).ids for s in val_df['roman']], maxlen=max_len_tgt, padding='post', truncating='post')

print("Training shapes:", X_train.shape, y_train.shape)
print("Validation shapes:", X_val.shape, y_val.shape)







Example Urdu token IDs: [[986, 1471, 945, 313, 384, 304, 2443, 423, 291, 958, 565, 438, 1200, 7787, 524, 318, 936, 2758, 335, 538, 2246, 308, 723, 524, 469, 341, 526, 663, 2730, 3237, 300, 291, 1917, 297, 4773, 1110, 335, 291, 768, 297, 497, 308, 7156, 4195, 1214, 308, 1157, 291, 3261, 719, 592, 273, 317, 2698, 1072, 2570, 291, 438, 1435, 1214, 524, 336, 341, 2885, 327, 313, 300, 514, 372, 300, 302, 3013, 3068, 2011, 297, 610, 2390, 297, 1042, 293, 683, 302, 1841, 297, 2010, 6639, 302, 1268, 7001, 393, 310, 4339, 812, 291, 346, 6920, 7251, 1052, 307, 438, 313, 715, 6364, 6624, 308, 2267, 300, 291, 2639, 346, 2679, 630, 4626, 291, 873, 313, 274, 706, 291, 599, 416, 279, 2544, 1006, 363, 348, 310, 2095, 937, 4321, 316, 976, 300, 291, 659, 525, 2590, 383, 291, 758, 4890, 310, 323, 3488, 693, 316, 1390, 307, 574, 610, 291, 1407, 693, 316, 1390, 307, 568, 4338, 300, 291, 573, 334, 310], [263, 375, 4003, 300, 326, 1707, 1872, 2069, 291, 1938, 340, 291, 4326, 318, 2033, 2069, 291, 346, 

2025-09-21 14:28:10.438864: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1758464890.697460      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1758464890.777336      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Training shapes: (1182, 50) (1182, 50)
Validation shapes: (132, 50) (132, 50)


In [12]:
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Bidirectional, Embedding, Dense
import numpy as np

# -------------------------------
# Parameters (based on your preprocessing)
# -------------------------------
src_vocab_size = tokenizer_urdu.get_vocab_size()
tgt_vocab_size = tokenizer_roman.get_vocab_size()
embedding_dim = 256
encoder_units = 256
decoder_units = 256
max_len_src = X_train.shape[1]
max_len_tgt = X_train.shape[1]

# -------------------------------
# Encoder: 2-layer BiLSTM
# -------------------------------
encoder_inputs = Input(shape=(max_len_src,), name='encoder_input')
encoder_embedding = Embedding(input_dim=src_vocab_size, output_dim=embedding_dim, mask_zero=True)(encoder_inputs)

# First BiLSTM layer
encoder_bi1 = Bidirectional(LSTM(encoder_units, return_sequences=True, return_state=True))
encoder_out1, forward_h1, forward_c1, backward_h1, backward_c1 = encoder_bi1(encoder_embedding)
state_h1 = tf.keras.layers.Concatenate()([forward_h1, backward_h1])
state_c1 = tf.keras.layers.Concatenate()([forward_c1, backward_c1])

# Second BiLSTM layer
encoder_bi2 = Bidirectional(LSTM(encoder_units, return_sequences=True, return_state=True))
encoder_out2, forward_h2, forward_c2, backward_h2, backward_c2 = encoder_bi2(encoder_out1)
state_h2 = tf.keras.layers.Concatenate()([forward_h2, backward_h2])
state_c2 = tf.keras.layers.Concatenate()([forward_c2, backward_c2])

encoder_states = [state_h2, state_c2]

# -------------------------------
# Decoder: 4-layer LSTM
# -------------------------------
decoder_inputs = Input(shape=(max_len_tgt,), name='decoder_input')
decoder_embedding = Embedding(input_dim=tgt_vocab_size, output_dim=embedding_dim, mask_zero=True)(decoder_inputs)

decoder_lstm1 = LSTM(decoder_units*2, return_sequences=True, return_state=True)
decoder_out1, _, _ = decoder_lstm1(decoder_embedding, initial_state=encoder_states)

decoder_lstm2 = LSTM(decoder_units*2, return_sequences=True, return_state=True)
decoder_out2, _, _ = decoder_lstm2(decoder_out1)

decoder_lstm3 = LSTM(decoder_units*2, return_sequences=True, return_state=True)
decoder_out3, _, _ = decoder_lstm3(decoder_out2)

decoder_lstm4 = LSTM(decoder_units*2, return_sequences=True, return_state=True)
decoder_out4, _, _ = decoder_lstm4(decoder_out3)

decoder_dense = Dense(tgt_vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_out4)

# -------------------------------
# Define the model
# -------------------------------
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()

# -------------------------------
# Prepare target data for teacher forcing
# -------------------------------
y_train_shifted = np.zeros_like(y_train)
y_train_shifted[:, :-1] = y_train[:, 1:]
y_val_shifted = np.zeros_like(y_val)
y_val_shifted[:, :-1] = y_val[:, 1:]

# -------------------------------
# Train the model
# -------------------------------
history = model.fit(
    [X_train, y_train],
    np.expand_dims(y_train_shifted, -1),
    validation_data=([X_val, y_val], np.expand_dims(y_val_shifted, -1)),
    batch_size=32,  # reduce if memory issues
    epochs=10
)


2025-09-21 14:28:32.626339: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:152] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)


Epoch 1/10
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m147s[0m 3s/step - accuracy: 0.0354 - loss: 7.2984 - val_accuracy: 0.0365 - val_loss: 6.2520
Epoch 2/10
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m116s[0m 3s/step - accuracy: 0.0409 - loss: 6.2139 - val_accuracy: 0.0379 - val_loss: 6.2220
Epoch 3/10
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m113s[0m 3s/step - accuracy: 0.0431 - loss: 6.1714 - val_accuracy: 0.0362 - val_loss: 6.1975
Epoch 4/10
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m112s[0m 3s/step - accuracy: 0.0413 - loss: 6.1610 - val_accuracy: 0.0373 - val_loss: 6.1894
Epoch 5/10
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m142s[0m 3s/step - accuracy: 0.0413 - loss: 6.1480 - val_accuracy: 0.0400 - val_loss: 6.1704
Epoch 6/10
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m109s[0m 3s/step - accuracy: 0.0424 - loss: 6.1417 - val_accuracy: 0.0365 - val_loss: 6.1724
Epoch 7/10
[1m37/37[0m [32m━━━━

In [14]:
# -------------------------------
# STEP 0: Imports
# -------------------------------
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
from sklearn.model_selection import train_test_split

# Device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# -------------------------------
# STEP 1: Dataset Split 50/25/25
# -------------------------------
all_urdu = df['urdu'].values
all_roman = df['roman'].values

# First split: 50% train, 50% temp
X_train_raw, X_temp_raw, y_train_raw, y_temp_raw = train_test_split(
    all_urdu, all_roman, test_size=0.5, random_state=42)

# Second split: 25% val, 25% test
X_val_raw, X_test_raw, y_val_raw, y_test_raw = train_test_split(
    X_temp_raw, y_temp_raw, test_size=0.5, random_state=42)

print(len(X_train_raw), len(X_val_raw), len(X_test_raw))

# -------------------------------
# STEP 2: Encode & Pad sequences using BPE tokenizers
# -------------------------------
max_len_src = 50
max_len_tgt = 50

def encode_pad(tokenizer, sentences, max_len):
    sequences = [tokenizer.encode(s).ids for s in sentences]
    sequences = [seq + [0]*(max_len - len(seq)) if len(seq)<max_len else seq[:max_len] for seq in sequences]
    return sequences

X_train = encode_pad(tokenizer_urdu, X_train_raw, max_len_src)
y_train = encode_pad(tokenizer_roman, y_train_raw, max_len_tgt)
X_val = encode_pad(tokenizer_urdu, X_val_raw, max_len_src)
y_val = encode_pad(tokenizer_roman, y_val_raw, max_len_tgt)
X_test = encode_pad(tokenizer_urdu, X_test_raw, max_len_src)
y_test = encode_pad(tokenizer_roman, y_test_raw, max_len_tgt)

# Convert to tensors
X_train = torch.tensor(X_train, dtype=torch.long)
y_train = torch.tensor(y_train, dtype=torch.long)
X_val = torch.tensor(X_val, dtype=torch.long)
y_val = torch.tensor(y_val, dtype=torch.long)
X_test = torch.tensor(X_test, dtype=torch.long)
y_test = torch.tensor(y_test, dtype=torch.long)

# -------------------------------
# STEP 3: PyTorch Dataset & DataLoader
# -------------------------------
class NMTDataset(Dataset):
    def __init__(self, src, tgt):
        self.src = src
        self.tgt = tgt
        
    def __len__(self):
        return len(self.src)
    
    def __getitem__(self, idx):
        return self.src[idx], self.tgt[idx]

batch_size = 32
train_dataset = NMTDataset(X_train, y_train)
val_dataset = NMTDataset(X_val, y_val)
test_dataset = NMTDataset(X_test, y_test)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

# -------------------------------
# STEP 4: Seq2Seq Model in PyTorch
# -------------------------------

class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, enc_hid_dim, n_layers, dropout=0.1):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, emb_dim, padding_idx=0)
        self.lstm_layers = nn.ModuleList()
        for i in range(n_layers):
            self.lstm_layers.append(nn.LSTM(emb_dim if i==0 else enc_hid_dim*2, enc_hid_dim, num_layers=1, bidirectional=True, batch_first=True))
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, src):
        embedded = self.dropout(self.embedding(src))
        outputs = embedded
        for lstm in self.lstm_layers:
            outputs, (hidden, cell) = lstm(outputs)
        # concatenate final forward and backward hidden states
        hidden = torch.cat([hidden[0], hidden[1]], dim=1).unsqueeze(0)
        cell = torch.cat([cell[0], cell[1]], dim=1).unsqueeze(0)
        return outputs, (hidden, cell)

class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, dec_hid_dim, n_layers, dropout=0.1):
        super().__init__()
        self.embedding = nn.Embedding(output_dim, emb_dim, padding_idx=0)
        self.lstm_layers = nn.ModuleList()
        for i in range(n_layers):
            self.lstm_layers.append(nn.LSTM(emb_dim if i==0 else dec_hid_dim*2, dec_hid_dim*2, num_layers=1, batch_first=True))
        self.fc_out = nn.Linear(dec_hid_dim*2, output_dim)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, tgt, hidden, cell):
        embedded = self.dropout(self.embedding(tgt))
        outputs = embedded
        for lstm in self.lstm_layers:
            outputs, (hidden, cell) = lstm(outputs, (hidden, cell))
        predictions = self.fc_out(outputs)
        return predictions, (hidden, cell)

class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        
    def forward(self, src, tgt):
        encoder_outputs, (hidden, cell) = self.encoder(src)
        outputs, _ = self.decoder(tgt, hidden, cell)
        return outputs

# Model instantiation
enc = Encoder(input_dim=tokenizer_urdu.get_vocab_size(), emb_dim=256, enc_hid_dim=256, n_layers=2).to(device)
dec = Decoder(output_dim=tokenizer_roman.get_vocab_size(), emb_dim=256, dec_hid_dim=256, n_layers=4).to(device)
model = Seq2Seq(enc, dec).to(device)

# -------------------------------
# STEP 5: Loss & Optimizer
# -------------------------------
criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = optim.Adam(model.parameters(), lr=1e-3)

# -------------------------------
# STEP 6: Training Loop (simplified)
# -------------------------------
n_epochs = 10
for epoch in range(n_epochs):
    model.train()
    epoch_loss = 0
    for src_batch, tgt_batch in train_loader:
        src_batch, tgt_batch = src_batch.to(device), tgt_batch.to(device)
        optimizer.zero_grad()
        # shift target for teacher forcing
        tgt_input = tgt_batch[:, :-1]
        tgt_output = tgt_batch[:, 1:]
        output = model(src_batch, tgt_input)
        output = output.view(-1, output.shape[-1])
        tgt_output = tgt_output.contiguous().view(-1)
        loss = criterion(output, tgt_output)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    print(f'Epoch {epoch+1}, Loss: {epoch_loss/len(train_loader):.4f}')


657 328 329
Epoch 1, Loss: 6.8554
Epoch 2, Loss: 6.1871
Epoch 3, Loss: 6.1443
Epoch 4, Loss: 6.1270
Epoch 5, Loss: 6.1134
Epoch 6, Loss: 6.0970
Epoch 7, Loss: 6.0891
Epoch 8, Loss: 6.0760
Epoch 9, Loss: 6.0639
Epoch 10, Loss: 6.0493


In [17]:
# -------------------------------
# STEP 7: Training + Validation Loop
# -------------------------------
import math
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

smooth = SmoothingFunction().method1

def evaluate(model, loader, criterion):
    model.eval()
    epoch_loss = 0
    all_preds, all_refs = [], []
    
    with torch.no_grad():
        for src_batch, tgt_batch in loader:
            src_batch, tgt_batch = src_batch.to(device), tgt_batch.to(device)

            tgt_input = tgt_batch[:, :-1]   # input to decoder
            tgt_output = tgt_batch[:, 1:]   # expected output

            output = model(src_batch, tgt_input)   # [batch, seq, vocab]
            output_dim = output.shape[-1]

            output_flat = output.view(-1, output_dim)
            tgt_output_flat = tgt_output.contiguous().view(-1)

            loss = criterion(output_flat, tgt_output_flat)
            epoch_loss += loss.item()

            # Predictions for BLEU
            preds = output.argmax(-1).cpu().numpy()
            refs = tgt_output.cpu().numpy()

            for p, r in zip(preds, refs):
                p = [idx for idx in p if idx != 0]
                r = [idx for idx in r if idx != 0]
                if len(p) > 0 and len(r) > 0:
                    bleu = sentence_bleu([r], p, smoothing_function=smooth)
                    all_preds.append(bleu)

    avg_loss = epoch_loss / len(loader)
    ppl = math.exp(avg_loss) if avg_loss < 20 else float("inf")
    avg_bleu = np.mean(all_preds) if len(all_preds) > 0 else 0.0

    return avg_loss, ppl, avg_bleu


# -------------------------------
# STEP 8: Train with Validation
# -------------------------------
n_epochs = 10
for epoch in range(n_epochs):
    model.train()
    train_loss = 0

    for src_batch, tgt_batch in train_loader:
        src_batch, tgt_batch = src_batch.to(device), tgt_batch.to(device)
        optimizer.zero_grad()

        tgt_input = tgt_batch[:, :-1]
        tgt_output = tgt_batch[:, 1:]

        output = model(src_batch, tgt_input)
        output_dim = output.shape[-1]

        output_flat = output.view(-1, output_dim)
        tgt_output_flat = tgt_output.contiguous().view(-1)

        loss = criterion(output_flat, tgt_output_flat)
        loss.backward()
        optimizer.step()

        train_loss += loss.item()

    # Validation at end of epoch
    val_loss, val_ppl, val_bleu = evaluate(model, val_loader, criterion)

    print(f"Epoch {epoch+1}: "
          f"Train Loss={train_loss/len(train_loader):.4f} | "
          f"Val Loss={val_loss:.4f}, Val PPL={val_ppl:.2f}, Val BLEU={val_bleu:.4f}")


# -------------------------------
# STEP 9: Final Test Evaluation
# -------------------------------
test_loss, test_ppl, test_bleu = evaluate(model, test_loader, criterion)
print(f"\nFinal Test Results -> Loss={test_loss:.4f}, PPL={test_ppl:.2f}, BLEU={test_bleu:.4f}")


Epoch 1: Train Loss=5.8828 | Val Loss=6.3136, Val PPL=552.05, Val BLEU=0.0055
Epoch 2: Train Loss=5.8528 | Val Loss=6.3091, Val PPL=549.55, Val BLEU=0.0056
Epoch 3: Train Loss=5.8204 | Val Loss=6.2994, Val PPL=544.23, Val BLEU=0.0054
Epoch 4: Train Loss=5.7764 | Val Loss=6.2673, Val PPL=527.03, Val BLEU=0.0058
Epoch 5: Train Loss=5.7273 | Val Loss=6.2439, Val PPL=514.84, Val BLEU=0.0057
Epoch 6: Train Loss=5.6691 | Val Loss=6.2372, Val PPL=511.44, Val BLEU=0.0072
Epoch 7: Train Loss=5.6050 | Val Loss=6.2134, Val PPL=499.40, Val BLEU=0.0077
Epoch 8: Train Loss=5.5228 | Val Loss=6.1922, Val PPL=488.91, Val BLEU=0.0081
Epoch 9: Train Loss=5.4415 | Val Loss=6.1829, Val PPL=484.39, Val BLEU=0.0083
Epoch 10: Train Loss=5.3664 | Val Loss=6.1830, Val PPL=484.46, Val BLEU=0.0085

Final Test Results -> Loss=6.1527, PPL=469.96, BLEU=0.0086


In [18]:
# -------------------------------
# STEP 10: Inference (Qualitative Examples)
# -------------------------------

def translate_sentence(model, sentence, tokenizer_src, tokenizer_tgt, max_len=50):
    model.eval()

    # 1. Encode Urdu sentence
    src_ids = tokenizer_src.encode(sentence).ids
    src_ids = src_ids[:max_len] + [0]*(max_len - len(src_ids))  # pad
    src_tensor = torch.tensor([src_ids], dtype=torch.long).to(device)

    # 2. Start with <sos> token for decoder (use id=1 if tokenizer has special tokens)
    tgt_ids = [1]  # assuming 1 = <sos>
    for _ in range(max_len):
        tgt_tensor = torch.tensor([tgt_ids], dtype=torch.long).to(device)
        with torch.no_grad():
            output = model(src_tensor, tgt_tensor)  # [1, seq_len, vocab_size]
        next_token = output[0, -1].argmax(-1).item()
        tgt_ids.append(next_token)
        if next_token == 2:  # assuming 2 = <eos>
            break

    # 3. Decode predicted Roman Urdu ids back to text
    pred_sentence = tokenizer_tgt.decode(tgt_ids)
    return pred_sentence


# -------------------------------
# Example Usage
# -------------------------------
sample_urdu = "میں تجھ کو بھول جاؤں گا"
predicted_roman = translate_sentence(model, sample_urdu, tokenizer_urdu, tokenizer_roman)
print("Urdu Input:", sample_urdu)
print("Predicted Roman Urdu:", predicted_roman)


Urdu Input: میں تجھ کو بھول جاؤں گا
Predicted Roman Urdu:  to to bh bh mai mai bh bh bh bh bh to to th to th th th th th th th th th th th th th bh k mai k mai k mai k mai k mai k mai k mai k mai k mai k mai k


In [27]:
def translate_sentence(model, sentence, tokenizer_src, tokenizer_tgt, max_len=50):
    model.eval()

    src_ids = tokenizer_src.encode(sentence).ids
    src_ids = src_ids[:max_len] + [0]*(max_len - len(src_ids))
    src_tensor = torch.tensor([src_ids], dtype=torch.long).to(device)

    tgt_ids = [1]  # <sos>
    for _ in range(max_len):
        tgt_tensor = torch.tensor([tgt_ids], dtype=torch.long).to(device)
        with torch.no_grad():
            output = model(src_tensor, tgt_tensor)
        next_token = output[0, -1].argmax(-1).item()

        if next_token == 2:  # <eos>
            break
        if len(tgt_ids) > 2 and next_token == tgt_ids[-1]:  # avoid loops
            break

        tgt_ids.append(next_token)

    pred_sentence = tokenizer_tgt.decode(tgt_ids)
    return pred_sentence


In [28]:
print("Urdu tokenizer vocab size:", tokenizer_urdu.get_vocab_size())
print("Roman tokenizer vocab size:", tokenizer_roman.get_vocab_size())

# Print first few tokens
print("Roman vocab sample:", list(tokenizer_roman.get_vocab().items())[:20])


Urdu tokenizer vocab size: 8045
Roman tokenizer vocab size: 4895
Roman vocab sample: [('Ġaiso', 3783), ('Ġbha', 1791), ('Ġizzat', 2544), ('Ġafsh', 1762), ('Ĩ', 233), ('ç', 168), ('Ġgust', 2065), ('ı', 242), ('Ġnikhat', 2184), ('Ġhikmat', 3125), ('Ġguhar', 1555), ('Ġbuute', 4695), ('Ġsamt', 1558), ('Ġanv', 4428), ('Ġlekin', 608), ('Ġhum', 1218), ('ist', 892), ('Ġpyaar', 853), ('Ġaf', 674), ('Ġleh', 4333)]


In [29]:
def translate_sentence(model, sentence, tokenizer_src, tokenizer_tgt, max_len=50):
    model.eval()

    # Encode Urdu input
    src_ids = tokenizer_src.encode(sentence).ids
    src_ids = src_ids[:max_len] + [0]*(max_len - len(src_ids))   # padding
    src_tensor = torch.tensor([src_ids], dtype=torch.long).to(device)

    # Start with an empty target sequence (or PAD=0)
    tgt_ids = [0]

    for _ in range(max_len):
        tgt_tensor = torch.tensor([tgt_ids], dtype=torch.long).to(device)
        with torch.no_grad():
            output = model(src_tensor, tgt_tensor)
        next_token = output[0, -1].argmax(-1).item()

        # Stop if next token is PAD (0)
        if next_token == 0:
            break

        tgt_ids.append(next_token)

    # Decode predicted tokens into Roman Urdu string
    pred_sentence = tokenizer_tgt.decode(tgt_ids)
    return pred_sentence


# 🔥 Example test
sample_urdu = "میں تجھ کو بھول جاؤں گا"
predicted_roman = translate_sentence(model, sample_urdu, tokenizer_urdu, tokenizer_roman)
print("Urdu Input:", sample_urdu)
print("Predicted Roman Urdu:", predicted_roman)


Urdu Input: میں تجھ کو بھول جاؤں گا
Predicted Roman Urdu:  to to bh bh mai mai bh bh bh bh bh to to bh th to th th th th th th th th th th th th bh k mai k mai k mai k mai k mai k mai k mai k mai k mai k mai k


In [30]:
# =======================================================
# COMBINED PIPELINE (Steps 1 → 6)
# =======================================================

# -------------------------------
# STEP 0: Imports & Device
# -------------------------------
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from tokenizers import ByteLevelBPETokenizer

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# -------------------------------
# STEP 1: Add <sos>/<eos> tokens + Split 50/25/25
# -------------------------------
df['roman_sos_eos'] = "<sos> " + df['roman'].astype(str) + " <eos>"

all_src = df['urdu'].astype(str).tolist()
all_tgt = df['roman_sos_eos'].astype(str).tolist()

X_train_raw, X_temp_raw, y_train_raw, y_temp_raw = train_test_split(all_src, all_tgt, test_size=0.5, random_state=42)
X_val_raw, X_test_raw, y_val_raw, y_test_raw = train_test_split(X_temp_raw, y_temp_raw, test_size=0.5, random_state=42)

print("Splits:", len(X_train_raw), len(X_val_raw), len(X_test_raw))

# -------------------------------
# STEP 2: Train Roman tokenizer (with special tokens)
# -------------------------------
train_roman_path = '/kaggle/working/train_roman_with_tokens.txt'
with open(train_roman_path, 'w', encoding='utf-8') as f:
    for s in y_train_raw:
        f.write(s + '\n')

tokenizer_roman = ByteLevelBPETokenizer()
tokenizer_roman.train(
    files=[train_roman_path],
    vocab_size=8000,
    min_frequency=2,
    special_tokens=["<sos>", "<pad>", "<eos>", "<unk>", "<mask>"]
)

# Urdu tokenizer assumed already trained earlier
# If not, you can retrain like this:
# tokenizer_urdu = ByteLevelBPETokenizer()
# tokenizer_urdu.train(files=['/kaggle/working/train_urdu.txt'],
#                      vocab_size=8000, min_frequency=2,
#                      special_tokens=["<pad>", "<unk>"])

# -------------------------------
# STEP 3: Get special token IDs
# -------------------------------
pad_id = tokenizer_roman.token_to_id("<pad>")
sos_id = tokenizer_roman.token_to_id("<sos>")
eos_id = tokenizer_roman.token_to_id("<eos>")
src_pad_id = tokenizer_urdu.token_to_id("<pad>") or 0

print("Roman pad:", pad_id, "sos:", sos_id, "eos:", eos_id)
print("Urdu pad:", src_pad_id)

# -------------------------------
# STEP 4: Encode & Pad
# -------------------------------
MAX_LEN_SRC = 50
MAX_LEN_TGT = 50

def encode_pad(tokenizer, sentences, max_len, pad_id):
    ids_list = []
    for s in sentences:
        enc = tokenizer.encode(s).ids
        if len(enc) >= max_len:
            ids_list.append(enc[:max_len])
        else:
            ids_list.append(enc + [pad_id] * (max_len - len(enc)))
    return ids_list

X_train_ids = encode_pad(tokenizer_urdu, X_train_raw, MAX_LEN_SRC, src_pad_id)
X_val_ids   = encode_pad(tokenizer_urdu, X_val_raw, MAX_LEN_SRC, src_pad_id)
X_test_ids  = encode_pad(tokenizer_urdu, X_test_raw, MAX_LEN_SRC, src_pad_id)

y_train_ids = encode_pad(tokenizer_roman, y_train_raw, MAX_LEN_TGT, pad_id)
y_val_ids   = encode_pad(tokenizer_roman, y_val_raw, MAX_LEN_TGT, pad_id)
y_test_ids  = encode_pad(tokenizer_roman, y_test_raw, MAX_LEN_TGT, pad_id)

X_train = torch.tensor(X_train_ids, dtype=torch.long)
y_train = torch.tensor(y_train_ids, dtype=torch.long)
X_val   = torch.tensor(X_val_ids, dtype=torch.long)
y_val   = torch.tensor(y_val_ids, dtype=torch.long)
X_test  = torch.tensor(X_test_ids, dtype=torch.long)
y_test  = torch.tensor(y_test_ids, dtype=torch.long)

# -------------------------------
# STEP 5: Dataset & DataLoader
# -------------------------------
class NMTDataset(Dataset):
    def __init__(self, src, tgt):
        self.src = src
        self.tgt = tgt
    def __len__(self): return len(self.src)
    def __getitem__(self, idx): return self.src[idx], self.tgt[idx]

batch_size = 32
train_loader = DataLoader(NMTDataset(X_train, y_train), batch_size=batch_size, shuffle=True)
val_loader   = DataLoader(NMTDataset(X_val, y_val), batch_size=batch_size)
test_loader  = DataLoader(NMTDataset(X_test, y_test), batch_size=batch_size)

# -------------------------------
# STEP 6: Define Seq2Seq Model
# -------------------------------
class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, enc_hid_dim, enc_layers, dropout, pad_idx):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, emb_dim, padding_idx=pad_idx)
        self.lstm = nn.LSTM(emb_dim, enc_hid_dim, num_layers=enc_layers,
                            bidirectional=True, batch_first=True, dropout=dropout)
    def forward(self, src):
        embedded = self.embedding(src)
        outputs, (hidden, cell) = self.lstm(embedded)
        return outputs, hidden, cell

class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, dec_hid_dim, dec_layers, dropout, pad_idx):
        super().__init__()
        self.embedding = nn.Embedding(output_dim, emb_dim, padding_idx=pad_idx)
        self.lstm = nn.LSTM(emb_dim, dec_hid_dim, num_layers=dec_layers,
                            batch_first=True, dropout=dropout)
        self.fc_out = nn.Linear(dec_hid_dim, output_dim)
    def forward(self, input, hidden, cell):
        embedded = self.embedding(input)
        output, (hidden, cell) = self.lstm(embedded, (hidden, cell))
        prediction = self.fc_out(output)
        return prediction, hidden, cell

class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, pad_idx):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.pad_idx = pad_idx
    def forward(self, src, tgt):
        _, hidden, cell = self.encoder(src)
        output, _, _ = self.decoder(tgt, hidden, cell)
        return output

# Instantiate
EMBEDDING_DIM = 256
ENC_HID_DIM = 256
DEC_HID_DIM = ENC_HID_DIM*2
ENC_LAYERS = 2
DEC_LAYERS = 4
ENC_DROPOUT = 0.2
DEC_DROPOUT = 0.2

enc = Encoder(tokenizer_urdu.get_vocab_size(), EMBEDDING_DIM, ENC_HID_DIM,
              ENC_LAYERS, ENC_DROPOUT, src_pad_id).to(device)
dec = Decoder(tokenizer_roman.get_vocab_size(), EMBEDDING_DIM, DEC_HID_DIM,
              DEC_LAYERS, DEC_DROPOUT, pad_id).to(device)
model = Seq2Seq(enc, dec, pad_id).to(device)

# -------------------------------
# STEP 6B: Loss & Optimizer
# -------------------------------
criterion = nn.CrossEntropyLoss(ignore_index=pad_id)
optimizer = optim.Adam(model.parameters(), lr=5e-4)

print("Pipeline ready ✅ — run training loop next")


Using device: cpu
Splits: 657 328 329



Roman pad: 1 sos: 0 eos: 2
Urdu pad: 1
Pipeline ready ✅ — run training loop next
