In [1]:
# Environment setup: upgrade pip, then install the packages listed on the next line.
# NOTE(review): the code cells below import `torch`, not `tensorflow`; torch is
# presumably pre-installed in the Kaggle image — confirm, and consider dropping
# the tensorflow install and pinning versions for reproducibility.
!pip install --upgrade pip
!pip install tensorflow wandb numpy pandas tqdm

Collecting pip
  Downloading pip-25.1.1-py3-none-any.whl.metadata (3.6 kB)
Downloading pip-25.1.1-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m21.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 24.1.2
    Uninstalling pip-24.1.2:
      Successfully uninstalled pip-24.1.2
Successfully installed pip-25.1.1


In [2]:
import os

import wandb

# SECURITY: never commit a W&B API key to a notebook. The key that used to be
# hardcoded here must be treated as leaked and revoked in the W&B settings.
# The key is now read from the WANDB_API_KEY environment variable (for example
# populated from a Kaggle secret). When the variable is unset, `key=None` lets
# wandb fall back to stored credentials or an interactive prompt.
wandb.login(key=os.environ.get('WANDB_API_KEY'))

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mda24s006[0m ([33mda24s006-indian-institue-of-technology-madras-[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

# Data Preparation:
- Downloading and extracting the Dakshina Hindi lexicon: the relevant part of the data has already been downloaded and uploaded to the Kaggle environment.
- Building character vocabularies and the data loader.

In [3]:
# data_utils.py
import pandas as pd
import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset

def build_vocab(pairs, specials=('<pad>', '<sos>', '<eos>')):
    """Build a character-to-index vocabulary.

    Args:
        pairs: iterable of strings (e.g. a pandas Series of words). All of
            their characters are pooled into one character set.
        specials: special tokens that receive the first indices, in the
            order given. An immutable tuple is used as the default so the
            default object can never be mutated between calls.

    Returns:
        dict mapping each special token and each distinct character to a
        unique, contiguous integer index. Specials come first, followed by
        the characters in sorted order.
    """
    idx = {}
    # setdefault keeps the first index of a token, so a duplicated special or
    # a data character that coincides with a special token can no longer
    # produce two entries sharing one index.
    for tok in specials:
        idx.setdefault(tok, len(idx))
    for c in sorted(set(''.join(pairs))):
        idx.setdefault(c, len(idx))
    return idx

class TransliterationDataset(Dataset):
    """Character-level (roman -> devanagari) pairs read from a tab-separated file.

    The file is read without a header as three columns named
    ``devanagari``, ``roman`` and ``dont_care``; rows containing missing
    values are dropped. Each item is a pair of 1-D ``torch.long`` tensors:

    * source: roman character ids (at most ``max_len``) followed by ``<eos>``
    * target: ``<sos>``, devanagari character ids (at most ``max_len``), ``<eos>``

    Args:
        path: location of the TSV file.
        src_vocab: mapping from roman characters (and ``<eos>``) to ids.
        tgt_vocab: mapping from devanagari characters (and ``<sos>``/``<eos>``) to ids.
        max_len: maximum number of character ids kept per word.
    """

    def __init__(self, path, src_vocab, tgt_vocab, max_len=32):
        columns = ['devanagari', 'roman', 'dont_care']
        frame = pd.read_csv(path, sep='\t', names=columns).dropna()
        # Stored as [roman, devanagari], i.e. [source, target].
        self.pairs = frame[['roman', 'devanagari']].values.tolist()
        self.src_vocab = src_vocab
        self.tgt_vocab = tgt_vocab
        self.max_len = max_len

    def __len__(self):
        return len(self.pairs)

    def _encode(self, text, vocab):
        """Map every character of `text` through `vocab`, then keep the first `max_len` ids."""
        ids = [vocab[ch] for ch in text]
        return ids[:self.max_len]

    def __getitem__(self, i):
        roman, devanagari = self.pairs[i]

        # source: c1 c2 ... <eos>
        src_ids = self._encode(roman, self.src_vocab)
        src_ids.append(self.src_vocab['<eos>'])

        # target: <sos> c1 c2 ... <eos>
        tgt_ids = [self.tgt_vocab['<sos>']]
        tgt_ids.extend(self._encode(devanagari, self.tgt_vocab))
        tgt_ids.append(self.tgt_vocab['<eos>'])

        src_tensor = torch.tensor(src_ids, dtype=torch.long)
        tgt_tensor = torch.tensor(tgt_ids, dtype=torch.long)
        return src_tensor, tgt_tensor

def collate_fn(batch):
    """Pad a list of (source, target) tensor pairs into two batch-first tensors.

    Padding values are the ``<pad>`` ids looked up in the module-level
    ``src_vocab`` and ``tgt_vocab`` dictionaries.

    Args:
        batch: sequence of ``(src_ids, tgt_ids)`` 1-D tensor pairs.

    Returns:
        Tuple ``(padded_sources, padded_targets)``, each batch-first.
    """
    src_seqs, tgt_seqs = map(list, zip(*batch))
    padded_sources = pad_sequence(
        src_seqs, batch_first=True, padding_value=src_vocab['<pad>']
    )
    padded_targets = pad_sequence(
        tgt_seqs, batch_first=True, padding_value=tgt_vocab['<pad>']
    )
    return padded_sources, padded_targets

# Build both character vocabularies once, from the training split only.
# The file has no header row; rows with missing values are discarded.
df = pd.read_csv(
    '/kaggle/input/lexicons-hindi-transliteration/hi.translit.sampled.train.tsv',
    sep='\t',
    names=['devanagari', 'roman', 'dont_care'],
).dropna()
src_vocab = build_vocab(df['roman'])        # source side: roman characters
tgt_vocab = build_vocab(df['devanagari'])   # target side: devanagari characters

# Model Definition
- Encoder and Decoder class

In [4]:
# model.py
import torch, torch.nn as nn, torch.nn.functional as F

class Encoder(nn.Module):
    """Embedding layer followed by a batch-first recurrent network.

    Args:
        inp_dim: number of rows in the embedding table (source vocabulary size).
        emb_dim: embedding width.
        hid_dim: recurrent hidden size.
        n_layers: number of stacked recurrent layers.
        cell: one of ``'RNN'``, ``'LSTM'`` or ``'GRU'``; any other value raises ``KeyError``.
        dropout: probability applied to the embeddings and, when
            ``n_layers > 1``, between recurrent layers.
    """

    def __init__(self, inp_dim, emb_dim, hid_dim, n_layers, cell, dropout):
        super().__init__()
        self.emb = nn.Embedding(inp_dim, emb_dim)

        rnn_classes = {'RNN': nn.RNN, 'LSTM': nn.LSTM, 'GRU': nn.GRU}
        rnn_cls = rnn_classes[cell]
        # Inter-layer dropout is only passed on for stacked networks.
        inter_layer_dropout = 0 if n_layers <= 1 else dropout
        self.rnn = rnn_cls(
            emb_dim,
            hid_dim,
            n_layers,
            dropout=inter_layer_dropout,
            batch_first=True,
        )
        self.drop = nn.Dropout(dropout)

    def forward(self, x):
        """Encode token ids `x` of shape [B, S].

        Returns:
            ``(outputs, final_state)`` exactly as produced by the recurrent
            module; for ``'LSTM'`` the final state is an ``(h, c)`` tuple.
        """
        embedded = self.emb(x)
        embedded = self.drop(embedded)
        outputs, final_state = self.rnn(embedded)
        return outputs, final_state

class Attention(nn.Module):
    """Additive attention: a tanh-activated linear layer scored by a vector.

    Args:
        hid_dim: size H of both the query state and the encoder outputs.
    """

    def __init__(self, hid_dim):
        super().__init__()
        self.attn = nn.Linear(hid_dim * 2, hid_dim)
        self.v = nn.Linear(hid_dim, 1, bias=False)

    def forward(self, hidden, enc_out):
        """Return attention weights of shape [B, S].

        Args:
            hidden: query state, [B, H].
            enc_out: encoder outputs, [B, S, H].
        """
        _, src_len, _ = enc_out.size()
        # Broadcast the query along the source axis: [B, H] -> [B, S, H].
        query = hidden.unsqueeze(1).expand(-1, src_len, -1)
        paired = torch.cat([query, enc_out], dim=2)       # [B, S, 2H]
        energy = torch.tanh(self.attn(paired))            # [B, S, H]
        scores = self.v(energy).squeeze(2)                # [B, S]
        weights = F.softmax(scores, dim=1)
        return weights

class Decoder(nn.Module):
    """Single-step recurrent decoder with optional attention over encoder outputs.

    Args:
        out_dim: target vocabulary size (embedding rows and output logits).
        emb_dim: embedding width.
        hid_dim: recurrent hidden size.
        n_layers: number of stacked recurrent layers.
        cell: one of ``'RNN'``, ``'LSTM'`` or ``'GRU'``.
        dropout: probability applied to the embeddings and, when
            ``n_layers > 1``, between recurrent layers.
        use_attn: when true, an attention context of width ``hid_dim`` is
            concatenated to the embedding before the recurrent step.
    """

    def __init__(self, out_dim, emb_dim, hid_dim, n_layers, cell, dropout, use_attn=False):
        super().__init__()
        self.emb = nn.Embedding(out_dim, emb_dim)

        rnn_classes = {'RNN': nn.RNN, 'LSTM': nn.LSTM, 'GRU': nn.GRU}
        rnn_cls = rnn_classes[cell]
        # With attention the recurrent input is [embedding ; context].
        rnn_input_dim = emb_dim + hid_dim if use_attn else emb_dim
        inter_layer_dropout = 0 if n_layers <= 1 else dropout
        self.rnn = rnn_cls(
            rnn_input_dim,
            hid_dim,
            n_layers,
            dropout=inter_layer_dropout,
            batch_first=True,
        )
        self.fc = nn.Linear(hid_dim, out_dim)
        self.drop = nn.Dropout(dropout)
        self.use_attn = use_attn
        if use_attn:
            self.attn = Attention(hid_dim)

    def forward(self, tgt_tok, hidden, enc_out=None):
        """Run one decoding step.

        Args:
            tgt_tok: current input token ids, [B].
            hidden: recurrent state (an ``(h, c)`` tuple for LSTM, else a tensor).
            enc_out: encoder outputs [B, S, H]; read only when ``use_attn`` is set.

        Returns:
            ``(logits [B, out_dim], new_hidden, attention_weights_or_None)``.
        """
        embedded = self.drop(self.emb(tgt_tok.unsqueeze(1)))   # [B, 1, E]

        attn_w = None
        if self.use_attn:
            # Query with the top layer's hidden state.
            top_h = hidden[0][-1] if isinstance(hidden, tuple) else hidden[-1]
            attn_w = self.attn(top_h, enc_out)                    # [B, S]
            context = torch.bmm(attn_w.unsqueeze(1), enc_out)     # [B, 1, H]
            rnn_in = torch.cat([embedded, context], dim=2)
        else:
            rnn_in = embedded

        step_out, hidden = self.rnn(rnn_in, hidden)
        logits = self.fc(step_out.squeeze(1))                     # [B, out_dim]
        return logits, hidden, attn_w

class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes step by step with optional teacher forcing.

    Args:
        enc: encoder module; ``enc(src)`` must return ``(enc_out, enc_hidden)``.
        dec: decoder module exposing ``rnn.num_layers``, ``fc.out_features`` and
            ``use_attn``; called as ``dec(tok, hidden, enc_out_or_None)`` and
            returning ``(logits, hidden, attention)``.
        pad_idx: stored on the instance; not read by the methods of this class.
        device: stored on the instance for backward compatibility. Tensors
            created by this class follow the device and dtype of the encoder
            results, so the model keeps working if it lives on a different
            device than the one recorded here.
    """

    def __init__(self, enc, dec, pad_idx, device):
        super().__init__()
        self.enc, self.dec = enc, dec
        self.pad_idx = pad_idx
        self.device = device

    @staticmethod
    def _fit_layers(state, dec_layers):
        """Resize a [layers, B, H] state tensor to `dec_layers` layers.

        The topmost ``min(enc_layers, dec_layers)`` encoder layers are copied
        into the topmost decoder layers. Any remaining lower decoder layers
        start from zeros.
        """
        enc_layers, batch, hid = state.size()
        # new_zeros inherits device and dtype from the encoder state.
        fitted = state.new_zeros(dec_layers, batch, hid)
        n = min(enc_layers, dec_layers)
        fitted[-n:] = state[-n:]
        return fitted

    def _init_decoder_hidden(self, enc_hidden):
        """Turn the encoder's final state into an initial decoder state.

        Handles both a plain tensor (GRU / vanilla RNN) and an ``(h, c)``
        tuple (LSTM); the return value has the same structure as the input,
        with the layer dimension equal to ``self.dec.rnn.num_layers``.
        """
        dec_layers = self.dec.rnn.num_layers
        if isinstance(enc_hidden, tuple):  # LSTM: (h, c)
            h, c = enc_hidden
            return (self._fit_layers(h, dec_layers),
                    self._fit_layers(c, dec_layers))
        return self._fit_layers(enc_hidden, dec_layers)

    def forward(self, src, tgt, teacher_forcing=0.5):
        """Decode `tgt.size(1) - 1` steps and return the logits.

        Args:
            src: source token ids, [B, S].
            tgt: target token ids, [B, T]; column 0 is fed as the first input.
            teacher_forcing: probability, drawn independently per time step,
                of feeding the ground-truth token instead of the model's own
                argmax prediction as the next input.

        Returns:
            Tensor [B, T, out_dim]; position 0 along T is left as zeros,
            positions 1..T-1 hold the decoder logits.
        """
        B, T = tgt.size()
        out_dim = self.dec.fc.out_features

        enc_out, enc_hidden = self.enc(src)
        dec_hidden = self._init_decoder_hidden(enc_hidden)
        attn_memory = enc_out if self.dec.use_attn else None

        # Allocate next to the encoder output rather than on self.device.
        outputs = enc_out.new_zeros(B, T, out_dim)

        inp = tgt[:, 0]  # <sos>
        for t in range(1, T):
            pred, dec_hidden, _ = self.dec(inp, dec_hidden, attn_memory)
            outputs[:, t] = pred
            use_teacher = torch.rand(1).item() < teacher_forcing
            inp = tgt[:, t] if use_teacher else pred.argmax(1)

        return outputs


# Beam Search Decoding

In [5]:
# beam_search.py
import torch, torch.nn.functional as F
from queue import PriorityQueue
from math import log

class BeamNode:
    """One hypothesis in the beam, linked to its predecessor for backtracking.

    Attributes:
        hidden: decoder state to use when this node's token is fed to the decoder.
        prev: parent ``BeamNode`` (``None`` for the start node).
        tok: tensor of shape [1] holding this node's token id.
        logp: cumulative log-probability of the hypothesis.
        len: number of tokens in the hypothesis, counting the start token.
    """

    def __init__(self, hidden, prev, tok, logp, length):
        self.hidden, self.prev, self.tok = hidden, prev, tok
        self.logp, self.len = logp, length

    def score(self):
        """Length-normalised log-probability (higher is better)."""
        return self.logp / float(self.len)


def beam_decode(model, src, src_vocab, tgt_vocab, beam_k=3, max_len=32, device='cpu'):
    """Decode a single source sequence with length-synchronous beam search.

    At every step each live hypothesis is extended by its ``beam_k`` most
    probable tokens and the best ``beam_k`` extensions (by length-normalised
    log-probability) are kept. Extensions ending in ``<eos>`` are set aside as
    finished and never expanded again. The search stops once ``beam_k``
    hypotheses have finished, no live hypothesis is left, or ``max_len``
    tokens have been generated.

    Args:
        model: object exposing ``enc``, ``dec`` (with ``use_attn``) and
            ``eval()``. If it has ``_init_decoder_hidden`` the encoder state is
            passed through it first, so encoder and decoder may differ in depth.
        src: source token ids; only batch element 0 of the predictions is
            read, so a batch of size 1 is assumed.
        src_vocab: unused; kept for backward compatibility.
        tgt_vocab: mapping that contains ``'<sos>'`` and ``'<eos>'``.
        beam_k: beam width, must be >= 1.
        max_len: maximum number of generated tokens.
        device: unused; kept for backward compatibility. The start token is
            created on ``src.device``.

    Returns:
        List of token ids of the best hypothesis, without the start token.
        It ends with the ``<eos>`` id when a finished hypothesis was found.

    Raises:
        ValueError: if ``beam_k`` is smaller than 1.
    """
    if beam_k < 1:
        raise ValueError('beam_k must be >= 1')

    sos_id, eos_id = tgt_vocab['<sos>'], tgt_vocab['<eos>']
    model.eval()
    with torch.no_grad():
        enc_out, hidden = model.enc(src)
        # Adapt the encoder state to the decoder depth the same way training does.
        init_hidden = getattr(model, '_init_decoder_hidden', None)
        if callable(init_hidden):
            hidden = init_hidden(hidden)
        memory = enc_out if model.dec.use_attn else None

        start_tok = torch.tensor([sos_id], device=src.device)
        live = [BeamNode(hidden, None, start_tok, 0.0, 1)]
        finished = []

        for _ in range(max_len):
            candidates = []
            for node in live:
                pred, hid, _ = model.dec(node.tok, node.hidden, memory)
                logps = F.log_softmax(pred, dim=1)
                topv, topi = logps.topk(min(beam_k, logps.size(1)))
                for j in range(topi.size(1)):
                    candidates.append(BeamNode(
                        hid, node, topi[0][j].unsqueeze(0),
                        node.logp + topv[0][j].item(), node.len + 1,
                    ))

            # Sorting by key never compares BeamNode objects, so ties are safe.
            candidates.sort(key=BeamNode.score, reverse=True)
            live = []
            for cand in candidates[:beam_k]:
                if cand.tok.item() == eos_id:
                    finished.append(cand)
                else:
                    live.append(cand)
            if not live or len(finished) >= beam_k:
                break

        # Prefer finished hypotheses; fall back to the best unfinished one
        # when the length budget ran out first.
        best = max(finished or live, key=BeamNode.score)
        seq = []
        while best.prev is not None:
            seq.append(best.tok.item())
            best = best.prev
        return seq[::-1]


# Training, Evaluation & W&B Sweep

In [6]:
# train.py
import wandb, torch, torch.optim as optim, torch.nn as nn
# from data_utils import TransliterationDataset, collate_fn, src_vocab, tgt_vocab
# from model import Encoder, Decoder, Seq2Seq
# from beam_search import beam_decode
from torch.utils.data import DataLoader

# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# Id of the target-side padding token, looked up in the module-level tgt_vocab.
pad_idx = tgt_vocab['<pad>']

# W&B sweep definition: Bayesian search that minimises the logged `val_loss`.
# Entries with `values` are searched over; entries with `value` are fixed.
# NOTE(review): `beam_size` is handed to eval_epoch as `beam_k`, which that
# function never reads, so this dimension presumably has no effect on the
# metric — confirm before spending trials on it.
sweep_config = dict(
    method='bayes',
    metric=dict(name='val_loss', goal='minimize'),
    parameters=dict(
        emb_dim=dict(values=[16, 32, 64, 256]),
        hid_dim=dict(values=[16, 32, 64, 256]),
        enc_layers=dict(values=[1, 2, 3]),
        dec_layers=dict(values=[1, 2, 3]),
        cell_type=dict(values=['RNN', 'GRU', 'LSTM']),
        dropout=dict(values=[0.2, 0.3]),
        beam_size=dict(values=[1, 3, 5]),
        lr=dict(value=1e-3),
        batch_size=dict(value=128),
    ),
)

In [7]:

# def train_epoch(model, loader, opt, crit):
#     model.train(); total=0
#     for src, tgt in loader:
#         src, tgt = src.to(device), tgt.to(device)
#         opt.zero_grad()
#         out = model(src, tgt,teacher_forcing=0.5)
#         loss = crit(out[:,1:].reshape(-1,out.size(-1)), tgt[:,1:].reshape(-1))
#         loss.backward()
#         torch.nn.utils.clip_grad_norm_(model.parameters(),1)
#         opt.step(); total += loss.item()
#     return total/len(loader)

# def eval_epoch(model, loader, crit, beam_k):
#     model.eval(); total=0
#     with torch.no_grad():
#         for src,tgt in loader:
#             src,tgt = src.to(device), tgt.to(device)
#             # teacher_forced loss
#             out = model(src,tgt,teacher_forcing=0.0)
#             total += crit(out[:,1:].reshape(-1,out.size(-1)), tgt[:,1:].reshape(-1)).item()
#     return total/len(loader)

def calculate_accuracy(output, target, pad_idx):
    """Token-level accuracy over the non-padding positions of a batch.

    Args:
        output: logits of shape [B, T, V].
        target: reference token ids of shape [B, T].
        pad_idx: token id that marks padding; those positions are ignored.

    Returns:
        0-dim float tensor: correctly predicted non-pad tokens divided by the
        number of non-pad tokens. It is 0 (instead of NaN) when every target
        position is padding.
    """
    with torch.no_grad():
        pred_tokens = output.argmax(dim=2)           # [B, T]
        mask        = target != pad_idx              # ignore pads
        correct     = (pred_tokens == target) & mask
        # clamp guards against 0/0 when the batch holds no real tokens
        return correct.sum().float() / mask.sum().float().clamp(min=1.0)

def train_epoch(model, loader, opt, crit, pad_idx, teacher_forcing=0.5, clip=1.0):
    """Train `model` for one pass over `loader`.

    Args:
        model: called as ``model(src, tgt, teacher_forcing=...)`` and expected
            to return logits of shape [B, T, V].
        loader: iterable of ``(src, tgt)`` batches.
        opt: optimizer stepped once per batch.
        crit: loss called on flattened logits and targets.
        pad_idx: padding id forwarded to ``calculate_accuracy``.
        teacher_forcing: ratio passed to the model (default keeps the
            previously hard-coded 0.5).
        clip: maximum gradient norm (default keeps the previously
            hard-coded 1).

    Returns:
        ``(mean_loss, mean_accuracy)`` as Python floats, averaged over batches.

    Raises:
        ValueError: if `loader` yields no batches.
    """
    model.train()
    total_loss = 0.0
    total_acc  = 0.0
    n_batches  = 0
    for src, tgt in loader:
        src, tgt = src.to(device), tgt.to(device)
        opt.zero_grad()
        out  = model(src, tgt, teacher_forcing=teacher_forcing)
        # Position 0 only holds <sos>, so it is excluded from loss and accuracy.
        loss = crit(out[:,1:].reshape(-1,out.size(-1)), tgt[:,1:].reshape(-1))
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        opt.step()

        total_loss += loss.item()
        # float() keeps the running sum a plain number rather than a device tensor
        total_acc  += float(calculate_accuracy(out[:,1:], tgt[:,1:], pad_idx))
        n_batches  += 1
    if n_batches == 0:
        raise ValueError('train_epoch received an empty loader')
    return total_loss/n_batches, total_acc/n_batches

def eval_epoch(model, loader, crit, beam_k, pad_idx):
    """Evaluate `model` on `loader` without teacher forcing.

    The model is run with ``teacher_forcing=0.0``, i.e. it is fed its own
    argmax predictions, and loss / token accuracy are averaged over batches.

    Args:
        model: called as ``model(src, tgt, teacher_forcing=0.0)``.
        loader: iterable of ``(src, tgt)`` batches.
        crit: loss called on flattened logits and targets.
        beam_k: accepted for interface compatibility but not used; decoding
            here is greedy.
        pad_idx: padding id forwarded to ``calculate_accuracy``.

    Returns:
        ``(mean_loss, mean_accuracy)`` as Python floats, averaged over batches.

    Raises:
        ValueError: if `loader` yields no batches.
    """
    model.eval()
    total_loss = 0.0
    total_acc  = 0.0
    n_batches  = 0
    with torch.no_grad():
        for src, tgt in loader:
            src, tgt = src.to(device), tgt.to(device)
            out  = model(src, tgt, teacher_forcing=0.0)
            # Position 0 only holds <sos>, so it is excluded from loss and accuracy.
            loss = crit(out[:,1:].reshape(-1,out.size(-1)), tgt[:,1:].reshape(-1))

            total_loss += loss.item()
            # float() keeps the running sum a plain number rather than a device tensor
            total_acc  += float(calculate_accuracy(out[:,1:], tgt[:,1:], pad_idx))
            n_batches  += 1
    if n_batches == 0:
        raise ValueError('eval_epoch received an empty loader')
    return total_loss/n_batches, total_acc/n_batches


In [8]:

def sweep_run():
    """Train and evaluate one sweep trial (no attention) and upload its best checkpoint.

    Hyper-parameters come from the W&B run config. After every epoch the
    train/validation loss and token accuracy are logged, and the weights are
    written to ``model_without_attn.pt`` whenever the validation loss
    improves, so the uploaded file is the best epoch of this trial rather
    than simply the last one.

    Every trial writes the same file name, so after a multi-trial sweep the
    file on disk belongs to the most recent trial.
    """
    data_dir  = '/kaggle/input/lexicons-hindi-transliteration'
    ckpt_path = 'model_without_attn.pt'
    n_epochs  = 10

    # The context manager finishes the run even if training raises.
    with wandb.init() as run:
        cfg = run.config

        # data
        ds = TransliterationDataset(f'{data_dir}/hi.translit.sampled.train.tsv', src_vocab, tgt_vocab)
        dv = TransliterationDataset(f'{data_dir}/hi.translit.sampled.dev.tsv', src_vocab, tgt_vocab)
        dl = DataLoader(ds, batch_size=cfg.batch_size, collate_fn=collate_fn, shuffle=True)
        val_dl = DataLoader(dv, batch_size=cfg.batch_size, collate_fn=collate_fn)

        # model
        enc = Encoder(len(src_vocab), cfg.emb_dim, cfg.hid_dim, cfg.enc_layers, cfg.cell_type, cfg.dropout)
        dec = Decoder(len(tgt_vocab), cfg.emb_dim, cfg.hid_dim, cfg.dec_layers, cfg.cell_type, cfg.dropout, False)
        model = Seq2Seq(enc, dec, pad_idx, device).to(device)
        opt   = optim.Adam(model.parameters(), lr=cfg.lr)
        crit  = nn.CrossEntropyLoss(ignore_index=pad_idx)

        best_val = None
        for epoch in range(1, n_epochs + 1):
            tr_loss, tr_acc = train_epoch(model, dl, opt, crit, pad_idx)
            vl_loss, vl_acc = eval_epoch(model, val_dl, crit, cfg.beam_size, pad_idx)
            wandb.log({
                'epoch':       epoch,
                'train_loss':  tr_loss,
                # float() accepts both plain numbers and 0-dim tensors
                'train_acc':   float(tr_acc),
                'val_loss':    vl_loss,
                'val_acc':     float(vl_acc)
            })

            # save best: overwrite the checkpoint only when validation improves
            if best_val is None or vl_loss < best_val:
                best_val = vl_loss
                torch.save(model.state_dict(), ckpt_path)

        wandb.save(ckpt_path)

# Register the sweep described by sweep_config under the 'dakshina-translit'
# project, then start an agent in this process that calls sweep_run for up to
# 50 trials.
if __name__=='__main__':
    sweep_id = wandb.sweep(sweep_config, project='dakshina-translit')
    wandb.agent(sweep_id, function=sweep_run, count=50)


Create sweep with ID: twa746j3
Sweep URL: https://wandb.ai/da24s006-indian-institue-of-technology-madras-/dakshina-translit/sweeps/twa746j3


[34m[1mwandb[0m: Agent Starting Run: 33fyuoza with config:
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	beam_size: 5
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	dec_layers: 1
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	emb_dim: 64
[34m[1mwandb[0m: 	enc_layers: 3
[34m[1mwandb[0m: 	hid_dim: 64
[34m[1mwandb[0m: 	lr: 0.001
[34m[1mwandb[0m: Tracking run with wandb version 0.19.9
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/kaggle/working/wandb/run-20250520_053257-33fyuoza[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mswept-sweep-1[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/da24s006-indian-institue-of-technology-madras-/dakshina-translit[0m
[34m[1mwandb[0m: 🧹 View sweep at [34m[4mhttps://wandb.ai/da24s006-indian-institue-of-technology-madras-/dakshina-translit/sweeps/twa746j3[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://w

# test set predictions

In [9]:
import torch

# 1. Read the checkpoint written by the sweep, mapping all tensors to the CPU
state_dict = torch.load('model_without_attn.pt', map_location='cpu')

# 2. List each stored parameter together with its shape; the shapes reveal
#    the embedding size, hidden size and layer counts of the saved model
print("Saved parameters and shapes:\n")
for name, tensor in state_dict.items():
    shape = tuple(tensor.size())
    print(f"{name:40s} → {shape}")



Saved parameters and shapes:

enc.emb.weight                           → (29, 16)
enc.rnn.weight_ih_l0                     → (1024, 16)
enc.rnn.weight_hh_l0                     → (1024, 256)
enc.rnn.bias_ih_l0                       → (1024,)
enc.rnn.bias_hh_l0                       → (1024,)
enc.rnn.weight_ih_l1                     → (1024, 256)
enc.rnn.weight_hh_l1                     → (1024, 256)
enc.rnn.bias_ih_l1                       → (1024,)
enc.rnn.bias_hh_l1                       → (1024,)
dec.emb.weight                           → (66, 16)
dec.rnn.weight_ih_l0                     → (1024, 16)
dec.rnn.weight_hh_l0                     → (1024, 256)
dec.rnn.bias_ih_l0                       → (1024,)
dec.rnn.bias_hh_l0                       → (1024,)
dec.fc.weight                            → (66, 256)
dec.fc.bias                              → (66,)


In [10]:
# import torch
# from torch.utils.data import DataLoader

# # Paths and device
# test_path = '/kaggle/input/transliteration-9123/hi.translit.sampled.test.tsv'
# device    = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# pad_idx   = tgt_vocab['<pad>']

# # 1. Prepare test DataLoader
# test_dataset = TransliterationDataset(test_path, src_vocab, tgt_vocab)
# test_loader  = DataLoader(test_dataset, batch_size=128, collate_fn=collate_fn)

# # 2. Re‐create the model with best hyperparameters
# #    Replace these values with your actual best config
# best_emb_dim    = 256
# best_hid_dim    = 256
# best_enc_layers = 1
# best_dec_layers = 1
# best_cell_type  = 'LSTM'
# best_dropout    = 0.3


# enc = Encoder(len(src_vocab), best_emb_dim, best_hid_dim,
#               best_enc_layers, best_cell_type, best_dropout)
# dec = Decoder(len(tgt_vocab), best_emb_dim, best_hid_dim,
#               best_dec_layers, best_cell_type, best_dropout,
#               use_attn=False)
# model = Seq2Seq(enc, dec, pad_idx, device).to(device)

# # 3. Load the saved weights
# model.load_state_dict(torch.load('model_use_attn.pt', map_location=device))
# model.eval()

# # 4. Define the loss criterion
# criterion = torch.nn.CrossEntropyLoss(ignore_index=pad_idx)

# dv = TransliterationDataset('/kaggle/input/transliteration-9123/hi.translit.sampled.dev.tsv', src_vocab, tgt_vocab)
# val_dl = DataLoader(dv, batch_size=128, collate_fn=collate_fn)


# # 5. Run evaluation on the test set
# test_loss, test_acc = eval_epoch(model, test_loader, criterion,
#                                  beam_k=1, pad_idx=pad_idx)

# print(f"Test Loss: {test_loss:.4f}")
# print(f"Test Token‐level Accuracy: {test_acc:.4%}")


In [11]:

# def sweep_run():
#     wandb.init()
#     cfg = wandb.config
    
#     # data
#     ds = TransliterationDataset('/kaggle/input/transliteration-9123/hi.translit.sampled.train.tsv', src_vocab, tgt_vocab)
#     dv = TransliterationDataset('/kaggle/input/transliteration-9123/hi.translit.sampled.dev.tsv', src_vocab, tgt_vocab)
#     dl = DataLoader(ds, batch_size=cfg.batch_size, collate_fn=collate_fn, shuffle=True)
#     val_dl = DataLoader(dv, batch_size=cfg.batch_size, collate_fn=collate_fn)
    
#     # model
#     enc = Encoder(len(src_vocab), cfg.emb_dim, cfg.hid_dim, cfg.enc_layers, cfg.cell_type, cfg.dropout)
#     dec = Decoder(len(tgt_vocab), cfg.emb_dim, cfg.hid_dim, cfg.dec_layers, cfg.cell_type, cfg.dropout, use_attn=False)
#     model = Seq2Seq(enc,dec,pad_idx,device).to(device)
#     opt   = optim.Adam(model.parameters(), lr=cfg.lr)
#     crit  = nn.CrossEntropyLoss(ignore_index=pad_idx)

#     for epoch in range(1, 11):
#         tr_loss, tr_acc = train_epoch(model, dl, opt, crit, pad_idx)
#         vl_loss, vl_acc = eval_epoch(model, val_dl, crit, cfg.beam_size, pad_idx)
#         wandb.log({
#             'epoch':       epoch,
#             'train_loss':  tr_loss,
#             'train_acc':   tr_acc,
#             'val_loss':    vl_loss,
#             'val_acc':     vl_acc
#         })
            
#     # save best
#     torch.save(model.state_dict(),'model.pt')
#     wandb.save('model.pt')

# if __name__=='__main__':
#     sweep_id = wandb.sweep(sweep_config, project='dakshina-translit')
#     wandb.agent(sweep_id, function=sweep_run)


# CHECK