# Necssary libary

In [2]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import pandas as pd
import unicodedata
import torch.optim as optim
from tqdm import tqdm
import wandb
import pandas as pd

In [3]:
train_path = '/kaggle/input/dakshina/dakshina_dataset_v1.0/bn/lexicons/bn.translit.sampled.train.tsv'
valid_path = '/kaggle/input/dakshina/dakshina_dataset_v1.0/bn/lexicons/bn.translit.sampled.dev.tsv'
test_path  = '/kaggle/input/dakshina/dakshina_dataset_v1.0/bn/lexicons/bn.translit.sampled.test.tsv'


In [4]:
import wandb
wandb.login(key="594642013968a68e466138e783dcece6765c43b9")

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mbgorai005[0m ([33mbgorai005-iit-madras[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

# Encoder, decoder and seq2seq model class

In [5]:
class Encoder(nn.Module):
    def __init__(self, input_size, embedding_dim, hidden_size, num_layers=1, cell_type='LSTM', dropout=0.2):
        super(Encoder, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.cell_type = cell_type
        self.embedding = nn.Embedding(input_size, embedding_dim)
        rnn_class = {'RNN': nn.RNN, 'LSTM': nn.LSTM, 'GRU': nn.GRU}[cell_type]
        self.rnn = rnn_class(
            input_size=embedding_dim,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout if num_layers > 1 else 0
        )
        self.dropout = nn.Dropout(dropout)

    def forward(self, input_seq):
        embedded = self.dropout(self.embedding(input_seq))
        batch_size = input_seq.size(0)
        device = input_seq.device
        if self.cell_type == 'LSTM':
            hidden = (torch.zeros(self.num_layers, batch_size, self.hidden_size).to(device),
                     torch.zeros(self.num_layers, batch_size, self.hidden_size).to(device))
        else:
            hidden = torch.zeros(self.num_layers, batch_size, self.hidden_size).to(device)
        output, hidden = self.rnn(embedded, hidden)
        return output, hidden

class Decoder(nn.Module):
    def __init__(self, output_size, embedding_dim, hidden_size, num_layers=1, cell_type='LSTM', dropout=0.2):
        super(Decoder, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.cell_type = cell_type
        self.embedding = nn.Embedding(output_size, embedding_dim)
        rnn_class = {'RNN': nn.RNN, 'LSTM': nn.LSTM, 'GRU': nn.GRU}[cell_type]
        self.rnn = rnn_class(
            input_size=embedding_dim,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout if num_layers > 1 else 0
        )
        self.out = nn.Linear(hidden_size, output_size)
        self.dropout = nn.Dropout(dropout)
        self.softmax = nn.LogSoftmax(dim=-1)

    def forward(self, input_char, hidden):
        embedded = self.dropout(self.embedding(input_char))
        output, hidden = self.rnn(embedded, hidden)
        output = self.softmax(self.out(output.squeeze(1)))
        return output, hidden

class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, source, target, teacher_forcing_ratio=0.5):
        batch_size = source.size(0)
        target_len = target.size(1)
        target_vocab_size = self.decoder.embedding.num_embeddings
        device = source.device
        outputs = torch.zeros(batch_size, target_len, target_vocab_size).to(device)
        _, hidden = self.encoder(source)
        if self.encoder.num_layers != self.decoder.num_layers:
            if self.encoder.cell_type == 'LSTM':
                h_n, c_n = hidden
                if self.decoder.num_layers > self.encoder.num_layers:
                    extra_layers = self.decoder.num_layers - self.encoder.num_layers
                    extra_h = torch.zeros(extra_layers, batch_size, self.decoder.hidden_size).to(device)
                    extra_c = torch.zeros(extra_layers, batch_size, self.decoder.hidden_size).to(device)
                    h_n = torch.cat([h_n, extra_h], dim=0)
                    c_n = torch.cat([c_n, extra_c], dim=0)
                else:
                    h_n = h_n[:self.decoder.num_layers]
                    c_n = c_n[:self.decoder.num_layers]
                hidden = (h_n, c_n)
            else:
                if self.decoder.num_layers > self.encoder.num_layers:
                    extra_layers = self.decoder.num_layers - self.encoder.num_layers
                    extra_h = torch.zeros(extra_layers, batch_size, self.decoder.hidden_size).to(device)
                    hidden = torch.cat([hidden, extra_h], dim=0)
                else:
                    hidden = hidden[:self.decoder.num_layers]
        decoder_input = target[:, 0].unsqueeze(1)
        for t in range(1, target_len):
            output, hidden = self.decoder(decoder_input, hidden)
            outputs[:, t] = output
            teacher_force = torch.rand(1).item() < teacher_forcing_ratio
            top1 = output.argmax(1)
            decoder_input = target[:, t].unsqueeze(1) if teacher_force else top1.unsqueeze(1)
        return outputs

    def predict(self, src, max_len=30, beam_size=3):
        self.eval()
        batch_size = src.size(0)
        device = src.device
        _, hidden = self.encoder(src)
        outputs = []
        for i in range(batch_size):
            if self.encoder.cell_type == 'LSTM':
                h = hidden[0][:, i:i+1].contiguous()
                c = hidden[1][:, i:i+1].contiguous()
                hidden_state = (h, c)
            else:
                hidden_state = hidden[:, i:i+1].contiguous()
            if self.encoder.num_layers != self.decoder.num_layers:
                if self.encoder.cell_type == 'LSTM':
                    h_n, c_n = hidden_state
                    if self.decoder.num_layers > self.encoder.num_layers:
                        extra_layers = self.decoder.num_layers - self.encoder.num_layers
                        extra_h = torch.zeros(extra_layers, 1, self.decoder.hidden_size).to(device)
                        extra_c = torch.zeros(extra_layers, 1, self.decoder.hidden_size).to(device)
                        h_n = torch.cat([h_n, extra_h], dim=0)
                        c_n = torch.cat([c_n, extra_c], dim=0)
                    else:
                        h_n = h_n[:self.decoder.num_layers]
                        c_n = c_n[:self.decoder.num_layers]
                    hidden_state = (h_n, c_n)
                else:
                    if self.decoder.num_layers > self.encoder.num_layers:
                        extra_layers = self.decoder.num_layers - self.encoder.num_layers
                        extra_h = torch.zeros(extra_layers, 1, self.decoder.hidden_size).to(device)
                        hidden_state = torch.cat([hidden_state, extra_h], dim=0)
                    else:
                        hidden_state = hidden_state[:self.decoder.num_layers]
            beams = [(torch.tensor([1], device=device), 0.0, hidden_state)]  # [sequence, score, hidden]
            for _ in range(max_len):
                new_beams = []
                for seq, score, h in beams:
                    input_char = seq[-1].unsqueeze(0).unsqueeze(0)
                    output, h_new = self.decoder(input_char, h)
                    probs = torch.log_softmax(output, dim=-1).squeeze(0)
                    topk = torch.topk(probs, beam_size)
                    for idx, prob in zip(topk.indices, topk.values):
                        new_seq = torch.cat([seq, idx.unsqueeze(0)])
                        new_score = score + prob.item()
                        if self.decoder.cell_type == 'LSTM':
                            h_new = (h_new[0].contiguous(), h_new[1].contiguous())
                        else:
                            h_new = h_new.contiguous()
                        new_beams.append((new_seq, new_score, h_new))
                beams = sorted(new_beams, key=lambda x: x[1], reverse=True)[:beam_size]
                if beams[0][0][-1].item() == 2:  # Stop if <EOS>
                    break
            outputs.append(beams[0][0][1:])  # Exclude <SOS>
        return outputs


# data prepration class 

In [6]:



class DataPreprocessor:
    def __init__(self, batch_size=32, device='cpu'):
        self.batch_size = batch_size
        self.device = device
        self.src_vocab = None
        self.tgt_vocab = None
        self.PAD_TOKEN = '<PAD>'
        self.SOS_TOKEN = '<SOS>'
        self.EOS_TOKEN = '<EOS>'
        self.UNK_TOKEN = '<UNK>'
        self.PAD_IDX = 0
        self.SOS_IDX = 1
        self.EOS_IDX = 2
        self.UNK_IDX = 3

    def normalize_string(self, s):
        s = unicodedata.normalize('NFC', str(s))
        if all(ord(c) < 128 for c in s):
            s = ''.join(c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn')
            s = s.lower()
        return s.strip()

    def load_dataset(self, file_path=None, data_frame=None):
        if file_path:
            try:
                data = pd.read_csv(file_path, sep='\t', header=None)
            except:
                data = pd.read_csv(file_path, header=None)
        elif data_frame is not None:
            data = data_frame.copy()
        else:
            raise ValueError("Either file_path or data_frame must be provided.")
        data = data.rename(columns={0: 'tgt', 1: 'src'})
        data['src'] = data['src'].apply(self.normalize_string)
        data['tgt'] = data['tgt'].apply(self.normalize_string)
        return data

    def create_vocab(self, data, column):
        vocab = {self.PAD_TOKEN: self.PAD_IDX, self.SOS_TOKEN: self.SOS_IDX,
                 self.EOS_TOKEN: self.EOS_IDX, self.UNK_TOKEN: self.UNK_IDX}
        for seq in data[column]:
            if pd.notna(seq):
                for char in seq:
                    if char not in vocab:
                        vocab[char] = len(vocab)
        return vocab

    def build_vocabularies(self, train_data):
        self.src_vocab = self.create_vocab(train_data, 'src')
        self.tgt_vocab = self.create_vocab(train_data, 'tgt')
        return self.src_vocab, self.tgt_vocab

    class TranslationDataset(Dataset):
        def __init__(self, data, input_vocab, output_vocab):
            self.data = data
            self.input_vocab = input_vocab
            self.output_vocab = output_vocab

        def __len__(self):
            return len(self.data)

        def __getitem__(self, idx):
            src = [self.input_vocab.get(c, self.input_vocab['<UNK>']) for c in self.data.iloc[idx, 1]] + [self.input_vocab['<EOS>']]
            tgt = [self.output_vocab['<SOS>']] + [self.output_vocab.get(c, self.output_vocab['<UNK>']) for c in self.data.iloc[idx, 0]] + [self.output_vocab['<EOS>']]
            src_str = self.data.iloc[idx, 1]
            tgt_str = self.data.iloc[idx, 0]
            return torch.tensor(src, dtype=torch.long), torch.tensor(tgt, dtype=torch.long), src_str, tgt_str

    def pad_collate(self, batch):
        src_batch, tgt_batch, src_strs, tgt_strs = zip(*batch)
        src_padded = pad_sequence(src_batch, batch_first=True, padding_value=self.PAD_IDX)
        tgt_padded = pad_sequence(tgt_batch, batch_first=True, padding_value=self.PAD_IDX)
        return src_padded, tgt_padded, list(src_strs), list(tgt_strs)

    def prepare_data(self, train_data, val_data, test_data):
        if self.src_vocab is None or self.tgt_vocab is None:
            self.build_vocabularies(train_data)
        train_dataset = self.TranslationDataset(train_data, self.src_vocab, self.tgt_vocab)
        val_dataset = self.TranslationDataset(val_data, self.src_vocab, self.tgt_vocab)
        test_dataset = self.TranslationDataset(test_data, self.src_vocab, self.tgt_vocab)
        train_loader = DataLoader(train_dataset, batch_size=self.batch_size, shuffle=True,
                                 collate_fn=self.pad_collate, pin_memory=True)
        val_loader = DataLoader(val_dataset, batch_size=self.batch_size, shuffle=False,
                                collate_fn=self.pad_collate, pin_memory=True)
        test_loader = DataLoader(test_dataset, batch_size=self.batch_size, shuffle=False,
                                 collate_fn=self.pad_collate, pin_memory=True)
        return train_loader, val_loader, test_loader
import torch
import torch.nn as nn



# train class 

In [7]:

# Assuming DataPreprocessor, Encoder, Decoder, Seq2Seq are defined as in your previous code
class Trainer:
    def __init__(self, model, train_loader, val_loader, config, device='cpu', save_path='best_model.pt'):
        self.model = model.to(device)
        self.train_loader = train_loader
        self.val_loader = val_loader
        self.device = device
        self.config = config
        self.teacher_forcing_ratio = config.teacher_forcing
        self.num_epochs = config.epochs
        self.save_path = save_path
        self.criterion = nn.CrossEntropyLoss(ignore_index=0)  # Changed to CrossEntropyLoss
        self.optimizer = optim.Adam(self.model.parameters(), lr=config.learning_rate)
        self.src_vocab = None  # To store vocab for predictions
        self.tgt_vocab = None

    def compute_token_accuracy(self, outputs, targets):
        """Compute token-level accuracy."""
        outputs = outputs.argmax(dim=-1)  # [batch_size, seq_len]
        non_pad_mask = (targets != 0) & (targets != 1) & (targets != 2)  # Exclude <PAD>, <SOS>, <EOS>
        correct = (outputs == targets) & non_pad_mask
        total = non_pad_mask.sum().item()
        correct = correct.sum().item()
        return correct / total if total > 0 else 0.0

    def compute_sequence_accuracy(self, outputs, targets):
        """Compute sequence-level accuracy."""
        outputs = outputs.argmax(dim=-1)  # [batch_size, seq_len]
        correct = 0
        total = outputs.size(0)
        for pred, tgt in zip(outputs, targets):
            # Compare sequences, ignoring <PAD>, <SOS>, <EOS>
            pred = pred[(tgt != 0) & (tgt != 1) & (tgt != 2)]
            tgt = tgt[(tgt != 0) & (tgt != 1) & (tgt != 2)]
            if torch.equal(pred, tgt):
                correct += 1
        return correct / total if total > 0 else 0.0

    def train_epoch(self):
        self.model.train()
        total_loss, total_token_acc, total_seq_acc, total_samples = 0.0, 0.0, 0.0, 0

        pbar = tqdm(self.train_loader, desc="Training", leave=False)
        for src, tgt, _, _ in pbar:  # Adjusted for src_strs, tgt_strs from DataLoader
            src, tgt = src.to(self.device), tgt.to(self.device)
            self.optimizer.zero_grad()

            output = self.model(src, tgt, self.teacher_forcing_ratio)
            output = output[:, 1:].contiguous().view(-1, output.size(-1))
            tgt_flat = tgt[:, 1:].contiguous().view(-1)

            loss = self.criterion(output, tgt_flat)
            loss.backward()
            self.optimizer.step()

            batch_size = src.size(0)
            token_acc = self.compute_token_accuracy(
                output.view(batch_size, -1, output.size(-1)), tgt[:, 1:]
            )
            seq_acc = self.compute_sequence_accuracy(
                output.view(batch_size, -1, output.size(-1)), tgt[:, 1:]
            )

            total_loss += loss.item() * batch_size
            total_token_acc += token_acc * batch_size
            total_seq_acc += seq_acc * batch_size
            total_samples += batch_size

            pbar.set_postfix(loss=loss.item(), token_acc=token_acc, seq_acc=seq_acc)

        avg_loss = total_loss / total_samples
        avg_token_acc = total_token_acc / total_samples
        avg_seq_acc = total_seq_acc / total_samples
        return avg_loss, avg_token_acc, avg_seq_acc

    def evaluate(self, loader):
        self.model.eval()
        total_loss, total_token_acc, total_seq_acc, total_samples = 0.0, 0.0, 0.0, 0

        pbar = tqdm(loader, desc="Evaluating", leave=False)
        with torch.no_grad():
            for src, tgt, _, _ in pbar:
                src, tgt = src.to(self.device), tgt.to(self.device)

                output = self.model(src, tgt, teacher_forcing_ratio=0.0)
                output = output[:, 1:].contiguous().view(-1, output.size(-1))
                tgt_flat = tgt[:, 1:].contiguous().view(-1)

                loss = self.criterion(output, tgt_flat)

                batch_size = src.size(0)
                token_acc = self.compute_token_accuracy(
                    output.view(batch_size, -1, output.size(-1)), tgt[:, 1:]
                )
                seq_acc = self.compute_sequence_accuracy(
                    output.view(batch_size, -1, output.size(-1)), tgt[:, 1:]
                )

                total_loss += loss.item() * batch_size
                total_token_acc += token_acc * batch_size
                total_seq_acc += seq_acc * batch_size
                total_samples += batch_size

                pbar.set_postfix(loss=loss.item(), token_acc=token_acc, seq_acc=seq_acc)

        avg_loss = total_loss / total_samples
        avg_token_acc = total_token_acc / total_samples
        avg_seq_acc = total_seq_acc / total_samples
        return avg_loss, avg_token_acc, avg_seq_acc

    def train(self, src_vocab, tgt_vocab):
        """Train the model, logging metrics and predictions to Wandb."""
        self.src_vocab = src_vocab
        self.tgt_vocab = tgt_vocab
        best_val_seq_acc = 0.0
        patience = getattr(self.config, 'patience', 3)
        patience_counter = 0

        for epoch in range(1, self.num_epochs + 1):
            # Train
            train_loss, train_token_acc, train_seq_acc = self.train_epoch()
            # Evaluate
            val_loss, val_token_acc, val_seq_acc = self.evaluate(self.val_loader)

            # Print metrics
            print(f'\nEpoch {epoch}/{self.num_epochs}')
            print(f'Train Loss: {train_loss:.4f} | Train Token Acc: {train_token_acc*100:.2f}% | Train Seq Acc: {train_seq_acc*100:.2f}%')
            print(f'Val Loss:   {val_loss:.4f} | Val Token Acc:   {val_token_acc*100:.2f}% | Val Seq Acc:   {val_seq_acc*100:.2f}%')
            print('-' * 60)

            # Log metrics to Wandb
            wandb.log({
                'epoch': epoch,
                'train_loss': train_loss,
                 'val_loss': val_loss,
                'train_token_accuracy': train_token_acc,
                'val_token_accuracy': val_token_acc,
                'train_sequence_accuracy': train_seq_acc,
                'val_sequence_accuracy': val_seq_acc
            })

            # Log sample predictions
            src_sample, tgt_sample, src_strs, tgt_strs = next(iter(self.val_loader))
            src_sample = src_sample.to(self.device)
            preds = self.model.predict(src_sample[:5], max_len=30, beam_size=self.config.beam_size)

            inv_src_vocab = {v: k for k, v in src_vocab.items()}
            inv_tgt_vocab = {v: k for k, v in tgt_vocab.items()}
            table = wandb.Table(columns=["Input", "Target", "Prediction"])
            for i in range(len(preds)):
                input_str = ''.join([inv_src_vocab.get(id.item(), '?') for id in src_sample[i] if id.item() not in [0, src_vocab['<EOS>']]])
                target_str = ''.join([inv_tgt_vocab.get(id.item(), '?') for id in tgt_sample[i] if id.item() not in [0, tgt_vocab['<EOS>'], tgt_vocab['<SOS>']]])
                pred_str = ''.join([inv_tgt_vocab.get(id.item(), '?') for id in preds[i] if id.item() not in [0, tgt_vocab['<EOS>']]])
                table.add_data(input_str, target_str, pred_str)
            wandb.log({"predictions": table})

            # Early stopping and checkpoint
            if val_seq_acc > best_val_seq_acc:
                best_val_seq_acc = val_seq_acc
                patience_counter = 0
                torch.save(self.model.state_dict(), self.save_path)
                print(f"✅ New best model saved with val sequence accuracy: {val_seq_acc*100:.2f}%")
            else:
                patience_counter += 1
                print(f"⚠️ No improvement. Patience counter: {patience_counter}/{patience}")
                if patience_counter >= patience:
                    print("🛑 Early stopping triggered.")
                    break


# hyper parameter tuning for searching best hyperparamter

In [None]:
import torch
import wandb
import pandas as pd
import os

def train_loader(
    train_path='/kaggle/input/dakshina/dakshina_dataset_v1.0/bn/lexicons/bn.translit.sampled.train.tsv',
    valid_path='/kaggle/input/dakshina/dakshina_dataset_v1.0/bn/lexicons/bn.translit.sampled.dev.tsv',
    test_path='/kaggle/input/dakshina/dakshina_dataset_v1.0/bn/lexicons/bn.translit.sampled.test.tsv',
    device='cuda' if torch.cuda.is_available() else 'cpu',
    save_path='/kaggle/working/best_model.pt'
):
    """
    Training function for running a WandB sweep on the Bengali Dakshina dataset.
    """
    # Initialize WandB
    wandb.init(project="assignment_3")
    # Shortcut to config
    config = wandb.config

    # Construct a descriptive run name
    run_name = (
        f"-cell-{config.cell_type}"
        f"embed-{config.emb_dim}"
        f"-enc_layers-{config.enc_layers}"
        f"-dec_layers-{config.dec_layers}"
        f"-hid-{config.hidden_dim}"
       
        f"-dropout-{config.dropout}"
        f"-bs-{config.batch_size}"
        f"-lr-{config.learning_rate}"
        f"-tf-{config.teacher_forcing}"
        f"-beam-{config.beam_size}"
    )
    wandb.run.name = run_name

    
    # Initialize DataPreprocessor
    preprocessor = DataPreprocessor(batch_size=config.batch_size, device=device)

    # Load datasets
    train_data = preprocessor.load_dataset(train_path)
    val_data = preprocessor.load_dataset(valid_path)
    test_data = preprocessor.load_dataset(test_path)

    # Prepare data loaders
    train_loader, val_loader, test_loader = preprocessor.prepare_data(train_data, val_data, test_data)

    # Initialize model
    encoder = Encoder(
        input_size=len(preprocessor.src_vocab),
        embedding_dim=config.emb_dim,
        hidden_size=config.hidden_dim,
        num_layers=config.enc_layers,
        cell_type=config.cell_type,
        dropout=config.dropout
    )
    decoder = Decoder(
        output_size=len(preprocessor.tgt_vocab),
        embedding_dim=config.emb_dim,
        hidden_size=config.hidden_dim,
        num_layers=config.dec_layers,
        cell_type=config.cell_type,
        dropout=config.dropout
    )
    model = Seq2Seq(encoder, decoder).to(device)

    # Initialize Trainer
    trainer = Trainer(
        model=model,
        train_loader=train_loader,
        val_loader=val_loader,
        config=config,
        device=device,
        save_path=save_path
    )

    # Train with vocabularies
    trainer.train(preprocessor.src_vocab, preprocessor.tgt_vocab)

   
    # Finish Wandb run
    wandb.finish()

# Wandb sweep config
sweep_config = {
    'method': 'bayes',
    'metric': {'name': 'val_sequence_accuracy', 'goal': 'maximize'},
    'parameters': {
        'emb_dim': {'values': [64, 128, 256]},
        'hidden_dim': {'values': [128, 256]},
        'enc_layers': {'values': [1, 2, 3]},
        'dec_layers': {'values': [1, 2, 3]},
        'cell_type': {'values': ['LSTM', 'GRU','RNN']},
        'dropout': {'values': [0.2, 0.3, 0.4]},
        'batch_size': {'values': [32, 64, 128]},
        'learning_rate': {'values': [0.001, 0.0005, 0.0001]},
        'teacher_forcing': {'values': [0.5, 0.7, 0.9]},
        'beam_size': {'values': [1, 3, 5]},
        'patience': {'value': 3},
        'epochs': {'values': [10, 15]}
    }
}

if __name__ == "__main__":
    sweep_id = wandb.sweep(sweep_config, project="assignment_3")
    wandb.agent(sweep_id, function=train_loader, count=10)


Create sweep with ID: njnchcyt
Sweep URL: https://wandb.ai/bgorai005-iit-madras/assignment_3/sweeps/njnchcyt


[34m[1mwandb[0m: Agent Starting Run: fxchzyhd with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	beam_size: 3
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dec_layers: 2
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	emb_dim: 256
[34m[1mwandb[0m: 	enc_layers: 1
[34m[1mwandb[0m: 	epochs: 15
[34m[1mwandb[0m: 	hidden_dim: 128
[34m[1mwandb[0m: 	learning_rate: 0.0005
[34m[1mwandb[0m: 	patience: 3
[34m[1mwandb[0m: 	teacher_forcing: 0.5


                                                                                                         


Epoch 1/15
Train Loss: 2.5874 | Train Token Acc: 22.46% | Train Seq Acc: 0.20%
Val Loss:   2.3123 | Val Token Acc:   27.09% | Val Seq Acc:   1.24%
------------------------------------------------------------
✅ New best model saved with val sequence accuracy: 1.24%


                                                                                                         


Epoch 2/15
Train Loss: 1.7696 | Train Token Acc: 42.59% | Train Seq Acc: 2.72%
Val Loss:   1.7507 | Val Token Acc:   43.53% | Val Seq Acc:   6.41%
------------------------------------------------------------
✅ New best model saved with val sequence accuracy: 6.41%


                                                                                                         


Epoch 3/15
Train Loss: 1.4092 | Train Token Acc: 53.37% | Train Seq Acc: 6.61%
Val Loss:   1.5791 | Val Token Acc:   49.46% | Val Seq Acc:   9.95%
------------------------------------------------------------
✅ New best model saved with val sequence accuracy: 9.95%


                                                                                                          


Epoch 4/15
Train Loss: 1.2383 | Train Token Acc: 58.88% | Train Seq Acc: 10.13%
Val Loss:   1.4740 | Val Token Acc:   52.68% | Val Seq Acc:   13.36%
------------------------------------------------------------
✅ New best model saved with val sequence accuracy: 13.36%


                                                                                                          


Epoch 5/15
Train Loss: 1.1244 | Train Token Acc: 62.75% | Train Seq Acc: 12.98%
Val Loss:   1.4283 | Val Token Acc:   55.13% | Val Seq Acc:   15.83%
------------------------------------------------------------
✅ New best model saved with val sequence accuracy: 15.83%


                                                                                                          


Epoch 6/15
Train Loss: 1.0507 | Train Token Acc: 65.15% | Train Seq Acc: 15.40%
Val Loss:   1.4045 | Val Token Acc:   55.44% | Val Seq Acc:   16.98%
------------------------------------------------------------
✅ New best model saved with val sequence accuracy: 16.98%


                                                                                                          


Epoch 7/15
Train Loss: 0.9828 | Train Token Acc: 67.52% | Train Seq Acc: 17.63%
Val Loss:   1.3845 | Val Token Acc:   57.86% | Val Seq Acc:   19.02%
------------------------------------------------------------
✅ New best model saved with val sequence accuracy: 19.02%


                                                                                                          


Epoch 8/15
Train Loss: 0.9332 | Train Token Acc: 69.22% | Train Seq Acc: 19.73%
Val Loss:   1.3425 | Val Token Acc:   58.37% | Val Seq Acc:   20.07%
------------------------------------------------------------
✅ New best model saved with val sequence accuracy: 20.07%


                                                                                                          


Epoch 9/15
Train Loss: 0.8903 | Train Token Acc: 70.73% | Train Seq Acc: 21.63%
Val Loss:   1.3455 | Val Token Acc:   59.23% | Val Seq Acc:   21.13%
------------------------------------------------------------
✅ New best model saved with val sequence accuracy: 21.13%


                                                                                                          


Epoch 10/15
Train Loss: 0.8595 | Train Token Acc: 71.76% | Train Seq Acc: 23.09%
Val Loss:   1.3046 | Val Token Acc:   60.62% | Val Seq Acc:   22.86%
------------------------------------------------------------
✅ New best model saved with val sequence accuracy: 22.86%


                                                                                                          


Epoch 11/15
Train Loss: 0.8234 | Train Token Acc: 72.96% | Train Seq Acc: 24.62%
Val Loss:   1.3099 | Val Token Acc:   60.56% | Val Seq Acc:   22.22%
------------------------------------------------------------
⚠️ No improvement. Patience counter: 1/3


                                                                                                         


Epoch 12/15
Train Loss: 0.7950 | Train Token Acc: 73.91% | Train Seq Acc: 25.97%
Val Loss:   1.2824 | Val Token Acc:   61.80% | Val Seq Acc:   23.86%
------------------------------------------------------------
✅ New best model saved with val sequence accuracy: 23.86%


                                                                                                         


Epoch 13/15
Train Loss: 0.7738 | Train Token Acc: 74.72% | Train Seq Acc: 27.39%
Val Loss:   1.2648 | Val Token Acc:   62.60% | Val Seq Acc:   24.66%
------------------------------------------------------------
✅ New best model saved with val sequence accuracy: 24.66%


                                                                                                         


Epoch 1/10
Train Loss: 1.9526 | Train Token Acc: 39.91% | Train Seq Acc: 1.61%
Val Loss:   2.3438 | Val Token Acc:   41.19% | Val Seq Acc:   6.41%
------------------------------------------------------------
✅ New best model saved with val sequence accuracy: 6.41%


                                                                                                          


Epoch 2/10
Train Loss: 1.0461 | Train Token Acc: 65.59% | Train Seq Acc: 10.00%
Val Loss:   2.0366 | Val Token Acc:   50.83% | Val Seq Acc:   13.14%
------------------------------------------------------------
✅ New best model saved with val sequence accuracy: 13.14%


                                                                                                          


Epoch 4/10
Train Loss: 0.6951 | Train Token Acc: 76.73% | Train Seq Acc: 21.67%
Val Loss:   1.8632 | Val Token Acc:   58.01% | Val Seq Acc:   20.84%
------------------------------------------------------------
✅ New best model saved with val sequence accuracy: 20.84%


                                                                                                          


Epoch 5/10
Train Loss: 0.6165 | Train Token Acc: 79.44% | Train Seq Acc: 26.13%
Val Loss:   1.8727 | Val Token Acc:   58.47% | Val Seq Acc:   21.12%
------------------------------------------------------------
✅ New best model saved with val sequence accuracy: 21.12%


                                                                                                          


Epoch 6/10
Train Loss: 0.5669 | Train Token Acc: 81.11% | Train Seq Acc: 29.54%
Val Loss:   1.8266 | Val Token Acc:   60.53% | Val Seq Acc:   24.22%
------------------------------------------------------------
✅ New best model saved with val sequence accuracy: 24.22%


                                                                                                          


Epoch 8/10
Train Loss: 0.4898 | Train Token Acc: 83.73% | Train Seq Acc: 35.02%
Val Loss:   1.8048 | Val Token Acc:   61.70% | Val Seq Acc:   25.64%
------------------------------------------------------------
✅ New best model saved with val sequence accuracy: 25.64%


                                                                                                          


Epoch 9/10
Train Loss: 0.4613 | Train Token Acc: 84.66% | Train Seq Acc: 37.13%
Val Loss:   1.8065 | Val Token Acc:   62.09% | Val Seq Acc:   26.11%
------------------------------------------------------------
✅ New best model saved with val sequence accuracy: 26.11%


                                                                                                         


Epoch 10/10
Train Loss: 0.4385 | Train Token Acc: 85.47% | Train Seq Acc: 38.98%
Val Loss:   1.8159 | Val Token Acc:   63.06% | Val Seq Acc:   27.36%
------------------------------------------------------------
✅ New best model saved with val sequence accuracy: 27.36%


0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_loss,█▄▃▂▂▂▁▁▁▁
train_sequence_accuracy,▁▃▄▅▆▆▇▇██
train_token_accuracy,▁▅▆▇▇▇████
val_loss,█▄▃▂▂▁▁▁▁▁
val_sequence_accuracy,▁▃▅▆▆▇▇▇██
val_token_accuracy,▁▄▅▆▇▇▇███

0,1
epoch,10.0
train_loss,0.43852
train_sequence_accuracy,0.38979
train_token_accuracy,0.85466
val_loss,1.81587
val_sequence_accuracy,0.27363
val_token_accuracy,0.63059


[34m[1mwandb[0m: Agent Starting Run: qjbguqti with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	beam_size: 1
[34m[1mwandb[0m: 	cell_type: RNN
[34m[1mwandb[0m: 	dec_layers: 3
[34m[1mwandb[0m: 	dropout: 0.4
[34m[1mwandb[0m: 	emb_dim: 128
[34m[1mwandb[0m: 	enc_layers: 2
[34m[1mwandb[0m: 	epochs: 15
[34m[1mwandb[0m: 	hidden_dim: 128
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	patience: 3
[34m[1mwandb[0m: 	teacher_forcing: 0.9


                                                                                                         


Epoch 1/15
Train Loss: 2.6691 | Train Token Acc: 19.05% | Train Seq Acc: 0.00%
Val Loss:   3.7493 | Val Token Acc:   9.25% | Val Seq Acc:   0.00%
------------------------------------------------------------
⚠️ No improvement. Patience counter: 1/3


                                                                                                         


Epoch 2/15
Train Loss: 2.4853 | Train Token Acc: 23.50% | Train Seq Acc: 0.01%
Val Loss:   3.6128 | Val Token Acc:   10.76% | Val Seq Acc:   0.01%
------------------------------------------------------------
✅ New best model saved with val sequence accuracy: 0.01%


                                                                                                         


Epoch 6/15
Train Loss: 2.3045 | Train Token Acc: 27.76% | Train Seq Acc: 0.01%
Val Loss:   3.5799 | Val Token Acc:   10.22% | Val Seq Acc:   0.04%
------------------------------------------------------------
⚠️ No improvement. Patience counter: 2/3


                                                                                                         


Epoch 7/15
Train Loss: 2.2948 | Train Token Acc: 28.06% | Train Seq Acc: 0.02%
Val Loss:   3.6045 | Val Token Acc:   11.68% | Val Seq Acc:   0.04%
------------------------------------------------------------
⚠️ No improvement. Patience counter: 3/3
🛑 Early stopping triggered.


0,1
epoch,▁▂▃▅▆▇█
train_loss,█▅▃▂▂▁▁
train_sequence_accuracy,▁▅▄▆▇▄█
train_token_accuracy,▁▄▆▇▇██
val_loss,█▅▁▃▄▄▅
val_sequence_accuracy,▁▃▁█▁██
val_token_accuracy,▁▅▇▆▆▄█

0,1
epoch,7.0
train_loss,2.29481
train_sequence_accuracy,0.00019
train_token_accuracy,0.28056
val_loss,3.60454
val_sequence_accuracy,0.00043
val_token_accuracy,0.11682


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: b5rm47n2 with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	beam_size: 5
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dec_layers: 1
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	emb_dim: 256
[34m[1mwandb[0m: 	enc_layers: 1
[34m[1mwandb[0m: 	epochs: 15
[34m[1mwandb[0m: 	hidden_dim: 256
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	patience: 3
[34m[1mwandb[0m: 	teacher_forcing: 0.7


                                                                                                          


Epoch 1/15
Train Loss: 1.2597 | Train Token Acc: 60.03% | Train Seq Acc: 11.67%
Val Loss:   1.6352 | Val Token Acc:   54.31% | Val Seq Acc:   15.47%
------------------------------------------------------------
✅ New best model saved with val sequence accuracy: 15.47%


                                                                                                          


Epoch 2/15
Train Loss: 0.7132 | Train Token Acc: 77.00% | Train Seq Acc: 26.94%
Val Loss:   1.5811 | Val Token Acc:   59.45% | Val Seq Acc:   21.87%
------------------------------------------------------------
✅ New best model saved with val sequence accuracy: 21.87%


                                                                                                          


Epoch 3/15
Train Loss: 0.5841 | Train Token Acc: 81.34% | Train Seq Acc: 35.01%
Val Loss:   1.5891 | Val Token Acc:   60.53% | Val Seq Acc:   23.68%
------------------------------------------------------------
✅ New best model saved with val sequence accuracy: 23.68%


                                                                                                         


Epoch 9/15
Train Loss: 0.3654 | Train Token Acc: 88.63% | Train Seq Acc: 53.17%
Val Loss:   1.7184 | Val Token Acc:   62.66% | Val Seq Acc:   24.01%
------------------------------------------------------------
⚠️ No improvement. Patience counter: 1/3


                                                                                                          


Epoch 10/15
Train Loss: 0.3551 | Train Token Acc: 88.95% | Train Seq Acc: 54.32%
Val Loss:   1.7205 | Val Token Acc:   62.45% | Val Seq Acc:   24.81%
------------------------------------------------------------
⚠️ No improvement. Patience counter: 2/3


                                                                                                          


Epoch 11/15
Train Loss: 0.3390 | Train Token Acc: 89.48% | Train Seq Acc: 55.73%
Val Loss:   1.7204 | Val Token Acc:   62.88% | Val Seq Acc:   24.78%
------------------------------------------------------------
⚠️ No improvement. Patience counter: 3/3
🛑 Early stopping triggered.


0,1
epoch,▁▂▂▃▄▅▅▆▇▇█
train_loss,█▄▃▂▂▂▂▁▁▁▁
train_sequence_accuracy,▁▃▅▆▆▇▇▇███
train_token_accuracy,▁▅▆▇▇▇▇████
val_loss,▄▁▁▃▃▄▆▄███
val_sequence_accuracy,▁▅▇▆▇█▇█▇▇▇
val_token_accuracy,▁▅▆▆▇▇▇█▇▇█

0,1
epoch,11.0
train_loss,0.33901
train_sequence_accuracy,0.55729
train_token_accuracy,0.89482
val_loss,1.72045
val_sequence_accuracy,0.24776
val_token_accuracy,0.62885


[34m[1mwandb[0m: Agent Starting Run: mz3ojt1r with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	beam_size: 3
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	dec_layers: 3
[34m[1mwandb[0m: 	dropout: 0.4
[34m[1mwandb[0m: 	emb_dim: 256
[34m[1mwandb[0m: 	enc_layers: 1
[34m[1mwandb[0m: 	epochs: 15
[34m[1mwandb[0m: 	hidden_dim: 256
[34m[1mwandb[0m: 	learning_rate: 0.0005
[34m[1mwandb[0m: 	patience: 3
[34m[1mwandb[0m: 	teacher_forcing: 0.5


                                                                                                         


Epoch 1/15
Train Loss: 2.7495 | Train Token Acc: 17.73% | Train Seq Acc: 0.02%
Val Loss:   2.5031 | Val Token Acc:   24.78% | Val Seq Acc:   0.15%
------------------------------------------------------------
✅ New best model saved with val sequence accuracy: 0.15%


                                                                                                         


Epoch 2/15
Train Loss: 1.7820 | Train Token Acc: 43.63% | Train Seq Acc: 2.52%
Val Loss:   1.5914 | Val Token Acc:   48.93% | Val Seq Acc:   8.33%
------------------------------------------------------------
✅ New best model saved with val sequence accuracy: 8.33%


                                                                                                          


Epoch 3/15
Train Loss: 1.2509 | Train Token Acc: 58.86% | Train Seq Acc: 9.18%
Val Loss:   1.3802 | Val Token Acc:   56.87% | Val Seq Acc:   16.61%
------------------------------------------------------------
✅ New best model saved with val sequence accuracy: 16.61%


                                                                                                         


Epoch 7/15
Train Loss: 0.7112 | Train Token Acc: 76.60% | Train Seq Acc: 29.22%
Val Loss:   1.2144 | Val Token Acc:   65.12% | Val Seq Acc:   26.91%
------------------------------------------------------------
✅ New best model saved with val sequence accuracy: 26.91%


                                                                                                         


Epoch 8/15
Train Loss: 0.6582 | Train Token Acc: 78.38% | Train Seq Acc: 32.55%
Val Loss:   1.2132 | Val Token Acc:   65.64% | Val Seq Acc:   28.26%
------------------------------------------------------------
✅ New best model saved with val sequence accuracy: 28.26%


                                                                                                         


Epoch 9/15
Train Loss: 0.6083 | Train Token Acc: 80.03% | Train Seq Acc: 35.29%
Val Loss:   1.2240 | Val Token Acc:   66.28% | Val Seq Acc:   29.15%
------------------------------------------------------------
✅ New best model saved with val sequence accuracy: 29.15%


                                                                                                         


Epoch 10/15
Train Loss: 0.5687 | Train Token Acc: 81.38% | Train Seq Acc: 38.21%
Val Loss:   1.1666 | Val Token Acc:   67.87% | Val Seq Acc:   31.45%
------------------------------------------------------------
✅ New best model saved with val sequence accuracy: 31.45%


                                                                                                         


Epoch 11/15
Train Loss: 0.5336 | Train Token Acc: 82.50% | Train Seq Acc: 40.69%
Val Loss:   1.2108 | Val Token Acc:   67.72% | Val Seq Acc:   30.70%
------------------------------------------------------------
⚠️ No improvement. Patience counter: 1/3


                                                                                                         


Epoch 12/15
Train Loss: 0.5076 | Train Token Acc: 83.33% | Train Seq Acc: 42.28%
Val Loss:   1.1726 | Val Token Acc:   68.78% | Val Seq Acc:   32.60%
------------------------------------------------------------
✅ New best model saved with val sequence accuracy: 32.60%


                                                                                                         


Epoch 13/15
Train Loss: 0.4829 | Train Token Acc: 84.18% | Train Seq Acc: 44.39%
Val Loss:   1.1512 | Val Token Acc:   68.82% | Val Seq Acc:   32.64%
------------------------------------------------------------
✅ New best model saved with val sequence accuracy: 32.64%


                                                                                                         


Epoch 14/15
Train Loss: 0.4606 | Train Token Acc: 84.92% | Train Seq Acc: 46.12%
Val Loss:   1.1677 | Val Token Acc:   69.14% | Val Seq Acc:   32.21%
------------------------------------------------------------
⚠️ No improvement. Patience counter: 1/3


                                                                                                          


Epoch 4/15
Train Loss: 0.8689 | Train Token Acc: 70.86% | Train Seq Acc: 14.31%
Val Loss:   1.7564 | Val Token Acc:   56.73% | Val Seq Acc:   18.22%
------------------------------------------------------------
✅ New best model saved with val sequence accuracy: 18.22%


                                                                                                          


Epoch 5/15
Train Loss: 0.7378 | Train Token Acc: 75.18% | Train Seq Acc: 19.36%
Val Loss:   1.6847 | Val Token Acc:   59.75% | Val Seq Acc:   21.50%
------------------------------------------------------------
✅ New best model saved with val sequence accuracy: 21.50%


                                                                                                          


Epoch 6/15
Train Loss: 0.6467 | Train Token Acc: 78.35% | Train Seq Acc: 23.99%
Val Loss:   1.6509 | Val Token Acc:   61.68% | Val Seq Acc:   24.57%
------------------------------------------------------------
✅ New best model saved with val sequence accuracy: 24.57%


                                                                                                          


Epoch 7/15
Train Loss: 0.5752 | Train Token Acc: 80.84% | Train Seq Acc: 28.67%
Val Loss:   1.6395 | Val Token Acc:   63.15% | Val Seq Acc:   26.60%
------------------------------------------------------------
✅ New best model saved with val sequence accuracy: 26.60%


                                                                                                          


Epoch 12/15
Train Loss: 0.3703 | Train Token Acc: 87.78% | Train Seq Acc: 45.63%
Val Loss:   1.6355 | Val Token Acc:   67.01% | Val Seq Acc:   31.58%
------------------------------------------------------------
✅ New best model saved with val sequence accuracy: 31.58%


                                                                                                          


Epoch 13/15
Train Loss: 0.3467 | Train Token Acc: 88.64% | Train Seq Acc: 48.14%
Val Loss:   1.6329 | Val Token Acc:   67.39% | Val Seq Acc:   31.84%
------------------------------------------------------------
✅ New best model saved with val sequence accuracy: 31.84%


                                                                                                          


Epoch 14/15
Train Loss: 0.3223 | Train Token Acc: 89.44% | Train Seq Acc: 50.70%
Val Loss:   1.6767 | Val Token Acc:   67.69% | Val Seq Acc:   32.58%
------------------------------------------------------------
✅ New best model saved with val sequence accuracy: 32.58%


                                                                                                          


Epoch 15/15
Train Loss: 0.3051 | Train Token Acc: 90.00% | Train Seq Acc: 52.69%
Val Loss:   1.6714 | Val Token Acc:   67.75% | Val Seq Acc:   32.57%
------------------------------------------------------------
⚠️ No improvement. Patience counter: 1/3


0,1
epoch,▁▁▂▃▃▃▄▅▅▅▆▇▇▇█
train_loss,█▅▃▃▂▂▂▂▂▁▁▁▁▁▁
train_sequence_accuracy,▁▁▂▃▄▄▅▅▆▆▇▇▇██
train_token_accuracy,▁▄▅▆▆▇▇▇▇██████
val_loss,█▄▃▂▂▁▁▁▁▁▁▁▁▁▁
val_sequence_accuracy,▁▂▄▅▅▆▇▇▇▇█████
val_token_accuracy,▁▄▅▆▆▇▇▇███████

0,1
epoch,15.0
train_loss,0.30509
train_sequence_accuracy,0.52695
train_token_accuracy,0.89997
val_loss,1.67143
val_sequence_accuracy,0.32568
val_token_accuracy,0.67747


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 0d6pqdbq with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	beam_size: 3
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dec_layers: 3
[34m[1mwandb[0m: 	dropout: 0.4
[34m[1mwandb[0m: 	emb_dim: 128
[34m[1mwandb[0m: 	enc_layers: 3
[34m[1mwandb[0m: 	epochs: 15
[34m[1mwandb[0m: 	hidden_dim: 256
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	patience: 3
[34m[1mwandb[0m: 	teacher_forcing: 0.9


                                                                                                          


Epoch 6/15
Train Loss: 0.6009 | Train Token Acc: 80.01% | Train Seq Acc: 26.74%
Val Loss:   1.6379 | Val Token Acc:   63.80% | Val Seq Acc:   28.10%
------------------------------------------------------------
✅ New best model saved with val sequence accuracy: 28.10%


                                                                                                          


Epoch 7/15
Train Loss: 0.5407 | Train Token Acc: 82.05% | Train Seq Acc: 30.94%
Val Loss:   1.6315 | Val Token Acc:   65.18% | Val Seq Acc:   29.64%
------------------------------------------------------------
✅ New best model saved with val sequence accuracy: 29.64%


                                                                                                          


Epoch 8/15
Train Loss: 0.4884 | Train Token Acc: 83.87% | Train Seq Acc: 34.92%
Val Loss:   1.6131 | Val Token Acc:   66.36% | Val Seq Acc:   31.30%
------------------------------------------------------------
✅ New best model saved with val sequence accuracy: 31.30%


                                                                                                          


Epoch 9/15
Train Loss: 0.4503 | Train Token Acc: 85.15% | Train Seq Acc: 37.97%
Val Loss:   1.6259 | Val Token Acc:   66.73% | Val Seq Acc:   32.51%
------------------------------------------------------------
✅ New best model saved with val sequence accuracy: 32.51%


                                                                                                         


Epoch 10/15
Train Loss: 0.4145 | Train Token Acc: 86.31% | Train Seq Acc: 41.03%
Val Loss:   1.6181 | Val Token Acc:   67.51% | Val Seq Acc:   33.24%
------------------------------------------------------------
✅ New best model saved with val sequence accuracy: 33.24%


                                                                                                          


Epoch 14/15
Train Loss: 0.3192 | Train Token Acc: 89.51% | Train Seq Acc: 50.65%
Val Loss:   1.5952 | Val Token Acc:   69.47% | Val Seq Acc:   35.82%
------------------------------------------------------------
✅ New best model saved with val sequence accuracy: 35.82%


Training:  33%|███▎      | 989/2955 [00:46<01:31, 21.56it/s, loss=0.558, seq_acc=0.438, token_acc=0.83] 

In [5]:
import torch
import pandas as pd
from tqdm import tqdm
import os

class TestEvaluator:
    def __init__(self, model, test_loader, src_vocab, tgt_vocab, device='cuda' if torch.cuda.is_available() else 'cpu'):
        self.model = model.to(device)
        self.test_loader = test_loader
        self.src_vocab = src_vocab
        self.tgt_vocab = tgt_vocab
        self.device = device
        self.inv_src_vocab = {v: k for k, v in src_vocab.items()}
        self.inv_tgt_vocab = {v: k for k, v in tgt_vocab.items()}
    def check_model_file(self, model_path):
        """Check if the model file exists and provide guidance if it doesn't."""
        if not os.path.exists(model_path):
            error_msg = f"Error: Model file not found at '{model_path}'.\n"
            error_msg += "Possible solutions:\n"
            error_msg += "1. Ensure training completed successfully and saved the model to '/kaggle/working/best_model.pt'.\n"
            error_msg += "2. Check if the model was saved to a different path and update 'model_path'.\n"
            error_msg += "3. Rerun the training script to generate the model.\n"
            error_msg += "4. If running in Kaggle, verify that '/kaggle/working/' is accessible and the file was persisted.\n"
            error_msg += "5. Provide the correct path to an existing model file."
            raise FileNotFoundError(error_msg)
        print(f"Model file found at '{model_path}'.")
    def compute_sequence_accuracy(self, outputs, targets):
        """Compute sequence-level accuracy (exact match, ignoring special tokens)."""
        outputs = outputs.argmax(dim=-1)  # [batch_size, seq_len]
        correct = 0
        total = outputs.size(0)
        for pred, tgt in zip(outputs, targets):
            pred = pred[(tgt != 0) & (tgt != 1) & (tgt != 2)]  # Exclude <PAD>, <SOS>, <EOS>
            tgt = tgt[(tgt != 0) & (tgt != 1) & (tgt != 2)]
            if torch.equal(pred, tgt):
                correct += 1
        return correct / total if total > 0 else 0.0

    def evaluate_test_set(self):
        """Evaluate the model on the test set and return sequence accuracy."""
        self.model.eval()
        total_seq_acc, total_samples = 0.0, 0

        with torch.no_grad():
            for src, tgt, _, _ in tqdm(self.test_loader, desc="Evaluating Test Set"):
                src, tgt = src.to(self.device), tgt.to(self.device)
                output = self.model(src, tgt, teacher_forcing_ratio=0.0)
                output = output[:, 1:].contiguous()  # Exclude <SOS>
                seq_acc = self.compute_sequence_accuracy(output, tgt[:, 1:])
                batch_size = src.size(0)
                total_seq_acc += seq_acc * batch_size
                total_samples += batch_size

        avg_seq_acc = total_seq_acc / total_samples
        return avg_seq_acc

    def generate_predictions(self, output_dir="predictions_vanilla", num_samples=10):
        """Generate predictions for the test set and save them to a file. Return samples for display."""
        self.model.eval()
        predictions = []
        sample_data = []

        os.makedirs(output_dir, exist_ok=True)
        pred_file = os.path.join(output_dir, "predictions.tsv")

        with torch.no_grad():
            for src, tgt, src_strs, tgt_strs in tqdm(self.test_loader, desc="Generating Predictions"):
                src = src.to(self.device)
                preds = self.model.predict(src, max_len=30, beam_size=3)  # Use beam_size=3 as default
                for i in range(len(preds)):
                    input_str = src_strs[i]
                    target_str = tgt_strs[i]
                    pred_ids = preds[i]
                    pred_str = ''.join([self.inv_tgt_vocab.get(id.item(), '?') for id in pred_ids if id.item() not in [0, self.tgt_vocab['<EOS>']]])
                    predictions.append((input_str, target_str, pred_str))
                    if len(sample_data) < num_samples:
                        sample_data.append((input_str, target_str, pred_str))

        # Save all predictions to a TSV file
        pred_df = pd.DataFrame(predictions, columns=["Input", "Target", "Prediction"])
        pred_df.to_csv(pred_file, sep='\t', index=False)

        return sample_data, pred_file

    def display_samples(self, sample_data):
        """Format sample predictions as a markdown table."""
        markdown = "| Input | Target | Prediction | Match |\n"
        markdown += "|-------|--------|------------|-------|\n"
        for input_str, target_str, pred_str in sample_data:
            match = "✅" if pred_str == target_str else "❌"
            markdown += f"| {input_str} | {target_str} | {pred_str} | {match} |\n"
        return markdown

    def display_samples_highlight_incorrect(self, sample_data):
        """Format sample predictions as a markdown table, highlighting incorrect predictions."""
        markdown = "| Input | Target | Prediction | Match |\n"
        markdown += "|-------|--------|------------|-------|\n"
        for input_str, target_str, pred_str in sample_data:
            match = "✅" if pred_str == target_str else "❌"
            display_pred = f"**{pred_str}**" if pred_str != target_str else pred_str
            markdown += f"| {input_str} | {target_str} | {display_pred} | {match} |\n"
        return markdown

In [7]:
import torch
import pandas as pd
from tqdm import tqdm
import os
import wandb

# Assuming DataPreprocessor, Encoder, Decoder, Seq2Seq, and Trainer are defined as in the original code

def train_with_best_hyperparams(
    train_path='/kaggle/input/dakshina/dakshina_dataset_v1.0/bn/lexicons/bn.translit.sampled.train.tsv',
    valid_path='/kaggle/input/dakshina/dakshina_dataset_v1.0/bn/lexicons/bn.translit.sampled.dev.tsv',
    test_path='/kaggle/input/dakshina/dakshina_dataset_v1.0/bn/lexicons/bn.translit.sampled.test.tsv',
    device='cuda' if torch.cuda.is_available() else 'cpu',
    save_path='/kaggle/working/best_model.pt'
):
    """
    Train the model with the best hyperparameters and save it to save_path.
    Returns the trained model and vocabularies.
    """
    # Best hyperparameters from WandB
    config = {
        'batch_size': 64,
        'beam_size': 1,
        'cell_type': 'GRU',
        'dec_layers': 2,
        'dropout': 0.2,
        'emb_dim': 256,
        'enc_layers': 3,
        'epochs': 1,
        'hidden_dim': 256,
        'learning_rate': 0.001,
        'patience': 3,
        'teacher_forcing': 0.7
    }

    # Convert config to an object for compatibility with Trainer
    class Config:
        def __init__(self, params):
            for key, value in params.items():
                setattr(self, key, value)
    
    config_obj = Config(config)

    # Initialize WandB run
    wandb.init(project="transliteration", config=config, name="best_hyperparams_run")
    
    # Initialize DataPreprocessor
    preprocessor = DataPreprocessor(batch_size=config['batch_size'], device=device)
    
    # Load datasets
    train_data = preprocessor.load_dataset(train_path)
    val_data = preprocessor.load_dataset(valid_path)
    test_data = preprocessor.load_dataset(test_path)
    
    # Prepare data loaders
    train_loader, val_loader, test_loader = preprocessor.prepare_data(train_data, val_data, test_data)
    
    # Initialize model
    encoder = Encoder(
        input_size=len(preprocessor.src_vocab),
        embedding_dim=config['emb_dim'],
        hidden_size=config['hidden_dim'],
        num_layers=config['enc_layers'],
        cell_type=config['cell_type'],
        dropout=config['dropout']
    )
    decoder = Decoder(
        output_size=len(preprocessor.tgt_vocab),
        embedding_dim=config['emb_dim'],
        hidden_size=config['hidden_dim'],
        num_layers=config['dec_layers'],
        cell_type=config['cell_type'],
        dropout=config['dropout']
    )
    model = Seq2Seq(encoder, decoder).to(device)
    
    # Initialize Trainer
    trainer = Trainer(
        model=model,
        train_loader=train_loader,
        val_loader=val_loader,
        config=config_obj,
        device=device,
        save_path=save_path
    )
    
    # Train the model
    trainer.train(preprocessor.src_vocab, preprocessor.tgt_vocab)
    
    # Finish WandB run
    wandb.finish()
    
    print(f"Training completed. Model saved to {save_path}")
    return model, preprocessor.src_vocab, preprocessor.tgt_vocab, test_loader

def evaluate_with_best_model(
    model,
    test_loader,
    src_vocab,
    tgt_vocab,
    model_path='/kaggle/working/best_model.pt',
    device='cuda' if torch.cuda.is_available() else 'cpu'
):
    """
    Evaluate the model on the test set and generate predictions with highlighted incorrect outputs.
    Returns sequence accuracy, markdown table, and predictions file path.
    """
    # Initialize TestEvaluator
    evaluator = TestEvaluator(model, test_loader, src_vocab, tgt_vocab, device)
    
    # Check model file
    evaluator.check_model_file(model_path)
    
    # Load the model (already loaded in model, but verify for consistency)
    model.load_state_dict(torch.load(model_path, map_location=device))
    print(f"Loaded model from {model_path}")
    
    # Evaluate on test set
    test_seq_acc = evaluator.evaluate_test_set()
    print(f"\nTest Set Sequence Accuracy: {test_seq_acc*100:.2f}%")
    
    # Generate predictions
    sample_data, pred_file = evaluator.generate_predictions(num_samples=10)
    print(f"\nPredictions saved to {pred_file}")
    print(pred_file.sample(20))
    # Display sample predictions with incorrect ones highlighted
    markdown_table = evaluator.display_samples_highlight_incorrect(sample_data)
    print("\nSample Predictions (Incorrect Predictions Highlighted):")
    print(markdown_table)
    
    return test_seq_acc, markdown_table, pred_file

if __name__ == "__main__":
    try:
        # Train with best hyperparameters
        model, src_vocab, tgt_vocab, test_loader = train_with_best_hyperparams()
        
        # Evaluate on test set
        test_seq_acc, markdown_table, pred_file = evaluate_with_best_model(
            model, test_loader, src_vocab, tgt_vocab
        )
    except FileNotFoundError as e:
        print(e)
    except Exception as e:
        print(f"Error during training or evaluation: {e}")

                                                                                                          


Epoch 1/1
Train Loss: 1.2208 | Train Token Acc: 61.94% | Train Seq Acc: 14.63%
Val Loss:   1.5134 | Val Token Acc:   60.00% | Val Seq Acc:   22.58%
------------------------------------------------------------
✅ New best model saved with val sequence accuracy: 22.58%


0,1
epoch,▁
train_loss,▁
train_sequence_accuracy,▁
train_token_accuracy,▁
val_loss,▁
val_sequence_accuracy,▁
val_token_accuracy,▁

0,1
epoch,1.0
train_loss,1.22085
train_sequence_accuracy,0.14631
train_token_accuracy,0.61935
val_loss,1.51341
val_sequence_accuracy,0.22578
val_token_accuracy,0.60005


Training completed. Model saved to /kaggle/working/best_model.pt
Model file found at '/kaggle/working/best_model.pt'.
Loaded model from /kaggle/working/best_model.pt


Evaluating Test Set: 100%|██████████| 145/145 [00:04<00:00, 35.03it/s]



Test Set Sequence Accuracy: 23.33%


Generating Predictions: 100%|██████████| 145/145 [01:52<00:00,  1.28it/s]


Predictions saved to predictions_vanilla/predictions.tsv
Error during training or evaluation: 'str' object has no attribute 'sample'





In [13]:
import pandas as pd

# Correctly load the TSV file into a DataFrame
df = pd.read_csv('/kaggle/working/predictions_vanilla/predictions.tsv', sep='\t')

# Display the first few rows
print(df.sample(200))


                  Input         Target   Prediction
6584        mailashtona      মাইলস্টোন    মাইলস্তান
4524                not            নোট           নট
7348             rurala          রুরাল         রুলা
8792              stari         স্টোরি      স্ট্রাই
7836          sheleshma       শ্লেষ্মা       সেলেসম
...                 ...            ...          ...
6686              marbo          মারবো        মার্ব
8146              sarju           সরযূ       সার্জু
2267              ginir          গিনির        জিনির
2441        grambaseeke    গ্রামবাসীকে  গ্রামবিকেষে
2589  chalachchitrogulo  চলচ্চিত্রগুলো  চলচিত্রগুলো

[200 rows x 3 columns]
