In [3]:
import re
import math
import copy
import torch
from torch import nn
import torch.nn as nn
import torch.optim as optim
from collections import Counter
import torch.nn.functional as F
import torch.multiprocessing as mp
import wandb


In [4]:
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from tqdm import tqdm

import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.express as px

In [5]:
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset

In [6]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [7]:
mp.set_start_method('spawn', force=True)

In [8]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


## Reading Data

In [9]:
def read_data(path):
    with open(path, "r", encoding="utf-8") as f:
        data = [line.strip() for line in f.readlines()]
    return data


In [10]:
train_en = read_data('/kaggle/input/ted-talk-corpus/ted-talks-corpus/train.en')
train_fr = read_data('/kaggle/input/ted-talk-corpus/ted-talks-corpus/train.fr')
dev_en = read_data('/kaggle/input/ted-talk-corpus/ted-talks-corpus/dev.en')
dev_fr = read_data('/kaggle/input/ted-talk-corpus/ted-talks-corpus/dev.fr')
test_en = read_data('/kaggle/input/ted-talk-corpus/ted-talks-corpus/test.en')
test_fr = read_data('/kaggle/input/ted-talk-corpus/ted-talks-corpus/test.fr')


## Pre-processing

In [11]:
def preprocess_sentence(sentence):
    sentence = sentence.lower()
    allowed_characters = r"a-zA-Zà-ÿÀ-Ÿ"

    sentence = re.sub(r"\d+", "<NUM>", sentence)
    sentence = sentence.replace("\n", " ")
    sentence = sentence.replace("\'", "'")
    
    sentence = re.sub(rf"[^ {allowed_characters}\s<NUM>]", "", sentence)
    sentence = re.sub(r" +", " ", sentence)
    
    return sentence.strip()

In [12]:
def preprocess(sentences):
    return [preprocess_sentence(sentence) for sentence in sentences]

In [13]:
train_en_clean = preprocess(train_en)
train_fr_clean = preprocess(train_fr)
dev_en_clean = preprocess(dev_en)
dev_fr_clean = preprocess(dev_fr)
test_en_clean = preprocess(test_en)
test_fr_clean = preprocess(test_fr)

## Creation of Vocabulary

In [14]:
def build_vocab(sentences, min_freq=3):
    word_to_idx = {
        '<PAD>': 0,
        '<UNK>': 1,
        '<s>': 2,
        '</s>': 3
    }
    
    idx_to_word = {idx: word for word, idx in word_to_idx.items()}
    
    word_counter = Counter()
    for sentence in sentences:
        word_counter.update(sentence.split())
        
    for word, freq in word_counter.items():
        if freq >= min_freq:
            idx = len(word_to_idx)
            word_to_idx[word] = idx
            idx_to_word[idx] = word
            
    return word_to_idx, idx_to_word

In [15]:
word_to_idx_en, idx_to_word_en = build_vocab(train_en_clean, min_freq=3)
word_to_idx_fr, idx_to_word_fr = build_vocab(train_fr_clean, min_freq=3)


In [16]:
print(f"English Vocab Size: {len(word_to_idx_en)}")
print(f"French Vocab Size: {len(word_to_idx_fr)}")

English Vocab Size: 9569
French Vocab Size: 12129


## Tokenization

In [17]:
def tokenize(sentences):
    return [nltk.word_tokenize(sentence) for sentence in sentences]

In [18]:
train_en_tokenized = tokenize(train_en_clean)
train_fr_tokenized = tokenize(train_fr_clean)
dev_en_tokenized = tokenize(dev_en_clean)
dev_fr_tokenized = tokenize(dev_fr_clean)
test_en_tokenized = tokenize(test_en_clean)
test_fr_tokenized = tokenize(test_fr_clean)

## Creation of Dataset for training

In [19]:
class TranslationDataset(Dataset):
    def __init__(self, src_sentences, tgt_sentences, word_to_idx_src, word_to_idx_tgt, max_length=100):
        self.src_data = self.convert_tokens_to_idx(src_sentences, word_to_idx_src, max_length)
        self.tgt_data = self.convert_tokens_to_idx(tgt_sentences, word_to_idx_tgt, max_length)
        
    def convert_tokens_to_idx(self, sentences, word_to_idx, max_length):
        indexed_sentences = []
        for tokens in sentences:
            indices = [word_to_idx.get(token, word_to_idx["<UNK>"]) for token in tokens]
            indices = [word_to_idx["<s>"]] + indices[:max_length - 2] + [word_to_idx["</s>"]]
            
            if len(indices) > max_length:
                indices = indices[:max_length]
            
            indexed_sentences.append(torch.tensor(indices))
        return pad_sequence(indexed_sentences, batch_first=True, padding_value=word_to_idx["<PAD>"])
    
    def __len__(self):
        return len(self.src_data)

    def __getitem__(self, idx):
        return self.src_data[idx], self.tgt_data[idx]

In [20]:
train_dataset = TranslationDataset(train_en_tokenized, train_fr_tokenized, word_to_idx_en, word_to_idx_fr, max_length=100)
dev_dataset = TranslationDataset(dev_en_tokenized, dev_fr_tokenized, word_to_idx_en, word_to_idx_fr, max_length=100)
test_dataset = TranslationDataset(test_en_tokenized, test_fr_tokenized, word_to_idx_en, word_to_idx_fr, max_length=100)


In [21]:
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
dev_loader = DataLoader(dev_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=True)

## Positional Encoding

In [22]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_seq_length):
        super(PositionalEncoding, self).__init__()

        pe = torch.zeros(max_seq_length, d_model, device=device)
        position = torch.arange(0, max_seq_length, dtype=torch.float, device=device).unsqueeze(1)
        div_term = torch.pow(10_000, (-torch.arange(0, d_model, 2, device=device).float() / d_model))

        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)

        return self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        return x + self.pe[:, :x.size(1)]

## Attention module

In [23]:
class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, num_heads):
        super(MultiHeadAttention, self).__init__()
        assert d_model % num_heads == 0, "d_model must be divisible by num_heads"
        self.d_model = d_model
        self.num_heads = num_heads
        self.d_k = d_model // num_heads

        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)
        self.W_o = nn.Linear(d_model, d_model)

    def scaled_dot_product_attention(self, Q, K, V, mask=None):
        attn_scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.d_k)
        if mask is not None:
            attn_scores = attn_scores.masked_fill(mask == 0, -1e9)

        attn_probs = torch.softmax(attn_scores, dim=-1)
        output = torch.matmul(attn_probs, V)
        return output

    def split_heads(self, x):
        batch_size, seq_len, d_model = x.size()
        return x.reshape(batch_size, seq_len, self.num_heads, self.d_k).transpose(1, 2)

    def combine_heads(self, x):
        batch_size, _, seq_len, d_k = x.size()
        return x.transpose(1, 2).reshape(batch_size, seq_len, self.d_model)

    def forward(self, Q, K, V, mask=None):
        Q = self.split_heads(self.W_q(Q))
        K = self.split_heads(self.W_k(K))
        V = self.split_heads(self.W_v(V))

        attn_output = self.scaled_dot_product_attention(Q, K, V, mask)
        output = self.W_o(self.combine_heads(attn_output))
        return output


## Feed Forward Module

In [24]:
class PositionWiseFeedForward(nn.Module):
    def __init__(self, d_model, d_ff):
        super(PositionWiseFeedForward, self).__init__()
        self.fc1 = nn.Linear(d_model, d_ff)
        self.fc2 = nn.Linear(d_ff, d_model)
        self.relu = nn.ReLU()

    def forward(self, x):
        return self.fc2(self.relu(self.fc1(x)))

## Encoder Layer

In [25]:
class EncoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout):
        super(EncoderLayer, self).__init__()

        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = PositionWiseFeedForward(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        attn_output = self.self_attn(x, x, x, mask)
        x = self.norm1(x + self.dropout(attn_output))
        ff_output = self.feed_forward(x)
        x = self.norm2(x + self.dropout(ff_output))
        return x

## Decoder Layer

In [26]:
class DecoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout):
        super(DecoderLayer, self).__init__()

        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.cross_attn = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = PositionWiseFeedForward(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)


    def forward(self, x, enc_output, src_mask, tgt_mask):
        attn_output = self.self_attn(x, x, x, tgt_mask)
        x = self.norm1(x + self.dropout(attn_output))
        attn_output = self.cross_attn(x, enc_output, enc_output, src_mask)
        x = self.norm2(x + self.dropout(attn_output))
        ff_output = self.feed_forward(x)
        x = self.norm3(x + self.dropout(ff_output))
        return x


## Consolidated Transformer Class

In [71]:
class Transformer(nn.Module):
    def __init__(self, src_vocab_size, tgt_vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_length, dropout):
        super(Transformer, self).__init__()
        self.encoder_embedding = nn.Embedding(src_vocab_size, d_model)
        self.decoder_embedding = nn.Embedding(tgt_vocab_size, d_model)
        self.positional_encoding = PositionalEncoding(d_model, max_seq_length)

        self.encoder_layers = nn.ModuleList([EncoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])
        self.decoder_layers = nn.ModuleList([DecoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])

        self.fc = nn.Linear(d_model, tgt_vocab_size)
        self.dropout = nn.Dropout(dropout)

    def generate_mask(self, src, tgt):
        src_mask = (src != 0).unsqueeze(1).unsqueeze(2)
        tgt_mask = (tgt != 0).unsqueeze(1).unsqueeze(3)
        seq_length = tgt.size(1)
        nopeak_mask = (1 - torch.triu(torch.ones(1, seq_length, seq_length, device=device), diagonal=1)).bool()
        tgt_mask = tgt_mask & nopeak_mask
        return src_mask, tgt_mask

    def forward(self, src, tgt):
        src_mask, tgt_mask = self.generate_mask(src, tgt)

        src_embedded = self.dropout(self.positional_encoding(self.encoder_embedding(src)))
        tgt_embedded = self.dropout(self.positional_encoding(self.decoder_embedding(tgt)))

        enc_output = src_embedded
        for enc_layer in self.encoder_layers:
            enc_output = enc_layer(enc_output, src_mask)

        dec_output = tgt_embedded
        for dec_layer in self.decoder_layers:
            dec_output = dec_layer(dec_output, enc_output, src_mask, tgt_mask)

        output = self.fc(dec_output)
        return output

In [83]:
en_vocab_size = len(word_to_idx_en)
fr_vocab_size = len(word_to_idx_fr)

d_model = 300
num_heads = 5
num_layers = 3
d_ff = 300
max_seq_length = 100
dropout = 0.3
num_epochs = 10

In [86]:
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
dev_loader = DataLoader(dev_dataset, batch_size=32, shuffle=True)


In [87]:
model = Transformer(en_vocab_size, fr_vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_length, dropout).to(device)
criterion = nn.CrossEntropyLoss(ignore_index=word_to_idx_en['<PAD>'])
optimizer = optim.Adam(model.parameters(), lr=0.0005, betas=(0.9, 0.98), eps=1e-9)


## Model Training

In [88]:
for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0
    loop = tqdm(train_loader, leave=False)
    for i, (src, tgt) in enumerate(loop):
        src = src.to(device)
        tgt = tgt.to(device)
        
        optimizer.zero_grad()
        output = model(src, tgt[:, :-1])
        loss = criterion(output.contiguous().view(-1, fr_vocab_size), tgt[:, 1:].contiguous().view(-1))
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
        
        loop.set_description(f"Epoch [{epoch+1}/{num_epochs}]")
        loop.set_postfix(loss=loss.item())
    
    epoch_loss = epoch_loss / len(train_loader)
    print(f"Epoch [{epoch+1}/{num_epochs}], Training Loss: {epoch_loss}")
    
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for (src, tgt) in dev_loader:
            src = src.to(device)
            tgt = tgt.to(device)
            
            output = model(src, tgt[:, :-1])
            loss = criterion(output.contiguous().view(-1, fr_vocab_size), tgt[:, 1:].contiguous().view(-1))
            
            val_loss += loss.item()
            
    val_loss = val_loss / len(dev_loader)
    print(f"Epoch [{epoch+1}/{num_epochs}], Validation Loss: {val_loss}")

                                                                          

Epoch [1/10], Training Loss: 5.26201448664228
Epoch [1/10], Validation Loss: 4.637305464063372


                                                                          

Epoch [2/10], Training Loss: 4.378786410350027
Epoch [2/10], Validation Loss: 4.343770469938006


                                                                          

Epoch [3/10], Training Loss: 4.021551876942486
Epoch [3/10], Validation Loss: 4.122457614966801


                                                                          

Epoch [4/10], Training Loss: 3.767502897583854
Epoch [4/10], Validation Loss: 3.991719824927194


                                                                          

Epoch [5/10], Training Loss: 3.5605699192486338
Epoch [5/10], Validation Loss: 3.871927559375763


                                                                          

Epoch [6/10], Training Loss: 3.3808331291304468
Epoch [6/10], Validation Loss: 3.726334912436349


                                                                          

Epoch [7/10], Training Loss: 3.2369851621229255
Epoch [7/10], Validation Loss: 3.645478674343654


                                                                          

Epoch [8/10], Training Loss: 3.1183963017677194
Epoch [8/10], Validation Loss: 3.5958921142986844


                                                                          

Epoch [9/10], Training Loss: 3.0188596866278252
Epoch [9/10], Validation Loss: 3.5551946333476474


                                                                           

Epoch [10/10], Training Loss: 2.930576279473457
Epoch [10/10], Validation Loss: 3.4916256155286516


## Bleu Score Calculation

In [89]:
def calculate_bleu(pred, truth):
    pred = pred.cpu().detach().numpy()
    truth = truth.cpu().detach().numpy()
    
    # Flatten both arrays
    pred = pred.flatten()
    truth = truth.flatten()
    
    # Convert to list of integers
    preds = [int(item) for item in pred]
    truths = [int(item) for item in truth]
    
    # Calculate BLEU score
    chencherry = SmoothingFunction()
    bleu_score = sentence_bleu([truths], preds, smoothing_function=chencherry.method1)
    return bleu_score


In [90]:
def remove_pad_eos(tokens, pad_token_idx, eos_token_idx):
    while tokens and (tokens[-1] == pad_token_idx or tokens[-1] == eos_token_idx):
        tokens.pop()
    return tokens

def evaluate_model(model, test_loader, pad_token_idx, eos_token_idx):
    model.eval()
    model.to(device)

    bleu_scores = []
    sum_bleu_scores = 0
    total_batches = len(test_loader)

    with torch.no_grad():
        for i, (src, tgt) in enumerate(test_loader):
            src = src.to(device)
            tgt = tgt.to(device)
            
            output = model(src, tgt[:, :-1])
            
            predicted_seq = output.argmax(dim=-1) 

            # Remove PAD and EOS tokens from predictions and ground truth
            predicted_seq_list = [remove_pad_eos(pred.tolist(), pad_token_idx, eos_token_idx) for pred in predicted_seq]
            truth_list = [remove_pad_eos(tgt_entry.tolist(), pad_token_idx, eos_token_idx) for tgt_entry in tgt[:, 1:]]

            for pred, truth in zip(predicted_seq_list, truth_list):
                bleu_score = calculate_bleu(torch.tensor(pred), torch.tensor(truth))
                bleu_scores.append(bleu_score)
                sum_bleu_scores += bleu_score
                
#                 print(f'Device: {device}, Batch: {i + 1}, BLEU score: {bleu_score}')
    
    average_bleu_score = sum_bleu_scores / total_batches if total_batches > 0 else 0
    print(f'Average BLEU score over {total_batches} batches: {average_bleu_score}')
    
    return average_bleu_score

In [104]:
def make_file_bleu_score(model, test_loader, pad_token_idx, eos_token_idx, idx_to_word):
    model.eval()
    model.to(device)

    bleu_scores = []
    sum_bleu_scores = 0
    total_batches = len(test_loader)

    with open('testbleu.txt', 'w', encoding='utf-8') as file:
        with torch.no_grad():
            for i, (src, tgt) in enumerate(test_loader):
                src = src.to(device)
                tgt = tgt.to(device)
                output = model(src, tgt[:, :-1])
                predicted_seq = output.argmax(dim=-1)
                predicted_seq_list = remove_pad_eos(predicted_seq[0].tolist(), pad_token_idx, eos_token_idx)
                truth_list = remove_pad_eos(tgt[0, 1:].tolist(), pad_token_idx, eos_token_idx)
                predicted_sentence = ' '.join([idx_to_word[idx] for idx in predicted_seq_list if idx in idx_to_word])
                truth_sentence = ' '.join([idx_to_word[idx] for idx in truth_list if idx in idx_to_word])
                bleu_score = calculate_bleu(torch.tensor(predicted_seq_list), torch.tensor(truth_list))
                bleu_scores.append(bleu_score)
                sum_bleu_scores += bleu_score

                file.write(f'Sentence: {truth_sentence}, BLEU score: {bleu_score}\n')

        # Calculate and write the average BLEU score
        average_bleu_score = sum_bleu_scores / total_batches if total_batches > 0 else 0
        print(f'Average BLEU score over {total_batches} batches: {average_bleu_score}')

        # Write average BLEU score to the file
        file.write(f'Average BLEU score over {total_batches} batches: {average_bleu_score}\n')
    
    return average_bleu_score


In [105]:
test_loader = DataLoader(test_dataset, batch_size=1, shuffle=True)
make_file_bleu_score(model, test_loader, word_to_idx_en['<PAD>'], word_to_idx_en['</s>'], idx_to_word_en)

Average BLEU score over 1305 batches: 0.15285215229406385


0.15285215229406385

In [91]:
train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True)
evaluate_model(model, train_loader, word_to_idx_en['<PAD>'], word_to_idx_en['</s>'])

Average BLEU score over 30000 batches: 0.20559437761847157


0.20559437761847157

In [92]:
dev_loader = DataLoader(dev_dataset, batch_size=1, shuffle=True)
evaluate_model(model, dev_loader, word_to_idx_en['<PAD>'], word_to_idx_en['</s>'])

Average BLEU score over 887 batches: 0.12134013992858102


0.12134013992858102

In [93]:
test_loader = DataLoader(test_dataset, batch_size=1, shuffle=True)
evaluate_model(model, test_loader, word_to_idx_en['<PAD>'], word_to_idx_en['</s>'])

Average BLEU score over 1305 batches: 0.1528521522940639


0.1528521522940639

## Hyperparamter Tuning

In [33]:
num_layers = [2, 3]
num_heads = [5, 10]
embedding_dim = [100, 300]
dropout = [0.1, 0.3]

In [44]:
d_ff = 300
max_seq_length = 100
num_epochs = 10


In [45]:
best_test_bleu_score = 0.0
best_hyperparams = None
performance_records = []

In [46]:
test_loader_small = DataLoader(test_dataset, batch_size=1, shuffle=True)


In [48]:
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
dev_loader = DataLoader(dev_dataset, batch_size=32, shuffle=True)


In [49]:
for num_layer in num_layers:
    for num_head in num_heads:
        for emb_dim in embedding_dim:
            for dp in dropout:
                # Start a new wandb run for each configuration
                wandb.init(project='transformer_bleu', config={
                    'num_layers': num_layer,
                    'num_heads': num_head,
                    'embedding_dim': emb_dim,
                    'dropout': dp,
                    'd_ff': 300,
                    'learning_rate': 0.0005,
                    'epochs': 10
                })

                # Initialize model, loss, and optimizer
                model = Transformer(en_vocab_size, fr_vocab_size, emb_dim, num_head, num_layer, d_ff, max_seq_length, dp).to(device)
                criterion = nn.CrossEntropyLoss(ignore_index=word_to_idx_en['<PAD>'])
                optimizer = optim.Adam(model.parameters(), lr=wandb.config.learning_rate, betas=(0.9, 0.98), eps=1e-9)

                train_losses = []
                val_losses = []

                # Training loop
                for epoch in range(wandb.config.epochs):
                    model.train()
                    running_loss = 0.0
                    for inputs, targets in tqdm(train_loader, desc=f'Training Epoch {epoch + 1}/{wandb.config.epochs}'):
                        inputs, targets = inputs.to(device), targets.to(device)

                        optimizer.zero_grad()  # Clear the gradients
                        outputs = model(inputs, targets[:, :-1])  # Pass input to model

                        # Compute loss
                        loss = criterion(outputs.view(-1, outputs.size(-1)), targets[:, 1:].contiguous().view(-1))  # Shift targets for teacher forcing
                        loss.backward()  # Backpropagation
                        optimizer.step()  # Update weights

                        running_loss += loss.item()

                    avg_train_loss = running_loss / len(train_loader)
                    train_losses.append(avg_train_loss)
                    wandb.log({'train_loss': avg_train_loss, 'epoch': epoch})

                    # Validation loss calculation after each epoch
                    model.eval()
                    val_running_loss = 0.0
                    with torch.no_grad():
                        for inputs, targets in dev_loader:
                            inputs, targets = inputs.to(device), targets.to(device)
                            outputs = model(inputs, targets[:, :-1])

                            val_loss = criterion(outputs.view(-1, outputs.size(-1)), targets[:, 1:].contiguous().view(-1))
                            val_running_loss += val_loss.item()

                    avg_val_loss = val_running_loss / len(dev_loader)
                    val_losses.append(avg_val_loss)
                    wandb.log({'val_loss': avg_val_loss, 'epoch': epoch})

                # Evaluate model using test_loader_small for BLEU scores
                avg_test_bleu_score = evaluate_model(model, test_loader_small, word_to_idx_en['<PAD>'], word_to_idx_en['</s>'])

                # Store performance records
                performance_records.append({
                    'num_layers': num_layer,
                    'num_heads': num_head,
                    'embedding_dim': emb_dim,
                    'dropout': dp,
                    'train_losses': train_losses,
                    'val_losses': val_losses,
                    'test_bleu_score': avg_test_bleu_score
                })

                # Update best BLEU score and hyperparameters
                if avg_test_bleu_score > best_test_bleu_score:
                    best_test_bleu_score = avg_test_bleu_score
                    best_hyperparams = {
                        'num_layers': num_layer,
                        'num_heads': num_head,
                        'embedding_dim': emb_dim,
                        'dropout': dp
                    }

                # End the current wandb run
                wandb.finish()


[34m[1mwandb[0m: Currently logged in as: [33mashnadua[0m ([33mashna-dua[0m). Use [1m`wandb login --relogin`[0m to force relogin


Training Epoch 1/10: 100%|██████████| 938/938 [00:27<00:00, 34.45it/s]
Training Epoch 2/10: 100%|██████████| 938/938 [00:25<00:00, 36.09it/s]
Training Epoch 3/10: 100%|██████████| 938/938 [00:25<00:00, 36.22it/s]
Training Epoch 4/10: 100%|██████████| 938/938 [00:26<00:00, 35.94it/s]
Training Epoch 5/10: 100%|██████████| 938/938 [00:26<00:00, 35.96it/s]
Training Epoch 6/10: 100%|██████████| 938/938 [00:25<00:00, 36.09it/s]
Training Epoch 7/10: 100%|██████████| 938/938 [00:25<00:00, 36.08it/s]
Training Epoch 8/10: 100%|██████████| 938/938 [00:26<00:00, 35.91it/s]
Training Epoch 9/10: 100%|██████████| 938/938 [00:26<00:00, 35.98it/s]
Training Epoch 10/10: 100%|██████████| 938/938 [00:26<00:00, 36.03it/s]


Average BLEU score over 1305 batches: 0.11175358233224507


VBox(children=(Label(value='0.017 MB of 0.017 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▁▂▂▃▃▃▃▄▄▅▅▆▆▆▆▇▇██
train_loss,█▅▄▃▃▂▂▂▁▁
val_loss,█▆▅▄▃▃▂▂▁▁

0,1
epoch,9.0
train_loss,3.51869
val_loss,3.94994


Training Epoch 1/10: 100%|██████████| 938/938 [00:26<00:00, 35.88it/s]
Training Epoch 2/10: 100%|██████████| 938/938 [00:26<00:00, 35.79it/s]
Training Epoch 3/10: 100%|██████████| 938/938 [00:25<00:00, 36.19it/s]
Training Epoch 4/10: 100%|██████████| 938/938 [00:25<00:00, 36.14it/s]
Training Epoch 5/10: 100%|██████████| 938/938 [00:26<00:00, 36.04it/s]
Training Epoch 6/10: 100%|██████████| 938/938 [00:26<00:00, 36.04it/s]
Training Epoch 7/10: 100%|██████████| 938/938 [00:25<00:00, 36.12it/s]
Training Epoch 8/10: 100%|██████████| 938/938 [00:25<00:00, 36.10it/s]
Training Epoch 9/10: 100%|██████████| 938/938 [00:25<00:00, 36.11it/s]
Training Epoch 10/10: 100%|██████████| 938/938 [00:26<00:00, 36.06it/s]


Average BLEU score over 1305 batches: 0.07599221064143759


VBox(children=(Label(value='0.017 MB of 0.017 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▁▂▂▃▃▃▃▄▄▅▅▆▆▆▆▇▇██
train_loss,█▅▄▃▃▂▂▂▁▁
val_loss,█▆▄▄▃▃▂▂▁▁

0,1
epoch,9.0
train_loss,4.14032
val_loss,4.24104


Training Epoch 1/10: 100%|██████████| 938/938 [00:56<00:00, 16.46it/s]
Training Epoch 2/10: 100%|██████████| 938/938 [00:56<00:00, 16.48it/s]
Training Epoch 3/10: 100%|██████████| 938/938 [00:57<00:00, 16.44it/s]
Training Epoch 4/10: 100%|██████████| 938/938 [00:57<00:00, 16.43it/s]
Training Epoch 5/10: 100%|██████████| 938/938 [00:57<00:00, 16.43it/s]
Training Epoch 6/10: 100%|██████████| 938/938 [00:57<00:00, 16.44it/s]
Training Epoch 7/10: 100%|██████████| 938/938 [00:57<00:00, 16.44it/s]
Training Epoch 8/10: 100%|██████████| 938/938 [00:57<00:00, 16.44it/s]
Training Epoch 9/10: 100%|██████████| 938/938 [00:57<00:00, 16.43it/s]
Training Epoch 10/10: 100%|██████████| 938/938 [00:57<00:00, 16.44it/s]


Average BLEU score over 1305 batches: 0.10314102708839719


VBox(children=(Label(value='0.017 MB of 0.017 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▁▂▂▃▃▃▃▄▄▅▅▆▆▆▆▇▇██
train_loss,█▅▄▃▃▂▂▂▁▁
val_loss,█▅▄▃▂▂▁▁▁▁

0,1
epoch,9.0
train_loss,2.27019
val_loss,3.46208


Training Epoch 1/10: 100%|██████████| 938/938 [00:56<00:00, 16.46it/s]
Training Epoch 2/10: 100%|██████████| 938/938 [00:56<00:00, 16.46it/s]
Training Epoch 3/10: 100%|██████████| 938/938 [00:57<00:00, 16.44it/s]
Training Epoch 4/10: 100%|██████████| 938/938 [00:57<00:00, 16.43it/s]
Training Epoch 5/10: 100%|██████████| 938/938 [00:57<00:00, 16.43it/s]
Training Epoch 6/10: 100%|██████████| 938/938 [00:57<00:00, 16.44it/s]
Training Epoch 7/10: 100%|██████████| 938/938 [00:57<00:00, 16.44it/s]
Training Epoch 8/10: 100%|██████████| 938/938 [00:57<00:00, 16.43it/s]
Training Epoch 9/10: 100%|██████████| 938/938 [00:57<00:00, 16.43it/s]
Training Epoch 10/10: 100%|██████████| 938/938 [00:57<00:00, 16.44it/s]


Average BLEU score over 1305 batches: 0.1411090794025331


VBox(children=(Label(value='0.017 MB of 0.017 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▁▂▂▃▃▃▃▄▄▅▅▆▆▆▆▇▇██
train_loss,█▅▄▃▃▂▂▂▁▁
val_loss,█▆▅▄▃▃▂▂▁▁

0,1
epoch,9.0
train_loss,3.07928
val_loss,3.62896


Training Epoch 1/10: 100%|██████████| 938/938 [00:30<00:00, 30.64it/s]
Training Epoch 2/10: 100%|██████████| 938/938 [00:30<00:00, 30.76it/s]
Training Epoch 3/10: 100%|██████████| 938/938 [00:30<00:00, 30.93it/s]
Training Epoch 4/10: 100%|██████████| 938/938 [00:30<00:00, 30.79it/s]
Training Epoch 5/10: 100%|██████████| 938/938 [00:30<00:00, 30.82it/s]
Training Epoch 6/10: 100%|██████████| 938/938 [00:30<00:00, 30.86it/s]
Training Epoch 7/10: 100%|██████████| 938/938 [00:30<00:00, 30.83it/s]
Training Epoch 8/10: 100%|██████████| 938/938 [00:30<00:00, 30.79it/s]
Training Epoch 9/10: 100%|██████████| 938/938 [00:30<00:00, 30.81it/s]
Training Epoch 10/10: 100%|██████████| 938/938 [00:30<00:00, 30.79it/s]


Average BLEU score over 1305 batches: 0.11017816659361733


VBox(children=(Label(value='0.017 MB of 0.017 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▁▂▂▃▃▃▃▄▄▅▅▆▆▆▆▇▇██
train_loss,█▅▄▃▃▂▂▂▁▁
val_loss,█▆▅▄▃▃▂▂▁▁

0,1
epoch,9.0
train_loss,3.55628
val_loss,3.96864


Training Epoch 1/10: 100%|██████████| 938/938 [00:30<00:00, 30.63it/s]
Training Epoch 2/10: 100%|██████████| 938/938 [00:30<00:00, 30.73it/s]
Training Epoch 3/10: 100%|██████████| 938/938 [00:30<00:00, 30.90it/s]
Training Epoch 4/10: 100%|██████████| 938/938 [00:30<00:00, 30.81it/s]
Training Epoch 5/10: 100%|██████████| 938/938 [00:30<00:00, 30.80it/s]
Training Epoch 6/10: 100%|██████████| 938/938 [00:30<00:00, 30.84it/s]
Training Epoch 7/10: 100%|██████████| 938/938 [00:30<00:00, 30.84it/s]
Training Epoch 8/10: 100%|██████████| 938/938 [00:30<00:00, 30.78it/s]
Training Epoch 9/10: 100%|██████████| 938/938 [00:30<00:00, 30.77it/s]
Training Epoch 10/10: 100%|██████████| 938/938 [00:30<00:00, 30.82it/s]


Average BLEU score over 1305 batches: 0.07922974324927712


VBox(children=(Label(value='0.017 MB of 0.017 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▁▂▂▃▃▃▃▄▄▅▅▆▆▆▆▇▇██
train_loss,█▅▄▃▃▂▂▂▁▁
val_loss,█▆▄▄▃▃▂▂▁▁

0,1
epoch,9.0
train_loss,4.14545
val_loss,4.23686


Training Epoch 1/10: 100%|██████████| 938/938 [01:01<00:00, 15.31it/s]
Training Epoch 2/10: 100%|██████████| 938/938 [01:00<00:00, 15.45it/s]
Training Epoch 3/10: 100%|██████████| 938/938 [01:00<00:00, 15.42it/s]
Training Epoch 4/10: 100%|██████████| 938/938 [01:00<00:00, 15.39it/s]
Training Epoch 5/10: 100%|██████████| 938/938 [01:00<00:00, 15.40it/s]
Training Epoch 6/10: 100%|██████████| 938/938 [01:00<00:00, 15.42it/s]
Training Epoch 7/10: 100%|██████████| 938/938 [01:00<00:00, 15.42it/s]
Training Epoch 8/10: 100%|██████████| 938/938 [01:00<00:00, 15.40it/s]
Training Epoch 9/10: 100%|██████████| 938/938 [01:00<00:00, 15.39it/s]
Training Epoch 10/10: 100%|██████████| 938/938 [01:00<00:00, 15.38it/s]


Average BLEU score over 1305 batches: 0.1443643528107353


VBox(children=(Label(value='0.017 MB of 0.017 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▁▂▂▃▃▃▃▄▄▅▅▆▆▆▆▇▇██
train_loss,█▅▄▃▃▂▂▂▁▁
val_loss,█▆▄▃▂▂▂▁▁▁

0,1
epoch,9.0
train_loss,2.27504
val_loss,3.5607


Training Epoch 1/10: 100%|██████████| 938/938 [01:01<00:00, 15.35it/s]
Training Epoch 2/10: 100%|██████████| 938/938 [01:00<00:00, 15.47it/s]
Training Epoch 3/10: 100%|██████████| 938/938 [01:00<00:00, 15.40it/s]
Training Epoch 4/10: 100%|██████████| 938/938 [01:00<00:00, 15.43it/s]
Training Epoch 5/10: 100%|██████████| 938/938 [01:00<00:00, 15.44it/s]
Training Epoch 6/10: 100%|██████████| 938/938 [01:00<00:00, 15.41it/s]
Training Epoch 7/10: 100%|██████████| 938/938 [01:00<00:00, 15.40it/s]
Training Epoch 8/10: 100%|██████████| 938/938 [01:00<00:00, 15.40it/s]
Training Epoch 9/10: 100%|██████████| 938/938 [01:00<00:00, 15.41it/s]
Training Epoch 10/10: 100%|██████████| 938/938 [01:00<00:00, 15.39it/s]


Average BLEU score over 1305 batches: 0.12789027634355818


VBox(children=(Label(value='0.017 MB of 0.017 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▁▂▂▃▃▃▃▄▄▅▅▆▆▆▆▇▇██
train_loss,█▅▄▃▃▂▂▂▁▁
val_loss,█▆▅▄▃▃▂▂▁▁

0,1
epoch,9.0
train_loss,3.0912
val_loss,3.67583


Training Epoch 1/10: 100%|██████████| 938/938 [00:32<00:00, 28.77it/s]
Training Epoch 2/10: 100%|██████████| 938/938 [00:32<00:00, 28.93it/s]
Training Epoch 3/10: 100%|██████████| 938/938 [00:32<00:00, 28.99it/s]
Training Epoch 4/10: 100%|██████████| 938/938 [00:32<00:00, 28.87it/s]
Training Epoch 5/10: 100%|██████████| 938/938 [00:32<00:00, 28.92it/s]
Training Epoch 6/10: 100%|██████████| 938/938 [00:32<00:00, 28.91it/s]
Training Epoch 7/10: 100%|██████████| 938/938 [00:32<00:00, 28.88it/s]
Training Epoch 8/10: 100%|██████████| 938/938 [00:32<00:00, 28.84it/s]
Training Epoch 9/10: 100%|██████████| 938/938 [00:32<00:00, 28.89it/s]
Training Epoch 10/10: 100%|██████████| 938/938 [00:32<00:00, 28.88it/s]


Average BLEU score over 1305 batches: 0.11628547140362307


VBox(children=(Label(value='0.017 MB of 0.017 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▁▂▂▃▃▃▃▄▄▅▅▆▆▆▆▇▇██
train_loss,█▅▄▃▃▂▂▂▁▁
val_loss,█▆▄▄▃▂▂▂▁▁

0,1
epoch,9.0
train_loss,3.41286
val_loss,3.94294


Training Epoch 1/10: 100%|██████████| 938/938 [00:32<00:00, 28.68it/s]
Training Epoch 2/10: 100%|██████████| 938/938 [00:32<00:00, 28.81it/s]
Training Epoch 3/10: 100%|██████████| 938/938 [00:32<00:00, 28.98it/s]
Training Epoch 4/10: 100%|██████████| 938/938 [00:32<00:00, 28.81it/s]
Training Epoch 5/10: 100%|██████████| 938/938 [00:32<00:00, 28.89it/s]
Training Epoch 6/10: 100%|██████████| 938/938 [00:32<00:00, 28.93it/s]
Training Epoch 7/10: 100%|██████████| 938/938 [00:32<00:00, 28.86it/s]
Training Epoch 8/10: 100%|██████████| 938/938 [00:32<00:00, 28.89it/s]
Training Epoch 9/10: 100%|██████████| 938/938 [00:32<00:00, 28.88it/s]
Training Epoch 10/10: 100%|██████████| 938/938 [00:32<00:00, 28.89it/s]


Average BLEU score over 1305 batches: 0.0892893065381556


VBox(children=(Label(value='0.017 MB of 0.017 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▁▂▂▃▃▃▃▄▄▅▅▆▆▆▆▇▇██
train_loss,█▅▄▃▃▂▂▂▁▁
val_loss,█▆▅▄▃▃▂▂▁▁

0,1
epoch,9.0
train_loss,4.00872
val_loss,4.14822


Training Epoch 1/10: 100%|██████████| 938/938 [01:12<00:00, 12.92it/s]
Training Epoch 2/10: 100%|██████████| 938/938 [01:12<00:00, 13.02it/s]
Training Epoch 3/10: 100%|██████████| 938/938 [01:12<00:00, 13.02it/s]
Training Epoch 4/10: 100%|██████████| 938/938 [01:12<00:00, 12.99it/s]
Training Epoch 5/10: 100%|██████████| 938/938 [01:12<00:00, 12.99it/s]
Training Epoch 6/10: 100%|██████████| 938/938 [01:12<00:00, 12.99it/s]
Training Epoch 7/10: 100%|██████████| 938/938 [01:12<00:00, 12.99it/s]
Training Epoch 8/10: 100%|██████████| 938/938 [01:12<00:00, 13.01it/s]
Training Epoch 9/10: 100%|██████████| 938/938 [01:12<00:00, 13.02it/s]
Training Epoch 10/10: 100%|██████████| 938/938 [01:12<00:00, 13.01it/s]


Average BLEU score over 1305 batches: 0.13421352407292522


VBox(children=(Label(value='0.017 MB of 0.017 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▁▂▂▃▃▃▃▄▄▅▅▆▆▆▆▇▇██
train_loss,█▅▄▃▃▂▂▂▁▁
val_loss,█▅▃▂▂▂▁▁▁▁

0,1
epoch,9.0
train_loss,2.12746
val_loss,3.38989


Training Epoch 1/10: 100%|██████████| 938/938 [01:12<00:00, 12.94it/s]
Training Epoch 2/10: 100%|██████████| 938/938 [01:12<00:00, 13.01it/s]
Training Epoch 3/10: 100%|██████████| 938/938 [01:12<00:00, 13.01it/s]
Training Epoch 4/10: 100%|██████████| 938/938 [01:12<00:00, 12.99it/s]
Training Epoch 5/10: 100%|██████████| 938/938 [01:12<00:00, 13.02it/s]
Training Epoch 6/10: 100%|██████████| 938/938 [01:12<00:00, 13.02it/s]
Training Epoch 7/10: 100%|██████████| 938/938 [01:12<00:00, 13.00it/s]
Training Epoch 8/10: 100%|██████████| 938/938 [01:12<00:00, 13.00it/s]
Training Epoch 9/10: 100%|██████████| 938/938 [01:12<00:00, 12.99it/s]
Training Epoch 10/10: 100%|██████████| 938/938 [01:12<00:00, 12.99it/s]


Average BLEU score over 1305 batches: 0.16191088650511332


VBox(children=(Label(value='0.017 MB of 0.017 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▁▂▂▃▃▃▃▄▄▅▅▆▆▆▆▇▇██
train_loss,█▅▄▄▃▂▂▂▁▁
val_loss,█▆▅▄▃▂▂▁▁▁

0,1
epoch,9.0
train_loss,2.91998
val_loss,3.47864


Training Epoch 1/10: 100%|██████████| 938/938 [00:39<00:00, 23.86it/s]
Training Epoch 2/10: 100%|██████████| 938/938 [00:38<00:00, 24.10it/s]
Training Epoch 3/10: 100%|██████████| 938/938 [00:38<00:00, 24.08it/s]
Training Epoch 4/10: 100%|██████████| 938/938 [00:39<00:00, 24.03it/s]
Training Epoch 5/10: 100%|██████████| 938/938 [00:38<00:00, 24.08it/s]
Training Epoch 6/10: 100%|██████████| 938/938 [00:38<00:00, 24.05it/s]
Training Epoch 7/10: 100%|██████████| 938/938 [00:38<00:00, 24.06it/s]
Training Epoch 8/10: 100%|██████████| 938/938 [00:38<00:00, 24.09it/s]
Training Epoch 9/10: 100%|██████████| 938/938 [00:38<00:00, 24.09it/s]
Training Epoch 10/10: 100%|██████████| 938/938 [00:38<00:00, 24.08it/s]


Average BLEU score over 1305 batches: 0.11615159024016182


VBox(children=(Label(value='0.017 MB of 0.017 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▁▂▂▃▃▃▃▄▄▅▅▆▆▆▆▇▇██
train_loss,█▅▄▃▃▂▂▂▁▁
val_loss,█▆▅▄▃▂▂▂▁▁

0,1
epoch,9.0
train_loss,3.4196
val_loss,3.90612


Training Epoch 1/10: 100%|██████████| 938/938 [00:39<00:00, 23.83it/s]
Training Epoch 2/10: 100%|██████████| 938/938 [00:38<00:00, 24.09it/s]
Training Epoch 3/10: 100%|██████████| 938/938 [00:38<00:00, 24.12it/s]
Training Epoch 4/10: 100%|██████████| 938/938 [00:39<00:00, 24.02it/s]
Training Epoch 5/10: 100%|██████████| 938/938 [00:38<00:00, 24.12it/s]
Training Epoch 6/10: 100%|██████████| 938/938 [00:38<00:00, 24.06it/s]
Training Epoch 7/10: 100%|██████████| 938/938 [00:38<00:00, 24.07it/s]
Training Epoch 8/10: 100%|██████████| 938/938 [00:38<00:00, 24.09it/s]
Training Epoch 9/10: 100%|██████████| 938/938 [00:38<00:00, 24.09it/s]
Training Epoch 10/10: 100%|██████████| 938/938 [00:38<00:00, 24.08it/s]


Average BLEU score over 1305 batches: 0.08758599148381778


VBox(children=(Label(value='0.017 MB of 0.017 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▁▂▂▃▃▃▃▄▄▅▅▆▆▆▆▇▇██
train_loss,█▅▄▃▃▂▂▂▁▁
val_loss,█▆▅▄▃▃▂▂▂▁

0,1
epoch,9.0
train_loss,4.02001
val_loss,4.1321


Training Epoch 1/10: 100%|██████████| 938/938 [01:18<00:00, 11.93it/s]
Training Epoch 2/10: 100%|██████████| 938/938 [01:18<00:00, 11.99it/s]
Training Epoch 3/10: 100%|██████████| 938/938 [01:18<00:00, 12.01it/s]
Training Epoch 4/10: 100%|██████████| 938/938 [01:18<00:00, 11.98it/s]
Training Epoch 5/10: 100%|██████████| 938/938 [01:18<00:00, 11.98it/s]
Training Epoch 6/10: 100%|██████████| 938/938 [01:18<00:00, 12.00it/s]
Training Epoch 7/10: 100%|██████████| 938/938 [01:18<00:00, 12.00it/s]
Training Epoch 8/10: 100%|██████████| 938/938 [01:18<00:00, 11.98it/s]
Training Epoch 9/10: 100%|██████████| 938/938 [01:18<00:00, 11.98it/s]
Training Epoch 10/10: 100%|██████████| 938/938 [01:18<00:00, 11.98it/s]


Average BLEU score over 1305 batches: 0.1617744217449566


VBox(children=(Label(value='0.017 MB of 0.017 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▁▂▂▃▃▃▃▄▄▅▅▆▆▆▆▇▇██
train_loss,█▆▄▃▃▂▂▂▁▁
val_loss,█▅▃▂▁▁▁▁▁▁

0,1
epoch,9.0
train_loss,2.11504
val_loss,3.49614


Training Epoch 1/10: 100%|██████████| 938/938 [01:18<00:00, 11.94it/s]
Training Epoch 2/10: 100%|██████████| 938/938 [01:18<00:00, 11.99it/s]
Training Epoch 3/10: 100%|██████████| 938/938 [01:18<00:00, 11.99it/s]
Training Epoch 4/10: 100%|██████████| 938/938 [01:18<00:00, 11.99it/s]
Training Epoch 5/10: 100%|██████████| 938/938 [01:18<00:00, 12.01it/s]
Training Epoch 6/10: 100%|██████████| 938/938 [01:18<00:00, 11.99it/s]
Training Epoch 7/10: 100%|██████████| 938/938 [01:18<00:00, 11.98it/s]
Training Epoch 8/10: 100%|██████████| 938/938 [01:18<00:00, 11.97it/s]
Training Epoch 9/10: 100%|██████████| 938/938 [01:18<00:00, 11.99it/s]
Training Epoch 10/10: 100%|██████████| 938/938 [01:18<00:00, 12.00it/s]


Average BLEU score over 1305 batches: 0.14837762247271266


VBox(children=(Label(value='0.017 MB of 0.017 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▁▂▂▃▃▃▃▄▄▅▅▆▆▆▆▇▇██
train_loss,█▅▄▄▃▂▂▂▁▁
val_loss,█▆▅▄▃▂▂▂▁▁

0,1
epoch,9.0
train_loss,2.92105
val_loss,3.52945


In [54]:
import pandas as pd

# Convert performance records to a DataFrame and save to CSV
df = pd.DataFrame(performance_records)
df.to_csv('performance_metrics.csv', index=False)


In [55]:
print(f'Best Test BLEU Score: {best_test_bleu_score}')
print('Best Hyperparameters:', best_hyperparams)

Best Test BLEU Score: 0.16191088650511332
Best Hyperparameters: {'num_layers': 3, 'num_heads': 5, 'embedding_dim': 300, 'dropout': 0.3}


In [60]:
fig_loss_line = go.Figure()

for record in performance_records:
    fig_loss_line.add_trace(go.Scatter(
        x=list(range(1, len(record['train_losses']) + 1)),
        y=record['train_losses'],
        mode='lines+markers',
        name=f'Train Loss (Layers: {record["num_layers"]}, Heads: {record["num_heads"]})'
    ))
    fig_loss_line.add_trace(go.Scatter(
        x=list(range(1, len(record['val_losses']) + 1)),
        y=record['val_losses'],
        mode='lines+markers',
        name=f'Validation Loss (Layers: {record["num_layers"]}, Heads: {record["num_heads"]})',
        line=dict(dash='dash')
    ))

# Update layout for the loss line plot
fig_loss_line.update_layout(
    title='Training and Validation Loss for Different Models (Line Graph)',
    xaxis_title='Epoch',
    yaxis_title='Loss',
    legend_title='Model Configurations',
    template='plotly_white'
)

# Show the line loss plot
fig_loss_line.show()


In [69]:
import plotly.graph_objects as go

# Create a figure for BLEU scores
fig_bleu = go.Figure()

# Add a bar for each configuration
for record in performance_records:
    fig_bleu.add_trace(go.Bar(
        x=[f'Layers: {record["num_layers"]}, Heads: {record["num_heads"]}, Embedding: {record["embedding_dim"]}, Dropout: {record["dropout"]}'],
        y=[record['test_bleu_score']],
        # Removing the name for the bar will hide the legend
    ))

# Update layout for the BLEU score plot with increased height and no legend
fig_bleu.update_layout(
    title='BLEU Scores for Different Models (Bar Graph)',
    xaxis_title='Model Configurations',
    yaxis_title='BLEU Score',
    xaxis_tickangle=-90,  # Rotate x-axis labels to 90 degrees
    height=800,           # Set the height of the figure
    showlegend=False,     # Hide the legend
    template='plotly_white'
)

# Show the BLEU score plot
fig_bleu.show()


# Saving and Reloading the model

In [94]:
torch.save(model.state_dict(), 'transformer_model.pt')

In [95]:
import json

vocab_data = {
    'word_to_idx_en': word_to_idx_en,
    'word_to_idx_fr': word_to_idx_fr,}

with open('transformer_vocab.json', 'w', encoding='utf-8') as f:
    json.dump(vocab_data, f, ensure_ascii=False, indent=4)

In [97]:
torch.save(train_dataset, 'train_dataset.pth')
torch.save(dev_dataset, 'dev_dataset.pth')
torch.save(test_dataset, 'test_dataset.pth')


In [98]:
model_new = Transformer(en_vocab_size, fr_vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_length, dropout).to(device)
model_new.load_state_dict(torch.load('/kaggle/working/transformer_model.pt', weights_only=True))
model.eval()

with open('/kaggle/working/transformer_vocab.json', 'r', encoding='utf-8') as f:
    vocab_data = json.load(f)
    word_to_idx_en_new = vocab_data['word_to_idx_en']
    word_to_idx_fr_new = vocab_data['word_to_idx_fr']

In [99]:
evaluate_model(model_new, test_loader, word_to_idx_en_new['<PAD>'], word_to_idx_en_new['</s>'])

Average BLEU score over 1305 batches: 0.15285215229406368


0.15285215229406368