In [69]:
!pip install rouge_score



In [70]:
import os
import re
import random
import numpy as np
from tqdm import tqdm
from torchinfo import summary
from collections import Counter
import matplotlib.pyplot as plt
from rouge_score import rouge_scorer

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset

In [71]:
import nltk

nltk.download("punkt")
nltk.download("punkt_tab")

from nltk.tokenize import word_tokenize
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /usr/share/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


## Config

In [72]:
# Parallel English/French corpus files on Kaggle, one pair per split.
# read_data() reads each pair line by line and requires both files of a
# split to have the same number of lines.
en_train = '/kaggle/input/ted-talks-corpus/train.en'
fr_train = '/kaggle/input/ted-talks-corpus/train.fr'
en_val = '/kaggle/input/ted-talks-corpus/dev.en'
fr_val = '/kaggle/input/ted-talks-corpus/dev.fr'
en_test = '/kaggle/input/ted-talks-corpus/test.en'
fr_test = '/kaggle/input/ted-talks-corpus/test.fr'

In [73]:
# When True, the train/validation datasets and loaders are built; the test
# dataset and loader are built regardless.
train = True
# Pad on the left (True) or on the right (False); passed to MyDataset and
# read by Transformer.predict when it encodes a raw sentence.
padding_before = False
# NOTE(review): not referenced anywhere else in the visible notebook.
plot_losses = False

In [74]:
# NOTE(review): run_full_model takes the embedding size from its hyper_params
# argument; this module-level value is not read in the visible code.
embedding_dim = 300
# Fixed length every source/target sequence is truncated or padded to; also
# the size of the models' position-embedding tables.
max_length = 64
# Learning rate handed to Adam in run_full_model.
lr=1e-4

# Number of attention heads; MultiHeadAttention asserts that the embedding
# size is divisible by it.
heads = 6
# Only used for the checkpoint file name below; run_full_model reads the
# layer count from its hyper_params argument.
layers = 6

epochs = 10
batch_size = 32

In [75]:
# Checkpoints go under ./models; the file name encodes the head and layer
# counts configured above.
os.makedirs("models", exist_ok=True)

save_path = f"./models/transformer_heads{heads}_layers{layers}.pth"

print(f"Saving model to {save_path}")

Saving model to ./models/transformer_heads6_layers6.pth


In [76]:
# Seed Python's, NumPy's and PyTorch's random generators with the same value
# so that repeated runs draw the same random numbers.
random_seed = 42

for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    seed_fn(random_seed)
print("Using Random Seed:", random_seed)

Using Random Seed: 42


In [77]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print(f"Using {device}")

Using cuda


## Utils

In [78]:
def clean_text(text):
    """Normalise one raw line of text.

    The input is converted to ``str``, lower-cased and stripped of
    surrounding whitespace. Every character that is not a letter (ASCII or
    in the ``À-ÿ`` range), a digit, whitespace or one of ``.,;!?':()[]{}-``
    is replaced by a space, and runs of whitespace are collapsed to a single
    space.

    Note that a replaced character at either end of the line leaves a single
    leading/trailing space in the result.

    Args:
        text: Any object; it is passed through ``str()`` first.

    Returns:
        The cleaned string.
    """
    lowered = str(text).lower().strip()
    # Disallowed characters become spaces (not deleted) so neighbouring
    # words do not get glued together.
    filtered = re.sub(r"[^a-zA-ZÀ-ÿ0-9\s.,;!?':()\[\]{}-]", " ", lowered)
    collapsed = re.sub(r"\s+", " ", filtered)

    # Round-trip through UTF-8, silently dropping anything that cannot be
    # encoded.
    return collapsed.encode("utf-8", errors="ignore").decode("utf-8")

def clean_sentences(sentences):
    """Clean every sentence with ``clean_text`` and drop the empty results.

    Args:
        sentences: Iterable of raw sentences.

    Returns:
        List of cleaned, non-empty strings, in input order.
    """
    cleaned = (clean_text(raw) for raw in sentences)
    return [text for text in cleaned if text]

In [79]:
def read_data(en_path, fr_path):
    """Read and clean a parallel English/French corpus.

    Both files are read line by line; line ``i`` of one file is treated as
    the translation of line ``i`` of the other. Each line is normalised with
    ``clean_text``. A pair is dropped as a unit when either side is empty
    after cleaning, so the two returned lists always stay aligned.

    Args:
        en_path: Path of the English file.
        fr_path: Path of the French file.

    Returns:
        Tuple ``(en_data, fr_data)`` of equally long lists of cleaned
        sentences.

    Raises:
        AssertionError: If the two files do not have the same number of
            lines.
    """
    # Explicit UTF-8: the text contains accented characters and the platform
    # default encoding is not guaranteed to be UTF-8.
    with open(en_path, "r", encoding="utf-8") as f:
        en_lines = f.readlines()
    with open(fr_path, "r", encoding="utf-8") as f:
        fr_lines = f.readlines()

    assert len(en_lines) == len(fr_lines), "Data mismatch"

    en_data, fr_data = [], []
    for en_raw, fr_raw in zip(en_lines, fr_lines):
        en_clean = clean_text(en_raw)
        fr_clean = clean_text(fr_raw)
        # Filtering each language on its own could remove different line
        # numbers from the two sides and silently pair up unrelated
        # sentences; dropping the whole pair keeps the alignment intact.
        if en_clean and fr_clean:
            en_data.append(en_clean)
            fr_data.append(fr_clean)

    return en_data, fr_data

def word_tokenizer(sentence):
    """Split a sentence into word tokens with NLTK's ``word_tokenize``.

    Args:
        sentence: The sentence string to tokenise.

    Returns:
        Whatever ``word_tokenize`` returns for the sentence (called with its
        default arguments).
    """
    return word_tokenize(sentence)

In [80]:
def flatten_concatenation(list_of_lists, unique=False):
    """Flatten a list of lists into a single list.

    Implemented in pure Python so that an empty outer list yields ``[]``
    instead of raising, and so that tokens are not copied into a fixed-width
    NumPy string array just to be converted back to a list.

    Args:
        list_of_lists: Iterable of iterables (e.g. tokenised sentences).
        unique: When True, return the distinct items in sorted order instead
            of every item in encounter order.

    Returns:
        A flat list of the items.
    """
    flat_list = [item for sublist in list_of_lists for item in sublist]
    if unique:
        return sorted(set(flat_list))
    return flat_list

In [81]:
def reverse_vocab(vocab):
    """Invert a mapping so that its values become keys and vice versa.

    Args:
        vocab: Dictionary, e.g. token -> index.

    Returns:
        New dictionary mapping each value of ``vocab`` back to its key. If
        several keys share a value, the key encountered last wins.
    """
    inverse = {}
    for token, index in vocab.items():
        inverse[index] = token
    return inverse

In [82]:
def return_words_till_EOS(lst, eos=2):
    """Return the part of ``lst`` that precedes the first ``eos`` entry.

    Args:
        lst: List of items (token ids in this notebook).
        eos: Value marking the end of the sequence.

    Returns:
        A new list with everything before the first ``eos``; ``lst`` itself
        when it contains no ``eos``.
    """
    try:
        cut = lst.index(eos)
    except ValueError:
        # No end marker present: hand the list back unchanged.
        return lst
    return lst[:cut]

### Dataset

In [83]:
def pad_sequence(sequence, max_len, before=True, pad_token=0):
    """Truncate or pad a list to exactly ``max_len`` items.

    Args:
        sequence: List to adjust.
        max_len: Target length.
        before: When padding is needed, put it in front of the sequence
            (True) or after it (False).
        pad_token: Value used for padding.

    Returns:
        A new list of length ``max_len``: the first ``max_len`` items when
        the input is too long, otherwise the input plus padding.
    """
    missing = max_len - len(sequence)
    if missing < 0:
        return sequence[:max_len]

    filler = [pad_token] * missing
    return filler + sequence if before else sequence + filler

In [84]:
class MyDataset(Dataset):
    """Parallel corpus converted once, up front, into fixed-length id tensors.

    Every item is a triple ``(source, decoder_input, label)``:

    * ``source``: ``<sos>`` + English ids + ``<eos>``
    * ``decoder_input``: ``<sos>`` + French ids
    * ``label``: French ids + ``<eos>`` (the decoder input shifted by one)

    All three are truncated/padded to the module-level ``max_length`` and
    created on the module-level ``device``.

    Args:
        en_data: Tokenised English sentences (one list of tokens each).
        fr_data: Tokenised French sentences, aligned with ``en_data``.
        en_vocab: English token -> id mapping containing ``<pad>``,
            ``<unk>``, ``<sos>`` and ``<eos>``.
        fr_vocab: French token -> id mapping with the same special tokens.
        pad_before: Pad on the left (True) or on the right (False).
    """

    def __init__(
        self,
        en_data,
        fr_data,
        en_vocab,
        fr_vocab,
        pad_before=False,
    ):
        self.en_data = []
        self.fr_data = []
        self.labels = []
        self.en_vocab = en_vocab
        self.fr_vocab = fr_vocab

        assert len(en_data) == len(fr_data)
        self.length = len(en_data)

        en_pad = self.en_vocab["<pad>"]
        en_unk = self.en_vocab["<unk>"]
        en_sos = self.en_vocab["<sos>"]
        en_eos = self.en_vocab["<eos>"]
        fr_pad = self.fr_vocab["<pad>"]
        fr_unk = self.fr_vocab["<unk>"]
        fr_sos = self.fr_vocab["<sos>"]
        fr_eos = self.fr_vocab["<eos>"]

        # Let tqdm count the pairs itself so the bar ends exactly at
        # ``self.length`` instead of overshooting in steps of ten.
        pairs = tqdm(
            zip(en_data, fr_data), total=self.length, desc="Creating dataset"
        )
        for en_sentence, fr_sentence in pairs:
            en_indices = [int(self.en_vocab.get(w, en_unk)) for w in en_sentence]
            # Reserve two positions for <sos>/<eos>.
            en_indices = [en_sos] + en_indices[: max_length - 2] + [en_eos]
            en_indices = pad_sequence(
                en_indices, max_length, before=pad_before, pad_token=en_pad
            )
            self.en_data.append(
                torch.tensor(en_indices, dtype=torch.int, device=device)
            )

            # Keep at most max_length - 1 target tokens so that both the
            # <sos>-prefixed decoder input and the <eos>-suffixed label fit.
            # This keeps <eos> in the label of long sentences, matching how
            # the source side is truncated.
            fr_tokens = [int(self.fr_vocab.get(w, fr_unk)) for w in fr_sentence]
            fr_tokens = fr_tokens[: max_length - 1]

            decoder_input = pad_sequence(
                [fr_sos] + fr_tokens, max_length, before=pad_before, pad_token=fr_pad
            )
            self.fr_data.append(
                torch.tensor(decoder_input, dtype=torch.int, device=device)
            )

            label = pad_sequence(
                fr_tokens + [fr_eos], max_length, before=pad_before, pad_token=fr_pad
            )
            # No explicit dtype: the labels keep torch's default integer type.
            self.labels.append(torch.tensor(label, device=device))

        print(f"Dataset created with {self.length} samples")

    def __len__(self):
        """Return the number of sentence pairs."""
        return self.length

    def __getitem__(self, idx):
        """Return ``(source, decoder_input, label)`` for pair ``idx``."""
        return self.en_data[idx], self.fr_data[idx], self.labels[idx]

### Model

In [85]:
def create_positional_encoding(max_length, embedding_dim):
    """Build a sinusoidal positional-encoding table.

    Even columns hold ``sin(pos * w_k)`` and odd columns ``cos(pos * w_k)``
    with ``w_k = exp(-2k * ln(10000) / embedding_dim)``.

    Args:
        max_length: Number of positions (rows).
        embedding_dim: Encoding size (columns); may be odd.

    Returns:
        Float tensor of shape ``(max_length, embedding_dim)`` on the
        module-level ``device``.
    """
    pe = torch.zeros(max_length, embedding_dim)
    position = torch.arange(0, max_length).unsqueeze(1).float()
    div_term = torch.exp(
        torch.arange(0, embedding_dim, 2) * -(np.log(10000.0) / embedding_dim)
    )
    angles = position * div_term
    pe[:, 0::2] = torch.sin(angles)
    # For an odd embedding_dim there is one cosine column fewer than sine
    # columns, so drop the surplus frequency instead of failing on shape.
    pe[:, 1::2] = torch.cos(angles[:, : embedding_dim // 2])
    return pe.to(device)


def make_src_mask(src):
    """Build the padding mask for a source batch.

    Positions whose value is 0 (the ``<pad>`` id in this notebook) are
    marked False. A 3-D input is first summed over its last dimension, so a
    position counts as padding when that sum is 0.

    Args:
        src: Tensor of shape ``(batch, seq_len)`` or
            ``(batch, seq_len, features)``.

    Returns:
        Boolean tensor of shape ``(batch, 1, 1, seq_len)`` on the same
        device as ``src``.
    """
    tokens = src
    if src.dim() == 3:
        tokens = torch.sum(src, dim=-1)

    # The comparison result already lives on src's device, so no transfer
    # (and no dependency on a global device) is needed.
    return (tokens != 0).unsqueeze(1).unsqueeze(2)


def make_trg_mask(trg):
    """Build the causal (lower-triangular) mask for a target batch.

    Entry ``[b, 0, i, j]`` is 1 when position ``i`` may attend to position
    ``j`` (``j <= i``) and 0 otherwise. Padding is not masked here.

    Args:
        trg: Tensor of shape ``(batch, seq_len)`` or
            ``(batch, seq_len, features)``; only its first two sizes and its
            device are used.

    Returns:
        Float tensor of shape ``(batch, 1, seq_len, seq_len)`` on the same
        device as ``trg``.
    """
    n, trg_len = trg.shape[:2]
    # Allocate directly on trg's device rather than building the matrix on
    # the CPU and copying it over on every call.
    causal = torch.tril(torch.ones(trg_len, trg_len, device=trg.device))
    return causal.expand(n, 1, trg_len, trg_len)

In [86]:
class MultiHeadAttention(nn.Module):
    """Scaled dot-product attention over ``num_heads`` heads.

    The input is split into ``num_heads`` slices of size ``head_dim``. The
    query/key/value projections are ``head_dim -> head_dim`` linear layers
    applied to every slice (i.e. the same projection weights are used for
    all heads), followed by a final ``embedding_dim -> embedding_dim``
    output layer.

    Args:
        embedding_dim: Model dimension; must be divisible by ``num_heads``.
        num_heads: Number of attention heads.
    """

    def __init__(self, embedding_dim: int = 512, num_heads: int = 8):
        super(MultiHeadAttention, self).__init__()
        self.embedding_dim = embedding_dim
        self.num_heads = num_heads
        self.head_dim = embedding_dim // num_heads

        assert (
            self.head_dim * num_heads == embedding_dim
        ), "Embedding dimension must be divisible by number of heads"

        self.q = nn.Linear(self.head_dim, self.head_dim)
        self.k = nn.Linear(self.head_dim, self.head_dim)
        self.v = nn.Linear(self.head_dim, self.head_dim)
        self.fc = nn.Linear(self.embedding_dim, self.embedding_dim)

    def forward(self, value, key, query, mask):
        """Attend from ``query`` positions to ``key``/``value`` positions.

        Args:
            value: ``(batch, value_len, embedding_dim)``.
            key: ``(batch, key_len, embedding_dim)``.
            query: ``(batch, query_len, embedding_dim)``.
            mask: Tensor broadcastable to
                ``(batch, heads, query_len, key_len)``; entries equal to 0
                are excluded from attention. ``None`` disables masking.

        Returns:
            Tensor of shape ``(batch, query_len, embedding_dim)``.
        """
        n = query.size(0)
        query_len, key_len, value_len = query.size(1), key.size(1), value.size(1)

        value = self.v(value.reshape(n, value_len, self.num_heads, self.head_dim))
        query = self.q(query.reshape(n, query_len, self.num_heads, self.head_dim))
        key = self.k(key.reshape(n, key_len, self.num_heads, self.head_dim))

        # energy[n, h, q, k]: dot product of query q and key k in head h.
        energy = torch.einsum("nqhd,nkhd->nhqk", query, key)
        if mask is not None:
            # Use the most negative finite value rather than -inf: masked
            # entries still get zero weight after softmax, but a row in
            # which every key is masked no longer turns into NaN.
            energy = energy.masked_fill(mask == 0, torch.finfo(energy.dtype).min)
        attention = F.softmax(energy / np.sqrt(self.head_dim), dim=3)

        # Weighted sum of the values, then merge the heads back together.
        out = torch.einsum("nhql,nlhd->nqhd", attention, value).reshape(
            n, query_len, self.embedding_dim
        )
        out = self.fc(out)

        return out

In [87]:
class TransformerBlock(nn.Module):
    """Attention sub-layer followed by a position-wise feed-forward sub-layer.

    Each sub-layer adds its input back (residual connection), then applies
    layer normalisation and dropout.

    Args:
        embed_size: Model dimension.
        heads: Number of attention heads.
        forward_expansion: The feed-forward hidden layer has
            ``forward_expansion * embed_size`` units.
        dropout: Dropout probability applied after each layer norm.
    """

    def __init__(
        self,
        embed_size: int,
        heads: int,
        forward_expansion: int,
        dropout: float,
    ):
        super().__init__()
        hidden_size = forward_expansion * embed_size

        self.attention = MultiHeadAttention(embed_size, heads)
        self.feed_forward = nn.Sequential(
            nn.Linear(embed_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, embed_size),
        )
        self.layer_norm1 = nn.Sequential(nn.LayerNorm(embed_size), nn.Dropout(dropout))
        self.layer_norm2 = nn.Sequential(nn.LayerNorm(embed_size), nn.Dropout(dropout))

    def forward(self, value, key, query, mask):
        """Run the block; the residual path follows ``query``.

        Args:
            value: Attention values.
            key: Attention keys.
            query: Attention queries.
            mask: Attention mask forwarded to ``MultiHeadAttention``.

        Returns:
            Tensor with the same shape as ``query``.
        """
        attended = self.attention(value, key, query, mask)
        normed = self.layer_norm1(attended + query)
        return self.layer_norm2(self.feed_forward(normed) + normed)

In [88]:
class Encoder(nn.Module):
    """Stack of ``TransformerBlock`` layers over embedded source tokens.

    Token embeddings are summed with learned position embeddings, passed
    through dropout and then through every block with self-attention
    (value = key = query).

    Args:
        src_vocab_size: Size of the source vocabulary.
        embed_size: Model dimension.
        num_layers: Number of stacked blocks.
        heads: Attention heads per block.
        forward_expansion: Feed-forward expansion factor per block.
        dropout: Dropout probability.
        max_len: Number of positions in the position-embedding table.
    """

    def __init__(
        self,
        src_vocab_size: int,
        embed_size: int,
        num_layers: int,
        heads: int,
        forward_expansion: int,
        dropout: float,
        max_len: int,
    ):
        super().__init__()
        self.embed_size = embed_size
        self.word_embedding = nn.Embedding(src_vocab_size, embed_size)
        self.position_embedding = nn.Embedding(max_len, embed_size)

        blocks = []
        for _ in range(num_layers):
            blocks.append(
                TransformerBlock(embed_size, heads, forward_expansion, dropout)
            )
        self.layers = nn.ModuleList(blocks)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask):
        """Encode a batch of source token ids.

        Args:
            x: Token ids of shape ``(batch, seq_len)``.
            mask: Source mask forwarded to every block.

        Returns:
            Tensor of shape ``(batch, seq_len, embed_size)``.
        """
        batch, seq_len = x.size()
        # Position ids 0..seq_len-1, repeated for every sequence in the batch.
        positions = torch.arange(0, seq_len).expand(batch, seq_len).to(device)
        hidden = self.dropout(
            self.word_embedding(x) + self.position_embedding(positions)
        )
        for block in self.layers:
            hidden = block(hidden, hidden, hidden, mask)

        return hidden

In [89]:
class DecoderBlock(nn.Module):
    """Masked self-attention followed by a ``TransformerBlock`` that attends
    to externally supplied values/keys.

    Args:
        embed_size: Model dimension.
        heads: Number of attention heads.
        forward_expansion: Feed-forward expansion factor.
        dropout: Dropout probability.
    """

    def __init__(
        self, embed_size: int, heads: int, forward_expansion: int, dropout: float
    ):
        super().__init__()
        self.attention = MultiHeadAttention(embed_size, heads)
        self.transformer_block = TransformerBlock(
            embed_size, heads, forward_expansion, dropout
        )
        self.layer_norm = nn.Sequential(nn.LayerNorm(embed_size), nn.Dropout(dropout))

    def forward(self, x, value, key, src_mask, trg_mask):
        """Run one decoder layer.

        Args:
            x: Decoder states; used as value, key and query of the
                self-attention.
            value: Values for the second attention (the encoder output in
                this notebook).
            key: Keys for the second attention.
            src_mask: Mask for the second attention.
            trg_mask: Mask for the self-attention.

        Returns:
            Tensor with the same shape as ``x``.
        """
        self_attended = self.attention(x, x, x, trg_mask)
        queries = self.layer_norm(self_attended + x)
        return self.transformer_block(value, key, queries, src_mask)

In [90]:
class Decoder(nn.Module):
    """Stack of ``DecoderBlock`` layers with a projection onto the target
    vocabulary.

    Args:
        trg_vocab_size: Size of the target vocabulary.
        embed_size: Model dimension.
        num_layers: Number of stacked decoder blocks.
        heads: Attention heads per block.
        forward_expansion: Feed-forward expansion factor per block.
        dropout: Dropout probability.
        max_len: Number of positions in the position-embedding table.
    """

    def __init__(
        self,
        trg_vocab_size: int,
        embed_size: int,
        num_layers: int,
        heads: int,
        forward_expansion: int,
        dropout: float,
        max_len: int,
    ):
        super().__init__()
        self.device = device
        self.word_embedding = nn.Embedding(trg_vocab_size, embed_size)
        self.position_embedding = nn.Embedding(max_len, embed_size)

        blocks = []
        for _ in range(num_layers):
            blocks.append(DecoderBlock(embed_size, heads, forward_expansion, dropout))
        self.layers = nn.ModuleList(blocks)

        self.fc = nn.Linear(embed_size, trg_vocab_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, enc_out, src_mask, trg_mask):
        """Decode a batch of target token ids.

        Args:
            x: Target token ids of shape ``(batch, seq_len)``.
            enc_out: Encoder output, used as value and key in every block.
            src_mask: Mask applied when attending to ``enc_out``.
            trg_mask: Mask applied in the decoder self-attention.

        Returns:
            Unnormalised scores of shape ``(batch, seq_len, trg_vocab_size)``.
        """
        batch, seq_len = x.size()
        # Position ids 0..seq_len-1, repeated for every sequence in the batch.
        positions = torch.arange(0, seq_len).expand(batch, seq_len).to(device)
        hidden = self.dropout(
            self.word_embedding(x) + self.position_embedding(positions)
        )
        for block in self.layers:
            hidden = block(hidden, enc_out, enc_out, src_mask, trg_mask)
        return self.fc(hidden)

In [91]:
class Transformer(nn.Module):
    """Encoder-decoder translation model with training, evaluation, greedy
    decoding and BLEU/ROUGE scoring helpers.

    Args:
        src_vocab_size: Size of the source vocabulary.
        trg_vocab_size: Size of the target vocabulary.
        embed_size: Model dimension.
        num_layers: Number of encoder layers and of decoder layers.
        forward_expansion: Feed-forward expansion factor.
        heads: Number of attention heads.
        dropout: Dropout probability.
        max_len: Number of positions the encoder/decoder can embed.
        save_path: Where ``fit`` stores the best checkpoint and where
            ``load`` looks by default; ``None`` disables saving.
    """

    def __init__(
        self,
        src_vocab_size: int,
        trg_vocab_size: int,
        embed_size: int = 512,
        num_layers: int = 6,
        forward_expansion: int = 4,
        heads: int = 8,
        dropout: float = 0.2,
        max_len: int = 50,
        save_path=None,
    ):
        super(Transformer, self).__init__()
        self.encoder = Encoder(
            src_vocab_size,
            embed_size,
            num_layers,
            heads,
            forward_expansion,
            dropout,
            max_len,
        )
        self.decoder = Decoder(
            trg_vocab_size,
            embed_size,
            num_layers,
            heads,
            forward_expansion,
            dropout,
            max_len,
        )
        self.best_val_loss = float("inf")
        self.save_path = save_path

    def forward(self, src, trg):
        """Return vocabulary scores for every target position.

        Args:
            src: Source token ids ``(batch, src_len)``.
            trg: Decoder input token ids ``(batch, trg_len)``.

        Returns:
            Tensor ``(batch, trg_len, trg_vocab_size)``.
        """
        src_mask = make_src_mask(src)
        trg_mask = make_trg_mask(trg)
        enc_src = self.encoder(src, src_mask)
        out = self.decoder(trg, enc_src, src_mask, trg_mask)
        return out

    def fit(self, train_loader, val_loader, criterion, optimizer, num_epochs: int = 10):
        """Train the model and checkpoint the best validation epoch.

        Args:
            train_loader: Yields ``(src, decoder_input, label)`` batches.
            val_loader: Validation batches in the same format.
            criterion: Loss taking ``(scores, labels)``.
            optimizer: Optimiser over this model's parameters.
            num_epochs: Number of passes over ``train_loader``.

        Returns:
            ``(train_losses, val_losses)``: mean loss per epoch.
        """
        train_losses = []
        val_losses = []

        for epoch in range(num_epochs):
            self.train()
            train_loss = 0
            for src, trg, label in tqdm(train_loader, total=len(train_loader)):
                optimizer.zero_grad()
                output = self(src, trg)
                # Flatten batch and time so every position is one sample.
                output = output.reshape(-1, output.size(-1))
                label = label.reshape(-1)

                loss = criterion(output, label)
                loss.backward()
                optimizer.step()
                train_loss += loss.item()

            train_loss /= len(train_loader)
            train_losses.append(train_loss)
            print(f"Epoch {epoch+1}/{num_epochs}, Loss: {train_loss}")

            # ``device`` is a torch.device; comparing it with the string
            # "cuda" is never true, so check its type instead.
            if device.type == "cuda":
                torch.cuda.empty_cache()

            val_loss = self.evaluate(val_loader, criterion, True)
            val_losses.append(val_loss)
            print(f"Validation Loss: {val_loss}")

            if self.save_path and val_loss < self.best_val_loss:
                self.best_val_loss = val_loss
                torch.save(self.state_dict(), self.save_path)

        return train_losses, val_losses

    def evaluate(self, val_loader, criterion, tqdm_disabled: bool = False):
        """Return the mean per-batch loss over ``val_loader``.

        Args:
            val_loader: Yields ``(src, decoder_input, label)`` batches.
            criterion: Loss taking ``(scores, labels)``.
            tqdm_disabled: Hide the progress bar when True.
        """
        self.eval()
        val_loss = 0
        with torch.no_grad():
            for src, trg_input, trg_target in tqdm(
                val_loader, total=len(val_loader), disable=tqdm_disabled
            ):
                output = self(src, trg_input)
                # reshape (as in fit) also works for non-contiguous tensors.
                output = output.reshape(-1, output.shape[-1])
                trg_target = trg_target.reshape(-1)
                loss = criterion(output, trg_target)
                val_loss += loss.item()
        val_loss /= len(val_loader)
        return val_loss

    def load(self, path=None):
        """Load weights from ``path`` or, if not given, from ``save_path``.

        Raises:
            ValueError: If neither ``path`` nor ``self.save_path`` is set.
        """
        checkpoint = path or self.save_path
        if not checkpoint:
            raise ValueError("No model path provided")
        # map_location lets a checkpoint saved on a GPU be restored on
        # whatever device this session is using.
        self.load_state_dict(torch.load(checkpoint, map_location=device))

    def predict(
        self,
        src,
        src_preprocessed=False,
        max_len=64,
        return_sentence=False,
        fr_vocab=None,
        en_vocab=None,
        start_token_idx=1,
        end_token_idx=2,
    ):
        """Greedily decode a translation for one source sentence.

        Args:
            src: Raw sentence string, or (with ``src_preprocessed=True``) a
                ``(1, src_len)`` tensor of source token ids.
            src_preprocessed: Whether ``src`` is already a tensor of ids.
            max_len: Maximum number of generated tokens (and padded source
                length for raw input); capped at the size of the model's
                position tables.
            return_sentence: Return tokens instead of ids; needs
                ``fr_vocab``.
            fr_vocab: Target token -> id mapping.
            en_vocab: Source token -> id mapping; needed for raw input.
            start_token_idx: Id of ``<sos>``.
            end_token_idx: Id of ``<eos>``.

        Returns:
            List of generated ids (or tokens), without ``<sos>`` and without
            a trailing ``<eos>``.
        """
        self.eval()  # Set the model to evaluation mode

        # Never ask for more positions than the position embeddings hold,
        # otherwise the embedding lookup fails with an index error.
        max_len = min(
            max_len,
            self.encoder.position_embedding.num_embeddings,
            self.decoder.position_embedding.num_embeddings,
        )

        if not src_preprocessed:
            assert en_vocab is not None
            # Apply the same cleaning (lower-casing, symbol removal) that the
            # training corpus went through before looking tokens up.
            src = word_tokenizer(clean_text(src))
            src = [en_vocab.get(w, en_vocab["<unk>"]) for w in src]
            src = [start_token_idx] + src[: max_len - 2] + [end_token_idx]
            src = pad_sequence(src, max_len, before=padding_before, pad_token=0)
            src = torch.tensor([src], dtype=torch.int, device=device)

        trg = torch.tensor([[start_token_idx]], dtype=torch.int, device=device)

        src_mask = make_src_mask(src)
        with torch.no_grad():
            enc_src = self.encoder(src, src_mask)

            for _ in range(max_len):
                trg_mask = make_trg_mask(trg)
                output = self.decoder(trg, enc_src, src_mask, trg_mask)
                # Only the scores of the newest position decide the next token.
                next_token = output[:, -1].argmax(-1).unsqueeze(0)
                trg = torch.cat((trg, next_token), dim=1)

                if next_token.item() == end_token_idx:
                    break

        generated_sequence = trg.squeeze(0).tolist()[1:]
        if generated_sequence and generated_sequence[-1] == end_token_idx:
            generated_sequence = generated_sequence[:-1]

        if return_sentence:
            assert fr_vocab is not None
            fr_vocab_rev = reverse_vocab(fr_vocab)
            generated_sequence = [fr_vocab_rev[idx] for idx in generated_sequence]

        return generated_sequence

    def test(self, test_loader, en_vocab, fr_vocab):
        """Translate every test sentence and report mean BLEU/ROUGE scores.

        Args:
            test_loader: Yields ``(src, decoder_input, label)`` batches.
            en_vocab: Source token -> id mapping (kept for interface
                compatibility).
            fr_vocab: Target token -> id mapping.

        Returns:
            Tuple ``(bleu, rouge1, rouge2, rougeL)`` of mean scores (ROUGE
            values are F-measures).
        """
        self.eval()
        bleu_scores = []
        rouge_scores = {"rouge1": [], "rouge2": [], "rougeL": []}

        reverse_fr_vocab = reverse_vocab(fr_vocab)
        # NOTE(review): the stemmer and default tokenizer of rouge_score are
        # presumably English-oriented — confirm they suit French output.
        scorer = rouge_scorer.RougeScorer(
            ["rouge1", "rouge2", "rougeL"], use_stemmer=True
        )

        with torch.no_grad():
            for src, _, label in tqdm(test_loader, total=len(test_loader)):
                for i in range(src.size(0)):
                    # The decoder generates French, so its start/end ids
                    # come from the French vocabulary.
                    candidate = self.predict(
                        src[i].unsqueeze(0),
                        src_preprocessed=True,
                        max_len=max_length,
                        start_token_idx=fr_vocab["<sos>"],
                        end_token_idx=fr_vocab["<eos>"],
                    )
                    candidate = [reverse_fr_vocab[idx] for idx in candidate]

                    reference = return_words_till_EOS(
                        label[i].tolist(), eos=fr_vocab["<eos>"]
                    )
                    reference = [reverse_fr_vocab[idx] for idx in reference]

                    bleu_score = sentence_bleu(
                        [reference],
                        candidate,
                        smoothing_function=SmoothingFunction().method1,
                    )
                    bleu_scores.append(bleu_score)

                    # Convert lists to strings for ROUGE calculation
                    candidate_str = " ".join(candidate)
                    reference_str = " ".join(reference)

                    # ROUGE Score calculation
                    rouge_score = scorer.score(reference_str, candidate_str)
                    for metric in rouge_scores:
                        rouge_scores[metric].append(rouge_score[metric].fmeasure)

        if device.type == "cuda":
            torch.cuda.empty_cache()

        mean_bleu = np.mean(bleu_scores)
        mean_rouge1 = np.mean(rouge_scores["rouge1"])
        mean_rouge2 = np.mean(rouge_scores["rouge2"])
        mean_rougeL = np.mean(rouge_scores["rougeL"])

        print(f"Test BLEU Score: {mean_bleu}")
        print(f"Test ROUGE-1 Score: {mean_rouge1}")
        print(f"Test ROUGE-2 Score: {mean_rouge2}")
        print(f"Test ROUGE-L Score: {mean_rougeL}")

        return (mean_bleu, mean_rouge1, mean_rouge2, mean_rougeL)

## Main

In [92]:
# Read and clean the three parallel splits; each call returns the English
# and French sentence lists of one split.
train_en, train_fr = read_data(en_train, fr_train)
val_en, val_fr = read_data(en_val, fr_val)
test_en, test_fr = read_data(en_test, fr_test)

In [93]:
# Tokenise every cleaned sentence of every split.
train_en_words = [word_tokenizer(s) for s in train_en]
train_fr_words = [word_tokenizer(s) for s in train_fr]
val_en_words = [word_tokenizer(s) for s in val_en]
val_fr_words = [word_tokenizer(s) for s in val_fr]
test_en_words = [word_tokenizer(s) for s in test_en]
test_fr_words = [word_tokenizer(s) for s in test_fr]

# Flat token streams over train + validation + test, one per language.
# NOTE(review): validation and test tokens are included here, so they end up
# in the vocabularies derived from these lists — confirm this is intended.
all_en_words = flatten_concatenation(train_en_words + val_en_words + test_en_words)
all_fr_words = flatten_concatenation(train_fr_words + val_fr_words + test_fr_words)

In [94]:
# Token frequencies per language; the totals must match the flat token lists.
en_word_counts = Counter(all_en_words)
assert en_word_counts.total() == len(all_en_words)
fr_word_counts = Counter(all_fr_words)
assert fr_word_counts.total() == len(all_fr_words)

# Ids 0-3 are reserved for the special tokens; every observed word then gets
# the next free id, in the order the counter first saw it.
special_tokens = ('<pad>', '<sos>', '<eos>', '<unk>')
en_vocab = {token: index for index, token in enumerate(special_tokens)}
fr_vocab = {token: index for index, token in enumerate(special_tokens)}

for word in en_word_counts:
    en_vocab[word] = len(en_vocab)

for word in fr_word_counts:
    fr_vocab[word] = len(fr_vocab)

In [95]:
# Train/validation tensors are only materialised when training is enabled;
# the test set is always needed for evaluation.
if train:
    train_dataset = MyDataset(
        train_en_words,
        train_fr_words,
        en_vocab,
        fr_vocab,
        padding_before,
    )
    val_dataset = MyDataset(
        val_en_words,
        val_fr_words,
        en_vocab,
        fr_vocab,
        padding_before,
    )
test_dataset = MyDataset(
    test_en_words,
    test_fr_words,
    en_vocab,
    fr_vocab,
    padding_before,
)

Creating dataset: 100%|██████████| 30000/30000 [00:04<00:00, 6680.88it/s]


Dataset created with 30000 samples


Creating dataset: 890it [00:00, 6989.21it/s]                         


Dataset created with 887 samples


Creating dataset: 1310it [00:00, 6880.07it/s]                         

Dataset created with 1305 samples





In [96]:
# Only the training batches are shuffled; validation keeps its order.
if train:
    train_loader = DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=True,
    )
    val_loader = DataLoader(
        val_dataset,
        batch_size=batch_size,
        shuffle=False,
    )
    print("Length of train_loader:", len(train_loader))
    print("Length of val_loader:", len(val_loader))

# The test loader yields one sentence pair per batch.
test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False)
print("Length of test_loader:", len(test_loader))

Length of train_loader: 938
Length of val_loader: 28
Length of test_loader: 1305


In [97]:
# Three two-point sweeps, each varying a single hyperparameter. For every
# sweep the lists below collect, per configuration, the training curves,
# validation curves, BLEU/ROUGE scores and test loss.

# Sweep 1: dropout (0.2 vs 0.4).
hyper_params1 = [
    {"embedding_dim": 300, "dropout": 0.2, "layers": 6},
    {"embedding_dim": 300, "dropout": 0.4, "layers": 6},
]
train_losses1, val_losses1 = [], []
scores1 = []
losses1 = []

# Sweep 2: embedding dimension (300 vs 600).
hyper_params2 = [
    {"embedding_dim": 300, "dropout": 0.2, "layers": 6},
    {"embedding_dim": 600, "dropout": 0.2, "layers": 6},
]
train_losses2, val_losses2 = [], []
scores2 = []
losses2 = []

# Sweep 3: number of layers (4 vs 6).
hyper_params3 = [
    {"embedding_dim": 300, "dropout": 0.2, "layers": 4},
    {"embedding_dim": 300, "dropout": 0.2, "layers": 6},
]
train_losses3, val_losses3 = [], []
scores3 = []
losses3 = []

In [98]:
def run_full_model(hyper_params):
    """Build, (optionally) train and evaluate one model configuration.

    When the module-level ``train`` flag is set, the model is trained on
    ``train_loader``/``val_loader`` and ``fit`` stores the best checkpoint
    under a path derived from the hyperparameters. Before evaluation that
    checkpoint is loaded if it exists; otherwise the current weights are
    evaluated and a message is printed.

    Args:
        hyper_params: Dict with the keys ``"embedding_dim"``, ``"dropout"``
            and ``"layers"``.

    Returns:
        Tuple ``(train_losses, val_losses, test_loss, scores)`` where the
        first two are per-epoch lists (zeros when no training was run),
        ``test_loss`` is the mean test loss and ``scores`` is the
        ``(bleu, rouge1, rouge2, rougeL)`` tuple from ``Transformer.test``.
    """
    print("Running model with hyperparameters:", hyper_params)
    embedding_dim = hyper_params["embedding_dim"]
    dropout = hyper_params["dropout"]
    layers = hyper_params["layers"]
    save_path = f"./models/transformer_layers{layers}_embedding{embedding_dim}_dropout{dropout}.pth"

    model = Transformer(
        len(en_vocab),
        len(fr_vocab),
        embed_size=embedding_dim,
        num_layers=layers,
        heads=heads,
        forward_expansion=4,
        dropout=dropout,
        max_len=max_length,
        save_path=save_path,
    ).to(device)

    # Index 0 is <pad>; padded label positions do not contribute to the loss.
    criterion = nn.CrossEntropyLoss(ignore_index=0)
    optimizer = optim.Adam(model.parameters(), lr=lr)

    if train:
        train_losses, val_losses = model.fit(
            train_loader, val_loader, criterion, optimizer, epochs
        )
    else:
        # Placeholder curves; two separate lists so that changing one does
        # not change the other.
        train_losses, val_losses = [0] * epochs, [0] * epochs

    # Evaluate the best checkpoint rather than the last-epoch (or randomly
    # initialised) weights whenever one is available.
    if os.path.exists(save_path):
        model.load(save_path)
    else:
        print(f"No checkpoint found at {save_path}; evaluating the current weights")

    test_loss = model.evaluate(test_loader, criterion)
    scores = model.test(test_loader, en_vocab, fr_vocab)

    return train_losses, val_losses, test_loss, scores

In [99]:
def plot_scores(blue_scores, rouge1_scores, rouge2_scores, rougeL_scores, labels):
    """Draw a grouped bar chart of BLEU and ROUGE scores.

    Each configuration gets one group of four bars (BLEU, ROUGE-1, ROUGE-2,
    ROUGE-L) placed side by side.

    Args:
        blue_scores: BLEU score per configuration.
        rouge1_scores: ROUGE-1 score per configuration.
        rouge2_scores: ROUGE-2 score per configuration.
        rougeL_scores: ROUGE-L score per configuration.
        labels: Tick label per configuration.
    """
    indices = np.arange(len(blue_scores))

    bar_width = 0.2
    spacing = 0.05  # gap between neighbouring bars of a group
    step = bar_width + spacing

    # (legend name, values, colour) in left-to-right drawing order.
    series = [
        ('BLEU', blue_scores, 'b'),
        ('ROUGE-1', rouge1_scores, 'g'),
        ('ROUGE-2', rouge2_scores, 'r'),
        ('ROUGE-L', rougeL_scores, 'orange'),
    ]

    plt.figure(figsize=(10, 6))

    for slot, (name, values, colour) in enumerate(series):
        plt.bar(indices + slot * step, values, width=bar_width, label=name, color=colour)

    plt.xlabel('Index')
    plt.ylabel('Scores')
    plt.title('BLEU and ROUGE Scores')

    # Centre each tick between the first and the last bar of its group.
    last_offset = step * 3
    plt.xticks(indices + last_offset / 2, labels)

    plt.legend()
    plt.grid(True)

    plt.show()


def plot(train_losses, val_losses, test_loss, scores, labels=[]):
    """Plot loss curves, test losses and BLEU/ROUGE scores for a sweep.

    Produces three figures: train/validation loss per epoch for every
    configuration, a bar chart of the test losses, and the grouped score
    chart drawn by ``plot_scores``.

    Args:
        train_losses: One list of per-epoch training losses per
            configuration.
        val_losses: One list of per-epoch validation losses per
            configuration.
        test_loss: One test loss per configuration.
        scores: One ``(bleu, rouge1, rouge2, rougeL)`` tuple per
            configuration.
        labels: One display label per configuration.

    Raises:
        AssertionError: If the five arguments differ in length.
    """
    assert (
        len(train_losses)
        == len(val_losses)
        == len(scores)
        == len(test_loss)
        == len(labels)
    )

    # Figure 1: loss curves (solid = train, dashed = validation).
    plt.figure(figsize=(10, 5))

    for idx in range(len(train_losses)):
        plt.plot(train_losses[idx], label=f'Train Loss {labels[idx]}')

    for idx in range(len(val_losses)):
        plt.plot(val_losses[idx], label=f'Val Loss {labels[idx]}', linestyle='--')

    plt.xlabel("Epochs")
    plt.ylabel("Loss")
    plt.title("Train and Validation Loss")
    plt.legend()
    plt.show()

    # Figure 2: one bar per configuration with its test loss.
    plt.figure(figsize=(7, 5))
    plt.bar(labels, test_loss, color='orange')
    plt.xlabel("Hyperparameter")
    plt.ylabel('Loss')
    plt.title('Test Losses')
    plt.show()

    # Figure 3: split the score tuples into one list per metric.
    per_metric = [[entry[k] for entry in scores] for k in range(4)]
    plot_scores(per_metric[0], per_metric[1], per_metric[2], per_metric[3], labels)
    

In [None]:
# Sweep 1 (dropout): run every configuration and collect its loss curves,
# scores and test loss.
for hyper_params in hyper_params1:
    train_losses, val_losses, test_loss, scores = run_full_model(hyper_params)
    train_losses1.append(train_losses)
    val_losses1.append(val_losses)
    scores1.append(scores)
    losses1.append(test_loss)

In [None]:
# Labels read "<embedding_dim>-<dropout>-<layers>".
plot(train_losses1, val_losses1, losses1, scores1, ["300-0.2-6", "300-0.4-6"])

In [None]:
# Sweep 2 (embedding dimension): run every configuration and collect its
# loss curves, scores and test loss.
for hyper_params in hyper_params2:
    train_losses, val_losses, test_loss, scores = run_full_model(hyper_params)
    train_losses2.append(train_losses)
    val_losses2.append(val_losses)
    scores2.append(scores)
    losses2.append(test_loss)

In [None]:
# Labels read "<embedding_dim>-<dropout>-<layers>".
plot(train_losses2, val_losses2, losses2, scores2, ["300-0.2-6", "600-0.2-6"])

In [None]:
# Sweep 3 (number of layers): run every configuration and collect its loss
# curves, scores and test loss.
for hyper_params in hyper_params3:
    train_losses, val_losses, test_loss, scores = run_full_model(hyper_params)
    train_losses3.append(train_losses)
    val_losses3.append(val_losses)
    scores3.append(scores)
    losses3.append(test_loss)

In [None]:
# Labels read "<embedding_dim>-<dropout>-<layers>".
plot(train_losses3, val_losses3, losses3, scores3, ["300-0.2-4", "300-0.2-6"])