In [1]:
import os
import wandb
import torch
import torch.nn as nn
import random
from torch.autograd import Variable
from torch.utils.data import DataLoader
import pandas as pd
import torch.optim as optim
import torch.nn.functional as Function
import argparse


In [2]:
# Reserved vocabulary indices: word start, word end, unknown character, padding.
SYMBOL_BEGIN = 0
SYMBOL_END = 1
SYMBOL_UNKNOWN = 2
SYMBOL_PADDING = 3

# CSV layout of the dataset files.
INPUT_LABEL, TARGET_LABEL = "input", "target"
DELIMETER = ","

# Supported recurrent cell types.
RNN_KEY, GRU_KEY, LSTM_KEY = "RNN", "GRU", "LSTM"

# Dictionary keys for prepared-data bundles.
INPUT_LANG_KEY, OUTPUT_LANG_KEY = "input_lang", "output_lang"
PAIRS_KEY, MAX_LEN_KEY = "pairs", "max_len"

# Source / target language codes.
input_lang = "eng"
TARGET_LANG = "hin"

# Dataset split names as they appear in the file names.
TRAIN_LABEL, TEST_LABEL, VALID_LABEL = "train", "test", "valid"

# Dataset locations: <DEFAULT_PATH>/<lang>/<lang>_<split>.csv
DEFAULT_PATH = "/kaggle/input/aksharantar-sampled/aksharantar_sampled"
TRAIN_DATASET_PATH, VALIDATION_DATASET_PATH, TEST_DATASET_PATH = (
    f"{DEFAULT_PATH}/{TARGET_LANG}/{TARGET_LANG}_{split}.csv"
    for split in (TRAIN_LABEL, VALID_LABEL, TEST_LABEL)
)

NADAM_KEY = "Nadam"

# Hyperparameter names shared by the sweep configuration and best_params.
EMBEDDING_SIZE_KEY = "embedding_size"
EPOCHS_KEY = "epochs"
ENCODER_LAYER_KEY = "encoder_layers"
DECODER_LAYER_KEY = "decoder_layers"
HIDDEN_LAYER_KEY = "hidden_layer"
IS_BIDIRECTIONAL_KEY = "bidirectional"
DROPOUT_KEY = "dropout"
CELL_TYPE_KEY = "cell_type"
LEARNING_RATE_KEY = "learning_rate"
BATCH_SIZE_KEY = "batch_size"

# Weights & Biases project coordinates.
WANDB_PROJECT_NAME = "dl-assignment-3"
WANDB_ENTITY_NAME = "cs23m007"

# Metric names logged to Weights & Biases.
TRAIN_ACCURACY_TITLE, VALIDATION_ACCURACY_TITLE, TEST_ACCURACY_TITLE = "train_acc", "val_acc", "test_acc"
TRAIN_LOSS_TITLE, VALIDATION_LOSS_TITLE, TEST_LOSS_TITLE = "train_loss", "val_loss", "test_loss"

# Default hyperparameters used when train() is called without an override.
best_params = {
    EMBEDDING_SIZE_KEY: 256,
    EPOCHS_KEY: 5,
    ENCODER_LAYER_KEY: 2,
    DECODER_LAYER_KEY: 2,
    HIDDEN_LAYER_KEY: 256,
    IS_BIDIRECTIONAL_KEY: False,
    DROPOUT_KEY: 0.2,
    CELL_TYPE_KEY: LSTM_KEY,
    BATCH_SIZE_KEY: 32,
    LEARNING_RATE_KEY: 0.001,
}

# Run on CUDA when a GPU is visible to PyTorch, otherwise on the CPU.
is_gpu = torch.cuda.is_available()
device = torch.device("cuda" if is_gpu else "cpu")


In [None]:
# Random-search sweep over the vanilla (no attention) seq2seq model; W&B picks
# one value per hyperparameter per run and tries to maximize validation accuracy.
sweep_config = {
    "name": "CS6910_Assignemnt3_Without Attention",
    "method": "random",
    "metric": {"name": VALIDATION_ACCURACY_TITLE, "goal": "maximize"},
    "parameters": {
        hyperparameter: {"values": choices}
        for hyperparameter, choices in (
            (EMBEDDING_SIZE_KEY, [16, 32, 64, 256]),
            (EPOCHS_KEY, [5, 10]),
            (ENCODER_LAYER_KEY, [1, 2, 3]),
            (DECODER_LAYER_KEY, [1, 2, 3]),
            (HIDDEN_LAYER_KEY, [16, 32, 64, 256]),
            (IS_BIDIRECTIONAL_KEY, [True, False]),
            (DROPOUT_KEY, [0, 0.2, 0.3]),
            (CELL_TYPE_KEY, [RNN_KEY, GRU_KEY, LSTM_KEY]),
            (LEARNING_RATE_KEY, [0.001, 0.01]),
            (BATCH_SIZE_KEY, [32, 64, 128]),
        )
    },
}

# Utility Functions and classes

In [3]:
class Vocabulary:
    """
    Character-level vocabulary that assigns an integer index to every distinct
    character it is shown.

    Indices 0-3 are reserved for the special symbols and are pre-registered in
    `str_encodding` as "<" (begin), ">" (end), "?" (unknown) and "." (padding);
    real characters therefore start at index 4.

    Attributes:
    - str_count: A dictionary to store the count of each character encountered.
    - int_encodding: A dictionary to map characters to integer encodings.
    - n_chars: The next free index, i.e. 4 plus the number of distinct
      characters seen so far (the size needed for an embedding/output layer).
    - str_encodding: A dictionary to map integer encodings back to characters.
    """

    def __init__(self):
        self.str_count, self.int_encodding = dict(), dict()
        self.n_chars = 4
        self.str_encodding = {0: "<", 1: ">", 2: "?", 3: "."}

    def addWord(self, word):
        """
        Add a word to the vocabulary.

        Every character of `word` has its count incremented; characters seen for
        the first time are also given the next free integer index.

        Parameters:
        - word: A string representing the word to be added to the vocabulary.
        """

        for char in word:
            # Explicit membership test instead of a bare `except:` so that
            # unrelated errors are never swallowed or mistaken for "new char".
            if char in self.str_count:
                self.str_count[char] += 1
                continue

            self.int_encodding[char] = self.n_chars
            self.str_encodding[self.n_chars] = char
            self.str_count[char] = 1
            self.n_chars += 1

def prepareData(dir):
    """
    Prepare data for training a sequence-to-sequence model.

    Parameters:
    - dir: A string representing the path of the data file.
           The data file is expected to be a header-less CSV with two columns:
           one for input sequences and another for target sequences.

    Returns:
    - input_lang: An instance of the Vocabulary class containing the vocabulary
                  for the input sequences.
    - output_lang: An instance of the Vocabulary class containing the vocabulary
                   for the target sequences.
    - pairs: A list of [input, target] string pairs extracted from the data.
    - max_len: An integer representing the maximum sequence length among input and
               target sequences in the dataset.

    Raises:
    - ValueError: If the file contains no rows.
    """

    # Read every cell as a plain string. With pandas' default NA handling,
    # words such as "nan", "null" or "NA" would be parsed as float NaN and
    # break the len() / character iteration below.
    data = pd.read_csv(
        dir,
        sep=DELIMETER,
        names=[INPUT_LABEL, TARGET_LABEL],
        dtype=str,
        keep_default_na=False,
    )
    if data.empty:
        raise ValueError(f"No rows found in dataset file: {dir}")

    max_input_length = data[INPUT_LABEL].apply(len).max()
    max_target_length = data[TARGET_LABEL].apply(len).max()

    # Convert from a numpy integer to a plain int for downstream arithmetic.
    max_len = int(max(max_input_length, max_target_length))

    input_lang, output_lang = Vocabulary(), Vocabulary()

    pairs = data[[INPUT_LABEL, TARGET_LABEL]].values.tolist()

    for source_word, target_word in pairs:
        input_lang.addWord(source_word)
        output_lang.addWord(target_word)

    return input_lang, output_lang, pairs, max_len


def helpTensor(lang, word, max_length):
    """
    Convert a word into a PyTorch tensor of character indexes according to a provided vocabulary,
    terminate it with SYMBOL_END and pad it to a specified maximum length.

    Parameters:
    - lang (Vocabulary): Vocabulary whose `int_encodding` dictionary maps characters to indexes.
    - word (str): The input word to be converted into a tensor.
    - max_length (int): The length of the tensor after padding.

    Returns:
    - result (torch.Tensor): A LongTensor holding the character indexes of the word followed by
      SYMBOL_END and then SYMBOL_PADDING up to max_length. Characters missing from the
      vocabulary are encoded as SYMBOL_UNKNOWN. The tensor is moved to the GPU when one is in use.

    Note:
    - If len(word) + 1 exceeds max_length no padding is added and the result is longer than
      max_length; callers must pass a max_length large enough for their data.
    """

    # The character -> index table lives in `int_encodding`. The previous code
    # looked up a non-existent `char2index` attribute inside a bare `except:`,
    # which silently turned EVERY character into SYMBOL_UNKNOWN.
    char_to_index = lang.int_encodding

    indexes = [char_to_index.get(char, SYMBOL_UNKNOWN) for char in word]
    indexes.append(SYMBOL_END)

    # A non-positive repeat count yields an empty list, so over-long words are left unpadded.
    indexes.extend([SYMBOL_PADDING] * (max_length - len(indexes)))

    result = torch.LongTensor(indexes)
    if is_gpu:
        return result.cuda()
    return result

def makeTensor(input_lang, output_lang, pairs, reach):
    """
    Encode every (input, target) word pair as a pair of index tensors.

    Parameters:
    - input_lang: Vocabulary used to encode the first element of each pair.
    - output_lang: Vocabulary used to encode the second element of each pair.
    - pairs: Sequence of pairs; element 0 is the input word, element 1 the target word.
    - reach (int): Length passed to helpTensor as its max_length for both tensors.

    Returns:
    - A list of (input_tensor, target_tensor) tuples, in the same order as `pairs`.
    """
    encoded = []
    for pair in pairs:
        source_tensor = helpTensor(input_lang, pair[0], reach)
        target_tensor = helpTensor(output_lang, pair[1], reach)
        encoded.append((source_tensor, target_tensor))
    return encoded


def accuracy(encoder, decoder, loader, batch_size, criterion, cell_type, num_layers_enc, max_length, output_lang):
    """
    Calculate the word-level accuracy of a sequence-to-sequence model on a given dataset.

    A word counts as correct when the greedily decoded characters, after dropping
    the begin/end/padding symbols, are identical to the target characters.

    Args:
    - encoder (torch.nn.Module): The encoder module of the sequence-to-sequence model.
    - decoder (torch.nn.Module): The decoder module of the sequence-to-sequence model.
    - loader (torch.utils.data.DataLoader): DataLoader yielding (input, target) batches
      of shape (batch, seq_len).
    - batch_size (int): Configured batch size. Kept for interface compatibility; the
      size of each batch is read from the batch itself so a smaller final batch works.
    - criterion: Unused; kept for interface compatibility.
    - cell_type (str): Type of RNN cell used in the model (e.g., LSTM_KEY).
    - num_layers_enc (int): Number of layers in the encoder.
    - max_length (int): Unused; kept for interface compatibility.
    - output_lang: Vocabulary of the output language (its `str_encodding` is used
      to turn indexes back into characters).

    Returns:
    - accuracy (float): The accuracy of the model on the dataset, as a percentage
      (0.0 when the loader yields no examples).
    """

    ignore = (SYMBOL_BEGIN, SYMBOL_END, SYMBOL_PADDING)
    index_to_char = output_lang.str_encodding

    # Evaluate with dropout disabled, then put the modules back in whatever
    # mode the caller had them in.
    encoder_was_training, decoder_was_training = encoder.training, decoder.training
    encoder.eval()
    decoder.eval()

    total = correct = 0
    try:
        with torch.no_grad():
            for batch_x, batch_y in loader:
                current_batch = batch_x.size(0)

                # Time-major layout: (seq_len, batch)
                input_variable = batch_x.transpose(0, 1)
                target_length = batch_y.size(1)

                # Initialize encoder hidden state (plus cell state for LSTM)
                encoder_hidden = encoder.initHidden(current_batch, num_layers_enc)
                if cell_type == LSTM_KEY:
                    encoder_cell_state = encoder.initHidden(current_batch, num_layers_enc)
                    encoder_hidden = (encoder_hidden, encoder_cell_state)

                # Encoder forward pass, one time step at a time
                for ei in range(input_variable.size(0)):
                    encoder_hidden = encoder(input_variable[ei], current_batch, encoder_hidden)[1]

                decoder_input = torch.LongTensor([SYMBOL_BEGIN] * current_batch)
                if is_gpu:
                    decoder_input = decoder_input.cuda()

                # The decoder starts from the encoder's final state
                decoder_hidden = encoder_hidden

                # Greedy decoding: feed the most likely character back in
                step_predictions = []
                for _ in range(target_length):
                    decoder_output, decoder_hidden = decoder(decoder_input, current_batch, decoder_hidden)
                    decoder_input = decoder_output.topk(1)[1].squeeze(1)
                    step_predictions.append(decoder_input)

                if step_predictions:
                    predicted_rows = torch.stack(step_predictions, dim=1).tolist()
                else:
                    predicted_rows = [[] for _ in range(current_batch)]

                # Compare decoded words with the targets
                for predicted, expected in zip(predicted_rows, batch_y.tolist()):
                    sent = [index_to_char[index] for index in predicted if index not in ignore]
                    y = [index_to_char[index] for index in expected if index not in ignore]
                    if sent == y:
                        correct += 1
                    total += 1
    finally:
        encoder.train(encoder_was_training)
        decoder.train(decoder_was_training)

    if total == 0:
        return 0.0
    return (correct / total) * 100


def calc_loss(encoder, decoder, input_tensor, target_tensor, batch_size, encoder_optimizer, decoder_optimizer, criterion, cell_type, num_layers_enc, max_length, is_training, teacher_forcing_ratio=0.5):
    """
    Calculate the loss of a sequence-to-sequence model for a single batch and,
    when training, take one optimizer step.

    Args:
    - encoder (torch.nn.Module): The encoder module of the sequence-to-sequence model.
    - decoder (torch.nn.Module): The decoder module of the sequence-to-sequence model.
    - input_tensor (torch.Tensor): Source indexes, time-major: (input_len, batch).
    - target_tensor (torch.Tensor): Target indexes, time-major: (target_len, batch).
    - batch_size (int): Configured batch size. Kept for interface compatibility; the
      size of the batch is read from input_tensor so a smaller final batch works.
    - encoder_optimizer (torch.optim.Optimizer): Optimizer for updating encoder parameters.
    - decoder_optimizer (torch.optim.Optimizer): Optimizer for updating decoder parameters.
    - criterion: The loss criterion applied to each decoder step.
    - cell_type (str): Type of RNN cell used in the model (e.g., LSTM_KEY).
    - num_layers_enc (int): Number of layers in the encoder.
    - max_length (int): Unused; kept for interface compatibility.
    - is_training (bool): True to backpropagate and update parameters, False to only
      evaluate (no gradients, dropout disabled, no teacher forcing).
    - teacher_forcing_ratio (float, optional): The probability of using teacher forcing
      for the whole batch during training. Default is 0.5.

    Returns:
    - loss (float): The summed per-step loss divided by the target length
      (0.0 for a zero-length target).
    """

    current_batch = input_tensor.size(1)
    target_length = target_tensor.size(0)
    if target_length == 0:
        # Nothing to decode; avoids calling .item() on the integer 0.
        return 0.0

    if is_training:
        encoder_optimizer.zero_grad()
        decoder_optimizer.zero_grad()
        # One coin flip per batch decides whether the ground truth is fed back in.
        use_teacher_forcing = random.random() < teacher_forcing_ratio
    else:
        use_teacher_forcing = False
        # Validation must not be perturbed by dropout.
        encoder_was_training, decoder_was_training = encoder.training, decoder.training
        encoder.eval()
        decoder.eval()

    try:
        # Gradients are only tracked when we are going to backpropagate.
        with torch.set_grad_enabled(is_training):
            # Initialize the encoder hidden state (plus cell state for LSTM)
            output_hidden = encoder.initHidden(current_batch, num_layers_enc)
            if cell_type == LSTM_KEY:
                encoder_cell_state = encoder.initHidden(current_batch, num_layers_enc)
                output_hidden = (output_hidden, encoder_cell_state)

            # Encoder forward pass, one time step at a time
            for ei in range(input_tensor.size(0)):
                output_hidden = encoder(input_tensor[ei], current_batch, output_hidden)[1]

            # The decoder starts from the begin symbol and the encoder's final state
            decoder_input = torch.LongTensor([SYMBOL_BEGIN] * current_batch)
            decoder_input = decoder_input.cuda() if is_gpu else decoder_input

            loss = 0
            for di in range(target_length):
                decoder_output, output_hidden = decoder(decoder_input, current_batch, output_hidden)
                loss = loss + criterion(decoder_output, target_tensor[di])
                if use_teacher_forcing:
                    decoder_input = target_tensor[di]
                else:
                    decoder_input = decoder_output.argmax(dim=1)
    finally:
        if not is_training:
            encoder.train(encoder_was_training)
            decoder.train(decoder_was_training)

    # Backpropagation and optimization in training phase
    if is_training:
        loss.backward()
        encoder_optimizer.step()
        decoder_optimizer.step()

    # Return the average loss per target length
    return loss.item() / target_length


# Train and evaluate the Seq2SeqWithoutAttn model
def seq2seq(encoder, decoder, train_loader, val_loader, test_loader, lr, optimizer, epochs, max_length_word, num_layers_enc, output_lang,batch_size,cell_type,is_wandb):
    """
    Train a vanilla (no attention) sequence-to-sequence model and report its
    loss and accuracy after every epoch.

    Args:
    - encoder (torch.nn.Module): The encoder module of the sequence-to-sequence model.
    - decoder (torch.nn.Module): The decoder module of the sequence-to-sequence model.
    - train_loader (DataLoader): Batches of (input, target) tensors used for training.
    - val_loader (DataLoader): Batches used for validation loss/accuracy.
    - test_loader (DataLoader): Batches used for test accuracy.
    - lr (float): Learning rate for both optimizers.
    - optimizer (str): Optimizer name; "nadam" in any letter case selects NAdam,
      anything else selects Adam.
    - epochs (int): Number of passes over the training data.
    - max_length_word (int): Padded sequence length; max_length_word - 1 is forwarded
      to calc_loss/accuracy as their max_length.
    - num_layers_enc (int): Number of layers in the encoder.
    - output_lang: Vocabulary of the output language, forwarded to accuracy.
    - batch_size (int): Configured batch size, forwarded to calc_loss/accuracy.
    - cell_type (str): Type of RNN cell used in the model (e.g., LSTM_KEY).
    - is_wandb (bool): When True, the per-epoch metrics are logged with wandb.log.

    Returns:
    - None. Metrics are printed and optionally logged; accuracies are reported
      as fractions in [0, 1].
    """

    max_length = max_length_word - 1

    # Case-insensitive match: the caller passes NADAM_KEY ("Nadam"), which the
    # previous exact comparison with "nadam" never matched, so Adam was always used.
    optimizer_cls = optim.NAdam if str(optimizer).lower() == "nadam" else optim.Adam
    encoder_optimizer = optimizer_cls(encoder.parameters(), lr=lr)
    decoder_optimizer = optimizer_cls(decoder.parameters(), lr=lr)
    criterion = nn.NLLLoss()

    for epoch in range(epochs):
        train_loss_total = 0
        val_loss_total = 0

        # Training phase (calc_loss expects time-major tensors, hence the transpose)
        for batch_x, batch_y in train_loader:
            train_loss_total += calc_loss(
                encoder, decoder, batch_x.transpose(0, 1), batch_y.transpose(0, 1), batch_size,
                encoder_optimizer, decoder_optimizer, criterion, cell_type, num_layers_enc,
                max_length, is_training=True,
            )

        train_loss_avg = train_loss_total / len(train_loader)
        print(f"Epoch: {epoch} | Train Loss: {train_loss_avg:.4f} |", end="")

        # Validation phase
        for batch_x, batch_y in val_loader:
            val_loss_total += calc_loss(
                encoder, decoder, batch_x.transpose(0, 1), batch_y.transpose(0, 1), batch_size,
                encoder_optimizer, decoder_optimizer, criterion, cell_type, num_layers_enc,
                max_length, is_training=False,
            )

        val_loss_avg = val_loss_total / len(val_loader)
        print(f"Val Loss: {val_loss_avg:.4f} |", end="")

        # accuracy() returns a percentage; convert to a fraction for printing/logging
        train_acc = accuracy(encoder, decoder, train_loader, batch_size, criterion, cell_type, num_layers_enc, max_length, output_lang)
        train_acc /= 100
        print(f"train Accuracy: {train_acc:.4%} |", end="")

        val_acc = accuracy(encoder, decoder, val_loader, batch_size, criterion, cell_type, num_layers_enc, max_length, output_lang)
        val_acc /= 100
        print(f"Val Accuracy: {val_acc:.4%} |", end="")

        test_acc = accuracy(encoder, decoder, test_loader, batch_size, criterion, cell_type, num_layers_enc, max_length, output_lang)
        test_acc /= 100
        print(f"Test Accuracy: {test_acc:.4%}")

        if is_wandb:
            wandb.log(
                {
                    TRAIN_ACCURACY_TITLE: train_acc,
                    VALIDATION_ACCURACY_TITLE: val_acc,
                    TEST_ACCURACY_TITLE: test_acc,
                    TRAIN_LOSS_TITLE: train_loss_avg,
                    VALIDATION_LOSS_TITLE: val_loss_avg,
                }
            )

            


# Encoder Class

In [6]:
# EncoderRNNWithoutAttn
class EncoderRNN(nn.Module):
    """
    Recurrent encoder of the sequence-to-sequence model.

    Each call to forward() consumes ONE time step for the whole batch: the
    character indexes are embedded, passed through dropout and fed to the
    recurrent layer together with the running hidden state.

    Args:
    - input_size (int): Size of the input vocabulary.
    - embedding_size (int): Size of the embedding layer.
    - hidden_size (int): Size of the hidden state of the RNN.
    - num_layers_encoder (int): Number of stacked recurrent layers.
    - cell_type (str): One of RNN_KEY, GRU_KEY or LSTM_KEY.
    - drop_out (float): Dropout probability, used both on the embeddings and
      between the stacked recurrent layers.
    - bi_directional (bool): Whether the recurrent layer is bidirectional.

    Attributes:
    - emb_n, hid_n, encoder_n, model_key, is_dropout, is_bi_dir: the constructor
      arguments embedding_size, hidden_size, num_layers_encoder, cell_type,
      drop_out and bi_directional, stored unchanged.
    - embedding (nn.Embedding): Embedding layer.
    - dropout (nn.Dropout): Dropout layer.
    - cell_layer (nn.Module): The nn.RNN / nn.GRU / nn.LSTM instance.
    """

    def __init__(self, input_size, embedding_size, hidden_size, num_layers_encoder, cell_type, drop_out, bi_directional):
        super().__init__()

        self.emb_n, self.hid_n = embedding_size, hidden_size
        self.encoder_n, self.model_key = num_layers_encoder, cell_type
        self.is_dropout, self.is_bi_dir = drop_out, bi_directional

        self.embedding = nn.Embedding(input_size, embedding_size)
        self.dropout = nn.Dropout(drop_out)

        # Pick the recurrent layer class by name; an unknown name raises KeyError.
        recurrent_cls = {RNN_KEY: nn.RNN, GRU_KEY: nn.GRU, LSTM_KEY: nn.LSTM}[cell_type]
        self.cell_layer = recurrent_cls(
            input_size=embedding_size,
            hidden_size=hidden_size,
            num_layers=num_layers_encoder,
            dropout=drop_out,
            bidirectional=bi_directional,
        )

    def forward(self, input, batch_size, hidden):
        """
        Run the encoder for a single time step.

        Args:
        - input (torch.Tensor): Character indexes for the current step; the
          embedding is reshaped to (1, batch_size, -1).
        - batch_size (int): Batch size.
        - hidden: Current recurrent state (whatever the recurrent layer accepts).

        Returns:
        - output (torch.Tensor): Output of the recurrent layer for this step.
        - hidden: Updated recurrent state.
        """
        step_embedding = self.embedding(input).view(1, batch_size, -1)
        return self.cell_layer(self.dropout(step_embedding), hidden)

    def initHidden(self, batch_size, num_layers_enc):
        """
        Create an all-zero initial state tensor.

        Args:
        - batch_size (int): Batch size.
        - num_layers_enc (int): Number of layers in the encoder.

        Returns:
        - torch.Tensor: Zeros of shape (num_layers_enc * directions, batch_size,
          hidden size), where directions is 2 for a bidirectional encoder and 1
          otherwise; moved to the GPU when one is in use.
        """
        directions = 2 if self.is_bi_dir else 1
        initial_state = torch.zeros(num_layers_enc * directions, batch_size, self.hid_n)
        return initial_state.cuda() if is_gpu else initial_state
    

# Decoder Class

In [None]:
class DecoderRNN(nn.Module):
    """
    Recurrent decoder of the vanilla (no attention) sequence-to-sequence model.

    Each call to forward() produces ONE time step for the whole batch: the
    previous characters are embedded, passed through dropout and ReLU, fed to
    the recurrent layer and projected to log-probabilities over the output
    vocabulary.

    Args:
    - embedding_size (int): Size of the embedding layer.
    - hidden_size (int): Size of the hidden state of the RNN.
    - num_layers_decoder (int): Number of stacked recurrent layers.
    - cell_type (str): One of RNN_KEY, GRU_KEY or LSTM_KEY.
    - drop_out (float): Dropout probability, used both on the embeddings and
      between the stacked recurrent layers.
    - bi_directional (bool): Whether the recurrent layer is bidirectional.
    - output_size (int): Size of the output vocabulary.

    Attributes:
    - emb_n, hid_n, decoder_n, model_key, is_dropout, is_bi_dir: the constructor
      arguments embedding_size, hidden_size, num_layers_decoder, cell_type,
      drop_out and bi_directional, stored unchanged.
    - embedding (nn.Embedding): Embedding layer.
    - dropout (nn.Dropout): Dropout layer.
    - cell_layer (nn.Module): The nn.RNN / nn.GRU / nn.LSTM instance.
    - out (nn.Linear): Projection from the recurrent output to the vocabulary.
    - softmax (nn.LogSoftmax): Log-softmax over the vocabulary dimension.
    """

    def __init__(self, embedding_size, hidden_size, num_layers_decoder, cell_type, drop_out, bi_directional, output_size):
        super().__init__()

        self.emb_n, self.hid_n = embedding_size, hidden_size
        self.decoder_n, self.model_key = num_layers_decoder, cell_type
        self.is_dropout, self.is_bi_dir = drop_out, bi_directional

        self.embedding = nn.Embedding(output_size, embedding_size)
        self.dropout = nn.Dropout(drop_out)

        # Pick the recurrent layer class by name; an unknown name raises KeyError.
        recurrent_cls = {RNN_KEY: nn.RNN, GRU_KEY: nn.GRU, LSTM_KEY: nn.LSTM}[cell_type]
        self.cell_layer = recurrent_cls(
            input_size=embedding_size,
            hidden_size=hidden_size,
            num_layers=num_layers_decoder,
            dropout=drop_out,
            bidirectional=bi_directional,
        )

        # A bidirectional layer concatenates both directions, doubling the feature size.
        projection_inputs = hidden_size * 2 if bi_directional else hidden_size
        self.out = nn.Linear(projection_inputs, output_size)

        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, batch_size, hidden):
        """
        Run the decoder for a single time step.

        Args:
        - input (torch.Tensor): Character indexes fed in at this step; the
          embedding is reshaped to (1, batch_size, -1).
        - batch_size (int): Batch size.
        - hidden: Current recurrent state (whatever the recurrent layer accepts).

        Returns:
        - output (torch.Tensor): Log-probabilities over the output vocabulary
          for this step.
        - hidden: Updated recurrent state.
        """
        step_embedding = self.embedding(input).view(1, batch_size, -1)
        activated = Function.relu(self.dropout(step_embedding))
        recurrent_out, hidden = self.cell_layer(activated, hidden)

        # Only one time step was fed in, so index 0 is the whole sequence output.
        log_probs = self.softmax(self.out(recurrent_out[0]))
        return log_probs, hidden


# Train Function

In [None]:
def train(config_defaults = best_params,flag = False,is_wandb = True):
    """
    Function to train a sequence-to-sequence model.

    Args:
    - config_defaults (dict): Dictionary containing default hyperparameters.
    - flag (bool): Flag indicating whether to use attention mechanism. The attention
      variant is not implemented here, so the function does nothing when it is True.
    - is_wandb (bool): Flag indicating whether to use Weights & Biases logging.

    Returns:
    - None

    Note:
    - When `is_wandb` is True a run is initialized with `config_defaults` and the
      hyperparameters are then read back from `wandb.config`, so values chosen by
      a sweep actually take effect (not only in the run name).
    - The vocabularies are built from the training split only; characters that
      appear solely in the validation/test splits are encoded as unknown.
    """

    optimizer = NADAM_KEY
    if flag:
        # Attention model is not implemented in this notebook.
        return

    config = config_defaults
    if is_wandb:
        wandb.init(project=WANDB_PROJECT_NAME, entity=WANDB_ENTITY_NAME, config=config_defaults)
        # From here on use the run's config: under a sweep it holds the sampled values.
        config = wandb.config

        # Set the name of the run
        wandb.run.name = (
            f"ep-{config[EPOCHS_KEY]}-lr-{config[LEARNING_RATE_KEY]}-bs-{config[BATCH_SIZE_KEY]}"
            f"-el-{config[ENCODER_LAYER_KEY]}-dl-{config[DECODER_LAYER_KEY]}"
            f"-hl-{config[HIDDEN_LAYER_KEY]}-do-{config[DROPOUT_KEY]}-es-{config[EMBEDDING_SIZE_KEY]}"
            f"-is_bd-{config[IS_BIDIRECTIONAL_KEY]}-model{config[CELL_TYPE_KEY]}"
        )

    # Prepare training data; these vocabularies define the model's input/output sizes.
    input_langs, output_langs, pairs, max_len = prepareData(TRAIN_DATASET_PATH)
    print("train:sample:", random.choice(pairs))
    train_n = len(pairs)
    print(f"Number of training examples: {train_n}")

    # Prepare validation data. Its vocabularies are discarded on purpose: keeping
    # them would overwrite the training vocabularies used to build the model.
    _, _, val_pairs, max_len_val = prepareData(VALIDATION_DATASET_PATH)
    val_n = len(val_pairs)
    print("validation:sample:", random.choice(val_pairs))
    print(f"Number of validation examples: {val_n}")

    # Prepare test data (vocabularies discarded for the same reason)
    _, _, test_pairs, max_len_test = prepareData(TEST_DATASET_PATH)
    test_n = len(test_pairs)
    print("Test:sample:", random.choice(test_pairs))
    print(f"Number of Test examples: {test_n}")

    # Common padded length for all splits, with some headroom for the end symbol.
    max_len = max(max_len, max_len_val, max_len_test) + 4
    print(max_len)

    # Convert data to tensors and create data loaders
    pairs = makeTensor(input_langs, output_langs, pairs, max_len)
    val_pairs = makeTensor(input_langs, output_langs, val_pairs, max_len)
    test_pairs = makeTensor(input_langs, output_langs, test_pairs, max_len)

    batch_size = config[BATCH_SIZE_KEY]
    train_loader = DataLoader(dataset=pairs, batch_size=batch_size, shuffle=True)
    # Evaluation metrics do not depend on order, so these splits are not shuffled.
    val_loader = DataLoader(dataset=val_pairs, batch_size=batch_size, shuffle=False)
    test_loader = DataLoader(dataset=test_pairs, batch_size=batch_size, shuffle=False)

    # Create the encoder and decoder models
    encoder1 = EncoderRNN(
        input_size = input_langs.n_chars,
        embedding_size = config[EMBEDDING_SIZE_KEY],
        hidden_size = config[HIDDEN_LAYER_KEY],
        num_layers_encoder = config[ENCODER_LAYER_KEY],
        cell_type = config[CELL_TYPE_KEY],
        drop_out = config[DROPOUT_KEY],
        bi_directional = config[IS_BIDIRECTIONAL_KEY]
        )
    decoder1 = DecoderRNN(
        embedding_size = config[EMBEDDING_SIZE_KEY],
        hidden_size = config[HIDDEN_LAYER_KEY],
        # The decoder is seeded with the encoder's final hidden state, so both
        # must have the same number of layers; DECODER_LAYER_KEY is therefore
        # not used for the model itself.
        num_layers_decoder = config[ENCODER_LAYER_KEY],
        cell_type = config[CELL_TYPE_KEY],
        drop_out = config[DROPOUT_KEY],
        bi_directional = config[IS_BIDIRECTIONAL_KEY],
        output_size = output_langs.n_chars
        )

    if is_gpu:
        encoder1, decoder1 = encoder1.cuda(), decoder1.cuda()

    print("vanilla seq2seqWithoutAttn")
    # Train and evaluate the Seq2SeqWithoutAttn model
    seq2seq(
        encoder = encoder1,
        decoder = decoder1,
        train_loader = train_loader,
        val_loader = val_loader,
        test_loader = test_loader,
        lr = config[LEARNING_RATE_KEY],
        optimizer = optimizer,
        epochs = config[EPOCHS_KEY],
        max_length_word = max_len,
        num_layers_enc = config[ENCODER_LAYER_KEY],
        output_lang = output_langs,
        batch_size = batch_size,
        cell_type = config[CELL_TYPE_KEY],
        is_wandb = is_wandb
        )


In [None]:
# Register the sweep with Weights & Biases. The project/entity come from the
# shared constants so this cell cannot drift from the values used by train().
sweep_id = wandb.sweep(sweep_config, project=WANDB_PROJECT_NAME, entity=WANDB_ENTITY_NAME)
print('sweep_id: ', sweep_id)

