<a href="https://colab.research.google.com/github/hemanths03/CS6910_Assignment_3/blob/main/Attention.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Importing Required Libraries

In [None]:
import pandas as pd
import numpy as np
from random import sample

from tensorflow import keras
from tensorflow.keras.layers import Layer
from tensorflow.keras import backend as K

# WandB Login

In [None]:
!pip install wandb -qqq
import wandb
wandb.login()

from wandb.keras import WandbCallback

[K     |████████████████████████████████| 1.8 MB 5.3 MB/s 
[K     |████████████████████████████████| 181 kB 50.2 MB/s 
[K     |████████████████████████████████| 144 kB 49.1 MB/s 
[K     |████████████████████████████████| 63 kB 1.6 MB/s 
[?25h  Building wheel for pathtools (setup.py) ... [?25l[?25hdone


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


# Mounting Google Drive

In [None]:
# Mount Google Drive at /content/gdrive; the dataset files read by the next
# cell live under /content/gdrive/MyDrive.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


# Loading the Datasets

In [None]:
# Directory (inside the mounted Drive) that holds the Telugu lexicon TSV files.
LEXICON_DIR = '/content/gdrive/MyDrive/dakshina_dataset_v1.0/te/lexicons'


def _read_lexicon(split):
    """Read one headerless, tab-separated lexicon split ('train', 'dev' or 'test').

    preprocess_data() reads column 0 as the target word and column 1 as the
    input word.
    """
    return pd.read_csv(
        f'{LEXICON_DIR}/te.translit.sampled.{split}.tsv',
        sep='\t',
        header=None,
        # Every cell is a literal word: without this, pandas parses words such
        # as "null", "NA" or "nan" as missing values and the original spelling
        # is lost.
        keep_default_na=False,
    )


train_dataset = _read_lexicon('train')
val_dataset = _read_lexicon('dev')
test_dataset = _read_lexicon('test')

# Preprocessing the Datasets into the Required Format

In [None]:
def _split_lexicons(frame):
    """Split one lexicon frame into input words and marker-wrapped target words."""
    # Column 1 supplies the input word, column 0 the target word.
    input_words = [str(word) for word in frame[1]]
    # "\t" marks the start and "\n" the end of every target sequence.
    target_words = ["\t" + str(word) + "\n" for word in frame[0]]
    return input_words, target_words


def _encode_token_ids(words, token_index, max_length):
    """Encode words as rows of token ids, right-padded with the id of ' '."""
    # Start from an all-padding matrix so short (or empty) words need no
    # special handling and no loop variable is used after its loop.
    encoded = np.full((len(words), max_length), token_index[' '], dtype="float32")
    for row, word in enumerate(words):
        encoded[row, :len(word)] = [token_index[char] for char in word]
    return encoded


def _encode_one_hot_targets(words, token_index, max_length, num_tokens):
    """One-hot encode each target word shifted one step left (start marker dropped)."""
    pad_id = token_index[' ']
    one_hot = np.zeros((len(words), max_length, num_tokens), dtype="float32")
    for row, word in enumerate(words):
        # The target at step t is the decoder input at step t + 1.
        shifted = word[1:]
        for step, char in enumerate(shifted):
            one_hot[row, step, token_index[char]] = 1.0
        # Every remaining step is labelled with the padding character.
        one_hot[row, len(shifted):, pad_id] = 1.0
    return one_hot


def preprocess_data(train_df=None, val_df=None, test_df=None):
    """Turn the lexicon frames into padded arrays for the seq2seq model.

    Parameters
    ----------
    train_df, val_df, test_df : pandas.DataFrame, optional
        Frames whose column 0 holds the target word and column 1 the input
        word. Any frame left as None falls back to the notebook-level
        ``train_dataset`` / ``val_dataset`` / ``test_dataset``.

    Returns
    -------
    tuple
        Eight groups, in this order:
        (encoder train/val/test input ids),
        (decoder train/val input ids),
        (decoder train/val one-hot targets),
        (val/test input words),
        (val/test target words, wrapped in "\\t" ... "\\n"),
        (number of encoder tokens, number of decoder tokens),
        (max encoder length, max decoder length),
        (target char->id map, id->input char map, id->target char map).

    A summary of the dataset is printed as a side effect.
    """
    if train_df is None:
        train_df = train_dataset
    if val_df is None:
        val_df = val_dataset
    if test_df is None:
        test_df = test_dataset

    # Splitting the datasets into input_lexicons and target_lexicons
    train_input_lexicons, train_target_lexicons = _split_lexicons(train_df)
    val_input_lexicons, val_target_lexicons = _split_lexicons(val_df)
    test_input_lexicons, test_target_lexicons = _split_lexicons(test_df)

    # Vocabularies and maximum lengths are computed over all three splits.
    ip_words = train_input_lexicons + val_input_lexicons + test_input_lexicons
    op_words = train_target_lexicons + val_target_lexicons + test_target_lexicons

    # ' ' is always part of both vocabularies because it is the padding token.
    input_characters = sorted({' '} | {char for word in ip_words for char in word})
    target_characters = sorted({' '} | {char for word in op_words for char in word})
    num_encoder_tokens = len(input_characters)
    num_decoder_tokens = len(target_characters)

    max_encoder_seq_length = max(len(word) for word in ip_words)
    max_decoder_seq_length = max(len(word) for word in op_words)

    # Printing the summary
    print("Summary of the dataset :")
    print("Number of train samples :" , len(train_input_lexicons))
    print("Number of val samples :" , len(val_input_lexicons))
    print("Number of test samples :" , len(test_input_lexicons))
    print("Number of unique input tokens :" , num_encoder_tokens)
    print("Number of unique output tokens :" , num_decoder_tokens)
    print("Max sequence length for inputs:" , max_encoder_seq_length)
    print("Max sequence length for outputs:" , max_decoder_seq_length)

    # Character -> id lookups (ids follow the sorted character order)
    input_token_index = {char: i for i, char in enumerate(input_characters)}
    target_token_index = {char: i for i, char in enumerate(target_characters)}

    # Encoder inputs for every split
    encoder_train_input_data = _encode_token_ids(train_input_lexicons, input_token_index, max_encoder_seq_length)
    encoder_val_input_data = _encode_token_ids(val_input_lexicons, input_token_index, max_encoder_seq_length)
    encoder_test_input_data = _encode_token_ids(test_input_lexicons, input_token_index, max_encoder_seq_length)

    # Decoder inputs and one-step-ahead targets (train and validation only)
    decoder_train_input_data = _encode_token_ids(train_target_lexicons, target_token_index, max_decoder_seq_length)
    decoder_val_input_data = _encode_token_ids(val_target_lexicons, target_token_index, max_decoder_seq_length)
    decoder_train_target_data = _encode_one_hot_targets(
        train_target_lexicons, target_token_index, max_decoder_seq_length, num_decoder_tokens)
    decoder_val_target_data = _encode_one_hot_targets(
        val_target_lexicons, target_token_index, max_decoder_seq_length, num_decoder_tokens)

    # id -> character lookups for decoding sequences back into text
    inverse_input_token_index = {i: char for char, i in input_token_index.items()}
    inverse_target_token_index = {i: char for char, i in target_token_index.items()}

    return ((encoder_train_input_data , encoder_val_input_data , encoder_test_input_data),
    (decoder_train_input_data , decoder_val_input_data),
    (decoder_train_target_data , decoder_val_target_data),
    (val_input_lexicons , test_input_lexicons),
    (val_target_lexicons , test_target_lexicons),
    (num_encoder_tokens , num_decoder_tokens),
    (max_encoder_seq_length , max_decoder_seq_length),
    (target_token_index , inverse_input_token_index , inverse_target_token_index))

# Loading the Preprocessed Data

In [None]:
# Run preprocessing once; the result is a tuple of eight groups.
(
    _encoder_inputs,
    _decoder_inputs,
    _decoder_targets,
    _input_lexicons,
    _target_lexicons,
    _vocab_sizes,
    _max_lengths,
    _token_maps,
) = preprocess_data()

# Expose every member of each group under the notebook-level name used by the
# cells below.
encoder_train_input_data, encoder_val_input_data, encoder_test_input_data = _encoder_inputs
decoder_train_input_data, decoder_val_input_data = _decoder_inputs
decoder_train_target_data, decoder_val_target_data = _decoder_targets
val_input_lexicons, test_input_lexicons = _input_lexicons
val_target_lexicons, test_target_lexicons = _target_lexicons
num_encoder_tokens, num_decoder_tokens = _vocab_sizes
max_encoder_seq_length, max_decoder_seq_length = _max_lengths
target_token_index, inverse_input_token_index, inverse_target_token_index = _token_maps

Summary of the dataset :
Number of train samples : 58550
Number of val samples : 5683
Number of test samples : 5747
Number of unique input tokens : 27
Number of unique output tokens : 66
Max sequence length for inputs: 25
Max sequence length for outputs: 22


# Building an Attention Layer

In [None]:
import tensorflow as tf
class AttentionLayer(Layer):
    """Additive attention over an encoder output sequence.

    The layer is called on ``[encoder_output_sequence, decoder_output_sequence]``
    (both rank-3: batch, time, features). For every decoder step it computes

        weights = softmax(tanh(S . W_a + h_j . U_a) . V_a)

    where ``S`` is the encoder sequence and ``h_j`` the decoder output at that
    step, and returns the weighted sum of encoder outputs (context vector)
    together with the weights.
    """

    def __init__(self, **kwargs):
        super(AttentionLayer, self).__init__(**kwargs)

    def build(self, input_shape):
        """Create the three trainable projections.

        input_shape: [encoder_output_shape, decoder_output_shape]; the last
        axis of each gives the encoder / decoder feature size.
        """
        # Projects encoder outputs: (en_hidden, en_hidden)
        self.W_a = self.add_weight(name='W_a',
                                   shape = tf.TensorShape((input_shape[0][2], input_shape[0][2])),
                                   initializer = 'uniform',
                                   trainable = True)

        # Projects one decoder output into the encoder feature space: (de_hidden, en_hidden)
        self.U_a = self.add_weight(name = 'U_a',
                                   shape = tf.TensorShape((input_shape[1][2], input_shape[0][2])),
                                   initializer = 'uniform',
                                   trainable = True)

        # Reduces each projected position to a scalar energy: (en_hidden, 1)
        self.V_a = self.add_weight(name = 'V_a',
                                   shape = tf.TensorShape((input_shape[0][2], 1)),
                                   initializer = 'uniform',
                                   trainable = True)

        super(AttentionLayer, self).build(input_shape)  # Be sure to call this at the end

    def call(self, inputs):
        """Compute context vectors and attention weights.

        inputs: [encoder_output_sequence, decoder_output_sequence]

        Returns ``(c_outputs, e_outputs)``:
        c_outputs: (batch_size, de_seq_len, en_hidden) context vectors
        e_outputs: (batch_size, de_seq_len, en_seq_len) attention weights
        """
        encoder_out_seq, decoder_out_seq = inputs

        # S.W_a is the same for every decoder step, so compute it once here
        # instead of inside the step function.
        # <= batch_size, en_seq_len, en_hidden
        W_a_dot_s = K.dot(encoder_out_seq, self.W_a)

        def energy_step(inputs, states):
            """Attention weights for a single decoder step.

            inputs: (batch_size, de_hidden) decoder output at this step
            states: carried by K.rnn; not read here
            """
            # h_j.U_a  <= batch_size, 1, en_hidden (broadcast over encoder steps)
            U_a_dot_h = K.expand_dims(K.dot(inputs, self.U_a), 1)

            # tanh(S.W_a + h_j.U_a)  <= batch_size, en_seq_len, en_hidden
            Ws_plus_Uh = K.tanh(W_a_dot_s + U_a_dot_h)

            # Scalar energy per encoder position  <= batch_size, en_seq_len
            e_i = K.squeeze(K.dot(Ws_plus_Uh, self.V_a), axis=-1)
            # Normalise over encoder positions  <= batch_size, en_seq_len
            e_i = K.softmax(e_i)

            return e_i, [e_i]

        def context_step(inputs, states):
            """Context vector for a single decoder step.

            inputs: (batch_size, en_seq_len) attention weights for this step
            states: carried by K.rnn; not read here
            """
            # Weighted sum of encoder outputs  <= batch_size, en_hidden
            c_i = K.sum(encoder_out_seq * K.expand_dims(inputs, -1), axis=1)

            return c_i, [c_i]

        # K.rnn needs initial states shaped like each step's output; their
        # values are never read by the step functions.
        fake_state_c = K.sum(encoder_out_seq, axis=1)  # <= batch_size, en_hidden
        fake_state_e = K.sum(encoder_out_seq, axis=2)  # <= batch_size, en_seq_len

        # Attention weights for every decoder step
        # e_outputs => (batch_size, de_seq_len, en_seq_len)
        _, e_outputs, _ = K.rnn(
            energy_step, decoder_out_seq, [fake_state_e],
        )

        # Context vectors for every decoder step
        # c_outputs => (batch_size, de_seq_len, en_hidden)
        _, c_outputs, _ = K.rnn(
            context_step, e_outputs, [fake_state_c],
        )

        return c_outputs, e_outputs

# Building an RNN Model with Attention

In [None]:
def train():
    """Build, train and save one attention seq2seq model for the current W&B run.

    Hyperparameters are read from ``wandb.init().config`` (embedding_size,
    cell_type, dropout, recurrent_dropout, hidden_layer_size, learning_algo,
    batch_size, epochs, beam_size). The preprocessed arrays and vocabulary
    sizes are read from the notebook-level variables produced by
    preprocess_data().

    Raises
    ------
    ValueError
        If ``config.cell_type`` is not one of "rnn", "lstm" or "gru".
    """
    # Seed NumPy and TensorFlow: Keras weight initialisers and dropout draw
    # from TensorFlow's generator, which np.random.seed alone does not affect.
    np.random.seed(77)
    tf.random.set_seed(77)

    #Initializing WandB
    run = wandb.init()
    config = run.config

    #Setting up the Run name
    name = "ES_" + str(config.embedding_size) + "_CT_" + config.cell_type + "_DO_" + str(config.dropout) + "_BS_" + str(config.beam_size)
    run.name = name

    # Map the configured cell type onto its Keras layer class. An unknown
    # value must fail loudly; otherwise no recurrent layer would be added at all.
    recurrent_layers = {
        "rnn": keras.layers.SimpleRNN,
        "lstm": keras.layers.LSTM,
        "gru": keras.layers.GRU,
    }
    if config.cell_type not in recurrent_layers:
        raise ValueError(
            f"Unsupported cell_type {config.cell_type!r}; expected one of {sorted(recurrent_layers)}"
        )
    recurrent_layer = recurrent_layers[config.cell_type]
    # Encoder and decoder share the same recurrent settings.
    recurrent_kwargs = dict(
        dropout = config.dropout,
        recurrent_dropout = config.recurrent_dropout,
        return_sequences = True,
        return_state = True,
    )

    #Define input sequence and embed it for the Encoder
    encoder_inputs = keras.Input(shape =(None, ))
    encoder_embedded = keras.layers.Embedding(input_dim = num_encoder_tokens, output_dim = config.embedding_size, input_length = max_encoder_seq_length)(encoder_inputs)

    #Define target sequence input and embed it for the Decoder
    decoder_inputs = keras.Input(shape=(None, ))
    decoder_embedded = keras.layers.Embedding(input_dim = num_decoder_tokens, output_dim = config.embedding_size, input_length = max_decoder_seq_length)(decoder_inputs)

    # Encoder: the full output sequence feeds the attention layer and the
    # final state(s) initialise the decoder (one state for rnn/gru, two for lstm).
    encoder_outputs, *encoder_states = recurrent_layer(config.hidden_layer_size, **recurrent_kwargs)(encoder_embedded)

    # Decoder: returns the full output sequence; its returned states are not
    # used by the training model.
    decoder = recurrent_layer(config.hidden_layer_size, **recurrent_kwargs)
    decoder_outputs, *_ = decoder(decoder_embedded, initial_state = encoder_states)

    # Attention: one context vector per decoder step, concatenated with the
    # decoder output of that step.
    attention_mech = AttentionLayer()
    attention_output, _ = attention_mech([encoder_outputs, decoder_outputs])
    decoder_input_concate = keras.layers.Concatenate(axis = -1)([decoder_outputs, attention_output])

    #Adding a dense layer that predicts the next target character
    decoder_dense = keras.layers.Dense(num_decoder_tokens, activation = "softmax")
    decoder_outputs = decoder_dense(decoder_input_concate)

    # Define the model that will turn
    # encoder_train_input_data & decoder_train_input_data into decoder_train_target_data
    model = keras.Model([encoder_inputs , decoder_inputs] , decoder_outputs)

    #Compiling the model
    model.compile(optimizer=config.learning_algo , loss = "categorical_crossentropy" , metrics=["accuracy"])

    #Fitting the model
    model.fit(
        [encoder_train_input_data, decoder_train_input_data],
        decoder_train_target_data,
        batch_size = config.batch_size,
        epochs = config.epochs,
        callbacks = [WandbCallback()]
    )

    #save the model
    model.save("seq2seq_2")

    # TODO: validation/test accuracy via inference is disabled. It used to call
    # run_inference(...) on the validation and test inputs and log
    # "val_accuracy" / "test_accuracy" to W&B; run_inference is not defined in
    # the visible cells of this notebook.


# Sweep Configuration

In [None]:
# Sweep definition passed to wandb.sweep(): a grid over a single value per
# hyperparameter, with "accuracy" as the metric to maximize.
sweep_config_temp = dict(
    name="attention_bayes_1",
    method="grid",
    metric=dict(name="accuracy", goal="maximize"),
    # Each hyperparameter is wrapped in the {"values": [...]} form.
    parameters={
        hyperparameter: {"values": [value]}
        for hyperparameter, value in (
            ("batch_size", 256),
            ("beam_size", 0),
            ("cell_type", "gru"),
            ("decoder_layers", 1),
            ("dropout", 0.3),
            ("embedding_size", 256),
            ("encoder_layers", 1),
            ("epochs", 25),
            ("hidden_layer_size", 256),
            ("learning_algo", "adam"),
            ("recurrent_dropout", 0.0),
        )
    },
)

In [None]:
# Create the sweep from sweep_config_temp under the given entity and project.
sweep_id = wandb.sweep(
    sweep_config_temp,
    entity="cs21m027_cs21m011",
    project="DL_ASG_3_final_Attention",
)

# Start an agent in this kernel that runs train() for the sweep.
wandb.agent(sweep_id, function=train)

Create sweep with ID: 5vrnimpy
Sweep URL: https://wandb.ai/cs21m027_cs21m011/DL_ASG_3_temp/sweeps/5vrnimpy


[34m[1mwandb[0m: Agent Starting Run: onntl9za with config:
[34m[1mwandb[0m: 	batch_size: 256
[34m[1mwandb[0m: 	beam_size: 0
[34m[1mwandb[0m: 	cell_type: gru
[34m[1mwandb[0m: 	decoder_layers: 1
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	embedding_size: 256
[34m[1mwandb[0m: 	encoder_layers: 1
[34m[1mwandb[0m: 	epochs: 25
[34m[1mwandb[0m: 	hidden_layer_size: 256
[34m[1mwandb[0m: 	learning_algo: adam
[34m[1mwandb[0m: 	recurrent_dropout: 0
[34m[1mwandb[0m: Currently logged in as: [33mcs21m027_cs21m011[0m (use `wandb login --relogin` to force relogin)


Epoch 1/25
 49/229 [=====>........................] - ETA: 33s - loss: 1.8580 - accuracy: 0.6014

[34m[1mwandb[0m: Ctrl + C detected. Stopping sweep.
