<a href="https://colab.research.google.com/github/hemanths03/CS6910_Assignment_3/blob/main/No_Attention_cmd_line.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Importing the required libraries

In [1]:
import pandas as pd
import numpy as np
from random import sample

from tensorflow import keras

# Weights & Biases (wandb) login

In [2]:
!pip install wandb -qqq
import wandb
wandb.login()

from wandb.keras import WandbCallback

[K     |████████████████████████████████| 1.8 MB 4.5 MB/s 
[K     |████████████████████████████████| 144 kB 41.1 MB/s 
[K     |████████████████████████████████| 181 kB 39.3 MB/s 
[K     |████████████████████████████████| 63 kB 893 kB/s 
[?25h  Building wheel for pathtools (setup.py) ... [?25l[?25hdone


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


# Mounting Google Drive

In [3]:
# Mount Google Drive (Colab only) so that the dataset files read below from
# /content/gdrive/MyDrive/... are accessible.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


# Loading the datasets

In [4]:
# The lexicon files are header-less, tab-separated tables; preprocess_data() uses
# column 0 as the target word and column 1 as the input word.
# keep_default_na=False stops pandas from converting literal words such as
# "null", "NA" or "nan" into float NaN, which would later be stringified as "nan"
# and silently corrupt those samples.
train_dataset = pd.read_csv('/content/gdrive/MyDrive/dakshina_dataset_v1.0/te/lexicons/te.translit.sampled.train.tsv', sep = '\t', header = None, keep_default_na = False)
val_dataset = pd.read_csv('/content/gdrive/MyDrive/dakshina_dataset_v1.0/te/lexicons/te.translit.sampled.dev.tsv', sep = '\t', header = None, keep_default_na = False)
test_dataset = pd.read_csv('/content/gdrive/MyDrive/dakshina_dataset_v1.0/te/lexicons/te.translit.sampled.test.tsv', sep = '\t', header = None, keep_default_na = False)

# Preprocessing the datasets into the required format

In [5]:
def _split_lexicon(dataset):
    """Split one lexicon frame into (input words, target words).

    Column 1 supplies the input words. Column 0 supplies the target words, which
    are wrapped in a leading tab (start marker) and a trailing newline (end marker).
    """
    input_words = [str(word) for word in dataset[1]]
    target_words = ["\t" + str(word) + "\n" for word in dataset[0]]
    return input_words, target_words


def _encode_token_ids(words, token_index, max_length):
    """Encode words as a (len(words), max_length) float32 matrix of token ids.

    Every row is pre-filled with the id of the padding character ' ', so the
    padding never depends on leftover loop state and empty words are all padding.
    """
    encoded = np.full((len(words), max_length), token_index[' '], dtype="float32")
    for row, word in enumerate(words):
        for position, char in enumerate(word):
            encoded[row, position] = token_index[char]
    return encoded


def _encode_one_hot_targets(words, token_index, max_length):
    """One-hot encode target words shifted one step ahead of the decoder input.

    Timestep p holds the one-hot vector of character p + 1 of the word, so the
    start character is never a target. The remaining timesteps are one-hot ' '.
    """
    pad_index = token_index[' ']
    targets = np.zeros((len(words), max_length, len(token_index)), dtype="float32")
    for row, word in enumerate(words):
        for position, char in enumerate(word[1:]):
            targets[row, position, token_index[char]] = 1.0
        targets[row, max(len(word) - 1, 0):, pad_index] = 1.0
    return targets


def preprocess_data():
    """Turn the global train/val/test lexicon frames into model-ready arrays.

    Reads the module-level ``train_dataset``, ``val_dataset`` and ``test_dataset``
    frames, builds character vocabularies over all three splits, prints a short
    summary and encodes the words.

    Returns:
        An 8-tuple of tuples:
        (encoder train/val/test input ids),
        (decoder train/val input ids),
        (decoder train/val one-hot targets),
        (val/test input words),
        (val/test target words, including the "\\t" and "\\n" markers),
        (number of encoder tokens, number of decoder tokens),
        (max encoder sequence length, max decoder sequence length),
        (target_token_index, inverse_input_token_index, inverse_target_token_index).
    """
    # Split every dataset into input words and marker-wrapped target words.
    train_input_lexicons, train_target_lexicons = _split_lexicon(train_dataset)
    val_input_lexicons, val_target_lexicons = _split_lexicon(val_dataset)
    test_input_lexicons, test_target_lexicons = _split_lexicon(test_dataset)

    # Vocabularies and maximum lengths are computed over all three splits.
    ip_words = train_input_lexicons + val_input_lexicons + test_input_lexicons
    op_words = train_target_lexicons + val_target_lexicons + test_target_lexicons

    # ' ' is always part of both vocabularies because it is the padding character.
    input_characters = sorted(set(' ') | set(''.join(ip_words)))
    target_characters = sorted(set(' ') | set(''.join(op_words)))
    num_encoder_tokens = len(input_characters)
    num_decoder_tokens = len(target_characters)

    max_encoder_seq_length = max(len(word) for word in ip_words)
    max_decoder_seq_length = max(len(word) for word in op_words)

    print("Summary of the dataset :")
    print("Number of train samples :" , len(train_input_lexicons))
    print("Number of val samples :" , len(val_input_lexicons))
    print("Number of test samples :" , len(test_input_lexicons))
    print("Number of unique input tokens :" , num_encoder_tokens)
    print("Number of unique output tokens :" , num_decoder_tokens)
    print("Max sequence length for inputs:" , max_encoder_seq_length)
    print("Max sequence length for outputs:" , max_decoder_seq_length)

    # Character -> id lookups (ids follow the sorted character order).
    input_token_index = {char: i for i, char in enumerate(input_characters)}
    target_token_index = {char: i for i, char in enumerate(target_characters)}

    # Encoder inputs for every split.
    encoder_train_input_data = _encode_token_ids(train_input_lexicons, input_token_index, max_encoder_seq_length)
    encoder_val_input_data = _encode_token_ids(val_input_lexicons, input_token_index, max_encoder_seq_length)
    encoder_test_input_data = _encode_token_ids(test_input_lexicons, input_token_index, max_encoder_seq_length)

    # Decoder inputs and one-step-ahead targets (train and validation only).
    decoder_train_input_data = _encode_token_ids(train_target_lexicons, target_token_index, max_decoder_seq_length)
    decoder_train_target_data = _encode_one_hot_targets(train_target_lexicons, target_token_index, max_decoder_seq_length)

    decoder_val_input_data = _encode_token_ids(val_target_lexicons, target_token_index, max_decoder_seq_length)
    decoder_val_target_data = _encode_one_hot_targets(val_target_lexicons, target_token_index, max_decoder_seq_length)

    # Id -> character lookups used when decoding predictions.
    inverse_input_token_index = {i: char for char, i in input_token_index.items()}
    inverse_target_token_index = {i: char for char, i in target_token_index.items()}

    return ((encoder_train_input_data , encoder_val_input_data , encoder_test_input_data),
    (decoder_train_input_data , decoder_val_input_data),
    (decoder_train_target_data , decoder_val_target_data),
    (val_input_lexicons , test_input_lexicons),
    (val_target_lexicons , test_target_lexicons),
    (num_encoder_tokens , num_decoder_tokens),
    (max_encoder_seq_length , max_decoder_seq_length),
    (target_token_index , inverse_input_token_index , inverse_target_token_index))

# Loading the preprocessed data

In [6]:
# Run the preprocessing once and unpack its nested result into module-level names
# that train() reads as globals. The grouping mirrors the tuple returned by
# preprocess_data(): encoder inputs, decoder inputs, decoder targets, raw input
# words, raw target words, vocabulary sizes, max sequence lengths, token lookups.
((encoder_train_input_data , encoder_val_input_data , encoder_test_input_data),
 (decoder_train_input_data , decoder_val_input_data),
 (decoder_train_target_data , decoder_val_target_data),
 (val_input_lexicons , test_input_lexicons),
 (val_target_lexicons , test_target_lexicons),
 (num_encoder_tokens , num_decoder_tokens),
 (max_encoder_seq_length , max_decoder_seq_length),
 (target_token_index , inverse_input_token_index , inverse_target_token_index)) = preprocess_data()

Summary of the dataset :
Number of train samples : 58550
Number of val samples : 5683
Number of test samples : 5747
Number of unique input tokens : 27
Number of unique output tokens : 66
Max sequence length for inputs: 25
Max sequence length for outputs: 22


# Sigmoid function

In [7]:
#Sigmoid function
def sigmoid(x):
    """Apply the logistic function 1 / (1 + e^-v) to every element of ``x``.

    ``x`` must support ``len()`` and integer indexing; the result is a plain
    Python list with one value per element.
    """
    return [1 / (1 + np.exp(-x[position])) for position in range(len(x))]

# Decoding a sequence (greedy or beam search)

In [8]:
def decode_sequence(input_seq , encoder_model , decoder_model , decoder_cell_index , target_token_index , inverse_target_token_index , max_decoder_seq_length , beam_size):
    """Decode one encoded input word into a target string.

    Args:
        input_seq: Encoder input for a single word, passed to ``encoder_model.predict``.
        encoder_model: Object whose ``predict`` returns the encoder state(s).
        decoder_model: Object whose ``predict([token] + states)`` returns a list
            ``[token_probabilities, *new_states]``.
        decoder_cell_index: One entry per decoder recurrent layer; only its length
            is used, to replicate the encoder state for every decoder layer.
        target_token_index: Character -> id lookup; must contain the start marker "\\t".
        inverse_target_token_index: Id -> character lookup for ids 0..V-1.
        max_decoder_seq_length: Decoding stops once more characters than this
            have been produced.
        beam_size: 0 selects greedy decoding; any other value is the beam width.

    Returns:
        The decoded characters joined into one string. It ends with "\\n" unless
        decoding was cut off by the length limit.
    """
    # Encode the input; every decoder layer starts from the same encoder state.
    states_value = [encoder_model.predict(input_seq)] * len(decoder_cell_index)

    # Length-1 target sequence holding the start character.
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = target_token_index["\t"]

    if beam_size == 0:
        # Greedy decoding: always follow the single most probable character.
        decoded_sentence = ""
        while True:
            output = decoder_model.predict([target_seq] + states_value)
            output_tokens, states_value = output[0], output[1:]

            sampled_token_index = np.argmax(output_tokens[0, -1, :])
            sampled_char = inverse_target_token_index[sampled_token_index]
            decoded_sentence += sampled_char

            # Stop on the end marker or when the length limit is exceeded.
            if sampled_char == "\n" or len(decoded_sentence) > max_decoder_seq_length:
                return decoded_sentence

            # Feed the chosen character back in as the next decoder input.
            target_seq = np.zeros((1, 1))
            target_seq[0, 0] = sampled_token_index

    # Beam search. Each hypothesis is
    # [score, finished flag, decoder states, next decoder input, token ids, characters]
    # where score is the accumulated negative log-probability (lower is better).
    hypotheses = [[0.0, 0, states_value, target_seq, [], []]]
    vocabulary_size = len(inverse_target_token_index)

    while True:
        candidates = []

        for hypothesis in hypotheses:
            score, finished, hyp_states, hyp_input, token_ids, chars = hypothesis

            if finished:
                # Keep completed hypotheses in the running; dropping them could
                # let a worse-scoring, longer sequence win. No decoder call needed.
                candidates.append(hypothesis)
                continue

            output = decoder_model.predict([hyp_input] + hyp_states)
            output_tokens, next_states = output[0], output[1:]
            seq_prob = output_tokens[0, -1, :]

            # Expand the hypothesis with every possible next character.
            for token_id in range(vocabulary_size):
                next_input = np.zeros((1, 1))
                next_input[0, 0] = token_id
                candidates.append([
                    score - np.log(seq_prob[token_id]),
                    0,
                    next_states,
                    next_input,
                    token_ids + [token_id],
                    chars + [inverse_target_token_index[token_id]],
                ])

        # Keep the best `beam_size` candidates (stable sort keeps tie order).
        hypotheses = sorted(candidates, key=lambda item: item[0])[:beam_size]

        every_hypothesis_finished = True
        for hypothesis in hypotheses:
            chars = hypothesis[5]
            if chars[-1] == "\n" or len(chars) > max_decoder_seq_length:
                hypothesis[1] = 1
            else:
                every_hypothesis_finished = False

        # Done when nothing can be extended or the best hypothesis is complete.
        if every_hypothesis_finished or hypotheses[0][5][-1] == "\n":
            return ''.join(hypotheses[0][5])

# Inference call to calculate the accuracy

1. Encode the input and retrieve the initial decoder state.

2. Run one step of the decoder with this initial state and a "start of sequence" token as the target. The output is the next target token.

3. Repeat with the current target token and the current states.

In [13]:
def run_inference(encoder_test_input_data, test_input_lexicons, test_target_lexicons , num_decoder_tokens, max_decoder_seq_length, target_token_index, inverse_target_token_index, enc_latent_dims, dec_latent_dims, cell_type, beam_size):
    """Decode every word of a dataset with the saved model and return word accuracy.

    The trained model is reloaded from the "seq2seq" directory and split into a
    stand-alone encoder model and a single-step decoder model. Every input word
    is decoded with ``decode_sequence`` and the predictions are written to
    "predictions_vanilla_RNN.csv" (overwritten on every call).

    Args:
        encoder_test_input_data: Encoded input words, one row per word.
        test_input_lexicons: The raw input words (used for the CSV and the count).
        test_target_lexicons: The reference words, wrapped in "\\t" ... "\\n".
        num_decoder_tokens: Unused; kept for interface compatibility.
        max_decoder_seq_length: Length limit forwarded to ``decode_sequence``.
        target_token_index: Character -> id lookup of the target vocabulary.
        inverse_target_token_index: Id -> character lookup of the target vocabulary.
        enc_latent_dims: Hidden sizes of the encoder layers (its length tells how
            many matching layers of the loaded model belong to the encoder).
        dec_latent_dims: Hidden sizes of the decoder layers.
        cell_type: "rnn", "gru" or "lstm".
        beam_size: 0 for greedy decoding, otherwise the beam width.

    Returns:
        The fraction of words decoded exactly right (0.0 for an empty dataset),
        or None when ``cell_type`` is not supported.
    """
    # Reject unsupported cells before touching the model or the output file.
    if cell_type not in ("rnn", "gru", "lstm"):
        print("Write the Required cell type encoder definition here...!")
        return None

    # Restore the trained model; the encoder and decoder are rebuilt from its layers.
    model = keras.models.load_model("seq2seq")

    encoder_embedding_index = -1
    decoder_embedding_index = -1
    dense_layer_index = -1

    encoder_cell_index = []
    decoder_cell_index = []

    num_encoder_layers = len(enc_latent_dims)

    # Locate the layers by name: the first `num_encoder_layers` recurrent layers
    # are treated as the encoder, the remaining ones as the decoder.
    for i, layer in enumerate(model.layers):
        if cell_type in layer.name:
            if len(encoder_cell_index) < num_encoder_layers:
                encoder_cell_index.append(i)
            else:
                decoder_cell_index.append(i)

        # The first embedding found is the encoder's, a later one the decoder's.
        if "embedding" in layer.name:
            if encoder_embedding_index < 0:
                encoder_embedding_index = i
            else:
                decoder_embedding_index = i

        if "dense" in layer.name:
            dense_layer_index = i

    # Encoder model: input word -> final state(s) of the last encoder layer.
    encoder_inputs = model.input[0]  # input_1
    encoder_outputs, *encoder_states = model.layers[encoder_cell_index[-1]].output
    encoder_model = keras.Model(encoder_inputs, encoder_states)

    # Decoder model: previous character + per-layer states -> probabilities + new states.
    decoder_inputs = model.input[1]  # input_2
    decoder_outputs = model.layers[decoder_embedding_index](decoder_inputs)

    # An LSTM carries two state tensors (h, c); SimpleRNN and GRU carry one.
    states_per_cell = 2 if cell_type == "lstm" else 1

    decoder_states_inputs = []
    decoder_states = []

    for position, layer_index in enumerate(decoder_cell_index):
        current_states_inputs = [keras.Input(shape = (dec_latent_dims[position], )) for _ in range(states_per_cell)]
        decoder_outputs, *new_states = model.layers[layer_index](decoder_outputs, initial_state = current_states_inputs)
        decoder_states += new_states
        decoder_states_inputs += current_states_inputs

    decoder_outputs = model.layers[dense_layer_index](decoder_outputs)
    decoder_model = keras.Model([decoder_inputs] + decoder_states_inputs, [decoder_outputs] + decoder_states)

    count = 0
    size_ = len(test_input_lexicons)

    # The context manager guarantees the predictions file is flushed and closed,
    # even if decoding raises part-way through.
    with open("predictions_vanilla_RNN.csv", "w", encoding='utf-8') as predictions_vanilla_RNN:
        predictions_vanilla_RNN.write("Input Word,Predicted Word,True Word\n")

        for seq_index in range(size_):
            input_seq = encoder_test_input_data[seq_index : seq_index + 1]

            decoded_word = decode_sequence(input_seq , encoder_model , decoder_model , decoder_cell_index , target_token_index , inverse_target_token_index , max_decoder_seq_length , beam_size)

            # Drop the leading start marker; the trailing "\n" stays for the comparison.
            orig_word = test_target_lexicons[seq_index][1:]

            # Only strip the end marker when it is really there; a word cut off by the
            # length limit has no "\n" and must not lose its last character.
            predicted_text = decoded_word[:-1] if decoded_word.endswith("\n") else decoded_word

            predictions_vanilla_RNN.write(test_input_lexicons[seq_index] + "," + predicted_text + "," + orig_word[:-1] + "\n")

            if orig_word == decoded_word:
                count += 1

            if seq_index % 100 == 0:
                print("current index is "+str(seq_index)+"/"+str(size_)+" \n")

    # Avoid a ZeroDivisionError when there is nothing to evaluate.
    return count / size_ if size_ else 0.0

# Building and training the RNN model

In [11]:
def train():
    """Interactively configure, train, save and evaluate the seq2seq model.

    Hyperparameters are read with ``input()``. The model is an embedding plus a
    stack of recurrent layers for the encoder and for the decoder (same number of
    layers and the same hidden size on both sides) followed by a softmax layer.
    It is trained on the module-level training arrays, saved to "seq2seq" and
    then evaluated on the validation and test sets via ``run_inference``.

    Raises:
        ValueError: If the cell type is not "rnn", "lstm" or "gru", or if fewer
            than one encoder layer is requested.
    """
    # Note: this seeds NumPy's global RNG only.
    np.random.seed(77)

    hidden_layer_size = int(input('enter size of hidden layer : '))
    encoder_layers = int(input('enter number of encoder layers : '))
    decoder_layers = encoder_layers
    embedding_size = int(input('enter embedding size : '))
    cell_type = input('enter cell type : ').strip().lower()
    dropout = float(input('enter dropout : '))
    beam_size = int(input('enter beam size : '))
    recurrent_dropout = float(input('enter recurrent dropout value : '))
    learning_algo = input('enter learning algo, adam or rmsprop : ')
    batch_size = int(input('enter batch size : '))
    epochs = int(input('enter number of epochs : '))

    # Fail loudly instead of silently training a model without recurrent layers.
    recurrent_layer_types = {
        "rnn": keras.layers.SimpleRNN,
        "lstm": keras.layers.LSTM,
        "gru": keras.layers.GRU,
    }
    if cell_type not in recurrent_layer_types:
        raise ValueError("cell type must be one of 'rnn', 'lstm' or 'gru', got %r" % cell_type)
    if encoder_layers < 1:
        raise ValueError("number of encoder layers must be at least 1, got %d" % encoder_layers)
    recurrent_layer = recurrent_layer_types[cell_type]

    # Hidden size of every encoder / decoder layer.
    enc_latent_dims = [hidden_layer_size] * encoder_layers
    dec_latent_dims = [hidden_layer_size] * decoder_layers

    # Encoder input and embedding.
    encoder_inputs = keras.Input(shape = (None, ))
    encoder_outputs = keras.layers.Embedding(input_dim = num_encoder_tokens, output_dim = embedding_size, input_length = max_encoder_seq_length)(encoder_inputs)

    # Decoder input and embedding.
    decoder_inputs = keras.Input(shape=(None, ))
    decoder_outputs = keras.layers.Embedding(input_dim = num_decoder_tokens, output_dim = embedding_size, input_length = max_decoder_seq_length)(decoder_inputs)

    # Stack the encoder layers. Only the state(s) of the last layer are kept:
    # one tensor for SimpleRNN/GRU, two (h, c) for LSTM.
    encoder_states = []
    for units in enc_latent_dims:
        encoder_outputs, *encoder_states = recurrent_layer(units, dropout = dropout, return_state = True, recurrent_dropout = recurrent_dropout, return_sequences = True)(encoder_outputs)

    # Stack the decoder layers. Every layer returns full sequences and starts from
    # the final encoder state(s); the returned states are unused during training
    # but are needed when the layers are reused for inference.
    for units in dec_latent_dims:
        decoder = recurrent_layer(units, dropout = dropout, return_sequences = True, recurrent_dropout = recurrent_dropout, return_state = True)
        decoder_outputs, *_ = decoder(decoder_outputs, initial_state = list(encoder_states))

    # Per-timestep softmax over the target vocabulary.
    decoder_dense = keras.layers.Dense(num_decoder_tokens, activation = "softmax")
    decoder_outputs = decoder_dense(decoder_outputs)

    # Maps encoder_train_input_data & decoder_train_input_data to decoder_train_target_data.
    model = keras.Model([encoder_inputs , decoder_inputs] , decoder_outputs)

    model.compile(optimizer=learning_algo , loss = "categorical_crossentropy" , metrics=["accuracy"])

    model.fit(
        [encoder_train_input_data, decoder_train_input_data],
        decoder_train_target_data,
        batch_size = batch_size,
        epochs = epochs,
        #callbacks = [WandbCallback()]
    )

    # run_inference() reloads the model from this location.
    model.save("seq2seq")

    # Word-level accuracy on the validation data.
    val_accuracy = run_inference(encoder_val_input_data, val_input_lexicons, val_target_lexicons, num_decoder_tokens, max_decoder_seq_length, target_token_index, inverse_target_token_index, enc_latent_dims, dec_latent_dims, cell_type, beam_size)
    print("VALIDATION ACCURACY :" , val_accuracy)
    #wandb.log({"val_accuracy": val_accuracy})

    # Word-level accuracy on the test data.
    test_accuracy = run_inference(encoder_test_input_data, test_input_lexicons, test_target_lexicons, num_decoder_tokens, max_decoder_seq_length, target_token_index, inverse_target_token_index, enc_latent_dims, dec_latent_dims, cell_type, beam_size)
    print("TEST ACCURACY :" , test_accuracy)
    #wandb.log({"test_accuracy": test_accuracy})


# Starting the execution

In [13]:
# Prompts for the hyperparameters, trains and saves the model, then prints the
# validation and test accuracy.
train()