<a href="https://colab.research.google.com/github/hemanths03/CS6910_Assignment_3/blob/main/No_Attention.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Importing Required Libraries

In [50]:
import pandas as pd
import numpy as np
from random import sample

from tensorflow import keras

# WandB Login

In [51]:
# Install the Weights & Biases client via a notebook shell escape (-qqq keeps pip quiet).
!pip install wandb -qqq
import wandb
# Authenticate this session with W&B.
wandb.login()

# Keras callback class shipped with wandb.
from wandb.keras import WandbCallback

# Mounting Google Drive

In [52]:
from google.colab import drive
# Mount Google Drive at /content/gdrive; the dataset paths read further down live under this mount point.
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


# Load the datasets

In [53]:
# Read the train / dev / test lexicon files (tab-separated, no header row, so
# the columns are labelled 0, 1, 2, ...).
# keep_default_na=False is required here: by default pandas converts cells such
# as "null", "nan" or "NA" into float NaN, which would silently corrupt any
# lexicon entry that happens to be spelled that way.
train_dataset = pd.read_csv('/content/gdrive/MyDrive/dakshina_dataset_v1.0/te/lexicons/te.translit.sampled.train.tsv', sep = '\t', header = None, keep_default_na = False)
val_dataset = pd.read_csv('/content/gdrive/MyDrive/dakshina_dataset_v1.0/te/lexicons/te.translit.sampled.dev.tsv', sep = '\t', header = None, keep_default_na = False)
test_dataset = pd.read_csv('/content/gdrive/MyDrive/dakshina_dataset_v1.0/te/lexicons/te.translit.sampled.test.tsv', sep = '\t', header = None, keep_default_na = False)

# Preprocessing the datasets into the required format

In [69]:
def _split_lexicons(dataset):
    """Split one lexicon DataFrame into input words and wrapped target words.

    Column 1 supplies the model inputs and column 0 the targets. Every value
    is passed through ``str``; each target is wrapped in a leading tab (the
    start character) and a trailing newline (the end character).
    """
    input_words = [str(word) for word in dataset[1]]
    target_words = ["\t" + str(word) + "\n" for word in dataset[0]]
    return input_words, target_words


def _encode_inputs(words, token_index, max_len):
    """Encode words as a (len(words), max_len) float32 array of token indices.

    Positions past the end of a word hold the index of the space character,
    which serves as padding.
    """
    # Pre-filling with the padding index means no per-row padding step is
    # needed, and an empty word simply yields an all-padding row.
    encoded = np.full((len(words), max_len), token_index[' '], dtype="float32")
    for row, word in enumerate(words):
        for col, char in enumerate(word):
            encoded[row, col] = token_index[char]
    return encoded


def _encode_targets(words, token_index, max_len):
    """Build the decoder input indices and one-hot decoder targets for words.

    Returns a pair ``(decoder_input, decoder_target)``:

    * ``decoder_input`` has shape (len(words), max_len) and holds token
      indices, padded with the index of the space character.
    * ``decoder_target`` has shape (len(words), max_len, len(token_index)) and
      is one-hot. It is ahead of ``decoder_input`` by one timestep, so it does
      not contain the start character. Every remaining timestep is one-hot on
      the space character.
    """
    pad_index = token_index[' ']
    decoder_input = np.full((len(words), max_len), pad_index, dtype="float32")
    decoder_target = np.zeros((len(words), max_len, len(token_index)), dtype="float32")
    for row, word in enumerate(words):
        token_ids = [token_index[char] for char in word]
        decoder_input[row, :len(token_ids)] = token_ids
        # Drop the first token (start character) to shift the target left by one.
        shifted_ids = token_ids[1:]
        for step, token_id in enumerate(shifted_ids):
            decoder_target[row, step, token_id] = 1.0
        decoder_target[row, len(shifted_ids):, pad_index] = 1.0
    return decoder_input, decoder_target


def preprocess_data(train_df=None, val_df=None, test_df=None):
    """Turn the lexicon DataFrames into arrays for a character-level seq2seq model.

    Parameters
    ----------
    train_df, val_df, test_df : pandas.DataFrame, optional
        Lexicon frames whose column 1 is the input word and column 0 the
        target word. When omitted, the notebook-level ``train_dataset``,
        ``val_dataset`` and ``test_dataset`` are used, so the original
        zero-argument call keeps working.

    Returns
    -------
    tuple
        Eight groups, in this order:

        1. (encoder_train_input_data, encoder_val_input_data, encoder_test_input_data)
        2. (decoder_train_input_data, decoder_val_input_data)
        3. (decoder_train_target_data, decoder_val_target_data)
        4. (val_input_lexicons, test_input_lexicons)
        5. (val_target_lexicons, test_target_lexicons)
        6. (num_encoder_tokens, num_decoder_tokens)
        7. (max_encoder_seq_length, max_decoder_seq_length)
        8. (target_token_index, inverse_input_token_index, inverse_target_token_index)

    Notes
    -----
    The character vocabularies and maximum lengths are computed over the
    union of the train, validation and test words. No decoder arrays are
    built for the test split. A summary of the dataset is printed.
    """
    # Fall back to the notebook-level frames when no explicit frame is given.
    if train_df is None:
        train_df = train_dataset
    if val_df is None:
        val_df = val_dataset
    if test_df is None:
        test_df = test_dataset

    # Splitting the datasets into input_lexicons and target_lexicons
    train_input_lexicons, train_target_lexicons = _split_lexicons(train_df)
    val_input_lexicons, val_target_lexicons = _split_lexicons(val_df)
    test_input_lexicons, test_target_lexicons = _split_lexicons(test_df)

    # Union of all input words and of all target words
    ip_words = train_input_lexicons + val_input_lexicons + test_input_lexicons
    op_words = train_target_lexicons + val_target_lexicons + test_target_lexicons

    # Sorted unique character sets; the space character is always included
    # because it doubles as the padding token.
    input_characters = sorted({' '} | {char for word in ip_words for char in word})
    target_characters = sorted({' '} | {char for word in op_words for char in word})
    num_encoder_tokens = len(input_characters)
    num_decoder_tokens = len(target_characters)

    # Max sequence length of inputs and targets (0 when there are no words at all)
    max_encoder_seq_length = max((len(word) for word in ip_words), default=0)
    max_decoder_seq_length = max((len(word) for word in op_words), default=0)

    # Printing the summary
    print("Summary of the dataset :")
    print("Number of train samples :" , len(train_input_lexicons))
    print("Number of val samples :" , len(val_input_lexicons))
    print("Number of test samples :" , len(test_input_lexicons))
    print("Number of unique input tokens :" , num_encoder_tokens)
    print("Number of unique output tokens :" , num_decoder_tokens)
    print("Max sequence length for inputs:" , max_encoder_seq_length)
    print("Max sequence length for outputs:" , max_decoder_seq_length)

    # Character -> index lookup tables
    input_token_index = {char: i for i, char in enumerate(input_characters)}
    target_token_index = {char: i for i, char in enumerate(target_characters)}

    # Encoder inputs for all three splits
    encoder_train_input_data = _encode_inputs(train_input_lexicons, input_token_index, max_encoder_seq_length)
    encoder_val_input_data = _encode_inputs(val_input_lexicons, input_token_index, max_encoder_seq_length)
    encoder_test_input_data = _encode_inputs(test_input_lexicons, input_token_index, max_encoder_seq_length)

    # Decoder inputs / one-hot targets for train and validation only
    decoder_train_input_data, decoder_train_target_data = _encode_targets(
        train_target_lexicons, target_token_index, max_decoder_seq_length)
    decoder_val_input_data, decoder_val_target_data = _encode_targets(
        val_target_lexicons, target_token_index, max_decoder_seq_length)

    # Index -> character lookup tables
    inverse_input_token_index = {i: char for char, i in input_token_index.items()}
    inverse_target_token_index = {i: char for char, i in target_token_index.items()}

    return ((encoder_train_input_data , encoder_val_input_data , encoder_test_input_data),
    (decoder_train_input_data , decoder_val_input_data),
    (decoder_train_target_data , decoder_val_target_data),
    (val_input_lexicons , test_input_lexicons),
    (val_target_lexicons , test_target_lexicons),
    (num_encoder_tokens , num_decoder_tokens),
    (max_encoder_seq_length , max_decoder_seq_length),
    (target_token_index , inverse_input_token_index , inverse_target_token_index))

# Load the preprocessed data

In [71]:
# Run the preprocessing once and unpack its eight result groups into
# notebook-level variables: encoder inputs (train/val/test), decoder inputs and
# one-hot decoder targets (train/val), the val/test input and target lexicons,
# the vocabulary sizes, the maximum sequence lengths and the token lookup tables.
((encoder_train_input_data , encoder_val_input_data , encoder_test_input_data),
 (decoder_train_input_data , decoder_val_input_data),
 (decoder_train_target_data , decoder_val_target_data),
 (val_input_lexicons , test_input_lexicons),
 (val_target_lexicons , test_target_lexicons),
 (num_encoder_tokens , num_decoder_tokens),
 (max_encoder_seq_length , max_decoder_seq_length),
 (target_token_index , inverse_input_token_index , inverse_target_token_index)) = preprocess_data()

Summary of the dataset :
Number of train samples : 58550
Number of val samples : 5683
Number of test samples : 5747
Number of unique input tokens : 27
Number of unique output tokens : 66
Max sequence length for inputs: 25
Max sequence length for outputs: 22
