<a href="https://colab.research.google.com/github/anandhc6/Assignment-3/blob/main/Seq2Seq.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Required packages
import math
import random
import numpy as np
import pandas as pd
import tensorflow as tf
from keras import backend
from random import randrange 
from tensorflow import keras
from google.colab import files
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from tensorflow.python.keras.models import load_model
from tensorflow.python.keras.callbacks import EarlyStopping

In [None]:
# Downloading dataset
# Fetches the Dakshina dataset v1.0 tar archive with wget, then unpacks it
# into the current working directory.

!wget https://storage.googleapis.com/gresearch/dakshina/dakshina_dataset_v1.0.tar
!tar -xf 'dakshina_dataset_v1.0.tar'

In [None]:
# Encode the training data: build the character vocabularies and the padded index / one-hot arrays

def embed_train_data(train_data_lines):
    """Encode the training word pairs as padded arrays and build both vocabularies.

    Every element of ``train_data_lines`` except the last must contain exactly
    three tab-separated fields: the first is used as the target word, the second
    as the input word, and the third is ignored. Each target word is wrapped in a
    leading tab (start-of-sequence marker) and a trailing newline
    (end-of-sequence marker). A space character is added to both vocabularies
    and used as the padding symbol.

    Args:
        train_data_lines: list of strings, one sample per element. The final
            element is always skipped.

    Returns:
        A 9-tuple:
            encoder_input_data: float32 array (num_samples, encoder_max_length)
                holding input character indices, padded with the space index.
            decoder_input_data: float32 array (num_samples, decoder_max_length)
                holding target character indices (markers included), padded
                with the space index.
            decoder_target_data: float32 one-hot array
                (num_samples, decoder_max_length, num_decoder_tokens); it is
                decoder_input_data shifted one step to the left (start marker
                dropped) and padded with the one-hot space token.
            num_encoder_tokens: size of the input vocabulary.
            num_decoder_tokens: size of the target vocabulary.
            input_token_idx: dict mapping input character -> index.
            target_token_idx: dict mapping target character -> index.
            encoder_max_length: length of the longest input word.
            decoder_max_length: length of the longest wrapped target word.

    Raises:
        ValueError: if a line does not split into exactly three fields, or if
            there are no samples at all (``max`` of an empty list).
    """
    # NOTE(review): the last element is dropped unconditionally -- presumably it
    # is the empty string left over from splitting the file on newlines; confirm
    # against the code that builds train_data_lines.
    lenk = len(train_data_lines) - 1
    train_input_data = []
    train_target_data = []
    input_data_characters = set()
    target_data_characters = set()

    for line in train_data_lines[:lenk]:
        target_data, input_data, _ = line.split("\t")

        # We are using "tab" as the "start sequence" and "\n" as "end sequence".
        target_data = "\t" + target_data + "\n"
        train_input_data.append(input_data)
        train_target_data.append(target_data)

        # Collect the unique characters seen on each side.
        input_data_characters.update(input_data)
        target_data_characters.update(target_data)

    print("Number of samples:", len(train_input_data))

    # The space character doubles as the padding symbol on both sides.
    input_data_characters.add(" ")
    target_data_characters.add(" ")

    # Sorting makes the character -> index assignment deterministic.
    input_data_characters = sorted(input_data_characters)
    target_data_characters = sorted(target_data_characters)

    # maximum length of the words
    encoder_max_length = max([len(txt) for txt in train_input_data])
    decoder_max_length = max([len(txt) for txt in train_target_data])

    print("Max sequence length for inputs:", encoder_max_length)
    print("Max sequence length for outputs:", decoder_max_length)

    # number of input and target characters
    num_encoder_tokens = len(input_data_characters)
    num_decoder_tokens = len(target_data_characters)

    print("Number of unique input tokens:", num_encoder_tokens)
    print("Number of unique output tokens:", num_decoder_tokens)

    # character -> index lookup tables
    input_token_idx = {char: i for i, char in enumerate(input_data_characters)}
    target_token_idx = {char: i for i, char in enumerate(target_data_characters)}
    input_pad_idx = input_token_idx[" "]
    target_pad_idx = target_token_idx[" "]

    num_samples = len(train_input_data)
    encoder_input_data = np.zeros((num_samples, encoder_max_length), dtype="float32")
    decoder_input_data = np.zeros((num_samples, decoder_max_length), dtype="float32")
    decoder_target_data = np.zeros(
        (num_samples, decoder_max_length, num_decoder_tokens), dtype="float32"
    )

    # Inputs are stored as character indices; targets are additionally one-hot encoded.
    for i, (input_data, target_data) in enumerate(zip(train_input_data, train_target_data)):
        for t, char in enumerate(input_data):
            encoder_input_data[i, t] = input_token_idx[char]
        # Pad from the word's own length rather than from the leftover loop
        # variable: for an empty input the loop never runs, so the loop variable
        # would be unbound (first sample) or stale from the previous sample.
        encoder_input_data[i, len(input_data):] = input_pad_idx

        for t, char in enumerate(target_data):
            decoder_input_data[i, t] = target_token_idx[char]
            if t > 0:
                # decoder_target_data is one timestep ahead of decoder_input_data,
                # so the start character is excluded.
                decoder_target_data[i, t - 1, target_token_idx[char]] = 1.0

        # Fill the remaining positions with the padding symbol. The target array
        # is shifted by one step, so its padding starts one position earlier.
        target_length = len(target_data)
        decoder_input_data[i, target_length:] = target_pad_idx
        decoder_target_data[i, target_length - 1:, target_pad_idx] = 1.0

    return encoder_input_data,decoder_input_data,decoder_target_data,num_encoder_tokens,num_decoder_tokens,input_token_idx,target_token_idx,encoder_max_length,decoder_max_length


In [None]:
# Encode the validation data with the vocabularies built from the training data

def embed_val_data(val_data_lines,num_decoder_tokens,input_token_idx,target_token_idx):
    """Encode validation word pairs as padded arrays using existing vocabularies.

    Every element of ``val_data_lines`` except the last must contain exactly
    three tab-separated fields: the first is used as the target word, the second
    as the input word, and the third is ignored. Each target word is wrapped in a
    leading tab (start-of-sequence marker) and a trailing newline
    (end-of-sequence marker). The space character is used as the padding symbol.
    Array widths are taken from the longest words in ``val_data_lines`` itself.

    Args:
        val_data_lines: list of strings, one sample per element. The final
            element is always skipped.
        num_decoder_tokens: size of the last axis of the one-hot target array.
        input_token_idx: dict mapping input character -> index; must contain " ".
        target_token_idx: dict mapping target character -> index; must contain " ".

    Returns:
        A 5-tuple:
            val_encoder_input_data: float32 array (num_samples, max_input_len)
                of input character indices, padded with the space index.
            val_decoder_input_data: float32 array (num_samples, max_target_len)
                of target character indices, padded with the space index.
            val_decoder_target_data: float32 one-hot array
                (num_samples, max_target_len, num_decoder_tokens); the decoder
                input shifted one step to the left, padded with the one-hot
                space token.
            target_token_idx: the dict that was passed in, returned unchanged.
            val_target_data: list of the wrapped target strings.

    Raises:
        ValueError: if a line does not split into exactly three fields, or if
            there are no samples at all (``max`` of an empty list).
        KeyError: if a character is missing from the supplied index dicts.
    """
    val_input_data = []
    val_target_data = []
    # NOTE(review): the last element is dropped unconditionally -- presumably it
    # is the empty string left over from splitting the file on newlines; confirm
    # against the code that builds val_data_lines.
    lenk = len(val_data_lines) - 1

    for line in val_data_lines[:lenk]:
        target_data, input_data, _ = line.split("\t")

        # We use "tab" as the "start sequence" character and "\n" as "end sequence" character.
        target_data = "\t" + target_data + "\n"
        val_input_data.append(input_data)
        val_target_data.append(target_data)

    val_encoder_max_length = max([len(txt) for txt in val_input_data])
    val_decoder_max_length = max([len(txt) for txt in val_target_data])

    num_samples = len(val_input_data)
    val_encoder_input_data = np.zeros((num_samples, val_encoder_max_length), dtype="float32")
    val_decoder_input_data = np.zeros((num_samples, val_decoder_max_length), dtype="float32")
    val_decoder_target_data = np.zeros(
        (num_samples, val_decoder_max_length, num_decoder_tokens), dtype="float32"
    )

    input_pad_idx = input_token_idx[" "]
    target_pad_idx = target_token_idx[" "]

    for i, (input_data, target_data) in enumerate(zip(val_input_data, val_target_data)):
        for t, ch in enumerate(input_data):
            val_encoder_input_data[i, t] = input_token_idx[ch]
        # Pad from the word's own length rather than from the leftover loop
        # variable: for an empty input the loop never runs, so the loop variable
        # would be unbound (first sample) or stale from the previous sample.
        val_encoder_input_data[i, len(input_data):] = input_pad_idx

        for t, ch in enumerate(target_data):
            val_decoder_input_data[i, t] = target_token_idx[ch]
            if t > 0:
                # decoder_target_data is one timestep ahead of decoder_input_data,
                # so the start character is excluded.
                val_decoder_target_data[i, t - 1, target_token_idx[ch]] = 1.0

        # The target array is shifted by one step, so its padding starts one
        # position earlier than the decoder input's padding.
        target_length = len(target_data)
        val_decoder_input_data[i, target_length:] = target_pad_idx
        val_decoder_target_data[i, target_length - 1:, target_pad_idx] = 1.0

    return val_encoder_input_data,val_decoder_input_data,val_decoder_target_data,target_token_idx,val_target_data


In [None]:
# Embedding data
# Encode the training lines; this also builds the vocabularies and index maps.
(
    encoder_input_data,
    decoder_input_data,
    decoder_target_data,
    num_encoder_tokens,
    num_decoder_tokens,
    input_token_idx,
    target_token_idx,
    encoder_max_length,
    decoder_max_length,
) = embed_train_data(train_data_lines)

# Encode the validation lines with the index maps obtained from the training data.
(
    val_encoder_input_data,
    val_decoder_input_data,
    val_decoder_target_data,
    target_token_idx,
    val_target_data,
) = embed_val_data(val_data_lines, num_decoder_tokens, input_token_idx, target_token_idx)

# Inverse lookups (index -> character).
reverse_input_char_index = {idx: char for char, idx in input_token_idx.items()}
reverse_target_char_index = {idx: char for char, idx in target_token_idx.items()}


Number of samples: 68217
Max sequence length for inputs: 30
Max sequence length for outputs: 28
Number of unique input tokens: 27
Number of unique output tokens: 49
