<a href="https://colab.research.google.com/github/anandhc6/Assignment-3/blob/main/Seq2Seq.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import tensorflow as tf

import matplotlib.pyplot as plt
import math
import matplotlib.ticker as ticker
import numpy as np
from random import randrange 
from google.colab import files
import pandas as pd
import random
import tensorflow as tf
from tensorflow import keras
from keras import backend
from tensorflow.keras.models import load_model
from tensorflow.keras.callbacks import EarlyStopping

In [None]:
!wget https://storage.googleapis.com/gresearch/dakshina/dakshina_dataset_v1.0.tar
!tar -xf 'dakshina_dataset_v1.0.tar'

--2022-05-06 19:25:48--  https://storage.googleapis.com/gresearch/dakshina/dakshina_dataset_v1.0.tar
Resolving storage.googleapis.com (storage.googleapis.com)... 142.250.97.128, 142.251.107.128, 173.194.210.128, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|142.250.97.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2008340480 (1.9G) [application/x-tar]
Saving to: ‘dakshina_dataset_v1.0.tar’


2022-05-06 19:25:57 (212 MB/s) - ‘dakshina_dataset_v1.0.tar’ saved [2008340480/2008340480]



In [None]:
# Mini-batch size handed to model.fit by the training cells further down.
batch_size = 64
# Dakshina Tamil lexicon files. Each line holds three tab-separated fields:
# the Tamil word, its Latin-script spelling and a number (see the sample
# printed at the end of this cell).
train_data = "dakshina_dataset_v1.0/ta/lexicons/ta.translit.sampled.train.tsv"
val_data = "dakshina_dataset_v1.0/ta/lexicons/ta.translit.sampled.dev.tsv"
# Read both files into lists of lines. Blank lines are filtered out instead of
# blindly popping the last element, so a file that does not end with a newline
# cannot lose its final record.
with open(train_data, "r", encoding="utf-8") as f:
    train_lines = [line for line in f.read().split("\n") if line]
with open(val_data, "r", encoding="utf-8") as f:
    val_lines = [line for line in f.read().split("\n") if line]
# Shuffle the training records in place (unseeded, so the order differs per run).
random.shuffle(train_lines)
print(train_lines[0:2])


['பரிந்துரை\tparinthurai\t2', 'எடுப்பதில்லை\tyeduppathillai\t1']


In [None]:
# embedding train model
def embedding_train(train_lines):
    """Build vocabularies and model-ready arrays from the training lines.

    Every non-blank line must contain exactly three tab-separated fields:
    the target word, the input word and a third field that is ignored.
    Targets are wrapped in "\\t" (start marker) and "\\n" (end marker), and a
    space is added to both vocabularies to serve as the padding symbol.

    Parameters
    ----------
    train_lines : list of str
        Raw lines of the training file.

    Returns
    -------
    tuple
        ``(encoder_input_data, decoder_input_data, decoder_target_data,
        num_encoder_tokens, num_decoder_tokens, input_token_index,
        target_token_index, max_encoder_seq_length, max_decoder_seq_length)``.
        The encoder/decoder input arrays hold character indices (float32,
        padded with the index of " "); the decoder target array is one-hot
        and shifted one step ahead of the decoder input.

    Raises
    ------
    ValueError
        If there is no non-blank line, or a line does not split into exactly
        three fields.
    """
    input_texts = []
    target_texts = []
    input_characters = set()
    target_characters = set()
    # Use every line. Blank entries (for example the empty string that
    # split("\n") leaves after a final newline) are skipped explicitly, rather
    # than assuming the last element is junk and dropping a real sample.
    for line in train_lines:
        if not line.strip():
            continue
        # File order is target first, input second; the third field is unused.
        target_text, input_text, _ = line.split("\t")
        # "\t" marks the start of a target sequence and "\n" its end.
        target_text = "\t" + target_text + "\n"
        input_texts.append(input_text)
        target_texts.append(target_text)
        input_characters.update(input_text)
        target_characters.update(target_text)
    if not input_texts:
        raise ValueError("embedding_train needs at least one non-blank line")

    # The space character doubles as the padding symbol on both sides.
    input_characters.add(" ")
    target_characters.add(" ")
    input_characters = sorted(input_characters)
    target_characters = sorted(target_characters)
    num_encoder_tokens = len(input_characters)
    num_decoder_tokens = len(target_characters)
    max_encoder_seq_length = max(len(txt) for txt in input_texts)
    max_decoder_seq_length = max(len(txt) for txt in target_texts)

    print("Number of samples:", len(input_texts))
    print("Number of unique input tokens:", num_encoder_tokens)
    print("Number of unique output tokens:", num_decoder_tokens)
    print("Max sequence length for inputs:", max_encoder_seq_length)
    print("Max sequence length for outputs:", max_decoder_seq_length)

    # Character -> integer index, in sorted character order.
    input_token_index = {char: i for i, char in enumerate(input_characters)}
    target_token_index = {char: i for i, char in enumerate(target_characters)}
    input_pad = input_token_index[" "]
    target_pad = target_token_index[" "]

    n_samples = len(input_texts)
    # Index arrays start out fully padded; real characters overwrite the prefix.
    encoder_input_data = np.full((n_samples, max_encoder_seq_length), input_pad, dtype="float32")
    decoder_input_data = np.full((n_samples, max_decoder_seq_length), target_pad, dtype="float32")
    # The target is one-hot encoded, one vector per timestep.
    decoder_target_data = np.zeros((n_samples, max_decoder_seq_length, num_decoder_tokens), dtype="float32")

    for i, (input_text, target_text) in enumerate(zip(input_texts, target_texts)):
        for t, char in enumerate(input_text):
            encoder_input_data[i, t] = input_token_index[char]
        for t, char in enumerate(target_text):
            decoder_input_data[i, t] = target_token_index[char]
            if t > 0:
                # The target runs one timestep ahead of the decoder input and
                # therefore never contains the start marker.
                decoder_target_data[i, t - 1, target_token_index[char]] = 1.0
        # Pad the target from the step after the end marker onwards. The length
        # is taken from the text itself, not from a leaked loop variable.
        decoder_target_data[i, len(target_text) - 1:, target_pad] = 1.0

    return encoder_input_data,decoder_input_data,decoder_target_data,num_encoder_tokens,num_decoder_tokens,input_token_index,target_token_index,max_encoder_seq_length,max_decoder_seq_length


In [None]:
# embedding validation data
# for validation data, almost same
def embedding_val(val_lines,num_decoder_tokens,input_token_index,target_token_index):
    """Encode validation lines with the vocabularies built from the training data.

    Every non-blank line must contain exactly three tab-separated fields:
    the target word, the input word and a third field that is ignored.
    Array widths are the longest input / target found in ``val_lines``.

    Parameters
    ----------
    val_lines : list of str
        Raw lines of the validation file.
    num_decoder_tokens : int
        Size of the last axis of the one-hot target array.
    input_token_index, target_token_index : dict
        Character -> index maps; both must contain " " (used for padding).

    Returns
    -------
    tuple
        ``(val_encoder_input_data, val_decoder_input_data,
        val_decoder_target_data, target_token_index, val_target_texts)``;
        ``target_token_index`` is returned unchanged and ``val_target_texts``
        are the targets including the "\\t" / "\\n" markers.

    Raises
    ------
    ValueError
        If there is no non-blank line, or a line does not split into exactly
        three fields.
    KeyError
        If a validation word contains a character missing from the maps.
    """
    val_input_texts = []
    val_target_texts = []

    # Use every line and skip blank entries explicitly, rather than slicing off
    # the last element, which would discard a real validation sample.
    for line in val_lines:
        if not line.strip():
            continue
        target_text, input_text, _ = line.split("\t")
        # "\t" marks the start of a target sequence and "\n" its end.
        val_input_texts.append(input_text)
        val_target_texts.append("\t" + target_text + "\n")
    if not val_input_texts:
        raise ValueError("embedding_val needs at least one non-blank line")

    n_samples = len(val_input_texts)
    val_max_encoder_seq_length = max(len(txt) for txt in val_input_texts)
    val_max_decoder_seq_length = max(len(txt) for txt in val_target_texts)
    input_pad = input_token_index[" "]
    target_pad = target_token_index[" "]

    # Index arrays start out fully padded; real characters overwrite the prefix.
    val_encoder_input_data = np.full(
        (n_samples, val_max_encoder_seq_length), input_pad, dtype="float32")
    val_decoder_input_data = np.full(
        (n_samples, val_max_decoder_seq_length), target_pad, dtype="float32")
    val_decoder_target_data = np.zeros(
        (n_samples, val_max_decoder_seq_length, num_decoder_tokens), dtype="float32")

    for i, (input_text, target_text) in enumerate(zip(val_input_texts, val_target_texts)):
        for t, char in enumerate(input_text):
            val_encoder_input_data[i, t] = input_token_index[char]
        for t, char in enumerate(target_text):
            val_decoder_input_data[i, t] = target_token_index[char]
            if t > 0:
                # The target runs one timestep ahead of the decoder input and
                # therefore never contains the start marker.
                val_decoder_target_data[i, t - 1, target_token_index[char]] = 1.0
        # Pad the target from the step after the end marker onwards.
        val_decoder_target_data[i, len(target_text) - 1:, target_pad] = 1.0

    return val_encoder_input_data,val_decoder_input_data,val_decoder_target_data,target_token_index,val_target_texts


In [None]:
#Embedding data
# Training arrays plus the vocabularies derived from the training lines.
(
    encoder_input_data,
    decoder_input_data,
    decoder_target_data,
    num_encoder_tokens,
    num_decoder_tokens,
    input_token_index,
    target_token_index,
    max_encoder_seq_length,
    max_decoder_seq_length,
) = embedding_train(train_lines)

# Validation arrays, encoded with the training vocabularies.
(
    val_encoder_input_data,
    val_decoder_input_data,
    val_decoder_target_data,
    target_token_index,
    val_target_texts,
) = embedding_val(val_lines, num_decoder_tokens, input_token_index, target_token_index)

# Inverse maps (index -> character), used to turn predictions back into text.
reverse_input_char_index = {index: char for char, index in input_token_index.items()}
reverse_target_char_index = {index: char for char, index in target_token_index.items()}


Number of samples: 68217
Number of unique input tokens: 27
Number of unique output tokens: 49
Max sequence length for inputs: 30
Max sequence length for outputs: 28


In [None]:
# NOTE(review): this cell repeats, inline, what the loading cell and
# embedding_train() / embedding_val() already do, and overwrites the same
# module-level names (it also reshuffles train_lines and resets epochs).

batch_size = 64  # Batch size for training.
epochs = 2  # Number of epochs to train for.
latent_dim = 256  # Size of the recurrent hidden state (hyperparameter).
# Paths to the tab-separated lexicon files on disk.
train_data = "dakshina_dataset_v1.0/ta/lexicons/ta.translit.sampled.train.tsv"
val_data = "dakshina_dataset_v1.0/ta/lexicons/ta.translit.sampled.dev.tsv"
# Read both files into lists of lines. Blank lines are filtered out instead of
# blindly popping the last element, so a file that does not end with a newline
# cannot lose its final record.
with open(train_data, "r", encoding="utf-8") as f:
    train_lines = [line for line in f.read().split("\n") if line]
with open(val_data, "r", encoding="utf-8") as f:
    val_lines = [line for line in f.read().split("\n") if line]
random.shuffle(train_lines)
print(train_lines[0:2])

# ---- training data: collect texts and vocabularies ----
input_texts = []
target_texts = []
input_characters = set()
target_characters = set()
# Every remaining line is a sample; none is sliced off.
for line in train_lines:
    # File order is target first, input second; the third field is unused.
    target_text, input_text, _ = line.split("\t")
    # "\t" marks the start of a target sequence and "\n" its end.
    target_text = "\t" + target_text + "\n"
    input_texts.append(input_text)
    target_texts.append(target_text)
    input_characters.update(input_text)
    target_characters.update(target_text)
# The space character doubles as the padding symbol on both sides.
input_characters.add(" ")
target_characters.add(" ")
input_characters = sorted(input_characters)
target_characters = sorted(target_characters)
num_encoder_tokens = len(input_characters)
num_decoder_tokens = len(target_characters)
# Longest input word and longest target (markers included).
max_encoder_seq_length = max(len(txt) for txt in input_texts)
max_decoder_seq_length = max(len(txt) for txt in target_texts)

print("Number of samples:", len(input_texts))
print("Number of unique input tokens:", num_encoder_tokens)
print("Number of unique output tokens:", num_decoder_tokens)
print("Max sequence length for inputs:", max_encoder_seq_length)
print("Max sequence length for outputs:", max_decoder_seq_length)
# Character -> integer index, in sorted character order.
input_token_index = {char: i for i, char in enumerate(input_characters)}
print((input_token_index))
target_token_index = {char: i for i, char in enumerate(target_characters)}
print((target_token_index))

# ---- training data: index / one-hot arrays ----
# Index arrays start out fully padded; real characters overwrite the prefix.
encoder_input_data = np.full(
    (len(input_texts), max_encoder_seq_length), input_token_index[" "], dtype="float32"
)
decoder_input_data = np.full(
    (len(input_texts), max_decoder_seq_length), target_token_index[" "], dtype="float32"
)
# The target is one-hot encoded, one vector per timestep.
decoder_target_data = np.zeros(
    (len(input_texts), max_decoder_seq_length, num_decoder_tokens), dtype="float32"
)
for i, (input_text, target_text) in enumerate(zip(input_texts, target_texts)):
    for t, char in enumerate(input_text):
        encoder_input_data[i, t] = input_token_index[char]
    for t, char in enumerate(target_text):
        decoder_input_data[i, t] = target_token_index[char]
        if t > 0:
            # The target runs one timestep ahead of the decoder input and
            # therefore never contains the start marker.
            decoder_target_data[i, t - 1, target_token_index[char]] = 1.0
    # Pad the target from the step after the end marker onwards. The length is
    # taken from the text itself, not from a leaked loop variable.
    decoder_target_data[i, len(target_text) - 1:, target_token_index[" "]] = 1.0


# ---- validation data, encoded with the training vocabularies ----
val_input_texts = []
val_target_texts = []
for line in val_lines:
    target_text, input_text, _ = line.split("\t")
    target_text = "\t" + target_text + "\n"
    val_input_texts.append(input_text)
    val_target_texts.append(target_text)
val_max_encoder_seq_length = max(len(txt) for txt in val_input_texts)
val_max_decoder_seq_length = max(len(txt) for txt in val_target_texts)
val_encoder_input_data = np.full(
    (len(val_input_texts), val_max_encoder_seq_length), input_token_index[" "], dtype="float32"
)
val_decoder_input_data = np.full(
    (len(val_input_texts), val_max_decoder_seq_length), target_token_index[" "], dtype="float32"
)
val_decoder_target_data = np.zeros(
    (len(val_input_texts), val_max_decoder_seq_length, num_decoder_tokens), dtype="float32"
)
for i, (input_text, target_text) in enumerate(zip(val_input_texts, val_target_texts)):
    for t, char in enumerate(input_text):
        val_encoder_input_data[i, t] = input_token_index[char]
    for t, char in enumerate(target_text):
        val_decoder_input_data[i, t] = target_token_index[char]
        if t > 0:
            val_decoder_target_data[i, t - 1, target_token_index[char]] = 1.0
    val_decoder_target_data[i, len(target_text) - 1:, target_token_index[" "]] = 1.0

# Inverse maps (index -> character), used to turn predictions back into text.
reverse_input_char_index = {i: char for char, i in input_token_index.items()}
reverse_target_char_index = {i: char for char, i in target_token_index.items()}


In [None]:
def _recurrent_layer_spec(cell_type):
  """Return (Keras layer class, number of state tensors) for a cell-type name."""
  specs = {"RNN": ("SimpleRNN", 1), "GRU": ("GRU", 1), "LSTM": ("LSTM", 2)}
  key = str(cell_type).upper()
  if key not in specs:
    raise ValueError(
        "cell_type must be one of 'RNN', 'GRU' or 'LSTM', got %r" % (cell_type,))
  class_name, n_states = specs[key]
  return getattr(keras.layers, class_name), n_states


def _wandb_callbacks():
  """Return [WandbCallback()] when a wandb run is active, otherwise []."""
  try:
    if wandb.run is not None:
      return [WandbCallback()]
  except NameError:
    # wandb has not been imported in this kernel yet: train without logging.
    pass
  return []


def seq2seq(embedding_size, n_encoder_tokens, n_decoder_tokens, n_encoder_layers,
                 n_decoder_layers, latent_dimension, cell_type,
                 target_token_index, max_decoder_seq_length, reverse_target_char_index,
                 dropout,encoder_input_data, decoder_input_data,
                decoder_target_data,
                batch_size,epochs):
  """Build and train an encoder-decoder model, then derive inference models.

  The training model embeds both character sequences, stacks
  ``n_encoder_layers`` recurrent layers in the encoder and
  ``n_decoder_layers`` in the decoder, and ends in a softmax over the
  decoder vocabulary.  Every decoder layer starts from the final state of
  the last encoder layer.  After fitting, the trained layers are reused to
  build a stand-alone encoder model and a step-wise decoder model.

  Parameters
  ----------
  embedding_size : int
      Output width of both embedding layers.
  n_encoder_tokens, n_decoder_tokens : int
      Vocabulary sizes for the encoder and decoder embeddings / output layer.
  n_encoder_layers, n_decoder_layers : int
      Number of stacked recurrent layers (each must be >= 1).
  latent_dimension : int
      Units of every recurrent layer.
  cell_type : str
      "RNN", "GRU" or "LSTM" (case-insensitive).
  target_token_index, max_decoder_seq_length, reverse_target_char_index
      Accepted for interface compatibility; not used by this function.
  dropout : float
      Input dropout applied by every recurrent layer.
  encoder_input_data, decoder_input_data, decoder_target_data
      Training arrays passed to ``model.fit``.
  batch_size, epochs : int
      Passed to ``model.fit``.

  Returns
  -------
  (encoder_model, decoder_model)
      ``encoder_model`` maps an input sequence to the final encoder state(s).
      ``decoder_model`` takes the decoder input followed by one state input
      per decoder layer (two per layer for LSTM: h then c) and returns the
      token probabilities followed by the updated states in the same order.

  Raises
  ------
  ValueError
      For an unknown ``cell_type`` or a layer count below 1.
  """
  # Validate before any graph is built, so bad arguments fail fast instead of
  # falling through every branch and returning None.
  if n_encoder_layers < 1 or n_decoder_layers < 1:
    raise ValueError("n_encoder_layers and n_decoder_layers must both be >= 1")
  layer_cls, n_states = _recurrent_layer_spec(cell_type)

  # ----- encoder (training graph) -----
  encoder_inputs = keras.Input(shape=(None,), name='encoder_input')
  encoder_outputs = tf.keras.layers.Embedding(
      input_dim=n_encoder_tokens, output_dim=embedding_size,
      name='encoder_embedding')(encoder_inputs)
  encoder_states = None  # the first layer starts from its default state
  for i in range(1, n_encoder_layers + 1):
    encoder = layer_cls(latent_dimension, return_state=True, return_sequences=True,
                        name='encoder_hidden_%d' % i, dropout=dropout)
    # Layers after the first are initialised with the previous layer's state.
    result = encoder(encoder_outputs, initial_state=encoder_states)
    encoder_outputs, encoder_states = result[0], list(result[1:])

  # ----- decoder (training graph) -----
  decoder_inputs = keras.Input(shape=(None,), name='decoder_input')
  decoder_outputs = tf.keras.layers.Embedding(
      n_decoder_tokens, embedding_size, name='decoder_embedding')(decoder_inputs)
  for i in range(1, n_decoder_layers + 1):
    decoder = layer_cls(latent_dimension, return_sequences=True, return_state=True,
                        name='decoder_hidden_%d' % i, dropout=dropout)
    # Every decoder layer starts from the last encoder layer's final state.
    decoder_outputs = decoder(decoder_outputs, initial_state=encoder_states)[0]
  decoder_outputs = keras.layers.Dense(
      n_decoder_tokens, activation="softmax", name='decoder_output')(decoder_outputs)
  model = keras.Model([encoder_inputs, decoder_inputs], decoder_outputs)

  model.compile(
      optimizer="rmsprop", loss="categorical_crossentropy",
      metrics=['accuracy'])

  model.fit(
      [encoder_input_data, decoder_input_data],
      decoder_target_data,
      batch_size=batch_size,
      epochs=epochs,
      # Log to wandb only inside an active run, identically for all cell types.
      callbacks=_wandb_callbacks()
  )

  # ----- inference encoder: input sequence -> final state(s) -----
  encoder_model = keras.Model(model.input[0], encoder_states)

  # ----- inference decoder: one step at a time, states passed explicitly -----
  decoder_inputs = model.input[1]
  decoder_outputs = model.get_layer('decoder_embedding')(decoder_inputs)
  decoder_states_inputs = []
  decoder_states = []
  for j in range(1, n_decoder_layers + 1):
    # One placeholder per state tensor of this layer (h, plus c for LSTM).
    current_states_inputs = [keras.Input(shape=(latent_dimension,)) for _ in range(n_states)]
    decoder = model.get_layer('decoder_hidden_' + str(j))
    result = decoder(decoder_outputs, initial_state=current_states_inputs)
    decoder_outputs = result[0]
    decoder_states += list(result[1:])
    decoder_states_inputs += current_states_inputs
  decoder_outputs = model.get_layer('decoder_output')(decoder_outputs)
  decoder_model = keras.Model(
      [decoder_inputs] + decoder_states_inputs, [decoder_outputs] + decoder_states
  )
  return encoder_model, decoder_model
      

In [None]:
def decode_sequence(input_seq,n_decoder_layers,cell_type,encoder_model,decoder_model):
    """Greedily decode one input sequence into a target string.

    Reads the module-level ``target_token_index``,
    ``reverse_target_char_index`` and ``max_decoder_seq_length``.

    Parameters
    ----------
    input_seq : array
        Encoded input passed straight to ``encoder_model.predict``; the
        sampling loop assumes a batch of size 1.
    n_decoder_layers : int
        Number of decoder layers; the encoder state is repeated once per layer.
    cell_type : str or None
        Kept for interface compatibility. It is no longer needed because the
        state list is always passed to the decoder as one flat list.
    encoder_model, decoder_model
        Objects exposing ``predict``; the decoder must return the token
        probabilities first, followed by its updated states.

    Returns
    -------
    str
        The decoded characters, including the terminating "\\n" when one was
        produced before the length limit was hit.
    """
    # Encode the input. predict may hand back a single array (one state) or a
    # list of arrays (e.g. h and c); normalise to a list before repeating.
    encoder_states = encoder_model.predict(input_seq)
    if not isinstance(encoder_states, (list, tuple)):
        encoder_states = [encoder_states]
    # Flat list [layer1 states..., layer2 states..., ...]. Earlier code relied
    # on predict flattening a nested list for it.
    states_value = list(encoder_states) * n_decoder_layers

    # Length-1 target sequence holding the start marker.
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = target_token_index["\t"]

    decoded_sentence = ""
    while True:
        outputs = decoder_model.predict([target_seq] + states_value)
        output_tokens, states_value = outputs[0], list(outputs[1:])

        # Greedy choice: most probable token at the last timestep.
        sampled_token_index = int(np.argmax(output_tokens[0, -1, :]))
        sampled_char = reverse_target_char_index[sampled_token_index]
        decoded_sentence += sampled_char

        # Stop on the end marker or once the output exceeds the length limit.
        if sampled_char == "\n" or len(decoded_sentence) > max_decoder_seq_length:
            return decoded_sentence

        # Feed the sampled token back in as the next decoder input.
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index

In [None]:
def accuracy(val_encoder_input_data, val_target_texts,n_decoder_layers,encoder_model,decoder_model, verbose=False, cell_type='LSTM'):
    """Word-level exact-match accuracy (in percent) over a set of sequences.

    Each input row is decoded with ``decode_sequence`` and compared, after
    stripping surrounding whitespace, with the matching entry of
    ``val_target_texts``.

    Parameters
    ----------
    val_encoder_input_data : array
        Encoded inputs, one row per sample.
    val_target_texts : list of str
        Ground-truth targets, aligned with the input rows.
    n_decoder_layers : int
        Forwarded to ``decode_sequence``.
    encoder_model, decoder_model
        Inference models forwarded to ``decode_sequence``.
    verbose : bool, optional
        Print every prediction next to its ground truth.
    cell_type : str, optional
        Forwarded to ``decode_sequence``; defaults to 'LSTM', the value that
        used to be hard-coded here.

    Returns
    -------
    float
        Percentage of exact matches; 0.0 when there are no samples.
    """
    n_total = len(val_encoder_input_data)
    # Avoid a ZeroDivisionError on an empty evaluation set.
    if n_total == 0:
        return 0.0

    n_correct = 0
    for seq_index in range(n_total):
        # Slice rather than index, so the row keeps its leading batch axis.
        input_seq = val_encoder_input_data[seq_index: seq_index + 1]
        prediction = decode_sequence(
            input_seq, n_decoder_layers, cell_type, encoder_model, decoder_model).strip()
        truth = val_target_texts[seq_index].strip()

        if prediction == truth:
            n_correct += 1

        if verbose:
            print('Prediction ', prediction, ',Ground Truth ', truth)

    return n_correct * 100.0 / n_total

In [None]:
# parameters
# Embedding width and recurrent hidden size.
embedding_size = 256
latent_dimension = 256
# Vocabulary sizes come from the preprocessing step.
n_encoder_tokens = num_encoder_tokens
n_decoder_tokens = num_decoder_tokens
# Depth of the encoder and decoder stacks.
n_encoder_layers, n_decoder_layers = 3, 3
# Recurrent cell used for this run.
cell_type = 'GRU'
# These three re-bind existing globals to themselves (no change in value).
target_token_index = target_token_index
max_decoder_seq_length = max_decoder_seq_length
reverse_target_char_index = reverse_target_char_index
# Regularisation and training length.
dropout = 0.5
epochs = 25

In [None]:
# Train one model with the hyperparameters set above (cell_type picks the cell).
encoder_model, decoder_model = seq2seq(
    embedding_size=embedding_size,
    n_encoder_tokens=num_encoder_tokens,
    n_decoder_tokens=num_decoder_tokens,
    n_encoder_layers=n_encoder_layers,
    n_decoder_layers=n_decoder_layers,
    latent_dimension=latent_dimension,
    cell_type=cell_type,
    target_token_index=target_token_index,
    max_decoder_seq_length=max_decoder_seq_length,
    reverse_target_char_index=reverse_target_char_index,
    dropout=dropout,
    encoder_input_data=encoder_input_data,
    decoder_input_data=decoder_input_data,
    decoder_target_data=decoder_target_data,
    batch_size=batch_size,
    epochs=epochs,
)

# Optional (disabled): exact-match accuracy on the validation set.
# val_accuracy = accuracy(val_encoder_input_data, val_target_texts, n_decoder_layers, encoder_model, decoder_model)
# print('Validation accuracy: ', val_accuracy)

In [None]:
!pip install wandb
import wandb
from wandb.keras import WandbCallback

In [None]:
def fit():
  """Run one wandb trial: train a seq2seq model and log its validation accuracy.

  Hyperparameters are read from ``wandb.config``; when the function is not
  driven by a sweep, the values in ``config_defaults`` are used. The trained
  encoder/decoder models are evaluated with ``accuracy`` on the validation
  data, and the result is logged under the key ``val_accuracy`` (the metric
  the sweep configuration optimises).

  Reads the module-level data arrays, vocabularies and ``batch_size``.
  Returns nothing.
  """
  config_defaults = {
            "cell_type":'LSTM',
            "num_encoder_layers":2,
            "num_decoder_layers":3,
            "embedding_size":256,
            "latent_dimension":256,
            "dropout":0.2,
            "epochs":25
        }
  wandb.init(config=config_defaults)

  config = wandb.config

  cell_type = config.cell_type
  n_encoder_layers = config.num_encoder_layers
  n_decoder_layers = config.num_decoder_layers
  embedding_size = config.embedding_size
  latent_dimension = config.latent_dimension
  dropout = config.dropout
  epochs = config.epochs

  run_name = "cell_type_{}_nel_{}_ndl_{}_drop_{}_emd_{}_ld_{}".format(cell_type, n_encoder_layers, n_decoder_layers, dropout, embedding_size, latent_dimension )

  # Name the run before the long training step. Previously this happened only
  # after evaluation, so a run that was interrupted or crashed kept its
  # auto-generated name and could not be matched to its hyperparameters.
  wandb.run.name = run_name
  wandb.run.save()

  encoder_model, decoder_model = seq2seq(embedding_size, num_encoder_tokens, num_decoder_tokens, n_encoder_layers, n_decoder_layers, latent_dimension,
                cell_type, target_token_index, max_decoder_seq_length, reverse_target_char_index, dropout, encoder_input_data, decoder_input_data,
                decoder_target_data, batch_size, epochs)

  val_accuracy = accuracy(val_encoder_input_data, val_target_texts, n_decoder_layers, encoder_model, decoder_model)
  print("Validation Accuracy:", val_accuracy)
  wandb.log({'val_accuracy': val_accuracy})
  wandb.run.finish()
  

In [None]:
# Sweep definition: Bayesian search that maximises the 'val_accuracy' value
# logged by each run, choosing every hyperparameter from a discrete list.
sweep_config = {
    'method': 'bayes',  # grid, random
    'metric': {'name': 'val_accuracy', 'goal': 'maximize'},
    'parameters': {
        hyperparameter: {'values': choices}
        for hyperparameter, choices in (
            ('embedding_size', [64, 128, 256]),
            ('num_encoder_layers', [1, 2, 3]),
            ('num_decoder_layers', [1, 2, 3]),
            ('latent_dimension', [64, 256, 512]),
            ('cell_type', ['RNN', 'GRU', 'LSTM']),
            ('dropout', [0.3, 0.4, 0.5, 0.0, 0.2]),
            ('epochs', [25, 20, 30]),
        )
    },
}

# Creating a fresh sweep and running ten trials was done with (disabled):
#   sweep_id = wandb.sweep(sweep_config, entity="anandh", project="CS6910_Assignment3_S2S")
#   wandb.agent(sweep_id, fit, count=10)
# Instead, attach an agent to a hard-coded sweep id (presumably the id returned
# by an earlier wandb.sweep call — confirm) and run six more trials of fit().
sweep_id = "5y0u4iyv"
wandb.agent(
    sweep_id,
    fit,
    entity="anandh",
    project="CS6910_Assignment3_S2S",
    count=6,
)


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 8unjpy35 with config:
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	embedding_size: 128
[34m[1mwandb[0m: 	epochs: 20
[34m[1mwandb[0m: 	latent_dimension: 512
[34m[1mwandb[0m: 	num_decoder_layers: 2
[34m[1mwandb[0m: 	num_encoder_layers: 1
[34m[1mwandb[0m: Currently logged in as: [33manandh[0m (use `wandb login --relogin` to force relogin)


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


[34m[1mwandb[0m: [32m[41mERROR[0m Error while calling W&B API: context deadline exceeded (<Response [500]>)


Validation Accuracy: 12.701435687078817


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
val_accuracy,▁

0,1
val_accuracy,12.70144


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: u6apf3ma with config:
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dropout: 0.4
[34m[1mwandb[0m: 	embedding_size: 64
[34m[1mwandb[0m: 	epochs: 20
[34m[1mwandb[0m: 	latent_dimension: 64
[34m[1mwandb[0m: 	num_decoder_layers: 2
[34m[1mwandb[0m: 	num_encoder_layers: 3


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Validation Accuracy: 23.893934954585408


VBox(children=(Label(value='0.003 MB of 0.003 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁▃▄▅▅▆▆▆▇▇▇▇▇▇██████
epoch,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
loss,█▆▅▄▄▃▃▃▂▂▂▂▂▁▁▁▁▁▁▁
val_accuracy,▁

0,1
accuracy,0.93414
epoch,19.0
loss,0.22777
val_accuracy,23.89393


[34m[1mwandb[0m: Agent Starting Run: zydnwy8a with config:
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	embedding_size: 256
[34m[1mwandb[0m: 	epochs: 30
[34m[1mwandb[0m: 	latent_dimension: 512
[34m[1mwandb[0m: 	num_decoder_layers: 2
[34m[1mwandb[0m: 	num_encoder_layers: 1


Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
Validation Accuracy: 7.588631702314679


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
val_accuracy,▁

0,1
val_accuracy,7.58863


[34m[1mwandb[0m: Agent Starting Run: iz740uz2 with config:
[34m[1mwandb[0m: 	cell_type: RNN
[34m[1mwandb[0m: 	dropout: 0.5
[34m[1mwandb[0m: 	embedding_size: 64
[34m[1mwandb[0m: 	epochs: 20
[34m[1mwandb[0m: 	latent_dimension: 64
[34m[1mwandb[0m: 	num_decoder_layers: 1
[34m[1mwandb[0m: 	num_encoder_layers: 2


Embed done
Starting 2nd
Ending 2nd
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Validation Accuracy: 0.0


VBox(children=(Label(value='0.002 MB of 0.002 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁▅▅▆▆▇▇▇▇▇▇█████████
epoch,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
loss,█▄▃▃▃▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁
val_accuracy,▁

0,1
accuracy,0.85468
epoch,19.0
loss,0.49006
val_accuracy,0.0


[34m[1mwandb[0m: Agent Starting Run: uzrsug3u with config:
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	dropout: 0
[34m[1mwandb[0m: 	embedding_size: 128
[34m[1mwandb[0m: 	epochs: 25
[34m[1mwandb[0m: 	latent_dimension: 512
[34m[1mwandb[0m: 	num_decoder_layers: 3
[34m[1mwandb[0m: 	num_encoder_layers: 1


Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
Validation Accuracy: 8.54087313214181


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
val_accuracy,▁

0,1
val_accuracy,8.54087


[34m[1mwandb[0m: Agent Starting Run: va0p15cl with config:
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	dropout: 0.5
[34m[1mwandb[0m: 	embedding_size: 64
[34m[1mwandb[0m: 	epochs: 30
[34m[1mwandb[0m: 	latent_dimension: 512
[34m[1mwandb[0m: 	num_decoder_layers: 3
[34m[1mwandb[0m: 	num_encoder_layers: 3


Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


[34m[1mwandb[0m: Ctrl + C detected. Stopping sweep.


In [None]:
# One-off run with hand-picked settings (256-d embedding, 2 encoder layers,
# 3 decoder layers, LSTM cells, dropout 0.2), followed by validation accuracy.
# NOTE(review): `latent_dim` is not assigned in the visible cells (the
# parameters cell defines `latent_dimension`) — confirm it exists on a fresh
# kernel. No epochs argument is passed here, so presumably seq2seq falls back
# to its own default — confirm.
encoder_model, decoder_model = seq2seq(
    256,
    num_encoder_tokens,
    num_decoder_tokens,
    2,
    3,
    latent_dim,
    'LSTM',
    target_token_index,
    max_decoder_seq_length,
    reverse_target_char_index,
    0.2,
    encoder_input_data,
    decoder_input_data,
    decoder_target_data,
    batch_size,
)

# The literal 3 is the decoder layer count used for the model above.
val_accuracy = accuracy(
    val_encoder_input_data,
    val_target_texts,
    3,
    encoder_model,
    decoder_model,
)
print('Validation accuracy: ', val_accuracy)

Epoch 1/2
Epoch 2/2
Validation accuracy:  21.725754468209786


In [None]:
def beam_search(input_seq,encoder_model,decoder_model, beam_size,n_decoder_layers,cell_type):
    """Decode one input sequence with beam search.

    Every hypothesis carries its own decoder state. The previous version kept
    states in a list indexed by beam position and did not reorder that list
    when the top-k hypotheses were re-ranked, so a hypothesis could be
    extended using the state of a different hypothesis.

    Parameters
    ----------
    input_seq : passed unchanged to ``encoder_model.predict``.
    encoder_model : object with ``predict``; its output seeds the decoder state.
    decoder_model : object with ``predict``; must return a list whose first
        element holds token probabilities (indexed ``[0, -1, token]``) and
        whose remaining elements are the next decoder states.
    beam_size : number of hypotheses kept after each step.
    n_decoder_layers : number of copies of the encoder output used as the
        initial decoder state.
    cell_type : when ``'rnn'`` or ``'gru'`` (case-insensitive) the state list
        is passed to the decoder as one nested input; otherwise (including
        ``None``) its elements are appended to the input list individually.

    Returns
    -------
    list of ``(text, score)`` tuples, lowest score first. ``text`` contains
    every decoded character including the leading start character; ``score``
    is the summed negative log-probability of the hypothesis.

    Reads the module-level ``target_token_index``,
    ``reverse_target_char_index`` and ``max_decoder_seq_length``.
    """
    # A hypothesis is frozen once it emits the end character or padding.
    end_tokens = (target_token_index["\n"], target_token_index[" "])
    nested_states = cell_type is not None and cell_type.lower() in ('rnn', 'gru')

    # Encode the input; the same encoder output seeds every decoder layer.
    initial_state = [encoder_model.predict(input_seq)] * n_decoder_layers
    # Each beam entry is (token indices, accumulated cost, decoder state).
    beams = [([target_token_index["\t"]], 0.0, initial_state)]

    step = 0
    while True:
        step += 1
        candidates = []
        for seq, score, state in beams:
            if seq[-1] in end_tokens:
                # Finished hypotheses compete unchanged with the new ones.
                candidates.append((seq, score, state))
                continue

            target_seq = np.zeros((1, 1))
            target_seq[0, 0] = seq[-1]
            if nested_states:
                outputs = decoder_model.predict([target_seq] + [state])
            else:
                outputs = decoder_model.predict([target_seq] + state)
            token_probs, next_state = outputs[0][0, -1, :], outputs[1:]

            for token, prob in enumerate(token_probs):
                # A probability of exactly 0 would make math.log raise; give
                # such a token infinite cost so it simply ranks last.
                cost = -math.log(prob) if prob > 0 else math.inf
                candidates.append((seq + [token], score + cost, next_state))

        # Keep the beam_size cheapest hypotheses. The sort is stable, so ties
        # keep their generation order.
        candidates.sort(key=lambda candidate: candidate[1])
        beams = candidates[:beam_size]

        # Stop at the length limit, or once no hypothesis can be extended.
        if step > max_decoder_seq_length:
            break
        if all(seq[-1] in end_tokens for seq, _, _ in beams):
            break

    # Convert token indices back to characters.
    return [
        (''.join(reverse_target_char_index[index] for index in seq), score)
        for seq, score, _ in beams
    ]

In [None]:
# Retrain with the settings from the parameters cell; the returned encoder and
# decoder models are the ones evaluated on the test set.
encoder_model, decoder_model = seq2seq(
    embedding_size,
    num_encoder_tokens,
    num_decoder_tokens,
    n_encoder_layers,
    n_decoder_layers,
    latent_dimension,
    cell_type,
    target_token_index,
    max_decoder_seq_length,
    reverse_target_char_index,
    dropout,
    encoder_input_data,
    decoder_input_data,
    decoder_target_data,
    batch_size,
    epochs,
)
# compute test accuracy
print('Reading test data')
test_data = "dakshina_dataset_v1.0/ta/lexicons/ta.translit.sampled.test.tsv"
# Read every non-empty line. Splitting on "\n" leaves an empty string after the
# file's final newline; dropping empty strings here removes it, so no further
# trimming is needed. (The earlier code popped that empty entry and then also
# sliced off one more line, silently discarding the last real test example.)
with open(test_data, "r", encoding="utf-8") as f:
    test_lines = [line for line in f.read().split("\n") if line]

# Each record has three tab-separated fields: target text, input text, and a
# third field that is ignored here.
test_input_texts = []
test_target_texts = []
for line in test_lines:
    target_text, input_text, _ = line.split("\t")
    # We use "tab" as the "start sequence" character
    # for the targets, and "\n" as "end sequence" character.
    target_text = "\t" + target_text + "\n"
    test_input_texts.append(input_text)
    test_target_texts.append(target_text)

test_max_encoder_seq_length = max([len(txt) for txt in test_input_texts])
test_max_decoder_seq_length = max([len(txt) for txt in test_target_texts])

# Encoder and decoder inputs hold one token index per position; the decoder
# target is one-hot over the decoder vocabulary.
test_encoder_input_data = np.zeros(
    (len(test_input_texts), test_max_encoder_seq_length), dtype="float32"
)
test_decoder_input_data = np.zeros(
    (len(test_input_texts), test_max_decoder_seq_length), dtype="float32"
)
test_decoder_target_data = np.zeros(
    (len(test_input_texts), test_max_decoder_seq_length, num_decoder_tokens), dtype="float32"
)

for i, (input_text, target_text) in enumerate(zip(test_input_texts, test_target_texts)):
    for t, char in enumerate(input_text):
        test_encoder_input_data[i, t] = input_token_index[char]
    # Pad the remainder with the space token. Offsets come from the text length
    # rather than the leaked loop variable, which is stale for an empty text.
    test_encoder_input_data[i, len(input_text):] = input_token_index[" "]

    for t, char in enumerate(target_text):
        # decoder_target_data is ahead of decoder_input_data by one timestep
        test_decoder_input_data[i, t] = target_token_index[char]
        if t > 0:
            # decoder_target_data will be ahead by one timestep
            # and will not include the start character.
            test_decoder_target_data[i, t - 1, target_token_index[char]] = 1.0
    test_decoder_input_data[i, len(target_text):] = target_token_index[" "]
    # The target is shifted left by one, so its padding starts one step earlier.
    test_decoder_target_data[i, len(target_text) - 1:, target_token_index[" "]] = 1.0

print('Calculating test accuracy')
# Exact-match accuracy (in percent) of the best beam hypothesis, keyed by beam
# width. One CSV of per-word predictions is written for each width.
test_accuracy = {}
for beamSize in range(1, 4):
    # Collect rows in a list and build the frame once. The earlier code declared
    # columns ('Source', 'Predictions', 'GroundTruth') that did not match the
    # row keys, leaving two empty columns in the CSV, and grew the frame with
    # DataFrame.append, which is quadratic and no longer exists in pandas 2.x.
    records = []
    n_correct = 0
    n_total = 0
    for seq_index in range(len(test_encoder_input_data)):
        decoded_sentence = beam_search(
            test_encoder_input_data[seq_index:seq_index + 1],
            encoder_model, decoder_model, beamSize, n_decoder_layers, cell_type,
        )
        # beam_search returns hypotheses best-first; strip() removes the
        # start/end marker characters before comparing.
        prediction = decoded_sentence[0][0].strip()
        ground_truth = test_target_texts[seq_index].strip()

        if ground_truth == prediction:
            n_correct += 1
        n_total += 1

        records.append({
            'SourceText': test_input_texts[seq_index].strip(),
            'GroundTruth': ground_truth,
            'Prediction': prediction,
        })

    df = pd.DataFrame(records, columns=['SourceText', 'Prediction', 'GroundTruth'])
    df.to_csv('predictions_' + str(beamSize) + '.csv', index=False)
    # Guard against an empty test set instead of dividing by zero.
    test_accuracy[beamSize] = (n_correct * 100.0 / n_total) if n_total else 0.0
print('Test accuracy ', test_accuracy)

import time

# Send each beam width's predictions file to the browser via Colab's download
# helper. The 30-second pause between files is presumably there to let one
# download start before the next is triggered — confirm.
for beamSize in (1, 2, 3):
    files.download(f"predictions_{beamSize}.csv")
    time.sleep(30)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
Reading test data
Calculating test accuracy
Test accuracy  {1: 49.04560687745884, 2: 49.147603089028124, 3: 49.16217397639516}


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>