In [1]:
import io
import csv
import numpy as np
import tensorflow as tf
from random import randint
import keras.backend as K
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from numpy import array, argmax, array_equal 
from tensorflow.keras import backend as K
from tensorflow.keras import models, Input
from tensorflow.keras.models import Sequential, Model, load_model
from tensorflow.keras.layers import LSTM, Bidirectional, SimpleRNN, GRU, Dense, Flatten, TimeDistributed, RepeatVector, Lambda
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.utils import plot_model
from tensorflow.keras.optimizers import Adam
tf.keras.backend.set_floatx('float64')

In [None]:
#%pip install wandb -q
#import wandb
#from wandb.keras import WandbCallback

In [None]:
#wandb.login()

In [None]:
#wandb.init(project="Assignment 3", entity="shubham-argha")

In [2]:
# Dataset
!yes | wget "https://storage.googleapis.com/gresearch/dakshina/dakshina_dataset_v1.0.tar"

--2022-05-08 10:58:51--  https://storage.googleapis.com/gresearch/dakshina/dakshina_dataset_v1.0.tar
Resolving storage.googleapis.com (storage.googleapis.com)... 209.85.147.128, 142.250.125.128, 142.250.136.128, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|209.85.147.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2008340480 (1.9G) [application/x-tar]
Saving to: ‘dakshina_dataset_v1.0.tar’


2022-05-08 10:59:07 (114 MB/s) - ‘dakshina_dataset_v1.0.tar’ saved [2008340480/2008340480]



In [3]:
# Extract the downloaded tar archive into the current working directory
# (the archive is a plain .tar, not a zip file).
!yes | tar xopf dakshina_dataset_v1.0.tar

In [4]:
# Locations of the Hindi ("hi") transliteration lexicons inside the extracted archive.
# NOTE: despite the *_dir names, both values are paths to .tsv files, not directories.
train_dir = "./dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.train.tsv"
test_dir = "./dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.test.tsv"

In [5]:
# reads text document and returns a list of lists comprising the romanized and native versions of the words
def read(f):
    """Read a tab-separated lexicon file into two parallel word lists.

    Each usable line contributes its first column to the native-script list
    and its second column to the romanized list. Lines without a tab, and
    lines that have fewer than two columns once trailing whitespace is
    stripped, are skipped.

    Args:
        f: Path of the UTF-8 encoded TSV file.

    Returns:
        A tuple ``(latin, hindi)`` of equally long lists, where ``latin[i]``
        is the second column and ``hindi[i]`` the first column of the i-th
        usable line.
    """
    hindi = []
    latin = []
    # Use a separate name for the handle so the path argument `f` is not shadowed.
    with io.open(f, encoding='utf-8') as handle:
        for line in handle:
            if '\t' not in line:
                continue
            tokens = line.rstrip().split("\t")
            # rstrip() also removes a trailing tab (e.g. "word\t\n"), which
            # leaves a single token; skip such rows instead of raising IndexError.
            if len(tokens) < 2:
                continue
            latin.append(tokens[1])
            hindi.append(tokens[0])
    return latin, hindi

In [6]:
# Load both splits. read() returns (romanized words, native-script words), so
# *_src holds the Latin-script inputs and *_tar the native-script targets.
tr_src, tr_tar = read(train_dir)
tst_src, tst_tar = read(test_dir)

In [7]:
# Sanity check: report how many word pairs each split contains.
print("Number of training samples: ", len(tr_src))
print("Number of testing samples: ", len(tst_src))

Number of training samples:  44204
Number of testing samples:  4502


In [8]:
# Random row orders for the two splits. The generator is seeded so that the
# shuffled order -- and therefore which samples survive the truncation to a
# fixed number of rows further down -- is the same on every run.
SHUFFLE_SEED = 42
_shuffle_rng = np.random.default_rng(SHUFFLE_SEED)
A = _shuffle_rng.permutation(len(tr_src))    # shuffled indices into the training split
A1 = _shuffle_rng.permutation(len(tst_src))  # shuffled indices into the test split

In [9]:
# Character vocabularies for the source and target side (sets at this point).
char_inp, char_tar = set(), set()
# Source / target words of the training split in their original file order.
ns_txt_inp, ns_txt_tar = [], []
# Source / target words of the test split in their original file order.
ns_txt_inp_tst, ns_txt_tar_tst = [], []

In [10]:
# Training split: wrap every target word in the "B" ... "E" markers, keep the
# word pair, and add all of its characters to the two vocabularies.
for txt_inp, txt_tar in zip(tr_src, tr_tar):
    txt_tar = f"B{txt_tar}E"
    ns_txt_inp.append(txt_inp)
    ns_txt_tar.append(txt_tar)
    # set.update adds each character of the string; duplicates are ignored.
    char_inp.update(txt_inp)
    char_tar.update(txt_tar)

In [11]:
# Test split: same treatment as the training split. Test characters are added
# to the vocabularies as well, so every test character has an index later on.
for txt_inp, txt_tar in zip(tst_src, tst_tar):
    txt_tar = f"B{txt_tar}E"
    ns_txt_inp_tst.append(txt_inp)
    ns_txt_tar_tst.append(txt_tar)
    # set.update adds each character of the string; duplicates are ignored.
    char_inp.update(txt_inp)
    char_tar.update(txt_tar)

In [12]:
# Training words re-ordered by the shuffled index array A (one entry per
# training sample), keeping inputs and targets aligned.
inps_txt = [ns_txt_inp[A[pos]] for pos in range(len(tr_src))]
tars_txt = [ns_txt_tar[A[pos]] for pos in range(len(tr_src))]


In [13]:
# Test words re-ordered by the shuffled index array A1 (one entry per test
# sample), keeping inputs and targets aligned.
inps_txt_tst = [ns_txt_inp_tst[A1[pos]] for pos in range(len(tst_src))]
txt_tar_tst = [ns_txt_tar_tst[A1[pos]] for pos in range(len(tst_src))]


In [14]:
# Register the padding character " " in both vocabularies and freeze them into
# sorted lists so that every character gets a stable position.
# Going through set(...) keeps this cell re-runnable: after the first run the
# names hold lists, on which the previous `.add(" ")` call raised AttributeError.
char_inp = sorted(set(char_inp) | {" "})
char_tar = sorted(set(char_tar) | {" "})

In [15]:
# Vocabulary sizes; these are the widths of the one-hot vectors on the
# encoder (source) and decoder (target) side.
enc_tok_num = len(char_inp)
dec_tok_num = len(char_tar)

In [16]:
# Longest source / target word per split. Target lengths include the "B" and
# "E" markers because they are measured on the wrapped target strings.
len_max_enc = max(len(txt) for txt in inps_txt)
len_max_dec = max(len(txt) for txt in tars_txt)
len_max_enc_tst = max(len(txt) for txt in inps_txt_tst)
# Measured on the test *targets*; this was previously computed from the test
# inputs, which merely duplicated len_max_enc_tst.
len_max_dec_tst = max(len(txt) for txt in txt_tar_tst)

In [17]:
# Character -> integer index lookup for each vocabulary; the index is the
# character's position in the sorted vocabulary list.
tok_ind_inp = {ch: pos for pos, ch in enumerate(char_inp)}
tok_ind_tar = {ch: pos for pos, ch in enumerate(char_tar)}

In [18]:
# Inverse lookups (integer index -> character), used to turn predicted
# indices back into text.
reverse_source_char_index = {pos: ch for ch, pos in tok_ind_inp.items()}
reverse_target_char_index = {pos: ch for ch, pos in tok_ind_tar.items()}

In [19]:
# Keep only the first 44160 shuffled training pairs.
# NOTE(review): 44160 = 690 * 64. The model below bakes a constant of
# batch_size rows into its graph, so the sample count presumably has to be a
# multiple of the batch size -- confirm before changing either number.
inp_txt_trnc = inps_txt[:44160]
tar_txt_trnc = tars_txt[:44160]

In [20]:
# Zero-filled one-hot tensors for the training set, laid out as
# (sample, character position, vocabulary index).
enc_inp = np.zeros(shape=(len(inp_txt_trnc), len_max_enc, enc_tok_num), dtype=np.float64)
dec_tar = np.zeros(shape=(len(inp_txt_trnc), len_max_dec, dec_tok_num), dtype=np.float64)

In [21]:
# One-hot encode every training pair and fill the remaining positions with the
# padding character " ".
for i, (txt_inp, txt_tar) in enumerate(zip(inp_txt_trnc, tar_txt_trnc)):
    for t, char in enumerate(txt_inp):
        enc_inp[i, t, tok_ind_inp[char]] = 1.0
    # Pad from len(word) onwards. Using the word length instead of the leaked
    # loop variable `t + 1` gives the same slice for non-empty words and stays
    # correct for an empty word, where `t` would be stale or undefined.
    enc_inp[i, len(txt_inp):, tok_ind_inp[" "]] = 1.0
    for t, char in enumerate(txt_tar):
        dec_tar[i, t, tok_ind_tar[char]] = 1.0
    dec_tar[i, len(txt_tar):, tok_ind_tar[" "]] = 1.0

In [22]:
# Zero-filled one-hot tensors for the test set. They reuse the training
# maximum lengths (len_max_enc / len_max_dec) as their second dimension.
inp_tst_enc_dt = np.zeros(shape=(len(inps_txt_tst), len_max_enc, enc_tok_num), dtype=np.float64)
tar_tst_dec_dt = np.zeros(shape=(len(txt_tar_tst), len_max_dec, dec_tok_num), dtype=np.float64)

In [23]:
# One-hot encode every test pair and fill the remaining positions with the
# padding character " ".
for i, (txt_inp, txt_tar) in enumerate(zip(inps_txt_tst, txt_tar_tst)):
    for t, char in enumerate(txt_inp):
        inp_tst_enc_dt[i, t, tok_ind_inp[char]] = 1.0
    # Pad from len(word) onwards. Using the word length instead of the leaked
    # loop variable `t + 1` gives the same slice for non-empty words and stays
    # correct for an empty word, where `t` would be stale or undefined.
    inp_tst_enc_dt[i, len(txt_inp):, tok_ind_inp[" "]] = 1.0

    for t, char in enumerate(txt_tar):
        tar_tst_dec_dt[i, t, tok_ind_tar[char]] = 1.0
    tar_tst_dec_dt[i, len(txt_tar):, tok_ind_tar[" "]] = 1.0

In [24]:
class BahdanauAttention(tf.keras.layers.Layer):
  """Additive attention layer: score = V(tanh(W1(query) + W2(values))).

  `units` is the width of the two projection layers W1 and W2; V maps each
  projected position to a single score.
  """

  def __init__(self, units):
    super().__init__()
    self.W1 = tf.keras.layers.Dense(units)
    self.W2 = tf.keras.layers.Dense(units)
    self.V = tf.keras.layers.Dense(1)

  def call(self, query, values):
    """Return (context vector, attention weights) for one query.

    Assumes `query` is (batch, features) and `values` is
    (batch, positions, features) -- TODO confirm against the caller.
    """
    # Insert an axis at position 1 so the projected query broadcasts against
    # every position of `values`.
    query_expanded = tf.expand_dims(query, 1)

    projected = self.W1(query_expanded) + self.W2(values)
    scores = self.V(tf.nn.tanh(projected))

    # Normalise the scores over axis 1, then take the weighted sum of
    # `values` along that same axis.
    weights = tf.nn.softmax(scores, axis=1)
    context = tf.reduce_sum(weights * values, axis=1)

    return context, weights

In [26]:
class model_with_attention(object):
  """Character-level encoder-decoder with additive attention.

  The constructor only stores hyper-parameters. `fit` builds the Keras graph,
  trains it, scores it on the notebook's test tensors and writes the
  predictions to ``predictions_attention.tsv``.

  The methods read these notebook-level names: ``len_max_enc``,
  ``len_max_dec``, ``enc_tok_num``, ``dec_tok_num``, ``inp_tst_enc_dt``,
  ``tar_tst_dec_dt``, ``inps_txt_tst``, ``txt_tar_tst``, ``tok_ind_tar``,
  ``reverse_target_char_index`` and the ``BahdanauAttention`` layer.

  Args:
    Type: Recurrent cell type, one of 'LSTM', 'GRU' or 'RNN'.
    hid_layer_size: Number of units in the encoder, decoder and attention layers.
    l_r: Learning rate passed to the Adam optimizer.
    drop_prob: Dropout rate of the recurrent layers.
    number_of_epochs: Number of training epochs.
    batch_size: Batch size for training and prediction.
    attn: Attention variant; only 'bahdanau' is implemented.
  """

  def __init__(self, Type = 'RNN', hid_layer_size=32, l_r= 1e-3, drop_prob = 0.3, number_of_epochs = 10, batch_size = 32, attn = 'bahdanau'):

    self.Type = Type
    self.hid_layer_size = hid_layer_size
    self.l_r = l_r
    self.drop_prob = drop_prob
    self.number_of_epochs = number_of_epochs
    self.batch_size = batch_size
    self.attn = attn

  def fit(self, enc_inp, dec_tar):
    """Build, train and evaluate the model.

    Args:
      enc_inp: One-hot encoder inputs used for training.
      dec_tar: One-hot decoder targets used for training.

    Raises:
      ValueError: If `Type` or `attn` holds an unsupported value.

    Side effects: stores the trained model in ``self.model``, writes
    ``predictions_attention.tsv`` and prints the exact-match test accuracy.
    """
    # Validate first so a bad configuration fails with a clear message rather
    # than a NameError in the middle of graph construction.
    self._validate_config()
    model = self._build_model()

    model.fit(enc_inp, dec_tar,
              batch_size=self.batch_size,
              epochs=self.number_of_epochs,
              #callbacks = [WandbCallback()]
              )

    # Keep a handle on the trained model so it can be reused after fit().
    self.model = model
    self._evaluate(model)

  def _validate_config(self):
    # Reject hyper-parameter values for which no graph can be built.
    if self.Type not in ('LSTM', 'GRU', 'RNN'):
      raise ValueError("Type must be 'LSTM', 'GRU' or 'RNN', got %r" % (self.Type,))
    if self.attn != 'bahdanau':
      raise ValueError("attn must be 'bahdanau', got %r" % (self.attn,))

  def _build_model(self):
    # Assemble and compile the attention encoder-decoder graph.
    # Built inside the method so the Keras names are only needed at call time.
    recurrent_layers = {'LSTM': (LSTM, 'lstm'), 'GRU': (GRU, 'gru'), 'RNN': (SimpleRNN, 'rnn')}
    layer_cls, suffix = recurrent_layers[self.Type]

    # Encoder: returns the full output sequence plus its final state(s).
    enc_inps = Input(shape=(len_max_enc, enc_tok_num), name='encoder_inputs')
    encoder = layer_cls(self.hid_layer_size, return_sequences=True, return_state=True,
                        dropout=self.drop_prob, name='encoder_' + suffix)
    enc_result = encoder(enc_inps)
    enc_outs = enc_result[0]
    states = list(enc_result[1:])  # [h, c] for LSTM, [h] for GRU / SimpleRNN

    # Attention layer
    attn = BahdanauAttention(self.hid_layer_size)

    # Decoder layers: one recurrent layer and one softmax projection, both
    # re-applied at every output position.
    decoder = layer_cls(self.hid_layer_size, dropout=self.drop_prob, return_state=True,
                        name='decoder_' + suffix)
    dec_den = Dense(dec_tok_num, activation='softmax', name='decoder_dense')

    # First decoder input: a constant one-hot on index 0 of the target vocabulary.
    # NOTE(review): with the sorted vocabulary this is presumably the padding
    # character " " rather than the "B" start marker -- confirm this is intended.
    # The constant has exactly batch_size rows, so every batch fed to the model
    # has to contain batch_size samples.
    ip = np.zeros((self.batch_size, 1, dec_tok_num))
    ip[:, 0, 0] = 1

    # The encoder's final hidden state is the first attention query.
    dec_outs = states[0]
    oa = []

    for _ in range(len_max_dec):

      vec_cxt, _unused_weights = attn(dec_outs, enc_outs)
      vec_cxt = tf.expand_dims(vec_cxt, 1)

      # Decoder step input = [context vector ; previous output distribution].
      ip = tf.concat([vec_cxt, ip], axis=-1)

      dec_result = decoder(ip, initial_state=states)
      dec_outs = dec_result[0]
      states = list(dec_result[1:])

      op = tf.expand_dims(dec_den(dec_outs), 1)
      oa.append(op)
      # No teacher forcing: the prediction is fed back as the next input.
      ip = op

    # Stack the per-position distributions along the time axis.
    dec_outs = Lambda(lambda x: K.concatenate(x, axis=1))(oa)
    model = Model(enc_inps, dec_outs, name='model_encoder_decoder')

    # `learning_rate` replaces the deprecated `lr` keyword.
    optimizer = Adam(learning_rate=self.l_r, beta_1=0.9, beta_2=0.999)
    model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])
    return model

  def _evaluate(self, model):
    # Score exact-match accuracy on the test tensors and dump the predictions to a TSV file.
    # NOTE(review): 4480 = 70 * 64, presumably chosen so the evaluated sample
    # count is a multiple of the batch size -- confirm before changing it.
    test_count = 4480

    p = model.predict(inp_tst_enc_dt[:test_count], batch_size = self.batch_size)

    # Most likely token per position, for all samples at once.
    pred_idx = np.argmax(p, axis=-1)
    true_idx = np.argmax(tar_tst_dec_dt[:test_count], axis=-1)

    # Look the end marker up instead of relying on a hard-coded index.
    end_idx = tok_ind_tar["E"]

    g_c = 0
    data_list = [["SNO", "Input Data", "Target Data", "Predicted Data"]]

    for j in range(test_count):
      arr = pred_idx[j]

      # A word counts as correct only if every position matches the target,
      # markers and padding included.
      if np.array_equal(arr, true_idx[j]):
        g_c = g_c + 1

      # Decode from position 1 (position 0 is where the "B" marker sits in
      # the targets) up to the first end marker, then drop trailing padding
      # so the text is comparable with the target column.
      chars = []
      for idx in arr[1:]:
        if idx == end_idx:
          break
        chars.append(reverse_target_char_index[idx])
      decoded_sequence = ''.join(chars).rstrip(" ")

      # Strip the "B" / "E" markers from the reference word.
      t_w = txt_tar_tst[j][1:-1]
      data_list.append([j+1, inps_txt_tst[j], t_w, decoded_sequence])

    with open('predictions_attention.tsv', 'w', newline='', encoding="utf-8") as file:
      writer = csv.writer(file, delimiter='\t')
      writer.writerows(data_list)

    val_accuracy = g_c/test_count
    print(val_accuracy)

In [25]:
# Hyper-parameters of the "best" configuration; they are passed to
# model_with_attention when the final model is constructed.
best_attention = 'bahdanau'
best_batch_size = 64
best_cell_type = 'LSTM'
best_dropout = 0.2
best_epochs = 15
best_hidden_size = 128
best_learning_rate = 0.001

In [28]:
# Build the model wrapper from the selected hyper-parameters. Nothing is
# trained yet; the constructor only stores the settings.
model_rnn = model_with_attention(
    Type=best_cell_type,
    hid_layer_size=best_hidden_size,
    l_r=best_learning_rate,
    drop_prob=best_dropout,
    number_of_epochs=best_epochs,
    batch_size=best_batch_size,
    attn=best_attention,
)

In [None]:
# Train on the one-hot training tensors; fit() also evaluates on the test
# tensors, writes predictions_attention.tsv and prints the exact-match accuracy.
model_rnn.fit(enc_inp,dec_tar)

  super(Adam, self).__init__(name, **kwargs)


Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
0.40799107142857144
