#### CS20M059 Shibobrota Das | CS20M007 Abhishek Kumar

## Setup

In [1]:
!pip install tensorflow-addons -qqq

[K     |████████████████████████████████| 686kB 9.2MB/s eta 0:00:01
[?25h

In [2]:
!pip install wandb -qqq

[K     |████████████████████████████████| 1.8MB 8.4MB/s 
[K     |████████████████████████████████| 133kB 51.7MB/s 
[K     |████████████████████████████████| 174kB 38.2MB/s 
[K     |████████████████████████████████| 102kB 12.8MB/s 
[K     |████████████████████████████████| 71kB 8.2MB/s 
[?25h  Building wheel for subprocess32 (setup.py) ... [?25l[?25hdone
  Building wheel for pathtools (setup.py) ... [?25l[?25hdone


In [1]:
import numpy as np
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
import tensorflow as tf
import tensorflow_addons as tfa
from tensorflow import GradientTape
from tensorflow import keras
import pandas as pd
import datetime
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Embedding, LSTM, GRU, SimpleRNN, SimpleRNNCell, LSTMCell, GRUCell
from keras.models import Sequential
from keras.losses import SparseCategoricalCrossentropy, CategoricalCrossentropy
import time
import sys
import datetime
from sklearn.utils import shuffle
import wandb
# import nltk
import csv
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from matplotlib.font_manager import FontProperties

print("Using numpy:",np.__version__)
print("Using tensorflow:",tf.__version__)
print("Using tensorflow Addons:",tfa.__version__)
print("Using keras:",keras.__version__)
print("Using pandas:",pd.__version__)

Using numpy: 1.19.5
Using tensorflow: 2.5.0
Using tensorflow Addons: 0.13.0
Using keras: 2.5.0
Using pandas: 1.2.4


In [None]:
# wandb.init(project='Assignment 3', entity='iitm-cs6910-jan-may-2021-cs20m059-cs20m007')

In [4]:
from google.colab import drive
drive.mount('/content/drive')
%cd '/content/drive/My Drive/DL-A3 Dataset/dakshina_dataset_v1.0/hi/'

Mounted at /content/drive
/content/drive/My Drive/DL-A3 Dataset/dakshina_dataset_v1.0/hi


#### Load Data

In [2]:
val_df = pd.read_csv("./lexicons/hi.translit.sampled.dev.tsv", sep='\t', header=None)
train_df = pd.read_csv("./lexicons/hi.translit.sampled.train.tsv", sep='\t', header=None)
test_df = pd.read_csv("./lexicons/hi.translit.sampled.test.tsv", sep='\t', header=None)
print("Data Loaded to Dataframes!")

Data Loaded to Dataframes!


In [6]:
%cd '/content/drive/My Drive/A3-checkpoints/'

/content/drive/My Drive/A3-checkpoints


#### Dataset Samples

In [3]:
train_df.sample(n=3)

Unnamed: 0,0,1,2
7933,किड्स,kids,6
24721,प्रसाधन,prasadhan,2
37294,वेक्स,wax,2


## Preparing Dataset

In [4]:
sos = "@"
eos = "#"

In [5]:
class LexDataset:
    def __init__(self, input_tensor, target_tensor, batch_size):
        self.input_tensor = input_tensor
        self.target_tensor = target_tensor
        self.batch = tf.data.Dataset.from_tensor_slices((self.input_tensor, self.target_tensor)).shuffle(len(self.input_tensor)).batch(batch_size, drop_remainder=True)

In [6]:
class TransliterationDatatset:
    def __init__(self, df_list, batch_size = 64):
        
        self.input_tokenizer = None
        self.target_tokenizer = None
        self.train = None
        self.val = None
        self.test = None
        self.batch_size = batch_size
        # Load Data
        self.load_dataset(df_list)
        # Other parameters
        self.num_input_tokens = len(self.input_tokenizer.index_word)+1
        self.num_target_tokens = len(self.target_tokenizer.index_word)+1
        self.max_input_seq_length = np.max([self.train.input_tensor.shape[1], self.val.input_tensor.shape[1], self.test.input_tensor.shape[1]])
        self.max_target_seq_length = np.max([self.train.target_tensor.shape[1], self.val.target_tensor.shape[1], self.test.target_tensor.shape[1]])
        
    def preprocess_word(self, w):
        return sos + str(w) + eos
    
    def print_input(self, tensor):
        for t in tensor:
            if t != 0:
                print(f'{t} ----> {self.input_tokenizer.index_word[t]}')
                
    def print_target(self, tensor):
        for t in tensor:
            if t != 0:
                print(f'{t} ----> {self.target_tokenizer.index_word[t]}')
    
    def create_dataset(self, data_frame):
        input_words = []
        target_words = []
        # Shuffle the data_frame before creating dataset
        df = data_frame
        for i in range(5):
            df = shuffle(df)
        for x, y in zip(df[1], df[0]):
            input_words.append(self.preprocess_word(x))
            target_words.append(self.preprocess_word(y))
        return (input_words, target_words)
    
    def load_dataset(self, df_list):
        # df_list should have train -> val -> test in sequence
        
        self.input_tokenizer = Tokenizer(num_words = None, char_level = True)
        self.target_tokenizer = Tokenizer(num_words = None, char_level = True)
        
        ds_list = []
        
        for df in df_list:
            # Get the words list
            (input_words, target_words) = self.create_dataset(df)
            # Fit on the set of words
            self.input_tokenizer.fit_on_texts(input_words)
            self.target_tokenizer.fit_on_texts(target_words)
            ds_list.append((input_words, target_words))
                    
        self.target_tokenizer.index_word.update({0:" "})
        self.input_tokenizer.index_word.update({0:" "})
        
        input_word_len = []
        target_word_len = []
        
        tensor_list = []
        
        for i, (input_words, target_words) in enumerate(ds_list):
            input_tensor = self.input_tokenizer.texts_to_sequences(input_words)
            target_tensor = self.target_tokenizer.texts_to_sequences(target_words)
            tensor_list.append((input_tensor, target_tensor))
            input_word_len.append(np.max([len(x) for x in input_tensor]))
            target_word_len.append(np.max([len(x) for x in target_tensor]))
        
        for i, (input_tensor, target_tensor) in enumerate(tensor_list):
            
            input_tensor = pad_sequences(input_tensor, padding='post', maxlen = np.max(input_word_len))
            target_tensor = pad_sequences(target_tensor, padding='post', maxlen = np.max(target_word_len))
            
            if i == 0:
                self.train = LexDataset(input_tensor, target_tensor, self.batch_size)
            elif i == 1:
                self.val = LexDataset(input_tensor, target_tensor, self.batch_size)
            else:
                self.test = LexDataset(input_tensor, target_tensor, self.batch_size)

In [7]:
dataset = TransliterationDatatset([train_df, val_df, test_df], 128)

#### Training Data

In [8]:
# Training data
dataset.train.input_tensor.shape, dataset.train.target_tensor.shape

((44204, 22), (44204, 21))

#### Validation Data

In [9]:
# Validation data
dataset.val.input_tensor.shape, dataset.val.target_tensor.shape

((4358, 22), (4358, 21))

#### Test Data

In [10]:
# Test data
dataset.test.input_tensor.shape, dataset.test.target_tensor.shape

((4502, 22), (4502, 21))

#### Number of Tokens

In [11]:
# Number of tokens
dataset.num_input_tokens, dataset.num_target_tokens

(30, 67)

#### Maximum Sequence Lengths

In [12]:
# max seq length
dataset.max_input_seq_length, dataset.max_target_seq_length

(22, 21)

#### Example batch - dataset

In [13]:
example_input_batch, example_target_batch = next(iter(dataset.train.batch))
example_input_batch.shape, example_target_batch.shape

(TensorShape([128, 22]), TensorShape([128, 21]))

In [14]:
dataset.print_input(example_input_batch[2].numpy())

2 ----> @
9 ----> s
6 ----> h
10 ----> e
7 ----> r
3 ----> #


In [15]:
dataset.print_target(example_target_batch[2].numpy())

1 ----> @
27 ----> श
14 ----> े
4 ----> र
2 ----> #


## Encoder and Decoder Model

In [16]:
class Encoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz, dropout=0.2, layer_type="GRU"):
        super(Encoder, self).__init__()
        self.batch_sz = batch_sz
        self.enc_units = enc_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.layer_type = layer_type

        ##-------- RNN layer in Encoder ------- ##
        if self.layer_type == "LSTM":
            self.layer = LSTM(self.enc_units,
                                       return_sequences=True,
                                       return_state=True,
                                       dropout = dropout,
                                       recurrent_initializer='glorot_uniform')
            
        elif self.layer_type == "GRU":
            self.layer = GRU(self.enc_units,
                                       return_sequences=True,
                                       return_state=True,
                                       dropout = dropout,
                                       recurrent_initializer='glorot_uniform')
           
        else:
            self.layer = SimpleRNN(self.enc_units,
                                       return_sequences=True,
                                       return_state=True,
                                       dropout = dropout,
                                       recurrent_initializer='glorot_uniform')

    def call(self, inputs, hidden):
        inputs = self.embedding(inputs)
        if self.layer_type == "LSTM":
            output, h, c = self.layer(inputs, initial_state = hidden)
            return output, h, c
        else:
            output, h = self.layer(inputs, initial_state = hidden)
            return output, h, None

    def initialize_hidden_state(self):
        if self.layer_type == "LSTM":
            return [tf.zeros((self.batch_sz, self.enc_units)), tf.zeros((self.batch_sz, self.enc_units))]
        else:
            return tf.zeros((self.batch_sz, self.enc_units))

In [17]:
vocab_inp_size = dataset.num_input_tokens
embedding_dim = 64
units = 256
BATCH_SIZE = dataset.batch_size

In [18]:
encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE, 0.2, "LSTM")

# sample input
sample_hidden = encoder.initialize_hidden_state()
sample_output, sample_hidden, sample_cell = encoder(example_input_batch, sample_hidden)
print('Encoder output shape: (batch size, sequence length, units)', sample_output.shape)
print('Encoder Hidden state shape: (batch size, units)', sample_hidden.shape)
if encoder.layer_type == "LSTM":
    print ('Encoder c vector shape: (batch size, units) {}'.format(sample_cell.shape))

Encoder output shape: (batch size, sequence length, units) (128, 22, 256)
Encoder Hidden state shape: (batch size, units) (128, 256)
Encoder c vector shape: (batch size, units) (128, 256)


In [19]:
class Decoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz, dropout=0.2, layer_type="GRU"):
        super(Decoder, self).__init__()
        self.batch_sz = batch_sz
        self.dec_units = dec_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.layer_type = layer_type
        
        ##-------- RNN layer in Encoder ------- ##
        if self.layer_type == "LSTM":
            self.layer = LSTM(self.dec_units,
                                       return_sequences=True,
                                       return_state=True,
                                       dropout = dropout,
                                       recurrent_initializer='glorot_uniform')
            
        elif self.layer_type == "GRU":
            self.layer = GRU(self.dec_units,
                                       return_sequences=True,
                                       return_state=True,
                                       dropout = dropout,
                                       recurrent_initializer='glorot_uniform')
           
        else:
            self.layer = SimpleRNN(self.dec_units,
                                       return_sequences=True,
                                       return_state=True,
                                       dropout = dropout,
                                       recurrent_initializer='glorot_uniform')
        
        self.fc = tf.keras.layers.Dense(vocab_size)

    def call(self, x, hidden):

        # x shape after passing through embedding == (batch_size, 1, embedding_dim)
        x = self.embedding(x)
        
        # passing the concatenated vector to the GRU/SimpleRNN
        if self.layer_type != "LSTM":
            output, state_h = self.layer(x, initial_state = hidden)
        else:
            output, state_h, state_c = self.layer(x, initial_state = hidden)

        # output shape == (batch_size * 1, hidden_size)
        output = tf.reshape(output, (-1, output.shape[2]))

        # output shape == (batch_size, vocab)
        x = self.fc(output)

        # return x, state
        if self.layer_type != "LSTM":
            return x, state_h, None
        else:
            return x, state_h, state_c

In [20]:
vocab_tar_size = dataset.num_target_tokens

In [21]:
decoder = Decoder(vocab_tar_size, embedding_dim, units, BATCH_SIZE, 0.2, "LSTM")

if decoder.layer_type != "LSTM":
    sample_decoder_output, sample_decoder_hidden, _ = decoder(tf.random.uniform((BATCH_SIZE, 1)), sample_hidden)
else:
    sample_decoder_output, sample_decoder_hidden, sample_decoder_cell = decoder(tf.random.uniform((BATCH_SIZE, 1)), [sample_hidden, sample_cell])

print('Decoder output shape: (batch_size, vocab size)', sample_decoder_output.shape)
print('Decoder Hidden state shape: (batch size, units)', sample_decoder_hidden.shape)
if encoder.layer_type == "LSTM":
    print ('Encoder c vector shape: (batch size, units) {}'.format(sample_decoder_cell.shape))

Decoder output shape: (batch_size, vocab size) (128, 67)
Decoder Hidden state shape: (batch size, units) (128, 256)
Encoder c vector shape: (batch size, units) (128, 256)


## Optimizer and the loss function

In [30]:
optimizer = tf.keras.optimizers.Nadam()
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')


def loss_function(real, pred):
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)

    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask
    loss_ = tf.reduce_mean(loss_)
    return loss_

## Training

In [31]:
@tf.function
def train_step(inp, targ, enc_hidden):
    loss = 0

    with tf.GradientTape() as tape:
        enc_output, enc_hidden, enc_cell = encoder(inp, enc_hidden)
        
        if decoder.layer_type != "LSTM":
            dec_hidden = enc_hidden
        else:
            dec_hidden, dec_cell = enc_hidden, enc_cell

        dec_input = tf.expand_dims([dataset.target_tokenizer.word_index[sos]] * BATCH_SIZE, 1)

        # Teacher forcing - feeding the target as the next input
        for t in range(1, targ.shape[1]):
            
            # passing enc_output to the decoder
            if decoder.layer_type != "LSTM":
                predictions, _, _ = decoder(dec_input, dec_hidden)
            else:
                predictions, _, _ = decoder(dec_input, [dec_hidden, dec_cell])

            loss += loss_function(targ[:, t], predictions)

            # using teacher forcing
            dec_input = tf.expand_dims(targ[:, t], 1)

    batch_loss = (loss / int(targ.shape[1]))

    variables = encoder.trainable_variables + decoder.trainable_variables

    gradients = tape.gradient(loss, variables)

    optimizer.apply_gradients(zip(gradients, variables))

    return batch_loss

In [32]:
EPOCHS = 1
steps_per_epoch = len(dataset.train.input_tensor)//BATCH_SIZE

for epoch in range(EPOCHS):
    start = time.time()

    enc_hidden = encoder.initialize_hidden_state()
    total_loss = 0

    for (batch, (inp, targ)) in enumerate(dataset.train.batch.take(steps_per_epoch)):
        batch_loss = train_step(inp, targ, enc_hidden)
        total_loss += batch_loss

        if batch % 100 == 0:
            print(f'Epoch {epoch+1} Batch {batch} Loss {batch_loss.numpy():.4f}')

    print(f'Epoch {epoch+1} Loss {total_loss/steps_per_epoch:.4f}')
    print(f'Time taken for 1 epoch {time.time()-start:.2f} sec\n')

Epoch 1 Batch 0 Loss 1.5269
Epoch 1 Batch 100 Loss 1.0398
Epoch 1 Batch 200 Loss 1.0099
Epoch 1 Batch 300 Loss 0.9548
Epoch 1 Loss 1.0231
Time taken for 1 epoch 225.84 sec



## Translate

In [52]:
def evaluate(sentence):
    attention_plot = np.zeros((dataset.max_target_seq_length, dataset.max_input_seq_length))

    sentence = dataset.preprocess_word(sentence)

    inputs = [dataset.input_tokenizer.word_index[i] for i in sentence]
    inputs = tf.keras.preprocessing.sequence.pad_sequences([inputs],
                                                         maxlen=dataset.max_input_seq_length,
                                                         padding='post')
    inputs = tf.convert_to_tensor(inputs)

    result = ''

    if encoder.layer_type != "LSTM":
        hidden = [tf.zeros((1, units))]
    else:
        hidden = [tf.zeros((1, units)), tf.zeros((1, units))]
    
    enc_out, enc_hidden, enc_cell = encoder(inputs, hidden)
    
    if decoder.layer_type != "LSTM":
        dec_hidden = enc_hidden
    else:
        dec_hidden, dec_cell = enc_hidden, enc_cell

    dec_input = tf.expand_dims([dataset.target_tokenizer.word_index[sos]], 0)

    for t in range(dataset.max_target_seq_length):
        # passing enc_output to the decoder
        if decoder.layer_type != "LSTM":
            predictions, _, _ = decoder(dec_input, dec_hidden)
        else:
            predictions, _, _ = decoder(dec_input, [dec_hidden, dec_cell])
                
        
        predicted_id = tf.argmax(predictions[0]).numpy()
        
        result += dataset.target_tokenizer.index_word[predicted_id]

        if dataset.target_tokenizer.index_word[predicted_id] == eos:
            return result, sentence

        # the predicted ID is fed back into the model
        dec_input = tf.expand_dims([predicted_id], 0)

    return result, sentence

In [53]:
def translate(sentence):
    result, sentence = evaluate(sentence)

    print('Input:', sentence)
    print('Predicted translation:', result)

In [60]:
translate("saa")

Input: @saa#
Predicted translation: स्र#


In [None]:
sequence = dataset.input_tokenizer.texts_to_sequences("shibobrota")
np.reshape(sequence, len(sequence))

array([ 9,  6,  5, 19, 11, 19,  7, 11,  8,  1])

In [None]:
text = dataset.input_tokenizer.sequences_to_texts(sequence)
text

['s', 'h', 'i', 'b', 'o', 'b', 'r', 'o', 't', 'a']

In [41]:
def save_predictions(data_frame, name):
    accuracy_count = 0;
    with open(name, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(["INPUT", "PREDICTION", "TRUE"])
        for i, (inp, trg) in enumerate(zip(data_frame[1], data_frame[0])): 
            result, sentence, attention_plot = evaluate(inp)
            writer.writerow([inp, result[:-1], trg])
            print(inp, result[:-1], trg)
            if result[:-1] == trg:
                accuracy_count += 1
            if (i+1) % 100 == 0 or i+1 == data_frame.size:
                print("Accuracy", (accuracy_count / (i+1)))

    return accuracy_count/data_frame.size

In [None]:
save_predictions(test_df, "new_without_attn_predictions.csv")