#### CS20M059 Shibobrota Das | CS20M007 Abhishek Kumar

## Setup

In [29]:
import numpy as np
import tensorflow as tf
import tensorflow_addons as tfa
from tensorflow import GradientTape
from tensorflow import keras
import pandas as pd
import datetime
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Embedding, LSTM, GRU, SimpleRNN, SimpleRNNCell, LSTMCell, GRUCell
from keras.models import Sequential
from keras.losses import SparseCategoricalCrossentropy, CategoricalCrossentropy
import os
import time
import sys
from sklearn.utils import shuffle

# Report the versions of the main libraries this notebook runs against.
for label, module in (
    ("Using numpy:", np),
    ("Using tensorflow:", tf),
    ("Using tensorflow Addons:", tfa),
    ("Using keras:", keras),
    ("Using pandas:", pd),
):
    print(label, module.__version__)

Using numpy: 1.19.5
Using tensorflow: 2.4.1
Using tensorflow Addons: 0.12.1
Using keras: 2.4.0
Using pandas: 1.2.3


#### Load Data

In [2]:
def _read_lexicon(path):
    """Read one tab-separated lexicon file (no header row) into a DataFrame.

    Args:
        path: Location of the ``.tsv`` file.

    Returns:
        A ``pandas.DataFrame`` with integer column labels 0, 1, 2, ...
    """
    # keep_default_na=False: by default pandas converts cells such as "nan",
    # "null" or "NA" into missing values. In a word lexicon those are ordinary
    # words and must stay strings.
    return pd.read_csv(path, sep='\t', header=None, keep_default_na=False)


val_df = _read_lexicon("./lexicons/hi.translit.sampled.dev.tsv")
train_df = _read_lexicon("./lexicons/hi.translit.sampled.train.tsv")
test_df = _read_lexicon("./lexicons/hi.translit.sampled.test.tsv")
print("Data Loaded to Dataframes!")

Data Loaded to Dataframes!


#### Dataset Samples

In [3]:
# Peek at three randomly chosen validation rows.
val_df.sample(3)

Unnamed: 0,0,1,2
2382,प्रक्रियाओं,prakriyaon,3
1287,जनजीवन,janjivn,1
81,अनादरण,anadaran,2


## Preparing Dataset

In [4]:
# Start-of-sequence and end-of-sequence markers wrapped around every word.
sos, eos = "@", "#"

In [5]:
class LexDataset:
    """One data split: the padded id arrays plus a batched tf.data pipeline.

    Attributes:
        input_tensor: Source-side array passed in by the caller.
        target_tensor: Target-side array passed in by the caller.
        batch: ``tf.data.Dataset`` of (input, target) pairs, shuffled with a
            buffer as large as the split and batched with the remainder dropped.
    """
    def __init__(self, input_tensor, target_tensor, batch_size):
        self.input_tensor = input_tensor
        self.target_tensor = target_tensor

        # Build the pipeline step by step: slice -> shuffle -> batch.
        pairs = tf.data.Dataset.from_tensor_slices((input_tensor, target_tensor))
        shuffled_pairs = pairs.shuffle(len(input_tensor))
        self.batch = shuffled_pairs.batch(batch_size, drop_remainder=True)

In [6]:
class TransliterationDatatset:
    """Character-level train/val/test datasets built from lexicon DataFrames.

    Column 1 of each DataFrame is used as the input word and column 0 as the
    target word. Both sides get their own character-level ``Tokenizer``; the
    tokenizers are fitted on every DataFrame in ``df_list``.

    Args:
        df_list: DataFrames in the order train, val, test.
        problem_type: Stored on the instance; not otherwise used in this class.
        batch_size: Batch size handed to each ``LexDataset``.
    """
    def __init__(self, df_list, problem_type = "en-hi", batch_size = 32):
        self.problem_type = problem_type
        self.batch_size = batch_size
        # Filled in by load_dataset().
        self.input_tokenizer = None
        self.target_tokenizer = None
        self.train = None
        self.val = None
        self.test = None
        self.load_dataset(df_list)

    def preprocess_word(self, w):
        """Return ``w`` converted to ``str`` and wrapped in the sos/eos markers."""
        return "".join([sos, str(w), eos])

    def create_dataset(self, data_frame):
        """Shuffle ``data_frame`` and return ``(input_words, target_words)`` lists."""
        shuffled_frame = shuffle(data_frame)
        # Column 1 is the source side, column 0 the target side.
        input_words = [self.preprocess_word(word) for word in shuffled_frame[1]]
        target_words = [self.preprocess_word(word) for word in shuffled_frame[0]]
        return (input_words, target_words)

    def load_dataset(self, df_list):
        """Fit both tokenizers on all splits, then build the LexDataset objects.

        ``df_list`` should hold train -> val -> test in sequence; any entry
        after the second is stored as ``self.test``.
        """
        self.input_tokenizer = Tokenizer(num_words = None, char_level = True)
        self.target_tokenizer = Tokenizer(num_words = None, char_level = True)

        # First pass: collect the words of every split and fit the tokenizers.
        word_lists = []
        for frame in df_list:
            words = self.create_dataset(frame)
            self.input_tokenizer.fit_on_texts(words[0])
            self.target_tokenizer.fit_on_texts(words[1])
            word_lists.append(words)

        # Id 0 is what pad_sequences fills with; map it to a blank.
        self.target_tokenizer.index_word[0] = " "
        self.input_tokenizer.index_word[0] = " "

        # Second pass: convert to id sequences, post-pad, and wrap each split.
        for position, (src_words, tgt_words) in enumerate(word_lists):
            src_ids = pad_sequences(self.input_tokenizer.texts_to_sequences(src_words),
                                    padding='post')
            tgt_ids = pad_sequences(self.target_tokenizer.texts_to_sequences(tgt_words),
                                    padding='post')

            split_name = ("train", "val")[position] if position < 2 else "test"
            setattr(self, split_name, LexDataset(src_ids, tgt_ids, self.batch_size))

In [7]:
# Build tokenizers and the train/val/test splits (order matters: train, val, test).
dataset = TransliterationDatatset(df_list=[train_df, val_df, test_df])

#### Training Data

In [8]:
# Training data: shapes of the padded (input, target) arrays
tuple(tensor.shape for tensor in (dataset.train.input_tensor, dataset.train.target_tensor))

((44204, 22), (44204, 21))

#### Validation Data

In [9]:
# Validation data: shapes of the padded (input, target) arrays
tuple(tensor.shape for tensor in (dataset.val.input_tensor, dataset.val.target_tensor))

((4358, 20), (4358, 16))

#### Test Data

In [10]:
# Test data: shapes of the padded (input, target) arrays
tuple(tensor.shape for tensor in (dataset.test.input_tensor, dataset.test.target_tensor))

((4502, 18), (4502, 17))

#### Number of Tokens

In [11]:
# Vocabulary sizes used for the embedding / output layers.
# Note: index_word already contains the padding entry (key 0) added in
# TransliterationDatatset.load_dataset, so each value is len(index_word) + 1.
num_encoder_tokens = 1 + len(dataset.input_tokenizer.index_word)
num_decoder_tokens = 1 + len(dataset.target_tokenizer.index_word)
(num_encoder_tokens, num_decoder_tokens)

(30, 67)

#### Maximum Sequence Lengths

In [12]:
# Longest padded sequence length across the three splits, per side.
max_encoder_seq_length = np.max(
    [split.input_tensor.shape[1] for split in (dataset.train, dataset.val, dataset.test)]
)
max_decoder_seq_length = np.max(
    [split.target_tensor.shape[1] for split in (dataset.train, dataset.val, dataset.test)]
)
(max_encoder_seq_length, max_decoder_seq_length)

(22, 21)

#### Example batch - dataset

In [13]:
# Pull a single batch from the training pipeline and show its shapes.
example_input_batch, example_target_batch = next(iter(dataset.train.batch))
tuple(tensor.shape for tensor in (example_input_batch, example_target_batch))

(TensorShape([32, 22]), TensorShape([32, 21]))

## Encoder

In [14]:
class Encoder(tf.keras.Model):
    """Embedding followed by a single recurrent layer (LSTM, GRU or SimpleRNN).

    Args:
        vocab_size: Size of the input vocabulary for the embedding.
        embedding_dim: Width of the embedding vectors.
        encoder_units: Number of units in the recurrent layer.
        batch_size: Batch size used when creating the zero initial state.
        layer_type: "LSTM" or "GRU"; any other value selects SimpleRNN.
    """
    def __init__(self, vocab_size, embedding_dim, encoder_units, batch_size, layer_type = "SimpleRNN"):
        super().__init__()

        self.batch_size = batch_size
        self.encoder_units = encoder_units
        self.layer_type = layer_type
        self.embedding = Embedding(vocab_size, embedding_dim)

        ##-------- RNN layer in Encoder ------- ##
        # Every layer type is built with the same arguments; only the class differs.
        recurrent_classes = {"LSTM": LSTM, "GRU": GRU}
        recurrent_class = recurrent_classes.get(self.layer_type, SimpleRNN)
        self.layer = recurrent_class(self.encoder_units,
                                     return_sequences=True,
                                     return_state=True,
                                     recurrent_initializer='glorot_uniform')

    def call(self, inputs, hidden):
        """Run the encoder and return ``(output, h, c)``; ``c`` is None unless LSTM."""
        embedded = self.embedding(inputs)
        results = self.layer(embedded, initial_state = hidden)
        if self.layer_type == "LSTM":
            output, h, c = results
            return output, h, c
        output, h = results
        return output, h, None

    def initialize_hidden_state(self):
        """Return an all-zero initial state: a two-tensor list for LSTM, else one tensor."""
        state_shape = (self.batch_size, self.encoder_units)
        if self.layer_type == "LSTM":
            return [tf.zeros(state_shape), tf.zeros(state_shape)]
        return tf.zeros(state_shape)

#### Test Encoder Stack

In [15]:
# ## Test Encoder Stack

# encoder = Encoder(num_encoder_tokens, embedding_dim, units, dataset.batch_size, "SimpleRNN")


# # sample input
# sample_hidden = encoder.initialize_hidden_state()
# sample_output, sample_h, sample_c = encoder(example_input_batch, sample_hidden)
# print ('Encoder output shape: (batch size, sequence length, units) {}'.format(sample_output.shape))
# print ('Encoder h vector shape: (batch size, units) {}'.format(sample_h.shape))
# if encoder.layer_type == "LSTM":
#     print ('Encoder c vector shape: (batch size, units) {}'.format(sample_c.shape))

## Decoder

In [16]:
class Decoder(tf.keras.Model):
    """Attention decoder assembled from TensorFlow Addons seq2seq components.

    The target ids are embedded and fed to a ``tfa.seq2seq.BasicDecoder`` whose
    cell is an ``AttentionWrapper`` around one recurrent cell; a dense layer
    projects every step to ``vocab_size`` logits.

    Args:
        vocab_size: Size of the target vocabulary (embedding rows / logits).
        embedding_dim: Width of the embedding vectors.
        decoder_units: Units of the recurrent cell and of the attention layer.
        batch_size: Batch size the decoder is built for.
        layer_type: "LSTM" or "GRU"; any other value selects SimpleRNNCell.
        attention_type: 'bahdanau' for Bahdanau attention, anything else for Luong.

    Note:
        Reads the module-level ``max_encoder_seq_length`` and
        ``max_decoder_seq_length`` values.
    """
    def __init__(self, vocab_size, embedding_dim, decoder_units, batch_size, layer_type, attention_type='luong'):
        super(Decoder, self).__init__()

        self.decoder_units = decoder_units
        self.layer_type = layer_type
        self.attention_type = attention_type
        self.batch_size = batch_size

        # Embedding Layer
        self.embedding = Embedding(vocab_size, embedding_dim)

        # Final Dense layer on which softmax will be applied
        self.fc = Dense(vocab_size)

        # fundamental cell for decoder recurrent structure
        if self.layer_type == "LSTM":
            self.decoder_rnn_cell = LSTMCell(self.decoder_units)
        elif self.layer_type == "GRU":
            self.decoder_rnn_cell = GRUCell(self.decoder_units)
        else:
            self.decoder_rnn_cell = SimpleRNNCell(self.decoder_units)

        # Sampler
        self.sampler = tfa.seq2seq.sampler.TrainingSampler()

        # Create attention mechanism with memory = None.
        # The attention memory is the encoder output, so its sequence lengths
        # are sized from the encoder side (previously the decoder length was
        # passed here by mistake).
        self.attention = self.build_attention_mechanism(self.decoder_units,
                                                        None, self.batch_size*[max_encoder_seq_length],
                                                        self.attention_type)

        # Wrap attention mechanism with the fundamental rnn cell of decoder
        self.rnn_cell = self.build_rnn_cell()

        # Define the decoder with respect to fundamental rnn cell
        self.decoder = tfa.seq2seq.BasicDecoder(self.rnn_cell, sampler=self.sampler, output_layer=self.fc)


    def build_rnn_cell(self):
        """Wrap the recurrent cell and the attention mechanism in an AttentionWrapper."""
        rnn_cell = tfa.seq2seq.AttentionWrapper(self.decoder_rnn_cell,
                                               self.attention,
                                               attention_layer_size = self.decoder_units)
        return rnn_cell

    def build_attention_mechanism(self, decoder_units, memory,
                                  memory_sequence_length, attention_type='luong'):
        """Return a Bahdanau attention object for 'bahdanau', otherwise a Luong one."""
        if(attention_type=='bahdanau'):
            return tfa.seq2seq.BahdanauAttention(units=decoder_units, memory = memory, memory_sequence_length = memory_sequence_length)
        else:
            return tfa.seq2seq.LuongAttention(units=decoder_units, memory = memory, memory_sequence_length = memory_sequence_length)

    def build_initial_state(self, batch_size, encoder_state, Dtype):
        """Create the wrapper's initial state with its cell state set to ``encoder_state``.

        Args:
            batch_size: Batch size of the state to create. This argument is now
                honoured; previously ``self.batch_size`` was used regardless.
            encoder_state: Final encoder state copied into the cell state.
            Dtype: dtype passed to ``get_initial_state``.

        Returns:
            The attention-wrapper state to pass as ``initial_state`` to ``call``.
        """
        decoder_initial_state = self.rnn_cell.get_initial_state(batch_size = batch_size, dtype = Dtype)
        decoder_initial_state = decoder_initial_state.clone(cell_state = encoder_state)
        return decoder_initial_state

    def call(self, inputs, initial_state):
        """Embed ``inputs`` and run the BasicDecoder; returns its first output.

        The per-example sequence length is fixed to ``max_decoder_seq_length-1``
        (module-level value) for every entry of the batch.
        """
        inputs = self.embedding(inputs)
        outputs, _, _ = self.decoder(inputs, initial_state = initial_state, sequence_length = self.batch_size*[max_decoder_seq_length-1])
        return outputs

#### Test decoder stack

In [17]:
# # Test decoder stack

# decoder = Decoder(num_decoder_tokens, embedding_dim, units, dataset.batch_size, "SimpleRNN", 'luong')
# sample_x = tf.random.uniform((dataset.batch_size, max_decoder_seq_length))
# decoder.attention.setup_memory(sample_output)
# if decoder.layer_type == "LSTM":
#     initial_state = decoder.build_initial_state(dataset.batch_size, [sample_h, sample_c], tf.float32)
# else:
#     initial_state = decoder.build_initial_state(dataset.batch_size, sample_h, tf.float32)


# sample_decoder_outputs = decoder(sample_x, initial_state)

# print("Decoder Outputs Shape: ", sample_decoder_outputs.rnn_output.shape)

## Model Training

In [18]:
# Hyper-parameters for one training run; the batch size follows the dataset.
config = dict(
    layer_type="LSTM",
    units=128,
    embedding_dim=16,
    optimiser="nadam",
    num_encoders=1,
    num_decoders=1,
    dropout=0.2,
    epochs=1,
    batch_size=dataset.batch_size,
    attention="luong",
)

#### Optimizer

In [19]:
# Nadam with its default settings; `keras` is the module imported via
# `from tensorflow import keras`.
optimizer = keras.optimizers.Nadam()

#### Loss Function

In [20]:
def loss_function(real, pred):
    """Sparse categorical cross-entropy on logits with padding (id 0) zeroed out.

    Args:
        real: Target ids, shape (BATCH_SIZE, max_length_output).
        pred: Logits, shape (BATCH_SIZE, max_length_output, tar_vocab_size).

    Returns:
        Scalar tensor: the mean of the masked per-position losses, taken over
        all positions (masked positions contribute zeros to the mean).
    """
    per_position_loss = SparseCategoricalCrossentropy(from_logits=True, reduction='none')(
        y_true=real, y_pred=pred)
    # 1 where the target is a real token, 0 where it is padding.
    is_padding = tf.math.equal(real, 0)
    keep = tf.cast(tf.logical_not(is_padding), dtype=per_position_loss.dtype)
    return tf.reduce_mean(keep * per_position_loss)

#### Accuracy Calculation

In [23]:
def calc_accuracy(real, pred):
    """Character-level accuracy over non-padding target positions.

    Padding positions (target id 0) are excluded, matching the mask used in
    ``loss_function``; previously they were counted like real characters.

    Args:
        real: Target ids, shape (BATCH_SIZE, max_length_output).
        pred: Logits, shape (BATCH_SIZE, max_length_output, tar_vocab_size).

    Returns:
        Scalar float32 tensor in [0, 1]; 0 when the batch has no real tokens.
    """
    predictions = tf.cast(tf.argmax(pred, axis=2), tf.int32)
    # Align dtypes so tf.equal works regardless of the integer type of `real`.
    real = tf.cast(real, tf.int32)
    mask = tf.cast(tf.logical_not(tf.math.equal(real, 0)), tf.float32)
    correct_preds = tf.cast(tf.equal(predictions, real), tf.float32) * mask
    # divide_no_nan returns 0 instead of NaN if every position is padding.
    accuracy = tf.math.divide_no_nan(tf.reduce_sum(correct_preds), tf.reduce_sum(mask))
    return accuracy

#### Checkpoints (Object-based saving)

In [21]:
# Checkpoints are written under checkpoint_dir with file names starting "ckpt".
checkpoint_dir, checkpoint_name = './checkpoints', "ckpt"
checkpoint_prefix = os.path.join(checkpoint_dir, checkpoint_name)

### Training

In [None]:
## Encoder
encoder = Encoder(num_encoder_tokens, config["embedding_dim"], 
                  config["units"], config["batch_size"], 
                  config["layer_type"])

## Decoder
decoder = Decoder(num_decoder_tokens, config["embedding_dim"], 
                  config["units"], config["batch_size"], 
                  config["layer_type"], config["attention"])

# Checkpoint
checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                 encoder=encoder,
                                 decoder=decoder)

# NOTE(review): config["epochs"] is 1 while this loop runs 2 epochs — confirm
# which value is intended.
EPOCHS = 2
BATCH_SIZE = config["batch_size"]
# Number of full batches per epoch (the pipeline drops the remainder).
steps_per_epoch = int(np.shape(dataset.train.input_tensor)[0]//dataset.batch_size)

for epoch in range(EPOCHS):
    start = time.time()
    
    print(f'Epoch {epoch + 1}')
    
    # The former `dataset.train.batch.shuffle(BATCH_SIZE)` statement was
    # removed: its result was discarded, and LexDataset already applies
    # .shuffle(...) when it builds the pipeline.
    
    enc_hidden = encoder.initialize_hidden_state()
    total_loss = 0
    total_accuracy = 0

    for (batch, (inp, targ)) in enumerate(dataset.train.batch.take(steps_per_epoch)):        
        with GradientTape() as tape:
            enc_output, enc_h, enc_c = encoder(inp, enc_hidden)

            dec_input = targ[ : , :-1 ]       # Ignore '#' token
            real = targ[ : , 1: ]             # ignore '@' token

            # Set the AttentionMechanism object with encoder_outputs
            decoder.attention.setup_memory(enc_output)

            # Create AttentionWrapperState as initial_state for decoder
            if decoder.layer_type == "LSTM":
                decoder_initial_state = decoder.build_initial_state(BATCH_SIZE, [enc_h, enc_c], tf.float32)
            else:
                decoder_initial_state = decoder.build_initial_state(BATCH_SIZE, enc_h, tf.float32)
            pred = decoder(dec_input, decoder_initial_state)
            logits = pred.rnn_output
            batch_loss = loss_function(real, logits)

        # Accuracy is a metric only, so it is computed outside the tape.
        accuracy = calc_accuracy(real, logits)

        variables = encoder.trainable_variables + decoder.trainable_variables
        gradients = tape.gradient(batch_loss, variables)
        optimizer.apply_gradients(zip(gradients, variables))
        
        total_loss += batch_loss
        total_accuracy += accuracy
        
        # `batch` is 0-based: use batch + 1 for the running averages so the
        # first step does not divide by zero and later steps are not biased.
        batches_done = batch + 1
        sys.stdout.write(f"\rProcessing: {(100 * batches_done / steps_per_epoch):.2f}% Accuracy: {(total_accuracy / batches_done):.4f} Loss: {(total_loss / batches_done):.4f}")
        sys.stdout.flush()

    # End the carriage-return progress line before the epoch summary.
    sys.stdout.write("\n")
            
    # saving (checkpoint) the model every 2 epochs
    if (epoch + 1) % 2 == 0:
        checkpoint.save(file_prefix = checkpoint_prefix)

    print(f'Loss {(total_loss / steps_per_epoch):.4f} Accuracy {(total_accuracy / steps_per_epoch):.4f}')
    print(f'Time taken for 1 epoch {time.time() - start} sec\n')

In [25]:
# Build a run identifier from the config entries, sorted by key.
run_name_parts = [f'{key}-{val}__' for key, val in sorted(config.items())]
''.join(run_name_parts)

'attention-luong__batch_size-32__dropout-0.2__embedding_dim-16__epochs-1__layer_type-LSTM__num_decoders-1__num_encoders-1__optimiser-nadam__units-128__'