#### CS20M059 Shibobrota Das | CS20M007 Abhishek Kumar

## Setup

In [111]:
!pip install tensorflow-addons -qqq

In [112]:
!pip install wandb -qqq

In [2]:
import numpy as np
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
import tensorflow as tf
import tensorflow_addons as tfa
from tensorflow import GradientTape
from tensorflow import keras
import pandas as pd
import datetime
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Embedding, LSTM, GRU, SimpleRNN, SimpleRNNCell, LSTMCell, GRUCell
from keras.models import Sequential
from keras.losses import SparseCategoricalCrossentropy, CategoricalCrossentropy
import time
import sys
import datetime
from sklearn.utils import shuffle
import wandb
# import nltk
import csv
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

print("Using numpy:",np.__version__)
print("Using tensorflow:",tf.__version__)
print("Using tensorflow Addons:",tfa.__version__)
print("Using keras:",keras.__version__)
print("Using pandas:",pd.__version__)

Using numpy: 1.19.5
Using tensorflow: 2.5.0
Using tensorflow Addons: 0.13.0
Using keras: 2.5.0
Using pandas: 1.2.4


In [119]:
wandb.init(project='Assignment 3', entity='iitm-cs6910-jan-may-2021-cs20m059-cs20m007')

In [12]:
from google.colab import drive
drive.mount('/content/drive')
%cd '/content/drive/My Drive/DL-A3 Dataset/dakshina_dataset_v1.0/hi/'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/My Drive/DL-A3 Dataset/dakshina_dataset_v1.0/hi


#### Load Data

In [13]:
val_df = pd.read_csv("./lexicons/hi.translit.sampled.dev.tsv", sep='\t', header=None)
train_df = pd.read_csv("./lexicons/hi.translit.sampled.train.tsv", sep='\t', header=None)
test_df = pd.read_csv("./lexicons/hi.translit.sampled.test.tsv", sep='\t', header=None)
print("Data Loaded to Dataframes!")

Data Loaded to Dataframes!


In [14]:
%cd '/content/drive/My Drive/A3-checkpoints/'

/content/drive/My Drive/A3-checkpoints


#### Dataset Samples

In [15]:
val_df.sample(n=3)

Unnamed: 0,0,1,2
1134,चिंताजनक,chintajank,1
472,ऋतुराज,ruturaj,1
2739,बिताते,bitaate,1


## Preparing Dataset

In [16]:
sos = "@"
eos = "#"

In [17]:
class LexDataset:
    def __init__(self, input_tensor, target_tensor, batch_size):
        self.input_tensor = input_tensor
        self.target_tensor = target_tensor
        self.batch = tf.data.Dataset.from_tensor_slices((self.input_tensor, self.target_tensor)).shuffle(len(self.input_tensor)).batch(batch_size, drop_remainder=True)

In [18]:
class TransliterationDatatset:
    def __init__(self, df_list, problem_type = "en-hi", batch_size = 128):
        self.problem_type = problem_type
        self.input_tokenizer = None
        self.target_tokenizer = None
        self.train = None
        self.val = None
        self.test = None
        self.batch_size = batch_size
        # Load Data
        self.load_dataset(df_list)
        
    def preprocess_word(self, w):
        return sos + str(w) + eos
    
    def create_dataset(self, data_frame):
        input_words = []
        target_words = []
        # Shuffle the data_frame before creating dataset
        df_shuffled = shuffle(data_frame)
        for x, y in zip(df_shuffled[1], df_shuffled[0]):
            input_words.append(self.preprocess_word(x))
            target_words.append(self.preprocess_word(y))
        return (input_words, target_words)
    
    def load_dataset(self, df_list):
        # df_list should have train -> val -> test in sequence
        
        self.input_tokenizer = Tokenizer(num_words = None, char_level = True)
        self.target_tokenizer = Tokenizer(num_words = None, char_level = True)
        
        ds_list = []
        
        for df in df_list:
            # Get the words list
            (input_words, target_words) = self.create_dataset(df)
            # Fit on the set of words
            self.input_tokenizer.fit_on_texts(input_words)
            self.target_tokenizer.fit_on_texts(target_words)
            ds_list.append((input_words, target_words))
                    
        self.target_tokenizer.index_word.update({0:" "})
        self.input_tokenizer.index_word.update({0:" "})
        
        input_word_len = []
        target_word_len = []
        
        tensor_list = []
        
        for i, (input_words, target_words) in enumerate(ds_list):
            input_tensor = self.input_tokenizer.texts_to_sequences(input_words)
            target_tensor = self.target_tokenizer.texts_to_sequences(target_words)
            tensor_list.append((input_tensor, target_tensor))
            input_word_len.append(np.max([len(x) for x in input_tensor]))
            target_word_len.append(np.max([len(x) for x in target_tensor]))
        
        for i, (input_tensor, target_tensor) in enumerate(tensor_list):
            
            input_tensor = pad_sequences(input_tensor, padding='post', maxlen = np.max(input_word_len))
            target_tensor = pad_sequences(target_tensor, padding='post', maxlen = np.max(target_word_len))
            
            if i == 0:
                self.train = LexDataset(input_tensor, target_tensor, self.batch_size)
            elif i == 1:
                self.val = LexDataset(input_tensor, target_tensor, self.batch_size)
            else:
                self.test = LexDataset(input_tensor, target_tensor, self.batch_size)

In [19]:
dataset = TransliterationDatatset([train_df, val_df, test_df])

#### Training Data

In [20]:
# Training data
dataset.train.input_tensor.shape, dataset.train.target_tensor.shape

((44204, 22), (44204, 21))

#### Validation Data

In [21]:
# Validation data
dataset.val.input_tensor.shape, dataset.val.target_tensor.shape

((4358, 22), (4358, 21))

#### Test Data

In [22]:
# Test data
dataset.test.input_tensor.shape, dataset.test.target_tensor.shape

((4502, 22), (4502, 21))

#### Number of Tokens

In [23]:
# Number of tokens
num_encoder_tokens = len(dataset.input_tokenizer.index_word)+1
num_decoder_tokens = len(dataset.target_tokenizer.index_word)+1
num_encoder_tokens, num_decoder_tokens

(30, 67)

#### Maximum Sequence Lengths

In [24]:
# max seq length
max_encoder_seq_length = np.max([dataset.train.input_tensor.shape[1], dataset.val.input_tensor.shape[1], dataset.test.input_tensor.shape[1]])
max_decoder_seq_length = np.max([dataset.train.target_tensor.shape[1], dataset.val.target_tensor.shape[1], dataset.test.target_tensor.shape[1]])
max_encoder_seq_length, max_decoder_seq_length

(22, 21)

#### Example batch - dataset

In [25]:
example_input_batch, example_target_batch = next(iter(dataset.train.batch))
example_input_batch.shape, example_target_batch.shape

(TensorShape([128, 22]), TensorShape([128, 21]))

## Encoder

In [26]:
class Encoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, encoder_units, batch_size, layer_type = "SimpleRNN", dropout = 0.2):
        super(Encoder, self).__init__()
        
        self.batch_size = batch_size
        self.encoder_units = encoder_units
        self.layer_type = layer_type
        self.embedding = Embedding(vocab_size, embedding_dim)
        
        ##-------- RNN layer in Encoder ------- ##
        if self.layer_type == "LSTM":
            self.layer = LSTM(self.encoder_units,
                                       return_sequences=True,
                                       return_state=True,
                                       dropout = dropout,
                                       recurrent_initializer='glorot_uniform')
            
        elif self.layer_type == "GRU":
            self.layer = GRU(self.encoder_units,
                                       return_sequences=True,
                                       return_state=True,
                                       dropout = dropout,
                                       recurrent_initializer='glorot_uniform')
           
        else:
            self.layer = SimpleRNN(self.encoder_units,
                                       return_sequences=True,
                                       return_state=True,
                                       dropout = dropout,
                                       recurrent_initializer='glorot_uniform')
            
            
    def call(self, inputs, hidden):
        inputs = self.embedding(inputs)
        if self.layer_type == "LSTM":
            output, h, c = self.layer(inputs, initial_state = hidden)
            return output, h, c
        else:
            output, h = self.layer(inputs, initial_state = hidden)
            return output, h, None

    def initialize_hidden_state(self):
        if self.layer_type == "LSTM":
            return [tf.zeros((self.batch_size, self.encoder_units)), tf.zeros((self.batch_size, self.encoder_units))]
        else:
            return tf.zeros((self.batch_size, self.encoder_units))

#### Test Encoder Stack

In [27]:
# ## Test Encoder Stack

# encoder = Encoder(num_encoder_tokens, 16, 128, dataset.batch_size, "SimpleRNN", 3)


# # sample input
# sample_hidden = encoder.initialize_hidden_state()
# sample_output, sample_h, sample_c = encoder(example_input_batch, sample_hidden)
# print ('Encoder output shape: (batch size, sequence length, units) {}'.format(sample_output.shape))
# print ('Encoder h vecotr shape: (batch size, units) {}'.format(sample_h.shape))
# if encoder.layer_type == "LSTM":
#     print ('Encoder c vector shape: (batch size, units) {}'.format(sample_c.shape))

## Decoder

In [132]:
class Decoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, decoder_units, batch_size, layer_type, attention_type='luong'):
        super(Decoder, self).__init__()
        
        self.decoder_units = decoder_units
        self.layer_type = layer_type
        self.attention_type = attention_type
        self.batch_size = batch_size
        
        # Embedding Layer
        self.embedding = Embedding(vocab_size, embedding_dim)
        
        # Final Dense layer on which softmax will be applied
        self.fc = Dense(vocab_size, activation='softmax')
        
        # fundamental cell for decoder recurrent structure
        if self.layer_type == "LSTM":
            self.decoder_rnn_cell = LSTMCell(self.decoder_units)
        elif self.layer_type == "GRU":
            self.decoder_rnn_cell = GRUCell(self.decoder_units)
        else:
            self.decoder_rnn_cell = SimpleRNNCell(self.decoder_units)

        # Sampler
        self.sampler = tfa.seq2seq.sampler.TrainingSampler()

        if self.attention_type != "none":
            # Create attention mechanism with memory = None
            self.attention = self.build_attention_mechanism(self.decoder_units, 
                                                            None, self.batch_size*[max_decoder_seq_length], 
                                                            self.attention_type)
            # Wrap attention mechanism with the fundamental rnn cell of decoder
            self.rnn_cell = self.build_rnn_cell()
        else:
            # Without attention mechanism
            self.rnn_cell = self.decoder_rnn_cell
            
        # Define the decoder with respect to fundamental rnn cell
        self.decoder = tfa.seq2seq.BasicDecoder(self.rnn_cell, sampler=self.sampler, output_layer=self.fc)

        
    def build_rnn_cell(self):
        rnn_cell = tfa.seq2seq.AttentionWrapper(self.decoder_rnn_cell,
                                               self.attention,
                                               attention_layer_size = self.decoder_units)
        return rnn_cell
    
    def build_attention_mechanism(self, decoder_units, memory, 
                                  memory_sequence_length, attention_type='luong'):
        if(attention_type=='bahdanau'):
            return tfa.seq2seq.BahdanauAttention(units=decoder_units, memory = memory, memory_sequence_length = memory_sequence_length)
        else:
            return tfa.seq2seq.LuongAttention(units=decoder_units, memory = memory, memory_sequence_length = memory_sequence_length)

    def build_initial_state(self, batch_size, encoder_state, Dtype):
        decoder_initial_state = self.rnn_cell.get_initial_state(batch_size = batch_size, dtype = Dtype)
        if self.attention_type != "none":
            decoder_initial_state = decoder_initial_state.clone(cell_state = encoder_state)
        return decoder_initial_state
    
    def call(self, inputs, initial_state):
        inputs = self.embedding(inputs)
        outputs, _, _ = self.decoder(inputs, initial_state = initial_state, sequence_length = self.batch_size*[max_decoder_seq_length-1])
        return outputs

#### Test decoder stack

In [29]:
# # Test decoder stack

# decoder = Decoder(num_decoder_tokens, 16, 128, dataset.batch_size, "LSTM", 'none')
# sample_x = tf.random.uniform((dataset.batch_size, max_decoder_seq_length))
# if decoder.attention_type != "none":
#     decoder.attention.setup_memory(sample_output)
# if decoder.layer_type == "LSTM":
#     initial_state = decoder.build_initial_state(dataset.batch_size, [sample_h, sample_c], tf.float32)
# else:
#     initial_state = decoder.build_initial_state(dataset.batch_size, sample_h, tf.float32)


# sample_decoder_outputs = decoder(sample_x, initial_state)

# print("Decoder Outputs Shape: ", sample_decoder_outputs.rnn_output.shape)

## Model Training

In [30]:
default_config = {
    "layer_type": "LSTM",
    "units": 1024,
    "embedding_dim": 32,
    "optimiser": "nadam",
    "num_encoders": 1,
    "num_decoders": 1,
    "epochs": 20,
    "dropout": 0.002,
    "batch_size": dataset.batch_size,
    "attention": "none"
}

#### Loss Function

In [31]:
def loss_function(real, pred):
    # real shape = (BATCH_SIZE, max_length_output)
    # pred shape = (BATCH_SIZE, max_length_output, tar_vocab_size )
    cross_entropy = SparseCategoricalCrossentropy(from_logits=True, reduction='none')
    loss = cross_entropy(y_true=real, y_pred=pred)
    mask = tf.logical_not(tf.math.equal(real,0))   #output 0 for y=0 else output 1
    mask = tf.cast(mask, dtype=loss.dtype)  
    loss = mask* loss
    loss = tf.reduce_mean(loss)
    return loss  

#### Accuracy Calculation

In [32]:
def calc_accuracy(real, pred):
    # real shape = (BATCH_SIZE, max_length_output)
    # pred shape = (BATCH_SIZE, max_length_output, tar_vocab_size )
    predictions = tf.cast(tf.argmax(pred, axis=2), tf.int32)
    correct_preds = tf.equal(predictions, real)
    accuracy = tf.reduce_mean(tf.cast(correct_preds, tf.float32))
    return accuracy

#### Checkpoints (Object-based saving)

In [33]:
checkpoint_dir = './checkpoints'+datetime.datetime.now().replace(microsecond=0).isoformat()

In [34]:
def printf(data):
    sys.stdout.write("\r")
    sys.stdout.write(data)
    sys.stdout.flush()

### Training

In [123]:
def fit(config, train_dataset, val_dataset, encoder = None, decoder = None):
    
    run_name = "".join(f"{a}:{b} " for (a, b) in config.items())
    print(run_name)
    wandb.run.name = run_name
    
    ## Encoder
    if encoder == None:
        encoder = Encoder(num_encoder_tokens, config["embedding_dim"], 
                        config["units"], config["batch_size"], 
                        config["layer_type"], config["dropout"])

    ## Decoder
    if decoder == None:
        decoder = Decoder(num_decoder_tokens, config["embedding_dim"], 
                        config["units"], config["batch_size"], 
                        config["layer_type"], config["attention"])
    
    ## Optimizer
    optimizer = keras.optimizers.Nadam()

    # Checkpoint
    checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                     encoder=encoder,
                                     decoder=decoder)

    EPOCHS = config["epochs"]
    BATCH_SIZE = config["batch_size"]
    steps_per_epoch = np.shape(dataset.train.input_tensor)[0]//dataset.batch_size
    val_steps_per_epoch = np.shape(val_dataset.input_tensor)[0]//dataset.batch_size

    # init matrics
    total_loss = 0
    total_accuracy = 0
    val_total_loss = 0
    val_total_accuracy = 0
    steps_count = 0
    val_steps_count = 0

    enc_hidden = encoder.initialize_hidden_state()

    for epoch in range(EPOCHS):
        start = time.time()

        print(f'Epoch {epoch + 1}')

        train_dataset.batch.shuffle(BATCH_SIZE)
        val_dataset.batch.shuffle(BATCH_SIZE)
        
        # Training Loop
        for (batch, (inp, targ)) in enumerate(train_dataset.batch.take(steps_per_epoch)):        
            batch_loss = 0
            accuracy = 0
            with GradientTape() as tape:
                enc_output, enc_h, enc_c = encoder(inp, enc_hidden)

                dec_input = targ[ : , :-1 ]       # Ignore '#' token
                real = targ[ : , 1: ]             # ignore '@' token

                if decoder.attention_type != "none":
                    # Set the AttentionMechanism object with encoder_outputs
                    decoder.attention.setup_memory(enc_output)

                # Create AttentionWrapperState as initial_state for decoder
                if decoder.layer_type == "LSTM":
                    decoder_initial_state = decoder.build_initial_state(BATCH_SIZE, [enc_h, enc_c], tf.float32)
                else:
                    decoder_initial_state = decoder.build_initial_state(BATCH_SIZE, enc_h, tf.float32)
                
                pred = decoder(dec_input, decoder_initial_state)
                logits = pred.rnn_output
                batch_loss = loss_function(real, logits)

                # Experiment for Accuracy
                accuracy = calc_accuracy(real, logits)


            variables = encoder.trainable_variables + decoder.trainable_variables
            gradients = tape.gradient(batch_loss, variables)
            optimizer.apply_gradients([
                (grad, var) 
                for (grad, var) in zip(gradients, variables) 
                if grad is not None
            ])
            # optimizer.apply_gradients(zip(gradients, variables))

            total_loss += batch_loss
            total_accuracy += accuracy
            steps_count += 1

            printf(f"Training: {(100 * batch / len(dataset.train.batch.take(steps_per_epoch))):.2f}% Accuracy: {(total_accuracy / steps_count):.4f} Loss: {(total_loss / steps_count):.4f}")
            wandb.log({"accuracy":(total_accuracy / steps_count), "loss":(total_loss / steps_count), "epochs": epoch})

        printf("\n")
        
        # Validation Loop
        for (val_batch, (inp, targ)) in enumerate(val_dataset.batch.take(steps_per_epoch)):        
            val_batch_loss = 0
            val_accuracy = 0
            
            enc_output, enc_h, enc_c = encoder(inp, enc_hidden)

            dec_input = targ[ : , :-1 ]       # Ignore '#' token
            real = targ[ : , 1: ]             # ignore '@' token

            if decoder.attention_type != "none":
                # Set the AttentionMechanism object with encoder_outputs
                decoder.attention.setup_memory(enc_output)

            # Create AttentionWrapperState as initial_state for decoder
            if decoder.layer_type == "LSTM":
                decoder_initial_state = decoder.build_initial_state(BATCH_SIZE, [enc_h, enc_c], tf.float32)
            else:
                decoder_initial_state = decoder.build_initial_state(BATCH_SIZE, enc_h, tf.float32)

            pred = decoder(dec_input, decoder_initial_state)
            logits = pred.rnn_output
            
            val_batch_loss = loss_function(real, logits)
            val_accuracy = calc_accuracy(real, logits)
            
            val_total_loss += val_batch_loss
            val_total_accuracy += val_accuracy
            val_steps_count += 1
            
            printf(f"Validating: {(100 * val_batch / len(val_dataset.batch.take(val_steps_per_epoch))):.2f}% Accuracy: {(val_total_accuracy / val_steps_count):.4f} Loss: {(val_total_loss / val_steps_count):.4f}")
            wandb.log({"val accuracy":(val_total_accuracy / val_steps_count), "val loss":(val_total_loss / val_steps_count), "epochs": epoch})

        printf("\n")
        # saving (checkpoint) the model every epochs
        checkpoint_prefix = os.path.join(checkpoint_dir, "".join(f"{a}-{b}_" for (a, b) in config.items()))
        checkpoint.save(file_prefix = checkpoint_prefix)

        print(f'Loss {(total_loss / (steps_per_epoch * (epoch+1))):.4f} Accuracy {(total_accuracy / (steps_per_epoch * (epoch+1))):.4f}')
        print(f'Val Loss {(val_total_loss / (val_steps_per_epoch * (epoch+1))):.4f} Val Accuracy {(val_total_accuracy / (val_steps_per_epoch * (epoch+1))):.4f}')
        print(f'Time taken for this epoch {(time.time() - start):.4f} sec\n')
        
    return encoder, decoder

#### Test Training method

In [36]:
# encoder, decoder = fit(default_config, dataset.train, dataset.val)

### BeamSearch

In [37]:
def beam_evaluate_word(word, encoder, decoder, config, beam_width=3):
    word = dataset.preprocess_word(word)

    inputs = [dataset.input_tokenizer.word_index[i] for i in word]
    inputs = keras.preprocessing.sequence.pad_sequences([inputs],
                                                          maxlen=max_encoder_seq_length,
                                                          padding='post')
    inputs = tf.convert_to_tensor(inputs)
    inference_batch_size = inputs.shape[0]
    result = ''

    if encoder.layer_type == "LSTM":
        enc_start_state = [tf.zeros((inference_batch_size, config["units"])), tf.zeros((inference_batch_size, config["units"]))]
    else:
        enc_start_state = tf.zeros((inference_batch_size, config["units"]))
    
    enc_out, enc_h, enc_c = encoder(inputs, enc_start_state)

    dec_h = enc_h
    dec_c = enc_c

    start_tokens = tf.fill([inference_batch_size], dataset.target_tokenizer.word_index['@'])
    end_token = dataset.target_tokenizer.word_index['#']

    enc_out = tfa.seq2seq.tile_batch(enc_out, multiplier=beam_width)
    if decoder.attention_type != "none":
        decoder.attention.setup_memory(enc_out)

    # set decoder_inital_state which is an AttentionWrapperState considering beam_width
    if decoder.layer_type == "LSTM":
        hidden_state = tfa.seq2seq.tile_batch([enc_h, enc_c], multiplier=beam_width)
    else:
        hidden_state = tfa.seq2seq.tile_batch(enc_h, multiplier=beam_width)

    decoder_initial_state = decoder.rnn_cell.get_initial_state(batch_size=beam_width*inference_batch_size, dtype=tf.float32)
    decoder_initial_state = decoder_initial_state.clone(cell_state=hidden_state)

    # Instantiate BeamSearchDecoder
    decoder_instance = tfa.seq2seq.BeamSearchDecoder(decoder.rnn_cell,beam_width=beam_width, output_layer=decoder.fc)
    decoder_embedding_matrix = decoder.embedding.variables[0]

    # The BeamSearchDecoder object's call() function takes care of everything.
    outputs, final_state, sequence_lengths = decoder_instance(decoder_embedding_matrix, start_tokens=start_tokens, end_token=end_token, initial_state=decoder_initial_state)

    final_outputs = tf.transpose(outputs.predicted_ids, perm=(0,2,1))
    beam_scores = tf.transpose(outputs.beam_search_decoder_output.scores, perm=(0,2,1))

    return final_outputs.numpy(), beam_scores.numpy()

In [38]:
def beam_transliterate(word, encoder, decoder, config):
  result, beam_scores = beam_evaluate_word(word, encoder, decoder, config, 5)
  print(result.shape, beam_scores.shape)
  for beam, score in zip(result, beam_scores):
    print(beam.shape, score.shape)
    output = dataset.target_tokenizer.sequences_to_texts(beam)
    output = [a[:a.index('#')] for a in output]
    beam_score = [a.sum() for a in score]
    print('Input: %s' % (word))
    for i in range(len(output)):
      print('{} Predicted translation: {}  {}'
            .format(i+1, "".join(output[i].split(" ")).replace("#", ""), 
                    beam_score[i]))

In [40]:
# beam_transliterate('shibobrota', encoder, decoder, test_config)

### Hyper Parameter Search

In [41]:
sweep_config = {
    "name": "Assignment 3 - Without Attention " + datetime.datetime.now().replace(microsecond=0).isoformat() ,
    "metric": "categorical_accuracy",
    "method": "random",
    "project": 'Assignment 3',
    "parameters": {
        "layer_type": {
            "values": ["GRU", "LSTM", "SimpleRNN"]
        },
        "dropout": {
            "values": [0.002, 0.2]
        },
        "units": {
            "values": [64, 256]
        },
        "embedding_dim": {
            "values": [8, 32, 128]
        },
        "optimiser": {
            "values": ["nadam"]
        },
        "epochs": {
            "values": [20]
        },
        "batch_size": {
            "values": [dataset.batch_size]
        },
        "attention": {
            "values": ["none"]
        },
    }
}

In [42]:
def sweep():

    config = wandb.config
    print(str(config))

    wandb.init(config=default_config, magic=True, project='Assignment 3', entity='iitm-cs6910-jan-may-2021-cs20m059-cs20m007')
    config = wandb.config
    
    encoder, decoder = fit(config, dataset.train, dataset.val)

In [None]:
# sweep_id = wandb.sweep(sweep_config, project='Assignment 3', entity='iitm-cs6910-jan-may-2021-cs20m059-cs20m007')

In [None]:
# wandb.agent("2ie0acfv", function=sweep, project='Assignment 3', entity='iitm-cs6910-jan-may-2021-cs20m059-cs20m007')

### Evaluate Words

In [130]:
def evaluate_word(word, encoder, decoder, config):
    word = dataset.preprocess_word(word)

    inputs = [dataset.input_tokenizer.word_index[i] for i in word]
    inputs = keras.preprocessing.sequence.pad_sequences([inputs],
                                                          maxlen=max_encoder_seq_length,
                                                          padding='post')
    inputs = tf.convert_to_tensor(inputs)
    inference_batch_size = inputs.shape[0]
    result = ''

    if encoder.layer_type == "LSTM":
        enc_start_state = [tf.zeros((inference_batch_size, config["units"])), tf.zeros((inference_batch_size, config["units"]))]
    else:
        enc_start_state = tf.zeros((inference_batch_size, config["units"]))
    
    enc_out, enc_h, enc_c = encoder(inputs, enc_start_state)

    dec_h = enc_h
    dec_c = enc_c

    start_tokens = tf.fill([inference_batch_size], dataset.target_tokenizer.word_index['@'])
    end_token = dataset.target_tokenizer.word_index['#']

    greedy_sampler = tfa.seq2seq.GreedyEmbeddingSampler()

    # Instantiate BasicDecoder object
    decoder_instance = tfa.seq2seq.BasicDecoder(cell=decoder.rnn_cell, sampler=greedy_sampler, output_layer=decoder.fc)
    
    # Setup Memory in decoder stack
    if decoder.attention_type != "none":
        decoder.attention.setup_memory(enc_out)

    # set decoder_initial_state
    if decoder.layer_type == "LSTM":
        decoder_initial_state = decoder.build_initial_state(inference_batch_size, [enc_h, enc_c], tf.float32)
    else:
        decoder_initial_state = decoder.build_initial_state(inference_batch_size, enc_h, tf.float32)

    decoder_embedding_matrix = decoder.embedding.variables[0]

    outputs, _, _ = decoder_instance(decoder_embedding_matrix, start_tokens = start_tokens, end_token= end_token, initial_state = decoder_initial_state)
    return outputs.sample_id.numpy()


def transliterate(word, encoder, decoder, config):
    result = evaluate_word(word, encoder, decoder, config)
    print(result)
    result = dataset.target_tokenizer.sequences_to_texts(result)
    pred = "".join(result[0].split(" ")).replace("#", "")
    print(f'Input: {word}')  
    print(f'Predicted translation: {pred}')
    return pred

In [None]:
# transliterate("hello", encoder, decoder, default_config)

# Accuracy on the test set

In [129]:
def save_predictions(data_frame, encoder, decoder, config, name):
    accuracy_count = 0;
    with open(name, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(["INPUT", "PREDICTION", "TRUE", "DISTANCE"])
        for i, (inp, trg) in enumerate(zip(data_frame[1], data_frame[0])): 
            pred = transliterate(inp, encoder, decoder, config)
            distance = nltk.jaccard_distance(set(trg), set(pred))
            writer.writerow([inp, pred, trg, distance])
            if pred == trg:
                accuracy_count += 1
            if (i+1) % 10 == 0 or i+1 == data_frame.size:
                wandb.log({"test accuracy":(accuracy_count / (i+1))})

    return accuracy_count/data_frame.size

In [74]:
test_config = {
    "layer_type": "LSTM",
    "units": 128,
    "embedding_dim": 128,
    "optimiser": "adam",
    "num_encoders": 1,
    "num_decoders": 1,
    "epochs": 5,
    "dropout": 0.002,
    "batch_size": dataset.batch_size,
    "attention": "luong"
}

In [75]:
# encoder, decoder = fit(test_config, dataset.val, dataset.val, encoder, decoder)

layer_type:LSTM units:128 embedding_dim:128 optimiser:adam num_encoders:1 num_decoders:1 epochs:5 dropout:0.002 batch_size:128 attention:luong 
Epoch 1
Training: 9.57% Accuracy: 0.2641 Loss: 1.2072
Validating: 97.06% Accuracy: 0.2662 Loss: 1.2055
Loss 0.1190 Accuracy 0.0260
Val Loss 1.2055 Val Accuracy 0.2662
Time taken for this epoch 27.8995 sec

Epoch 2
Training: 9.57% Accuracy: 0.2651 Loss: 1.2062
Validating: 97.06% Accuracy: 0.2664 Loss: 1.2053
Loss 0.1189 Accuracy 0.0261
Val Loss 1.2053 Val Accuracy 0.2664
Time taken for this epoch 27.1254 sec

Epoch 3
Training: 9.57% Accuracy: 0.2659 Loss: 1.2056
Validating: 97.06% Accuracy: 0.2673 Loss: 1.2044
Loss 0.1188 Accuracy 0.0262
Val Loss 1.2044 Val Accuracy 0.2673
Time taken for this epoch 31.4241 sec

Epoch 4
Training: 9.57% Accuracy: 0.2666 Loss: 1.2047
Validating: 97.06% Accuracy: 0.2679 Loss: 1.2037
Loss 0.1187 Accuracy 0.0263
Val Loss 1.2037 Val Accuracy 0.2679
Time taken for this epoch 29.2467 sec

Epoch 5
Training: 9.57% Accuracy

In [135]:
transliterate("dip", encoder, decoder, test_config)

[[20  7 17  2]]
Input: dip
Predicted translation: दिप


'दिप'