In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import tensorflow as tf
from tensorflow.keras.layers import Embedding, LSTM, Dense
import pickle
from tensorflow.keras.preprocessing.sequence import pad_sequences
from nltk.translate.bleu_score import sentence_bleu
import plotly.figure_factory as ff
import time

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Load the train / validation / test splits and the pickled tokenizer mapping.

train, validation, test = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/processed/train.csv',
                     '../data/processed/validation.csv',
                     '../data/processed/test.csv')
)

# NOTE: unpickling can execute arbitrary code; only load a tokenizer file from a trusted source.
with open('../model/tokenizer.pkl', 'rb') as tokenizer_file:
    tokenizer = pickle.load(tokenizer_file)

In [123]:
class Dataset:
    '''
    Row-wise access to a text-pair frame as padded integer sequences.

    data       : frame with 'encoder_input', 'decoder_input' and 'decoder_output' columns.
    tokenizer  : mapping with 'informal' and 'normalized' entries, each exposing texts_to_sequences.
    max_length : length handed to pad_sequences as maxlen for every sequence.
    '''
    def __init__(self, data, tokenizer, max_length):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __padding__(self, sequence):
        '''Run pad_sequences on a list of sequences: int32, zeros appended ('post'), maxlen = max_length.'''
        return pad_sequences(sequence, maxlen = self.max_length, dtype = 'int32', padding = 'post')

    def __getitem__(self, i):
        '''
        Return (encoder_input, decoder_input, decoder_output) for row i, each padded to max_length.

        The encoder text goes through the 'informal' tokenizer, both decoder texts through the
        'normalized' one. The unpadded sequences of the last accessed row are kept on the instance.
        '''
        row_texts = {column: [self.data[column].values[i]]
                     for column in ('encoder_input', 'decoder_input', 'decoder_output')}
        self.encoder_input_sequence = self.tokenizer['informal'].texts_to_sequences(row_texts['encoder_input'])
        self.decoder_input_sequence = self.tokenizer['normalized'].texts_to_sequences(row_texts['decoder_input'])
        self.decoder_output_sequence = self.tokenizer['normalized'].texts_to_sequences(row_texts['decoder_output'])
        return (self.__padding__(self.encoder_input_sequence),
                self.__padding__(self.decoder_input_sequence),
                self.__padding__(self.decoder_output_sequence))

    def __len__(self):
        '''Number of rows in the underlying data.'''
        return len(self.data)

In [124]:
class Dataloader(tf.keras.utils.Sequence):
    '''
    Keras Sequence that groups Dataset rows into batches of
    ([encoder_input, decoder_input], decoder_output).

    dataset    : object supporting dataset[idx] -> triple of arrays with a leading axis of size 1,
                 and exposing dataset.data['encoder_input'] to determine the number of rows.
    batch_size : rows per batch; a trailing partial batch is dropped (see __len__).
    '''
    def __init__(self, dataset, batch_size = 1):
        super().__init__()
        self.dataset = dataset
        self.batch_size = batch_size
        self.indexes = np.arange(len(self.dataset.data['encoder_input'].values))

    def __getitem__(self, i):
        '''Assemble batch number i from the current ordering in self.indexes.'''
        # Look rows up through self.indexes so the permutation drawn in on_epoch_end
        # actually changes which rows end up in each batch.
        batch_indexes = self.indexes[i * self.batch_size : (i + 1) * self.batch_size]
        data = [self.dataset[idx] for idx in batch_indexes]
        # every sample carries a leading axis of size 1; joining along it yields one row per sample
        batch = [np.concatenate(samples, axis = 0) for samples in zip(*data)]
        return ([batch[0], batch[1]], batch[2])

    def __len__(self):
        '''Number of full batches per epoch.'''
        return len(self.indexes) // self.batch_size

    def on_epoch_end(self):
        '''Draw a new random row order for the next epoch.'''
        self.indexes = np.random.permutation(self.indexes)

In [125]:
class Encoder(tf.keras.Model):
    '''
    Embedding followed by two stacked LSTM layers.

    call expects [token_ids, initial_state] and returns what the second LSTM returns:
    its output sequence plus encoder_final_hidden_state and encoder_final_current_state.
    '''
    def __init__(self, input_vocab_size, lstm_size, input_length):
        super().__init__()
        self.input_vocab_size = input_vocab_size
        self.lstm_size = lstm_size
        self.input_length = input_length

        # The embedding width is set to the vocabulary size; index 0 is masked (mask_zero).
        self.embedding = Embedding(
            input_dim = self.input_vocab_size,
            output_dim = self.input_vocab_size,
            embeddings_initializer = tf.keras.initializers.RandomNormal(mean = 0, stddev = 1, seed = 42),
            input_length = self.input_length,
            mask_zero = True,
        )

        # Both LSTM layers are built from one settings dict, so they share the initializer objects.
        shared_lstm_kwargs = dict(
            units = self.lstm_size,
            return_state = True,
            return_sequences = True,
            kernel_initializer = tf.keras.initializers.glorot_uniform(seed = 42),
            recurrent_initializer = tf.keras.initializers.orthogonal(seed = 42),
        )
        self.lstm1 = LSTM(**shared_lstm_kwargs)
        self.lstm2 = LSTM(**shared_lstm_kwargs)

    def call(self, input):
        token_ids, initial_state = input[0], input[1]
        embedded = self.embedding(token_ids)
        # the first layer's outputs and states are kept on the instance and seed the second layer
        self.encoder_output, self.hidden_state, self.current_state = self.lstm1(embedded, initial_state = initial_state)
        return self.lstm2(self.encoder_output, [self.hidden_state, self.current_state])
      
  
class Decoder(tf.keras.Model):
    '''
    Embedding followed by a single LSTM.

    call expects [token_ids, initial_state] and returns the LSTM's output sequence
    together with its two final states.
    '''
    def __init__(self, output_vocab_size, lstm_size, input_length):
        super().__init__()
        self.output_vocab_size = output_vocab_size
        self.lstm_size = lstm_size
        self.input_length = input_length

        # The embedding width is set to the vocabulary size; index 0 is masked (mask_zero).
        self.embedding = Embedding(
            input_dim = self.output_vocab_size,
            output_dim = self.output_vocab_size,
            input_length = self.input_length,
            mask_zero = True,
        )
        self.lstm = LSTM(units = self.lstm_size, return_sequences = True, return_state = True)

    def call(self, input):
        token_ids, initial_state = input[0], input[1]
        embedded = self.embedding(token_ids)
        return self.lstm(embedded, initial_state = initial_state)
      
class Encoder_Decoder(tf.keras.Model):
    '''
    Sequence-to-sequence model: Encoder -> Decoder -> softmax Dense over the output vocabulary.

    input_vocab_size, output_vocab_size : vocabulary sizes; 1 is added to each before the
        sub-models and the Dense layer are built (presumably to leave index 0 for padding - confirm).
    lstm_size    : number of LSTM units in the encoder and the decoder.
    input_length : sequence length forwarded to both sub-models.
    batch_size   : stored on the instance for compatibility; call sizes its zero state
        from the batch it actually receives.
    '''
    def __init__(self, input_vocab_size, output_vocab_size, lstm_size, input_length, batch_size):
        super().__init__()
        self.lstm_size = lstm_size
        self.input_length = input_length
        self.input_vocab_size = input_vocab_size + 1
        self.output_vocab_size = output_vocab_size + 1
        self.batch_size = batch_size
        self.encoder = Encoder(input_vocab_size = self.input_vocab_size, lstm_size = self.lstm_size, input_length = self.input_length)
        self.decoder = Decoder(output_vocab_size = self.output_vocab_size, lstm_size = self.lstm_size, input_length = self.input_length)
        self.dense   = Dense(self.output_vocab_size, activation = 'softmax')

    def call(self, data):
        '''
        data[0] is the encoder input batch, data[1] the decoder input batch.
        Returns the Dense (softmax) output computed on the decoder's output sequence.
        '''
        # Size the zero state from the incoming batch instead of the fixed self.batch_size,
        # so batches of any size (e.g. a single sentence) can be passed through the model.
        current_batch_size = tf.shape(data[0])[0]
        initial_state = (tf.zeros([current_batch_size, self.lstm_size]),
                         tf.zeros([current_batch_size, self.lstm_size]))
        _, encoder_hidden, encoder_current = self.encoder([data[0], initial_state])
        decoder_output, _, _ = self.decoder([data[1], [encoder_hidden, encoder_current]])
        return self.dense(decoder_output)

In [126]:
def create_tensorboard_cb(model):
    '''
    Build a TensorBoard callback (histogram_freq = 1) that logs to
    ./<model>/run_<timestamp>, where the timestamp is taken when this function is called.

    model : name of the directory (under the current directory) that collects the runs.
    '''
    run_id = time.strftime("run_%Y_%m_%d-%H_%M_%S")
    logdir = os.path.join(os.curdir, model, run_id)
    return tf.keras.callbacks.TensorBoard(logdir, histogram_freq = 1)

In [5]:
# --- hyper-parameters -------------------------------------------------------
BATCH_SIZE, MAX_LEN, UNITS, EPOCHS = 64, 200, 256, 60
TRAIN_STEPS = len(train) // BATCH_SIZE
VALIDATION_STEPS = len(validation) // BATCH_SIZE

# settings for ReduceLROnPlateau: halve the learning rate after 1 epoch without val_loss improvement
cb_params = dict(monitor = 'val_loss', factor = 0.5, verbose = 1, patience = 1, min_lr = 0.0001)
# settings for EarlyStopping: stop after 3 such epochs and roll back to the best weights
cb_stopper_cb = dict(monitor = 'val_loss', patience = 3, verbose = 1, restore_best_weights = True)

# --- data pipelines ---------------------------------------------------------
train_dataset = Dataset(train, tokenizer, MAX_LEN)
validation_dataset  = Dataset(validation, tokenizer, MAX_LEN)
train_dataloader = Dataloader(train_dataset, BATCH_SIZE)
validation_dataloader = Dataloader(validation_dataset, BATCH_SIZE)

# --- model ------------------------------------------------------------------
model  = Encoder_Decoder(
    input_vocab_size = len(tokenizer['informal'].word_index),
    output_vocab_size = len(tokenizer['normalized'].word_index),
    lstm_size = UNITS,
    input_length = MAX_LEN,
    batch_size = BATCH_SIZE,
)
optimizer = tf.keras.optimizers.Adam(learning_rate = 0.01)
model.compile(optimizer = optimizer, loss = 'sparse_categorical_crossentropy')

# --- callbacks --------------------------------------------------------------
learning_rate_cb = tf.keras.callbacks.ReduceLROnPlateau(**cb_params)
tensorboard_cb = create_tensorboard_cb("Enc_Dec_logs")
stopper_cb = tf.keras.callbacks.EarlyStopping(**cb_stopper_cb)
checkpoint_cb = tf.keras.callbacks.ModelCheckpoint("Enc_Dec", save_best_only = True, save_weights_only = False)

# --- training ---------------------------------------------------------------
model.fit(
    train_dataloader,
    steps_per_epoch = TRAIN_STEPS,
    epochs = EPOCHS,
    callbacks = [learning_rate_cb, tensorboard_cb, stopper_cb, checkpoint_cb],
    validation_data = validation_dataloader,
    validation_steps = VALIDATION_STEPS,
)
model.summary()

In [128]:
def predict(input_sentence, model):
    '''
    Greedy decoding of one input with the trained encoder/decoder.

    input_sentence : iterable of tokens known to tokenizer['informal'] (iterating a str
                     yields its characters); unknown tokens are mapped to index 0.
    model          : model exposing .encoder, .decoder and .dense.

    Decoding starts from the '<' token and stops when '>' is predicted or after
    MAX_LEN steps. Returns the predicted tokens joined into one string.
    '''
    input_index = tokenizer['informal'].word_index
    output_tokenizer = tokenizer['normalized']

    inputs = [input_index.get(token, 0) for token in input_sentence]
    inputs = tf.convert_to_tensor(tf.keras.preprocessing.sequence.pad_sequences([inputs], maxlen = MAX_LEN, padding = 'post'))

    # encode once with a zero initial state for a batch of one
    _, state_hidden, state_current = model.encoder([inputs, (tf.zeros([1, UNITS]), tf.zeros([1, UNITS]))])
    decoder_hidden = [state_hidden, state_current]
    decoder_input = tf.expand_dims([output_tokenizer.word_index['<']], 0)

    pieces = []
    for _ in range(MAX_LEN):
        predictions, state_hidden, state_current = model.decoder([decoder_input, decoder_hidden])
        decoder_hidden = [state_hidden, state_current]

        # use the output layer by name rather than by its position in model.layers
        predicted_id = tf.argmax(model.dense(predictions)[0][0]).numpy()
        predicted_token = output_tokenizer.index_word.get(predicted_id, '')

        if predicted_token == '>':
            break
        pieces.append(predicted_token)
        # feed the prediction back in as the next decoder input
        decoder_input = tf.expand_dims([predicted_id], 0)

    return ''.join(pieces)

In [135]:
def post_processing(s):
    '''Return s without one leading '<' and one trailing '>' marker, when present.'''
    begin = len('<') if s.startswith('<') else 0
    trimmed = s[begin:]
    return trimmed[:-len('>')] if trimmed.endswith('>') else trimmed

def predictor(s):
    '''Run predict on s with the notebook-level model.'''
    return predict(input_sentence = s, model = model)

def convert_formals(s):
    '''Whitespace-tokenize s and wrap the token list in an outer list (one reference).'''
    tokens = s.split()
    references = [tokens]
    return references

def convert_predictions(s):
    '''Whitespace-tokenize s into a flat list of tokens.'''
    tokens = s.split()
    return tokens

# Strip the '<' / '>' markers, predict every test sentence, then tokenize
# references (wrapped in a list) and predictions for BLEU.
test['informals'] = test['encoder_input'].apply(post_processing)
test['formals'] = test['decoder_input'].apply(post_processing).apply(convert_formals)
test['predictions'] = test['informals'].apply(predictor).apply(convert_predictions)

# NOTE(review): sentence_bleu is called with its default weights and no smoothing function -
# confirm this is the intended metric for short sentences.
bleu_scores = [
    sentence_bleu(references, hypothesis)
    for references, hypothesis in zip(test['formals'], test['predictions'])
]

print('Average BLEU score for the predictions:', np.mean(bleu_scores))

Average BLEU score for the predictions: 5.025140393849922e-80


In [1]:
# Distribution plot of the per-sentence BLEU scores.
fig = ff.create_distplot(hist_data = [bleu_scores], group_labels = ['Count'])
fig.update_layout(title = 'BLEU Score')
fig.show()

In [4]:
# Rank the test sentences by BLEU score (ascending) and show the two extremes.
scores = np.array(bleu_scores)
indices = (np.argsort(scores)).tolist()
worst = indices[0]
best = indices[-1]

# One report layout for both cases, driven by the best / worst positions computed above.
for heading, idx in (('Best Predictions:', best), ('Worst Predictions:', worst)):
    print(heading)
    print('Informal Input: ',test['informals'].iloc[idx])
    print('Expected Output: ',test['formals'].iloc[idx][0])
    print('Predicted Output: ',test['predictions'].iloc[idx])
    print('Bleu Score of Prediction: ',scores[idx])
    print("\n")

Best Predictions:
Informal Input: How you doing?
Expected Output: How you doing?
Predicted Output:  How you doing?
Bleu Score of Prediction : 1.0


Worst Predictions:
Informal Input : Kid's shop selling clothes izit...
Expected Output : Kid's shop is selling clothes, is it?
Predicted Output : I'm still to some to see you all not.
Bleu Score of Prediction : 0.00
