In [331]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import tensorflow as tf
from tensorflow.keras.layers import Embedding, LSTM, Dense
import pickle
from tensorflow.keras.preprocessing.sequence import pad_sequences
from nltk.translate.bleu_score import sentence_bleu
import plotly.figure_factory as ff
import time

import warnings
warnings.filterwarnings("ignore")

In [332]:
#read the required files

train = pd.read_csv('../data/processed/train.csv', index_col=[0])
validation = pd.read_csv('../data/processed/validation.csv', index_col=[0])
test = pd.read_csv('../data/processed/test.csv', index_col=[0])

with open('../model/tokenizer.pkl', 'rb') as file:
    tokenizer = pickle.load(file)

In [333]:
train

Unnamed: 0,encoder_input,decoder_input,decoder_output
0,"<Er... nope. Sad to say,I'm quite a loner. So ...","<Nope. Sad to say, I'm quite a loner. So just ...","Nope. Sad to say, I'm quite a loner. So just p..."
1,<Hope so... call ya when i'm better :)>,<Hope so. I will call you when I'm better.,Hope so. I will call you when I'm better.>
2,<I ' ll meet you before tne letter Than.>,<I'll meet you before the lecture then.,I'll meet you before the lecture then.>
3,"<Ok, since everyone give notice make it on Mon...","<Ok, since everyone can make it on Monday. Let...","Ok, since everyone can make it on Monday. Let'..."
4,<Hm... Wawa suggested we go watch movie then d...,<Wawa suggested that we go watch movie then di...,Wawa suggested that we go watch movie then dis...
...,...,...,...
7095,"<N probleme! Close fremds call me hammy. Haha,...",<No problem! Close friends call me hammy. Haha...,"No problem! Close friends call me hammy. Haha,..."
7096,<Do uou have ICQ or MSN? Wath ius your emai? Y...,<Do you have ICQ or MSN? What is your email? Y...,Do you have ICQ or MSN? What is your email? Yi...
7097,<yupz...if u cant den i help u collect lor...>,"<Yes, if you can't then I'll help you to collect.","Yes, if you can't then I'll help you to collect.>"
7098,<I don ' at shink soooo. Just bring moey and y...,<I don't think so. Just bring money and yourself.,I don't think so. Just bring money and yourself.>


In [334]:
class Dataset:
    def __init__(self, data, tokenizer, max_length):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __padding__(self, sequence):
        return pad_sequences(sequence, maxlen = self.max_length, dtype = 'int32', padding = 'post')
    
    def __getitem__(self, i):
        self.encoder_input_sequence = self.tokenizer['informal'].texts_to_sequences([self.data['encoder_input'].values[i]])
        self.decoder_input_sequence = self.tokenizer['normalized'].texts_to_sequences([self.data['decoder_input'].values[i]])
        self.decoder_output_sequence = self.tokenizer['normalized'].texts_to_sequences([self.data['decoder_output'].values[i]])
        return self.__padding__(self.encoder_input_sequence), self.__padding__(self.decoder_input_sequence), self.__padding__(self.decoder_output_sequence)
        

    def __len__(self):
        return len(self.encoder_inps)

In [335]:
class Dataloader(tf.keras.utils.Sequence):    
    def __init__(self, dataset, batch_size = 1):
        self.dataset = dataset
        self.batch_size = batch_size
        self.indexes = np.arange(len(self.dataset.data['encoder_input'].values))

    def __getitem__(self, i):
        data = [self.dataset[idx] for idx in range(i * self.batch_size, (i + 1) * self.batch_size)]
        batch = [np.squeeze(np.stack(samples, axis = 1), axis = 0) for samples in zip(*data)]
        return tuple([[batch[0],batch[1]],batch[2]])

    def __len__(self):
        return len(self.indexes) // self.batch_size

    def on_epoch_end(self):
        self.indexes = np.random.permutation(self.indexes)

In [336]:
class Encoder(tf.keras.Model):
    
    def __init__(self, inp_vocab_size, embedding_dim, lstm_size, input_length):
        super().__init__()
        self.lstm_size = lstm_size
        embedding_params = {'input_dim': inp_vocab_size,
                            'output_dim': embedding_dim,
                            'embeddings_initializer' : tf.keras.initializers.RandomNormal(mean = 0, stddev = 1, seed = 42),
                            'input_length' : input_length, 
                            'mask_zero' : True}
        
        lstm_params = {'units':self.lstm_size, 
                      'return_state' : True, 
                      'return_sequences' : True,
                      'kernel_initializer' : tf.keras.initializers.glorot_uniform(seed = 42),
                      'recurrent_initializer' : tf.keras.initializers.orthogonal(seed = 42)}
        
  
        self.embedding = Embedding(**embedding_params)
        self.lstm1 = LSTM(**lstm_params)
        self.lstm2 = LSTM(**lstm_params)

    def call(self, input):
        self.encoder_output, self.hidden_state, self.current_state = self.lstm1(self.embedding(input[0]), initial_state = input[1])
        return self.lstm2(self.encoder_output, [self.hidden_state, self.current_state])
    
    def initialize_states(self, batch_size):
      return tf.zeros([batch_size, self.lstm_size]), tf.zeros([batch_size, self.lstm_size])
      

In [337]:
class Attention(tf.keras.Model):

    def __init__(self, lstm_size, scoring_function):
        super(Attention, self).__init__()
        self.scoring_function = scoring_function
       
        self.V = tf.keras.layers.Dense(1)
        self.W = tf.keras.layers.Dense(lstm_size)   
        self.W1 = tf.keras.layers.Dense(lstm_size)
        self.W2 = tf.keras.layers.Dense(lstm_size)
        self.V1 = tf.keras.layers.Dense(1)
    
    def call(self,input):        
        score = self.V(tf.linalg.matmul(input[1], tf.expand_dims(input[0], 1), transpose_b=True)) if self.scoring_function == 'dot' else (
            tf.keras.layers.Dot(axes=(2, 1))([self.W(input[1]), tf.expand_dims(input[0], axis = 2)]) if self.scoring_function == 'general' else 
            self.V1(tf.nn.tanh(self.W1(tf.expand_dims(input[0], 1)) + self.W2(input[1])))
        )
        return tf.reduce_sum(tf.nn.softmax(score, axis=1) * input[1], axis=1), tf.nn.softmax(score, axis=1)


In [338]:
class Step_Decoder(tf.keras.Model):

    def __init__(self, out_vocab_size, embedding_dim, input_length, lstm_size, scoring_function, embedding_matrix = None):
        # Initialize the parameters
        super().__init__()
        self.attention = Attention(lstm_size, scoring_function)
        
        embedding_params = {'input_dim' : out_vocab_size, 'output_dim' : embedding_dim,
                                       'embeddings_initializer' : tf.keras.initializers.RandomNormal(mean = 0, stddev = 1, seed = 42),
                                       'input_length' : input_length, 'mask_zero' : True}
        lstm_params = {'units':lstm_size, 'return_state' : True, 'return_sequences' : True, 
                            'kernel_initializer' : tf.keras.initializers.glorot_uniform(seed = 42), 
                            'recurrent_initializer' : tf.keras.initializers.orthogonal(seed = 42)}
        
        if embedding_matrix:
            embedding_params['embeddings_initializer'] = tf.keras.initializers.Constant(embedding_matrix)
            embedding_params['trainable'] = False
        
        self.embedding = Embedding(**embedding_params)
        self.lstm1 = LSTM(**lstm_params)
        self.lstm2 = LSTM(**lstm_params)
        self.dense = Dense(out_vocab_size)


    def call(self, input):

        encoder_hidden = input[2]
        encoder_current = input[3]
        dec_output, encoder_hidden, encoder_current = self.lstm1(tf.concat([tf.expand_dims(self.attention([encoder_hidden, input[1]])[0], 1), 
                                                                            self.embedding(input[0])], axis = -1), [encoder_hidden, encoder_current])
        dec_output, encoder_hidden, encoder_current = self.lstm2(dec_output, [encoder_hidden, encoder_current])
        output = self.dense(tf.reshape(dec_output, (-1, dec_output.shape[2])))
        
        return output, encoder_hidden, encoder_current

In [339]:
class Decoder(tf.keras.Model):

    def __init__(self, out_vocab_size, embedding_dim, input_length, lstm_size, scoring_function, embedding_matrix = None):
        super().__init__()
        self.timestepdecoder = Step_Decoder(out_vocab_size, embedding_dim, input_length,
                                                lstm_size, scoring_function, embedding_matrix)
        
    
    @tf.function
    def call(self, input):
        outputs = tf.TensorArray(tf.float32, size = tf.shape(input[0])[1])
        for timestep in range(tf.shape(input[0])[1]):
            outputs = outputs.write(timestep, self.timestepdecoder([input[0][:, timestep:timestep+1], input[1], input[2], input[3]])[0])
        
        return tf.transpose(outputs.stack(), [1,0,2])

In [340]:
class Encoder_Decoder(tf.keras.Model):
    
    def __init__(self, input_length, inp_vocab_size, out_vocab_size, lstm_size, scoring_function, batch_size, embedding_dim, embedding_matrix = None):
    
        super().__init__()
    
        encoder_args = {'inp_vocab_size' : inp_vocab_size + 1, 'embedding_dim' : embedding_dim, 'lstm_size' : lstm_size, 'input_length' : input_length}
        decoder_args = {'out_vocab_size' : out_vocab_size + 1, 'embedding_dim' : embedding_dim, 'lstm_size' : lstm_size,
                               'scoring_function' : scoring_function, 'input_length' : input_length, 'embedding_matrix' : embedding_matrix}
        self.batch_size = batch_size
        self.encoder = Encoder(**encoder_args)
        self.decoder = Decoder(**decoder_args)
    
    def call(self, data):
        encoder_output, encoder_hidden, encoder_current = self.encoder([data[0], self.encoder.initialize_states(self.batch_size)])
        return self.decoder([data[1], encoder_output, encoder_hidden, encoder_current])

In [341]:
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits = True, reduction = 'none')

@tf.function
def loss_function(real, pred):
    # Refer https://www.tensorflow.org/tutorials/text/nmt_with_attention
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)
    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask
    return tf.reduce_mean(loss_)

In [342]:
def create_tensorboard_cb(model):
    root_logdir = os.path.join(os.curdir, model)
    run_id = time.strftime("run_%Y_%m_%d-%H_%M_%S")
    logdir = os.path.join(root_logdir, run_id)
    return tf.keras.callbacks.TensorBoard(logdir, histogram_freq = 1)

In [352]:

UNITS = 200
EPOCHS = 1
BATCH_SIZE = 64
MAX_LEN = 200 
TRAIN_STEPS = train.shape[0]//BATCH_SIZE
VALID_STEPS = validation.shape[0]//BATCH_SIZE
train_dataset = Dataset(train, tokenizer, MAX_LEN)
validation_dataset  = Dataset(validation, tokenizer, MAX_LEN)
train_dataloader = Dataloader(train_dataset, BATCH_SIZE)
validation_dataloader = Dataloader(validation_dataset, BATCH_SIZE)

cb_params = {'monitor' : 'val_loss', 
                  'factor' : 0.5, 
                  'verbose' : 1, 
                  'patience' : 1, 
                  'min_lr' : 0.0001}
cb_stopper_cb = {'monitor' : 'val_loss', 
                 'patience' : 3, 
                 'verbose' : 1, 
                 'restore_best_weights' : True}

model_dot  = Encoder_Decoder(input_length = MAX_LEN, inp_vocab_size = len(tokenizer['informal'].word_index.keys()),
                                            out_vocab_size = len(tokenizer['normalized'].word_index.keys()), lstm_size = UNITS,
                                            scoring_function = 'dot', batch_size = BATCH_SIZE,
                                            embedding_dim = len(tokenizer['normalized'].word_index.keys()), embedding_matrix = None)


optimizer = tf.keras.optimizers.Adam(0.01)
model_dot.compile(optimizer = optimizer, loss = loss_function)

learning_rate_cb = tf.keras.callbacks.ReduceLROnPlateau(**cb_params)
tensorboard_cb = create_tensorboard_cb("Model_Dot_logs")
stopper_cb = tf.keras.callbacks.EarlyStopping(**cb_stopper_cb)
checkpoint_cb = tf.keras.callbacks.ModelCheckpoint("Model_Dot",
                                                    save_best_only = True, save_weights_only = False)

model_dot.fit(train_dataloader, steps_per_epoch = TRAIN_STEPS, epochs = EPOCHS,
            callbacks = [learning_rate_cb, tensorboard_cb, stopper_cb, checkpoint_cb],
            validation_data = validation_dataloader, validation_steps = VALID_STEPS)
model_dot.summary()



In [344]:

def predict(input_sentence, model):
    inputs = tf.convert_to_tensor(tf.keras.preprocessing.sequence.pad_sequences([[tokenizer['informal'].word_index.get(i, 0) 
                                                                                  for i in input_sentence]], maxlen = MAX_LEN, padding = 'post'))
    sentence = ''
    enc_out, state_h, state_c = model.encoder([inputs, (tf.zeros([1, UNITS]), tf.zeros([1, UNITS]))])
    dec_input = tf.expand_dims([tokenizer['normalized'].word_index['<']], 0)
    for _ in range(MAX_LEN):
        output, state_h, state_c = model.decoder.timestepdecoder([dec_input, enc_out, state_h, state_c])
        character = tokenizer['normalized'].index_word.get(tf.argmax(output[0]).numpy(), '')
        if character == '>':
            break
        else:
            sentence += character
            dec_input = tf.expand_dims([tf.argmax(output[0]).numpy()], 0)
    return sentence

In [345]:
def post_processing(s):
    if s.startswith('<'):
        s = s[len('<'):]
    if s.endswith('>'):
        s = s[:-len('>')]
    return s

def predictor(s):
    return predict(s, model_dot)

def convert_formals(s):
    return [s.split()]

def convert_predictions(s):
    return s.split()

test['informals'] = test['encoder_input'].apply(post_processing)
test['formals'] = test['decoder_input'].apply(post_processing)
test['predictions'] = test['informals'].apply(predictor)
test['formals'] = test['formals'].apply(convert_formals)
test['predictions'] = test['predictions'].apply(convert_predictions)

bleu_scores = []
i = 0

while i < (len(test)):
    bleu_scores.append(sentence_bleu(test['formals'].iloc[i], test['predictions'].iloc[i]))
    i = i + 1

print('Average BLEU score for the predictions:', np.mean(bleu_scores))

Average BLEU score for the predictions: 3.3823417436993433e-233


In [348]:
fig = ff.create_distplot([bleu_scores], ['Count'])
fig.update_layout(title= 'BLEU Score')
fig.show()

In [349]:
model_general  = Encoder_Decoder(input_length = MAX_LEN, inp_vocab_size = len(tokenizer['informal'].word_index.keys()),
                                            out_vocab_size = len(tokenizer['normalized'].word_index.keys()), lstm_size = UNITS,
                                            scoring_function = 'general', batch_size = BATCH_SIZE,
                                            embedding_dim = len(tokenizer['normalized'].word_index.keys()), embedding_matrix = None)


optimizer = tf.keras.optimizers.Adam(0.01)
model_general.compile(optimizer = optimizer, loss = loss_function)

learning_rate_cb = tf.keras.callbacks.ReduceLROnPlateau(**cb_params)
tensorboard_cb = create_tensorboard_cb("Model_General_logs")
stopper_cb = tf.keras.callbacks.EarlyStopping(**cb_stopper_cb)
checkpoint_cb = tf.keras.callbacks.ModelCheckpoint("Model_General",
                                                    save_best_only = True, save_weights_only = False)

model_general.fit(train_dataloader, steps_per_epoch = TRAIN_STEPS, epochs = EPOCHS,
            callbacks = [learning_rate_cb, tensorboard_cb, stopper_cb, checkpoint_cb],
            validation_data = validation_dataloader, validation_steps = VALID_STEPS)
model_general.summary()

def predictor(s):
    return predict(s, model_general)

test['informals'] = test['encoder_input'].apply(post_processing)
test['formals'] = test['decoder_input'].apply(post_processing)
test['predictions'] = test['informals'].apply(predictor)
test['formals'] = test['formals'].apply(convert_formals)
test['predictions'] = test['predictions'].apply(convert_predictions)

bleu_scores = []
i = 0

while i < (len(test)):
    bleu_scores.append(sentence_bleu(test['formals'].iloc[i], test['predictions'].iloc[i]))
    i = i + 1

print('Average BLEU score for the predictions:', np.mean(bleu_scores))

In [None]:
fig = ff.create_distplot([bleu_scores], ['Count'])
fig.update_layout(title= 'BLEU Score')
fig.show()

In [None]:
model_concat  = Encoder_Decoder(input_length = MAX_LEN, inp_vocab_size = len(tokenizer['informal'].word_index.keys()),
                                            out_vocab_size = len(tokenizer['normalized'].word_index.keys()), lstm_size = UNITS,
                                            scoring_function = 'concat', batch_size = BATCH_SIZE,
                                            embedding_dim = len(tokenizer['normalized'].word_index.keys()), embedding_matrix = None)


optimizer = tf.keras.optimizers.Adam(0.01)
model_concat.compile(optimizer = optimizer, loss = loss_function)

learning_rate_cb = tf.keras.callbacks.ReduceLROnPlateau(**cb_params)
tensorboard_cb = create_tensorboard_cb("Model_Concat_logs")
stopper_cb = tf.keras.callbacks.EarlyStopping(**cb_stopper_cb)
checkpoint_cb = tf.keras.callbacks.ModelCheckpoint("Model_Concat",
                                                    save_best_only = True, save_weights_only = False)

model_concat.fit(train_dataloader, steps_per_epoch = TRAIN_STEPS, epochs = EPOCHS,
            callbacks = [learning_rate_cb, tensorboard_cb, stopper_cb, checkpoint_cb],
            validation_data = validation_dataloader, validation_steps = VALID_STEPS)
model_concat.summary()

def predictor(s):
    return predict(s, model_concat)

test['informals'] = test['encoder_input'].apply(post_processing)
test['formals'] = test['decoder_input'].apply(post_processing)
test['predictions'] = test['informals'].apply(predictor)
test['formals'] = test['formals'].apply(convert_formals)
test['predictions'] = test['predictions'].apply(convert_predictions)

bleu_scores = []
i = 0

while i < (len(test)):
    bleu_scores.append(sentence_bleu(test['formals'].iloc[i], test['predictions'].iloc[i]))
    i = i + 1

print('Average BLEU score for the predictions:', np.mean(bleu_scores))

In [None]:
fig = ff.create_distplot([bleu_scores], ['Count'])
fig.update_layout(title= 'BLEU Score')
fig.show()

In [None]:
scores = np.array(bleu_scores)
indices = (np.argsort(scores)).tolist()
worst = indices[0]
best = indices[-1]

print('Best Predictions:')
print('Informal Input: ',test['informals'].iloc[indices[-1]])
print('Expected Output: ',test['formals'].iloc[indices[-1]][0])
print('Predicted Output: ',test['predictions'].iloc[indices[-1]])
print('Bleu Score of Prediction: ',scores[indices[-1]])
print("\n")

print('Worst Predictions:')
print('Informal Input: ',test['informals'].iloc[indices[0]])
print('Expected Output: ',test['formals'].iloc[indices[0]][0])
print('Predicted Output: ',test['predictions'].iloc[indices[0]])
print('Bleu Score of Prediction: ',scores[indices[0]])
print("\n")

Best Predictions:
Informal Input: How you doing?
Expected Output: How you doing?
Predicted Output:  How you doing?
Bleu Score of Prediction : 1.0


Worst Predictions:
Informal Input : Kid's shop selling clothes izit...
Expected Output : Kid's shop is selling clothes, is it?
Predicted Output : I'm still to some to see you all not.
Bleu Score of Prediction : 0.00
