# Import Libraries

In [None]:
# Python
import sys
import os
import re
import numpy as np
import pandas as pd
from time import time
from sys import getsizeof
import itertools

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# TQDM
from tqdm import tqdm_notebook
from tqdm import tqdm

# sklearn
import sklearn
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
from sklearn.model_selection import train_test_split

# Keras
import tensorflow as tf
import keras
import keras.backend as K
from keras.models import Model

# Keras Pre-processing
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Keras Layers
from keras.layers import Input, Dense, Embedding, LSTM, Lambda, Dropout

# Keras Optimizer
from keras.optimizers import Adadelta
from keras.constraints import max_norm

# Keras Model Saving
from keras.models import load_model
from keras.callbacks import ModelCheckpoint, EarlyStopping

# Load Data

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
# Add path to your project directory
PROJ_DIR = ''

# Other cells refer to the project root as CURR_DIR; define it here so those
# references resolve instead of raising a NameError.
CURR_DIR = PROJ_DIR

In [None]:
# Add path to your data directory
# (train.csv and test.csv are read from this location)
DATA_DIR = CURR_DIR + 'data/'

## Load Train Data

In [None]:
# Read the training set from DATA_DIR into a DataFrame
train_data = pd.read_csv(DATA_DIR + 'train.csv')

In [None]:
# Preview the first rows of the training set
train_data.head()

In [None]:
# (rows, columns) of the training set
train_data.shape

In [None]:
# Label Distribution: number of rows per value of the 'is_duplicate' column
train_data['is_duplicate'].value_counts()

In [None]:
# Features: the two question columns. Take an explicit copy so that the
# column assignments made on X in later cells modify X itself rather than a
# slice of train_data (which triggers pandas' SettingWithCopyWarning).
X = train_data[['question1', 'question2']].copy()

# Target: the duplicate label
Y = train_data['is_duplicate']

## Load Test Data

In [None]:
# Read the test set from DATA_DIR into a DataFrame
test_data = pd.read_csv(DATA_DIR + 'test.csv')

In [None]:
# (rows, columns) of the test set
test_data.shape

In [None]:
# Preview the first rows of the test set
test_data.head()

# Preprocess 

## Max Sequence Length

In [None]:
def find_max_seq_len(corpus):
    """Return the largest whitespace-token count among the items of corpus.

    Each item is passed through ``str()`` before being split, so non-string
    values (for example NaN read from a CSV column) are measured by their
    string form.

    Args:
        corpus: iterable of questions.

    Returns:
        The maximum number of whitespace-separated tokens in any item, or 0
        when corpus is empty.
    """
    return max((len(str(question).split()) for question in corpus), default=0)

In [None]:
# The data-driven maximum below is disabled (it sits inside a string literal);
# a fixed length of 250 is used instead.
'''
MAX_SEQUENCE_LENGTH = max(find_max_seq_len(list(train_data['question1'])),
                          find_max_seq_len(list(train_data['question2'])),
                          find_max_seq_len(list(test_data['question1'])),
                          find_max_seq_len(list(test_data['question2'])))
'''
# Length every question is padded to when indices are generated
MAX_SEQUENCE_LENGTH = 250
print(MAX_SEQUENCE_LENGTH)

## Tokenizer

In [None]:
# List of train_data question1 (str() turns missing values into 'nan')
train_question1 = [str(ques) for ques in train_data['question1']]

# List of train_data question2
train_question2 = [str(ques) for ques in train_data['question2']]

# List of all the unique questions.
# Sorted so the corpus order is identical in every session: the iteration
# order of a set of strings changes with Python's hash randomization, and
# anything derived from corpus order (e.g. how the tokenizer orders equally
# frequent words) would otherwise not be reproducible.
train_corpus = sorted(set(train_question1 + train_question2))
print("Total unique question in the train data: {}".format(len(train_corpus)))

In [None]:
# Keep all the punctuations  -- Since, GloVe has embeddings for them
# (an empty `filters` string means no characters are stripped by the tokenizer)
tokenizer = Tokenizer(filters='')

In [None]:
# Add space before punctuations
# Matches exactly one ASCII punctuation character. re.escape keeps '-', ']',
# '\' and '^' literal inside the character class, so no accidental range
# (such as one that also covers the space character) can be formed.
_PUNCTUATION_RE = re.compile(
    '([' + re.escape('!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~') + '])'
)


def add_space_punctuations(text):
    """Return text with a space inserted on both sides of each punctuation mark.

    Only ASCII punctuation characters are padded; whitespace and all other
    characters are left untouched, so punctuation becomes a separate token
    when the result is split on whitespace.

    Args:
        text: the string to process.

    Returns:
        A new string in which every punctuation character ``p`` has been
        replaced by ``' p '``.
    """
    return _PUNCTUATION_RE.sub(r' \1 ', text)

In [None]:
# Add Space before punctuations for entire train corpus
# (each entry is replaced in place, position by position)
for position, question in enumerate(train_corpus):
    train_corpus[position] = add_space_punctuations(question)

In [None]:
# Inspect the first five processed questions
train_corpus[:5]

In [None]:
# Fit the tokenizer on entire train corpus
tokenizer.fit_on_texts(train_corpus)
# Mapping from each token to its integer index, as learned by the tokenizer
word_index = tokenizer.word_index
print("Found %s unique tokens: " % len(word_index))

In [None]:
# Add space before punctuations for train data
for column in ('question1', 'question2'):
    # Cast to str first so every cell (including missing values) is text
    X[column] = X[column].astype(str)
    X[column] = X[column].apply(add_space_punctuations)

In [None]:
# Preview the questions after punctuation spacing
X.head()

In [None]:
# Add space before punctuations for test data
for column in ('question1', 'question2'):
    # Cast to str first so every cell (including missing values) is text
    test_data[column] = test_data[column].astype(str)
    test_data[column] = test_data[column].apply(add_space_punctuations)

In [None]:
# Preview the test questions after punctuation spacing
test_data.head()

# Generate Indices

In [None]:
def generate_indices(question1, question2, max_sequence_len):
    """Turn two collections of questions into fixed-length lists of word indices.

    Uses the module-level ``tokenizer`` to map each question to a sequence of
    indices, then ``pad_sequences`` to bring every sequence to
    ``max_sequence_len``.

    Args:
        question1: iterable with the first question of each pair.
        question2: iterable with the second question of each pair.
        max_sequence_len: length passed to ``pad_sequences`` as ``maxlen``.

    Returns:
        A tuple ``(indices1, indices2)`` of nested Python lists, one inner
        list per question.
    """
    def _encode(questions):
        # Make sure it is a list of strings before tokenizing
        as_text = list(map(str, questions))
        encoded = tokenizer.texts_to_sequences(as_text)
        return pad_sequences(encoded, maxlen=max_sequence_len).tolist()

    first = _encode(question1)
    second = _encode(question2)
    return first, second

## Train Indices

In [None]:
# Encode both training question columns as padded index lists
train_question1_indices, train_question2_indices = generate_indices(list(X['question1']),
                                                                    list(X['question2']),
                                                                    MAX_SEQUENCE_LENGTH)

In [None]:
# Add the indices to the train dataframe

# Reset index so the rows line up positionally with the generated index lists
X.reset_index(drop=True, inplace=True)

# Assign each column in one step. Writing to the row objects yielded by
# DataFrame.iterrows() is not guaranteed to modify the frame (the rows may be
# copies), so a per-row loop can silently leave the columns unfilled.
# Wrapping in a Series keeps every cell a Python list of indices.
X['question1_indices'] = pd.Series(train_question1_indices, index=X.index)
X['question2_indices'] = pd.Series(train_question2_indices, index=X.index)

In [None]:
# Preview the training frame including the index columns
X.head()

## Test Indices

In [None]:
# Encode both test question columns as padded index lists
test_question1_indices, test_question2_indices = generate_indices(list(test_data['question1']),
                                                                    list(test_data['question2']),
                                                                    MAX_SEQUENCE_LENGTH)

In [None]:
# Add the indices to the test dataframe

# Reset index so the rows line up positionally with the generated index lists
test_data.reset_index(drop=True, inplace=True)

# Assign each column in one step, sized by test_data itself (not by the
# training frame). Writing to the row objects yielded by DataFrame.iterrows()
# is not guaranteed to modify the frame, so whole-column assignment is used.
# Wrapping in a Series keeps every cell a Python list of indices.
test_data['question1_indices'] = pd.Series(test_question1_indices, index=test_data.index)
test_data['question2_indices'] = pd.Series(test_question2_indices, index=test_data.index)

## Reverse Map the indices

In [None]:
# Create a reverse dictionary: integer index -> token
reverse_word_map = dict((index, token) for token, index in word_index.items())

In [None]:
def indices_to_text(indices_list, word_map=None):
    """Map a sequence of word indices back to their tokens, skipping zeros.

    Args:
        indices_list: iterable of integer indices; entries equal to 0 are
            dropped.
        word_map: optional ``{index: token}`` dictionary. Defaults to the
            module-level ``reverse_word_map``.

    Returns:
        A list with one token per non-zero index. Indices missing from the
        mapping yield ``None``.
    """
    if word_map is None:
        word_map = reverse_word_map
    # Compare by value: `is not 0` relies on integer object identity, which
    # does not hold for e.g. numpy integers.
    return [word_map.get(i) for i in indices_list if i != 0]

In [None]:
# Round-trip check on the first training question: processed text, its
# padded indices, and the tokens recovered from those indices
print(X['question1'][0])
print(X['question1_indices'][0])
print(indices_to_text(X['question1_indices'][0]))

# Prepare embedding layer

## Load the GloVe embedding vector

In [None]:
# Display the base directory that DATA_DIR was built from
CURR_DIR

In [None]:
# Add path to your GloVe Directory
GLOVE_DIR = ''

# Maps each word to its 300-dimensional float32 vector
glove_embedding_index = {}

# Open with an explicit encoding so parsing does not depend on the platform
# default (assumes the GloVe text file is UTF-8 encoded), and use a context
# manager so the file is closed even if a line fails to parse.
with open(os.path.join(GLOVE_DIR, 'glove.6B.300d.txt'), encoding='utf-8') as f:
    for line in f:
        # Each line is: <word> <coef_1> ... <coef_n>
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        glove_embedding_index[word] = coefs

print("Found %s word vectors" % len(glove_embedding_index))

## Create embedding matrix

In [None]:
# Vocabulary Size: number of distinct tokens in the tokenizer's word_index
VOCAB_SIZE = len(word_index)
print("Vocabulary size is: {}".format(VOCAB_SIZE))

In [None]:
# Start from random vectors so that words without a GloVe entry keep a
# random embedding; one extra row is reserved for index 0
embedding_matrix = np.random.rand(VOCAB_SIZE + 1, 300)

# Padding will be ignored
embedding_matrix[0] = 0

# Overwrite the row of every token that GloVe has a vector for
for token, row in word_index.items():
    if token in glove_embedding_index:
        embedding_matrix[row] = glove_embedding_index[token]

# Train-Val Split

In [None]:
# Hold out 10% of the rows for validation; random_state makes the split repeatable
X_train, X_val, Y_train, Y_val = train_test_split(X, Y, test_size=0.1, random_state=42)

In [None]:
# Sizes of the feature splits
print(X_train.shape)
print(X_val.shape)

# Sizes of the label splits
print(Y_train.shape)
print(Y_val.shape)

In [None]:
# Convert labels to their numpy representation
# (.values extracts the underlying array from each pandas Series)
Y_train = Y_train.values
Y_val = Y_val.values

# Define Training Model

In [None]:
def exponent_neg_manhattan_distance(left, right):
    """Return exp(-d), where d is the sum of |left - right| along axis 1.

    The reduced axis is kept (``keepdims=True``), so the result has a size-1
    axis in place of axis 1.
    """
    l1_distance = K.sum(K.abs(left - right), axis=1, keepdims=True)
    return K.exp(-l1_distance)

def train_MaLSTM_model(embedding_pretrained, VOCAB_SIZE):
    """Build (but do not compile or fit) the siamese MaLSTM model.

    Both inputs go through the same frozen embedding layer and the same LSTM;
    the model output is ``exponent_neg_manhattan_distance`` of the two LSTM
    outputs.

    Reads the module-level ``MAX_SEQUENCE_LENGTH`` (input length) and
    ``n_hidden`` (LSTM units), which must be defined before this is called.

    Args:
        embedding_pretrained: initial weight matrix for the embedding layer;
            expected to have ``VOCAB_SIZE + 1`` rows and 300 columns to match
            the layer's ``input_dim`` / ``output_dim``.
        VOCAB_SIZE: number of tokens in the vocabulary (index 0 excluded).

    Returns:
        A Keras ``Model`` taking ``[left_input, right_input]`` and producing
        the similarity score.
    """
    # Input Layers
    left_input = Input(shape=(MAX_SEQUENCE_LENGTH,))
    right_input = Input(shape=(MAX_SEQUENCE_LENGTH,))

    # Embedding Layer
    # trainable=False keeps the pretrained vectors fixed during training
    embedding_layer = Embedding(input_dim=VOCAB_SIZE+1, output_dim=300, input_length = MAX_SEQUENCE_LENGTH, weights=[embedding_pretrained], trainable=False)

    # Embedded version of the inputs
    encoded_left = embedding_layer(left_input)
    encoded_right = embedding_layer(right_input)

    # Since this is a siamese network, both the sides share the same LSTM
    shared_lstm = LSTM(n_hidden)

    left_output = shared_lstm(encoded_left)
    right_output = shared_lstm(encoded_right)

    # Calculate the distance as defined by the MaLSTM model
    malstm_distance = Lambda(function=lambda x: exponent_neg_manhattan_distance(x[0], x[1]))([left_output, right_output])

    # Pack it all up into a model
    malstm_model = Model([left_input, right_input], [malstm_distance])

    return malstm_model 

# Train MaLSTM Model

### Model Hyper-Parameter

In [None]:
# Number of units of the shared LSTM
n_hidden = 50
# Passed to the optimizer as `clipnorm`
gradient_clipping_norm = 3
# Mini-batch size used when fitting
batch_size = 64
# Number of training epochs
n_epoch = 10

In [None]:
# Adadelta optimizer with gradient clipping by norm
# (all other optimizer settings are left at the library defaults)
optimizer = Adadelta(clipnorm=gradient_clipping_norm)

### Model Definition

In [None]:
# Build the (not yet compiled) siamese network from the pretrained embeddings
malstm_model = train_MaLSTM_model(embedding_matrix, VOCAB_SIZE)

In [None]:
# Print the layer-by-layer summary of the model
malstm_model.summary()

### Model Compilation

In [None]:
# Train the similarity score against the label with mean squared error;
# accuracy is reported as an additional metric
malstm_model.compile(loss='mean_squared_error', optimizer=optimizer, metrics=['accuracy'])

### Model Fit

In [None]:
# Path prefix for saved model files; it is concatenated directly with the
# file name, so a non-empty value should end with a path separator
MODEL_DIR = ''

In [None]:
# Create a checkpoint
# Only the epoch with the highest monitored value is kept on disk.
# NOTE(review): the monitored key must match the metric name the installed
# Keras version reports for validation accuracy -- confirm it is 'val_accuracy'.
filepath = MODEL_DIR + 'malstm_best.hdf5'
checkpoint = ModelCheckpoint(filepath, monitor='val_accuracy', verbose=1, save_best_only=True, mode='max')
callbacks_list = [checkpoint]

In [None]:
# Stack the per-row index lists into 2-D arrays, one array per model input
train_inputs = [np.asarray(X_train['question1_indices'].tolist()),
                np.asarray(X_train['question2_indices'].tolist())]
val_inputs = [np.asarray(X_val['question1_indices'].tolist()),
              np.asarray(X_val['question2_indices'].tolist())]

# `epochs` is the current name of the argument; the legacy `nb_epoch` spelling
# is deprecated and rejected by newer Keras releases.
malstm_trained = malstm_model.fit(x=train_inputs, y=Y_train,
                                  batch_size=batch_size, epochs=n_epoch, callbacks=callbacks_list, verbose=2,
                                  validation_data=(val_inputs, Y_val))

### Save Model

In [None]:
# Serialize model to JSON
# (this stores the architecture only; no weights are written here)
malstm_model_json = malstm_model.to_json()

with open(MODEL_DIR + 'malstm_model_json.json', 'w') as json_file:
    json_file.write(malstm_model_json)

print('Saved MaLSTM model architecture to the disk')

### Load Model

In [None]:
# model_from_json is not among the notebook's top-level imports
from keras.models import model_from_json

# Load architecture
with open(MODEL_DIR + 'malstm_model_json.json', 'r') as json_file:
    model_json = json_file.read()

# The serialized Lambda layer refers to the distance helper by name, so the
# helper is supplied explicitly to make it resolvable when the layer is rebuilt.
malstm_model = model_from_json(
    model_json,
    custom_objects={'exponent_neg_manhattan_distance': exponent_neg_manhattan_distance})

# Load weights from the best checkpoint written during training
malstm_model.load_weights(MODEL_DIR + 'malstm_best.hdf5')
print('Loaded MaLSTM model from the disk')