In [None]:
# Install gensim and nltk with pip in quiet mode.
# NOTE(review): versions are not pinned, so the installed gensim/nltk releases
# depend on when this cell is run.
!pip install -q gensim
!pip install -q nltk

In [None]:
from time import time
import pandas as pd
import numpy as np
from gensim.models import KeyedVectors
import re
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from keras.layers import Input, Embedding, LSTM, Lambda
import keras.backend as K
from keras.optimizers import Adadelta
from keras.callbacks import ModelCheckpoint
import tensorflow as tf
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
import os
import itertools
import datetime



In [None]:
from google.colab import drive


# This will prompt for authorization.
# Mounts Google Drive at /content/drive/.
# NOTE(review): the data paths used later are relative ('drive/My Drive/...'),
# so they presumably rely on the working directory being /content — confirm.
drive.mount('/content/drive/')

In [None]:

# Locations of the datasets, the pretrained embeddings and the model output
# directory on the mounted drive.
# NOTE(review): TRAIN_CSV and TEST_CSV are open text-mode file handles (latin
# encoding), not paths. Nothing in the visible code closes them, and a handle
# can only be read once, so re-reading requires re-running this cell.
TRAIN_CSV = open('drive/My Drive/AML/Project/Dataset/train.csv',encoding='latin')
TEST_CSV = open('drive/My Drive/AML/Project/Dataset/test.csv',encoding='latin')
EMBEDDING_FILE = 'drive/My Drive/AML/GoogleNews-vectors-negative300.bin.gz'
MODEL_SAVING_DIR = 'drive/My Drive/AML/Project/'

In [None]:
import nltk
# Download the NLTK 'stopwords' corpus.
nltk.download('stopwords')
# Parse both datasets as tab-separated text from the file handles opened above.
train_df = pd.read_csv(TRAIN_CSV,delimiter='\t')
test_df = pd.read_csv(TEST_CSV,delimiter='\t')
# Show the training column names as a quick sanity check of the parse.
print(train_df.columns)


In [None]:
# Text-cleaning rules adapted from Kaggle discussions.

# Ordered (pattern, replacement) pairs applied by text_to_word_list.
# Order matters: specific contractions ("what's", "can't") must be expanded
# before the generic "'s" / "n't" rules, and the lone apostrophe is stripped
# only after every contraction rule has run.
_TEXT_SUBSTITUTIONS = (
    # Blank out everything outside the allowed set. Note that "+-=" inside the
    # class is a character *range* ('+' .. '='), so ':', ';' and '<' are also
    # kept; the ':' rule further down relies on that.
    (r"[^A-Za-z0-9^,!.\/'+-=]", " "),
    (r"what's", "what is "),
    (r"\'s", " "),
    (r"\'ve", " have "),
    (r"can't", "cannot "),
    (r"n't", " not "),
    (r"i'm", "i am "),
    (r"\'re", " are "),
    (r"\'d", " would "),
    (r"\'ll", " will "),
    (r",", " "),
    (r"\.", " "),
    (r"!", " ! "),
    (r"\/", " "),
    (r"\^", " ^ "),
    (r"\+", " + "),
    (r"\-", " - "),
    (r"\=", " = "),
    (r"'", " "),
    # "50k" -> "50000"
    (r"(\d+)(k)", r"\g<1>000"),
    (r":", " : "),
    (r" e g ", " eg "),
    (r" b g ", " bg "),
    (r" u s ", " american "),
    # "1990s" -> "1990". The previous pattern r"\0s" was an octal NUL escape
    # followed by "s" and therefore never matched anything.
    (r"0s\b", "0"),
    # Keep the surrounding spaces; replacing with a bare "911" glued the
    # neighbouring words onto the number ("on 9 11 in" -> "on911in").
    (r" 9 11 ", " 911 "),
    (r"e - mail", "email"),
    (r"j k", "jk"),
    (r"\s{2,}", " "),
)


def text_to_word_list(text):
    """Normalise a piece of text and split it into a list of word tokens.

    The input is converted with ``str()`` (so ``None``/NaN become the words
    "none"/"nan"), lower-cased, passed through the ordered regex rules in
    ``_TEXT_SUBSTITUTIONS`` and finally split on whitespace.

    Args:
        text: Any object; its string representation is tokenised.

    Returns:
        list[str]: The cleaned tokens, possibly empty.
    """
    text = str(text).lower()

    for pattern, replacement in _TEXT_SUBSTITUTIONS:
        text = re.sub(pattern, replacement, text)

    return text.split()


In [None]:
# English stop words from NLTK, stored as a set for fast membership tests.
stops = set(stopwords.words('english'))

In [None]:

# word -> integer id mapping, filled in while the questions are encoded.
vocabulary = dict()
# id -> word list. Slot 0 is taken by the '<unk>' placeholder, so real words
# receive ids starting at 1 (ids are assigned as len(inverse_vocabulary)).
inverse_vocabulary = ['<unk>'] 
# Load the pretrained word vectors from the binary word2vec file.
word2vec = KeyedVectors.load_word2vec_format(EMBEDDING_FILE, binary=True)


In [None]:

questions_cols = ['question1', 'question2']

# Replace every question's text with a list of integer word ids, growing
# `vocabulary` / `inverse_vocabulary` whenever a new word is seen.
# NOTE: the text columns are overwritten in place, so this cell is not
# idempotent — reload the dataframes before running it a second time.
for dataset in [train_df, test_df]:
    encoded = {question: [] for question in questions_cols}

    # Walk the rows in order, question1 before question2 within each row, so
    # ids are handed out in the same order as a row-by-row pass.
    for texts in zip(*(dataset[question] for question in questions_cols)):
        for question, text in zip(questions_cols, texts):
            q2n = []  # word ids for this question
            for word in text_to_word_list(text):
                # Drop a stop word only when it has no pretrained vector.
                # `in word2vec` works on gensim 3.x and 4.x alike, whereas
                # the `.vocab` attribute was removed in gensim 4.
                if word in stops and word not in word2vec:
                    continue

                if word not in vocabulary:
                    vocabulary[word] = len(inverse_vocabulary)
                    inverse_vocabulary.append(word)
                q2n.append(vocabulary[word])

            encoded[question].append(q2n)

    # Assign each column in one go; DataFrame.set_value was removed in
    # pandas 1.0. An object-dtype Series keeps one Python list per cell.
    for question in questions_cols:
        dataset[question] = pd.Series(encoded[question], index=dataset.index, dtype=object)
            
# Width of each embedding row.
# NOTE(review): presumably matches the 300-d GoogleNews vectors — confirm.
embedding_dim = 300

# One row per word id plus row 0. Rows start as standard-normal noise, so
# words without a pretrained vector keep a random embedding.
embeddings = np.random.randn(len(vocabulary) + 1, embedding_dim)
embeddings[0] = 0  # row 0 (the '<unk>' placeholder slot) is all zeros

# Copy the pretrained vector for every vocabulary word that has one.
# `in` / `[]` are supported by both gensim 3.x and 4.x KeyedVectors, unlike
# `.vocab` (removed in gensim 4) and `.word_vec` (deprecated in gensim 4).
for word, index in vocabulary.items():
    if word in word2vec:
        embeddings[index] = word2vec[word]

# The full pretrained model is no longer needed; drop the reference.
del word2vec

In [None]:
# Longest encoded question across both question columns of both datasets;
# used as the common padded sequence length.
max_seq_length = max(
    frame[column].map(len).max()
    for frame in (train_df, test_df)
    for column in ('question1', 'question2')
)

In [None]:

# Split each dataframe into its two question columns (inputs) and the
# 'is_duplicate' column (labels).
trainX = train_df[questions_cols]
trainY = train_df['is_duplicate']
testX = test_df[questions_cols]
# NOTE(review): requires the test file to carry an 'is_duplicate' column — confirm.
testY = test_df['is_duplicate']


In [None]:
# Model inputs keyed by side: question1 feeds 'left', question2 feeds 'right'.
X_train = {
    'left': trainX.question1,
    'right': trainX.question2,
}
X_test = {
    'left': testX.question1,
    'right': testX.question2,
}

# Labels as plain arrays.
Y_train = trainY.values
Y_test = testY.values

# Pad every id sequence to max_seq_length, train split first, then test.
for dataset in (X_train, X_test):
    for side in ('left', 'right'):
        dataset[side] = pad_sequences(dataset[side], maxlen=max_seq_length)

In [None]:

# Sanity checks: both sides were padded to the same shape, and there is exactly
# one label per training pair.
assert X_train['left'].shape == X_train['right'].shape
assert len(X_train['left']) == len(Y_train)

In [None]:

# Model / training hyper-parameters.
n_hidden = 50  # passed to LSTM(...) below
gradient_clipping_norm = 1.25  # passed to Adadelta as clipnorm
batch_size = 64  # passed to fit(...)
n_epoch = 25  # number of epochs requested from fit(...)

def exponent_neg_manhattan_distance(left, right):
    """Return exp(-sum(|left - right|)), summing over axis 1 with keepdims=True.

    Identical inputs give 1; the value shrinks towards 0 as the summed
    absolute difference grows.
    """
    l1_distance = K.sum(K.abs(left - right), axis=1, keepdims=True)
    return K.exp(-l1_distance)

# Two integer-id inputs of length max_seq_length, one per question.
left_input = Input(shape=(max_seq_length,), dtype='int32')
right_input = Input(shape=(max_seq_length,), dtype='int32')

# A single embedding layer initialised from `embeddings` and frozen
# (trainable=False); the same layer object is applied to both inputs.
embedding_layer = Embedding(len(embeddings), embedding_dim, weights=[embeddings], input_length=max_seq_length, trainable=False)

encoded_left = embedding_layer(left_input)
encoded_right = embedding_layer(right_input)

# One LSTM instance is called on both sides, so both branches share its weights.
shared_lstm = LSTM(n_hidden)

left_output = shared_lstm(encoded_left)
right_output = shared_lstm(encoded_right)

# Combine the two LSTM outputs with exponent_neg_manhattan_distance; the
# declared output shape is (batch, 1).
malstm_distance = Lambda(function=lambda x: exponent_neg_manhattan_distance(x[0], x[1]),output_shape=lambda x: (x[0][0], 1))([left_output, right_output])

# Model mapping the two question inputs to the similarity output.
malstm = Model([left_input, right_input], [malstm_distance])


In [None]:
# Compile with Adadelta (gradient norm clipped at gradient_clipping_norm),
# mean-squared-error loss and accuracy as the reported metric.
malstm.compile(
    optimizer=Adadelta(clipnorm=gradient_clipping_norm, ),
    loss='mean_squared_error', metrics=['accuracy']
)


In [None]:
# Look up the TPU address from the COLAB_TPU_ADDR environment variable.
# TPU_ADDRESS is always defined afterwards: a 'grpc://...' URL when the
# variable is set, otherwise None. Previously it was left undefined when the
# variable was missing, so any later reference raised NameError.
device_name = os.environ.get('COLAB_TPU_ADDR')
if device_name is None:
    TPU_ADDRESS = None
    print('TPU not found')
else:
    TPU_ADDRESS = 'grpc://' + device_name
    print('Found TPU at: {}'.format(TPU_ADDRESS))

In [None]:
# Convert the compiled Keras model into a TPU model bound to TPU_ADDRESS.
# This needs TPU_ADDRESS to hold the address found by the detection cell.
# NOTE(review): the visible training cell calls malstm.fit(...), not
# tpu_model.fit(...), so tpu_model appears to be unused — confirm intent.
# NOTE(review): relies on the tf.contrib namespace being available in the
# installed TensorFlow — confirm the runtime version.
tpu_model = tf.contrib.tpu.keras_to_tpu_model(
    malstm,
    strategy=tf.contrib.tpu.TPUDistributionStrategy(
        tf.contrib.cluster_resolver.TPUClusterResolver(TPU_ADDRESS)
    )
)

In [None]:

# Train the model and report the wall-clock duration.
training_start_time = time()

# `epochs` is the Keras 2 keyword; the Keras 1 spelling `nb_epoch` is
# deprecated and rejected by newer releases.
# The test split is passed as validation data, so the per-epoch validation
# metrics are computed on it.
malstm_trained = malstm.fit(
    [X_train['left'], X_train['right']],
    Y_train,
    batch_size=batch_size,
    epochs=n_epoch,
    validation_data=([X_test['left'], X_test['right']], Y_test),
)

print("Training time finished.\n{} epochs in {}".format(n_epoch, datetime.timedelta(seconds=time()-training_start_time)))