# MaLSTM on Kaggle's Quora Question Pairs

In [0]:
from google.colab import drive
drive.mount('/gdrive')
#!pip install Keras==2.1.0

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).


This notebook is about implementing the MaLSTM model (http://www.mit.edu/~jonasm/info/MuellerThyagarajan_AAAI16.pdf) on Kaggle's Quora Question Pairs data.

Blog post containing a broader explanation about the network can be found in the following link https://medium.com/@eliorcohen/implementing-malstm-on-kaggles-quora-question-pairs-competition-8b31b0b16a07


# CODE

First, lets import all the necessary packages

In [0]:
from time import time
import pandas as pd
import numpy as np
from gensim.models import KeyedVectors
import re
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns

import itertools
import datetime

from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from keras.layers import Input, Embedding, LSTM, Lambda
import keras.backend as K
from keras.optimizers import Adadelta
from keras.callbacks import ModelCheckpoint
from sklearn.model_selection import train_test_split
!pip install nltk
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [0]:
!wget -c "https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz" 

--2019-04-30 02:20:59--  https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.138.125
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.138.125|:443... connected.
HTTP request sent, awaiting response... 416 Requested Range Not Satisfiable

    The file is already fully retrieved; nothing to do.



In [0]:
!ls /gdrive/My\ Drive/quora

 drop				    weights-improvement-04-0.76.hdf5
 lda				    weights-improvement-04-0.80.hdf5
'test_processed (1).csv'	    weights-improvement-05-0.77.hdf5
 test_processed.csv		    weights-improvement-05-0.80.hdf5
 train.csv			    weights-improvement-05-0.81.hdf5
'train_processed (1).csv'	    weights-improvement-06-0.77.hdf5
 train_processed.csv		    weights-improvement-06-0.81.hdf5
 vocab.txt			    weights-improvement-07-0.78.hdf5
 weights-improvement-01-0.72.hdf5   weights-improvement-07-0.81.hdf5
 weights-improvement-01-0.76.hdf5   weights-improvement-08-0.81.hdf5
 weights-improvement-01-0.77.hdf5   weights-improvement-08-0.82.hdf5
 weights-improvement-02-0.74.hdf5   weights-improvement-09-0.81.hdf5
 weights-improvement-02-0.78.hdf5   weights-improvement-09-0.82.hdf5
 weights-improvement-02-0.79.hdf5   weights-improvement-11-0.82.hdf5
 weights-improvement-03-0.76.hdf5   weights-improvement-14-0.82.hdf5
 weights-improvement-03-0.79.hdf5   wemb
 weights-improvement-03-0.80.hdf5


Global variables

In [0]:
# File paths
TRAIN_CSV = '/gdrive/My Drive/quora/train.csv'
EMBEDDING_FILE = 'GoogleNews-vectors-negative300.bin.gz'
MODEL_SAVING_DIR = '/gdrive/My Drive/quora'

Create embedding matrix

In [0]:
# Load training and test set
train_df = pd.read_csv(TRAIN_CSV)
train_df,test_df= train_test_split(train_df,test_size=0.2,shuffle=train_df.is_duplicate.values)

stops = set(stopwords.words('english'))

def text_to_word_list(text):
    ''' Pre process and convert texts to a list of words '''
    text = str(text)
    text = text.lower()

    # Clean the text
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)

    text = text.split()
   
    return text

# Prepare embedding
vocabulary = dict()
inverse_vocabulary = ['<unk>']  # '<unk>' will never be used, it is only a placeholder for the [0, 0, ....0] embedding
word2vec = KeyedVectors.load_word2vec_format(EMBEDDING_FILE, binary=True)

questions_cols = ['question1', 'question2']
import pickle as pk
# Iterate over the questions only of both training and test datasets
for dataset in [train_df, test_df]:
    for index, row in dataset.iterrows():
   
        # Iterate through the text of both questions of the row
        for question in questions_cols:

            q2n = []  # q2n -> question numbers representation
            for word in text_to_word_list(row[question]):

                # Check for unwanted words
                if word in stops and word not in word2vec.vocab:
                    continue

                if word not in vocabulary:
                    vocabulary[word] = len(inverse_vocabulary)
                    q2n.append(len(inverse_vocabulary))
                    inverse_vocabulary.append(word)
                else:
                    q2n.append(vocabulary[word])

            # Replace questions as word to question as number representation
            dataset.set_value(index, question, q2n)

with open(MODEL_SAVING_DIR+'/vocab.txt','wb') as f:
  pk.dump(vocabulary,f)
  
embedding_dim = 300
embeddings = 1 * np.random.randn(len(vocabulary) + 1, embedding_dim)  # This will be the embedding matrix
embeddings[0] = 0  # So that the padding will be ignored

# Build the embedding matrix
for word, index in vocabulary.items():
    if word in word2vec.vocab:
        embeddings[index] = word2vec.word_vec(word)

del word2vec



Prepare training and validation data

In [0]:
max_seq_length = max(train_df.question1.map(lambda x: len(x)).max(),
                     train_df.question2.map(lambda x: len(x)).max(),
                     test_df.question1.map(lambda x: len(x)).max(),
                     test_df.question2.map(lambda x: len(x)).max())

print('max_seq={}'.format(max_seq_length))
# Split to train validation
validation_size = 40000
training_size = len(train_df) - validation_size

X = train_df[questions_cols]
Y = train_df['is_duplicate']

X_train, X_validation, Y_train, Y_validation = X,test_df[questions_cols],Y,test_df['is_duplicate']#train_test_split(X, Y, test_size=validation_size)

# Split to dicts
X_train = {'left': X_train.question1, 'right': X_train.question2}
X_validation = {'left': X_validation.question1, 'right': X_validation.question2}
X_test = {'left': test_df.question1, 'right': test_df.question2}

# Convert labels to their numpy representations
Y_train = Y_train.values
Y_validation = Y_validation.values

# Zero padding
for dataset, side in itertools.product([X_train, X_validation], ['left', 'right']):
    dataset[side] = pad_sequences(dataset[side], maxlen=max_seq_length)

# Make sure everything is ok
assert X_train['left'].shape == X_train['right'].shape
assert len(X_train['left']) == len(Y_train)

max_seq=212


Build the model

In [0]:
!pip install keras-self-attention

In [0]:
# ## with attention network

# from keras_self_attention import SeqSelfAttention
# from keras.layers import Dense
# from keras.layers import TimeDistributed,Flatten

# # Model variables
# n_hidden = 50
# gradient_clipping_norm = 1.25
# batch_size = 64
# n_epoch = 25

# def exponent_neg_manhattan_distance(left, right):
#     ''' Helper function for the similarity estimate of the LSTMs outputs'''
#     return K.exp(-K.sum(K.abs(left-right), axis=1, keepdims=True))

# # The visible layer
# left_input = Input(shape=(max_seq_length,), dtype='int32')
# right_input = Input(shape=(max_seq_length,), dtype='int32')

# embedding_layer = Embedding(len(embeddings), embedding_dim, weights=[embeddings], input_length=max_seq_length, trainable=False)

# # Embedded version of the inputs
# encoded_left = embedding_layer(left_input)
# encoded_right = embedding_layer(right_input)


# # Since this is a siamese network, both sides share the same LSTM
# #shared_lstm = LSTM(n_hidden,return_sequences=True)

# #left_output = shared_lstm(encoded_left)
# #right_output = shared_lstm(encoded_right)


# seq = SeqSelfAttention(attention_activation='sigmoid',attention_width=10)


# left_output = seq(encoded_left)
# right_output = seq(encoded_right)


# seq2 = SeqSelfAttention(attention_activation='sigmoid',attention_width=5)

# left_output = seq2(left_output)
# right_output = seq2(right_output)

# dense = TimeDistributed(Dense(5))

# left_output = dense(left_output)
# right_output = dense(right_output)

# flat = Flatten()

# left_output = flat(left_output)
# right_output = flat(right_output)


# # Calculates the distance as defined by the MaLSTM model
# malstm_distance = Lambda(function=lambda x: exponent_neg_manhattan_distance(x[0], x[1]),output_shape=lambda x: (x[0][0], 1))([left_output, right_output])

# # Pack it all up into a model
# malstm = Model([left_input, right_input], [malstm_distance])

# # Adadelta optimizer, with gradient clipping by norm
# optimizer = Adadelta(clipnorm=gradient_clipping_norm)

# malstm.compile(loss='mean_squared_error', optimizer=optimizer, metrics=['accuracy'])

# # Start training
# training_start_time = time()

# malstm.summary()
# malstm_trained = malstm.fit([X_train['left'], X_train['right']], Y_train, batch_size=batch_size, nb_epoch=n_epoch,
#                             validation_data=([X_validation['left'], X_validation['right']], Y_validation))

# print("Training time finished.\n{} epochs in {}".format(n_epoch, datetime.timedelta(seconds=time()-training_start_time)))

Plotting the results

In [0]:
# # Plot accuracy
# plt.plot(malstm_trained.history['acc'])
# plt.plot(malstm_trained.history['val_acc'])
# plt.title('Model Accuracy')
# plt.ylabel('Accuracy')
# plt.xlabel('Epoch')
# plt.legend(['Train', 'Validation'], loc='upper left')
# plt.show()

# # Plot loss
# plt.plot(malstm_trained.history['loss'])
# plt.plot(malstm_trained.history['val_loss'])
# plt.title('Model Loss')
# plt.ylabel('Loss')
# plt.xlabel('Epoch')
# plt.legend(['Train', 'Validation'], loc='upper right')
# plt.show()

In [0]:
## with attention network

from keras_self_attention import SeqSelfAttention
from keras.layers import Dense
from keras.layers import TimeDistributed,Flatten,Conv1D,MaxPooling1D,GlobalMaxPooling1D
from keras.callbacks import ModelCheckpoint
# Model variables
n_hidden = 50
gradient_clipping_norm = 1.25
batch_size = 128
n_epoch = 100

def exponent_neg_manhattan_distance(left, right):
    ''' Helper function for the similarity estimate of the LSTMs outputs'''
    return K.exp(-K.sum(K.abs(left-right), axis=1, keepdims=True))

# The visible layer
left_input = Input(shape=(max_seq_length,), dtype='int32')
right_input = Input(shape=(max_seq_length,), dtype='int32')

embedding_layer = Embedding(len(embeddings), embedding_dim, weights=[embeddings], input_length=max_seq_length, trainable=False)

# Embedded version of the inputs
encoded_left = embedding_layer(left_input)
encoded_right = embedding_layer(right_input)


# Since this is a siamese network, both sides share the same LSTM
#shared_lstm = LSTM(n_hidden,return_sequences=True)

#left_output = shared_lstm(encoded_left)
#right_output = shared_lstm(encoded_right)


encoded_left = embedding_layer(left_input)
encoded_right = embedding_layer(right_input)

## conv12
conv = Conv1D(filters=1500, kernel_size=4, padding='valid', activation='sigmoid', strides=1)


encoded_left = conv(encoded_left)
encoded_right = conv(encoded_right)

pooling  = MaxPooling1D(pool_size=4)
encoded_left = pooling(encoded_left)
encoded_right = pooling(encoded_right)

conv2 = Conv1D(filters=3000, kernel_size=4, padding='valid', activation='sigmoid', strides=1)
encoded_left = conv2(encoded_left)
encoded_right = conv2(encoded_right)


pooling2  = GlobalMaxPooling1D()
encoded_left = pooling2(encoded_left)
encoded_right = pooling2(encoded_right)


dense = Dense(256)
left_output = dense(encoded_left)
right_output = dense(encoded_right)

# Calculates the distance as defined by the MaLSTM model
malstm_distance = Lambda(function=lambda x: exponent_neg_manhattan_distance(x[0], x[1]),output_shape=lambda x: (x[0][0], 1))([left_output, right_output])

# Pack it all up into a model
malstm = Model([left_input, right_input], [malstm_distance])

# Adadelta optimizer, with gradient clipping by norm
optimizer = Adadelta(clipnorm=gradient_clipping_norm)

malstm.compile(loss='mean_squared_error', optimizer=optimizer, metrics=['accuracy'])

# Start training
training_start_time = time()

filepath=MODEL_SAVING_DIR+"/weights-improvement-{epoch:02d}-{val_acc:.2f}.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')
callbacks_list = [checkpoint]

malstm.summary()
#malstm_trained = malstm.fit([X_train['left'], X_train['right']], Y_train, batch_size=batch_size, nb_epoch=n_epoch,
                            #validation_data=([X_validation['left'], X_validation['right']], Y_validation),callbacks=callbacks_list)

print("Training time finished.\n{} epochs in {}".format(n_epoch, datetime.timedelta(seconds=time()-training_start_time)))

Instructions for updating:
Colocations handled automatically by placer.
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 212)          0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 212)          0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 212, 300)     25800600    input_1[0][0]                    
                                                                 input_2[0][0]                    
__________________________________________________________________________________________________
conv1d_1 (Conv1D)               (None

In [0]:
# 


from keras.models import load_model

filepath_latest = MODEL_SAVING_DIR+'/weights-improvement-14-0.82.hdf5'
malstm.load_weights(filepath_latest)
print(filepath_latest)



/gdrive/My Drive/quora/weights-improvement-14-0.82.hdf5


In [0]:
!pip install keras --upgrade
!pip install tensorflow-gpu --upgrade
!ls /gdrive/My\ Drive/quora/
!cp /gdrive/My\ Drive/quora/weights-improvement-01-0.77.hdf5 .
!ls

In [0]:
# malstm_trained = malstm.fit([X_train['left'], X_train['right']], Y_train, batch_size=batch_size, nb_epoch=n_epoch,
#                             validation_data=([X_validation['left'], X_validation['right']], Y_validation),callbacks=callbacks_list)
malstm.predict([X_validation['left'][0:10], X_validation['right'][0:10]])

array([[0.21023595],
       [1.        ],
       [0.2981399 ],
       [0.07219958],
       [0.05977337],
       [0.6075143 ],
       [0.15939113],
       [0.08044683],
       [0.2218177 ],
       [0.48542237]], dtype=float32)

In [0]:
X_validation['left'].shape

(80858, 212)

In [0]:
!python -c 'import keras; print(keras.__version__)'