In [1]:
import pandas as pd
import numpy as np
import re

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import np_utils
from keras.layers.embeddings import Embedding
from keras.utils.data_utils import get_file
from keras.models import Model
from keras.layers import Input, Embedding, LSTM, Merge, Dropout, concatenate, Dense, BatchNormalization, Lambda, TimeDistributed, Dot, dot
import keras.backend as K
from keras.optimizers import Adadelta
from keras.callbacks import ModelCheckpoint


from sklearn.model_selection import train_test_split

from zipfile import ZipFile
from os.path import expanduser, exists

import datetime
import time

Using TensorFlow backend.
  return f(*args, **kwds)


In [2]:
from hyperdash import monitor_cell

In [3]:
import json

In [4]:
train_dataset = pd.read_csv('train.csv')
train_dataset

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0
5,5,11,12,Astrology: I am a Capricorn Sun Cap moon and c...,"I'm a triple Capricorn (Sun, Moon and ascendan...",1
6,6,13,14,Should I buy tiago?,What keeps childern active and far from phone ...,0
7,7,15,16,How can I be a good geologist?,What should I do to be a great geologist?,1
8,8,17,18,When do you use シ instead of し?,"When do you use ""&"" instead of ""and""?",0
9,9,19,20,Motorola (company): Can I hack my Charter Moto...,How do I hack Motorola DCX3400 for free internet?,0


# PLAN

We need to find if two questions are similar. In face recognition, we use siamese networks to solve similar problem but for faces. So, we'll try using siamese networks here - only we'll use LSTMs instead of CNNs since LSTMs are suited for sequences.

0. Check the data. If we have enough q-ids for which we have duplicates available, then we could train the whole thing via triplet loss. If so, follow plan a, else plan b.

1. Pre-processing
    - Remove questionmarks throughout
    - Remove stop-words (Save one which keeps stop-words as well)

2. Convert to vectors
3. Divide in 70/30 split. (Also try 80/20 split)
4. Pass through siamese LSTMs

**Plan a**
5. Use squared distance

**Plan b**
5. Use triplet loss (find all those q-ids which have duplicates available for them first. See if it makes sense to use triplet loss)


## Analysis of data

In [5]:
train_df_copy = train_dataset.copy()
train_df_copy.describe()

Unnamed: 0,id,qid1,qid2,is_duplicate
count,363861.0,363861.0,363861.0,363861.0
mean,181930.0,201899.281913,204884.863951,0.371502
std,105037.767486,144924.825062,146663.968132,0.483207
min,0.0,1.0,2.0,0.0
25%,90965.0,70779.0,70942.0,0.0
50%,181930.0,179999.0,184182.0,0.0
75%,272895.0,321295.0,327744.0,1.0
max,363860.0,493887.0,493889.0,1.0


In [6]:
train_df_copy[train_df_copy['is_duplicate'] > 0].describe()

Unnamed: 0,id,qid1,qid2,is_duplicate
count,135175.0,135175.0,135175.0,135175.0
mean,181735.176741,156901.917507,157247.986292,1.0
std,105058.800004,137618.6556,137577.456205,0.0
min,5.0,11.0,12.0,1.0
25%,90843.0,39315.0,39697.0,1.0
50%,181718.0,113964.0,113489.0,1.0
75%,272849.5,250886.0,251945.0,1.0
max,363860.0,493877.0,493878.0,1.0


**Total number of unique questions whose duplicates we have**

In [7]:
len(train_df_copy[train_df_copy['is_duplicate'] > 0]['qid1'].unique())

80105

**Total number of unique questions **

In [8]:
len(train_df_copy['qid1'].unique())

266358

**Decision** : We could go this path and use triplet loss, however, triplet loss uses A(anchor), P(positive) and N(negative) triplet and it's very important to find a N which is closer to A but still not a duplicate. For us to find those pairs would be a time-taking exercise which I could try to do after basic model, perhaps. 

## Pre-processing

In [9]:
train_df = train_dataset.copy()

#### Prepare a list of all vocabulary words

In [10]:
q1_set = set(train_df['question1'].unique())
q2_set = set(train_df['question2'].unique())
all_ques_list = q1_set | q2_set
len(all_ques_list)

493392

In [11]:
q1_list = train_df['question1'].tolist()
q1_list = [str(ques) for ques in q1_list]
q2_list = train_df['question2'].tolist()
q2_list = [str(ques) for ques in q2_list]
is_duplicate_list = train_df['is_duplicate'].tolist()

print(q1_list[0],":",q2_list[0],":",is_duplicate_list[0])

What is the step by step guide to invest in share market in india? : What is the step by step guide to invest in share market? : 0


In [12]:
all_questions_list = q1_list + q2_list
tokenizer = Tokenizer(num_words=100000)
tokenizer.fit_on_texts(all_questions_list)

q1_word_seq = tokenizer.texts_to_sequences(q1_list)
q2_word_seq = tokenizer.texts_to_sequences(q2_list)
word_index = tokenizer.word_index

print("Words in index: %d" % len(word_index))

Words in index: 91013


In [13]:
# Save the tokenizer word index we've gotten for later

dictionary = word_index
# Let's save this out so we can use it later
with open('dictionary.json', 'w') as dictionary_file:
    json.dump(dictionary, dictionary_file)

In [18]:
GLOVE_DOWNLOAD_URL = 'http://nlp.stanford.edu/data/glove.840B.300d.zip'

if not exists(expanduser('~/.keras/datasets/glove.840B.300d.zip')):
    zipfile = ZipFile(get_file('glove.840B.300d.zip', GLOVE_DOWNLOAD_URL))
    zipfile.extract('glove.840B.300d.txt', path=expanduser('~/.keras/datasets/'))
    
print("Processing", 'glove.840B.300d.txt')

embeddings_index = {}

with open(expanduser('~/.keras/datasets/glove.840B.300d.txt'), encoding='utf-8') as f:
    for line in f:
        values = line.split(' ')
        word = values[0]
        embedding = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = embedding

print('Word embeddings: %d' % len(embeddings_index))

Processing glove.840B.300d.txt
Word embeddings: 2196016


In [19]:
len(embeddings_index['the'])

300

In [20]:
MAX_NB_WORDS = 100000
EMBEDDING_DIM = 300


nb_words = min(MAX_NB_WORDS, len(word_index))
word_embedding_matrix = np.zeros((nb_words + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    if i > MAX_NB_WORDS:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        word_embedding_matrix[i] = embedding_vector

print('Null word embeddings: %d' % np.sum(np.sum(word_embedding_matrix, axis=1) == 0))

Null word embeddings: 27053


In [21]:
q1_word_seq[0:5]

[[2, 3, 1, 1245, 57, 1245, 2546, 7, 577, 8, 772, 379, 8, 35],
 [2, 3, 1, 562, 10, 13509, 14684, 5, 21439, 4449],
 [4, 13, 5, 219, 1, 439, 10, 17, 364, 1848, 205, 146, 6, 2836],
 [16, 72, 5, 2693, 309, 2764, 4, 13, 5, 661, 19],
 [23, 49, 7202, 8, 233, 33752, 1906, 2077, 10473, 12, 1927, 10671, 6462]]

In [22]:
max_seq_length = 0
for ques in q1_word_seq:
    if(len(ques) > max_seq_length):
        max_seq_length = len(ques)
max_seq_length

127

In [23]:
MAX_SEQUENCE_LENGTH = 130

q1_data = pad_sequences(q1_word_seq, maxlen=MAX_SEQUENCE_LENGTH)
q2_data = pad_sequences(q2_word_seq, maxlen=MAX_SEQUENCE_LENGTH)
labels = np.array(is_duplicate_list, dtype=int)
print('Shape of question1 data tensor:', q1_data.shape)
print('Shape of question2 data tensor:', q2_data.shape)
print('Shape of label tensor:', labels.shape)

Shape of question1 data tensor: (363861, 130)
Shape of question2 data tensor: (363861, 130)
Shape of label tensor: (363861,)


In [24]:
q1_data[0:5]

array([[    0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     2,
            3,     1,  1245,    57,  1245,  2546,     7,   577,     8,
      

In [25]:
X = np.stack((q1_data, q2_data), axis=1)
y = labels
X.shape

(363861, 2, 130)

In [26]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
Q1_train = X_train[:,0]
Q2_train = X_train[:,1]
Q1_test = X_test[:,0]
Q2_test = X_test[:,1]
Q1_train.shape

(291088, 130)

In [30]:
Q1_train

array([[    0,     0,     0, ...,   229,  7037,   229],
       [    0,     0,     0, ...,    10,  1558,   313],
       [    0,     0,     0, ...,     6,  6348,   333],
       ..., 
       [    0,     0,     0, ..., 20350,     8,  1307],
       [    0,     0,     0, ...,     7,   114, 10766],
       [    0,     0,     0, ..., 36739,    12, 23563]], dtype=int32)

In [31]:
Q1_train.shape

(291088, 130)

In [27]:
NUM_HIDDEN_UNITS_LAYER1 = 50
NUM_HIDDEN_UNITS_LAYER2 = 100

question1 = Input(shape=(MAX_SEQUENCE_LENGTH,))
question2 = Input(shape=(MAX_SEQUENCE_LENGTH,))

embedding_layer = Embedding(nb_words + 1, 
                 EMBEDDING_DIM, 
                 weights=[word_embedding_matrix], 
                 input_length=MAX_SEQUENCE_LENGTH, 
                 trainable=False)

q1 = embedding_layer(question1)
q2 = embedding_layer(question2)

lstm_first = LSTM(NUM_HIDDEN_UNITS_LAYER1, return_sequences=False)

q1 = lstm_first(q1)
q2 = lstm_first(q2)

dropout_layer = Dropout(0.2)

q1 = dropout_layer(q1)
q2 = dropout_layer(q2)

dense = Dense(100, activation='relu')
dropout_two = Dropout(0.2)
bn_one = BatchNormalization()

q1 = dense(q1)
# q1 = dropout_two(q1)
# q1 = bn_one(q1)
q2 = dense(q2)
# q2 = dropout_two(q2)
# q2 = bn_one(q2)

merged = concatenate([q1,q2])
is_duplicate = Dense(1, activation='sigmoid')(merged)

model = Model(inputs=[question1,question2], outputs=is_duplicate)

model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_3 (InputLayer)             (None, 130)           0                                            
____________________________________________________________________________________________________
input_4 (InputLayer)             (None, 130)           0                                            
____________________________________________________________________________________________________
embedding_1 (Embedding)          (None, 130, 300)      27304200    input_3[0][0]                    
                                                                   input_4[0][0]                    
____________________________________________________________________________________________________
lstm_1 (LSTM)                    (None, 50)            70200       embedding_1[0][0]       

In [28]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [29]:
model_json = model.to_json()
with open('model.json', 'w') as json_file:
    json_file.write(model_json)

In [34]:
print("Starting training at", datetime.datetime.now())
t0 = time.time()
callbacks = [ModelCheckpoint('question_pairs_weights_type1_final_new.h5', monitor='val_acc', save_best_only=True)]
history = model.fit([Q1_train, Q2_train],
                    y_train,
                    epochs=20,
                    validation_data=([Q1_test, Q2_test], y_test),
                    verbose=1,
                    batch_size=512,
                    callbacks=callbacks)
t1 = time.time()
print("Training ended at", datetime.datetime.now())
print("Minutes elapsed: %f" % ((t1 - t0) / 60.))

Starting training at 2018-06-03 10:16:32.628602
Train on 291088 samples, validate on 72773 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Training ended at 2018-06-03 11:34:15.437467
Minutes elapsed: 77.713477


### Testing model

In [38]:
from keras.preprocessing.text import text_to_word_sequence

In [35]:
def convert_text_to_index_array(text, dictionary):
	words = text_to_word_sequence(text)
	wordIndices = []
	for word in words:
	    if word in dictionary:
	        wordIndices.append(dictionary[word])
	    else:
	        print("'%s' not in training corpus; ignoring." %(word))
	return wordIndices

In [39]:
# HAPPY CASE
question1 = "What's r programming?"
question2 = "What's in r programming?"

q1_word_seq = convert_text_to_index_array(question1,dictionary)
q1_word_seq = [q1_word_seq]
q2_word_seq = convert_text_to_index_array(question2,dictionary)
q2_word_seq = [q2_word_seq]
q1_data = pad_sequences(q1_word_seq, maxlen=MAX_SEQUENCE_LENGTH)
q2_data = pad_sequences(q2_word_seq, maxlen=MAX_SEQUENCE_LENGTH)

pred = model.predict([q1_data,q2_data])
print(pred)

[[ 0.01451303]]


In [40]:
question1 = "How to learn english?"
question2 = "Why can't I dance?"

q1_word_seq = convert_text_to_index_array(question1,dictionary)
q1_word_seq = [q1_word_seq]
q2_word_seq = convert_text_to_index_array(question2,dictionary)
q2_word_seq = [q2_word_seq]
q1_data = pad_sequences(q1_word_seq, maxlen=MAX_SEQUENCE_LENGTH)
q2_data = pad_sequences(q2_word_seq, maxlen=MAX_SEQUENCE_LENGTH)

pred = model.predict([q1_data,q2_data])
print(pred)

[[ 0.56790417]]


## Model 2 -- that works

In [30]:
def exponent_neg_manhattan_distance(left, right):
    ''' Helper function for the similarity estimate of the LSTMs outputs'''
    return K.exp(-K.sum(K.abs(left-right), axis=1, keepdims=True))

In [86]:
question1 = Input(shape=(MAX_SEQUENCE_LENGTH,))
question2 = Input(shape=(MAX_SEQUENCE_LENGTH,))

q1 = Embedding(nb_words + 1, 
                 EMBEDDING_DIM, 
                 weights=[word_embedding_matrix], 
                 input_length=MAX_SEQUENCE_LENGTH, 
                 trainable=False)(question1)
q1 = TimeDistributed(Dense(EMBEDDING_DIM, activation='relu'))(q1)
q1 = Lambda(lambda x: K.max(x, axis=1), output_shape=(EMBEDDING_DIM, ))(q1)

q2 = Embedding(nb_words + 1, 
                 EMBEDDING_DIM, 
                 weights=[word_embedding_matrix], 
                 input_length=MAX_SEQUENCE_LENGTH, 
                 trainable=False)(question2)
q2 = TimeDistributed(Dense(EMBEDDING_DIM, activation='relu'))(q2)
q2 = Lambda(lambda x: K.max(x, axis=1), output_shape=(EMBEDDING_DIM, ))(q2)

merged = concatenate([q1,q2])
merged = Dense(200, activation='relu')(merged)
merged = Dropout(0.2)(merged)
merged = BatchNormalization()(merged)
merged = Dense(200, activation='relu')(merged)
merged = Dropout(0.2)(merged)
merged = BatchNormalization()(merged)
merged = Dense(200, activation='relu')(merged)
merged = Dropout(0.2)(merged)
merged = BatchNormalization()(merged)
merged = Dense(200, activation='relu')(merged)
merged = Dropout(0.2)(merged)
merged = BatchNormalization()(merged)

is_duplicate = Dense(1, activation='sigmoid')(merged)

model = Model(inputs=[question1,question2], outputs=is_duplicate)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [90]:
model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_81 (InputLayer)            (None, 30)            0                                            
____________________________________________________________________________________________________
input_82 (InputLayer)            (None, 30)            0                                            
____________________________________________________________________________________________________
embedding_42 (Embedding)         (None, 30, 300)       27304200    input_81[0][0]                   
____________________________________________________________________________________________________
embedding_43 (Embedding)         (None, 30, 300)       27304200    input_82[0][0]                   
___________________________________________________________________________________________

In [None]:
print("Starting training at", datetime.datetime.now())
t0 = time.time()
callbacks = [ModelCheckpoint('question_pairs_weights.h5', monitor='val_acc', save_best_only=True)]
history = model.fit([Q1_train, Q2_train],
                    y_train,
                    epochs=25,
                    validation_data=([Q1_test, Q2_test], y_test),
                    verbose=1,
                    batch_size=64,
                    callbacks=callbacks)
t1 = time.time()
print("Training ended at", datetime.datetime.now())
print("Minutes elapsed: %f" % ((t1 - t0) / 60.))

Starting training at 2018-06-03 01:26:59.153656
Train on 291088 samples, validate on 72773 samples
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25

## Links that helped

[An example of model for same problem but in R](https://tensorflow.rstudio.com/blog/keras-duplicate-questions-quora.html)
    
[Manhattan distance model with siamese network approach for same problem](https://github.com/eliorc/Medium/blob/master/MaLSTM.ipynb)

[Another approach for same problem](https://github.com/bradleypallen/keras-quora-question-pairs/blob/master/quora-question-pairs-training.ipynb)

[Data pre-processing](https://github.com/bradleypallen/keras-quora-question-pairs/blob/master/quora-question-pairs-data-prep.ipynb)

[How to save tokenizer dictionary - word indices](https://gist.github.com/vgpena/b1c088f3c8b8c2c65dd8edbe0eae7023#file-makemodel-py-L27)

[Intuitive explanation of word embeddings](https://www.analyticsvidhya.com/blog/2017/06/word-embeddings-count-word2veec/)
                                                  
[keras starter script for word embeddings](https://www.kaggle.com/sudalairajkumar/keras-starter-script-with-word-embeddings)

## Unnecessary

In [33]:
# all_ques_list
vocabulary = set()

for question in all_ques_list:
    question = str(question)
    question = question.lower()
    question = re.sub(r"\?", " ", question)
    question_words = question.split()
    for word in question_words:
        vocabulary.add(word)

len(vocabulary)

159039

In [38]:
tokenizer = Tokenizer(nb_words=300)
tokenizer.fit_on_texts(vocabulary)
sequences_train = tokenizer.texts_to_sequences(vocabulary)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

data_train = pad_sequences(sequences_train, maxlen=40)
data_train.shape



Found 91013 unique tokens.


(159039, 40)

In [16]:
NUM_HIDDEN_UNITS_LAYER1 = 50
NUM_HIDDEN_UNITS_LAYER2 = 100

question1 = Input(shape=(MAX_SEQUENCE_LENGTH,))
question2 = Input(shape=(MAX_SEQUENCE_LENGTH,))

embedding_layer = Embedding(nb_words + 1, 
                 EMBEDDING_DIM, 
                 weights=[word_embedding_matrix], 
                 input_length=MAX_SEQUENCE_LENGTH, 
                 trainable=False)

encoded_q1 = embedding_layer(question1)
encoded_q2 = embedding_layer(question2)

lstm_first = LSTM(NUM_HIDDEN_UNITS_LAYER1, return_sequences=True)

lstm_output_q1 = lstm_first(encoded_q1)
lstm_output_q2 = lstm_first(encoded_q2)

dropout_layer = Dropout(0.2)

dropout_q1 = dropout_layer(lstm_output_q1)
dropout_q2 = dropout_layer(lstm_output_q2)

q1 = Lambda(lambda x: K.max(x, axis=1), output_shape=(EMBEDDING_DIM,))(dropout_q1)
q2 = Lambda(lambda x: K.max(x, axis=1), output_shape=(EMBEDDING_DIM,))(dropout_q2)

#lstm_second = LSTM(NUM_HIDDEN_UNITS_LAYER2, return_sequences=False)

#lstm_second_output_q1 = lstm_second(dropout_q1)
#lstm_second_output_q2 = lstm_second(dropout_q2)

#dropout_second_q1 = dropout_layer(lstm_second_output_q1)
#dropout_second_q2 = dropout_layer(lstm_second_output_q2)

# Calculates the distance as defined by the MaLSTM model
#malstm_distance = Merge(mode=lambda x: exponent_neg_manhattan_distance(x[0], x[1]), output_shape=lambda x: (x[0][0], 1))([dropout_q1, dropout_q2])

merged = concatenate([q1,q2])
# is_duplicate = Dense(1, activation='sigmoid')(merged)


merged = Dense(600, activation='relu')(merged)
#merged = Dropout(0.2)(merged)
# merged = BatchNormalization()(merged)
# merged = Dense(200, activation='relu')(merged)
# merged = Dropout(0.2)(merged)
# merged = BatchNormalization()(merged)
# is_duplicate = Dense(1, activation='sigmoid')(merged)

model = Model(inputs=[question1,question2], outputs=merged)
# model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Pack it all up into a model
# malstm = Model([question1, question2], [malstm_distance])

# Adadelta optimizer, with gradient clipping by norm
# optimizer = Adadelta()

# malstm.compile(loss='mean_squared_error', optimizer=optimizer, metrics=['accuracy'])

# Start training
# malstm_trained = malstm.fit([Q1_train, Q2_train], y_train, batch_size=128, nb_epoch=10,
#                             validation_data=([Q1_test, Q2_test], y_test))



NameError: name 'nb_words' is not defined