# Quora Question Pairs Similarity Detection (NLP)


This is the submission for a task on flagging duplicate questions on the Quora dataset from Kaggle.

## High level approach
Each question is converted to a feature vector using ***doc2vec*** (gensim's implementation). A ***Siamese network*** is then trained on pairs of these vectors to predict whether the two questions are duplicates. The code, with explanations, is given below.

### Importing the libraries

In [None]:
import sys
import os
import numpy as np
import pandas as pd
import gensim
from sklearn.cross_validation import train_test_split
from keras.models import Sequential, Model
from keras.layers import Dense, BatchNormalization, Activation, Input, Add, Concatenate, Lambda, Dropout
from keras.optimizers import RMSprop, SGD, Adam
from keras.callbacks import TensorBoard
from keras import backend as K

### Reading the CSV file into Pandas Dataframe

In [None]:
# Load the training pairs; later cells read the qid1, qid2, question1, question2 and is_duplicate columns
df = pd.read_csv("train.csv")

In [None]:
df.head()

### Preparing data for Doc2Vec training (unsupervised learning)

In [None]:
# Question ids in corpus order: every qid1 first, then every qid2
label_list = list(df['qid1']) + list(df['qid2'])

# Matching corpus of token lists (all question1 texts, then all question2 texts).
# Each cell is passed through str(), UTF-8 encoded to bytes, and tokenized with
# the deacc and lower options switched on.
questions = [
    list(gensim.utils.tokenize(str(text).encode('utf-8'), deacc=True, lower=True))
    for text in list(df['question1']) + list(df['question2'])
]

In [None]:
#Creating LabeledLineSentence iterator to feed into Doc2Vec input
class LabeledLineSentence(object):
    """Iterable that pairs each tokenized question with a ``QUES_<label>`` tag.

    Iterating yields one ``gensim.models.doc2vec.TaggedDocument`` per
    (label, question) pair, in input order; pairing stops at the shorter of
    the two sequences. ``__iter__`` starts a fresh generator on every call,
    so the object can be iterated more than once.
    """

    def __init__(self, label_list, questions):
        # label_list: one label per question, in the same order as `questions`
        # questions: the token lists used as the documents' words
        self.label_list = label_list
        self.questions = questions

    def __iter__(self):
        tagged_document = gensim.models.doc2vec.TaggedDocument
        for tag_id, tokens in zip(self.label_list, self.questions):
            yield tagged_document(words=tokens, tags=['QUES_%s' % tag_id])

# Tagged corpus handed to Doc2Vec below; both build_vocab() and train() consume it
it = LabeledLineSentence(label_list, questions)

### Training the Doc2Vec model to generate a feature vector for each question

In [None]:
# Doc2Vec configuration: 300-dimensional vectors, learning rate going from
# alpha down to min_alpha
size_of_vector = 300
model = gensim.models.Doc2Vec(
    vector_size=size_of_vector,
    window=8,
    negative=20,
    epochs=10,
    alpha=0.025,
    min_alpha=0.005,
    workers=16,
)

#Keeping only l2 normalized vectors, trick to save memory
#model.init_sims(replace=True)

# The vocabulary has to be built from the tagged corpus before training
model.build_vocab(it)
print("Vocab built!")

In [None]:
# Train on the tagged corpus, reusing the epoch count and the learning-rate
# bounds that were configured on the model itself
model.train(
    it,
    total_examples=model.corpus_count,
    epochs=model.epochs,
    start_alpha=model.alpha,
    end_alpha=model.min_alpha,
)

# Persist the trained model so later cells can reload it from disk
model.save('TrainedModel.doc2vec')

#### Printing a feature vector

In [None]:
model["QUES_1"]

### Create training data variables
###### Features

In [34]:
# Reload the raw pairs and the persisted Doc2Vec model
df = pd.read_csv("train.csv")
model = gensim.models.Doc2Vec.load("TrainedModel.doc2vec")

# train_data[i, 0] and train_data[i, 1] hold the learned vectors of the first
# and second question of pair i
size_of_vector = 300
train_count = len(df.index)
train_data = np.zeros((train_count, 2, size_of_vector))

for idx, row in df.iterrows():
    # Vectors are looked up by the same "QUES_<qid>" tags used for training
    for slot, qid_column in enumerate(('qid1', 'qid2')):
        train_data[idx, slot] = model["QUES_%s" % row[qid_column]]

In [35]:
train_data.shape

(404290, 2, 300)

###### Labels

In [36]:
# Labels come from the is_duplicate column (its 0 and 1 values are counted below)
train_labels = df['is_duplicate']

In [37]:
# Release the DataFrame to free memory; the label Series taken from it above
# stays referenced through train_labels
del df

### Undersampling to balance classes and remove skew
*Note:* There are other techniques for dealing with skewed classes. However, as this solution is for demonstration purposes, we will simply remove the surplus training examples from the larger class so that both classes end up the same size. This is known as undersampling.

In [40]:
# Number of pairs carrying each label
class0 = int((train_labels == 0).sum())
class1 = int((train_labels == 1).sum())
print("Data labelled as 0 : {0}\nData labelled as 1 : {1}".format(class0, class1))

Data labelled as 0 : 255027
Data labelled as 1 : 149263


In [None]:
#Removing skew by undersampling
# Number of surplus label-0 rows. Clamped at 0: if label 1 were ever the larger
# class, a negative value would turn the slice below into rand_idx[0:-k] and
# mark almost every label-0 row for deletion instead of none.
extras = max(class0 - class1, 0)

# Shuffle the index labels of the label-0 rows and take the first `extras`
# of them as the rows to remove
rand_idx = np.random.permutation(train_labels[train_labels == 0].index)
del_idx = rand_idx[:extras]

In [None]:
# Boolean keep-mask: False for every row selected for removal.
# del_idx holds index labels of train_labels, used here as row positions.
keep_rows = np.ones(len(train_labels), dtype=bool)
keep_rows[del_idx] = False

train_data = train_data[keep_rows]
train_labels = train_labels[keep_rows]

##### Splitting data into training and validation sets

In [None]:
# Hold out 15% of the balanced pairs for validation; random_state pins the split
X_train, X_test, Y_train, Y_test = train_test_split(train_data, train_labels, test_size=0.15, random_state=42)


### We have completed the preprocessing of data. Now comes the real part.

## The Siamese Network

In [None]:
def euclidean_distance(vects):
    '''
    Euclidean distance between two batches of vectors.

    vects: pair (x, y) of backend tensors; squared differences are summed
           over axis 1.
    Returns a tensor in which the summed axis is kept with size 1.

    The sum of squares is clamped to K.epsilon() before the square root.
    The gradient of sqrt is unbounded at 0, so two identical embeddings
    would otherwise produce inf/NaN gradients during training.
    '''
    x, y = vects
    sum_square = K.sum(K.square(x - y), axis=1, keepdims=True)
    return K.sqrt(K.maximum(sum_square, K.epsilon()))

def eucl_dist_output_shape(shapes):
    '''
    Output shape reported for the Euclidean-distance Lambda layer.

    shapes: exactly two input shapes; only the first entry of the first
            shape is used.
    Returns (that first entry, 1).
    '''
    first_shape, _ = shapes
    leading_dim = first_shape[0]
    return (leading_dim, 1)

def cosine_distance(vests):
    '''
    Negated mean of the element-wise product of the two L2-normalised inputs.

    vests: pair (x, y) of backend tensors, each normalised along its last axis.
    Because the product is averaged (not summed) over the last axis, the
    result is the negative cosine similarity divided by the size of that
    axis. The reduced axis is kept with size 1.
    '''
    left, right = vests
    left_unit = K.l2_normalize(left, axis=-1)
    right_unit = K.l2_normalize(right, axis=-1)
    mean_product = K.mean(left_unit * right_unit, axis=-1, keepdims=True)
    return -mean_product

def cos_dist_output_shape(shapes):
    '''
    Output shape reported for a cosine-distance Lambda layer.

    shapes: exactly two input shapes; only the first entry of the first
            shape is used.
    Returns (that first entry, 1).
    '''
    first_shape, _ = shapes
    leading_dim = first_shape[0]
    return (leading_dim, 1)

def contrastive_loss(y_true, y_pred):
    '''
    Mean over all samples of

        (1 - y_true) * y_pred^2  +  y_true * max(0, 1 - y_pred)^2

    so label 0 penalises predictions away from 0, and label 1 penalises
    predictions that fall short of 1.
    '''
    label0_term = (1 - y_true) * K.square(y_pred)
    label1_term = y_true * K.square(K.maximum(0.0, (1 - y_pred)))
    return K.mean(label0_term + label1_term)


def create_base_network(input_dim):
    '''
    Base network for feature extraction.

    Structure: a Dense(300) -> BatchNorm -> ReLU stem, followed by four
    residual blocks (Dropout(0.2) -> Dense(300) -> BatchNorm, added to the
    block input, then ReLU). The outputs of the last three blocks are
    concatenated and batch-normalised.

    input_dim: length of each input vector.
    Returns a Keras Model from the (input_dim,) input to that final tensor.
    '''
    features_in = Input(shape=(input_dim, ))

    # Stem
    stage = Dense(300)(features_in)
    stage = BatchNormalization()(stage)
    stage = Activation('relu')(stage)

    # Post-activation output of the stem and of every residual block, in order
    activations = [stage]

    for _ in range(4):
        shortcut = activations[-1]
        branch = Dropout(0.2)(shortcut)
        branch = Dense(300)(branch)
        branch = BatchNormalization()(branch)
        merged = Add()([shortcut, branch])
        activations.append(Activation('relu')(merged))

    # Deepest activation first, then the two before it
    feats = Concatenate()([activations[-1], activations[-2], activations[-3]])
    normalised = BatchNormalization()(feats)

    return Model(outputs=normalised, inputs=features_in)


def compute_accuracy(predictions, labels, threshold=0.7):
    '''
    Compute classification accuracy with a fixed threshold on the predicted scores.

    predictions: array-like of scores; flattened before thresholding.
    labels:      array-like of 0/1 targets, one per prediction; also
                 flattened, so an (n, 1) column cannot broadcast against the
                 predictions into an (n, n) comparison.
    threshold:   a pair is predicted as duplicate when its score is strictly
                 greater than this value. The default 0.7 is the cut-off that
                 was previously hard-coded.

    Returns the fraction of predictions that match their label.
    '''
    predicted_duplicate = np.ravel(predictions) > threshold
    return np.mean(np.equal(predicted_duplicate, np.ravel(labels)))

def create_network(input_dim):
    '''
    Build the Siamese model.

    Two (input_dim,) inputs go through one shared base network. The
    Euclidean distance between the two outputs feeds a single sigmoid unit.

    Returns a Keras Model taking [input_a, input_b] and producing that
    sigmoid output.
    '''
    # One base-network instance is applied to both inputs, so both branches
    # share the same weights
    shared_tower = create_base_network(input_dim)

    left_in = Input(shape=(input_dim,))
    right_in = Input(shape=(input_dim,))
    embeddings = [shared_tower(left_in), shared_tower(right_in)]

    pair_distance = Lambda(euclidean_distance, output_shape=eucl_dist_output_shape)(embeddings)
    score = Dense(1, activation='sigmoid')(pair_distance)

    return Model(outputs=score, inputs=[left_in, right_in])

In [None]:
# Siamese model over 300-dimensional question vectors
net = create_network(300)

# Adam optimizer with a fixed learning rate of 0.001, minimising contrastive_loss
net.compile(optimizer=Adam(lr=0.001), loss=contrastive_loss)


## Running the final training

In [17]:
# The input pairs are the same on every pass, so slice them once up front
train_inputs = [X_train[:,0,:], X_train[:,1,:]]
test_inputs = [X_test[:,0,:], X_test[:,1,:]]

for epoch in range(50):
    print('Real Epoch %d/50'%(epoch+1))

    # One pass over the training data, with a TensorBoard callback for graph visualization
    tensorboard = TensorBoard(log_dir='./logs', histogram_freq=0, batch_size=128, write_graph=True, write_grads=False, write_images=False, embeddings_freq=0, embeddings_layer_names=None, embeddings_metadata=None)
    net.fit(train_inputs, Y_train,
            validation_data=(test_inputs, Y_test),
            batch_size=128, epochs=1, shuffle=True, callbacks=[tensorboard])

    # Accuracy on the held-out split after this pass
    pred = net.predict(test_inputs, batch_size=128)
    te_acc = compute_accuracy(pred, Y_test)

    print('* Accuracy on test set: %0.2f%%' % (100 * te_acc))

Real Epoch 1/50
Train on 253747 samples, validate on 44779 samples
Epoch 1/1
* Accuracy on test set: 71.31%
Real Epoch 2/50
Train on 253747 samples, validate on 44779 samples
Epoch 1/1
* Accuracy on test set: 74.16%
Real Epoch 3/50
Train on 253747 samples, validate on 44779 samples
Epoch 1/1
* Accuracy on test set: 75.92%
Real Epoch 4/50
Train on 253747 samples, validate on 44779 samples
Epoch 1/1
* Accuracy on test set: 77.99%
Real Epoch 5/50
Train on 253747 samples, validate on 44779 samples
Epoch 1/1
* Accuracy on test set: 78.36%
Real Epoch 6/50
Train on 253747 samples, validate on 44779 samples
Epoch 1/1
* Accuracy on test set: 79.00%
Real Epoch 7/50
Train on 253747 samples, validate on 44779 samples
Epoch 1/1
* Accuracy on test set: 79.36%
Real Epoch 8/50
Train on 253747 samples, validate on 44779 samples
Epoch 1/1
* Accuracy on test set: 79.45%
Real Epoch 9/50
Train on 253747 samples, validate on 44779 samples
Epoch 1/1
* Accuracy on test set: 79.55%
Real Epoch 10/50
Train on 25

#### Saving the trained model

In [18]:
# Architecture is written as JSON ...
model_json = net.to_json()
with open("model3.json", "w") as architecture_file:
    architecture_file.write(model_json)

# ... and the weights are serialized separately to HDF5
net.save_weights("model3.h5")

print("Saved model to disk")

Saved model to disk


### So far, we have successfully trained on the training set and achieved _81.09% accuracy_ on the validation set.

# Now we will predict the values for the test set

In [19]:
# Load the unlabeled test pairs (columns: test_id, question1, question2)
test_df = pd.read_csv("test.csv")

In [19]:
test_df.head()

Unnamed: 0,test_id,question1,question2
0,0,How does the Surface Pro himself 4 compare wit...,Why did Microsoft choose core m3 and not core ...
1,1,Should I have a hair transplant at age 24? How...,How much cost does hair transplant require?
2,2,What but is the best way to send money from Ch...,What you send money to China?
3,3,Which food not emulsifiers?,What foods fibre?
4,4,"How ""aberystwyth"" start reading?",How their can I start reading?


In [20]:
# Plain Python lists of the two question columns, in row order
q1 = test_df["question1"].tolist()
q2 = test_df["question2"].tolist()

In [21]:
# Pass every cell through str() and UTF-8 encode it to bytes, exactly as was
# done for the training questions
q1 = [str(question).encode('utf-8') for question in q1]
q2 = [str(question).encode('utf-8') for question in q2]

In [22]:
# Tokenize both question lists with the same settings used for the training
# questions (deacc=True, lower=True); each entry becomes a list of tokens
q1 = [list(gensim.utils.tokenize(question, deacc=True, lower=True)) for question in q1]
q2 = [list(gensim.utils.tokenize(question, deacc=True, lower=True)) for question in q2]

In [23]:
# Reload the Doc2Vec model saved after training; its infer_vector() is used
# below to turn the tokenized test questions into vectors
model = gensim.models.Doc2Vec.load("TrainedModel.doc2vec")

In [24]:
# size_of_vector (300) is reused from the training cells
test_count = len(test_df.index)
test_data_vector = np.zeros((test_count, 2, size_of_vector))

# Slot 0 receives the inferred vectors of q1, slot 1 those of q2.
# All of q1 is processed before q2.
for slot, tokenized_questions in enumerate((q1, q2)):
    for idx, q in enumerate(tokenized_questions):
        test_data_vector[idx, slot] = model.infer_vector(q)

In [25]:
# Score every test pair with the trained Siamese network; each value is the
# output of the model's final sigmoid unit
is_duplicate = net.predict([test_data_vector[:,0,:], test_data_vector[:,1,:]], batch_size=128)

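###### The above prediction is the sigmoid output that the network computes from the distance between the two question embeddings. It is trained towards 1 for duplicate pairs and towards 0 for non-duplicates, so the higher the value, the higher the chance that the questions are duplicates.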

### Preparing the final DataFrame

In [26]:
# Column 0: test ids, column 1: predicted probability
final_data = np.zeros((test_count,2))
final_data[:,0] = test_df['test_id'].values
# The model ends in a single-unit layer, so the predictions are expected to
# have one column per row. Flatten them explicitly instead of assigning a
# Python list of 1-element arrays, which only works through NumPy's implicit
# size-1-array-to-scalar conversion and builds one object per row.
final_data[:,1] = np.ravel(is_duplicate)

final_df = pd.DataFrame(data = final_data, columns=["test_id", "probability"])
# The ids went through a float array above; restore them to integers
final_df["test_id"] = final_df["test_id"].astype(int)
final_df.head()

Unnamed: 0,test_id,probability
0,0,0.029931
1,1,0.198537
2,2,0.049482
3,3,0.826719
4,4,0.956664


## Saving the output to submission.csv

In [27]:
# Write test_id and probability as UTF-8 CSV, without the DataFrame index column
final_df.to_csv("submission.csv", encoding='utf-8', index = False)

### Finally, we have calculated our predictions and saved them to the CSV file in the prescribed format.