# Quora Question Pairs

## Imports

In [1]:
import nltk
import pandas as pd
import numpy as np
from nltk.tokenize import RegexpTokenizer
from nltk.stem import SnowballStemmer
from gensim.models.doc2vec import TaggedDocument
from gensim.models import Doc2Vec
import gensim.models.doc2vec
from collections import OrderedDict
import multiprocessing
from random import shuffle
import datetime



In [2]:
# Import training data into dataframe
TRAIN = pd.read_csv('../data.gi/train.csv')
TEST = pd.read_csv('../data.gi/test.csv')

<hr>

## EDA

### Train Data

In [3]:
TRAIN.shape

(404290, 6)

In [4]:
TRAIN.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [5]:
TRAIN.tail()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
404285,404285,433578,379845,How many keywords are there in the Racket prog...,How many keywords are there in PERL Programmin...,0
404286,404286,18840,155606,Do you believe there is life after death?,Is it true that there is life after death?,1
404287,404287,537928,537929,What is one coin?,What's this coin?,0
404288,404288,537930,537931,What is the approx annual cost of living while...,I am having little hairfall problem but I want...,0
404289,404289,537932,537933,What is like to have sex with cousin?,What is it like to have sex with your cousin?,0


### Test Data

In [6]:
TEST.shape

(2345796, 3)

In [7]:
TEST.head()

Unnamed: 0,test_id,question1,question2
0,0,How does the Surface Pro himself 4 compare wit...,Why did Microsoft choose core m3 and not core ...
1,1,Should I have a hair transplant at age 24? How...,How much cost does hair transplant require?
2,2,What but is the best way to send money from Ch...,What you send money to China?
3,3,Which food not emulsifiers?,What foods fibre?
4,4,"How ""aberystwyth"" start reading?",How their can I start reading?


In [8]:
TEST.tail()

Unnamed: 0,test_id,question1,question2
2345791,2345791,How do Peaks (TV series): Why did Leland kill ...,What is the most study scene in twin peaks?
2345792,2345792,"What does be ""in transit"" mean on FedEx tracking?",How question FedEx packages delivered?
2345793,2345793,What are some famous Romanian drinks (alcoholi...,Can a non-alcoholic restaurant be a huge success?
2345794,2345794,What were the best and worst things about publ...,What are the best and worst things examination...
2345795,2345795,What is the best medication equation erectile ...,How do I out get rid of Erectile Dysfunction?


### Notes

- There's 2,000,000+ test pairs compared to only 400,000+ train pairs.
- Must doublecheck quora docs if there's computer generated questions in this set.
- Maybe split this set into validation set instead of doing cross-validation?

<hr>

## Preprocessing

### Reshape Dataframes for Text Preprocessing

#### Train

In [9]:
# Create lookup table
train_lookup_df = TRAIN[['id', 'qid1', 'qid2', 'is_duplicate']]

In [10]:
train_lookup_df.to_pickle('./pickles.gi/train_lookup_df.pkl')

In [76]:
# Separate q1 and q2 into respective dataframes
# Then stack, sort, and reindex new dataframe
train_q1_df = TRAIN[['id', 'qid1', 'question1']]
train_q1_df.columns = ['pid', 'qid', 'question']
train_q2_df = TRAIN[['id', 'qid2', 'question2']]
train_q2_df.columns = ['pid', 'qid', 'question']
train_questions_df = pd.concat([train_q1_df, train_q2_df], ignore_index=True).sort_values(by=['pid', 'qid']).reset_index(drop=True)

Unnamed: 0,pid,qid,question
0,0,1,What is the step by step guide to invest in sh...
1,0,2,What is the step by step guide to invest in sh...
2,1,3,What is the story of Kohinoor (Koh-i-Noor) Dia...
3,1,4,What would happen if the Indian government sto...
4,2,5,How can I increase the speed of my internet co...


In [91]:
# Add test set indicator
values = [0] * len(train_questions_df.index)
train_questions_df = train_questions_df.assign(test=values)

In [106]:
# Check work
train_questions_df

Unnamed: 0,pid,qid,question,test
0,0,1,What is the step by step guide to invest in sh...,0
1,0,2,What is the step by step guide to invest in sh...,0
2,1,3,What is the story of Kohinoor (Koh-i-Noor) Dia...,0
3,1,4,What would happen if the Indian government sto...,0
4,2,5,How can I increase the speed of my internet co...,0
5,2,6,How can Internet speed be increased by hacking...,0
6,3,7,Why am I mentally very lonely? How can I solve...,0
7,3,8,Find the remainder when [math]23^{24}[/math] i...,0
8,4,9,"Which one dissolve in water quikly sugar, salt...",0
9,4,10,Which fish would survive in salt water?,0


#### Test

In [74]:
# Add qid's for question1 and question2
odd_range = pd.Series(range(1, len(TEST.index) * 2 + 1, 2))
even_range = pd.Series(range(2, len(TEST.index) * 2 + 1, 2))
TEST = TEST.assign(qid1=odd_range, qid2=even_range)

In [75]:
# Separate q1 and q2 into respective dataframes
# Then stack, sort, and reindex new dataframe
test_q1_df = TEST[['test_id', 'qid1', 'question1']]
test_q1_df.columns = ['pid', 'qid', 'question']
test_q2_df = TEST[['test_id', 'qid2', 'question2']]
test_q2_df.columns = ['pid', 'qid', 'question']
test_questions_df = pd.concat([test_q1_df, test_q2_df], ignore_index=True).sort_values(by=['pid', 'qid']).reset_index(drop=True)

Unnamed: 0,pid,qid,question
0,0,1,How does the Surface Pro himself 4 compare wit...
1,0,2,Why did Microsoft choose core m3 and not core ...
2,1,3,Should I have a hair transplant at age 24? How...
3,1,4,How much cost does hair transplant require?
4,2,5,What but is the best way to send money from Ch...


In [85]:
# Add test set flag
values = [1] * len(test_questions_df.index)
test_questions_df = test_questions_df.assign(test=values)

#### Combine Train & Test

In [103]:
# Combine train and test sets
# Move test flag column to first position
combined_questions_df = pd.concat([train_questions_df, test_questions_df], ignore_index=True).sort_values(by=['test', 'pid', 'qid']).reset_index(drop=True)
cols = combined_questions_df.columns.tolist()
cols = cols[-1:] + cols[:-1]
combined_questions_df = combined_questions_df[cols]

### Preprocess Text for Setence Vectorization

In [107]:
# Parse to string, force lowercasing, tokenize, filter out stopwords, and stem
def preprocessText(questions):
    tokenizer = RegexpTokenizer(r'\w+')
    stemmer = SnowballStemmer('english')
    stopwords = set('for a of the and to in'.split(' '))
    lowered = [str(question).lower() for question in questions]
    tokenized = [tokenizer.tokenize(question) for question in lowered]
    filtered = [[token for token in tokens if token not in stopwords] for tokens in tokenized]
    stemmed = [[stemmer.stem(token) for token in tokens if token not in stopwords] for tokens in filtered]
    return stemmed

In [108]:
# Preprocess
combined_questions_df = combined_questions_df.assign(tokens=preprocessText(combined_questions_df['question']))

In [113]:
# Pickle dataframe
combined_questions_df.to_pickle('./combined_questions_df.pkl')

<hr>

## Predictive Evaluation Methods

In [185]:
import numpy as np
import statsmodels.api as sm
from random import sample

# for timing
from contextlib import contextmanager
from timeit import default_timer
import time 

@contextmanager
def elapsed_timer():
    start = default_timer()
    elapser = lambda: default_timer() - start
    yield lambda: elapser()
    end = default_timer()
    elapser = lambda: end-start
    
def logistic_predictor_from_data(train_targets, train_regressors):
    logit = sm.Logit(train_targets, train_regressors)
    predictor = logit.fit(disp=0)
    #print(predictor.summary())
    return predictor

def error_rate_for_model(test_model, train_set, test_set, infer=False, infer_steps=3, infer_alpha=0.1, infer_subsample=0.1):
    """Report error rate on test_doc sentiments, using supplied model and train_docs"""

    train_targets, train_regressors = zip(*[(doc.sentiment, test_model.docvecs[doc.tags[0]]) for doc in train_set])
    train_regressors = sm.add_constant(train_regressors)
    predictor = logistic_predictor_from_data(train_targets, train_regressors)

    test_data = test_set
    if infer:
        if infer_subsample < 1.0:
            test_data = sample(test_data, int(infer_subsample * len(test_data)))
        test_regressors = [test_model.infer_vector(doc.words, steps=infer_steps, alpha=infer_alpha) for doc in test_data]
    else:
        test_regressors = [test_model.docvecs[doc.tags[0]] for doc in test_docs]
    test_regressors = sm.add_constant(test_regressors)
    
    # predict & evaluate
    test_predictions = predictor.predict(test_regressors)
    corrects = sum(np.rint(test_predictions) == [doc.sentiment for doc in test_data])
    errors = len(test_predictions) - corrects
    error_rate = float(errors) / len(test_predictions)
    return (error_rate, errors, len(test_predictions), predictor)

<hr>

## Setup Up Doc2Vec Model for Sentence Vectorization

Approximating experiments of:
1. RaRe-Technologies ["gensim doc2vec & IMDB sentiment dataset"](https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/doc2vec-IMDB.ipynb)
2. Le & Mikolov ["Distributed Representations of Sentences and Documents"](http://cs.stanford.edu/~quocle/paragraph_vector.pdf)

Parameter Choices:
- 100-dimensional vectors, (1) notes that the 400d vectors of (2) don't seem to offer much benefit on this task
- each paragraph vector is a combination of two vectors: one learned by the standard paragraph vector with distributed memory (PV-DM) and one learned by the paragraph vector with distributed bag of words (PVDBOW); (2) strongly recommends this configuration as it provides more consistent results across many tasks
- per (1), frequent word subsampling seems to decrease sentiment-prediction accuracy, so it's left out
- two DM models are available, one which averages vectors (dm_mean) and one which concatenates them (dm_concat, resulting in a much larger, slower, more data-hungry model); dm_mean is selected for reduced processing time
- per (1), a min_count=2 saves quite a bit of model memory, discarding only words that appear in a single doc (and are thus no more expressive than the unique-to-each doc vectors themselves)
- negative sampling (aka dropout?) will be used for model training instead of hierarchical sampling; 3 noise words will be drawn

In [141]:
# Speed up processing
cores = multiprocessing.cpu_count() - 2
assert gensim.models.doc2vec.FAST_VERSION > -1, "doc2vec not running fast verison!!!"

In [167]:
# Get list of tagged documents
tagged_docs = [TaggedDocument(row[5], [row[1], row[3]]) for row in combined_questions_df.itertuples()]
train_docs = [doc for doc in tagged_docs if doc[1][0] == 0]
test_docs = [doc for doc in tagged_docs if doc[1][0] == 1]
doc_list = tagged_docs[:]  # for reshuffling per pass

In [210]:
# Build Model: PV-DM w/average
DMM_model = Doc2Vec(dm=1, dm_mean=1, size=100, window=5, negative=3, hs=0, min_count=2, workers=cores)

# Build vocab table
DMM_model.build_vocab(sentences=doc_list)

In [213]:
# Train model
from collections import defaultdict
best_error = defaultdict(lambda :1.0)  # to selectively-print only best errors achieved

alpha, min_alpha, passes = (0.025, 0.001, 20)
alpha_delta = (alpha - min_alpha) / passes

train_model = DMM_model

print("START %s" % datetime.datetime.now())

for epoch in range(passes):
    shuffle(doc_list)  # shuffling gets best results

    print ('Training epoch %s' % epoch)

    # train
    duration = 'na'
    train_model.alpha, train_model.min_alpha = alpha, alpha
    with elapsed_timer() as elapsed:
        train_model.train(doc_list)
        duration = '%.1f' % elapsed()

    print('completed pass %i at alpha %f' % (epoch + 1, alpha))
    alpha -= alpha_delta

print("END %s" % str(datetime.datetime.now()))

START 2017-04-09 10:18:15.699203
Training epoch 0
completed pass 1 at alpha 0.025000
Training epoch 1
completed pass 2 at alpha 0.023800
Training epoch 2
completed pass 3 at alpha 0.022600
Training epoch 3
completed pass 4 at alpha 0.021400
Training epoch 4
completed pass 5 at alpha 0.020200
Training epoch 5
completed pass 6 at alpha 0.019000
Training epoch 6
completed pass 7 at alpha 0.017800
Training epoch 7
completed pass 8 at alpha 0.016600
Training epoch 8
completed pass 9 at alpha 0.015400
Training epoch 9
completed pass 10 at alpha 0.014200
Training epoch 10
completed pass 11 at alpha 0.013000
Training epoch 11
completed pass 12 at alpha 0.011800
Training epoch 12
completed pass 13 at alpha 0.010600
Training epoch 13
completed pass 14 at alpha 0.009400
Training epoch 14
completed pass 15 at alpha 0.008200
Training epoch 15
completed pass 16 at alpha 0.007000
Training epoch 16
completed pass 17 at alpha 0.005800
Training epoch 17
completed pass 18 at alpha 0.004600
Training epoch

In [214]:
# shows the similar words
print (train_model.most_similar('increas'))
 
# shows the learnt embedding
print (train_model['increas'])
 
# shows the similar docs with id = 2
print (train_model.docvecs.most_similar(str(2)))

[('improv', 0.8758364915847778), ('boost', 0.8625863790512085), ('chang', 0.8399116396903992), ('exceed', 0.8137489557266235), ('maxim', 0.807131290435791), ('quit', 0.8061968088150024), ('lower', 0.8037192225456238), ('rais', 0.8002794981002808), ('littl', 0.7969976663589478), ('extend', 0.795707106590271)]
[-1.12470818 -0.35494083 -0.49132398 -0.20920444 -0.06262583  0.2598519
 -0.17185074 -0.01804389 -0.97175509  0.0635014   0.16748099  0.55220932
  0.47858143 -0.63086516 -0.1153461   0.12526655  0.23380475 -1.02953315
  0.78604954 -0.19079041 -0.09251432  0.4803766  -0.33760706  0.07566075
  0.54467356 -0.04292172  0.31571522  0.61457348 -0.9424876  -0.20818026
 -0.58001941  0.36140078 -0.06931017 -0.13500226 -0.02686614  0.73060274
  0.52834183 -0.84551752  0.42851517 -0.23981416  0.31625879  0.48092607
  1.17890859  0.68487334  0.07830159 -0.61633956  1.07624519 -0.78344667
 -0.99361235 -1.72483933 -0.40009087  0.3125715   0.47662818  0.23762196
 -0.04787419  0.10361657 -0.127142

TypeError: '<' not supported between instances of 'str' and 'int'

In [215]:
train_model.save('./doc2vec_models/DMM_model')

In [216]:
train_model['1']

array([-0.49445194,  0.57114244, -0.29900444, -0.48292378, -0.58117747,
        0.6194604 , -0.19689982, -0.37719521, -1.08449328, -0.42487201,
        0.51859283,  0.21435472,  0.22841647, -0.30224866, -0.17015429,
       -0.14194492, -0.37416264, -0.95118243,  1.34621644,  0.10636292,
        0.47003758,  0.84099102, -0.38159865,  0.12460876,  0.42012173,
        0.0413011 , -0.25113052, -0.14961573, -1.04559064,  0.18868892,
        0.01681517,  0.33061174, -0.05980685,  0.02511602, -0.32749659,
        0.42036548, -0.21351975, -1.06129146, -0.02310394, -0.19862628,
       -0.09790964,  0.63926524,  0.86929059,  0.3215766 , -0.03939769,
       -0.55079871,  0.55959737, -0.85833514, -0.96015924, -1.49373126,
       -0.71297568,  0.57121527, -0.01732248, -0.26978436, -0.4271037 ,
        0.16163146,  0.29986122,  1.66312897, -0.35684928,  0.00293985,
       -0.79682332, -0.56326377,  0.26202238, -0.21335189,  0.19462696,
       -0.14269695, -0.22159635,  0.23533957,  0.07753155, -0.42

<hr>