In [24]:
import math
import pandas as pd
import numpy as np
import scipy as sp
import gensim.models.doc2vec as d2v
import multiprocessing as mp
import datetime as dt

from scipy.spatial.distance import cosine
from collections import OrderedDict
from random import shuffle
from nltk.tokenize import RegexpTokenizer
from nltk.stem import SnowballStemmer
from sklearn.linear_model import LogisticRegression
from sklearn import model_selection
from sklearn.metrics import log_loss

### Parallelization

In [2]:
cores = mp.cpu_count() - 1
assert d2v.FAST_VERSION > -1, "Doc2Vec will run painfully slow otherwise"

### Import Data

In [4]:
# Import training data into dataframe
all_questions_df = pd.read_pickle('./pickles.gi/all_questions_df.pkl')

In [22]:
train_outcomes_df = pd.read_pickle('./pickles.gi/train_lookup_df.pkl')

### Preprocess Data

In [6]:
# Get tagged documents
tagged_docs = [d2v.TaggedDocument(row[5], [row[1], row[3]]) for row in all_questions_df.itertuples()]
train_docs = [doc for doc in tagged_docs if doc[1][0] == 0]
test_docs = [doc for doc in tagged_docs if doc[1][0] == 1]
doc_list = tagged_docs[:]  # for reshuffling per pass

### Import DMM Model

In [8]:
dmm_model = d2v.Doc2Vec.load('./models.gi/dmm_model.build_vocab')

### Train Model

In [9]:
# Train model
alpha, min_alpha, passes = (0.025, 0.001, 10)
alpha_delta = (alpha - min_alpha) / passes

train_model = dmm_model

print("START %s" % dt.datetime.now())

for epoch in range(passes):
    # shuffling gets best results
    shuffle(doc_list)

    print ('Training epoch %s' % epoch)

    # train
    train_model.alpha, train_model.min_alpha = alpha, alpha
    train_model.train(doc_list)

    print('completed pass %i at alpha %f' % (epoch + 1, alpha))
    print(str(dt.datetime.now()))
    
    alpha -= alpha_delta

train_model.save('./models.gi/dmm_model.trained')

print("END %s" % str(dt.datetime.now()))

START 2017-04-25 20:32:01.161559
Training epoch 0
completed pass 1 at alpha 0.025000
2017-04-25 21:42:24.616931
Training epoch 1
completed pass 2 at alpha 0.022600
2017-04-25 22:49:53.580664
Training epoch 2
completed pass 3 at alpha 0.020200
2017-04-25 23:56:48.398570
Training epoch 3
completed pass 4 at alpha 0.017800
2017-04-26 01:03:45.691996
Training epoch 4
completed pass 5 at alpha 0.015400
2017-04-26 02:10:40.141003
Training epoch 5
completed pass 6 at alpha 0.013000
2017-04-26 03:17:32.455762
Training epoch 6
completed pass 7 at alpha 0.010600
2017-04-26 04:23:51.984528
Training epoch 7
completed pass 8 at alpha 0.008200
2017-04-26 05:30:34.487696
Training epoch 8
completed pass 9 at alpha 0.005800
2017-04-26 06:37:18.928758
Training epoch 9
completed pass 10 at alpha 0.003400
2017-04-26 07:43:56.880602
END 2017-04-26 07:45:48.292163


In [16]:
# Get document vectors for each question
all_questions_df['vector'] = [dmm_model.infer_vector(row) for row in all_questions_df['tokens']]

In [19]:
all_questions_df.head()

Unnamed: 0,test,pid,qid,question,tokens,vector
0,0,0,1,What is the step by step guide to invest in sh...,"[what, is, step, by, step, guid, invest, share...","[0.0146032, -0.956191, -0.708046, 0.0977121, 0..."
1,0,0,2,What is the step by step guide to invest in sh...,"[what, is, step, by, step, guid, invest, share...","[-0.230472, 0.581748, 0.237926, -0.976853, 0.0..."
2,0,1,3,What is the story of Kohinoor (Koh-i-Noor) Dia...,"[what, is, stori, kohinoor, koh, i, noor, diam...","[-0.523243, -0.246485, -0.602073, -0.667664, 0..."
3,0,1,4,What would happen if the Indian government sto...,"[what, would, happen, if, indian, govern, stol...","[0.261746, -0.106042, 0.397741, 0.128407, 0.11..."
4,0,2,5,How can I increase the speed of my internet co...,"[how, can, i, increas, speed, my, internet, co...","[-0.154436, 0.027261, 0.507302, -0.230165, 0.1..."


### Calculate Cosine Similarity

In [25]:
# Train
documents_train = all_questions_df[all_questions_df['test'] == 0]
documents_train = documents_train.merge(documents_train, on='pid', how='inner')
documents_train = documents_train[(documents_train['qid_x'] != documents_train['qid_y']) & (documents_train['qid_x'] < documents_train['qid_y'])]
documents_train = documents_train[['pid', 'vector_x', 'vector_y']]
documents_train = documents_train.reset_index(drop=True)
documents_train = documents_train.merge(train_outcomes_df, left_on='pid', right_on='id')
documents_train = documents_train[['pid', 'vector_x', 'vector_y', 'is_duplicate']]
documents_train['cosine_similarity'] = documents_train.apply(lambda x: (1 - cosine(x['vector_x'], x['vector_y'])), axis=1)
documents_train.to_pickle('./pickles.gi/dmm_vectors_cosine_similarity')

In [26]:
documents_train.head()

Unnamed: 0,pid,vector_x,vector_y,is_duplicate,cosine_similarity
0,0,"[0.0146032, -0.956191, -0.708046, 0.0977121, 0...","[-0.230472, 0.581748, 0.237926, -0.976853, 0.0...",0,0.262513
1,1,"[-0.523243, -0.246485, -0.602073, -0.667664, 0...","[0.261746, -0.106042, 0.397741, 0.128407, 0.11...",0,0.091059
2,2,"[-0.154436, 0.027261, 0.507302, -0.230165, 0.1...","[0.534726, 0.132381, 0.611858, 0.222932, -0.16...",0,0.262287
3,3,"[0.26084, -0.0442004, 0.233576, -0.278878, 0.2...","[0.199144, -0.635316, 0.396587, -0.338456, 1.2...",0,0.136919
4,4,"[-0.15542, -1.19199, -0.590674, -0.834643, 0.5...","[0.717642, -0.693195, -0.319493, -0.693959, 0....",0,0.124205


In [27]:
# Test
documents_test = all_questions_df[all_questions_df['test'] == 1]
documents_test = documents_test.merge(documents_test, on='pid', how='inner')
documents_test = documents_test[(documents_test['qid_x'] != documents_test['qid_y']) & (documents_test['qid_x'] < documents_test['qid_y'])]
documents_test = documents_test[['pid', 'vector_x', 'vector_y']]
documents_test = documents_test.reset_index(drop=True)
documents_test['cosine_similarity'] = documents_test.apply(lambda x: (1 - cosine(x['vector_x'], x['vector_y'])), axis=1)
documents_test.to_pickle('./pickles.gi/dmm_vectors_cosine_similarity_test')

In [28]:
documents_test.head()

Unnamed: 0,pid,vector_x,vector_y,cosine_similarity
0,0,"[0.486248, -0.338634, 1.66882, -0.680684, 0.02...","[-0.684407, 0.187148, 0.573343, -0.341806, 0.0...",0.260499
1,1,"[-0.168766, -0.588327, 0.171794, -0.18299, 0.2...","[-0.212234, 0.00949307, 0.626844, 0.607864, -0...",0.244934
2,2,"[0.878145, 0.734945, 0.852028, -0.772522, -0.6...","[0.651759, -0.787332, 0.735912, -0.112427, 0.0...",0.191064
3,3,"[0.131549, -1.01717, 0.353361, -0.292697, 0.60...","[0.362579, 0.218021, 0.437419, -0.0652552, 0.0...",0.138348
4,4,"[-0.320172, -0.105362, 0.50626, 0.470953, 0.45...","[-0.710141, -0.148397, -0.370671, -0.256379, -...",0.038799


### Evaluation Metrics

In [29]:
def kfoldScore(results):
    mean_score = results.mean()
    std_dev = results.std()
    std_error = results.std() / math.sqrt(results.shape[0])
    ci =  2.262 * std_error
    lower_bound = mean_score - ci
    upper_bound = mean_score + ci
    print ("Score is %f +/-  %f" % (mean_score, ci))
    print ('95 percent probability that if this experiment were repeated over and over the average score would be between %f and %f' % (lower_bound, upper_bound))

### Model Parameters

In [30]:
seed = 7
kfold = model_selection.KFold(n_splits=10, random_state=seed)
model = LogisticRegression()
scoring = 'neg_log_loss'
model = LogisticRegression()

### Cosine Similarity Model

In [31]:
# Train-Test Split
X_train_cs = documents_train['cosine_similarity'].values.reshape(-1,1)
y_train_cs = documents_train['is_duplicate']
X_test_cs = documents_test['cosine_similarity'].values.reshape(-1,1)

# Kfold Cross Validation
cross_val_scores_cs = model_selection.cross_val_score(model, X_train_cs, y_train_cs, cv=kfold, scoring=scoring)
kfoldScore(cross_val_scores_cs)

# Predict Outcome Probabilities
model.fit(X_train_cs, y_train_cs)
y_pred_test_cs = model.predict_proba(X_test_cs)

# Create CSV
submission_cs = documents_test['pid'].to_frame()
submission_cs['is_duplicate'] =  pd.Series([row[1] for row in y_pred_test_cs])
submission_cs.columns = ['test_id', 'is_duplicate']
submission_cs.to_csv('./dmm_model_cosine_similarity.csv', index=False)

Score is -0.647815 +/-  0.002564
95 percent probability that if this experiment were repeated over and over the average score would be between -0.650380 and -0.645251
