In [4]:
import math
import pandas as pd
import numpy as np
import scipy as sp
import gensim.models.doc2vec as d2v
import multiprocessing as mp
import datetime as dt

from scipy.spatial.distance import cosine
from collections import OrderedDict
from random import shuffle
from nltk.tokenize import RegexpTokenizer
from nltk.stem import SnowballStemmer
from sklearn.linear_model import LogisticRegression
from sklearn import model_selection
from sklearn.metrics import log_loss

### Parallelization

In [5]:
cores = mp.cpu_count() - 1
assert d2v.FAST_VERSION > -1, "Doc2Vec will run painfully slow otherwise"

### Import Data

In [6]:
# Import training data into dataframe
all_questions_df = pd.read_pickle('./pickles.gi/all_questions_df.pkl')

In [7]:
train_outcomes_df = pd.read_pickle('./pickles.gi/train_lookup_df.pkl')

### Preprocess Data

In [8]:
# Get tagged documents
tagged_docs = [d2v.TaggedDocument(row[5], [row[1], row[3]]) for row in all_questions_df.itertuples()]
train_docs = [doc for doc in tagged_docs if doc[1][0] == 0]
test_docs = [doc for doc in tagged_docs if doc[1][0] == 1]
doc_list = tagged_docs[:]  # for reshuffling per pass

### Import DMM Model

In [None]:
dbow_model = d2v.Doc2Vec.load('./models.gi/dbow_model.build_vocab')

### Train Model

In [None]:
# Train model
alpha, min_alpha, passes = (0.025, 0.001, 10)
alpha_delta = (alpha - min_alpha) / passes

train_model = dbow_model

print("START %s" % dt.datetime.now())

for epoch in range(passes):
    # shuffling gets best results
    shuffle(doc_list)

    print ('Training epoch %s' % epoch)

    # train
    train_model.alpha, train_model.min_alpha = alpha, alpha
    train_model.train(doc_list)

    print('completed pass %i at alpha %f' % (epoch + 1, alpha))
    print(str(dt.datetime.now()))
    
    alpha -= alpha_delta

train_model.save('./models.gi/dbow_model.trained')

print("END %s" % str(dt.datetime.now()))

In [11]:
dbow_model = d2v.Doc2Vec.load('./models.gi/dbow_model.trained')

In [12]:
# Get document vectors for each question
all_questions_df['vector'] = [dbow_model.infer_vector(row) for row in all_questions_df['tokens']]

In [13]:
all_questions_df.head()

Unnamed: 0,test,pid,qid,question,tokens,vector
0,0,0,1,What is the step by step guide to invest in sh...,"[what, is, step, by, step, guid, invest, share...","[0.267908, -0.0955344, -0.10118, -0.0556185, 0..."
1,0,0,2,What is the step by step guide to invest in sh...,"[what, is, step, by, step, guid, invest, share...","[0.191767, 0.0166993, -0.0327661, -0.0625783, ..."
2,0,1,3,What is the story of Kohinoor (Koh-i-Noor) Dia...,"[what, is, stori, kohinoor, koh, i, noor, diam...","[0.23633, 0.100282, -0.113752, 0.139624, 0.218..."
3,0,1,4,What would happen if the Indian government sto...,"[what, would, happen, if, indian, govern, stol...","[0.142674, -0.192956, -0.0365042, -0.00100532,..."
4,0,2,5,How can I increase the speed of my internet co...,"[how, can, i, increas, speed, my, internet, co...","[-0.532338, -0.0369567, -0.130684, 0.183189, 0..."


### Calculate Cosine Similarity

In [14]:
# Train
documents_train = all_questions_df[all_questions_df['test'] == 0]
documents_train = documents_train.merge(documents_train, on='pid', how='inner')
documents_train = documents_train[(documents_train['qid_x'] != documents_train['qid_y']) & (documents_train['qid_x'] < documents_train['qid_y'])]
documents_train = documents_train[['pid', 'vector_x', 'vector_y']]
documents_train = documents_train.reset_index(drop=True)
documents_train = documents_train.merge(train_outcomes_df, left_on='pid', right_on='id')
documents_train = documents_train[['pid', 'vector_x', 'vector_y', 'is_duplicate']]
documents_train['cosine_similarity'] = documents_train.apply(lambda x: (1 - cosine(x['vector_x'], x['vector_y'])), axis=1)
documents_train.to_pickle('./pickles.gi/dbow_vectors_cosine_similarity_train')

In [15]:
documents_train.head()

Unnamed: 0,pid,vector_x,vector_y,is_duplicate,cosine_similarity
0,0,"[0.267908, -0.0955344, -0.10118, -0.0556185, 0...","[0.191767, 0.0166993, -0.0327661, -0.0625783, ...",0,0.304468
1,1,"[0.23633, 0.100282, -0.113752, 0.139624, 0.218...","[0.142674, -0.192956, -0.0365042, -0.00100532,...",0,0.549187
2,2,"[-0.532338, -0.0369567, -0.130684, 0.183189, 0...","[-0.216235, -0.027894, -0.0838264, 0.0726645, ...",0,0.375429
3,3,"[-0.0021527, 0.0867196, 0.00909216, 0.0286029,...","[0.0998014, 0.110679, -0.0399092, -0.0471708, ...",0,0.183327
4,4,"[0.690951, 0.148918, 0.0675064, -0.0768607, 0....","[-0.157396, -0.0319132, 0.0558307, 0.0341421, ...",0,0.495322


In [16]:
# Test
documents_test = all_questions_df[all_questions_df['test'] == 1]
documents_test = documents_test.merge(documents_test, on='pid', how='inner')
documents_test = documents_test[(documents_test['qid_x'] != documents_test['qid_y']) & (documents_test['qid_x'] < documents_test['qid_y'])]
documents_test = documents_test[['pid', 'vector_x', 'vector_y']]
documents_test = documents_test.reset_index(drop=True)
documents_test['cosine_similarity'] = documents_test.apply(lambda x: (1 - cosine(x['vector_x'], x['vector_y'])), axis=1)
documents_test.to_pickle('./pickles.gi/dbow_vectors_cosine_similarity_test')

In [17]:
documents_test.head()

Unnamed: 0,pid,vector_x,vector_y,cosine_similarity
0,0,"[0.037997, 0.252992, 0.0104824, -0.0321285, 0....","[0.0416953, -0.192362, 0.05134, -0.318364, 0.1...",0.151364
1,1,"[-0.219072, 0.000400307, 0.021056, -0.324568, ...","[0.0555762, -0.0338761, 0.118223, -0.0916486, ...",0.281057
2,2,"[0.093416, -0.222189, -0.229785, -0.0479537, -...","[0.11058, -0.163763, -0.0781699, -0.158556, 0....",0.261138
3,3,"[-0.189055, 0.098018, -0.322019, -0.229096, -0...","[-0.114543, -0.0186581, -0.149556, 0.0213385, ...",0.461827
4,4,"[0.21405, -0.0372923, -0.0225445, 0.102477, 0....","[-0.0954036, -0.172726, -0.000439818, 0.023264...",0.425137


### Evaluation Metrics

In [18]:
def kfoldScore(results):
    mean_score = results.mean()
    std_dev = results.std()
    std_error = results.std() / math.sqrt(results.shape[0])
    ci =  2.262 * std_error
    lower_bound = mean_score - ci
    upper_bound = mean_score + ci
    print ("Score is %f +/-  %f" % (mean_score, ci))
    print ('95 percent probability that if this experiment were repeated over and over the average score would be between %f and %f' % (lower_bound, upper_bound))

### Model Parameters

In [19]:
seed = 7
kfold = model_selection.KFold(n_splits=10, random_state=seed)
model = LogisticRegression()
scoring = 'neg_log_loss'
model = LogisticRegression()

### Cosine Similarity Model

In [20]:
# Train-Test Split
X_train_cs = documents_train['cosine_similarity'].values.reshape(-1,1)
y_train_cs = documents_train['is_duplicate']
X_test_cs = documents_test['cosine_similarity'].values.reshape(-1,1)

# Kfold Cross Validation
cross_val_scores_cs = model_selection.cross_val_score(model, X_train_cs, y_train_cs, cv=kfold, scoring=scoring)
kfoldScore(cross_val_scores_cs)

# Predict Outcome Probabilities
model.fit(X_train_cs, y_train_cs)
y_pred_test_cs = model.predict_proba(X_test_cs)

# Create CSV
submission_cs = documents_test['pid'].to_frame()
submission_cs['is_duplicate'] =  pd.Series([row[1] for row in y_pred_test_cs])
submission_cs.columns = ['test_id', 'is_duplicate']
submission_cs.to_csv('./submission.gi/dbow_model_cosine_similarity.csv', index=False)

Score is -0.640641 +/-  0.002441
95 percent probability that if this experiment were repeated over and over the average score would be between -0.643083 and -0.638200
