In [6]:
import pandas as pd
import numpy as np
import scipy as sp
import gensim.models.doc2vec as d2v
import multiprocessing as mp
import datetime as dt
from sklearn.linear_model import LogisticRegression
from sklearn import model_selection
from sklearn.metrics import log_loss
import math
from scipy.spatial.distance import cosine

### Import Data

In [3]:
documents = pd.read_pickle('./pickles.gi/documents_with_vectors.pkl')
outcomes_train = pd.read_pickle('./pickles.gi/train_lookup_df.pkl')

In [4]:
documents.shape

(5500172, 6)

In [5]:
outcomes_train.head()

Unnamed: 0,id,qid1,qid2,is_duplicate
0,0,1,2,0
1,1,3,4,0
2,2,5,6,0
3,3,7,8,0
4,4,9,10,0


### Calculate Cosine Similarity

In [8]:
documents_train = documents[documents['test'] == 0]
documents_train = documents_train.merge(documents_train, on='pid', how='inner')
documents_train = documents_train[(documents_train['qid_x'] != documents_train['qid_y']) & (documents_train['qid_x'] < documents_train['qid_y'])]
documents_train = documents_train[['pid', 'vectors_x', 'vectors_y']]
documents_train = documents_train.reset_index(drop=True)
documents_train = documents_train.merge(outcomes_train, left_on='pid', right_on='id')
documents_train = documents_train[['pid', 'vectors_x', 'vectors_y', 'is_duplicate']]
documents_train['cosine_similarity'] = documents_train.apply(lambda x: (1 - cosine(x['vectors_x'], x['vectors_y'])), axis=1)
documents_train.to_pickle('./pickles.gi/documents_train_cosine_similarity')

In [9]:
documents_train.head()

Unnamed: 0,pid,vectors_x,vectors_y,is_duplicate,cosine_similarity
0,0,"[0.103571, -0.324362, -0.0369732, 0.491305, 0....","[-1.00418, -0.011127, 0.357603, -0.945217, 0.1...",0,0.334541
1,1,"[0.780759, 0.254167, -0.438259, -1.03809, -0.3...","[-0.0608294, 0.256059, 0.275494, 0.383385, -0....",0,0.042259
2,2,"[0.287551, 0.815586, 0.169537, -0.247909, -0.5...","[0.951204, 0.87587, 0.314868, 1.09143, 0.59101...",0,0.235947
3,3,"[-0.346908, 0.120057, 0.203118, 0.048314, -0.1...","[1.62754, 0.0183279, -0.494569, 0.321272, 0.12...",0,0.003515
4,4,"[-0.479951, 0.235628, 0.16198, -0.689058, -0.5...","[-0.889178, -0.321585, 0.44331, -0.327662, -1....",0,0.105817


In [10]:
documents_test = documents[documents['test'] == 1]
documents_test = documents_test.merge(documents_test, on='pid', how='inner')
documents_test = documents_test[(documents_test['qid_x'] != documents_test['qid_y']) & (documents_test['qid_x'] < documents_test['qid_y'])]
documents_test = documents_test[['pid', 'vectors_x', 'vectors_y']]
documents_test = documents_test.reset_index(drop=True)
documents_test['cosine_similarity'] = documents_test.apply(lambda x: (1 - cosine(x['vectors_x'], x['vectors_y'])), axis=1)
documents_test.to_pickle('./pickles.gi/documents_test_cosine_similarity')

In [11]:
documents_test.head()

Unnamed: 0,pid,vectors_x,vectors_y,cosine_similarity
0,0,"[-0.214728, -0.21874, 0.386611, 0.369, 0.07617...","[-0.662917, -0.25785, 0.199841, -0.454418, 0.5...",-0.052104
1,1,"[-0.205587, 1.22863, -0.00696208, 0.0571063, -...","[0.174078, 0.257381, 0.646523, 0.381952, -0.49...",0.262038
2,2,"[0.0120485, -0.649001, 0.621607, 0.261708, 0.3...","[0.731086, -1.45327, 0.494948, 0.412926, -0.23...",0.427748
3,3,"[0.251726, 0.81067, 0.212609, 0.0255506, -0.31...","[0.113517, 0.0238135, -0.167942, -0.193619, -0...",-0.058153
4,4,"[-0.36815, -0.468827, 0.00867531, 0.0692847, 0...","[0.475766, -0.177542, 0.00298492, -0.25851, -0...",0.061502


### Evaluation Metrics

In [12]:
def kfoldScore(results):
    mean_score = results.mean()
    std_dev = results.std()
    std_error = results.std() / math.sqrt(results.shape[0])
    ci =  2.262 * std_error
    lower_bound = mean_score - ci
    upper_bound = mean_score + ci
    print ("Score is %f +/-  %f" % (mean_score, ci))
    print ('95 percent probability that if this experiment were repeated over and over the average score would be between %f and %f' % (lower_bound, upper_bound))

### Model Parameters

In [15]:
seed = 7
kfold = model_selection.KFold(n_splits=10, random_state=seed)
model = LogisticRegression()
scoring = 'neg_log_loss'
model = LogisticRegression()

### Cosine Similarity Model

In [16]:
# Train-Test Split
X_train_cs = documents_train['cosine_similarity'].values.reshape(-1,1)
y_train_cs = documents_train['is_duplicate']
X_test_cs = documents_test['cosine_similarity'].values.reshape(-1,1)

In [18]:
# Kfold Cross Validation
cross_val_scores_cs = model_selection.cross_val_score(model, X_train_cs, y_train_cs, cv=kfold, scoring=scoring)
kfoldScore(cross_val_scores_cs)

Score is -0.652610 +/-  0.002665
95 percent probability that if this experiment were repeated over and over the average score would be between -0.655275 and -0.649944
0.652609860006


In [19]:
# Predict Outcome Probabilities
model.fit(X_train_cs, y_train_cs)
y_pred_test_cs = model.predict_proba(X_test_cs)

In [36]:
# Create CSV
submission_cs = documents_test['pid'].to_frame()
submission_cs['is_duplicate'] =  pd.Series([row[1] for row in y_pred_test_cs])
submission_cs.columns = ['test_id', 'is_duplicate']
submission_cs.to_csv('./submission_cs.csv', index=False)

### Calculate Vector Differences

In [38]:
# Get Train Diffs As Array, Then Expand Into Dataframe
documents_train['vector_diff'] = np.subtract(documents_train['vectors_x'], documents_train['vectors_y'])
vector_diffs_train = documents_train['vector_diff'].apply(pd.Series)
vector_diffs_train = vector_diffs_train.rename(columns = lambda x : 'val_' + str(x))

In [39]:
vector_diffs_train.head()

Unnamed: 0,val_0,val_1,val_2,val_3,val_4,val_5,val_6,val_7,val_8,val_9,...,val_90,val_91,val_92,val_93,val_94,val_95,val_96,val_97,val_98,val_99
0,1.107752,-0.313235,-0.394576,1.436522,0.239561,0.347694,0.380208,-0.264182,-0.271405,-0.948074,...,0.112876,-0.371856,0.011935,0.013963,-0.86503,0.02191,0.269779,-0.395462,1.013008,1.712046
1,0.841588,-0.001892,-0.713754,-1.421479,-0.005607,-0.807846,0.090561,0.537039,-0.105543,0.803184,...,-0.402758,-0.871411,-1.006908,0.041368,0.928657,-1.476838,-0.021899,0.451373,-0.181559,0.81691
2,-0.663653,-0.060284,-0.145331,-1.339342,-1.093719,-1.117436,1.072385,1.041608,1.412804,-0.392849,...,0.476503,0.041992,-0.326097,1.31496,0.493841,-0.064051,0.789275,-0.593265,-0.151976,1.118786
3,-1.974446,0.101729,0.697687,-0.272958,-0.307428,2.082574,0.26769,0.484197,0.132945,0.965306,...,0.344746,0.390783,-1.115674,1.117954,-0.266042,-0.033769,-0.73697,0.461864,0.215652,0.231992
4,0.409227,0.557212,-0.281329,-0.361395,0.646847,0.343811,1.13983,-0.88663,0.120703,-0.635926,...,0.129824,0.335401,-1.246236,0.068261,0.105812,-1.395813,0.684883,0.691859,0.192519,0.123829


### Vector Difference Model

In [44]:
# Train-Test Split
X_train_vd = vector_diffs_train
y_train_vd = y_train_cs

In [45]:
# Kfold Cross Validation
cross_val_scores_vd = model_selection.cross_val_score(model, X_train_vd, y_train_vd, cv=kfold, scoring=scoring)
kfoldScore(cross_val_scores_vd)

Score is -0.652517 +/-  0.002654
95 percent probability that if this experiment were repeated over and over the average score would be between -0.655171 and -0.649863


### Vector Difference w/ Cosine Similarity Model

In [46]:
# Train-Test Split
X_train_vdcs = vector_diffs_train
X_train_vdcs['cosine_similarity'] = X_train_cs
y_train_vdcs = y_train_cs

In [47]:
# Kfold Cross Validation
cross_val_scores_vdcs = model_selection.cross_val_score(model, X_train_vdcs, y_train_vdcs, cv=kfold, scoring=scoring)
kfoldScore(cross_val_scores_vdcs)

Score is -0.652517 +/-  0.002654
95 percent probability that if this experiment were repeated over and over the average score would be between -0.655171 and -0.649863


<hr>

In [239]:
X_train_vd = vector_diffs_train
X_train_vd['cosine_similarity'] = documents_train['cosine_similarity']
y_train_vdcs = documents_train['is_duplicate']
X_train_vdcs.head()

Unnamed: 0,vec_diff_0,vec_diff_1,vec_diff_2,vec_diff_3,vec_diff_4,vec_diff_5,vec_diff_6,vec_diff_7,vec_diff_8,vec_diff_9,...,vec_diff_93,vec_diff_94,vec_diff_95,vec_diff_96,vec_diff_97,vec_diff_98,vec_diff_99,cosine_similarity,pid,is_duplicate
0,1.107752,-0.313235,-0.394576,1.436522,0.239561,0.347694,0.380208,-0.264182,-0.271405,-0.948074,...,0.013963,-0.86503,0.02191,0.269779,-0.395462,1.013008,1.712046,0.334541,0,0
1,0.841588,-0.001892,-0.713754,-1.421479,-0.005607,-0.807846,0.090561,0.537039,-0.105543,0.803184,...,0.041368,0.928657,-1.476838,-0.021899,0.451373,-0.181559,0.81691,0.042259,1,0
2,-0.663653,-0.060284,-0.145331,-1.339342,-1.093719,-1.117436,1.072385,1.041608,1.412804,-0.392849,...,1.31496,0.493841,-0.064051,0.789275,-0.593265,-0.151976,1.118786,0.235947,2,0
3,-1.974446,0.101729,0.697687,-0.272958,-0.307428,2.082574,0.26769,0.484197,0.132945,0.965306,...,1.117954,-0.266042,-0.033769,-0.73697,0.461864,0.215652,0.231992,0.003515,3,0
4,0.409227,0.557212,-0.281329,-0.361395,0.646847,0.343811,1.13983,-0.88663,0.120703,-0.635926,...,0.068261,0.105812,-1.395813,0.684883,0.691859,0.192519,0.123829,0.105817,4,0


In [240]:
X_train_vdcs = X_train_vdcs.drop('pid', 1)
X_train_vdcs.head()

Unnamed: 0,vec_diff_0,vec_diff_1,vec_diff_2,vec_diff_3,vec_diff_4,vec_diff_5,vec_diff_6,vec_diff_7,vec_diff_8,vec_diff_9,...,vec_diff_92,vec_diff_93,vec_diff_94,vec_diff_95,vec_diff_96,vec_diff_97,vec_diff_98,vec_diff_99,cosine_similarity,is_duplicate
0,1.107752,-0.313235,-0.394576,1.436522,0.239561,0.347694,0.380208,-0.264182,-0.271405,-0.948074,...,0.011935,0.013963,-0.86503,0.02191,0.269779,-0.395462,1.013008,1.712046,0.334541,0
1,0.841588,-0.001892,-0.713754,-1.421479,-0.005607,-0.807846,0.090561,0.537039,-0.105543,0.803184,...,-1.006908,0.041368,0.928657,-1.476838,-0.021899,0.451373,-0.181559,0.81691,0.042259,0
2,-0.663653,-0.060284,-0.145331,-1.339342,-1.093719,-1.117436,1.072385,1.041608,1.412804,-0.392849,...,-0.326097,1.31496,0.493841,-0.064051,0.789275,-0.593265,-0.151976,1.118786,0.235947,0
3,-1.974446,0.101729,0.697687,-0.272958,-0.307428,2.082574,0.26769,0.484197,0.132945,0.965306,...,-1.115674,1.117954,-0.266042,-0.033769,-0.73697,0.461864,0.215652,0.231992,0.003515,0
4,0.409227,0.557212,-0.281329,-0.361395,0.646847,0.343811,1.13983,-0.88663,0.120703,-0.635926,...,-1.246236,0.068261,0.105812,-1.395813,0.684883,0.691859,0.192519,0.123829,0.105817,0


In [242]:
y_train_vdcs.head()

0    0
1    0
2    0
3    0
4    0
Name: is_duplicate, dtype: int64

In [243]:
results_vdcs = model_selection.cross_val_score(model, X_train_vdcs, y_train_vdcs, cv=kfold, scoring=scoring)

In [244]:
mean_score = results_vdcs.mean()
std_dev = results_vdcs.std()
std_error = results_vdcs.std() / math.sqrt(results_vdcs.shape[0])
ci =  2.262 * std_error
lower_bound = mean_score - ci
upper_bound = mean_score + ci

print ("Score is %f +/-  %f" % (mean_score, ci))
print ('95 percent probability that if this experiment were repeated over and over the average score would be between %f and %f' % (lower_bound, upper_bound))

Score is -0.000130 +/-  0.000000
95 percent probability that if this experiment were repeated over and over the average score would be between -0.000130 and -0.000130


In [245]:
model.fit(X_train_vdcs, y_train_vdcs)
y_pred_vdcs = model.predict_proba(X_train_vdcs)
y_pred_vdcs

array([[  9.99901917e-01,   9.80829717e-05],
       [  9.99874015e-01,   1.25985374e-04],
       [  9.99890534e-01,   1.09465545e-04],
       ..., 
       [  9.99889265e-01,   1.10735001e-04],
       [  9.99867918e-01,   1.32081535e-04],
       [  9.99873007e-01,   1.26992602e-04]])

In [246]:
logloss_vdcs = log_loss(y_pred=y_pred_vdcs, y_true=y_train_vdcs)

In [247]:
logloss_vdcs

0.00012028165367434853

### VDCS TEST

In [248]:
documents_test['vector_diff'] = np.subtract(documents_test['vectors_x'], documents_test['vectors_y'])

In [251]:
documents_test.head()

Unnamed: 0,pid,vectors_x,vectors_y,cosine_similarity,is_duplicate,vector_diff
0,0,"[-0.214728, -0.21874, 0.386611, 0.369, 0.07617...","[-0.662917, -0.25785, 0.199841, -0.454418, 0.5...",-0.052104,0.277446,"[0.44819, 0.0391104, 0.18677, 0.823418, -0.451..."
1,1,"[-0.205587, 1.22863, -0.00696208, 0.0571063, -...","[0.174078, 0.257381, 0.646523, 0.381952, -0.49...",0.262038,0.412398,"[-0.379665, 0.971253, -0.653485, -0.324845, -0..."
2,2,"[0.0120485, -0.649001, 0.621607, 0.261708, 0.3...","[0.731086, -1.45327, 0.494948, 0.412926, -0.23...",0.427748,0.49102,"[-0.719038, 0.80427, 0.126659, -0.151218, 0.61..."
3,3,"[0.251726, 0.81067, 0.212609, 0.0255506, -0.31...","[0.113517, 0.0238135, -0.167942, -0.193619, -0...",-0.058153,0.275124,"[0.138209, 0.786857, 0.38055, 0.219169, -0.241..."
4,4,"[-0.36815, -0.468827, 0.00867531, 0.0692847, 0...","[0.475766, -0.177542, 0.00298492, -0.25851, -0...",0.061502,0.32321,"[-0.843916, -0.291284, 0.00569039, 0.327795, 0..."


In [None]:
# expand vector_diffs into its own dataframe
vector_diffs_test = documents_test['vector_diff'].apply(pd.Series)

# rename each variable is vector_diffs
vector_diffs_test = vector_diffs_test.rename(columns = lambda x : 'vec_diff_' + str(x))

# view the vector_diffs dataframe
vector_diffs_test.head()

In [None]:
X_test_vdcs = vector_diffs_test
X_test_vdcs['cosine_similarity'] = documents_test['cosine_similarity']
X_test_vdcs.head()

In [None]:
y_pred_test_vdcs = model.predict_proba(X_test_vdcs)

In [None]:
submission02 = documents_test['pid']
submission02['is_duplicate'] = [row[1] for row in y_pred_test_vdcs]
submission02.columns = ['test_id', 'is_duplicate']
submission02.head()
submission02.to_csv('./submission02.csv', index=False)

In [1]:
documents_test.shape

NameError: name 'documents_test' is not defined