In [11]:
import nltk            # natural language tool kit
import numpy as np     # support for large data structures
import pandas as pd    # data structure support
import string          # various string functions
import difflib         # classes and functions for comparing sequences
from sklearn.metrics import log_loss    # used in measurement / scoring
from sklearn.metrics import classification_report   # among other things, provides accuracy and f1

# Libraries for Doc2Vec processing
from gensim import models, corpora, similarities
from gensim.models.doc2vec import TaggedDocument, Doc2Vec
from nltk.tokenize import TweetTokenizer
import pyemd
import csv
import multiprocessing

cores = multiprocessing.cpu_count()
print cores

8


In [12]:
train = pd.read_csv('Data/train.csv')
train.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [23]:
with open('Data/train.csv', 'rU') as f:
    f.next()
    reader = csv.reader(f)
    qpairs = list(reader)
   
tknzr = TweetTokenizer()
qcorp = []
qdict = {}
for qpair in qpairs:
    qid = qpair[1]
    qwords = tknzr.tokenize(qpair[3])
    q = TaggedDocument(words=qwords, tags=["QID_"+str(qid)])
    qcorp.append(q)
    qdict["QID_"+str(qid)] = qwords
    qid = qpair[2]
    qwords = tknzr.tokenize(qpair[4])
    q = TaggedDocument(words=qwords, tags=["QID_"+str(qid)])
    qcorp.append(q)
    qdict["QID_"+str(qid)] = qwords




In [182]:
model = Doc2Vec(size=100, window=5, negative=5, sample=1e-4, min_count=2, workers=cores, alpha=0.025, min_alpha=0.025)
model.build_vocab(qcorp)
for epoch in range(10):
    model.train(qcorp)
    model.alpha -= 0.002
    model.min_alpha = model.alpha
    model.train(qcorp)

iteration 1
iteration 2
iteration 3
iteration 4
iteration 5
iteration 6
iteration 7
iteration 8
iteration 9
iteration 10


In [183]:
model.save("doc2vec.model")

In [4]:
model = Doc2Vec.load("doc2vec.model")

In [5]:
vectors = model.docvecs

In [26]:
dv_results = []
for i in range(len(qpairs)):
    qid1 = "QID_"+str(qpairs[i][1])
    qid2 = "QID_"+str(qpairs[i][2])
    dv_similarity = vectors.similarity(qid1, qid2)
    dv_results.append([i, dv_similarity])
    #qpairs[i].extend([score])

In [27]:
actuals = np.array(train['is_duplicate'])
n_dv_results = np.array(dv_results)

predictions_dv = n_dv_results[:,1]

In [28]:
# build a function to run scoring
# function takes an array of actuals, predicted values (as percentages, not absolute), and a probability 
# probability represents value over which over which we assume = 1)
# percentages are converted to absolute values (0 or 1) with the function

def score(actuals, predictions, probability):
    score_sm = log_loss(actuals, predictions)
    print ('log loss score is: %3f' %(score_sm))
    
    n_abs = np.where(predictions > probability, 1, 0)
    
    total_wrong = np.sum(np.not_equal(actuals, n_abs))
    print ('number of incorrect predictions is: %3d' %(total_wrong))
    total = len(actuals)
    correct = total - total_wrong
    print ('total: %3d  correct: %3d  accuracy: %3.2f \n' %(total, correct, 1.0*correct/total))
    print classification_report(actuals, n_abs)

In [29]:
score(actuals, predictions_dv, .50)

log loss score is: 0.661802
number of incorrect predictions is: 136267
total: 404351  correct: 268084  accuracy: 0.66 

             precision    recall  f1-score   support

          0       0.83      0.59      0.69    255045
          1       0.53      0.79      0.63    149306

avg / total       0.72      0.66      0.67    404351



In [25]:
for i in range(25):
    print qpairs[i]

['0', '1', '2', 'What is the step by step guide to invest in share market in india?', 'What is the step by step guide to invest in share market?', '0']
['1', '3', '4', 'What is the story of Kohinoor (Koh-i-Noor) Diamond?', 'What would happen if the Indian government stole the Kohinoor (Koh-i-Noor) diamond back?', '0']
['2', '5', '6', 'How can I increase the speed of my internet connection while using a VPN?', 'How can Internet speed be increased by hacking through DNS?', '0']
['3', '7', '8', 'Why am I mentally very lonely? How can I solve it?', 'Find the remainder when [math]23^{24}[/math] is divided by 24,23?', '0']
['4', '9', '10', 'Which one dissolve in water quikly sugar, salt, methane and carbon di oxide?', 'Which fish would survive in salt water?', '0']
['5', '11', '12', 'Astrology: I am a Capricorn Sun Cap moon and cap rising...what does that say about me?', "I'm a triple Capricorn (Sun, Moon and ascendant in Capricorn) What does this say about me?", '1']
['6', '13', '14', 'Shou

In [187]:
wordvectors = model.wv

In [193]:
print wordvectors.most_similar(["UPSC"])

[(u'IAS', 0.8955783247947693), (u'CAT', 0.843267560005188), (u'GATE', 0.8333038091659546), (u'CSE', 0.8325021266937256), (u'exams', 0.8290932178497314), (u'IIT', 0.8272045850753784), (u'SSC', 0.8266948461532593), (u'exam', 0.8232311010360718), (u'graduation', 0.8177750110626221), (u'college', 0.814316987991333)]


In [31]:
test = "QID_476440"
print qdict[test]
for q in vectors.most_similar(["QID_476440"]):
    print q, qdict[q[0]]

[u'How', u'can', u'I', u'be', u'a', u'good', u'software', u'engineer', u'?']
('QID_384999', 0.9501305818557739) [u'How', u'can', u'I', u'be', u'a', u'good', u'software', u'engineer', u'?']
('QID_449616', 0.9494233131408691) [u'How', u'can', u'I', u'be', u'a', u'good', u'software', u'engineer', u'?']
('QID_717572', 0.9488134980201721) [u'How', u'can', u'I', u'be', u'a', u'good', u'software', u'engineer', u'?']
('QID_19818', 0.9455262422561646) [u'How', u'can', u'I', u'be', u'a', u'good', u'software', u'engineer', u'?']
('QID_387741', 0.9427506923675537) [u'How', u'can', u'I', u'be', u'a', u'good', u'software', u'engineer', u'?']
('QID_627259', 0.9392973780632019) [u'How', u'can', u'I', u'be', u'a', u'good', u'software', u'engineer', u'?']
('QID_131300', 0.9286038279533386) [u'How', u'can', u'I', u'be', u'good', u'computer', u'engineer', u'?']
('QID_604540', 0.9001113176345825) [u'How', u'can', u'I', u'be', u'good', u'engineer', u'?']
('QID_483489', 0.8802008628845215) [u'How', u'can', u