In [13]:
import nltk
import gensim
import pandas as pd
from nltk.tokenize import RegexpTokenizer
from nltk.stem import SnowballStemmer
from gensim.models.doc2vec import TaggedDocument

In [14]:
# Import training data into dataframe
TRAIN = pd.read_csv('../data.gi/train.csv')
TRAIN.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [15]:
# Create lookup table
lookup_df = TRAIN[['id', 'qid1', 'qid2', 'is_duplicate']]

In [16]:
# Separate q1 and q2 into respective dataframes
# Then stack, sort, and reindex new dataframe
q1_df = TRAIN[['id', 'qid1', 'question1']]
q1_df.columns = ['pid', 'qid', 'question']
q2_df = TRAIN[['id', 'qid2', 'question2']]
q2_df.columns = ['pid', 'qid', 'question']
questions_df = pd.concat([q1_df, q2_df], ignore_index=True).sort_values(by=['pid', 'qid']).reset_index(drop=True)
questions_df.head()

Unnamed: 0,pid,qid,question
0,0,1,What is the step by step guide to invest in sh...
1,0,2,What is the step by step guide to invest in sh...
2,1,3,What is the story of Kohinoor (Koh-i-Noor) Dia...
3,1,4,What would happen if the Indian government sto...
4,2,5,How can I increase the speed of my internet co...


In [17]:
# Parse to string, force lowercasing, tokenize, filter out stopwords, and stem
def preprocessText(questions):
    tokenizer = RegexpTokenizer(r'\w+')
    stemmer = SnowballStemmer('english')
    stopwords = set('for a of the and to in'.split(' '))
    cleaned = [str(question).lower() for question in questions]
    tokenized = [tokenizer.tokenize(question) for question in cleaned]
    filtered = [[token for token in tokens if token not in stopwords] for tokens in tokenized]
    stemmed = [[stemmer.stem(token) for token in tokens if token not in stopwords] for tokens in filtered]
    return stemmed

questions_df['tokens'] = preprocessText(questions_df['question'])
questions_df.head()

Unnamed: 0,pid,qid,question,tokens
0,0,1,What is the step by step guide to invest in sh...,"[what, is, step, by, step, guid, invest, share..."
1,0,2,What is the step by step guide to invest in sh...,"[what, is, step, by, step, guid, invest, share..."
2,1,3,What is the story of Kohinoor (Koh-i-Noor) Dia...,"[what, is, stori, kohinoor, koh, i, noor, diam..."
3,1,4,What would happen if the Indian government sto...,"[what, would, happen, if, indian, govern, stol..."
4,2,5,How can I increase the speed of my internet co...,"[how, can, i, increas, speed, my, internet, co..."


In [18]:
# Get list of tagged documents
taggedDocs = [TaggedDocument(row[4], str(row[2])) for row in questions_df.itertuples()]

In [19]:
# Build model
model = gensim.models.Doc2Vec(taggedDocs, workers=7, size= 20, alpha=0.025, min_alpha=0.025)

In [20]:
# Train model
for epoch in range(10):
    print ('Training epoch %s' % epoch)
    model.train(taggedDocs)
    model.alpha -= 0.002  # decrease the learning rate
    model.min_alpha = model.alpha  # fix the learning rate, no decay

Training epoch 0
Training epoch 1
Training epoch 2
Training epoch 3
Training epoch 4
Training epoch 5
Training epoch 6
Training epoch 7
Training epoch 8
Training epoch 9


In [21]:
# shows the similar words
print (model.most_similar('increas'))
 
# shows the learnt embedding
print (model['increas'])
 
# shows the similar docs with id = 2
print (model.docvecs.most_similar(str(2)))

[('reduc', 0.9100373387336731), ('boost', 0.8827394843101501), ('calcul', 0.8824217319488525), ('alloc', 0.8559360504150391), ('transfer', 0.8410112261772156), ('measur', 0.8238139152526855), ('handl', 0.8101462125778198), ('control', 0.8082395792007446), ('maxim', 0.8013314604759216), ('diminish', 0.7969419360160828)]
[ 0.04635077 -2.66627789  1.09804022 -3.83831191  3.95506978 -2.37412429
 -3.86691737  3.24838758  1.90225863  1.08853459  2.41925621 -1.51014245
  0.72561002  0.88563824  0.8014428  -8.41444588  0.9044345  -4.82455158
  0.57666326 -2.34827375]
[('8', 0.81833815574646), ('9', 0.8041192293167114), ('5', 0.745431125164032), ('3', 0.7058579921722412), ('0', 0.6976327300071716), ('1', 0.6937171816825867), ('4', 0.6884109973907471), ('6', 0.6852712035179138), ('7', 0.5872551202774048)]


In [22]:
model.save('./doc2vec_model_01')