In [9]:
import nltk
import gensim
import pandas as pd
from nltk.tokenize import RegexpTokenizer
from nltk.stem import SnowballStemmer
from gensim.models.doc2vec import TaggedDocument

In [10]:
# Import training data into dataframe
TRAIN = pd.read_csv('../data.gi/train.csv')
TRAIN.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [11]:
# Create lookup table
lookup_df = TRAIN[['id', 'qid1', 'qid2', 'is_duplicate']]

In [12]:
# Separate q1 and q2 into respective dataframes
# Then stack, sort, and reindex new dataframe
q1_df = TRAIN[['id', 'qid1', 'question1']]
q1_df.columns = ['pid', 'qid', 'question']
q2_df = TRAIN[['id', 'qid2', 'question2']]
q2_df.columns = ['pid', 'qid', 'question']
questions_df = pd.concat([q1_df, q2_df], ignore_index=True).sort_values(by=['pid', 'qid']).reset_index(drop=True)
questions_df.head()

Unnamed: 0,pid,qid,question
0,0,1,What is the step by step guide to invest in sh...
1,0,2,What is the step by step guide to invest in sh...
2,1,3,What is the story of Kohinoor (Koh-i-Noor) Dia...
3,1,4,What would happen if the Indian government sto...
4,2,5,How can I increase the speed of my internet co...


In [13]:
# Parse to string, force lowercasing, tokenize, filter out stopwords, and stem
def preprocessText(questions):
    tokenizer = RegexpTokenizer(r'\w+')
    stemmer = SnowballStemmer('english')
    stopwords = set('for a of the and to in'.split(' '))
    cleaned = [str(question).lower() for question in questions]
    tokenized = [tokenizer.tokenize(question) for question in cleaned]
    filtered = [[token for token in tokens if token not in stopwords] for tokens in tokenized]
    stemmed = [[stemmer.stem(token) for token in tokens if token not in stopwords] for tokens in filtered]
    return stemmed

questions_df['tokens'] = preprocessText(questions_df['question'])
questions_df.head()

Unnamed: 0,pid,qid,question,tokens
0,0,1,What is the step by step guide to invest in sh...,"[what, is, step, by, step, guid, invest, share..."
1,0,2,What is the step by step guide to invest in sh...,"[what, is, step, by, step, guid, invest, share..."
2,1,3,What is the story of Kohinoor (Koh-i-Noor) Dia...,"[what, is, stori, kohinoor, koh, i, noor, diam..."
3,1,4,What would happen if the Indian government sto...,"[what, would, happen, if, indian, govern, stol..."
4,2,5,How can I increase the speed of my internet co...,"[how, can, i, increas, speed, my, internet, co..."


In [14]:
# Get list of tagged documents
taggedDocs = [TaggedDocument(row[4], str(row[2])) for row in questions_df.itertuples()]

In [15]:
# Build model
model = gensim.models.Doc2Vec(taggedDocs, workers=7, size= 50, alpha=0.025, min_alpha=0.025)

In [20]:
# Train model
for epoch in range(50):
    print ('Training epoch %s' % epoch)
    model.train(taggedDocs)
    model.alpha -= 0.002  # decrease the learning rate
    model.min_alpha = model.alpha  # fix the learning rate, no decay

Training epoch 0
Training epoch 1
Training epoch 2
Training epoch 3
Training epoch 4
Training epoch 5
Training epoch 6
Training epoch 7
Training epoch 8
Training epoch 9
Training epoch 10
Training epoch 11
Training epoch 12
Training epoch 13
Training epoch 14
Training epoch 15
Training epoch 16
Training epoch 17
Training epoch 18
Training epoch 19
Training epoch 20
Training epoch 21
Training epoch 22
Training epoch 23
Training epoch 24
Training epoch 25
Training epoch 26
Training epoch 27
Training epoch 28
Training epoch 29
Training epoch 30
Training epoch 31
Training epoch 32
Training epoch 33
Training epoch 34
Training epoch 35
Training epoch 36
Training epoch 37
Training epoch 38
Training epoch 39
Training epoch 40
Training epoch 41
Training epoch 42
Training epoch 43
Training epoch 44
Training epoch 45
Training epoch 46
Training epoch 47
Training epoch 48
Training epoch 49


In [21]:
# shows the similar words
print (model.most_similar('increas'))
 
# shows the learnt embedding
print (model['increas'])
 
# shows the similar docs with id = 2
print (model.docvecs.most_similar(str(2)))

[('reduc', 0.8069440722465515), ('decreas', 0.7137227058410645), ('chang', 0.7093374729156494), ('boost', 0.7023246884346008), ('improv', 0.6826469302177429), ('lower', 0.6556110382080078), ('regain', 0.6458404064178467), ('gain', 0.6406462788581848), ('measur', 0.636250376701355), ('determin', 0.6260837316513062)]
[-2.05986357  0.03243658 -1.37755215  2.52617979 -1.02827513  1.18709183
  4.62784719  3.66332197  1.95711815 -2.63808966  2.0665307   1.81047201
 -1.60181689 -2.94268322  1.22515988 -2.61498785 -2.25240827 -1.57505798
 -1.3084954   2.00848222  1.78252947  0.75253165  0.45351121  4.0585537
  1.36573446 -0.85397917 -3.45068216 -0.211909   -1.41552711 -1.31944919
 -1.56268156  3.32461619 -3.5697279   7.84070349 -1.56449115 -2.83459806
 -0.45440879 -2.97191691  0.05383747 -3.25249934  1.54816508  2.80941129
 -0.19263583 -1.83397758 -1.17932796  2.04775095  3.00896835 -2.60724998
 -1.22312009 -3.1659565 ]
[('4', 0.9981161952018738), ('0', 0.9981117248535156), ('5', 0.99798917770

In [22]:
model.save('./doc2vec_models/doc2vec_model_03')