In [1]:
#Code reference: https://radimrehurek.com/gensim/auto_examples/tutorials/run_doc2vec_lee.html#sphx-glr-auto-examples-tutorials-run-doc2vec-lee-py
#Code reference: https://www.tutorialspoint.com/gensim/gensim_doc2vec_model.htm

### Importing Libraries

In [2]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import re
import gensim

### Reading File

In [3]:
with open('C:/Users/daksh/OneDrive/Desktop/sts2016-english-with-gs-v1.0/sts2016-english-with-gs-v1.0/STS2016.input.question-question.txt', 'r', encoding='utf-8') as file:
    data = file.read()

In [4]:
data[0:1000]

'Should I drink water during my workout?\tHow can I get my toddler to drink more water?\tStackExchange Network: http://fitness.stackexchange.com/questions/1902 Author: Rogach (http://fitness.stackexchange.com/users/132) Last Editor: Nathan Wheeler (http://fitness.stackexchange.com/users/21)\tStackExchange Network: http://parenting.stackexchange.com/questions/11704 Author: I Like to Code (http://parenting.stackexchange.com/users/4718)\nHow can I put something in book format without "publishing" it?\tHow can I "time-stamp" my data without publishing it?\tStackExchange Network: http://writers.stackexchange.com/questions/5043 Author: justkt (http://writers.stackexchange.com/users/20) Last Editor: cwallenpoole (http://writers.stackexchange.com/users/734)\tStackExchange Network: http://academia.stackexchange.com/questions/23367 Author: Quora Feans (http://academia.stackexchange.com/users/8970) Last Editor: aeismail (http://academia.stackexchange.com/users/53)\nHow do I stop my dog from jumpi

In [5]:
len(data)

727101

### Pre-process and Tag the data

In [6]:
tp1 = re.sub(r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', '', data)
tp2 = re.sub(r'\d+', '', tp1)
clean= re.compile('<.*?>')
tp4 = re.sub(clean, '', tp2)
tp5 = tp4.lower()
tp7 = re.sub(r'[^\w\s]', '',tp5)
tp8 = re.sub(r'[^\x00-\x7f]',r'', tp7)
tp9 = tp8.replace("_","")

stop_words = set(stopwords.words('english'))

In [7]:
with open("doc2vec5.txt", 'w') as file:
    file.write("%s\n" % tp9)

In [8]:
df = pd.read_csv('doc2vec5.txt', sep='\t')
df.columns = ['First_Question','Second_Question','irr1','irr2']
df.head()

Unnamed: 0,First_Question,Second_Question,irr1,irr2
0,how can i put something in book format without...,how can i timestamp my data without publishing it,stackexchange network author justkt last edi...,stackexchange network author quora feans las...
1,how do i stop my dog from jumping on me,how do i make my dog forget a command,stackexchange network author user last edito...,stackexchange network author thinly veiled qu...
2,whats the best way to store asparagus,whats the correct way to store fats,stackexchange network author katiek,stackexchange network author andres jaan tack
3,how do i make a height adjustable desk,how can i build a wall mounted adjustable heig...,stackexchange network author evan last edito...,stackexchange network author davemackey last...
4,what is the best time and temperature for tast...,what is the best oil to use when cooking in a wok,stackexchange network author mark rogers las...,stackexchange network author lomaxx


In [9]:
df['first_removedStop'] = df['First_Question'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))
df['second_removedStop'] = df['Second_Question'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))

In [10]:
df.drop(columns = ['First_Question','Second_Question','irr1','irr2'], axis=1, inplace=True)

In [11]:
df.shape

(1554, 2)

In [12]:
import string

corpus = []

for index, row in df.iterrows():
    corpus.append(row['first_removedStop'])
    corpus.append(row['second_removedStop'])

In [13]:
corpus[0:5]

['put something book format without publishing',
 'timestamp data without publishing',
 'stop dog jumping',
 'make dog forget command',
 'whats best way store asparagus']

In [14]:
import string

questions1 = df['first_removedStop'].to_list()
questions2 = df['second_removedStop'].to_list()

In [15]:
type(questions1)

list

In [16]:
#ls = ["This","is","list","of", "words"]

def tagged_document(list_of_list_of_words):
    for i, list_of_words in enumerate(list_of_list_of_words):
        tokens = gensim.utils.simple_preprocess(list_of_words)
        yield gensim.models.doc2vec.TaggedDocument(tokens, [i])

data_for_training = list(tagged_document(corpus))

In [17]:
print(data_for_training[:3])

[TaggedDocument(words=['put', 'something', 'book', 'format', 'without', 'publishing'], tags=[0]), TaggedDocument(words=['timestamp', 'data', 'without', 'publishing'], tags=[1]), TaggedDocument(words=['stop', 'dog', 'jumping'], tags=[2])]


In [18]:
model = Doc2Vec(vector_size=50, min_count=2, epochs=40)
model.build_vocab(data_for_training)

model.train(data_for_training, total_examples=model.corpus_count, epochs=model.epochs)

#embeddings2 = model.train(quest2, total_examples=model.corpus_count, epochs=model.epochs)

In [19]:
model.dv.most_similar(0)

[(2588, 0.9915246367454529),
 (2495, 0.9904581308364868),
 (1839, 0.9898824095726013),
 (2461, 0.9889029860496521),
 (35, 0.9885929226875305),
 (506, 0.9885508418083191),
 (2748, 0.9883555769920349),
 (2672, 0.988191545009613),
 (1952, 0.9879448413848877),
 (1227, 0.9879159927368164)]

In [20]:
data_for_training[0]

TaggedDocument(words=['put', 'something', 'book', 'format', 'without', 'publishing'], tags=[0])

In [21]:
data_for_training[1132]

TaggedDocument(words=['whats', 'best', 'way', 'seal', 'around', 'faucet'], tags=[1132])

In [22]:
len(model.dv)

3108

### Evaluation

In [23]:
tokenized_ques1 =[]
for question in questions1:
    tokenized_ques1.append(gensim.utils.simple_preprocess(question))
len(tokenized_ques1)

1554

In [24]:
tokenized_ques2 =[]
for question in questions2:
    tokenized_ques2.append(gensim.utils.simple_preprocess(question))
len(tokenized_ques2)

1554

In [25]:
from scipy import spatial

sim_scores=[]

#tokenized_ques2[0:2]

length_ques1 = len(tokenized_ques1)

for i in range(0,length_ques1):
    #Some part taken from https://stackoverflow.com/questions/53503049/measure-similarity-between-two-documents-using-doc2vec
    vec1 = model.infer_vector(tokenized_ques1[i])
    vec2 = model.infer_vector(tokenized_ques2[i])
    cos_distance = spatial.distance.cosine(vec1, vec2)
    cos_sim = 1-cos_distance
    sim_scores.append(cos_sim)

In [26]:
sim_scores = [(score*5) for score in sim_scores]
sim_scores[0:50]

[4.4195228815078735,
 4.11289244890213,
 2.8517255187034607,
 3.625454008579254,
 1.3545145094394684,
 2.1957167983055115,
 4.891596734523773,
 4.336508214473724,
 3.289390504360199,
 4.609622359275818,
 3.7830546498298645,
 4.756889045238495,
 2.4993523955345154,
 4.135308265686035,
 3.7410685420036316,
 -0.9799274802207947,
 2.2923652827739716,
 4.882136583328247,
 4.242141842842102,
 4.288155734539032,
 4.645232856273651,
 4.960460066795349,
 4.79410320520401,
 4.938544034957886,
 3.3315542340278625,
 4.332265555858612,
 4.144156575202942,
 4.855320155620575,
 2.3628291487693787,
 3.1481218338012695,
 4.924359619617462,
 4.190070629119873,
 3.966730833053589,
 3.5004907846450806,
 2.870563268661499,
 4.10126805305481,
 2.329670935869217,
 4.760605692863464,
 2.9147791862487793,
 3.7884649634361267,
 2.212234139442444,
 4.038016200065613,
 4.742093980312347,
 2.9938197135925293,
 3.662436306476593,
 1.0018041729927063,
 4.240373969078064,
 4.488396048545837,
 3.979889750480652,
 4.58

In [27]:
with open('C:/Users/daksh/OneDrive/Desktop/sts2016-english-with-gs-v1.0/sts2016-english-with-gs-v1.0/STS2016.input.question-question.cosine.txt', 'w') as file:
    for score in sim_scores:
        file.write('%s\n' %score)