In [1]:
#Code reference: https://radimrehurek.com/gensim/auto_examples/tutorials/run_doc2vec_lee.html#sphx-glr-auto-examples-tutorials-run-doc2vec-lee-py
#Code reference: https://www.tutorialspoint.com/gensim/gensim_doc2vec_model.htm

### Importing Libraries

In [2]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import re
import gensim

### Reading File

In [3]:
with open('C:/Users/daksh/OneDrive/Desktop/sts2016-english-with-gs-v1.0/sts2016-english-with-gs-v1.0/STS2016.input.answer-answer.txt', 'r', encoding='utf-8') as file:
    data = file.read()

In [4]:
data[0:1000]

"Tasting it is the only reliable way.\tThe way you have it is fine.\tStackExchange Network: http://cooking.stackexchange.com/questions/5611 Author: Krister Olsson (http://cooking.stackexchange.com/users/1220)\tStackExchange Network: http://writers.stackexchange.com/questions/11017 Author: Lauren Ipsum (http://writers.stackexchange.com/users/553)\nI think it probably depends on your money.\tIt depends on your country.\tStackExchange Network: http://workplace.stackexchange.com/questions/1755 Author: Michael Durrant (http://workplace.stackexchange.com/users/126)\tStackExchange Network: http://travel.stackexchange.com/questions/45030 Author: Vince (http://travel.stackexchange.com/users/3044) Last Editor: Vince (http://travel.stackexchange.com/users/3044)\nYou need to read a lot to know what you like and what you don't.\tYou don't have to know.\tStackExchange Network: http://writers.stackexchange.com/questions/12166 Author: Lexi (http://writers.stackexchange.com/users/2410)\tStackExchange N

In [5]:
len(data)

621616

### Pre-process and Tag the data

In [6]:
tp1 = re.sub(r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', '', data)
tp2 = re.sub(r'\d+', '', tp1)
clean= re.compile('<.*?>')
tp4 = re.sub(clean, '', tp2)
tp5 = tp4.lower()
tp7 = re.sub(r'[^\w\s]', '',tp5)
tp8 = re.sub(r'[^\x00-\x7f]',r'', tp7)
tp9 = tp8.replace("_","")

stop_words = set(stopwords.words('english'))

In [7]:
with open("doc2vec1.txt", 'w') as file:
    file.write("%s\n" % tp9)

In [8]:
df = pd.read_csv('doc2vec1.txt', sep='\t')
df.columns = ['First_Question','Second_Question','irr1','irr2']
df.head()

Unnamed: 0,First_Question,Second_Question,irr1,irr2
0,i think it probably depends on your money,it depends on your country,stackexchange network author michael durrant,stackexchange network author vince last edit...
1,you need to read a lot to know what you like a...,you dont have to know,stackexchange network author lexi,stackexchange network author bigpants last e...
2,obviously the best book for you depends a lot ...,the answer will depend of course on what youre...,stackexchange network author josay last edit...,stackexchange network author karlson
3,ive had this same problem,i had the same problem as you,stackexchange network author joseph,stackexchange network author izzydorio last ...
4,if you are not sure how to do it dont do it at...,also if you die you dont have to repay dont kn...,stackexchange network author adrian last edi...,stackexchange network author littleadv


In [9]:
df['first_removedStop'] = df['First_Question'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))
df['second_removedStop'] = df['Second_Question'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))

In [10]:
df.drop(columns = ['First_Question','Second_Question','irr1','irr2'], axis=1, inplace=True)

In [11]:
df.shape

(1571, 2)

In [12]:
import string

corpus = []

for index, row in df.iterrows():
    corpus.append(row['first_removedStop'])
    corpus.append(row['second_removedStop'])

In [13]:
corpus[0:5]

['think probably depends money',
 'depends country',
 'need read lot know like dont',
 'dont know',
 'obviously best book depends lot looking']

In [14]:
import string

questions1 = df['first_removedStop'].to_list()
questions2 = df['second_removedStop'].to_list()

In [15]:
type(questions1)

list

In [16]:
#ls = ["This","is","list","of", "words"]

def tagged_document(list_of_list_of_words):
    for i, list_of_words in enumerate(list_of_list_of_words):
        tokens = gensim.utils.simple_preprocess(list_of_words)
        yield gensim.models.doc2vec.TaggedDocument(tokens, [i])

data_for_training = list(tagged_document(corpus))

In [17]:
print(data_for_training[:3])

[TaggedDocument(words=['think', 'probably', 'depends', 'money'], tags=[0]), TaggedDocument(words=['depends', 'country'], tags=[1]), TaggedDocument(words=['need', 'read', 'lot', 'know', 'like', 'dont'], tags=[2])]


In [18]:
model = Doc2Vec(vector_size=50, min_count=2, epochs=40)
model.build_vocab(data_for_training)

model.train(data_for_training, total_examples=model.corpus_count, epochs=model.epochs)

#embeddings2 = model.train(quest2, total_examples=model.corpus_count, epochs=model.epochs)

In [19]:
model.dv.most_similar(0)

[(760, 0.9611252546310425),
 (2678, 0.9599332809448242),
 (2293, 0.9598993062973022),
 (1165, 0.9537076950073242),
 (2679, 0.9518812298774719),
 (1504, 0.9515618681907654),
 (1014, 0.9514199495315552),
 (3034, 0.9505453109741211),
 (2613, 0.9504989385604858),
 (923, 0.9494186043739319)]

In [20]:
data_for_training[0]

TaggedDocument(words=['think', 'probably', 'depends', 'money'], tags=[0])

In [21]:
data_for_training[340]

TaggedDocument(words=['think', 'probably', 'depends', 'money'], tags=[340])

In [22]:
len(model.dv)

3142

### Evaluation

In [23]:
tokenized_ques1 =[]
for question in questions1:
    tokenized_ques1.append(gensim.utils.simple_preprocess(question))
len(tokenized_ques1)

1571

In [24]:
tokenized_ques2 =[]
for question in questions2:
    tokenized_ques2.append(gensim.utils.simple_preprocess(question))
len(tokenized_ques2)

1571

In [25]:
from scipy import spatial

sim_scores=[]

#tokenized_ques2[0:2]

length_ques1 = len(tokenized_ques1)

for i in range(0,length_ques1):
    #Some part taken from https://stackoverflow.com/questions/53503049/measure-similarity-between-two-documents-using-doc2vec
    vec1 = model.infer_vector(tokenized_ques1[i])
    vec2 = model.infer_vector(tokenized_ques2[i])
    cos_distance = spatial.distance.cosine(vec1, vec2)
    cos_sim = 1-cos_distance
    sim_scores.append(cos_sim)

In [26]:
sim_scores = [(score*5) for score in sim_scores]
sim_scores[0:50]

[4.023880064487457,
 4.697512686252594,
 3.0005133152008057,
 4.206242561340332,
 4.227454960346222,
 1.0262122750282288,
 0.5122652277350426,
 4.872941374778748,
 4.841373860836029,
 0.14943737536668777,
 4.280194044113159,
 3.523646891117096,
 4.44063663482666,
 -0.5134334787726402,
 2.953958809375763,
 4.785194993019104,
 4.303717911243439,
 4.056562185287476,
 4.015588760375977,
 4.043763279914856,
 4.933140873908997,
 4.428889453411102,
 4.678522050380707,
 3.9345359802246094,
 4.006284773349762,
 4.946971833705902,
 -0.5810559168457985,
 3.807748854160309,
 1.0831334441900253,
 3.3482518792152405,
 1.4485421776771545,
 3.90752375125885,
 3.6741408705711365,
 4.527727365493774,
 4.395000338554382,
 3.419620394706726,
 1.1449941247701645,
 4.03300017118454,
 4.863674342632294,
 3.0267858505249023,
 3.3053383231163025,
 4.870506823062897,
 4.74921852350235,
 4.570346176624298,
 3.861049711704254,
 2.6017963886260986,
 3.94627183675766,
 -0.0019808085926342756,
 4.786378741264343,
 1

In [27]:
with open('C:/Users/daksh/OneDrive/Desktop/sts2016-english-with-gs-v1.0/sts2016-english-with-gs-v1.0/STS2016.input.answer-answer.cosine.txt', 'w') as file:
    for score in sim_scores:
        file.write('%s\n' %score)