In [1]:
#Code reference: https://radimrehurek.com/gensim/auto_examples/tutorials/run_doc2vec_lee.html#sphx-glr-auto-examples-tutorials-run-doc2vec-lee-py
#Code reference: https://www.tutorialspoint.com/gensim/gensim_doc2vec_model.htm

### Importing Libraries

In [2]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import re
import gensim

### Reading File

In [3]:
with open('C:/Users/daksh/OneDrive/Desktop/sts2016-english-with-gs-v1.0/sts2016-english-with-gs-v1.0/STS2016.input.headlines.txt', 'r', encoding='utf-8') as file:
    data = file.read()

In [4]:
data[0:1000]

'Digital era threatens tenuous future of drive-ins\tDigital Era Threatens Future of Drive-Ins\tEurope Media Monitor (http://emm.newsbrief.eu)\tEurope Media Monitor (http://emm.newsbrief.eu)\nJessica Lal murder convict Manu Sharma gets 15-day parole\tJessica murder Manu Sharma gets parole\tEurope Media Monitor (http://emm.newsbrief.eu)\tEurope Media Monitor (http://emm.newsbrief.eu)\nUN to hold emergency DR Congo talks\tU.N. Council to Hold Emergency DR Congo Talks\tEurope Media Monitor (http://emm.newsbrief.eu)\tEurope Media Monitor (http://emm.newsbrief.eu)\nIran and IAEA resume nuclear talks\tIran, IAEA resume nuclear talks in Tehran\tEurope Media Monitor (http://emm.newsbrief.eu)\tEurope Media Monitor (http://emm.newsbrief.eu)\nThai protesters storm army headquarters\tThai protestors storm Royal army headquarters in Bangkok\tEurope Media Monitor (http://emm.newsbrief.eu)\tEurope Media Monitor (http://emm.newsbrief.eu)\n40 Still Missing in Deadly Canada Oil Train...\t40 still missing

In [5]:
len(data)

289000

### Pre-process and Tag the data

In [6]:
tp1 = re.sub(r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', '', data)
tp2 = re.sub(r'\d+', '', tp1)
clean= re.compile('<.*?>')
tp4 = re.sub(clean, '', tp2)
tp5 = tp4.lower()
tp7 = re.sub(r'[^\w\s]', '',tp5)
tp8 = re.sub(r'[^\x00-\x7f]',r'', tp7)
tp9 = tp8.replace("_","")

stop_words = set(stopwords.words('english'))

In [7]:
with open("doc2vec2.txt", 'w') as file:
    file.write("%s\n" % tp9)

In [8]:
df = pd.read_csv('doc2vec2.txt', sep='\t')
df.columns = ['First_Question','Second_Question','irr1','irr2']
df.head()

Unnamed: 0,First_Question,Second_Question,irr1,irr2
0,jessica lal murder convict manu sharma gets da...,jessica murder manu sharma gets parole,europe media monitor,europe media monitor
1,un to hold emergency dr congo talks,un council to hold emergency dr congo talks,europe media monitor,europe media monitor
2,iran and iaea resume nuclear talks,iran iaea resume nuclear talks in tehran,europe media monitor,europe media monitor
3,thai protesters storm army headquarters,thai protestors storm royal army headquarters ...,europe media monitor,europe media monitor
4,still missing in deadly canada oil train,still missing in deadly canada oil train crash,europe media monitor,europe media monitor


In [9]:
df['first_removedStop'] = df['First_Question'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))
df['second_removedStop'] = df['Second_Question'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))

In [10]:
df.drop(columns = ['First_Question','Second_Question','irr1','irr2'], axis=1, inplace=True)

In [11]:
df.shape

(1497, 2)

In [12]:
import string

corpus = []

for index, row in df.iterrows():
    corpus.append(row['first_removedStop'])
    corpus.append(row['second_removedStop'])

In [13]:
corpus[0:5]

['jessica lal murder convict manu sharma gets day parole',
 'jessica murder manu sharma gets parole',
 'un hold emergency dr congo talks',
 'un council hold emergency dr congo talks',
 'iran iaea resume nuclear talks']

In [14]:
import string

questions1 = df['first_removedStop'].to_list()
questions2 = df['second_removedStop'].to_list()

In [15]:
type(questions1)

list

In [16]:
#ls = ["This","is","list","of", "words"]

def tagged_document(list_of_list_of_words):
    for i, list_of_words in enumerate(list_of_list_of_words):
        tokens = gensim.utils.simple_preprocess(list_of_words)
        yield gensim.models.doc2vec.TaggedDocument(tokens, [i])

data_for_training = list(tagged_document(corpus))

In [17]:
print(data_for_training[:3])

[TaggedDocument(words=['jessica', 'lal', 'murder', 'convict', 'manu', 'sharma', 'gets', 'day', 'parole'], tags=[0]), TaggedDocument(words=['jessica', 'murder', 'manu', 'sharma', 'gets', 'parole'], tags=[1]), TaggedDocument(words=['un', 'hold', 'emergency', 'dr', 'congo', 'talks'], tags=[2])]


In [18]:
model = Doc2Vec(vector_size=50, min_count=2, epochs=40)
model.build_vocab(data_for_training)

model.train(data_for_training, total_examples=model.corpus_count, epochs=model.epochs)

#embeddings2 = model.train(quest2, total_examples=model.corpus_count, epochs=model.epochs)

In [19]:
model.dv.most_similar(0)

[(1882, 0.9932394623756409),
 (1, 0.9930800199508667),
 (1623, 0.9928853511810303),
 (1745, 0.9923970103263855),
 (863, 0.9920471906661987),
 (913, 0.991890013217926),
 (908, 0.9918063879013062),
 (1387, 0.9916388988494873),
 (1750, 0.9912846684455872),
 (875, 0.9911544322967529)]

In [20]:
data_for_training[0]

TaggedDocument(words=['jessica', 'lal', 'murder', 'convict', 'manu', 'sharma', 'gets', 'day', 'parole'], tags=[0])

In [21]:
data_for_training[1132]

TaggedDocument(words=['saudi', 'womens', 'driving', 'kicks', 'without', 'arrests'], tags=[1132])

In [22]:
len(model.dv)

2994

### Evaluation

In [23]:
tokenized_ques1 =[]
for question in questions1:
    tokenized_ques1.append(gensim.utils.simple_preprocess(question))
len(tokenized_ques1)

1497

In [24]:
tokenized_ques2 =[]
for question in questions2:
    tokenized_ques2.append(gensim.utils.simple_preprocess(question))
len(tokenized_ques2)

1497

In [25]:
from scipy import spatial

sim_scores=[]

#tokenized_ques2[0:2]

length_ques1 = len(tokenized_ques1)

for i in range(0,length_ques1):
    #Some part taken from https://stackoverflow.com/questions/53503049/measure-similarity-between-two-documents-using-doc2vec
    vec1 = model.infer_vector(tokenized_ques1[i])
    vec2 = model.infer_vector(tokenized_ques2[i])
    cos_distance = spatial.distance.cosine(vec1, vec2)
    cos_sim = 1-cos_distance
    sim_scores.append(cos_sim)

In [26]:
sim_scores = [(score*5) for score in sim_scores]
sim_scores[0:50]

[4.53856498003006,
 4.954219162464142,
 4.441409409046173,
 4.160660803318024,
 4.841306209564209,
 4.965546429157257,
 4.87661749124527,
 4.793577492237091,
 4.628312289714813,
 4.946720004081726,
 4.221621453762054,
 4.800498187541962,
 4.9482908844947815,
 4.325942397117615,
 4.869650900363922,
 4.475871920585632,
 4.907644987106323,
 4.846101403236389,
 3.8619956374168396,
 4.967547357082367,
 4.861205518245697,
 4.841703772544861,
 4.659973978996277,
 4.744198024272919,
 4.902806878089905,
 4.669161438941956,
 4.575960040092468,
 4.80020135641098,
 3.2185643911361694,
 4.988655745983124,
 4.9781882762908936,
 4.89604264497757,
 4.274393618106842,
 4.978205561637878,
 4.909103512763977,
 3.395756185054779,
 3.6796683073043823,
 4.219439625740051,
 4.693052172660828,
 4.765040576457977,
 4.269209206104279,
 4.551390111446381,
 4.319972097873688,
 4.938136041164398,
 4.882149398326874,
 4.937873482704163,
 4.853110611438751,
 4.727130830287933,
 2.790728807449341,
 3.457280695438385]

In [27]:
with open('C:/Users/daksh/OneDrive/Desktop/sts2016-english-with-gs-v1.0/sts2016-english-with-gs-v1.0/STS2016.input.headlines.cosine.txt', 'w') as file:
    for score in sim_scores:
        file.write('%s\n' %score)