In [1]:
#Code reference: https://radimrehurek.com/gensim/auto_examples/tutorials/run_doc2vec_lee.html#sphx-glr-auto-examples-tutorials-run-doc2vec-lee-py
#Code reference: https://www.tutorialspoint.com/gensim/gensim_doc2vec_model.htm

### Importing Libraries

In [2]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import re
import gensim

### Reading File

In [3]:
with open('C:/Users/daksh/OneDrive/Desktop/sts2016-english-with-gs-v1.0/sts2016-english-with-gs-v1.0/STS2016.input.postediting.txt', 'r', encoding='utf-8') as file:
    data = file.read()

In [4]:
data[0:1000]

'Not only PAMPLEMOUSSE - according to experts, even orange juice and apple contain substances that inhibit medicines.\tThat FEBRUARY PAMPLEMOUSSE- According to the experts, even the orange juice and apple contains substances that inhibit the medicines.\tEAMT11 post-editting dataset (http://staffwww.dcs.shef.ac.uk/people/L.Specia/resources) target_postedited\tEAMT11 post-editting dataset (http://staffwww.dcs.shef.ac.uk/people/L.Specia/resources) target\nThe fact that the OSCE, during the evaluation of the campaign, has highlighted the positive changes rather than the shortcomings, such as the ban to run for a third of the candidates for the opposition, referred to a prior appeasement.\tThe fact that the OSCE, during the evaluation of the campaign, has highlighted the positive changes rather than on shortcomings, such as the ban to run for a third of the candidates for the opposition, referred to a prior appeasement.\tEAMT11 post-editting dataset (http://staffwww.dcs.shef.ac.uk/people/L.

In [5]:
len(data)

1448267

### Pre-process and Tag the data

In [6]:
tp1 = re.sub(r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', '', data)
tp2 = re.sub(r'\d+', '', tp1)
clean= re.compile('<.*?>')
tp4 = re.sub(clean, '', tp2)
tp5 = tp4.lower()
tp7 = re.sub(r'[^\w\s]', '',tp5)
tp8 = re.sub(r'[^\x00-\x7f]',r'', tp7)
tp9 = tp8.replace("_","")

stop_words = set(stopwords.words('english'))

In [7]:
with open("doc2vec4.txt", 'w') as file:
    file.write("%s\n" % tp9)

In [8]:
df = pd.read_csv('doc2vec4.txt', sep='\t')
df.columns = ['First_Question','Second_Question','irr1','irr2']
df.head()

Unnamed: 0,First_Question,Second_Question,irr1,irr2
0,the fact that the osce during the evaluation o...,the fact that the osce during the evaluation o...,eamt posteditting dataset targetpostedited,eamt posteditting dataset target
1,in the czech republic no chance say the mobile...,in the czech republic no chance say the mobile...,eamt posteditting dataset target,eamt posteditting dataset targetpostedited
2,i already knew the word thanks to my mangas,i already knew thanks to my mangas,eamt posteditting dataset targetpostedited,eamt posteditting dataset target
3,before off for the us nicolas sarkozy will tak...,before leaving for the us nicolas sarkozy will...,eamt posteditting dataset target,eamt posteditting dataset targetpostedited
4,and even if we could prove that for man how co...,and even if we could prove that for man how co...,eamt posteditting dataset targetpostedited,eamt posteditting dataset target


In [9]:
df['first_removedStop'] = df['First_Question'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))
df['second_removedStop'] = df['Second_Question'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))

In [10]:
df.drop(columns = ['First_Question','Second_Question','irr1','irr2'], axis=1, inplace=True)

In [11]:
df.shape

(3286, 2)

In [12]:
import string

corpus = []

for index, row in df.iterrows():
    corpus.append(row['first_removedStop'])
    corpus.append(row['second_removedStop'])

In [13]:
corpus[0:5]

['fact osce evaluation campaign highlighted positive changes rather shortcomings ban run third candidates opposition referred prior appeasement',
 'fact osce evaluation campaign highlighted positive changes rather shortcomings ban run third candidates opposition referred prior appeasement',
 'czech republic chance say mobile operators',
 'czech republic chance say mobile operators',
 'already knew word thanks mangas']

In [14]:
import string

questions1 = df['first_removedStop'].to_list()
questions2 = df['second_removedStop'].to_list()

In [15]:
type(questions1)

list

In [16]:
#ls = ["This","is","list","of", "words"]

def tagged_document(list_of_list_of_words):
    for i, list_of_words in enumerate(list_of_list_of_words):
        tokens = gensim.utils.simple_preprocess(list_of_words)
        yield gensim.models.doc2vec.TaggedDocument(tokens, [i])

data_for_training = list(tagged_document(corpus))

In [17]:
print(data_for_training[:3])

[TaggedDocument(words=['fact', 'osce', 'evaluation', 'campaign', 'highlighted', 'positive', 'changes', 'rather', 'shortcomings', 'ban', 'run', 'third', 'candidates', 'opposition', 'referred', 'prior', 'appeasement'], tags=[0]), TaggedDocument(words=['fact', 'osce', 'evaluation', 'campaign', 'highlighted', 'positive', 'changes', 'rather', 'shortcomings', 'ban', 'run', 'third', 'candidates', 'opposition', 'referred', 'prior', 'appeasement'], tags=[1]), TaggedDocument(words=['czech', 'republic', 'chance', 'say', 'mobile', 'operators'], tags=[2])]


In [18]:
model = Doc2Vec(vector_size=50, min_count=2, epochs=40)
model.build_vocab(data_for_training)

model.train(data_for_training, total_examples=model.corpus_count, epochs=model.epochs)

#embeddings2 = model.train(quest2, total_examples=model.corpus_count, epochs=model.epochs)

In [19]:
model.dv.most_similar(0)

[(1, 0.993628978729248),
 (1827, 0.846962034702301),
 (4113, 0.8447048664093018),
 (3479, 0.8346331715583801),
 (5042, 0.8335431218147278),
 (1229, 0.8291702270507812),
 (3132, 0.8275589346885681),
 (1645, 0.8266435265541077),
 (3693, 0.8255499601364136),
 (3791, 0.8237966299057007)]

In [20]:
data_for_training[0]

TaggedDocument(words=['fact', 'osce', 'evaluation', 'campaign', 'highlighted', 'positive', 'changes', 'rather', 'shortcomings', 'ban', 'run', 'third', 'candidates', 'opposition', 'referred', 'prior', 'appeasement'], tags=[0])

In [21]:
data_for_training[3133]

TaggedDocument(words=['members', 'opposition', 'parliament', 'appointed', 'elected'], tags=[3133])

In [22]:
len(model.dv)

6572

### Evaluation

In [23]:
tokenized_ques1 =[]
for question in questions1:
    tokenized_ques1.append(gensim.utils.simple_preprocess(question))
len(tokenized_ques1)

3286

In [24]:
tokenized_ques2 =[]
for question in questions2:
    tokenized_ques2.append(gensim.utils.simple_preprocess(question))
len(tokenized_ques2)

3286

In [25]:
from scipy import spatial

sim_scores=[]

#tokenized_ques2[0:2]

length_ques1 = len(tokenized_ques1)

for i in range(0,length_ques1):
    #Some part taken from https://stackoverflow.com/questions/53503049/measure-similarity-between-two-documents-using-doc2vec
    vec1 = model.infer_vector(tokenized_ques1[i])
    vec2 = model.infer_vector(tokenized_ques2[i])
    cos_distance = spatial.distance.cosine(vec1, vec2)
    cos_sim = 1-cos_distance
    sim_scores.append(cos_sim)

In [26]:
sim_scores = [(score*5) for score in sim_scores]
sim_scores[0:50]

[4.945909380912781,
 4.343312084674835,
 4.7532713413238525,
 3.9504793286323547,
 4.852370321750641,
 4.916825294494629,
 4.932498335838318,
 4.676063060760498,
 4.729525446891785,
 4.857541918754578,
 2.6484891772270203,
 4.50373113155365,
 2.9285672307014465,
 4.905709326267242,
 4.861360788345337,
 4.437620639801025,
 4.784626662731171,
 4.209699034690857,
 3.9719021320343018,
 4.159365892410278,
 4.263936281204224,
 3.6478418111801147,
 4.865594804286957,
 3.1271320581436157,
 4.966279864311218,
 4.912915527820587,
 4.782242774963379,
 2.7681267261505127,
 3.9016193151474,
 4.544695317745209,
 4.8508307337760925,
 4.357633292675018,
 1.3850045204162598,
 4.146409332752228,
 4.608508050441742,
 4.4645801186561584,
 4.921918511390686,
 4.839906394481659,
 3.140803873538971,
 4.929653108119965,
 4.7817301750183105,
 3.503442108631134,
 4.453564286231995,
 4.904646873474121,
 4.740030467510223,
 3.0979150533676147,
 4.905167520046234,
 3.136085271835327,
 3.1969884037971497,
 4.390375

In [27]:
with open('C:/Users/daksh/OneDrive/Desktop/sts2016-english-with-gs-v1.0/sts2016-english-with-gs-v1.0/STS2016.input.postediting.cosine.txt', 'w') as file:
    for score in sim_scores:
        file.write('%s\n' %score)