In [1]:
#Code reference: https://radimrehurek.com/gensim/auto_examples/tutorials/run_doc2vec_lee.html#sphx-glr-auto-examples-tutorials-run-doc2vec-lee-py
#Code reference: https://www.tutorialspoint.com/gensim/gensim_doc2vec_model.htm

### Importing Libraries

In [2]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import re
import gensim

### Reading File

In [3]:
with open('C:/Users/daksh/OneDrive/Desktop/sts2016-english-with-gs-v1.0/sts2016-english-with-gs-v1.0/STS2016.input.plagiarism.txt', 'r', encoding='utf-8') as file:
    data = file.read()

In [4]:
data[0:1000]

'The vector space model has some limitations: 1.\tThe vector space model are the documents which are represented as “bags of words”.\tA Corpus of Plagiarised Short Answers - http://ir.shef.ac.uk/cloughie/resources/plagiarism_corpus.html File: g2pA_taskc.txt\tA Corpus of Plagiarised Short Answers - http://ir.shef.ac.uk/cloughie/resources/plagiarism_corpus.html File: g4pC_taskc.txt\nSecondly to define the value of the optimal solution recursively.\tDefine value of optimal solution recursively.\tA Corpus of Plagiarised Short Answers - http://ir.shef.ac.uk/cloughie/resources/plagiarism_corpus.html File: g2pE_taske.txt\tA Corpus of Plagiarised Short Answers - http://ir.shef.ac.uk/cloughie/resources/plagiarism_corpus.html File: g0pA_taske.txt\nVector space representation results in the loss of the order which the terms are in the document.\tIf a term occurs in the document, the value will be non-zero in the vector.\tA Corpus of Plagiarised Short Answers - http://ir.shef.ac.uk/cloughie/resour

In [5]:
len(data)

516548

### Pre-process and Tag the data

In [6]:
tp1 = re.sub(r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', '', data)
tp2 = re.sub(r'\d+', '', tp1)
clean= re.compile('<.*?>')
tp4 = re.sub(clean, '', tp2)
tp5 = tp4.lower()
tp7 = re.sub(r'[^\w\s]', '',tp5)
tp8 = re.sub(r'[^\x00-\x7f]',r'', tp7)
tp9 = tp8.replace("_","")

stop_words = set(stopwords.words('english'))

In [7]:
with open("doc2vectest.txt", 'w') as file:
    file.write("%s\n" % tp9)

In [8]:
df = pd.read_csv('doc2vectest.txt', sep='\t')
df.columns = ['First_Question','Second_Question','irr1','irr2']
df.head()

Unnamed: 0,First_Question,Second_Question,irr1,irr2
0,secondly to define the value of the optimal so...,define value of optimal solution recursively,a corpus of plagiarised short answers file g...,a corpus of plagiarised short answers file g...
1,vector space representation results in the los...,if a term occurs in the document the value wil...,a corpus of plagiarised short answers file g...,a corpus of plagiarised short answers file g...
2,pb is the prior or marginal probability of b a...,pa or the probability that the student is a gi...,a corpus of plagiarised short answers file g...,a corpus of plagiarised short answers file g...
3,it is prior in the sense that it does not take...,it is previous in the sense that it does not t...,a corpus of plagiarised short answers file g...,a corpus of plagiarised short answers file g...
4,the vector space model has the following limit...,models based on and extending the vector space...,a corpus of plagiarised short answers file o...,a corpus of plagiarised short answers file g...


In [9]:
df['first_removedStop'] = df['First_Question'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))
df['second_removedStop'] = df['Second_Question'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))

In [10]:
df.drop(columns = ['First_Question','Second_Question','irr1','irr2'], axis=1, inplace=True)

In [11]:
df.shape

(1270, 2)

In [12]:
import string

corpus = []

for index, row in df.iterrows():
    corpus.append(row['first_removedStop'])
    corpus.append(row['second_removedStop'])

In [13]:
corpus[0:5]

['secondly define value optimal solution recursively',
 'define value optimal solution recursively',
 'vector space representation results loss order terms document',
 'term occurs document value nonzero vector',
 'pb prior marginal probability b acts normalizing constant']

In [14]:
import string

questions1 = df['first_removedStop'].to_list()
questions2 = df['second_removedStop'].to_list()

In [15]:
type(questions1)

list

In [16]:
#ls = ["This","is","list","of", "words"]

def tagged_document(list_of_list_of_words):
    for i, list_of_words in enumerate(list_of_list_of_words):
        tokens = gensim.utils.simple_preprocess(list_of_words)
        yield gensim.models.doc2vec.TaggedDocument(tokens, [i])

data_for_training = list(tagged_document(corpus))

In [17]:
print(data_for_training[:3])

[TaggedDocument(words=['secondly', 'define', 'value', 'optimal', 'solution', 'recursively'], tags=[0]), TaggedDocument(words=['define', 'value', 'optimal', 'solution', 'recursively'], tags=[1]), TaggedDocument(words=['vector', 'space', 'representation', 'results', 'loss', 'order', 'terms', 'document'], tags=[2])]


In [18]:
model = Doc2Vec(vector_size=50, min_count=2, epochs=40)
model.build_vocab(data_for_training)

model.train(data_for_training, total_examples=model.corpus_count, epochs=model.epochs)

#embeddings2 = model.train(quest2, total_examples=model.corpus_count, epochs=model.epochs)

In [19]:
model.dv.most_similar(0)

[(795, 0.9775969982147217),
 (556, 0.9672380089759827),
 (612, 0.9625769853591919),
 (82, 0.9597797989845276),
 (2325, 0.9578133225440979),
 (1000, 0.953061044216156),
 (2324, 0.9454733729362488),
 (2509, 0.9441366195678711),
 (2428, 0.9430422186851501),
 (433, 0.9423292279243469)]

In [20]:
data_for_training[0]

TaggedDocument(words=['secondly', 'define', 'value', 'optimal', 'solution', 'recursively'], tags=[0])

In [21]:
data_for_training[1000]

TaggedDocument(words=['secondly', 'define', 'value', 'optimal', 'solution', 'recursively'], tags=[1000])

In [22]:
len(model.dv)

2540

### Evaluation

In [23]:
tokenized_ques1 =[]
for question in questions1:
    tokenized_ques1.append(gensim.utils.simple_preprocess(question))
len(tokenized_ques1)

1270

In [24]:
tokenized_ques2 =[]
for question in questions2:
    tokenized_ques2.append(gensim.utils.simple_preprocess(question))
len(tokenized_ques2)

1270

In [25]:
from scipy import spatial

sim_scores=[]

#tokenized_ques2[0:2]

length_ques1 = len(tokenized_ques1)

for i in range(0,length_ques1):
    #Some part taken from https://stackoverflow.com/questions/53503049/measure-similarity-between-two-documents-using-doc2vec
    vec1 = model.infer_vector(tokenized_ques1[i])
    vec2 = model.infer_vector(tokenized_ques2[i])
    cos_distance = spatial.distance.cosine(vec1, vec2)
    cos_sim = 1-cos_distance
    sim_scores.append(cos_sim)

In [26]:
#sim_scores = [(score*5) for score in sim_scores]
sim_scores[0:50]

[0.9445233345031738,
 0.5178488492965698,
 0.4234110414981842,
 0.5243306159973145,
 0.697325587272644,
 0.7672815322875977,
 0.5562579035758972,
 0.8633687496185303,
 0.716844916343689,
 0.570695161819458,
 0.3774237632751465,
 0.7933225035667419,
 0.6086273789405823,
 0.6387276649475098,
 0.4082610309123993,
 0.6012640595436096,
 0.5029648542404175,
 0.21896317601203918,
 0.9251386523246765,
 0.8687998652458191,
 0.5568045973777771,
 -0.049515075981616974,
 0.9654817581176758,
 0.28646668791770935,
 -0.4853874444961548,
 0.1928643137216568,
 -0.05793950706720352,
 0.8111659288406372,
 0.13997626304626465,
 0.5826117992401123,
 0.43559393286705017,
 0.1930321902036667,
 0.2622233033180237,
 0.761972188949585,
 0.4414689242839813,
 0.4945022463798523,
 0.11140581965446472,
 0.5103105306625366,
 0.5341476798057556,
 0.7603869438171387,
 0.2615688443183899,
 0.6323397159576416,
 0.7671968936920166,
 0.8962646722793579,
 0.6947436928749084,
 0.23770368099212646,
 0.39364752173423767,
 0.0

In [27]:
with open('C:/Users/daksh/OneDrive/Desktop/sts2016-english-with-gs-v1.0/sts2016-english-with-gs-v1.0/STS2016.input.plagiarism.cosine.txt', 'w') as file:
    for score in sim_scores:
        file.write('%s\n' %score)