In [1]:
import spacy as sp
# Shared spaCy pipeline; spacySimil() calls `nlp` on every sentence it compares.
nlp = sp.load("en_core_web_lg")
# Uncomment the line below to use en_core_web_sm instead.
# NOTE(review): per the spaCy docs the `sm` models ship without static word
# vectors, so similarity scores may differ -- confirm before switching.
#nlp = sp.load("en_core_web_sm")
import nltk
from nltk.corpus import wordnet   #Import wordnet from the NLTK
from nltk.stem.porter import *
from bert_score import score
from scipy.optimize import linear_sum_assignment
import numpy as np
from tqdm import tqdm

In [2]:
# Paths of the two text files that are compared against each other.
doc1, doc2 = "doc1.txt", "doc2.txt"
# To test with a non-plagiarized document, uncomment the line below.
#doc2 = "lab5_non_plag.txt"

In [3]:
def getLines(filename):
    """Read a text file and split it into lower-cased, tokenised sentences.

    Commas, semicolons and hyphens are replaced by spaces, the text is
    lower-cased, split into sentences on '.', and every sentence is split
    into words on whitespace. Sentences that contain no words are dropped.

    Args:
        filename: Path of the text file to read.

    Returns:
        A list of sentences, each one a non-empty list of word strings.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # The context manager closes the handle even when read() raises.
    with open(filename, 'r') as f:
        s = f.read()
    # Removing unnecessary punctuation
    for mark in (',', ';', '-'):
        s = s.replace(mark, ' ')
    s = s.lower()
    # Splitting sentence-wise; fragments made only of whitespace yield an
    # empty word list and are skipped.
    return [words for words in (part.split() for part in s.split('.')) if words]
def remove_sw(lines):
    """Strip spaCy's English stop words from tokenised sentences.

    Args:
        lines: List of sentences, each a list of non-empty word strings
            (the format returned by getLines).

    Returns:
        A list with one string per input sentence: the surviving words joined
        by single spaces, or '' when every word was a stop word.
    """
    # A set gives constant-time membership tests; a list would be scanned
    # linearly for every single word.
    all_stopwords = set(sp.lang.en.stop_words.STOP_WORDS)
    # Words are kept verbatim: no stemming is applied.
    return [
        ' '.join(word for word in sentence if word not in all_stopwords)
        for sentence in lines
    ]

In [4]:
# Tokenise both documents into per-sentence word lists.
l1, l2 = getLines(doc1), getLines(doc2)
# Drop stop words from every sentence (the stemming step in remove_sw is disabled).
ls1, ls2 = remove_sw(l1), remove_sw(l2)

In [5]:
#different kinds of similarity functions
def cosSim(s1, s2):
    """Cosine similarity between two numeric vectors of equal length.

    Args:
        s1: First vector (any array-like accepted by numpy).
        s2: Second vector, same length as s1.

    Returns:
        dot(s1, s2) / (|s1| * |s2|), or 0.0 when either vector has zero norm
        (the ratio is undefined there and would otherwise produce nan).
    """
    # `norm` is not imported anywhere in this notebook, so reach it through
    # the already-imported numpy namespace.
    denominator = np.linalg.norm(s1) * np.linalg.norm(s2)
    if denominator == 0:
        return 0.0
    similarity_ = np.dot(s1, s2) / denominator
    return similarity_

def bertSim(s1, s2, idf=False):
    """Score one sentence pair with bert_score and return a single float.

    Args:
        s1: Candidate sentence (string).
        s2: Reference sentence (string).
        idf: Forwarded to bert_score's `score` as its `idf` argument.

    Returns:
        The first of the three values returned by `score`, converted with
        `.item()`. NOTE(review): per the bert_score docs the tuple is
        (precision, recall, F1), so this is the precision -- confirm that
        precision rather than F1 is the intended similarity.
    """
    # Honour the caller's `idf` choice; it used to be overridden with True.
    p, _, _ = score([s1], [s2], lang='en', idf=idf)
    return p.item()
def spacySimil(s1, s2):
    """Return the spaCy similarity of two sentence strings.

    Both strings are run through the module-level `nlp` pipeline and the
    first resulting document is compared to the second via `.similarity`.
    """
    first_doc = nlp(s1)
    second_doc = nlp(s2)
    return first_doc.similarity(second_doc)

In [6]:
def getSentSimil(s1, s2):
    """Return the similarity of two sentence strings using the active backend.

    This is the single switch point for the similarity measure: it currently
    delegates to spacySimil; the commented-out return selects bertSim instead.
    """
    # Uncomment the line below (and remove the spaCy return) to use BERT similarity.
    #return bertSim(s1, s2)
    return spacySimil(s1, s2)

In [7]:
# Pairwise similarity matrix: graph[i][j] is the similarity between sentence i
# of the first document and sentence j of the second. tqdm reports progress
# once per sentence of the first document.
graph = [
    [getSentSimil(first_sentence, second_sentence) for second_sentence in ls2]
    for first_sentence in tqdm(ls1)
]

100%|██████████████████████████████████████████████████████████████████████████████████| 22/22 [01:17<00:00,  3.51s/it]


In [8]:
def convertToSquare(graph):
    """Pad a similarity matrix with low-weight filler rows or columns.

    With r rows and c columns in `graph`:
      * r > c  -> result is r x (r + c): r filler columns are appended;
      * r <= c -> result is (r + c) x c: c filler rows are appended.
    Filler cells hold 0.01; the top-left r x c corner is copied from `graph`.
    Despite the name, the result is not square.

    Args:
        graph: Non-empty 2-D sequence indexable as graph[i][j].

    Returns:
        A new list of lists; `graph` itself is not modified.
    """
    filler = 0.01
    n_rows, n_cols = len(graph), len(graph[0])
    if n_rows > n_cols:
        out_rows, out_cols = n_rows, n_rows + n_cols
    else:
        out_rows, out_cols = n_rows + n_cols, n_cols
    padded = [[filler] * out_cols for _ in range(out_rows)]
    for r in range(n_rows):
        # Overwrite the leading cells of each real row with the original scores.
        padded[r][:n_cols] = [graph[r][c] for c in range(n_cols)]
    return padded

def removeLowMatches(graph, threshold=0.6):
    """Zero out every similarity that does not exceed `threshold`, in place.

    Args:
        graph: 2-D mutable matrix (list of lists or numpy array) of scores.
        threshold: Scores less than or equal to this value are replaced by
            0.0. Defaults to 0.6.

    Returns:
        The same `graph` object that was passed in, after modification.
    """
    # Walk each row by its own length so ragged input cannot index out of range.
    for row in graph:
        for j in range(len(row)):
            if row[j] <= threshold:
                row[j] = 0.0
    return graph

def maxMatching(graph, n1, n2):
    """Greedy one-to-one matching between rows and columns of `graph`.

    When n1 < n2 each row (in order) takes the best still-free column;
    otherwise each column (in order) takes the best still-free row. "Best"
    means the largest value that is >= 0, with ties going to the highest
    index. If no candidate qualifies, the partner index is -1.

    Args:
        graph: 2-D matrix indexable as graph[row][col].
        n1: Number of rows to consider.
        n2: Number of columns to consider.

    Returns:
        A sorted list of (row, col) tuples.
    """
    by_rows = n1 < n2
    outer_count, inner_count = (n1, n2) if by_rows else (n2, n1)
    used = set()
    pairs = []
    for outer in range(outer_count):
        best_idx, best_val = -1, 0
        for inner in range(inner_count):
            value = graph[outer][inner] if by_rows else graph[inner][outer]
            # `>=` lets a later index win ties.
            if value >= best_val and inner not in used:
                best_idx, best_val = inner, value
        pairs.append((outer, best_idx) if by_rows else (best_idx, outer))
        used.add(best_idx)
    return sorted(pairs)
            
def getOptimalMatching(graph):
    """Compute a maximum-weight assignment between rows and columns.

    Steps: low scores are zeroed by removeLowMatches (which edits `graph` in
    place), the matrix is padded by convertToSquare, and the padded matrix is
    solved with scipy's linear_sum_assignment.

    Args:
        graph: Non-empty 2-D similarity matrix.

    Returns:
        A list of (row_index, col_index) tuples over the padded matrix;
        indices beyond the original dimensions refer to padding.
    """
    thresholded = removeLowMatches(graph)
    padded = convertToSquare(thresholded)
    # linear_sum_assignment minimises total cost, so negate the similarities
    # to obtain the assignment with the largest total similarity.
    cost = -1 * np.array(padded)
    row_ind, col_ind = linear_sum_assignment(cost)
    return list(zip(row_ind, col_ind))

In [9]:
# Solve the sentence assignment and print each matched pair. An index at or
# beyond a document's sentence count has no sentence and is printed as "None".
graph = np.array(graph)
opti = getOptimalMatching(graph)
print("Optimal matching of sentences:\n\n")
for row_idx, col_idx in opti:
    p1 = ls1[row_idx] if row_idx < len(ls1) else "None"
    p2 = ls2[col_idx] if col_idx < len(ls2) else "None"
    print(row_idx, col_idx, p1, " ---- ", p2)

Optimal matching of sentences:


0 0 hello arooshi verma  ----  hi avanika
1 1 4th year dual degree student iit bhubaneswar  ----  study 3rd year iit indore
2 9 sample data plagiarism detector  ----  computer electronic device perform tasks like messaging calculations data storage printing etc
3 2 working emotion detection poem audios  ----  working feeling detection audios data cheating detector
4 25 dataset created hindi language  ----  None
5 3 forensic science known forensics application science law  ----  forensic science uses highly developed tech find evidence variety fields
6 10 uses highly developed technology uncover scientific evidence variety fields  ----  computers developed 1940s
7 4 modern forensic science broad range applications  ----  broad range applications
8 5 civil cases forgeries fraud negligence  ----  civil cases fraud negligence
9 6 common use forensic science investigate criminal cases involving victim assault robbery kidnapping rape murder  ----  usually for

In [10]:
def getDocSimilarityScore(matching, graph):
    """Aggregate matched-pair similarities into one document-level score.

    Args:
        matching: Sequence of (row, col) index pairs.
        graph: Non-empty 2-D similarity matrix.

    Returns:
        The sum of graph[row][col] over the pairs that fall inside `graph`,
        divided by the mean of the matrix's row count and column count.
    """
    total = 0.0
    for pair in matching:
        row, col = pair[0], pair[1]
        # Pairs that point outside the matrix contribute nothing.
        if row < len(graph) and col < len(graph[row]):
            total += graph[row][col]
    mean_size = 0.5 * (len(graph) + len(graph[0]))
    return total / mean_size

In [11]:
# Score the matching and map it to a verdict.
simil = getDocSimilarityScore(opti, graph)
# NOTE: `threshold` is assigned but never consulted; the cut-offs used below
# are the literals 0.6 and 0.4.
threshold = 0.5
print("Similarity score:", simil)
if simil > 0.6:
    verdict = "High chance document is plagiarized."
elif simil > 0.4:
    verdict = "Chance of document being plagiarized"
else:
    verdict = "Document not plagiarized"
print(verdict)

Similarity score: 0.5868503683561415
Chance of document being plagiarized


In [12]:
# Find runs of consecutive sentences of the first document that are matched to
# consecutive sentences of the second document. Each entry is
# (doc1_start, doc1_end, doc2_start, doc2_end), all bounds inclusive.
# NOTE: the name `locals` shadows the built-in of the same name; it is kept
# because the next cell reads this variable.
locals = []
run_start = 0
while run_start < len(opti):
    run_end = run_start
    while run_end + 1 < len(opti):
        prev_row, prev_col = opti[run_end]
        next_row, next_col = opti[run_end + 1]
        # Both ends of the step must be real sentences, not padding nodes.
        is_real = (next_row < len(ls1)
                   and prev_col < len(ls2)
                   and next_col < len(ls2))
        # The step must advance by exactly one sentence in BOTH documents.
        is_consecutive = (next_row - prev_row == 1
                          and next_col - prev_col == 1)
        if not (is_real and is_consecutive):
            break
        run_end += 1
    if run_end > run_start:
        # Report sentence indices taken from the matching itself (not the
        # position inside `opti`), as plain ints so the printout stays clean.
        locals.append((int(opti[run_start][0]), int(opti[run_end][0]),
                       int(opti[run_start][1]), int(opti[run_end][1])))
    run_start = run_end + 1
print(locals)

[(0, 1, 0, 1), (7, 11, 4, 8), (12, 14, 15, 17)]


In [13]:
# Report each run of consecutive matches in a readable sentence.
for first_start, first_end, second_start, second_end in locals:
    print("Line numbers ", first_start, "to", first_end,
          "of the first document match lines", second_start, "to", second_end,
          "of the second document")

Line numbers  0 to 1 of the first document match lines 0 to 1 of the second document
Line numbers  7 to 11 of the first document match lines 4 to 8 of the second document
Line numbers  12 to 14 of the first document match lines 15 to 17 of the second document
