## Plagiarism Detection using cosine similarity

In [13]:
import os
import docx
import nltk
import numpy as np
import math
from nltk.corpus import stopwords

In [14]:
# Getting docs file into single text
def getText(filename):
    doc=docx.Document(filename)
    fullText=[]
    for para in doc.paragraphs:
        fullText.append(para.text)
    return '\n'.join(fullText)

In [3]:
# Reading .docx file and converting it in list
docFiles=[]
docNames=[]
dirPath='./sample_files'
for filename in os.listdir(dirPath): # Add path for the files
    if filename.endswith('.docx'):
        docNames.append(filename)
        filename=getText(dirPath+'/'+filename)
        docFiles.append(filename)
# docFiles.sort(key=str.lower)
# docNames.sort(key=str.lower)

print(len(docFiles))

6


In [4]:
# Building vocabulary from the words in all docs
def build_lexicon(corpus):
    lexicon=set()
    for doc in corpus:
        word_token=[word for word in doc.split()]
        lower_word_list=[i.lower() for i in word_token]
        
        porter=nltk.PorterStemmer()
        stemmed_word=[porter.stem(t) for t in lower_word_list]
        
        stop_words=set(stopwords.words('english'))
        filtered_bag_of_word=[w for w in stemmed_word if not w in stop_words]
        lexicon.update(filtered_bag_of_word)
    return lexicon



In [5]:
vocabulary=build_lexicon(docFiles)

In [6]:
# Generating term frequency : Total occurence of words from vocabulary in each doc
def tf(term,document):
    return freq(term,document)

def freq(term,document):
    return document.split().count(term)

In [7]:
doc_term_matrix=[]
print('\nOur vocabulary vector is ['+', '.join(list(vocabulary))+']')
for doc in docFiles:
    tf_vector=[tf(word,doc) for word in vocabulary]
    tf_vector_string=', '.join(format(freq,'d') for freq in tf_vector)
    print('\nThe tf vector for Document %d is [%s]' % ((docFiles.index(doc)+1),tf_vector_string))
    doc_term_matrix.append(tf_vector)

print('\nAll combined, here is our master document tree matrix: ')
print(doc_term_matrix)


Our vocabulary vector is [align, may, found, 1, heading., emphasis., sane., verius., table,, ebooks,, want., ingenium, rop, text:, part, whose, subscript., here,, column,, correctly., outlin, doe, "web, display, document,, allow, necessarili, prove, document", box, item,, pages,, technolog, reader, paragraph., i.e., videmu, previou, bold-italic,, galleries., wide, —, document, document., para/doca, click, locations., take, special, main, either, interrupt, veri, voluptas., approach, obviously,, books”, descript, everi, table", left, purpos, vast, make, focu, de, cave, horizont, column, calendar,, theme,, "lists", float, amet,, "tables,", conversion,, table—on, troubl, bodi, correctli, iocans;, except, chang, 3, nihil, devic, dropcap, types., boundaries., illustr, header,, time, pute, tu, microsoft, illi,, three, typograph, coordinated., narrow, dure, too,, best, locat, run, look, headings,, family., sever, split, screens., power, even, original,, plugin, ebooks., long, effect, honest,


The tf vector for Document 3 is [0, 0, 0, 0, 0, 0, 0, 0, 466, 0, 466, 0, 0, 0, 0, 0, 0, 0, 466, 0, 0, 0, 0, 0, 466, 0, 0, 468, 0, 468, 0, 0, 0, 0, 0, 0, 0, 0, 0, 466, 0, 0, 1400, 468, 0, 2332, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 466, 0, 0, 468, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 936, 466, 0, 0, 0, 0, 0, 0, 466, 0, 0, 466, 468, 0, 0, 468, 0, 0, 0, 0, 0, 0, 466, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2330, 0, 0, 0, 0, 0, 466, 0, 0, 0, 0, 0, 0, 0, 466, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 468, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 466, 1400, 0, 0, 0, 0, 0, 0, 468, 0, 0, 0, 0, 0, 0, 0, 0, 466, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 468, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 466, 0, 0, 0, 0, 0, 468, 468, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 466, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 934, 0, 0, 0, 0, 0, 468, 466, 0, 0, 0, 0

In [8]:
# Normalizing the values so that the square of each value is not greater than 1
def l2_normalizer(vec):
    denom=np.sum([el**2 for el in vec])
    return [(el/math.sqrt(denom)) for el in vec]

doc_term_matrix_l2=[]
for vec in doc_term_matrix:
    doc_term_matrix_l2.append(l2_normalizer(vec))
    
print('\nA regular old document term matrix: ')
print(np.matrix(doc_term_matrix))
print('\nA document term matrix with row-wise L2 norms of 1: ')
print(np.matrix(doc_term_matrix_l2))


A regular old document term matrix: 
[[0 1 1 ... 0 0 0]
 [0 3 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 1 1 ... 0 0 0]
 [0 0 0 ... 0 1 0]
 [0 0 0 ... 0 0 0]]

A document term matrix with row-wise L2 norms of 1: 
[[0.         0.03097891 0.03097891 ... 0.         0.         0.        ]
 [0.         0.20134682 0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.03097891 0.03097891 ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.14586499 0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]]


In [9]:
# Calculating idf - inverse document frequency
def numDocsContaining(word,doclist):
    document=0
    for doc in doclist:
        if freq(word,doc)>0:
            document+=1
    return document

def idf(word,doclist):
    n_samples=len(doclist)
    df=numDocsContaining(word,doclist)
    return np.log(n_samples/1+df)

my_idf_vector=[idf(word,docFiles) for word in vocabulary]

print('\nOur vocabulary vector is ['+', '.join(list(vocabulary))+']')
print('\nThe inverse document frequency vector is ['+', '.join(format(freq,'f') for freq in my_idf_vector)+']')


Our vocabulary vector is [align, may, found, 1, heading., emphasis., sane., verius., table,, ebooks,, want., ingenium, rop, text:, part, whose, subscript., here,, column,, correctly., outlin, doe, "web, display, document,, allow, necessarili, prove, document", box, item,, pages,, technolog, reader, paragraph., i.e., videmu, previou, bold-italic,, galleries., wide, —, document, document., para/doca, click, locations., take, special, main, either, interrupt, veri, voluptas., approach, obviously,, books”, descript, everi, table", left, purpos, vast, make, focu, de, cave, horizont, column, calendar,, theme,, "lists", float, amet,, "tables,", conversion,, table—on, troubl, bodi, correctli, iocans;, except, chang, 3, nihil, devic, dropcap, types., boundaries., illustr, header,, time, pute, tu, microsoft, illi,, three, typograph, coordinated., narrow, dure, too,, best, locat, run, look, headings,, family., sever, split, screens., power, even, original,, plugin, ebooks., long, effect, honest,

In [10]:
# Building a diagonal matrix from idf
def build_idf_matrix(idf_vector):
    idf_mat=np.zeros((len(idf_vector),len(idf_vector)))
    np.fill_diagonal(idf_mat,idf_vector)
    return idf_mat

my_idf_matrix=build_idf_matrix(my_idf_vector)
print('\nidf matrix is: ')
print(my_idf_matrix)                     


idf matrix is: 
[[1.79175947 0.         0.         ... 0.         0.         0.        ]
 [0.         2.19722458 0.         ... 0.         0.         0.        ]
 [0.         0.         2.07944154 ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 1.79175947 0.         0.        ]
 [0.         0.         0.         ... 0.         1.94591015 0.        ]
 [0.         0.         0.         ... 0.         0.         1.79175947]]


In [11]:
# Generating tfidf matrix
doc_term_matrix_tfidf=[]

for tf_vector in doc_term_matrix:
    doc_term_matrix_tfidf.append(np.dot(tf_vector,my_idf_matrix))
    
doc_term_matrix_tfidf_l2=[]
for tf_vector in doc_term_matrix_tfidf:
    doc_term_matrix_tfidf_l2.append(l2_normalizer(tf_vector))

print(vocabulary)
print(np.matrix(doc_term_matrix_tfidf_l2))

{'align', 'may', 'found', '1', 'heading.', 'emphasis.', 'sane.', 'verius.', 'table,', 'ebooks,', 'want.', 'ingenium', 'rop', 'text:', 'part', 'whose', 'subscript.', 'here,', 'column,', 'correctly.', 'outlin', 'doe', '"web', 'display', 'document,', 'allow', 'necessarili', 'prove', 'document"', 'box', 'item,', 'pages,', 'technolog', 'reader', 'paragraph.', 'i.e.', 'videmu', 'previou', 'bold-italic,', 'galleries.', 'wide', '—', 'document', 'document.', 'para/doca', 'click', 'locations.', 'take', 'special', 'main', 'either', 'interrupt', 'veri', 'voluptas.', 'approach', 'obviously,', 'books”', 'descript', 'everi', 'table"', 'left', 'purpos', 'vast', 'make', 'focu', 'de', 'cave', 'horizont', 'column', 'calendar,', 'theme,', '"lists"', 'float', 'amet,', '"tables,"', 'conversion,', 'table—on', 'troubl', 'bodi', 'correctli', 'iocans;', 'except', 'chang', '3', 'nihil', 'devic', 'dropcap', 'types.', 'boundaries.', 'illustr', 'header,', 'time', 'pute', 'tu', 'microsoft', 'illi,', 'three', 'typogr

In [12]:
# Calculating plagiarism percentage using cosine similarity
for i in range (len(docFiles)):
    for j in range(i+1,len(docFiles)):
        result_nltk=nltk.cluster.util.cosine_distance(doc_term_matrix_tfidf_l2[i],doc_term_matrix_tfidf_l2[j])
        print('\nComparison between %s and %s: '%(docNames[i],docNames[j]))
#         print(result_nltk)
        cos_sin=1-result_nltk;
        angle_in_radians=math.acos(cos_sin)
        
        plagiarism=int(cos_sin*100)
        print('plagiarism=%s %%\n'% plagiarism)


Comparison between sample1.docx and sample3.docx: 
plagiarism=34 %


Comparison between sample1.docx and sample4.docx: 
plagiarism=27 %


Comparison between sample1.docx and sample5.docx: 
plagiarism=100 %


Comparison between sample1.docx and sample2.docx: 
plagiarism=0 %


Comparison between sample1.docx and sample6.docx: 
plagiarism=94 %


Comparison between sample3.docx and sample4.docx: 
plagiarism=15 %


Comparison between sample3.docx and sample5.docx: 
plagiarism=34 %


Comparison between sample3.docx and sample2.docx: 
plagiarism=0 %


Comparison between sample3.docx and sample6.docx: 
plagiarism=28 %


Comparison between sample4.docx and sample5.docx: 
plagiarism=27 %


Comparison between sample4.docx and sample2.docx: 
plagiarism=0 %


Comparison between sample4.docx and sample6.docx: 
plagiarism=25 %


Comparison between sample5.docx and sample2.docx: 
plagiarism=0 %


Comparison between sample5.docx and sample6.docx: 
plagiarism=94 %


Comparison between sample2.docx and 