## Plagiarism Detection using cosine similarity

In [None]:
import os
import docx
import nltk
import numpy as np
import math
from nltk.corpus import stopwords

In [None]:
# Read a .docx file and return its paragraphs as one newline-joined string
def getText(filename):
    """Return the text of the .docx file at *filename*.

    The document is opened with ``docx.Document`` and the ``text`` of each
    entry in its ``paragraphs`` is joined with a newline, in document order.
    """
    paragraphs = docx.Document(filename).paragraphs
    return '\n'.join(paragraph.text for paragraph in paragraphs)

In [None]:
# Reading every .docx file in the directory into a list of document texts
# docNames[i] is the file name whose extracted text is stored in docFiles[i];
# the two lists must stay index-aligned because later cells pair them by index.
docFiles = []
docNames = []
dirPath = './sample_files'  # Add path for the files

# os.listdir() returns entries in arbitrary order, so sort the names once here
# to get a reproducible document order. Sorting docFiles and docNames
# separately afterwards would break the name <-> text pairing.
for filename in sorted(os.listdir(dirPath), key=str.lower):
    # '~$name.docx' entries are the lock files Word leaves next to an open
    # document; they carry the .docx suffix but are not readable documents.
    if filename.startswith('~$') or not filename.lower().endswith('.docx'):
        continue
    docNames.append(filename)
    docFiles.append(getText(os.path.join(dirPath, filename)))

print(len(docFiles))

In [None]:
# Building vocabulary from the words in all docs
def build_lexicon(corpus):
    """Build the vocabulary (a set of normalised terms) for *corpus*.

    Each document string is split on whitespace, lower-cased, stripped of
    English stop words and reduced to Porter stems.

    Stop words are removed *before* stemming because the NLTK stop-word list
    holds unstemmed forms: filtering only the stems lets words such as
    "this" (stem "thi") or "was" (stem "wa") slip into the vocabulary. The
    stems are filtered once more so that every term the post-stem filter used
    to drop is still dropped.

    Args:
        corpus: iterable of document strings.

    Returns:
        set of str: the distinct normalised terms found in the corpus.
    """
    # Loop-invariant helpers: build them once instead of once per document.
    porter = nltk.PorterStemmer()
    stop_words = set(stopwords.words('english'))

    lexicon = set()
    for doc in corpus:
        lowered = (token.lower() for token in doc.split())
        stems = (porter.stem(token) for token in lowered
                 if token not in stop_words)
        lexicon.update(stem for stem in stems if stem not in stop_words)
    return lexicon



In [None]:
# Vocabulary: the set that build_lexicon returns for all loaded document texts.
vocabulary=build_lexicon(docFiles)

In [None]:
# Generating term frequency: total occurrences of each vocabulary word in each doc
def tf(term, document):
    """Return the term frequency of *term* in *document*.

    This is the raw count reported by ``freq``; no scaling is applied here.
    """
    occurrences = freq(term, document)
    return occurrences

from collections import Counter
from functools import lru_cache


# Unbounded on purpose: one Counter per distinct document text. The idf pass
# cycles through every document for every term, so an LRU bound smaller than
# the corpus would evict and re-stem documents on each lookup.
@lru_cache(maxsize=None)
def _normalized_term_counts(document):
    """Count the lower-cased, Porter-stemmed whitespace tokens of *document*."""
    stem = nltk.PorterStemmer().stem
    return Counter(stem(token.lower()) for token in document.split())


def freq(term, document):
    """Return how many tokens of *document* normalise to *term*.

    The vocabulary produced by ``build_lexicon`` holds lower-cased Porter
    stems, so the document's tokens are lower-cased and stemmed the same way
    before counting. Counting raw tokens would report 0 for every term whose
    stem or case differs from the surface form in the text.

    Args:
        term: a normalised (lower-cased, stemmed) vocabulary term.
        document: the document text as a single string.

    Returns:
        int: number of matching tokens (0 when there are none).
    """
    return _normalized_term_counts(document)[term]

In [None]:
# Build one term-frequency row per document. Columns follow the iteration
# order of `vocabulary`, which stays the same as long as the set is unchanged.
doc_term_matrix = []
print('\nOur vocabulary vector is [' + ', '.join(vocabulary) + ']')
# enumerate() numbers the documents by position; looking the text up with
# docFiles.index(doc) returns the first match, so duplicate documents (the
# very thing a plagiarism check looks for) would all get the same number.
for doc_number, doc in enumerate(docFiles, start=1):
    tf_vector = [tf(word, doc) for word in vocabulary]
    tf_vector_string = ', '.join(format(count, 'd') for count in tf_vector)
    print('\nThe tf vector for Document %d is [%s]' % (doc_number, tf_vector_string))
    doc_term_matrix.append(tf_vector)

print('\nAll combined, here is our master document tree matrix: ')
print(doc_term_matrix)

In [None]:
# Normalizing each vector to unit L2 (Euclidean) length, so every component lies in [-1, 1]
def l2_normalizer(vec):
    """Scale *vec* to unit Euclidean (L2) length.

    Args:
        vec: sequence of numbers.

    Returns:
        list: each element divided by the L2 norm of *vec*. A vector whose
        norm is 0 (all zeros, or empty) has no direction to preserve and is
        returned as a list of 0.0 of the same length instead of dividing by
        zero.
    """
    norm = math.sqrt(sum(el * el for el in vec))
    if norm == 0:
        return [0.0 for _ in vec]
    return [el / norm for el in vec]

# Rescale every term-frequency row with l2_normalizer.
doc_term_matrix_l2 = [l2_normalizer(row) for row in doc_term_matrix]

# Show the raw counts next to their normalised counterparts.
print('\nA regular old document term matrix: ')
print(np.matrix(doc_term_matrix))
print('\nA document term matrix with row-wise L2 norms of 1: ')
print(np.matrix(doc_term_matrix_l2))

In [None]:
# Calculating idf - inverse document frequency
def numDocsContaining(word, doclist):
    """Return the number of documents in *doclist* for which ``freq`` reports
    at least one occurrence of *word*."""
    return sum(1 for text in doclist if freq(word, text) > 0)

def idf(word, doclist):
    """Return the smoothed inverse document frequency of *word*.

    Computes ``ln((1 + N) / (1 + df)) + 1`` where ``N`` is ``len(doclist)``
    and ``df`` is ``numDocsContaining(word, doclist)``.

    The earlier expression ``n_samples/1+df`` parsed as
    ``(n_samples / 1) + df``, so the weight *grew* with the number of
    documents containing the word, the opposite of an inverse document
    frequency. The smoothed form is used rather than plain
    ``ln(N / (1 + df))`` because the latter is 0 when ``df == N - 1`` and
    negative when ``df == N``, which wipes out or flips weights in the small
    corpora this notebook compares; it is also undefined for an empty corpus.

    Args:
        word: vocabulary term.
        doclist: list of document strings.

    Returns:
        float: a weight >= 1 that decreases as more documents contain *word*.
    """
    n_samples = len(doclist)
    df = numDocsContaining(word, doclist)
    return np.log((1 + n_samples) / (1 + df)) + 1

# One idf weight per vocabulary term, in the iteration order of `vocabulary`.
my_idf_vector = [idf(term, docFiles) for term in vocabulary]

vocabulary_listing = ', '.join(vocabulary)
print('\nOur vocabulary vector is [' + vocabulary_listing + ']')

idf_listing = ', '.join(format(weight, 'f') for weight in my_idf_vector)
print('\nThe inverse document frequency vector is [' + idf_listing + ']')

In [None]:
# Building a diagonal matrix from idf
def build_idf_matrix(idf_vector):
    """Return a square float matrix with *idf_vector* on its main diagonal.

    The result has shape ``(len(idf_vector), len(idf_vector))`` and is zero
    everywhere off the diagonal.
    """
    weights = np.asarray(idf_vector, dtype=float)
    return np.diag(weights)

# Put the idf weights on the diagonal of a square matrix and display it.
my_idf_matrix=build_idf_matrix(my_idf_vector)
print('\nidf matrix is: ')
print(my_idf_matrix)

In [None]:
# Generating tfidf matrix
# Weight each raw term-frequency row by multiplying it with the idf matrix.
doc_term_matrix_tfidf = [np.dot(row, my_idf_matrix) for row in doc_term_matrix]

# Rescale every weighted row with l2_normalizer.
doc_term_matrix_tfidf_l2 = [l2_normalizer(row) for row in doc_term_matrix_tfidf]

print(vocabulary)
print(np.matrix(doc_term_matrix_tfidf_l2))

In [None]:
# Calculating plagiarism percentage using cosine similarity
# Compare every unordered pair of documents once and report their cosine
# similarity as a whole-number percentage.
for i in range(len(docFiles)):
    for j in range(i + 1, len(docFiles)):
        distance = nltk.cluster.util.cosine_distance(
            doc_term_matrix_tfidf_l2[i], doc_term_matrix_tfidf_l2[j])
        print('\nComparison between %s and %s: '%(docNames[i],docNames[j]))

        similarity = 1 - distance
        if math.isnan(similarity):
            # A NaN comes from a vector with zero length; treat it as sharing
            # nothing rather than crashing in int().
            similarity = 0.0
        # Floating-point rounding can push the value just outside [0, 1]
        # (e.g. 1.0000000000000002 for identical documents); clamp it so the
        # percentage stays within 0..100.
        similarity = min(1.0, max(0.0, similarity))

        # round() instead of truncation: 0.9999999999999998 must read as 100 %.
        plagiarism = int(round(similarity * 100))
        print('plagiarism=%s %%\n'% plagiarism)