In [1]:
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.metrics.pairwise import cosine_similarity


The `cosine_similarity` function from the `sklearn.metrics.pairwise` module computes the cosine similarity between two sets of vectors. Cosine similarity measures how alike two non-zero vectors are by taking the cosine of the angle between them. In general it ranges from -1 (pointing in opposite directions) to 1 (pointing in the same direction), with 0 indicating orthogonality, i.e. no similarity. TF-IDF vectors have no negative components, so for the documents in this notebook every score falls between 0 and 1.

In [4]:
# Toy corpus for the plagiarism check: a mapping of document name -> raw text.
# Insertion order is preserved, so doc1..doc4 is also the row/column order of
# anything derived from this mapping.
doc = dict(
    doc1="Artificial intelligence is transforming the world by automating tasks.",
    doc2="AI is revolutionizing the globe by automating repetitive tasks.",
    doc3="Deep learning is a subset of artificial intelligence that is highly popular.",
    doc4="Quantum computing is a revolutionary field unrelated to AI.",
)

In [6]:
# Echo the corpus mapping to confirm the names and texts before processing.
doc

{'doc1': 'Artificial intelligence is transforming the world by automating tasks.',
 'doc2': 'AI is revolutionizing the globe by automating repetitive tasks.',
 'doc3': 'Deep learning is a subset of artificial intelligence that is highly popular.',
 'doc4': 'Quantum computing is a revolutionary field unrelated to AI.'}

In [8]:
# Step 1: Collect the document names (the dictionary keys, in insertion order).
# These names are used later to label the rows/columns of the similarity matrix.

doc_names = list(doc)
doc_names

['doc1', 'doc2', 'doc3', 'doc4']

In [10]:
# Collect the document texts in the same order as doc_names, so that index k
# in doc_contents always belongs to doc_names[k].
doc_contents = [doc[name] for name in doc_names]
doc_contents

['Artificial intelligence is transforming the world by automating tasks.',
 'AI is revolutionizing the globe by automating repetitive tasks.',
 'Deep learning is a subset of artificial intelligence that is highly popular.',
 'Quantum computing is a revolutionary field unrelated to AI.']

In [12]:
# Step 2: Vectorize documents using TF-IDF

# stop_words="english" makes the vectorizer use scikit-learn's built-in
# English stop-word list when building the vocabulary.
vect = TfidfVectorizer(stop_words="english")

# Learn the vocabulary/IDF weights from the corpus and encode it in one pass;
# the resulting matrix has one row per entry of doc_contents.
tmatrix = vect.fit_transform(raw_documents=doc_contents)

In [14]:
# Step 3: Compute cosine similarity between documents

# With a single argument, cosine_similarity compares every row of the matrix
# against every other row of that same matrix (Y defaults to X).
cosim = cosine_similarity(tmatrix)

In [16]:
# Show the full pairwise similarity matrix; rows and columns follow the order
# of doc_names, and each diagonal entry is a document compared with itself.
cosim

array([[1.        , 0.2661073 , 0.23490118, 0.        ],
       [0.2661073 , 1.        , 0.        , 0.11886245],
       [0.23490118, 0.        , 1.        , 0.        ],
       [0.        , 0.11886245, 0.        , 1.        ]])

In [18]:
# Step 4: Display similarity scores

# List every ordered pair of distinct documents with its score. Both (a, b)
# and (b, a) are printed, mirroring the full matrix minus its diagonal.
print("Plagiarism Similarity Matrix (Cosine Similarity):")
for row, name_a in enumerate(doc_names):
    for col, name_b in enumerate(doc_names):
        if row == col:
            continue  # skip the diagonal (a document compared with itself)
        print(f"Similarity between {name_a} and {name_b}: {cosim[row, col]:.3f}")

Plagiarism Similarity Matrix (Cosine Similarity):
Similarity between doc1 and doc2: 0.266
Similarity between doc1 and doc3: 0.235
Similarity between doc1 and doc4: 0.000
Similarity between doc2 and doc1: 0.266
Similarity between doc2 and doc3: 0.000
Similarity between doc2 and doc4: 0.119
Similarity between doc3 and doc1: 0.235
Similarity between doc3 and doc2: 0.000
Similarity between doc3 and doc4: 0.000
Similarity between doc4 and doc1: 0.000
Similarity between doc4 and doc2: 0.119
Similarity between doc4 and doc3: 0.000


In [20]:
# Step 5: Thresholding for plagiarism detection

# Cut-off applied to the cosine similarity scores when reporting suspicious
# pairs. 0.2 is a hand-picked value for this toy corpus; tune it for real data.
threshold = 0.2

In [22]:
# Report each unordered pair of documents whose similarity is strictly above
# the threshold. Only the upper triangle (second index > first index) is
# scanned, so every pair is reported at most once.
print("\nPotential Plagiarism cases:")
for row, name_a in enumerate(doc_names):
    for col in range(row + 1, len(doc_names)):
        score = cosim[row, col]
        if score > threshold:
            print(f"Plagiarism detected between {name_a} and {doc_names[col]} with similarity {score:.2f}")


Potential Plagiarism cases:
Plagiarism detected between doc1 and doc2 with similarity 0.27
Plagiarism detected between doc1 and doc3 with similarity 0.23
