In [25]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.util import ngrams
from nltk.tokenize import sent_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer, util

In [26]:
# Fetch every NLTK resource the notebook relies on so that a fresh kernel /
# fresh machine can run top to bottom:
#   - punkt / punkt_tab: sentence tokenizer models used by sent_tokenize
#   - stopwords: corpus read by stopwords.words() just below (without this
#     download the lookup raises LookupError on a clean environment)
nltk.download("punkt")
nltk.download('punkt_tab')
nltk.download("stopwords")

# English stop words as a set for O(1) membership tests during tokenization.
stop_words = set(stopwords.words("english"))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [27]:

def clean_tokenize_text(text):
    """Normalize ``text`` and return its non-stop-word tokens.

    The text is lower-cased, every character that is not ``a-z`` or
    whitespace is deleted (not replaced by a space), and the result is split
    on whitespace. Tokens found in the module-level ``stop_words`` set are
    dropped.

    Args:
        text: Raw input string.

    Returns:
        list[str]: Remaining tokens, in their original order.
    """
    lowered = text.lower()
    letters_only = re.sub(r'[^a-z\s]', '', lowered)

    kept = []
    for word in letters_only.split():
        if word not in stop_words:
            kept.append(word)
    return kept

def jaccard_similarity(list1, list2):
    """Return the Jaccard index of two collections, treated as sets.

    Args:
        list1: First iterable of hashable items.
        list2: Second iterable of hashable items.

    Returns:
        ``|A ∩ B| / |A ∪ B|`` as a float, or ``0`` when both inputs are empty.
    """
    first, second = set(list1), set(list2)
    union = first | second
    if not union:
        # Nothing to compare; avoid dividing by zero.
        return 0
    return len(first & second) / len(union)

def ngram_jaccard(text1, text2, n=2):
    """Jaccard index over the word n-grams of two texts.

    Both texts are tokenized with ``clean_tokenize_text`` before the n-grams
    are built.

    Args:
        text1: First raw text.
        text2: Second raw text.
        n: N-gram size (defaults to bigrams).

    Returns:
        Shared n-grams divided by distinct n-grams across both texts, or ``0``
        when neither text produces any n-gram.
    """
    grams_a = set(ngrams(clean_tokenize_text(text1), n))
    grams_b = set(ngrams(clean_tokenize_text(text2), n))

    combined = grams_a | grams_b
    if not combined:
        return 0
    return len(grams_a & grams_b) / len(combined)

def ngram_cosine(text1, text2, n=2):
    """TF-IDF cosine similarity over the word n-grams of two texts.

    Each text is tokenized with ``clean_tokenize_text``; its n-grams are
    joined with ``_`` so that every n-gram becomes a single vocabulary term
    for ``TfidfVectorizer``.

    Args:
        text1: First raw text.
        text2: Second raw text.
        n: N-gram size (defaults to bigrams).

    Returns:
        Cosine similarity between the two TF-IDF vectors. Returns ``0.0``
        when either text yields no n-grams (e.g. it has fewer than ``n``
        tokens after cleaning).
    """
    def to_ngram_string(text, n):
        toks = clean_tokenize_text(text)
        return ["_".join(ng) for ng in ngrams(toks, n)]

    docs = [" ".join(to_ngram_string(text1, n)), " ".join(to_ngram_string(text2, n))]

    # A text without n-grams shares nothing with the other one. Returning
    # early also avoids TfidfVectorizer raising ValueError ("empty
    # vocabulary") when both documents are empty.
    if not docs[0] or not docs[1]:
        return 0.0

    vec = TfidfVectorizer()
    tfidf_matrix = vec.fit_transform(docs)
    return cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0]

def semantic_similarity(text1, text2, model):
    """Cosine similarity between the embeddings of two texts.

    Args:
        text1: First text to embed.
        text2: Second text to embed.
        model: Object exposing ``encode(text, convert_to_tensor=True)``
            (the notebook passes a ``SentenceTransformer``).

    Returns:
        float: The single cosine-similarity value as a Python scalar.
    """
    # Encode text1 first, then text2 — one encode call per text.
    first_vec, second_vec = (
        model.encode(passage, convert_to_tensor=True) for passage in (text1, text2)
    )
    similarity = util.cos_sim(first_vec, second_vec)
    return similarity.item()

def sentence_level_check(text1, text2, model, threshold=0.75):
    """Find sentence pairs across two texts whose embeddings are very similar.

    Both texts are split with ``sent_tokenize``. Each text's sentences are
    embedded with a single ``model.encode`` call and all pairwise cosine
    similarities come from one ``util.cos_sim`` call, so the model runs once
    per text rather than twice per sentence pair.

    Args:
        text1: First raw text.
        text2: Second raw text.
        model: Embedding model exposing ``encode(sentences,
            convert_to_tensor=True)`` for a list of strings (the notebook
            passes a ``SentenceTransformer``).
        threshold: A pair is flagged when its similarity is strictly greater
            than this value.

    Returns:
        list[tuple[str, str, float]]: ``(sentence_from_text1,
        sentence_from_text2, similarity)`` for each flagged pair, ordered by
        position in ``text1`` and then by position in ``text2``.
    """
    sents1 = sent_tokenize(text1)
    sents2 = sent_tokenize(text2)

    # No sentences on either side means no pairs; skip the model entirely.
    if not sents1 or not sents2:
        return []

    emb1 = model.encode(sents1, convert_to_tensor=True)
    emb2 = model.encode(sents2, convert_to_tensor=True)
    # sim_matrix[i][j] is the similarity of sents1[i] and sents2[j].
    sim_matrix = util.cos_sim(emb1, emb2)

    flagged = []
    for i, s1 in enumerate(sents1):
        for j, s2 in enumerate(sents2):
            sim = sim_matrix[i][j].item()
            if sim > threshold:
                flagged.append((s1, s2, sim))
    return flagged

In [None]:
# Load the two documents to compare. Forward slashes are used on purpose:
# in a normal string literal "data\text1.txt" contains the escape "\t" (TAB),
# which silently produces the wrong file name. "/" works on every platform.
with open("data/text1.txt", "r", encoding="utf-8") as f:
    text1 = f.read()
with open("data/text2.txt", "r", encoding="utf-8") as f:
    text2 = f.read()

tokens_1 = clean_tokenize_text(text1)
tokens_2 = clean_tokenize_text(text2)

model = SentenceTransformer("all-MiniLM-L6-v2")

jaccard_score = jaccard_similarity(tokens_1, tokens_2)

# Fit the unigram TF-IDF model once and reuse the matrix for both rows.
unigram_tfidf = TfidfVectorizer().fit_transform([' '.join(tokens_1), ' '.join(tokens_2)])
cosine_sim = cosine_similarity(unigram_tfidf[0:1], unigram_tfidf[1:2])[0][0]

sem_score = semantic_similarity(text1, text2, model)
bigram_jaccard = ngram_jaccard(text1, text2, n=2)
bigram_cosine = ngram_cosine(text1, text2, n=2)

print(f"Jaccard (unigram): {jaccard_score:.4f}")
print(f"Cosine (TF-IDF): {cosine_sim:.4f}")
print(f"Semantic Similarity: {sem_score:.4f}")
print(f"Bigram Jaccard: {bigram_jaccard:.4f}")
print(f"Bigram Cosine: {bigram_cosine:.4f}")


def plagiarism_score(jaccard, cosine, semantic, bigram_j, bigram_c):
    """Blend the five similarity metrics into one weighted score.

    Weights: semantic 0.45, cosine 0.20, jaccard 0.15, bigram_j 0.10,
    bigram_c 0.10.

    Args:
        jaccard: Unigram Jaccard similarity.
        cosine: Unigram TF-IDF cosine similarity.
        semantic: Embedding-based similarity.
        bigram_j: Bigram Jaccard similarity.
        bigram_c: Bigram TF-IDF cosine similarity.

    Returns:
        The weighted sum of the five inputs.
    """
    # Accumulate in a fixed order (semantic, cosine, jaccard, bigram_j,
    # bigram_c) so floating-point rounding is deterministic.
    total = 0.45 * semantic
    total = total + 0.20 * cosine
    total = total + 0.15 * jaccard
    total = total + 0.10 * bigram_j
    total = total + 0.10 * bigram_c
    return total

# Combine the five metrics computed above into the overall score.
score = plagiarism_score(
    jaccard=jaccard_score,
    cosine=cosine_sim,
    semantic=sem_score,
    bigram_j=bigram_jaccard,
    bigram_c=bigram_cosine,
)
print(f"\nFinal Plagiarism Score: {score:.4f}")


Jaccard (unigram): 0.1402
Cosine (TF-IDF): 0.3318
Semantic Similarity: 0.8852
Bigram Jaccard: 0.0157
Bigram Cosine: 0.0159

Final Plagiarism Score: 0.4889


In [29]:
# Turn the metrics into a single verdict, checked in priority order:
#   1. heavy lexical overlap or a very high blended score -> copy-paste
#   2. high semantic similarity with a mid-range score    -> paraphrase
#   3. low blended score                                  -> original
#   4. anything else                                      -> manual review
if jaccard_score > 0.6 or bigram_jaccard > 0.5 or score > 0.75:
    verdict = "❌ Copy-Paste Plagiarism"
elif sem_score > 0.75 and 0.4 < score <= 0.75:
    verdict = "⚠️ Paraphrased Plagiarism"
elif score <= 0.4:
    verdict = "✅ Original"
else:
    verdict = "⚠️ Borderline Case – Needs Manual Review"
print(verdict)

# List every cross-document sentence pair whose similarity exceeds 0.75.
print("\nSentence-level suspicious matches:")
flagged = sentence_level_check(text1, text2, model, threshold=0.75)
if not flagged:
    print("No highly similar sentences found.")
else:
    for s1, s2, sim in flagged:
        report = f"\nText1: {s1}\nText2: {s2}\nSimilarity: {sim:.2f}"
        print(report)


⚠️ Paraphrased Plagiarism

Sentence-level suspicious matches:

Text1: Artificial Intelligence (AI) has become one of the most influential technologies of the 21st century, reshaping industries and redefining the way people work, learn, and communicate.
Text2: Artificial Intelligence (AI) is emerging as a defining technology of our time, influencing industries and altering the ways humans interact, study, and perform tasks.
Similarity: 0.87

Text1: At its core, AI refers to the simulation of human intelligence in machines that are designed to think, reason, and act in ways similar to humans.
Text2: Simply put, AI involves creating machines that can mimic human thought processes and behaviors.
Similarity: 0.85

Text1: The growth of AI has been largely driven by advancements in machine learning, where algorithms are trained on vast amounts of data and continuously improve their accuracy without explicit programming.
Text2: The rise of AI has been fueled by machine learning techniques, whe