## 1. Import Libraries

In [1]:
import re
from typing import List

## 2. Sentence Preprocessing

In [2]:
def preprocess_sentences(text: str) -> List[str]:
    """Split ``text`` into normalized, non-empty sentences.

    The text is split after every ``.``, ``!`` or ``?`` (together with any
    whitespace that follows it). Each fragment is lower-cased, stripped of
    every character that is not ``a-z``, ``0-9`` or whitespace, and trimmed.

    Fragments that are empty *after* cleaning (for example a fragment made
    only of punctuation such as ``---``) are dropped, so the result never
    contains empty strings.

    Args:
        text: Raw document text.

    Returns:
        The cleaned sentences, in their original order.
    """
    cleaned_sentences = []
    for fragment in re.split(r'[.!?]\s*', text.strip()):
        sentence = re.sub(r'[^a-z0-9\s]', '', fragment.lower()).strip()
        # Filter on the cleaned text: a punctuation-only fragment is non-empty
        # before cleaning but empty afterwards, and must not be kept as "".
        if sentence:
            cleaned_sentences.append(sentence)
    return cleaned_sentences

## 3. Word Preprocessing

In [3]:
def preprocess_words(text: str) -> List[str]:
    """Tokenize ``text`` into lower-case words.

    The text is lower-cased, every character other than ``a-z``, ``0-9`` or
    whitespace is removed (not replaced by a space), and the remainder is
    split on runs of whitespace.

    Args:
        text: Raw text to tokenize.

    Returns:
        The list of word tokens; empty when no characters survive cleaning.
    """
    lowered = text.lower()
    # Removal happens before splitting, so "it's" becomes the single token "its".
    alphanumeric_only = re.compile(r'[^a-z0-9\s]').sub('', lowered)
    tokens = alphanumeric_only.split()
    return tokens

## 4. Levenshtein Distance

In [4]:
def levenshtein_distance(s1: str, s2: str) -> int:
    """Return the Levenshtein (edit) distance between ``s1`` and ``s2``.

    The distance is the minimum number of single-character insertions,
    deletions and substitutions needed to turn one string into the other.

    Only two rows of the dynamic-programming table are kept at a time, so
    memory use is proportional to the shorter string instead of to
    ``len(s1) * len(s2)``.

    Args:
        s1: First string.
        s2: Second string.

    Returns:
        The edit distance as a non-negative integer.
    """
    if s1 == s2:
        return 0

    # The distance is symmetric, so put the shorter string along the row to
    # keep the rolling rows as small as possible.
    if len(s1) < len(s2):
        s1, s2 = s2, s1
    if not s2:
        return len(s1)

    # previous_row[j] is the distance between the processed prefix of s1 and s2[:j].
    previous_row = list(range(len(s2) + 1))
    for i, char1 in enumerate(s1, start=1):
        current_row = [i]  # turning s1[:i] into "" takes i deletions
        for j, char2 in enumerate(s2, start=1):
            cost = 0 if char1 == char2 else 1
            current_row.append(min(
                previous_row[j] + 1,         # deletion
                current_row[j - 1] + 1,      # insertion
                previous_row[j - 1] + cost,  # substitution or match
            ))
        previous_row = current_row
    return previous_row[-1]

## 5. Sentence Similarity

In [5]:
def sentence_similarity(s1: str, s2: str) -> float:
    """Score how alike two strings are, based on their edit distance.

    The score is ``1 - distance / longest``, where ``distance`` comes from
    ``levenshtein_distance`` and ``longest`` is the length of the longer
    string. Two empty strings score ``1.0``.

    Args:
        s1: First string.
        s2: Second string.

    Returns:
        The similarity score as a float.
    """
    longest = max(len(s1), len(s2))
    # Both strings are empty exactly when the longer one has length 0.
    if longest == 0:
        return 1.0
    edits = levenshtein_distance(s1, s2)
    return 1 - (edits / longest)

## 6. Plagiarism Detection

In [6]:
def detect_plagiarism(doc1_text: str, doc2_text: str, threshold: float=0.7):
    """Print a sentence-level similarity report of ``doc1_text`` against ``doc2_text``.

    Both documents are split with ``preprocess_sentences``. Each sentence of
    document 1 is paired with the document 2 sentence that scores highest
    under ``sentence_similarity``. A pair whose score is at least
    ``threshold`` is labelled "Potential Plagiarism". The final line reports
    the share of document 1 sentences that were labelled.

    When document 1 yields no sentences the overall percentage is reported
    as 0.00% instead of raising ``ZeroDivisionError``.

    Args:
        doc1_text: Text of the document being checked.
        doc2_text: Text of the document it is compared against.
        threshold: Minimum similarity for a sentence to be flagged.

    Returns:
        None. The report is written to standard output.
    """
    doc1_sentences = preprocess_sentences(doc1_text)
    doc2_sentences = preprocess_sentences(doc2_text)

    matched_sentences = 0
    total_sentences = len(doc1_sentences)

    print("\nSentence Alignment and Similarity:\n")
    for i, s1 in enumerate(doc1_sentences):
        # The strict ">" keeps the first of several equally good matches and
        # leaves best_j at -1 when no sentence scores above 0.
        best_similarity = 0
        best_j = -1
        for j, s2 in enumerate(doc2_sentences):
            sim = sentence_similarity(s1, s2)
            if sim > best_similarity:
                best_similarity = sim
                best_j = j

        status = "Potential Plagiarism" if best_similarity >= threshold else ""
        if status:
            matched_sentences += 1

        best_match = doc2_sentences[best_j] if best_j >= 0 else '---'
        print(f"Doc1 [{i}]: {s1}")
        print(f"Doc2 [{best_j}]: {best_match}")
        print(f"Similarity: {best_similarity*100:.2f}% {status}")
        print("-"*60)

    # Document 1 can be empty (or punctuation only); nothing was flagged then,
    # so report 0% rather than dividing by zero.
    if total_sentences:
        plagiarism_percentage = (matched_sentences / total_sentences) * 100
    else:
        plagiarism_percentage = 0.0
    print(f"\nOverall Plagiarism Percentage: {plagiarism_percentage:.2f}%")

## 7. Example Execution

In [7]:
if __name__ == "__main__":
    # Demo input: two four-sentence documents that express the same ideas
    # with slightly different wording. The indentation on the continuation
    # lines is part of each string literal; preprocess_sentences swallows it
    # because it sits right after a sentence terminator.
    doc1 = """Artificial Intelligence is a branch of computer science.
    It deals with creating intelligent agents.
    Agents perceive their environment and take actions.
    AI includes machine learning and deep learning."""

    doc2 = """Artificial Intelligence is a field of computer science.
    It focuses on creating smart agents.
    Agents perceive environment and act accordingly.
    AI covers machine learning and deep learning."""

    # Print the report; sentences scoring at least 0.7 are flagged.
    detect_plagiarism(doc1, doc2, threshold=0.7)


Sentence Alignment and Similarity:

Doc1 [0]: artificial intelligence is a branch of computer science
Doc2 [0]: artificial intelligence is a field of computer science
Similarity: 89.09% Potential Plagiarism
------------------------------------------------------------
Doc1 [1]: it deals with creating intelligent agents
Doc2 [1]: it focuses on creating smart agents
Similarity: 53.66% 
------------------------------------------------------------
Doc1 [2]: agents perceive their environment and take actions
Doc2 [2]: agents perceive environment and act accordingly
Similarity: 66.00% 
------------------------------------------------------------
Doc1 [3]: ai includes machine learning and deep learning
Doc2 [3]: ai covers machine learning and deep learning
Similarity: 86.96% Potential Plagiarism
------------------------------------------------------------

Overall Plagiarism Percentage: 50.00%
