In [None]:
import heapq
from collections import defaultdict
import re
import string
import nltk
from nltk.tokenize import sent_tokenize
import numpy as np
from google.colab import drive


In [None]:

# sent_tokenize needs the Punkt sentence-tokenizer data. Recent NLTK releases
# load it from the 'punkt_tab' package while older ones use 'punkt', so request
# both; a package the installed version does not know is reported, not raised.
nltk.download('punkt')
nltk.download('punkt_tab')

# Mount Google Drive so the test documents under /content/drive are readable.
drive.mount('/content/drive')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Text Preprocessing
def preprocess_text(text):
    """Split raw text into lowercase, punctuation-free sentences.

    Args:
        text: The full document as a single string.

    Returns:
        A list of cleaned sentence strings in document order. Sentences that
        contain nothing but punctuation/whitespace are dropped, because they
        carry no words to compare.
    """
    cleaned_sentences = []
    # Tokenize BEFORE lowercasing: the Punkt tokenizer uses capitalization of
    # the following word as a cue when deciding whether a period ends a sentence.
    for sentence in sent_tokenize(text):
        cleaned = remove_punctuation(sentence.lower())
        # Skip sentences that became empty after punctuation removal.
        if cleaned.strip():
            cleaned_sentences.append(cleaned)
    return cleaned_sentences

def remove_punctuation(sentence):
    """Return `sentence` with ASCII punctuation deleted and newlines turned into spaces."""
    # Three-argument maketrans: map "\n" -> " ", and delete every character
    # listed in string.punctuation.
    translation_table = str.maketrans("\n", " ", string.punctuation)
    stripped = sentence.translate(translation_table)
    return stripped

def read_file(file_path, encoding="utf-8"):
    """Read a text file and return its entire content as one string.

    Args:
        file_path: Path of the file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            the result does not depend on the platform's locale encoding.

    Returns:
        The decoded file content.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the bytes are not valid in `encoding`.
    """
    with open(file_path, "r", encoding=encoding) as file:
        return file.read()


In [None]:
# State Representation and A* Search
class Node:
    """Search node: a state, a link to its predecessor, and A* cost fields.

    Attributes:
        state: The state tuple this node represents.
        parent: The node this one was expanded from, or None for the root.
        g: Accumulated path cost (initialised to 0).
        h: Heuristic estimate (initialised to 0).
        f: Priority value used for ordering (initialised to 0).
    """

    def __init__(self, state, parent=None):
        self.state, self.parent = state, parent
        # All cost fields start at zero; the search fills them in later.
        self.g = self.h = self.f = 0

    def __lt__(self, other):
        """Order nodes by `f` so heapq pops the lowest-priority value first."""
        return other.f > self.f

def get_successors(node, doc1_len, doc2_len):
    """Build the child nodes reachable from `node` in one move.

    Each move is (step in doc1, step in doc2, move tag): tag 0 advances both
    indices, tag 1 advances only the doc2 index, tag 2 advances only the doc1
    index. Moves that would push an index past its document length are dropped.

    Args:
        node: Node whose `state` starts with (doc1 index, doc2 index).
        doc1_len: Upper bound for the doc1 index.
        doc2_len: Upper bound for the doc2 index.

    Returns:
        A list of new Node objects whose parent is `node`.
    """
    pos1, pos2 = node.state[0], node.state[1]
    children = []
    for step1, step2, move_tag in ((1, 1, 0), (0, 1, 1), (1, 0, 2)):
        next1, next2 = pos1 + step1, pos2 + step2
        if next1 > doc1_len or next2 > doc2_len:
            continue  # would run past the end of a document
        children.append(Node((next1, next2, move_tag), node))
    return children

In [None]:
# semantic distance calculation
def semantic_distance(sent1, sent2):
    """Word-overlap distance between two sentences, in the range [0, 1].

    The distance is 1 minus (number of shared distinct words divided by the
    larger of the two distinct-word counts). Words are whitespace-separated
    tokens.

    Args:
        sent1: First sentence as a string.
        sent2: Second sentence as a string.

    Returns:
        0.0 when the word sets are identical and non-empty, 1.0 when they share
        no words. When neither sentence contains any word the result is 1.0:
        there is no lexical evidence of similarity, and this avoids dividing
        by zero.
    """
    words1 = set(sent1.split())
    words2 = set(sent2.split())
    larger_vocab = max(len(words1), len(words2))
    if larger_vocab == 0:
        # Both sentences are empty/whitespace-only.
        return 1.0
    return 1 - len(words1 & words2) / larger_vocab

# heuristic function
def heuristic(state, doc1, doc2):
    """Estimate remaining cost as the gap between unconsumed sentence counts.

    Args:
        state: Tuple whose first two items are the doc1 and doc2 indices.
        doc1: Sequence of sentences of the first document.
        doc2: Sequence of sentences of the second document.

    Returns:
        The absolute difference between the number of sentences left in doc1
        and the number left in doc2.
    """
    left_in_doc1 = len(doc1) - state[0]
    left_in_doc2 = len(doc2) - state[1]
    gap = left_in_doc1 - left_in_doc2
    return abs(gap)


# A* search
def a_star_search(doc1, doc2):
    """Align two sentence lists with a best-first (A*-style) search.

    States are (doc1 index, doc2 index, move tag) tuples. The search starts at
    (0, 0, 0) and stops as soon as a popped node has consumed every sentence
    of both documents, whatever its move tag.

    Args:
        doc1: List of sentence strings of the first document.
        doc2: List of sentence strings of the second document.

    Returns:
        The list of states from the start state to the goal, in order, or None
        if the open list empties without reaching the goal.

    Note:
        The cost of every move, including the single-document advances, is the
        semantic distance between the sentences at the successor's indices, so
        it can be lower than the heuristic's estimate. The returned alignment
        is therefore not guaranteed to be the cheapest one.
    """
    start_state = (0, 0, 0)
    goal_position = (len(doc1), len(doc2))
    open_list = [Node(start_state)]
    closed_set = set()

    while open_list:
        current_node = heapq.heappop(open_list)

        if current_node.state[:2] == goal_position:
            # Walk the parent links back to the root, then reverse.
            path = []
            while current_node is not None:
                path.append(current_node.state)
                current_node = current_node.parent
            return path[::-1]

        # Lazy deletion: the same state can be pushed several times before it
        # is first popped. Only the first (cheapest) copy needs expanding;
        # re-expanding stale copies just floods the heap with duplicates.
        if current_node.state in closed_set:
            continue
        closed_set.add(current_node.state)

        for successor in get_successors(current_node, len(doc1), len(doc2)):
            if successor.state in closed_set:
                continue

            idx1, idx2 = successor.state[0], successor.state[1]
            # Index 0 means no sentence of that document has been consumed yet,
            # so compare against an empty sentence.
            successor.g = current_node.g + semantic_distance(
                doc1[idx1 - 1] if idx1 > 0 else "",
                doc2[idx2 - 1] if idx2 > 0 else ""
            )
            successor.h = heuristic(successor.state, doc1, doc2)
            successor.f = successor.g + successor.h

            heapq.heappush(open_list, successor)

    return None


In [None]:
# Plagiarism Detection with threshold 0.2
def detect_plagiarism(doc1, doc2, threshold=0.2):
    """Flag aligned sentence pairs whose word overlap exceeds a threshold.

    Args:
        doc1: List of sentence strings of the first document.
        doc2: List of sentence strings of the second document.
        threshold: A pair is reported only when its similarity
            (1 - semantic distance) is strictly greater than this value.

    Returns:
        A list of (doc1 index, doc2 index, similarity) tuples with 0-based
        sentence indices, in alignment order. Empty when the search yields no
        alignment.
    """
    alignment = a_star_search(doc1, doc2)
    if not alignment:
        # The search returns None when it finds no path; report nothing.
        return []

    plagiarism_detected = []
    # Skip the first entry: it is the start state, not a move.
    for pos1, pos2, move_tag in alignment[1:]:
        if move_tag != 0:
            continue  # only tag 0 pairs a doc1 sentence with a doc2 sentence
        similarity = 1 - semantic_distance(doc1[pos1 - 1], doc2[pos2 - 1])
        if similarity > threshold:
            plagiarism_detected.append((pos1 - 1, pos2 - 1, similarity))

    return plagiarism_detected

In [None]:
# Shell escape: list the Test_1 folder under the Drive mount point to check that the input documents are there.
!ls /content/drive/MyDrive/Test/Test_1

doc1.txt  doc2.txt


In [None]:
# Preprocessing: read both Test_1 documents and turn each into cleaned sentences
doc1 = preprocess_text(read_file("/content/drive/MyDrive/Test/Test_1/doc1.txt"))
doc2 = preprocess_text(read_file("/content/drive/MyDrive/Test/Test_1/doc2.txt"))

# Plagiarism detection over the two sentence lists
plagiarism_results = detect_plagiarism(doc1, doc2)

# Report each flagged pair (1-based sentence numbers) followed by a blank line
print("Plagiarism Detection Results:")
for i, j, similarity in plagiarism_results:
    print(
        f"Document 1, Sentence {i+1}: {doc1[i]}",
        f"Document 2, Sentence {j+1}: {doc2[j]}",
        f"Similarity: {similarity:.2f}",
        "",
        sep="\n",
    )

# Summary counts
print(
    f"Total sentences in Document 1: {len(doc1)}",
    f"Total sentences in Document 2: {len(doc2)}",
    f"Number of potentially plagiarized sentences: {len(plagiarism_results)}",
    sep="\n",
)

Plagiarism Detection Results:
Document 1, Sentence 1: artificial intelligence is revolutionizing industries
Document 2, Sentence 1: artificial intelligence is revolutionizing industries
Similarity: 1.00

Document 1, Sentence 2: machine learning algorithms can process vast amounts of data
Document 2, Sentence 2: machine learning algorithms can process vast amounts of data
Similarity: 1.00

Document 1, Sentence 3: neural networks mimic the human brains structure
Document 2, Sentence 3: neural networks mimic the human brains structure
Similarity: 1.00

Total sentences in Document 1: 3
Total sentences in Document 2: 3
Number of potentially plagiarized sentences: 3


In [None]:
# Preprocessing: read both Test_2 documents and turn each into cleaned sentences
doc1 = preprocess_text(read_file("/content/drive/MyDrive/Test/Test_2/doc1.txt"))
doc2 = preprocess_text(read_file("/content/drive/MyDrive/Test/Test_2/doc2.txt"))

# Plagiarism detection over the two sentence lists
plagiarism_results = detect_plagiarism(doc1, doc2)

# Report each flagged pair (1-based sentence numbers) followed by a blank line
print("Plagiarism Detection Results:")
for i, j, similarity in plagiarism_results:
    print(
        f"Document 1, Sentence {i+1}: {doc1[i]}",
        f"Document 2, Sentence {j+1}: {doc2[j]}",
        f"Similarity: {similarity:.2f}",
        "",
        sep="\n",
    )

# Summary counts
print(
    f"Total sentences in Document 1: {len(doc1)}",
    f"Total sentences in Document 2: {len(doc2)}",
    f"Number of potentially plagiarized sentences: {len(plagiarism_results)}",
    sep="\n",
)

Plagiarism Detection Results:
Document 1, Sentence 1: quantum computation utilizes quantum physics concepts
Document 2, Sentence 1: quantum computation utilizes quantum physics concepts
Similarity: 1.00

Document 1, Sentence 2: quantum bits can be in various states at once
Document 2, Sentence 2: quantum bits can be in various states at once
Similarity: 1.00

Document 1, Sentence 3: this innovation may tackle intricate issues at unprecedented speeds
Document 2, Sentence 3: this innovation may tackle intricate issues at unprecedented speeds
Similarity: 1.00

Total sentences in Document 1: 3
Total sentences in Document 2: 3
Number of potentially plagiarized sentences: 3


In [None]:
# Preprocessing: read both Test_3 documents and turn each into cleaned sentences
doc1 = preprocess_text(read_file("/content/drive/MyDrive/Test/Test_3/doc1.txt"))
doc2 = preprocess_text(read_file("/content/drive/MyDrive/Test/Test_3/doc2.txt"))

# Plagiarism detection over the two sentence lists
plagiarism_results = detect_plagiarism(doc1, doc2)

# Report each flagged pair (1-based sentence numbers) followed by a blank line
print("Plagiarism Detection Results:")
for i, j, similarity in plagiarism_results:
    print(
        f"Document 1, Sentence {i+1}: {doc1[i]}",
        f"Document 2, Sentence {j+1}: {doc2[j]}",
        f"Similarity: {similarity:.2f}",
        "",
        sep="\n",
    )

# Summary counts
print(
    f"Total sentences in Document 1: {len(doc1)}",
    f"Total sentences in Document 2: {len(doc2)}",
    f"Number of potentially plagiarized sentences: {len(plagiarism_results)}",
    sep="\n",
)


Plagiarism Detection Results:
Total sentences in Document 1: 3
Total sentences in Document 2: 3
Number of potentially plagiarized sentences: 0


In [None]:
# Preprocessing: read both Test_4 documents and turn each into cleaned sentences
doc1 = preprocess_text(read_file("/content/drive/MyDrive/Test/Test_4/doc1.txt"))
doc2 = preprocess_text(read_file("/content/drive/MyDrive/Test/Test_4/doc2.txt"))

# Plagiarism detection over the two sentence lists
plagiarism_results = detect_plagiarism(doc1, doc2)

# Report each flagged pair (1-based sentence numbers) followed by a blank line
print("Plagiarism Detection Results:")
for i, j, similarity in plagiarism_results:
    print(
        f"Document 1, Sentence {i+1}: {doc1[i]}",
        f"Document 2, Sentence {j+1}: {doc2[j]}",
        f"Similarity: {similarity:.2f}",
        "",
        sep="\n",
    )

# Summary counts
print(
    f"Total sentences in Document 1: {len(doc1)}",
    f"Total sentences in Document 2: {len(doc2)}",
    f"Number of potentially plagiarized sentences: {len(plagiarism_results)}",
    sep="\n",
)

Plagiarism Detection Results:
Document 1, Sentence 1: natural language processing nlp allows computers to understand human language
Document 2, Sentence 1: natural language processing nlp allows computers to understand human language
Similarity: 1.00

Total sentences in Document 1: 3
Total sentences in Document 2: 3
Number of potentially plagiarized sentences: 1
