In [11]:
!pip install nltk
import nltk
nltk.download('stopwords')



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [12]:
#import libraries

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.util import ngrams
from nltk import FreqDist
from collections import Counter
import string


In [13]:
# preprocessing the text
#   Text preprocessing is crucial in plagiarism detection. The goal is to standardize the text and remove noise, ensuring that comparisons focus on meaningful content

def preprocess_text(text):
    """Turn raw text into a list of normalized content tokens.

    The text is split with ``word_tokenize``; each token is lowercased,
    tokens that are not purely alphanumeric (e.g. punctuation) are dropped,
    and English stopwords are removed.

    Args:
        text: The raw input string.

    Returns:
        A list of lowercase alphanumeric tokens, in their original order,
        with English stopwords removed.
    """
    raw_tokens = word_tokenize(text)
    english_stopwords = set(stopwords.words('english'))

    cleaned = []
    for raw_token in raw_tokens:
        lowered = raw_token.lower()
        # Keep only alphanumeric tokens that are not stopwords.
        if lowered.isalnum() and lowered not in english_stopwords:
            cleaned.append(lowered)
    return cleaned


In [14]:
#Create N-Grams

# N-grams are continuous sequences of n items from a given sample of text. They are useful in plagiarism detection because they capture the structure and order of the text.
# Common choices are bigrams (2 words) or trigrams (3 words).

def generate_ngrams(tokens, n=3):
    """Build space-joined n-grams from a sequence of string tokens.

    Args:
        tokens: Sequence of string tokens.
        n: Number of consecutive tokens per n-gram (default 3).

    Returns:
        A list of strings, each the n consecutive tokens of one n-gram
        joined by single spaces, in order of appearance.
    """
    joined = []
    for gram in ngrams(tokens, n):
        joined.append(' '.join(gram))
    return joined


In [15]:
# Compare Texts

def compare_texts(text1, text2, n=3):
    """Score how many distinct n-grams two texts share.

    Both texts are preprocessed with ``preprocess_text`` and converted to
    n-grams with ``generate_ngrams``. The score is the number of distinct
    n-grams common to both texts divided by the larger of the two distinct
    n-gram counts.

    Args:
        text1: First raw text.
        text2: Second raw text.
        n: Size of the n-grams to compare (default 3).

    Returns:
        A float between 0.0 and 1.0. Returns 0.0 when neither text yields
        any n-gram (e.g. both have fewer than ``n`` tokens left after
        preprocessing), since there is nothing to compare.
    """
    # Work with sets on both sides of the ratio: the overlap is counted in
    # distinct n-grams, so the denominator must be too. Otherwise a text
    # that repeats an n-gram would score below 1.0 against itself.
    ngrams1 = set(generate_ngrams(preprocess_text(text1), n))
    ngrams2 = set(generate_ngrams(preprocess_text(text2), n))

    larger_count = max(len(ngrams1), len(ngrams2))
    if larger_count == 0:
        # Avoid dividing by zero for empty or very short inputs.
        return 0.0

    return len(ngrams1 & ngrams2) / larger_count

# Note: The compare_texts function returns a similarity score between 0 and 1. A score close to 1 indicates high similarity, while a score close to 0 indicates low similarity.

In [16]:
# Usage of the function

# Example 1: two differently worded sentences on the same topic

text1 = """Natural Language Processing is a field of artificial intelligence that focuses on the interaction between computers and humans through natural language."""
text2 = """Natural Language Processing is an area of artificial intelligence concerned with the interaction between computers and humans in their natural language."""

similarity_score = compare_texts(text1, text2)
print(f"Similarity Score: {similarity_score}")

# Scores above the 0.5 cut-off are reported as potential plagiarism.
print(
    "Potential plagiarism detected!"
    if similarity_score > 0.5
    else "Texts are sufficiently different."
)


Similarity Score: 0.4
Texts are sufficiently different.


In [18]:
# Example 2: two identical texts

text1 = """Natural Language Processing is a field of artificial intelligence that focuses on the interaction between computers and humans through natural language."""
text2 = """Natural Language Processing is a field of artificial intelligence that focuses on the interaction between computers and humans through natural language."""

similarity_score = compare_texts(text1, text2)
print(f"Similarity Score: {similarity_score}")

# Scores above the 0.5 cut-off are reported as potential plagiarism.
print(
    "Potential plagiarism detected!"
    if similarity_score > 0.5
    else "Texts are sufficiently different."
)


Similarity Score: 1.0
Potential plagiarism detected!


Observations:

Explanation of the Process-
  - Preprocessing: Cleans the text by tokenizing, normalizing case, and removing punctuation and stopwords. This strips out noise so that the comparison focuses on content-bearing words.
  
  - N-Grams Creation: Transforms the cleaned text into sequences of n-grams. This step captures the structure of the text, which is crucial for detecting similarity beyond just common words.
  - Comparison: Compares the n-grams of two texts and calculates the similarity based on the proportion of shared n-grams.
  - Threshold-Based Decision: Interprets the similarity score to decide if plagiarism is likely based on a predefined threshold.

Limitations
  - Threshold Selection: The threshold for determining plagiarism is subjective and may vary depending on the application.
  - Paraphrasing: The system might not detect sophisticated paraphrasing, as it relies heavily on exact or near-exact matches.
  - Synonyms: The system does not account for synonyms, which could result in missed plagiarism detection.

Potential Enhancements
  - Semantic Analysis: Incorporating word embeddings or other semantic techniques to detect similarity in meaning, not just exact wording.
  - Machine Learning Models: Using supervised learning models trained on known cases of plagiarism to improve detection.
