In [6]:
import nltk
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import BertTokenizer, BertModel
import torch

# Fetch the NLTK resources this notebook needs, in the same order as before.
for resource_name in ('punkt', 'stopwords', 'punkt_tab'):
    nltk.download(resource_name)
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ahmed\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ahmed\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\ahmed\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


In [15]:
# Text normalisation helper
def preprocess_text(text):
    """Lowercase ``text``, tokenize it, and keep only informative word tokens.

    A token survives when it is purely alphanumeric and is not in NLTK's
    English stop-word list.

    Args:
        text: The raw string to clean.

    Returns:
        The surviving tokens joined by single spaces.
    """
    english_stop_words = set(stopwords.words('english'))
    kept_tokens = []
    for token in word_tokenize(text.lower()):
        # Skip punctuation and other non-alphanumeric tokens.
        if not token.isalnum():
            continue
        # Skip common English function words.
        if token in english_stop_words:
            continue
        kept_tokens.append(token)
    return " ".join(kept_tokens)

# Example preprocessing: run one sentence through preprocess_text to show the
# effect of lowercasing plus stop-word / non-alphanumeric token removal.
# NOTE(review): none of the later cells visible here call preprocess_text, so
# the BERT pipeline operates on the raw sentences - confirm that is intended.
example_text = "This is a sample document used for testing plagiarism detection."
preprocessed_text = preprocess_text(example_text)
# Bare last expression so the notebook displays the cleaned string.
preprocessed_text

'sample document used testing plagiarism detection'

In [21]:
# Build a small in-memory corpus of source documents and split each into sentences.
# Each entry pairs a numeric 'doc_id' with its raw 'text'.
data = {
    'doc_id': [1, 2, 3],
    'text': [
        "This is a sample document used for testing plagiarism detection.",
        "This is a test document designed to evaluate plagiarism detection",
        "She opened the book and immediately got lost in the story's magical world."
    ]
}
df = pd.DataFrame(data)

# Tokenize documents into sentences: adds a 'sentences' column holding the
# result of sent_tokenize for each row. detect_plagiarism reads this column.
df['sentences'] = df['text'].apply(sent_tokenize)
# Preview the frame (doc_id, text, sentences).
df.head()

Unnamed: 0,doc_id,text,sentences
0,1,This is a sample document used for testing pla...,[This is a sample document used for testing pl...
1,2,This is a test document designed to evaluate p...,[This is a test document designed to evaluate ...
2,3,She opened the book and immediately got lost i...,[She opened the book and immediately got lost ...


In [22]:
# Load BERT model and tokenizer for the 'bert-base-uncased' checkpoint.
# Both objects are kept as module-level globals; get_bert_embedding reads them
# directly instead of taking them as arguments.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Function to get BERT embeddings for a sentence
def get_bert_embedding(text):
    """Embed ``text`` as the mean of BERT's last-layer token vectors.

    Uses the module-level ``tokenizer`` and ``model``. Input is truncated to at
    most 512 tokens. The mean is weighted by the attention mask, so padding
    positions (present when a batch of texts of unequal length is passed) do
    not dilute the average; for a single string the mask is all ones and the
    result is the plain mean over tokens.

    Args:
        text: A string (or a batch of strings accepted by the tokenizer).

    Returns:
        A ``torch.Tensor`` with one row per input text and one column per
        hidden unit, carrying no autograd history.
    """
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)

    # Inference only: skip autograd bookkeeping so no graph is built or retained.
    with torch.no_grad():
        outputs = model(**inputs)

    token_vectors = outputs.last_hidden_state
    # Broadcast the (batch, seq) mask over the hidden dimension.
    mask = inputs['attention_mask'].unsqueeze(-1).to(token_vectors.dtype)
    summed = (token_vectors * mask).sum(dim=1)
    # clamp guards against division by zero if a row had no attended tokens.
    token_counts = mask.sum(dim=1).clamp(min=1)
    return summed / token_counts

# Example usage: embed one sentence and inspect the resulting tensor shape.
sample_sentence = "Plagiarism detection using BERT model."
embedding = get_bert_embedding(sample_sentence)
# Bare last expression so the notebook displays the shape.
embedding.shape


torch.Size([1, 768])

In [23]:
# Compare document sentences and calculate similarity
from sklearn.metrics.pairwise import cosine_similarity

def calculate_similarity(doc_sentences, source_sentences, threshold=0.8):
    """Return the fraction of document sentences that closely match a source sentence.

    Every sentence is embedded with ``get_bert_embedding``. A document sentence
    counts as matched when its highest cosine similarity against any source
    sentence is strictly greater than ``threshold``.

    Args:
        doc_sentences: Sentence strings of the document under test.
        source_sentences: Sentence strings of the candidate source document.
        threshold: Cosine-similarity cut-off above which a sentence is treated
            as plagiarised. Defaults to 0.8, the value previously hard-coded.
            NOTE(review): the example run in this notebook scores 1.0 for a
            differently worded sentence, so this cut-off may be too permissive
            for mean-pooled BERT embeddings - consider tuning it.

    Returns:
        ``matched / len(doc_sentences)`` as a float in [0, 1]. Returns 0.0 when
        either ``doc_sentences`` or ``source_sentences`` is empty.
    """
    total_sentences = len(doc_sentences)
    # Guard: an empty document has nothing to match (and would divide by zero).
    if total_sentences == 0:
        return 0.0

    # Embed each source sentence once, not once per document sentence.
    source_embeddings = [get_bert_embedding(sentence) for sentence in source_sentences]
    if not source_embeddings:
        return 0.0

    plagiarism_score = 0
    for doc_sentence in doc_sentences:
        doc_embedding = get_bert_embedding(doc_sentence)
        # cosine_similarity returns a 1x1 matrix for a pair of single-row inputs.
        max_similarity = max(
            cosine_similarity(doc_embedding, source_embedding)[0][0]
            for source_embedding in source_embeddings
        )
        if max_similarity > threshold:
            plagiarism_score += 1

    return plagiarism_score / total_sentences

# Example comparison between a document and a source.
# The document is a single ad-hoc sentence; the source is the row with index
# label 1 of df (the second document, doc_id 2).
doc_sentences = sent_tokenize("Plagiarism detection using advanced techniques in NLP.")
source_sentences = sent_tokenize(df['text'][1])  # Source document

# Fraction of document sentences whose best match exceeds the similarity threshold.
similarity_score = calculate_similarity(doc_sentences, source_sentences)
similarity_score


1.0

In [24]:
# Score one document against every source in a DataFrame
def detect_plagiarism(doc_text, sources_df):
    """Compare ``doc_text`` with each source document in ``sources_df``.

    Args:
        doc_text: Raw text of the document to check; it is split into
            sentences with ``sent_tokenize``.
        sources_df: DataFrame with a 'doc_id' column and a 'sentences' column
            (the sentence list for each source document).

    Returns:
        A list of ``(doc_id, score)`` tuples in row order, where ``score`` is
        the value returned by ``calculate_similarity`` for that source.
    """
    sentences_to_check = sent_tokenize(doc_text)
    return [
        (source_row['doc_id'], calculate_similarity(sentences_to_check, source_row['sentences']))
        for _, source_row in sources_df.iterrows()
    ]

# Example detection: check a sentence identical to source document 1 against
# every row of df. The result is a list of (doc_id, score) tuples.
plagiarized_scores = detect_plagiarism("This is a sample document used for testing plagiarism detection.", df)
plagiarized_scores

[(1, 1.0), (2, 1.0), (3, 0.0)]

In [25]:
# Print a human-readable plagiarism summary
def report_plagiarism_results(plagiarized_scores):
    """Print one line per source document showing its match percentage.

    Args:
        plagiarized_scores: Iterable of ``(doc_id, score)`` pairs, where
            ``score`` is a fraction that is rendered as a percentage with two
            decimal places.
    """
    print("Plagiarism Results:")
    for entry in plagiarized_scores:
        doc_id, score = entry
        print(f"Source Document {doc_id}: {score*100:.2f}% of content matches.")

# Example report: print the (doc_id, score) pairs computed by detect_plagiarism
# in the previous cell as percentages.
report_plagiarism_results(plagiarized_scores)

Plagiarism Results:
Source Document 1: 100.00% of content matches.
Source Document 2: 100.00% of content matches.
Source Document 3: 0.00% of content matches.
