In [2]:
from sentence_transformers import SentenceTransformer
import os

SBERT_DIR = "c:/Users/nihar/Desktop/8th Sem/AI Tools/sbert_model"
os.makedirs(SBERT_DIR, exist_ok=True)

model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
model.save(SBERT_DIR)

print("SBERT model downloaded and saved locally.")


SBERT model downloaded and saved locally.


In [7]:
import nltk
import os

# Set custom path
nltk_data_dir = r"c:/Users/nihar/Desktop/8th Sem/AI Tools/nltk_data"  # Change this to your desired path
os.makedirs(nltk_data_dir, exist_ok=True)

# Set the path for NLTK to look for data
nltk.data.path.append(nltk_data_dir)

# Download required resources to the custom directory
nltk.download('punkt', download_dir=nltk_data_dir)
nltk.download('stopwords', download_dir=nltk_data_dir)

print("NLTK data downloaded to:", nltk_data_dir)


[nltk_data] Downloading package punkt to c:/Users/nihar/Desktop/8th
[nltk_data]     Sem/AI Tools/nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     c:/Users/nihar/Desktop/8th Sem/AI Tools/nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


NLTK data downloaded to: c:/Users/nihar/Desktop/8th Sem/AI Tools/nltk_data


In [3]:
import requests
import re
import bs4
import fitz  # PyMuPDF
import docx
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.util import ngrams
from googlesearch import search
from sentence_transformers import util, SentenceTransformer
import torch
import os

# Use locally stored NLTK models
nltk.download('punkt')
nltk.download('stopwords')

# Use locally stored SBERT model
SBERT_DIR = os.path.join(os.getcwd(), "sbert_model")
model = SentenceTransformer(SBERT_DIR)

# Use CUDA if available
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)
print(device)
print(f"Model loaded on {device}")

def extract_text_from_file(file_path):
    """Extract text from a PDF or DOC file."""
    text = ""
    if file_path.endswith(".pdf"):
        with fitz.open(file_path) as doc:
            for page in doc:
                text += page.get_text() + "\n"
    elif file_path.endswith(".docx"):
        doc = docx.Document(file_path)
        text = "\n".join([para.text for para in doc.paragraphs])
    return text.strip()

def preprocess_text(text):
    """Basic text preprocessing (lowercasing, removing special characters, extra spaces)."""
    text = text.lower()
    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)  # Remove special characters
    return text.strip()

def extract_keywords(text, num_keywords=10):
    """Extract important keywords using NLTK."""
    words = word_tokenize(text)
    words = [word.lower() for word in words if word.isalnum()]  # Remove non-alphanumeric words
    words = [word for word in words if word not in stopwords.words('english')]  # Remove stopwords
    freq_dist = nltk.FreqDist(words)
    keywords = [word for word, _ in freq_dist.most_common(num_keywords)]
    print(keywords)
    return " ".join(keywords)  # Return keywords for search

def google_search(query, num_results=10):
    """Perform Google search and return the top website URLs."""
    return list(search(query, num_results=num_results))

def scrape_website(url):
    """Scrape and return cleaned text from a webpage."""
    try:
        headers = {'User-Agent': 'Mozilla/5.0'}
        response = requests.get(url, headers=headers, timeout=5)
        soup = bs4.BeautifulSoup(response.text, "html.parser")
        paragraphs = soup.find_all("p")
        text = " ".join([p.get_text() for p in paragraphs])
        return preprocess_text(text)
    except Exception as e:
        return ""

def jaccard_similarity(text1, text2):
    """Calculate Jaccard similarity using 3-grams."""
    ngram1 = set(ngrams(text1.split(), 3))
    ngram2 = set(ngrams(text2.split(), 3))
    intersection = len(ngram1.intersection(ngram2))
    union = len(ngram1.union(ngram2))
    return round((intersection / union) * 100, 2) if union != 0 else 0

def calculate_similarity(input_text, website_text):
    """Compute semantic similarity using SBERT with CUDA if available."""
    if not website_text:
        return 0
    input_embedding = model.encode(input_text, convert_to_tensor=True).to(device)
    website_embedding = model.encode(website_text, convert_to_tensor=True).to(device)
    sbert_similarity = util.pytorch_cos_sim(input_embedding, website_embedding).item()
    jaccard_score = jaccard_similarity(input_text, website_text)
    return round(((sbert_similarity * 100) + jaccard_score) / 2, 2)

def check_plagiarism(input_text):
    """Main function to check plagiarism."""
    input_text = preprocess_text(input_text)
    search_query = extract_keywords(input_text)
    urls = google_search(search_query)
    print(list(urls))
    similarities = {}
    for url in urls:
        web_text = scrape_website(url)
        similarity = calculate_similarity(input_text, web_text)
        similarities[url] = similarity
    sorted_results = sorted(similarities.items(), key=lambda x: x[1], reverse=True)[:3]
    return sorted_results

# Example usage
if __name__ == "__main__":
    user_text = """The research paper remains the most common writing assignment on college campuses, more popular than ever (Lunsford, 2008). In the first year, just as students are encountering the conventions of academic writing, we also hope to ground them in the mechanics of using a print/digital hybrid library, probably much larger than any they’ve ever used before, to make informed choices among possible sources, and extract meaning from them in order to support a well-organized synthesis or argument. We know they will be asked to do this in the future, so we try to get them acculturated quickly."""
    results = check_plagiarism(user_text)
    for url, similarity in results:
        print(f"Similarity: {similarity}% - {url}")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\nihar\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nihar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


cuda
Model loaded on cuda
['writing', 'ever', 'research', 'paper', 'remains', 'common', 'assignment', 'college', 'campuses', 'popular']
['/search?num=12', 'https://www.sciencedirect.com/topics/computer-science/research-paper', 'https://www.quora.com/What-is-your-opinion-about-writing-assignments-in-colleges-Does-it-really-improve-students-learning-If-yes-how', 'https://engoo.com/app/daily-news/article/long-writing-assignments-less-common-at-us-colleges/eFhBlL_hEeiGy49GgymAgA', 'https://radicalscholarship.com/2023/04/01/revisiting-the-research-paper-problem-for-college-students-as-writers/', 'https://learningenglish.voanews.com/a/long-writing-assignments-not-so-common-at-us-colleges/4580476.html', 'https://www.researchgate.net/publication/222118091_Writing_in_business_courses_An_analysis_of_assignment_types_their_characteristics_and_required_skills', 'https://openoregon.pressbooks.pub/oregonwrites/chapter/what-does-the-professor-want-understanding-the-assignment/', 'https://opentextbc.c

In [None]:
import requests
import bs4
import spacy
import re
from googlesearch import search  # Install using: pip install google

# Load NLP model
nlp = spacy.load("en_core_web_sm")

def extract_key_sentences(text, num_sentences=5):
    """Extracts the most meaningful sentences from text"""
    doc = nlp(text)
    sentences = [sent.text.strip() for sent in doc.sents]
    
    # Choose middle sentences as they often contain key details
    return " ".join(sentences[:num_sentences])

def google_search(query, num_results=10):
    """Searches Google for the query and returns URLs"""
    return list(search(query, num_results=num_results))

def find_source_website(text):
    """Finds the most likely source of the given text"""
    key_sentences = extract_key_sentences(text)
    
    # Use quotes to search for exact text match
    search_query = f'"{key_sentences}"'
    print(f"Searching for: {search_query}")

    search_results = google_search(search_query, num_results=5)

    if search_results:
        return search_results  # List of URLs
    else:
        return "No search results found."

# Example Usage
text_snippet = """Explanation of Parameters
1. decorator_name(func):

decorator_name: This is the name of the decorator function.
func: This parameter represents the function being decorated. When you use a decorator, the decorated function is passed to this parameter.
2. wrapper(*args, **kwargs):
wrapper: This is a nested function inside the decorator. It wraps the original function, adding additional functionality.
*args: This collects any positional arguments passed to the decorated function into a tuple.
**kwargs: This collects any keyword arguments passed to the decorated function into a dictionary.
The wrapper function allows the decorator to handle functions with any number and types of arguments.
3. @decorator_name:
This syntax applies the decorator to the function_to_decorate function. It is equivalent to writing function_to_decorate = decorator_name(function_to_decorate).
Higher-Order Functions
In Python, higher-order functions are functions that take one or more functions as arguments, return a function as a result or do both. Essentially, a higher-order function is a function that operates on other functions. This is a powerful concept in functional programming and is a key component in understanding how decorators work.
Key Properties of Higher-Order Functions:
Taking functions as arguments: A higher-order function can accept other functions as parameters.
Returning functions: A higher-order function can return a new function that can be called later."""
# print(find_source_website(text_snippet))
ans = find_source_website(text_snippet)
for i in ans:
    print(i)

Searching for: "Explanation of Parameters
1. decorator_name(func):

decorator_name: This is the name of the decorator function. func: This parameter represents the function being decorated. When you use a decorator, the decorated function is passed to this parameter. 2. wrapper(*args, **kwargs):

wrapper: This is a nested function inside the decorator."
/search?num=7
https://www.geeksforgeeks.org/decorators-in-python/
https://realpython.com/primer-on-python-decorators/
https://www.wscubetech.com/resources/python/decorators
https://www.geeksforgeeks.org/decorators-with-parameters-in-python/


In [30]:
import requests
import bs4
import spacy
from googlesearch import search
from summa import summarizer
from sentence_transformers import SentenceTransformer, util
from nltk.util import ngrams

def preprocess_text(text):
    """Cleans text by removing special characters and extra spaces."""
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
    text = re.sub(r'[^\w\s.!?]', '', text)  # Keep '.', '!', '?'
    print(text)
    return text
    # doc = nlp(text)

    # Lemmatization and stopword removal
    # clean_text = " ".join([token.lemma_ for token in doc if not token.is_stop])
    # return text

# Load NLP model
nlp = spacy.load("en_core_web_sm")
local_model_path = r"C:\Users\nihar\Desktop\8th Sem\AI Tools\all-MiniLM-L6-v2"

# Load the model from the local directory
model = SentenceTransformer(local_model_path)

def extract_key_sentences_textrank(text, num_sentences=3):
    """Extracts important sentences using TextRank."""
    summary = summarizer.summarize(text, ratio=0.3)  # Extract 30% of key sentences
    print(summary)
    print(f"TextRank Summary:\n{summary}")  # Debugging

    if not summary.strip():
        print("⚠️ TextRank failed to generate a summary.")
        return text  # Return original text as a fallback

    doc = nlp(summary)
    sentences = [sent.text.strip() for sent in doc.sents]
    return " ".join(sentences[:num_sentences])


def extract_key_sentences_embeddings(text, num_sentences=3):
    """Finds unique & important sentences using embeddings."""
    doc = nlp(text)
    sentences = [sent.text.strip() for sent in doc.sents]

    print(f"Extracted {len(sentences)} sentences from input text.")  # Debugging

    if not sentences:
        print("⚠️ No sentences extracted from text.")
        return text  # Fallback

    sentence_embeddings = model.encode(sentences, convert_to_tensor=True)
    sentence_scores = sentence_embeddings.mean(dim=1).tolist()

    sorted_sentences = [sent for _, sent in sorted(zip(sentence_scores, sentences), reverse=True)]
    print(sorted_sentences)
    return " ".join(sorted_sentences[:num_sentences])


def extract_key_sentences(text):
    """Choose method based on text length."""
    print(f"Extracting key sentences from text ({len(text.split())} words)...")  # Debugging

    if len(text.split()) < 100:
        return extract_key_sentences_textrank(text, num_sentences=3)
    
    return extract_key_sentences_embeddings(text, num_sentences=5)


def google_search(query, num_results=10):
    """Fetches top search results from Google."""
    return list(search(query, num_results=num_results))

def scrape_website(url):
    """Scrape and return cleaned text from a webpage."""
    try:
        headers = {'User-Agent': 'Mozilla/5.0'}
        response = requests.get(url, headers=headers, timeout=5)
        soup = bs4.BeautifulSoup(response.text, "html.parser")
        paragraphs = soup.find_all("p")
        text = " ".join([p.get_text() for p in paragraphs])
        return preprocess_text(text)
    except Exception:
        return ""

def jaccard_similarity(text1, text2):
    """Calculate Jaccard similarity using 3-grams."""
    ngram1 = set(ngrams(text1.split(), 3))
    ngram2 = set(ngrams(text2.split(), 3))
    intersection = len(ngram1.intersection(ngram2))
    union = len(ngram1.union(ngram2))
    return round((intersection / union) * 100, 2) if union != 0 else 0

def calculate_similarity(input_text, website_text):
    """Compute semantic similarity using SBERT and Jaccard similarity."""
    if not website_text:
        return 0
    input_embedding = model.encode(input_text, convert_to_tensor=True)
    website_embedding = model.encode(website_text, convert_to_tensor=True)
    sbert_similarity = util.pytorch_cos_sim(input_embedding, website_embedding).item()
    jaccard_score = jaccard_similarity(input_text, website_text)
    return round(((sbert_similarity * 100) + jaccard_score) / 2, 2)

def check_plagiarism(input_text):
    """Main function to check plagiarism."""
    input_text = preprocess_text(input_text)
    key_sentences = extract_key_sentences(input_text)
    print(key_sentences)
    print(type(key_sentences))
    urls = google_search(key_sentences)
    similarities = {}
    print(urls)
    for url in urls:
        web_text = scrape_website(url)
        similarity = calculate_similarity(input_text, web_text)
        similarities[url] = similarity
    sorted_results = sorted(similarities.items(), key=lambda x: x[1], reverse=True)[:3]
    return sorted_results

# Example usage
if __name__ == "__main__":
    user_text = """1. decorator_name(func):

decorator_name: This is the name of the decorator function.
func: This parameter represents the function being decorated. When you use a decorator, the decorated function is passed to this parameter.
2. wrapper(*args, **kwargs):

wrapper: This is a nested function inside the decorator. It wraps the original function, adding additional functionality.
*args: This collects any positional arguments passed to the decorated function into a tuple.
**kwargs: This collects any keyword arguments passed to the decorated function into a dictionary.
The wrapper function allows the decorator to handle functions with any number and types of arguments.
3. @decorator_name:

This syntax applies the decorator to the function_to_decorate function. It is equivalent to writing function_to_decorate = decorator_name(function_to_decorate).
Higher-Order Functions
In Python, higher-order functions are functions that take one or more functions as arguments, return a function as a result or do both. Essentially, a higher-order function is a function that operates on other functions. This is a powerful concept in functional programming and is a key component in understanding how decorators work.

Key Properties of Higher-Order Functions:
Taking functions as arguments: A higher-order function can accept other functions as parameters.
Returning functions: A higher-order function can return a new function that can be called later.kly."""
    results = check_plagiarism(user_text)
    for url, similarity in results:
        print(f"Similarity: {similarity}% - {url}")


1. decorator_namefunc decorator_name this is the name of the decorator function. func this parameter represents the function being decorated. when you use a decorator the decorated function is passed to this parameter. 2. wrapperargs kwargs wrapper this is a nested function inside the decorator. it wraps the original function adding additional functionality. args this collects any positional arguments passed to the decorated function into a tuple. kwargs this collects any keyword arguments passed to the decorated function into a dictionary. the wrapper function allows the decorator to handle functions with any number and types of arguments. 3. decorator_name this syntax applies the decorator to the function_to_decorate function. it is equivalent to writing function_to_decorate  decorator_namefunction_to_decorate. higherorder functions in python higherorder functions are functions that take one or more functions as arguments return a function as a result or do both. essentially a higher

In [18]:
from sentence_transformers import SentenceTransformer
# Define the local directory where you want to save the model
local_model_path = r"C:\Users\nihar\Desktop\8th Sem\AI Tools\all-MiniLM-L6-v2"

# Download and save the model
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
model.save(local_model_path)

print(f"Model downloaded and saved to {local_model_path}")


Model downloaded and saved to C:\Users\nihar\Desktop\8th Sem\AI Tools\all-MiniLM-L6-v2


In [None]:
import requests
import bs4
import spacy
from googlesearch import search
from summa import summarizer
from sentence_transformers import SentenceTransformer, util
from nltk.util import ngrams

def preprocess_text(text):
    """Cleans text by removing special characters and extra spaces."""
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
    text = re.sub(r'[^\w\s.!?]', '', text)  # Keep '.', '!', '?'
    # print(text)
    return text

# Load NLP model
nlp = spacy.load("en_core_web_sm")
local_model_path = r"C:\Users\nihar\Desktop\8th Sem\AI Tools\all-MiniLM-L6-v2"

# Load the model from the local directory
model = SentenceTransformer(local_model_path)

def extract_key_sentences_textrank(text, num_sentences=3):
    """Extracts important sentences using TextRank."""
    summary = summarizer.summarize(text, ratio=0.3)  # Extract 30% of key sentences
    # print(summary)
    print(f"TextRank Summary:\n{summary}")  # Debugging

    if not summary.strip():
        print("⚠️ TextRank failed to generate a summary.")
        return text  # Return original text as a fallback

    doc = nlp(summary)
    sentences = [sent.text.strip() for sent in doc.sents]
    return " ".join(sentences[:num_sentences])


def extract_key_sentences_embeddings(text, num_sentences=3):
    """Finds unique & important sentences using embeddings."""
    doc = nlp(text)
    sentences = [sent.text.strip() for sent in doc.sents]

    # print(f"Extracted {len(sentences)} sentences from input text.")  # Debugging

    if not sentences:
        print("⚠️ No sentences extracted from text.")
        return text  # Fallback

    sentence_embeddings = model.encode(sentences, convert_to_tensor=True)
    sentence_scores = sentence_embeddings.mean(dim=1).tolist()

    sorted_sentences = [sent for _, sent in sorted(zip(sentence_scores, sentences), reverse=True)]
    return " ".join(sorted_sentences[:num_sentences])


def extract_key_sentences(text):
    """Choose method based on text length."""
    print(f"Extracting key sentences from text ({len(text.split())} words)...")  # Debugging

    if len(text.split()) < 100:
        return extract_key_sentences_textrank(text, num_sentences=3)
    
    return extract_key_sentences_embeddings(text, num_sentences=5)


def google_search(query, num_results=10):
    """Fetches top search results from Google."""
    return list(search(query, num_results=num_results))

def preprocess_web_text(text):
    """Cleans web-scraped text by removing unwanted parts but preserving structure."""
    
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'\s+', ' ', text)  # Normalize spaces
    text = re.sub(r'\n+', ' ', text)  # Remove excessive new lines
    text = re.sub(r'\[.*?\]', '', text)  # Remove text inside brackets (e.g., citations)
    text = re.sub(r'\(.*?\)', '', text)  # Remove content inside parentheses (if needed)
    
    # Keep meaningful punctuation for readability
    text = re.sub(r'[“”‘’]', '', text)  # Remove special quotation marks
    text = re.sub(r'[^a-z0-9.,!?;:\'\"()\s]', '', text)  # Keep basic punctuation

    return text


def scrape_website(url):
    """Scrape and return cleaned text from a webpage."""
    try:
        headers = {'User-Agent': 'Mozilla/5.0'}
        response = requests.get(url, headers=headers, timeout=5)
        soup = bs4.BeautifulSoup(response.text, "html.parser")
        paragraphs = soup.find_all("p")
        text = " ".join([p.get_text() for p in paragraphs])
        return preprocess_web_text(text)
    except Exception:
        return ""

def calculate_similarity(input_text, website_text):
    if not website_text:
        return 0
    input_embedding = model.encode(input_text, convert_to_tensor=True)
    website_embedding = model.encode(website_text, convert_to_tensor=True)
    sbert_similarity = util.pytorch_cos_sim(input_embedding, website_embedding).item()

    print(f"SBERT similarity: {round(sbert_similarity*100,2)}")
    return round(sbert_similarity * 100, 2)

def check_plagiarism(input_text):
    """Main function to check plagiarism."""
    input_text = preprocess_text(input_text)
    key_sentences = extract_key_sentences(input_text)
    print(key_sentences)
    print(len(key_sentences))
    urls = google_search(key_sentences)
    similarities = {}
    print(urls)
    for url in urls:
        web_text = scrape_website(url)
        similarity = calculate_similarity(input_text, web_text)
        similarities[url] = similarity
    sorted_results = sorted(similarities.items(), key=lambda x: x[1], reverse=True)[:3]
    return sorted_results

# Example usage
if __name__ == "__main__":
    user_text = """1. decorator_name(func):

decorator_name: This is the name of the decorator function.
func: This parameter represents the function being decorated. When you use a decorator, the decorated function is passed to this parameter.
2. wrapper(*args, **kwargs):

wrapper: This is a nested function inside the decorator. It wraps the original function, adding additional functionality.
*args: This collects any positional arguments passed to the decorated function into a tuple.
**kwargs: This collects any keyword arguments passed to the decorated function into a dictionary.
The wrapper function allows the decorator to handle functions with any number and types of arguments.
3. @decorator_name:

This syntax applies the decorator to the function_to_decorate function. It is equivalent to writing function_to_decorate = decorator_name(function_to_decorate).
Higher-Order Functions
In Python, higher-order functions are functions that take one or more functions as arguments, return a function as a result or do both. Essentially, a higher-order function is a function that operates on other functions. This is a powerful concept in functional programming and is a key component in understanding how decorators work.

Key Properties of Higher-Order Functions:
Taking functions as arguments: A higher-order function can accept other functions as parameters.
Returning functions: A higher-order function can return a new function that can be called later.kly."""
    results = check_plagiarism(user_text)
    for url, similarity in results:
        print(f"Similarity: {similarity}% - {url}")


Extracting key sentences from text (202 words)...
it wraps the original function adding additional functionality. this is a nested function inside the decorator. 1. decorator_namefunc decorator_name this is the name of the decorator function. this is a powerful concept in functional programming and is a key component in understanding how decorators work. the wrapper function allows the decorator to handle functions with any number and types of arguments.
408
['/search?num=12', 'https://www.geeksforgeeks.org/decorators-in-python/', 'https://realpython.com/primer-on-python-decorators/', 'https://www.datacamp.com/tutorial/decorators-python', 'https://www.geeksforgeeks.org/nested-decorators-in-python/', 'https://www.freecodecamp.org/news/the-python-decorator-handbook/', 'https://stackoverflow.com/questions/14368102/referring-to-the-name-of-the-decorated-function-inside-decorator', 'https://www.simplilearn.com/tutorials/python-tutorial/decorators-in-python', 'https://www.codechef.com/learn/