<a href="https://colab.research.google.com/github/amithkm9/py/blob/master/TOC.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [11]:
import io
from google.colab import files

def rabin_karp(text, pattern, q=101):
    """
    Rabin-Karp string matching algorithm.

    A rolling hash (base 256, modulo q) of every len(pattern)-sized window
    of the text is compared with the hash of the pattern. The characters are
    compared only when the hashes agree, so hash collisions can never
    produce a false match.

    text: The text in which to search.
    pattern: The pattern to search for.
    q: Modulus used for hashing (a prime keeps collisions rare).

    Returns a list with the 0-based start index of every match (overlapping
    matches included), in increasing order. A pattern longer than the text
    yields an empty list. An empty pattern matches at every index from 0 to
    len(text) inclusive, mirroring Python's own empty-substring convention.
    """
    d = 256  # Number of characters in the input alphabet
    n = len(text)
    m = len(pattern)

    # No window of the text can hold a pattern that is longer than the text.
    # Without this guard, priming the first window would index past the end.
    if m > n:
        return []

    # The rolling hash is meaningless for a zero-length window; answer the
    # empty-pattern case directly instead of relying on hash coincidences.
    if m == 0:
        return list(range(n + 1))

    # h is "pow(d, m-1) % q": the weight of the window's leading character
    h = 1
    for _ in range(m - 1):
        h = (h * d) % q

    # Hash the pattern and the first window of the text together
    # (zip stops after m characters because m <= n here)
    p = 0  # Hash value for pattern
    t = 0  # Hash value for the current window of text
    for pattern_char, text_char in zip(pattern, text):
        p = (d * p + ord(pattern_char)) % q
        t = (d * t + ord(text_char)) % q

    result = []
    last_start = n - m

    # Slide the window over the text one character at a time
    for i in range(last_start + 1):
        # Equal hashes only suggest a match; confirm on the actual characters
        if p == t and text[i:i + m] == pattern:
            result.append(i)

        # Roll the hash: drop the leading character, append the trailing one.
        # Python's % already returns a value with the sign of q, so no extra
        # normalisation of a negative intermediate result is required.
        if i < last_start:
            t = (d * (t - ord(text[i]) * h) + ord(text[i + m])) % q

    return result

def detect_plagiarism(doc1, doc2, window_size=10, similarity_threshold=0.1):
    """
    Detect similar substrings between two documents using Rabin-Karp.

    Both documents are lower-cased and have every run of whitespace collapsed
    to a single space. Every window_size-character window of doc1 is then
    searched for in doc2, and the share of windows found is compared with
    similarity_threshold.

    Args:
    doc1 (str): First document text
    doc2 (str): Second document text
    window_size (int): Size of the substring to compare (must be >= 1)
    similarity_threshold (float): Minimum proportion of matching substrings to flag plagiarism

    Returns:
    dict: Plagiarism detection results with the keys
        'is_plagiarized' (bool), 'similarity_ratio' (matching / total windows,
        0 when doc1 has no complete window), 'total_windows' (int),
        'matching_windows' (int) and 'matches' (list of dicts holding
        'substring', 'doc1_position' and 'doc2_positions').

    Raises:
    ValueError: If window_size is smaller than 1.
    """
    # A zero or negative window produces empty or nonsensical slices, so
    # reject it up front instead of reporting a meaningless ratio.
    if window_size < 1:
        raise ValueError("window_size must be at least 1")

    # Normalize documents to lowercase and remove extra whitespace
    doc1 = ' '.join(doc1.lower().split())
    doc2 = ' '.join(doc2.lower().split())

    # Number of complete windows in doc1 (none if doc1 is shorter than a window)
    total_windows = max(len(doc1) - window_size + 1, 0)
    matches = []

    # A window can only occur in doc2 if doc2 is at least as long as the
    # window; skipping the search otherwise avoids handing the matcher a
    # pattern that is longer than its text.
    if len(doc2) >= window_size:
        for i in range(total_windows):
            substring = doc1[i:i + window_size]
            indices = rabin_karp(doc2, substring)

            if indices:
                matches.append({
                    'substring': substring,
                    'doc1_position': i,
                    'doc2_positions': indices
                })

    # Exactly one entry is recorded per matching window of doc1
    matching_windows = len(matches)

    # Calculate similarity ratio
    similarity_ratio = matching_windows / total_windows if total_windows > 0 else 0

    return {
        'is_plagiarized': similarity_ratio >= similarity_threshold,
        'similarity_ratio': similarity_ratio,
        'total_windows': total_windows,
        'matching_windows': matching_windows,
        'matches': matches
    }

def main():
    """
    Interactive entry point for the plagiarism checker.

    Prompts for two uploads through the Colab file widget, decodes each
    upload as UTF-8, runs detect_plagiarism on the pair and prints a summary
    report. When plagiarism is flagged, up to ten matching substrings are
    listed with their positions in both documents. Returns None in all cases.
    """
    print("Plagiarism Detection Tool")

    # Collect the two documents from the user
    print("\nPlease upload two text files to compare:")
    uploaded = files.upload()

    # A comparison needs exactly two documents; bail out otherwise
    if len(uploaded) != 2:
        print("Error: Please upload exactly 2 text files.")
        return

    # Decode every upload to text, keeping the order the uploads are yielded in
    file_contents = [uploaded[filename].decode('utf-8') for filename in uploaded]
    first_document, second_document = file_contents

    result = detect_plagiarism(first_document, second_document)

    # Summary section of the report
    print("\n--- Plagiarism Detection Report ---")
    print(f"Plagiarism Detected: {'Yes' if result['is_plagiarized'] else 'No'}")
    print(f"Similarity Ratio: {result['similarity_ratio']:.2%}")
    print(f"Total Windows: {result['total_windows']}")
    print(f"Matching Windows: {result['matching_windows']}")

    # The detailed listing is only shown for flagged document pairs
    if not result['is_plagiarized']:
        return

    print("\nDetailed Matches:")
    shown_matches = result['matches'][:10]  # Cap the listing at ten entries
    for match in shown_matches:
        print(f"Substring: '{match['substring']}'")
        print(f"  In Document 1 at position: {match['doc1_position']}")
        print(f"  In Document 2 at positions: {match['doc2_positions']}\n")

# Script entry point: start the interactive tool only when this module is
# executed directly (its __name__ is "__main__"), not when it is imported.
if __name__ == "__main__":
    main()

Plagiarism Detection Tool

Please upload two text files to compare:


Saving document1.txt to document1 (3).txt
Saving document2.txt to document2 (3).txt

--- Plagiarism Detection Report ---
Plagiarism Detected: Yes
Similarity Ratio: 64.11%
Total Windows: 1028
Matching Windows: 659

Detailed Matches:
Substring: ' technolog'
  In Document 1 at position: 13
  In Document 2 at positions: [39, 67, 811]

Substring: 'technology'
  In Document 1 at position: 14
  In Document 2 at positions: [40, 68]

Substring: 'echnology '
  In Document 1 at position: 15
  In Document 2 at positions: [41, 69]

Substring: ' education'
  In Document 1 at position: 34
  In Document 2 at positions: [21, 121, 297, 506, 573, 737, 844, 912]

Substring: 'education '
  In Document 1 at position: 35
  In Document 2 at positions: [22, 574]

Substring: 'cation in '
  In Document 1 at position: 38
  In Document 2 at positions: [650]

Substring: ' in recent'
  In Document 1 at position: 44
  In Document 2 at positions: [50]

Substring: 'in recent '
  In Document 1 at position: 45
  In Docum