<a href="https://colab.research.google.com/github/anushka827/PROJECT-5/blob/main/plagiarism-checker.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Import necessary modules!
import os  # Module for interacting with the operating system
from sklearn.feature_extraction.text import TfidfVectorizer  # Module for text vectorization using TF-IDF
from sklearn.metrics.pairwise import cosine_similarity  # Module for calculating cosine similarity


In [2]:
# Collect every .txt file in the current directory. os.listdir() returns
# entries in arbitrary order, so sort them to keep the file/vector pairing
# reproducible between runs.
student_files = sorted(doc for doc in os.listdir() if doc.endswith('.txt'))


# Read the contents of each student's text file, closing every handle
# as soon as its text has been read.
student_notes = []
for _file in student_files:
    with open(_file, encoding='utf-8') as _handle:
        student_notes.append(_handle.read())


def vectorize(Text):
    """Convert a collection of raw documents into a dense TF-IDF array.

    Args:
        Text: iterable of document strings.

    Returns:
        The array produced by fitting a fresh ``TfidfVectorizer`` on ``Text``
        and densifying the result with ``.toarray()``.
    """
    tfidf = TfidfVectorizer()
    weighted_terms = tfidf.fit_transform(Text)
    return weighted_terms.toarray()


def similarity(doc1, doc2):
    """Return the pairwise cosine-similarity matrix for two document vectors.

    Both vectors are passed together to ``cosine_similarity``; callers index
    the result with ``[0][1]`` to get the doc1-vs-doc2 score.
    """
    vector_pair = [doc1, doc2]
    return cosine_similarity(vector_pair)



In [3]:
# TF-IDF representation of every loaded note.
vectors = vectorize(student_notes)
# Keep each file name next to its vector so results can be labelled.
s_vectors = [(file_name, vector) for file_name, vector in zip(student_files, vectors)]
# Filled in by check_plagiarism() with (file_a, file_b, score) tuples.
plagiarism_results = set()

In [4]:
def check_plagiarism():
    """Score every unordered pair of student files by cosine similarity.

    Reads the module-level ``s_vectors`` list of ``(file_name, vector)``
    tuples and adds one ``(name_1, name_2, score)`` tuple per pair to the
    module-level ``plagiarism_results`` set, with the two file names in
    sorted order.

    Returns:
        set: the shared ``plagiarism_results`` set (updated in place).
    """
    # Pair each entry only with the entries that follow it, so every
    # combination is scored exactly once. This avoids computing a->b and
    # b->a separately and relying on exact float equality to deduplicate.
    for position, (student_a, text_vector_a) in enumerate(s_vectors):
        for student_b, text_vector_b in s_vectors[position + 1:]:
            # Calculate cosine similarity between two text vectors
            sim_score = similarity(text_vector_a, text_vector_b)[0][1]
            # Sort the student file names so a pair always has one spelling
            student_pair = sorted((student_a, student_b))
            plagiarism_results.add((student_pair[0], student_pair[1], sim_score))
    return plagiarism_results


In [5]:
# Run the comparison once, then report each scored pair.
scored_pairs = check_plagiarism()
for data in scored_pairs:
    print("Similarity data:\n", data)


Similarity data:
 ('Ben.txt', 'Clark.txt', np.float64(0.4454634995096725))


In [7]:
# Pick the pair with the largest similarity score (third tuple element).
# default=None keeps max() from raising ValueError when no pair was scored,
# e.g. when fewer than two .txt files were found.
highest = max(plagiarism_results, key=lambda x: x[2], default=None)
if highest is None:
    print("\nNo file pairs were compared; at least two .txt files are required.")
else:
    print("\nMost similar pair:", highest)



Most similar pair: ('Ben.txt', 'Clark.txt', np.float64(0.4454634995096725))


In [10]:
pip install streamlit scikit-learn


Collecting streamlit
  Downloading streamlit-1.51.0-py3-none-any.whl.metadata (9.5 kB)
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading streamlit-1.51.0-py3-none-any.whl (10.2 MB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m10.2/10.2 MB[0m [31m55.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m6.9/6.9 MB[0m [31m91.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pydeck, streamlit
Successfully installed pydeck-0.9.1 streamlit-1.51.0


In [15]:
%%writefile app.py
import streamlit as st
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Page configuration, heading and short instructions.
st.set_page_config(page_title="Plagiarism Checker", layout="centered")
# The title emoji was stored as mis-decoded bytes; restore the intended character.
st.title("📑 Plagiarism Checker Web App")
st.write("Upload two or more text files to check similarity.")

# Uploader limited to .txt files; several files may be selected at once.
uploaded_files = st.file_uploader("Upload text files", type=["txt"], accept_multiple_files=True)

def vectorize(text):
    """Fit a fresh TfidfVectorizer on ``text`` and return the dense array."""
    tfidf_model = TfidfVectorizer()
    weighted = tfidf_model.fit_transform(text)
    return weighted.toarray()

def get_similarity(doc1, doc2):
    """Return the cosine similarity between two document vectors."""
    pairwise = cosine_similarity([doc1, doc2])
    # Row 0, column 1 of the pairwise matrix is the doc1-vs-doc2 score.
    return pairwise[0][1]

# Main flow: compare the uploads once at least two files are present,
# otherwise prompt the user for more.
if uploaded_files and len(uploaded_files) >= 2:
    st.success("Files uploaded successfully!")

    file_names = [file.name for file in uploaded_files]
    # Uploads are untrusted input. getvalue() returns the full content
    # regardless of the stream position, and errors="replace" keeps a file
    # that is not valid UTF-8 from crashing the app.
    file_contents = [
        file.getvalue().decode("utf-8", errors="replace") for file in uploaded_files
    ]

    try:
        vectors = vectorize(file_contents)
    except ValueError:
        # TfidfVectorizer raises ValueError when it cannot build a vocabulary,
        # e.g. every upload is empty or holds no usable tokens.
        st.error("The uploaded files contain no comparable text. Please upload non-empty .txt files.")
    else:
        st.subheader("🔍 Plagiarism Results")
        results = []

        # Visit each unordered pair of files exactly once.
        for i in range(len(uploaded_files)):
            for j in range(i + 1, len(uploaded_files)):
                sim_score = get_similarity(vectors[i], vectors[j])
                percentage = round(sim_score * 100, 2)
                results.append((file_names[i], file_names[j], percentage))
                st.write(f"📄 **{file_names[i]}** vs **{file_names[j]}** → **{percentage}% similar**")

        if results:
            highest = max(results, key=lambda x: x[2])
            st.subheader("🔥 Highest Similarity Detected")
            st.write(f"**{highest[0]}** and **{highest[1]}** have the highest similarity: **{highest[2]}%**")
else:
    st.info("Please upload at least two .txt files to check plagiarism.")


Writing app.py


In [None]:
# Launch the Streamlit app written to app.py on port 8501.
# NOTE(review): the flags below turn off Streamlit's CORS and XSRF protections;
# confirm this is acceptable wherever the server is reachable from outside.
# NOTE(review): this cell appears to keep running until the server is stopped — confirm.
!streamlit run app.py --server.enableCORS false --server.enableXsrfProtection false --server.port 8501



Collecting usage statistics. To deactivate, set browser.gatherUsageStats to false.
[0m
[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8501[0m
[34m  Network URL: [0m[1mhttp://172.28.0.12:8501[0m
[34m  External URL: [0m[1mhttp://34.106.210.155:8501[0m
[0m
