In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import os

# Near-duplicate filter: read the documents listed in a text file, flag those
# whose TF-IDF cosine similarity to a later-listed document exceeds a
# threshold, and write the unflagged documents to a CSV file.

# Text file containing one document path per line.
file_path = 'Assignment1/good_files.txt'

with open(file_path, 'r', encoding='utf-8') as file:
    # Skip blank lines so they are not reported as missing files below.
    file_paths = [line.strip() for line in file if line.strip()]

# remove_mask: 0 = keep, 1 = flagged as a near-duplicate.
data = {"file_path": [], "content": [], "remove_mask": []}

# Use a separate loop name so the list-file path above is not overwritten.
for doc_path in file_paths:
    if os.path.exists(doc_path):
        with open(doc_path, 'r', encoding='utf-8') as file:
            content = file.read()
        data["file_path"].append(doc_path)
        data["content"].append(content)
        data["remove_mask"].append(0)  # Initially set mask to 0
    else:
        print(f"File not found: {doc_path}")

df = pd.DataFrame(data)

# Create the TF-IDF vectorizer
vectorizer = TfidfVectorizer()

# Set the threshold for cosine similarity
threshold = 0.7

num_docs = len(df)

if num_docs == 0:
    # fit_transform raises ValueError on an empty corpus, so skip the
    # comparison and fall through to writing an empty result.
    tfidf_matrix = None
    print("No documents were loaded; nothing to compare.")
else:
    # Fit and transform the content into TF-IDF vectors
    tfidf_matrix = vectorizer.fit_transform(df["content"])

    # Compare each document with all later documents in one call per row
    # instead of one call per pair. The last document has no later rows,
    # hence the num_docs - 1 bound.
    for i in range(num_docs - 1):
        similarities = cosine_similarity(tfidf_matrix[i], tfidf_matrix[i + 1:])

        # Of a similar pair, the earlier-listed document is the one flagged.
        # Later documents are considered even if they are themselves flagged.
        if (similarities > threshold).any():
            df.at[i, "remove_mask"] = 1  # Mark as duplicate

print(df)

# Filter out documents that are marked as duplicates (remove_mask == 1)
filtered_df = df[df["remove_mask"] == 0]

# Save the filtered documents to a CSV file
filtered_df.to_csv("filtered_documents.csv", index=False)

print(filtered_df)
