In [29]:
# Cell 1: Importing Libraries
import os
import glob
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [30]:
# Cell 2: Function to Read Documents
def read_documents(folder_path):
    """Read every ``*.txt`` file located directly inside ``folder_path``.

    Args:
        folder_path: Directory to scan. Only files whose names match
            ``*.txt`` in this directory itself are read (no recursion).

    Returns:
        dict: Maps each matched file path to the file's full contents,
        decoded as UTF-8. Entries are inserted in sorted path order.
    """
    # Escape glob metacharacters in the directory so that a folder such as
    # "docs[1]" is matched literally instead of being read as a pattern.
    pattern = os.path.join(glob.escape(folder_path), "*.txt")

    documents = {}
    # glob returns matches in arbitrary order; sorting keeps the document
    # order (and anything built from it) stable between runs.
    for file_path in sorted(glob.glob(pattern)):
        with open(file_path, 'r', encoding='utf-8') as file:
            documents[file_path] = file.read()
    return documents

In [31]:
# Cell 3: Function to Calculate Document Similarity
def calculate_similarity(documents):
    """Compute pairwise cosine similarity between documents via TF-IDF.

    Args:
        documents: Mapping whose keys become the 'File' column and whose
            values become the 'Text' column.

    Returns:
        tuple: ``(frame, similarity)`` where ``frame`` is the DataFrame with
        columns 'File' and 'Text', and ``similarity`` is the result of
        ``cosine_similarity`` applied to the TF-IDF matrix of the 'Text'
        column.
    """
    # One row per document: its identifier and its raw text.
    doc_frame = pd.DataFrame(documents.items(), columns=['File', 'Text'])

    # Fit a default TF-IDF vectorizer on the texts and transform them in one step.
    weighted_terms = TfidfVectorizer().fit_transform(doc_frame['Text'])

    # Compare every document vector against every other one.
    similarity = cosine_similarity(weighted_terms)

    return doc_frame, similarity


In [32]:
# Cell 4: Function to Display Similarity Results
def display_similarity(df, cosine_sim):
    """Wrap a similarity matrix in a DataFrame labelled by file.

    Args:
        df: DataFrame with a 'File' column supplying the labels.
        cosine_sim: 2-D similarity values, one row and one column per
            entry of ``df['File']``.

    Returns:
        pandas.DataFrame: ``cosine_sim`` with ``df['File']`` used as both the
        row index and the column labels.
    """
    # The same labels are used on both axes.
    file_labels = df['File']
    return pd.DataFrame(data=cosine_sim, index=file_labels, columns=file_labels)


In [34]:
# Cell 5: Main Execution
folder_path = r'C:\Users\ahmed\Documents\Python Scripts\Document similarity\Documents sample'  # Directory holding the .txt documents to compare

documents = read_documents(folder_path)
if len(documents) >= 2:
    # A pairwise comparison needs at least two documents.
    df, cosine_sim = calculate_similarity(documents)
    similarity_results = display_similarity(df, cosine_sim)
    print("Document Similarity Matrix:")
    print(similarity_results)
else:
    # Too few documents were read: tell the user instead of computing.
    print("Please provide at least 2 documents for similarity calculation.")

Document Similarity Matrix:
File                                                C:\Users\ahmed\Documents\Python Scripts\Document similarity\Documents sample\Text 1.txt  \
File                                                                                                                                          
C:\Users\ahmed\Documents\Python Scripts\Documen...                                           1.000000                                         
C:\Users\ahmed\Documents\Python Scripts\Documen...                                           0.619989                                         
C:\Users\ahmed\Documents\Python Scripts\Documen...                                           0.620000                                         

File                                                C:\Users\ahmed\Documents\Python Scripts\Document similarity\Documents sample\Text 2.txt  \
File                                                                                                             