In [54]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt 
import nltk 
import string

from nltk.corpus import stopwords

In [55]:
 # English stopword list consulted by the word-count step to discard common words.
 # NOTE: this assignment rebinds the name `stopwords` that the import cell brings in
 # from nltk.corpus, so once this cell has run `stopwords` refers to this plain list.
 # NOTE(review): "hasnt" is written without an apostrophe, unlike the other
 # contractions in the list — confirm whether that is intentional.
 stopwords =   ["i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "you're", "you've", "you'll", "you'd",
                "your", "yours", "yourself", "yourselves", "he", "him", "his", "himself", "she", "she's", "her",
                "hers", "herself", "it", "it's", "its", "itself", "they", "them", "their", "theirs", "themselves",
                "what", "which", "who", "whom", "this", "that", "that'll", "these", "those", "am", "is", "are", "was",
                "were", "be", "been", "being", "have", "has", "had", "having", "do", "does", "did", "doing",
                "a", "an", "the", "and", "but", "if", "or", "because", "as", "until", "while", "of", "at", 
                "by", "for", "with", "about", "against", "between", "into", "through", "during", "before", 
                "after", "above", "below", "to", "from", "up", "down", "in", "out", "on", "off", "over", "under",
                "again", "further", "then", "once", "here", "there", "when", "where", "why", "how", "all", "any", 
                "both", "each", "few", "more", "most", "other", "some", "such", "no", "nor", "not", "only", "own",
                "same", "so", "than", "too", "very", "s", "t", "can", "will", "just", "don", "don't", "should",
                "should've", "now", "d", "ll", "m", "o", "re", "ve", "y", "ain", "aren", "aren't", "couldn", 
                "couldn't", "didn", "didn't", "doesn", "doesn't", "hadn", "hadn't", "hasn", "hasnt", "haven", 
                "haven't", "isn", "isn't", "ma", "mightn", "mightn't", "mustn", "mustn't", "needn", "needn't", 
                "shan", "shan't", "shouldn",
 "shouldn't", "wasn", "wasn't", "weren", "weren't", "won", "won't", "wouldn", "wouldn't"]

In [56]:
filepath = 'sample.txt'

# Read the whole corpus into memory in one go. The encoding is stated
# explicitly so the text decodes the same way on every platform instead of
# depending on the locale's default encoding.
# NOTE(review): assumes sample.txt is UTF-8 (plain ASCII qualifies) — confirm.
with open(filepath, "r", encoding="utf-8") as fp:
    contents = fp.read()

In [60]:
def preprocess_and_count_words(text, stop_words=None):
    '''Count how often each non-stopword occurs in ``text``.

    The text has every character in ``string.punctuation`` removed, is
    lowercased, and is then split on whitespace. Tokens that appear in the
    stopword collection are discarded and the remaining tokens are counted.

    Parameters
    ----------
    text : str
        Raw text of one document.
    stop_words : iterable of str, optional
        Words to discard. When omitted, the notebook-level ``stopwords``
        list is used.

    Returns
    -------
    dict
        Maps each remaining word to its number of occurrences, in order of
        first appearance.
    '''
    translator = str.maketrans('', '', string.punctuation)

    if stop_words is None:
        stop_words = stopwords

    # Tokens lose their punctuation before filtering, so the stopwords must be
    # normalised the same way; otherwise an entry such as "don't" can never
    # match the token "dont". A set also gives constant-time membership tests.
    excluded = {word.translate(translator).lower() for word in stop_words}

    # Strip punctuation, lowercase, then tokenize on whitespace
    tokens = text.translate(translator).lower().split()

    # Count the frequency of each word that is not a stopword
    word_freq = {}
    for token in tokens:
        if token not in excluded:
            word_freq[token] = word_freq.get(token, 0) + 1

    return word_freq

In [61]:
# Documents are separated by blank lines. Empty chunks are dropped so that a
# blank line at the very end of the file does not yield an extra, empty
# document and break the unpacking below.
documents = [chunk for chunk in contents.split('\n\n') if chunk.strip()]

# Keep the individual names available; this still requires exactly five
# non-empty documents in the file.
document1, document2, document3, document4, document5 = documents

In [63]:
# Map each document label ("Document 1", "Document 2", ...) to its word counts
word_frequencies = {
    f"Document {position}": preprocess_and_count_words(text)
    for position, text in enumerate(documents, start=1)
}

# One row per document, one column per word; words absent from a document
# come out as NaN, so they are replaced by 0 and the counts cast back to int
word_freq_df = (
    pd.DataFrame.from_dict(word_frequencies, orient='index')
    .fillna(0)
    .astype(int)
)

word_freq_df

Unnamed: 0,lorem,ipsum,dolor,sit,amet,consectetur,adipiscing,elit,phasellus,ornare,...,hendrerit,tempus,mattis,praesent,imperdiet,feugiat,potenti,sem,gravida,placerat
Document 1,2,2,2,3,3,1,1,1,3,2,...,0,0,0,0,0,0,0,0,0,0
Document 3,1,2,3,2,2,2,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
Document 4,1,0,0,1,1,0,0,1,0,0,...,1,1,1,0,0,0,0,0,0,0
Document 2,0,1,0,1,1,1,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
Document 5,0,0,0,1,1,0,0,0,0,0,...,0,0,0,1,1,1,1,1,1,1


In [82]:
def cosine_similarity(word_freq_df):
    '''Compute the pairwise cosine similarity between the rows of a table.

    Parameters
    ----------
    word_freq_df : pandas.DataFrame or 2-D array-like
        One row per document and one numeric column per word.

    Returns
    -------
    numpy.ndarray
        Square ``(n_rows, n_rows)`` float matrix whose entry ``[i, j]`` is the
        cosine of the angle between rows ``i`` and ``j``. Any pair involving
        an all-zero row gets a similarity of 0.
    '''
    # Work in float64: integer dot products keep the input dtype and can
    # silently wrap around for narrow integer types.
    vectors = np.asarray(word_freq_df, dtype=float)

    # Each row norm is computed once instead of once per pair
    norms = np.linalg.norm(vectors, axis=1)

    # All pairwise dot products and norm products at once
    dot_products = vectors @ vectors.T
    norm_products = np.outer(norms, norms)

    # Divide only where the denominator is non-zero; the remaining entries
    # keep the 0 they were initialised with.
    cosine_sim_matrix = np.zeros_like(dot_products)
    np.divide(dot_products, norm_products, out=cosine_sim_matrix, where=norm_products != 0)

    return cosine_sim_matrix


In [83]:
# Pairwise similarities between documents, then labelled on both axes with
# the row labels of the word-frequency table
cosine_sim_matrix = cosine_similarity(word_freq_df)
cosine_sim_df = pd.DataFrame(
    cosine_sim_matrix,
    index=word_freq_df.index,
    columns=word_freq_df.index,
)

In [84]:
# Display the document-by-document cosine-similarity table
cosine_sim_df

Unnamed: 0,Document 1,Document 3,Document 4,Document 2,Document 5
Document 1,1.0,0.580721,0.347434,0.482861,0.423972
Document 3,0.580721,1.0,0.261861,0.402467,0.325407
Document 4,0.347434,0.261861,1.0,0.297143,0.214121
Document 2,0.482861,0.402467,0.297143,1.0,0.258573
Document 5,0.423972,0.325407,0.214121,0.258573,1.0
