In [5]:
import os
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import string

nltk.download('punkt')
nltk.download('stopwords')

documents_path = "Documents"

if not os.path.exists(documents_path):
    raise FileNotFoundError(f"The specified path {documents_path} does not exist. Please provide the correct path.")

def read_file(filepath):
    try:
        with open(filepath, 'r', encoding='utf-8') as file:
            return file.read()
    except UnicodeDecodeError:
        with open(filepath, 'r', encoding='latin-1') as file:
            return file.read()

# Tokenization
all_tokens = []
documents_tokens = {}

for filename in os.listdir(documents_path):
    if filename.endswith(".txt"):
        filepath = os.path.join(documents_path, filename)
        text = read_file(filepath)
        tokens = word_tokenize(text)
        documents_tokens[filename] = tokens
        all_tokens.extend(tokens)

total_tokens_count = len(all_tokens)
unique_tokens_count = len(set(all_tokens))

print("Number of tokens in each document:")
for doc, tokens in documents_tokens.items():
    print(f"{doc}: {len(tokens)} tokens")

print(f"Total number of tokens in the entire collection: {total_tokens_count}")
print(f"Number of unique tokens in the entire collection: {unique_tokens_count}")

# Stop Words Removal
stop_words = set(stopwords.words('english'))
documents_tokens_no_stopwords = {}

for doc, tokens in documents_tokens.items():
    filtered_tokens = [token for token in tokens if token.lower() not in stop_words and token not in string.punctuation]
    documents_tokens_no_stopwords[doc] = filtered_tokens

all_tokens_no_stopwords = [token for token in all_tokens if token.lower() not in stop_words and token not in string.punctuation]
total_tokens_no_stopwords_count = len(all_tokens_no_stopwords)
unique_tokens_no_stopwords_count = len(set(all_tokens_no_stopwords))

print("Number of tokens in each document after removing stop words:")
for doc, tokens in documents_tokens_no_stopwords.items():
    print(f"{doc}: {len(tokens)} tokens")

print(f"Total number of tokens in the entire collection after removing stop words: {total_tokens_no_stopwords_count}")
print(f"Number of unique tokens in the entire collection after removing stop words: {unique_tokens_no_stopwords_count}")


# Compute TF-IDF
documents_texts = [" ".join(tokens) for tokens in documents_tokens_no_stopwords.values()]
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(documents_texts)

print("TF-IDF Matrix:")
print(tfidf_matrix.toarray())

# Compute Cosine Similarity
cosine_sim_matrix = cosine_similarity(tfidf_matrix)

print("Cosine Similarity Matrix:")
print(cosine_sim_matrix)


Number of tokens in each document:
Jerry Decided To Buy a Gun.txt: 302 tokens
Rentals at the Oceanside Community.txt: 376 tokens
Gasoline Prices Hit Record High.txt: 292 tokens
Cloning Pets.txt: 262 tokens
Crazy Housing Prices.txt: 390 tokens
Man Injured at Fast Food Place.txt: 170 tokens
A Festival of Books.txt: 307 tokens
Food Fight Erupted in Prison.txt: 222 tokens
Better To Be Unlucky.txt: 356 tokens
Sara Went Shopping.txt: 165 tokens
Freeway Chase Ends at Newsstand.txt: 335 tokens
Trees Are a Threat.txt: 335 tokens
A Murder-Suicide.txt: 398 tokens
Happy and Unhappy Renters.txt: 313 tokens
Pulling Out Nine Tons of Trash.txt: 293 tokens
Total number of tokens in the entire collection: 4516
Number of unique tokens in the entire collection: 1475
Number of tokens in each document after removing stop words:
Jerry Decided To Buy a Gun.txt: 136 tokens
Rentals at the Oceanside Community.txt: 193 tokens
Gasoline Prices Hit Record High.txt: 150 tokens
Cloning Pets.txt: 122 tokens
Crazy Housi

[nltk_data] Downloading package punkt to /Users/ansha/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/ansha/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
