In [6]:
import os
from collections import Counter
import nltk
from nltk.tokenize import word_tokenize

# Download the required NLTK tokenizer model
nltk.download('punkt')

# Path to the documents directory
documents_path = './documents'

# Function for tokenization using NLTK
def basic_tokenize(text):
    tokens = word_tokenize(text)
    return tokens

# Counters for token counts
token_counts_per_doc = {}
total_token_count = Counter()

# Process the files
for filename in os.listdir(documents_path):
    file_path = os.path.join(documents_path, filename)
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()
        tokens = basic_tokenize(text)
        token_counts_per_doc[filename] = len(tokens)
        total_token_count.update(tokens)

# Get the sum of all tokens
total_tokens = sum(total_token_count.values())

# Get the number of unique tokens
unique_tokens = len(total_token_count)

# Print results
print("Token counts per document:")
for filename, count in token_counts_per_doc.items():
    print(f"{filename}: {count} tokens")

print("\nTotal number of tokens in the entire collection:", total_tokens)
print("Number of unique tokens in the entire collection:", unique_tokens)


[nltk_data] Downloading package punkt to /Users/Arman/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Token counts per document:
Jerry Decided To Buy a Gun.txt: 305 tokens
Rentals at the Oceanside Community.txt: 380 tokens
Gasoline Prices Hit Record High.txt: 303 tokens
Cloning Pets.txt: 266 tokens
Crazy Housing Prices.txt: 403 tokens
Man Injured at Fast Food Place.txt: 173 tokens
A Festival of Books.txt: 305 tokens
Food Fight Erupted in Prison.txt: 222 tokens
Better To Be Unlucky.txt: 359 tokens
Sara Went Shopping.txt: 165 tokens
Freeway Chase Ends at Newsstand.txt: 335 tokens
Trees Are a Threat.txt: 344 tokens
A Murder-Suicide.txt: 407 tokens
Happy and Unhappy Renters.txt: 317 tokens
Pulling Out Nine Tons of Trash.txt: 298 tokens

Total number of tokens in the entire collection: 4582
Number of unique tokens in the entire collection: 1427


In [8]:
import os
from collections import Counter
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Check if stopwords are downloaded
nltk.download('stopwords')
nltk.download('punkt')

# Get list of stop words
stop_words = set(stopwords.words('english'))

# Function for tokenization using NLTK
def basic_tokenize(text):
    tokens = word_tokenize(text)
    return tokens

# Remove stop words from tokens
def remove_stop_words(tokens):
    return [token for token in tokens if token.lower() not in stop_words]

# Counters for tokens after stop words removal
token_counts_per_doc_no_stop = {}
total_token_count_no_stop = Counter()

documents_path = './documents'  

# Process each file in the documents folder
for filename in os.listdir(documents_path):
    file_path = os.path.join(documents_path, filename)
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()
        tokens = basic_tokenize(text)
        filtered_tokens = remove_stop_words(tokens)
        token_counts_per_doc_no_stop[filename] = len(filtered_tokens)
        total_token_count_no_stop.update(filtered_tokens)

# Total number of tokens in the entire collection after stop words removal
total_tokens_no_stop = sum(total_token_count_no_stop.values())

# Number of unique tokens in the entire collection after stop words removal
unique_tokens_no_stop = len(total_token_count_no_stop)

# Print results
print("Token counts per document (no stop words):")
for filename, count in token_counts_per_doc_no_stop.items():
    print(f"{filename}: {count} tokens")

print("\nTotal number of tokens in the entire collection (no stop words):", total_tokens_no_stop)
print("Number of unique tokens in the entire collection (no stop words):", unique_tokens_no_stop)


Token counts per document (no stop words):
Jerry Decided To Buy a Gun.txt: 176 tokens
Rentals at the Oceanside Community.txt: 224 tokens
Gasoline Prices Hit Record High.txt: 176 tokens
Cloning Pets.txt: 158 tokens
Crazy Housing Prices.txt: 226 tokens
Man Injured at Fast Food Place.txt: 107 tokens
A Festival of Books.txt: 198 tokens
Food Fight Erupted in Prison.txt: 140 tokens
Better To Be Unlucky.txt: 218 tokens
Sara Went Shopping.txt: 112 tokens
Freeway Chase Ends at Newsstand.txt: 186 tokens
Trees Are a Threat.txt: 196 tokens
A Murder-Suicide.txt: 221 tokens
Happy and Unhappy Renters.txt: 185 tokens
Pulling Out Nine Tons of Trash.txt: 201 tokens

Total number of tokens in the entire collection (no stop words): 2724
Number of unique tokens in the entire collection (no stop words): 1265


[nltk_data] Downloading package stopwords to /Users/Arman/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/Arman/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [4]:
import os
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

documents = []
filenames = []
documents_path = './documents'  

for filename in os.listdir(documents_path):
    file_path = os.path.join(documents_path, filename)
    with open(file_path, 'r', encoding='latin-1') as file:
        text = file.read()
        documents.append(text)
        filenames.append(filename)

# Initialize the TF-IDF Vectorizer
vectorizer = TfidfVectorizer(stop_words='english')  

tfidf_matrix = vectorizer.fit_transform(documents)

#convert to data frame for print
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), index=filenames, columns=vectorizer.get_feature_names_out())

print("TF-IDF Matrix:")

print(tfidf_df)


TF-IDF Matrix:
                                              00       000        09  \
Jerry Decided To Buy a Gun.txt          0.000000  0.037550  0.000000   
Rentals at the Oceanside Community.txt  0.000000  0.033862  0.000000   
Gasoline Prices Hit Record High.txt     0.000000  0.000000  0.074431   
Cloning Pets.txt                        0.000000  0.130012  0.000000   
Crazy Housing Prices.txt                0.000000  0.170128  0.000000   
Man Injured at Fast Food Place.txt      0.101645  0.000000  0.000000   
A Festival of Books.txt                 0.000000  0.081401  0.000000   
Food Fight Erupted in Prison.txt        0.000000  0.000000  0.000000   
Better To Be Unlucky.txt                0.000000  0.123298  0.000000   
Sara Went Shopping.txt                  0.000000  0.000000  0.000000   
Freeway Chase Ends at Newsstand.txt     0.000000  0.000000  0.000000   
Trees Are a Threat.txt                  0.000000  0.154807  0.000000   
A Murder-Suicide.txt                    0.000000 

In [5]:
from sklearn.metrics.pairwise import cosine_similarity

# Compute the cosine similarity matrix
cosine_sim_matrix = cosine_similarity(tfidf_matrix)

# Convert the similarity matrix to a DataFrame
cosine_sim_df = pd.DataFrame(cosine_sim_matrix, index=filenames, columns=filenames)

print("Cosine Similarity Matrix:")

print(cosine_sim_df)

Cosine Similarity Matrix:
                                        Jerry Decided To Buy a Gun.txt  \
Jerry Decided To Buy a Gun.txt                                1.000000   
Rentals at the Oceanside Community.txt                        0.011724   
Gasoline Prices Hit Record High.txt                           0.052392   
Cloning Pets.txt                                              0.036922   
Crazy Housing Prices.txt                                      0.071246   
Man Injured at Fast Food Place.txt                            0.069547   
A Festival of Books.txt                                       0.045236   
Food Fight Erupted in Prison.txt                              0.020018   
Better To Be Unlucky.txt                                      0.032867   
Sara Went Shopping.txt                                        0.057951   
Freeway Chase Ends at Newsstand.txt                           0.032660   
Trees Are a Threat.txt                                        0.018511   
A Murder-Sui