In [1]:
# Source: https://towardsdatascience.com/tf-idf-for-document-ranking-from-scratch-in-python-on-real-world-dataset-796d339a4089

# TF-IDF algorithm:
# tf(t, d) = count of t in d / number of words in d
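# where N = total number of documents and df = number of documents containing t
# idf(t) = N/df  (basic, unsmoothed ratio)
# idf(t) = log(N/(df + 1))  (log-dampened and smoothed form; this is the one used below)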
# tf-idf(t, d) = tf(t, d) * log(N/(df + 1))

Hello


In [None]:
import os
from collections import defaultdict
import math

# Document frequency (DF): maps each token to the set of document indexes
# (positions in `documents`) in which that token occurs at least once.
DF = defaultdict(set)

# Load every document in full and fill in DF.
# os.listdir() returns names in arbitrary order, so they are sorted to keep
# document indexes (and the order of `document_tf_idf`) stable between runs.
# Entries that are not regular files (for example a subdirectory) are skipped
# because open() would raise on them.
documents = []
for filename in sorted(os.listdir(preprocessed_directory)):
    file_path = os.path.join(preprocessed_directory, filename)
    if not os.path.isfile(file_path):
        continue
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()
    # Index by position in `documents`, not by directory position, so that
    # skipped entries never leave gaps in the numbering.
    doc_index = len(documents)
    documents.append(text)
    for word in set(text.split()):
        DF[word].add(doc_index)

# Number of documents actually loaded
num_documents = len(documents)

# Inverse Document Frequency (IDF): log(N / (df + 1)).
# The +1 smoothing means a token that occurs in every document gets
# N / (N + 1) < 1 and therefore a slightly negative IDF.
IDF = {word: math.log(num_documents / (1 + len(doc_indexes))) for word, doc_indexes in DF.items()}

# Term Frequency (TF) and TF-IDF for each document, in the same order as
# `documents`.
document_tf_idf = []
for text in documents:
    tokens = text.split()

    # Raw occurrence count of each token in this document.
    tf = defaultdict(int)
    for word in tokens:
        tf[word] += 1

    # Normalise counts by document length, then weight by IDF. An empty
    # document has no tokens, so the loop body (and the division) never runs.
    doc_tf_idf = {}
    for word, count in tf.items():
        normalized_tf = count / len(tokens)
        doc_tf_idf[word] = normalized_tf * IDF[word]

    document_tf_idf.append(doc_tf_idf)

# 'document_tf_idf' contains the TF-IDF score for each word in each document


In [None]:
def get_top_n_tokens(document_tf_idf, n):
    """Return the `n` tokens with the largest summed TF-IDF score.

    Args:
        document_tf_idf: iterable of dicts, one per document, mapping a token
            to its TF-IDF score in that document.
        n: how many leading entries of the ranking to return (applied as a
            slice `[:n]`).

    Returns:
        A list of `(token, total_score)` tuples ordered by descending total
        score. Tokens with equal totals keep their first-seen order.
    """
    # Sum each token's score over every document it appears in.
    totals = {}
    for per_doc_scores in document_tf_idf:
        for term in per_doc_scores:
            totals[term] = totals.get(term, 0.0) + per_doc_scores[term]

    # Rank by total score, highest first (stable for ties).
    ranking = list(totals.items())
    ranking.sort(key=lambda pair: pair[1], reverse=True)

    return ranking[:n]

# Number of tokens to use for vector length
n = 1024
top_tokens = get_top_n_tokens(document_tf_idf, n)

# Report the ranking: a header line, then one "token: score" line per entry.
header = "Top {} tokens:".format(n)
print(header)
for entry in top_tokens:
    token, score = entry
    print(f"{token}: {score}")
