In [None]:
import math
from collections import Counter, defaultdict

def compute_tf(text):
    """
    Compute Term Frequency (TF) for a single document.

    TF(t) = (Number of times term t appears in a document) / (Total number of terms in the document)

    Args:
        text: The document as a string. Terms are obtained by splitting on
            whitespace; no other normalisation is applied here.

    Returns:
        A dict mapping each distinct term to its relative frequency, with keys
        in order of first appearance. A document with no terms (empty or
        whitespace-only) yields an empty dict.
    """
    words = text.split()
    total_words = len(words)

    # Counter preserves first-seen order. When there are no words the
    # comprehension body never runs, so no division by zero can occur.
    return {word: count / total_words for word, count in Counter(words).items()}

def compute_idf(documents):
    """
    Build the Inverse Document Frequency (IDF) table for a document collection.

    IDF(t) = log_e(Total number of documents / Number of documents containing t)

    Each document is split on whitespace, and a term is counted at most once
    per document. The result is a defaultdict(int) keyed by term, so looking
    up a term that was never seen yields 0.
    """
    n_docs = len(documents)

    # Document frequency: number of documents mentioning each term at least once
    doc_freq = defaultdict(int)
    for document in documents:
        for term in set(document.split()):
            doc_freq[term] += 1

    # Convert the raw document counts into IDF weights
    weights = {term: math.log(n_docs / df) for term, df in doc_freq.items()}
    return defaultdict(int, weights)

def compute_tfidf(documents):
    """
    Score every term of every document with TF-IDF.

    TF-IDF(t) = TF(t) * IDF(t)

    Documents are lower-cased before any counting. The return value is a list
    with one dict per input document (same order), mapping term -> score.
    """
    # Case-fold once so TF and IDF are computed over the same tokens
    lowered = [doc.lower() for doc in documents]

    # IDF is a property of the whole collection, so compute it a single time
    idf = compute_idf(lowered)

    # For each document, weight its term frequencies by the collection-wide IDF
    return [
        {term: freq * idf[term] for term, freq in compute_tf(doc).items()}
        for doc in lowered
    ]

def print_tfidf_results(documents, tfidf_results):
    """Print each document followed by its terms, highest TF-IDF score first."""
    for number, (doc, scores) in enumerate(zip(documents, tfidf_results), start=1):
        print(f"\nDocument {number}: '{doc}'")
        print("TF-IDF Scores:")

        # Descending by score; the sort is stable, so ties keep their dict order
        ranked = sorted(scores.items(), key=lambda item: item[1], reverse=True)
        for word, score in ranked:
            print(f"  {word}: {score:.4f}")

# Small sample corpus used to demonstrate the TF-IDF pipeline
documents = [
    "The quick brown fox jumps over the lazy dog",
    "Never jump over the lazy dog quickly",
    "A fast fox is better than a slow dog",
    "Dogs and foxes are both canines",
]

# Score every term of every document in the corpus
tfidf_results = compute_tfidf(documents)

# Report the scores, one document at a time
print("TF-IDF Calculation Results:")
print_tfidf_results(documents, tfidf_results)

TF-IDF Calculation Results:

Document 1: 'The quick brown fox jumps over the lazy dog'
TF-IDF Scores:
  the: 0.1540
  quick: 0.1540
  brown: 0.1540
  jumps: 0.1540
  fox: 0.0770
  over: 0.0770
  lazy: 0.0770
  dog: 0.0320

Document 2: 'Never jump over the lazy dog quickly'
TF-IDF Scores:
  never: 0.1980
  jump: 0.1980
  quickly: 0.1980
  over: 0.0990
  the: 0.0990
  lazy: 0.0990
  dog: 0.0411

Document 3: 'A fast fox is better than a slow dog'
TF-IDF Scores:
  a: 0.3081
  fast: 0.1540
  is: 0.1540
  better: 0.1540
  than: 0.1540
  slow: 0.1540
  fox: 0.0770
  dog: 0.0320

Document 4: 'Dogs and foxes are both canines'
TF-IDF Scores:
  dogs: 0.2310
  and: 0.2310
  foxes: 0.2310
  are: 0.2310
  both: 0.2310
  canines: 0.2310
