# TF-IDF and Bag-of-Words Corpus

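This notebook defines a 50-document corpus and uses it to build bag-of-words and TF-IDF representations by hand, compare documents with cosine similarity, and optionally cross-check the shapes with scikit-learn.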


In [14]:
# Zero-padded identifiers "doc01" through "doc50", one per corpus entry.
doc_ids = ["doc{:02d}".format(number) for number in range(1, 51)]


# Fifty short, single-sentence English documents. List order matters: every
# matrix built from this corpus has one row per entry, in this order, and
# row i is labelled with doc_ids[i].
corpus = [
    "The cat sleeps on the warm window ledge.",
    "A dog chases a red ball across the yard.",
    "The sun rises early in summer and sets late.",
    "Rain falls softly on the quiet city streets.",
    "Fresh bread smells good in the morning.",
    "A chef slices tomatoes and stirs a soup.",
    "Coffee brews while the newspaper rustles.",
    "The runner ties her shoes before the race.",
    "A cyclist rides uphill with steady effort.",
    "The gym opens at six for early workouts.",
    "A teacher explains math with clear examples.",
    "Students take notes during a history lecture.",
    "The library is silent except for turning pages.",
    "A musician practices scales on the piano.",
    "The guitarist tunes strings before the show.",
    "A painter mixes blue and yellow into green.",
    "The gallery displays modern art and sculpture.",
    "The movie starts at eight with no previews.",
    "Popcorn and soda spill on the theater floor.",
    "A novelist edits a chapter late at night.",
    "The phone battery dies during a long call.",
    "A laptop boots slowly after a system update.",
    "The app crashes when the network is weak.",
    "A router blinks as data moves through the home.",
    "The server logs show errors and retries.",
    "The farmer plants seeds before spring rain.",
    "A tractor moves across the field in straight lines.",
    "Bees collect pollen from bright flowers.",
    "The garden grows tomatoes, basil, and peppers.",
    "A storm knocks branches onto the road.",
    "The traveler packs light for a weekend trip.",
    "A train arrives late at the busy station.",
    "The plane lands smoothly after a long flight.",
    "A taxi driver knows every shortcut downtown.",
    "The hotel lobby smells of clean linen.",
    "The chef prepares a spicy curry for dinner.",
    "A baker decorates a cake with fresh berries.",
    "The waiter refills water without being asked.",
    "The menu lists soup, salad, and sandwiches.",
    "A cyclist repairs a flat tire on the trail.",
    "The coach outlines strategy before the match.",
    "Fans cheer loudly as the team scores.",
    "The referee blows the whistle to stop play.",
    "A swimmer practices laps in the pool.",
    "The hiker checks a map at the trailhead.",
    "A camper lights a small fire for warmth.",
    "The lake is calm at sunrise with light fog.",
    "A photographer captures birds in flight.",
    "The market sells apples, rice, and tea.",
    "The cashier scans items and prints a receipt."
]


# Quick preview: number of documents and the raw text of the first one
len(corpus), corpus[0]


(50, 'The cat sleeps on the warm window ledge.')

## Preprocessing

Lowercase, remove punctuation, and optionally drop simple stopwords.


In [15]:
import re
from collections import Counter

# Small hand-picked set of English function words that preprocess() can drop.
STOPWORDS = set(
    "a an and are as at be but by for "
    "from in is it its not of on or that "
    "the this to was were with".split()
)

def preprocess(text, remove_stopwords=True, stopwords=None):
    """Normalize a raw document into a list of lowercase word tokens.

    The text is lowercased, every character that is not a letter, digit, or
    whitespace is deleted, and the remainder is split on whitespace.

    Args:
        text: Raw document string.
        remove_stopwords: When true (the default), tokens found in the
            stopword collection are dropped.
        stopwords: Optional collection of tokens to drop. When omitted, the
            notebook-level ``STOPWORDS`` set is used.

    Returns:
        A list of token strings in their original order (possibly empty).
    """
    text = text.lower()
    # For str patterns ``\w`` is Unicode-aware, so accented letters survive
    # ("café" stays "café"); an ASCII-only class such as [^a-z0-9\s] would
    # silently delete them. Underscore belongs to ``\w`` but counts as
    # punctuation here, so it is removed explicitly. Plain ASCII input is
    # tokenized exactly as with the ASCII-only class.
    text = re.sub(r'[^\w\s]|_', '', text)
    tokens = text.split()
    if not remove_stopwords:
        return tokens
    if stopwords is None:
        stopwords = STOPWORDS
    return [t for t in tokens if t not in stopwords]

# Tokenize every document once (stopwords removed); show the first three.
tokenized_corpus = list(map(preprocess, corpus))
tokenized_corpus[:3]


[['cat', 'sleeps', 'warm', 'window', 'ledge'],
 ['dog', 'chases', 'red', 'ball', 'across', 'yard'],
 ['sun', 'rises', 'early', 'summer', 'sets', 'late']]

## Bag of Words (manual)


In [16]:
# Vocabulary: every distinct token in the corpus, alphabetically ordered,
# plus a lookup from token to its column position.
vocab = sorted(set(token for doc in tokenized_corpus for token in doc))
vocab_index = dict(zip(vocab, range(len(vocab))))

# One row of raw term counts per document; columns follow `vocab`.
bow_matrix = []
for doc in tokenized_corpus:
    row = [0] * len(vocab)
    for token in doc:
        row[vocab_index[token]] += 1
    bow_matrix.append(row)

# Terms that actually occur in the first document, with their counts
doc0_counts = {
    vocab[column]: occurrences
    for column, occurrences in enumerate(bow_matrix[0])
    if occurrences
}
doc0_counts


{'cat': 1, 'ledge': 1, 'sleeps': 1, 'warm': 1, 'window': 1}

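## TF-IDF (manual)

Term frequency is a token's count divided by the number of tokens in the document. Inverse document frequency is smoothed as $\mathrm{idf}(t) = \ln\frac{N + 1}{\mathrm{df}(t) + 1} + 1$, where $N$ is the number of documents and $\mathrm{df}(t)$ is the number of documents that contain $t$.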


In [17]:
import math

# N: number of documents. df: for each token, how many documents contain it
# (set() collapses repeats so a token counts at most once per document).
N = len(tokenized_corpus)
df = Counter(token for doc in tokenized_corpus for token in set(doc))

def idf(term):
    """Return the smoothed inverse document frequency of `term`.

    Computed as ln((N + 1) / (df[term] + 1)) + 1 from the notebook-level
    corpus size `N` and document-frequency table `df`. Adding one to both
    counts keeps the ratio defined even when `df[term]` is zero.
    """
    smoothed_ratio = (N + 1) / (df[term] + 1)
    return math.log(smoothed_ratio) + 1

# IDF depends only on the term, so compute it once per vocabulary entry
# instead of once for every (document, term) pair.
idf_weights = [idf(term) for term in vocab]

# One row per document, columns aligned with `vocab`. Each cell is
# (term count / document length) * idf; an empty document gets an all-zero row.
tfidf_matrix = []
for doc in tokenized_corpus:
    counts = Counter(doc)
    doc_len = len(doc)
    row = []
    for term, term_idf in zip(vocab, idf_weights):
        # Guard against division by zero for documents with no tokens left.
        tf = counts[term] / doc_len if doc_len else 0.0
        row.append(tf * term_idf)
    tfidf_matrix.append(row)

def top_terms(doc_index, k=5):
    """Return the `k` highest-scoring (term, score) pairs for one document.

    Scores come from row `doc_index` of the notebook-level `tfidf_matrix`,
    paired column-by-column with `vocab`. The sort is stable, so terms with
    equal scores stay in `vocab` order.
    """
    ranked = list(zip(vocab, tfidf_matrix[doc_index]))
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked[:k]

# Highest-weighted terms of the first document
top_terms(0)


[('cat', 0.8477356904328762),
 ('ledge', 0.8477356904328762),
 ('sleeps', 0.8477356904328762),
 ('warm', 0.8477356904328762),
 ('window', 0.8477356904328762)]

## Cosine similarity

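Compute cosine similarity between documents using the manual BoW and TF-IDF matrices. Note that the first document shares no non-stopword tokens with any other document, so every similarity shown for it below is 0.0 and its "top 5" is simply the first five other documents in corpus order.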

In [18]:
import math

def cosine_sim(v1, v2):
    """Return the cosine similarity of two numeric vectors.

    The dot product pairs elements with zip(), so it stops at the shorter
    vector; each norm is taken over that vector's full length. If either
    vector has zero magnitude the result is 0.0 rather than a division error.
    """
    dot_product = sum(x * y for x, y in zip(v1, v2))
    magnitude1 = math.sqrt(sum(x * x for x in v1))
    magnitude2 = math.sqrt(sum(y * y for y in v2))
    if not (magnitude1 and magnitude2):
        return 0.0
    return dot_product / (magnitude1 * magnitude2)

def cosine_matrix(matrix):
    """Return the pairwise cosine-similarity table for the rows of `matrix`.

    Args:
        matrix: Sequence of numeric row vectors (expected to share a length).

    Returns:
        A list of lists `sims` where sims[i][j] is the cosine similarity of
        rows i and j: dot(i, j) / (norm(i) * norm(j)), or 0.0 when either
        row has zero magnitude. An empty `matrix` yields an empty list.
    """
    size = len(matrix)
    # A row's norm is identical in every pair it takes part in, so compute
    # each one once instead of once per pair.
    norms = [math.sqrt(sum(x * x for x in row)) for row in matrix]
    sims = [[0.0] * size for _ in range(size)]
    for i in range(size):
        for j in range(i, size):
            if norms[i] and norms[j]:
                dot = sum(a * b for a, b in zip(matrix[i], matrix[j]))
                value = dot / (norms[i] * norms[j])
                # Cosine similarity is symmetric: one computation fills both
                # the (i, j) and (j, i) cells.
                sims[i][j] = value
                sims[j][i] = value
    return sims

def top_similar(doc_index, matrix, k=5):
    """Return the `k` documents most similar to row `doc_index` of `matrix`.

    Every other row is scored against the query row with cosine_sim() and
    labelled with the matching entry of the notebook-level `doc_ids`. The
    result is a list of (doc_id, similarity) pairs, highest similarity
    first; the sort is stable, so ties stay in row order.
    """
    scored = [
        (doc_ids[position], cosine_sim(matrix[doc_index], candidate))
        for position, candidate in enumerate(matrix)
        if position != doc_index
    ]
    return sorted(scored, key=lambda pair: pair[1], reverse=True)[:k]

# Full pairwise similarity tables, first for raw counts and then for TF-IDF.
bow_cosine, tfidf_cosine = (
    cosine_matrix(vectors) for vectors in (bow_matrix, tfidf_matrix)
)

# Five nearest neighbours of the first document under each representation.
tuple(top_similar(0, vectors, k=5) for vectors in (bow_matrix, tfidf_matrix))

([('doc02', 0.0),
  ('doc03', 0.0),
  ('doc04', 0.0),
  ('doc05', 0.0),
  ('doc06', 0.0)],
 [('doc02', 0.0),
  ('doc03', 0.0),
  ('doc04', 0.0),
  ('doc05', 0.0),
  ('doc06', 0.0)])

## Optional: scikit-learn vectorizers


In [19]:
# scikit-learn is optional here; if it is missing, print an install hint
# instead of failing the cell.
try:
    from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
except ImportError:
    print('scikit-learn not installed. Run: pip install scikit-learn')
else:
    bow = CountVectorizer().fit_transform(corpus)
    tfidf = TfidfVectorizer().fit_transform(corpus)
    # Jupyter only auto-displays the last expression at the top level of a
    # cell. A bare expression inside this `else` block would be evaluated
    # and discarded, so the shapes are printed explicitly.
    print(bow.shape, tfidf.shape)


scikit-learn not installed. Run: pip install scikit-learn
