In [12]:
import os
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

#--- NLP ---
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# --- Topic Modeling (scikit-learn) ---
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation, NMF

# Download required NLTK data (runs once; safe to re-run)
nltk.download('punkt',        quiet=True)
nltk.download('punkt_tab',    quiet=True)
nltk.download('stopwords',    quiet=True)
nltk.download('wordnet',      quiet=True)

True

In [13]:
# ── Configuration ────────────────────────────────────────────────────────────
# Paths are relative to the notebook location inside '2. Topic_modelling/'
DATA_DIR  = "../1.Collection_midterm/ucsb_scraping/data/raw_texts/"
META_PATH = "../1.Collection_midterm/ucsb_scraping/data/metadata.csv"

K_VALUES     = [5, 10, 15]   # number of topics to evaluate
TOP_N        = 10            # top words to display per topic
RANDOM_STATE = 42            # seed for reproducibility

# Step 1: Preprocessing

In [14]:
# ─────────────────────────────────────────────────────────────────────────────
# CELL 4: Preprocessing Pipeline
# ─────────────────────────────────────────────────────────────────────────────

# ── 1. Load metadata ─────────────────────────────────────────────────────────
metadata = pd.read_csv(META_PATH)
print(f"Metadata loaded: {len(metadata)} rows")
print(metadata.head(3))

Metadata loaded: 485 rows
      filename       date                                              title  \
0  doc_000.txt  18-Aug-80  Address to the Veterans of Foreign Wars Conven...   
1  doc_001.txt  20-Jan-81                       Ronald Reagan Event Timeline   
2  doc_002.txt   3-Mar-81  Excerpts From an Interview With Walter Cronkit...   

       president                                                url  \
0  Ronald Reagan  https://www.presidency.ucsb.edu/documents/addr...   
1  Ronald Reagan  https://www.presidency.ucsb.edu/documents/rona...   
2  Ronald Reagan  https://www.presidency.ucsb.edu/documents/exce...   

                    source  
0  UCSB Presidency Project  
1  UCSB Presidency Project  
2  UCSB Presidency Project  


In [15]:
# --- 2. Load raw texts in filename order
filenames = []
for f in os.listdir(DATA_DIR):
    if f.endswith('.txt'):
        filenames.append(f)
print(f"\nText files found: {len(filenames)}")

raw_texts = {}
for fname in filenames:
    with open(os.path.join(DATA_DIR, fname), 'r', encoding='utf-8', errors='replace') as fh:
        raw_texts[fname] = fh.read()


Text files found: 485


In [24]:
# 3. Define stopwords 
standard_stops = set(stopwords.words('english')) # using nltk function 

CUSTOM_STOPWORDS = {
    "america", "american", "americans", "people", "nation", "government",
    "president", "reagan", "united", "states", "would", "also", "said",
    "mr", "mrs", "secretary", "administration", "year", "years",
    "country"
}

all_stops = standard_stops | CUSTOM_STOPWORDS

# Preprocessing function 
lemmatizer = WordNetLemmatizer()

def preprocess(text):
    #clean one doc and return list of processed tokens
    text = re.sub(r'\[.*?\]', ' ', text)   # remove [Applause], [Laughter], etc.
    text = text.lower()                     # lowercase
    text = re.sub(r'[^a-z\s]', ' ', text)  # keep only letters
    tokens = word_tokenize(text)            # split into words
    tokens = [
        lemmatizer.lemmatize(tok)
        for tok in tokens
        if tok not in standard_stops and len(tok) >= 3
    ]
    return tokens

# ── 5. Apply preprocessing to all documents ──────────────────────────────────
tokenized_docs  = []
processed_texts = []
doc_filenames   = []

for fname in filenames:
    tokens = preprocess(raw_texts[fname])
    tokenized_docs.append(tokens)
    processed_texts.append(" ".join(tokens))
    doc_filenames.append(fname)

# ── 6. Sanity checks ──────────────────────────────────────────────────────────
total_tokens = sum(len(d) for d in tokenized_docs)
avg_tokens   = total_tokens / len(tokenized_docs)

print(f"\nDocuments preprocessed : {len(processed_texts)}")
print(f"Total tokens           : {total_tokens:,}")
print(f"Average tokens/doc     : {avg_tokens:.0f}")
print(f"\nSample tokens from doc_000:\n  {tokenized_docs[0][:20]}")


Documents preprocessed : 485
Total tokens           : 520,846
Average tokens/doc     : 1074

Sample tokens from doc_000:
  ['iran', 'arm', 'contra', 'aid', 'controversy', 'president', 'good', 'evening', 'word', 'take', 'question', 'brief', 'remark', 'eighteen', 'month', 'ago', 'said', 'last', 'thursday', 'administration']


# Step 2: Vectorization 
