In [51]:
import os
import re
import numpy as np
import pandas as pd

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# --- Topic Modeling (scikit-learn) ---
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation, NMF

# Download required NLTK data (runs once; safe to re-run)
nltk.download('punkt',        quiet=True)
nltk.download('punkt_tab',    quiet=True)
nltk.download('stopwords',    quiet=True)
nltk.download('wordnet',      quiet=True)

True

In [52]:
# ── Configuration ────────────────────────────────────────────────────────────
# Paths are relative to the notebook location inside '2. Topic_modelling/'
DATA_DIR  = "../1.Collection_midterm/ucsb_scraping/data/raw_texts/"
META_PATH = "../1.Collection_midterm/ucsb_scraping/data/metadata.csv"

K_VALUES     = [5, 10, 15]   # number of topics to evaluate
TOP_N        = 10            # top words to display per topic
RANDOM_STATE = 42            # seed for reproducibility

# Step 1: Preprocessing

In [53]:
# ─────────────────────────────────────────────────────────────────────────────
# CELL 4: Preprocessing Pipeline
# ─────────────────────────────────────────────────────────────────────────────

# ── 1. Load metadata ─────────────────────────────────────────────────────────
metadata = pd.read_csv(META_PATH)
print(f"Metadata loaded: {len(metadata)} rows")
print(metadata.head(3))

Metadata loaded: 485 rows
      filename       date                                              title  \
0  doc_000.txt  18-Aug-80  Address to the Veterans of Foreign Wars Conven...   
1  doc_001.txt  20-Jan-81                       Ronald Reagan Event Timeline   
2  doc_002.txt   3-Mar-81  Excerpts From an Interview With Walter Cronkit...   

       president                                                url  \
0  Ronald Reagan  https://www.presidency.ucsb.edu/documents/addr...   
1  Ronald Reagan  https://www.presidency.ucsb.edu/documents/rona...   
2  Ronald Reagan  https://www.presidency.ucsb.edu/documents/exce...   

                    source  
0  UCSB Presidency Project  
1  UCSB Presidency Project  
2  UCSB Presidency Project  


In [None]:
# --- 2. Load raw texts in filename order
filenames = []
for f in os.listdir(DATA_DIR):
    if f.endswith('.txt'):
        filenames.append(f)
filenames = sorted(filenames)
print(f"\nText files found: {len(filenames)}")

raw_texts = {}
for fname in filenames:
    with open(os.path.join(DATA_DIR, fname), 'r', encoding='utf-8', errors='replace') as fh:
        raw_texts[fname] = fh.read()


Text files found: 485


In [None]:
# 3. Define stopwords 
standard_stops = set(stopwords.words('english')) # using nltk function 

CUSTOM_STOPWORDS = {
    "america", "american", "americans", "people", "nation", "government",
    "president", "reagan", "united", "states", "would", "also", "said",
    "mr", "mrs", "secretary", "administration", "year", "years",
    "country", "well"
}

all_stops = standard_stops | CUSTOM_STOPWORDS

# Preprocessing function 
lemmatizer = WordNetLemmatizer()

def preprocess(text):
    #clean one doc and return list of processed tokens
    text = re.sub(r'\[.*?\]', ' ', text)   # remove [Applause], [Laughter], etc.
    text = text.lower()                     # lowercase
    text = re.sub(r'[^a-z\s]', ' ', text)  # keep only letters
    tokens = word_tokenize(text)            # split into words
    tokens = [
        lemmatizer.lemmatize(tok)
        for tok in tokens
        if tok not in all_stops and len(tok) >= 3
    ]
    return tokens

# ── 5. Apply preprocessing to all documents ──────────────────────────────────
tokenized_docs  = []
processed_texts = []
doc_filenames   = []

for fname in filenames:
    tokens = preprocess(raw_texts[fname])
    tokenized_docs.append(tokens)
    processed_texts.append(" ".join(tokens))
    doc_filenames.append(fname)

# ── 6. Sanity checks ──────────────────────────────────────────────────────────
total_tokens = sum(len(d) for d in tokenized_docs)
avg_tokens   = total_tokens / len(tokenized_docs)

print(f"\nDocuments preprocessed : {len(processed_texts)}")
print(f"Total tokens           : {total_tokens:,}")
print(f"Average tokens/doc     : {avg_tokens:.0f}")
print(f"\nSample tokens from doc_000:\n  {tokenized_docs[0][:20]}")


Documents preprocessed : 485
Total tokens           : 483,318
Average tokens/doc     : 997

Sample tokens from doc_000:
  ['iran', 'arm', 'contra', 'aid', 'controversy', 'good', 'evening', 'word', 'take', 'question', 'brief', 'remark', 'eighteen', 'month', 'ago', 'last', 'thursday', 'began', 'secret', 'initiative']


# Step 2: Vectorization 


In [48]:
# CountVectorizer for LDA --- raw integer word counts
count_vec = CountVectorizer(max_df = 0.8, min_df = 2)
dtm_count = count_vec.fit_transform(processed_texts)

# TfidVectorizer for NMF -- TF-ID weighted scores
tfidf_vec = TfidfVectorizer(max_df=0.8, min_df=2)
dtm_tfidf = tfidf_vec.fit_transform(processed_texts)
document_term_matrix = pd.DataFrame(
    dtm_tfidf.toarray(), index=doc_filenames, columns=tfidf_vec.get_feature_names_out()
)

print(f"DTM shape (count) : {dtm_count.shape}  <- (documents x vocabulary terms)")
print(f"DTM shape (TF-IDF): {dtm_tfidf.shape}")

# Vectorized corpus
print(processed_texts)

DTM shape (count) : (485, 11298)  <- (documents x vocabulary terms)
DTM shape (TF-IDF): (485, 11298)


In [49]:
# Extracting IDF weights from vectorizer object (slide 18 deck 11)
idf_weights = pd.Series(tfidf_vec.idf_, index=tfidf_vec.get_feature_names_out())

# 5 most distinctive words in corpus
for word, weight in idf_weights.sort_values(ascending=False).head(5).items():
    print(word, weight)

# 5 most distinctive words in corpus
for word, weight in idf_weights.sort_values().head(5).items():
    print(word, weight)

zulu 6.087596335232384
greening 6.087596335232384
goon 6.087596335232384
undervalued 6.087596335232384
goudie 6.087596335232384
government 1.2226292802820473
would 1.227783930870712
year 1.2303712544356629
one 1.2329652896127095
well 1.2407880152939186


# Step 3: Topic Modeling Algorithms


## LDA

In [50]:
print("Building LDA model using training set...", end=" ", flush=True)
n_topics = 20
lda = LatentDirichletAllocation(n_components=n_topics = )


SyntaxError: invalid syntax (2419967250.py, line 3)