In [19]:
import os
import re
import numpy as np
import pandas as pd

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# --- Topic Modeling (scikit-learn) ---
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation, NMF

# Download required NLTK data (runs once; safe to re-run)
nltk.download('punkt',        quiet=True)
nltk.download('punkt_tab',    quiet=True)
nltk.download('stopwords',    quiet=True)
nltk.download('wordnet',      quiet=True)

True

In [20]:
# ── Configuration ────────────────────────────────────────────────────────────
# Paths are relative to the notebook location inside '2. Topic_modelling/'
DATA_DIR  = "../1.Collection_midterm/ucsb_scraping/data/raw_texts/"
META_PATH = "../1.Collection_midterm/ucsb_scraping/data/metadata.csv"

K_VALUES     = [5, 10, 15]   # number of topics to evaluate
TOP_N        = 10            # top words to display per topic
RANDOM_STATE = 42            # seed for reproducibility

# Step 1: Preprocessing

In [21]:
# ─────────────────────────────────────────────────────────────────────────────
# CELL 4: Preprocessing Pipeline
# ─────────────────────────────────────────────────────────────────────────────

# ── 1. Load metadata ─────────────────────────────────────────────────────────
metadata = pd.read_csv(META_PATH)
print(f"Metadata loaded: {len(metadata)} rows")
print(metadata.head(3))

Metadata loaded: 485 rows
      filename       date                                              title  \
0  doc_000.txt  18-Aug-80  Address to the Veterans of Foreign Wars Conven...   
1  doc_001.txt  20-Jan-81                       Ronald Reagan Event Timeline   
2  doc_002.txt   3-Mar-81  Excerpts From an Interview With Walter Cronkit...   

       president                                                url  \
0  Ronald Reagan  https://www.presidency.ucsb.edu/documents/addr...   
1  Ronald Reagan  https://www.presidency.ucsb.edu/documents/rona...   
2  Ronald Reagan  https://www.presidency.ucsb.edu/documents/exce...   

                    source  
0  UCSB Presidency Project  
1  UCSB Presidency Project  
2  UCSB Presidency Project  


In [22]:
# --- 2. Load raw texts in filename order
filenames = []
for f in os.listdir(DATA_DIR):
    if f.endswith('.txt'):
        filenames.append(f)
filenames = sorted(filenames)
print(f"\nText files found: {len(filenames)}")

raw_texts = {}
for fname in filenames:
    with open(os.path.join(DATA_DIR, fname), 'r', encoding='utf-8', errors='replace') as fh:
        raw_texts[fname] = fh.read()


Text files found: 485


In [23]:
# 3. Define stopwords 
standard_stops = set(stopwords.words('english')) # using nltk function 

CUSTOM_STOPWORDS = {
    "america", "american", "americans", "people", "nation", "government",
    "president", "reagan", "united", "states", "would", "also", "said",
    "mr", "mrs", "secretary", "administration", "year", "years",
    "country", "world", "think", "know", "want", "get", "let", "make",
    "well", "going", "today", "new", "one", "time", "come", "back",
    "nicaragua", "nicaraguan",
    "congress", "house", "senate", "vote", "law", "policy", "support",
    "must", "us", "every", "even", "still", "last", "first", "two",
    "ask", "say", "see", "take", "right", "way", "work"
}

all_stops = standard_stops | CUSTOM_STOPWORDS

# Preprocessing function 
lemmatizer = WordNetLemmatizer()

def preprocess(text):
    #clean one doc and return list of processed tokens
    text = re.sub(r'\[.*?\]', ' ', text)   # remove [Applause], [Laughter], etc.
    text = text.lower()                     # lowercase
    text = re.sub(r'[^a-z\s]', ' ', text)  # keep only letters
    tokens = word_tokenize(text)            # split into words
    tokens = [
        lemmatizer.lemmatize(tok)
        for tok in tokens
        if tok not in all_stops and len(tok) >= 3
    ]
    return tokens

# ── 5. Apply preprocessing to all documents ──────────────────────────────────
tokenized_docs  = []
processed_texts = []
doc_filenames   = []

for fname in filenames:
    tokens = preprocess(raw_texts[fname])
    tokenized_docs.append(tokens)
    processed_texts.append(" ".join(tokens))
    doc_filenames.append(fname)

# ── 6. Sanity checks ──────────────────────────────────────────────────────────
total_tokens = sum(len(d) for d in tokenized_docs)
avg_tokens   = total_tokens / len(tokenized_docs)

print(f"\nDocuments preprocessed : {len(processed_texts)}")
print(f"Total tokens           : {total_tokens:,}")
print(f"Average tokens/doc     : {avg_tokens:.0f}")
print(f"\nSample tokens from doc_000:\n  {tokenized_docs[0][:20]}")


Documents preprocessed : 485
Total tokens           : 430,886
Average tokens/doc     : 888

Sample tokens from doc_000:
  ['thank', 'commander', 'vanderclute', 'four', 'week', 'ago', 'deeply', 'honored', 'national', 'convention', 'party', 'accept', 'greatest', 'honor', 'bestow', 'nomination', 'presidency', 'wonderful', 'pleasure', 'accept']


# Step 2: Vectorization 


In [None]:
# CountVectorizer for LDA --- raw integer word counts
count_vec = CountVectorizer(max_df = 0.8, min_df = 2)
dtm_count = count_vec.fit_transform(processed_texts)

# TfidVectorizer for NMF -- TF-ID weighted scores
tfidf_vec = TfidfVectorizer(max_df=0.8, min_df=2)
dtm_tfidf = tfidf_vec.fit_transform(processed_texts)
document_term_matrix = pd.DataFrame(
    dtm_tfidf.toarray(), index=doc_filenames, columns=tfidf_vec.get_feature_names_out()
)

print(f"DTM shape (count) : {dtm_count.shape}  <- (documents x vocabulary terms)")
print(f"DTM shape (TF-IDF): {dtm_tfidf.shape}")

DTM shape (count) : (485, 11285)  <- (documents x vocabulary terms)
DTM shape (TF-IDF): (485, 11285)


In [25]:
# Extracting IDF weights from vectorizer object (slide 18 deck 11)
idf_weights = pd.Series(tfidf_vec.idf_, index=tfidf_vec.get_feature_names_out())

# 5 most distinctive words in corpus
for word, weight in idf_weights.sort_values(ascending=False).head(5).items():
    print(word, weight)

# 5 most distinctive words in corpus
for word, weight in idf_weights.sort_values().head(5).items():
    print(word, weight)

zulu 6.087596335232384
portraying 6.087596335232384
parental 6.087596335232384
daunting 6.087596335232384
pare 6.087596335232384
freedom 1.3028862354122148
soviet 1.3169117107667192
made 1.3225774483023967
peace 1.3282754694170344
democracy 1.3602085165200435


"freedom" and "soviet" appear as the most common words in the corpus after the generic filter. That's actually substantively meaningful. It may suggest that the anticommunist framing ("soviet", "freedom") is the dominant rhetorical register across these 485 documents.

# Step 3: Topic Modeling Algorithms


## LDA

In [26]:
K_VALUES = [5, 10, 15, 20]
lda_models = {}

for n_topics in K_VALUES:
    lda = LatentDirichletAllocation(n_components=n_topics, n_jobs=-1, random_state=RANDOM_STATE)
    doc_topic_distrib = lda.fit_transform(dtm_count)
    lda_models[n_topics] = {
        'model': lda,
        'doc_topic_distrib': doc_topic_distrib
    }
    print(f"k={n_topics} done.", flush=True)



k=5 done.
k=10 done.
k=15 done.
k=20 done.


### Inspecting the topics

In [29]:
for n_topics, model_data in lda_models.items():
    lda = model_data['model']
    
    # Normalize topic-word matrix
    topic_word_matrix = lda.components_ / lda.components_.sum(axis=1)[:, np.newaxis]
    topic_words_df_lda = pd.DataFrame(
        topic_word_matrix,
        columns=count_vec.get_feature_names_out()
    )
    
    print(f"\n{'='*50}")
    print(f"LDA | k={n_topics}")
    print(f"{'='*50}")
    for topic, topic_row in topic_words_df_lda.iterrows():
        top_10_words = ", ".join(topic_row.sort_values(ascending=False).head(10).index)
        print(f"  Topic {topic}: {top_10_words}")




LDA | k=5
  Topic 0: soviet, freedom, peace, human, free, right, nation, hope, economic, union
  Topic 1: national, act, iran, security, program, trade, federal, may, country, hostage
  Topic 2: democracy, freedom, central, democratic, peace, sandinistas, communist, soviet, salvador, military
  Topic 3: thing, could, soviet, like, tax, weapon, question, yes, kind, made
  Topic 4: tax, audience, opponent, day, could, great, thank, economic, senator, long

LDA | k=10
  Topic 0: freedom, national, may, sandinistas, emergency, communist, soviet, force, fighter, report
  Topic 1: federal, program, drug, trade, act, national, economic, private, free, budget
  Topic 2: central, democratic, democracy, peace, economic, country, assistance, salvador, security, region
  Topic 3: thing, could, soviet, question, kind, like, yes, made, believe, weapon
  Topic 4: iran, hostage, arm, contra, north, board, national, mcfarlane, meeting, security
  Topic 5: tax, rate, job, economic, great, thank, day, l

## NMF

In [32]:
K_VALUES = [5, 10, 15, 20]
nmf_models = {}

for n_topics in K_VALUES:
    nmf = NMF(n_components=n_topics, random_state=RANDOM_STATE)
    doc_topic_distrib = nmf.fit_transform(dtm_count)
    nmf_models[n_topics] = {
        'model': nmf,
        'doc_topic_distrib': doc_topic_distrib
    }
    print(f"k={n_topics} done.", flush=True)

k=5 done.
k=10 done.
k=15 done.
k=20 done.




### Inspecting the topics

In [33]:
for n_topics, model_data in nmf_models.items():
    nmf = model_data['model']
    
    # Normalize topic-word matrix
    topic_word_matrix = nmf.components_ / nmf.components_.sum(axis=1)[:, np.newaxis]
    topic_words_df_nmf = pd.DataFrame(
        topic_word_matrix,
        columns=tfidf_vec.get_feature_names_out()
    )
    
    print(f"\n{'='*50}")
    print(f"LDA | k={n_topics}")
    print(f"{'='*50}")
    for topic, topic_row in topic_words_df_nmf.iterrows():
        top_10_words = ", ".join(topic_row.sort_values(ascending=False).head(10).index)
        print(f"  Topic {topic}: {top_10_words}")


LDA | k=5
  Topic 0: soviet, weapon, union, nuclear, peace, arm, war, missile, could, defense
  Topic 1: iran, north, board, hostage, arm, mcfarlane, meeting, contra, initiative, national
  Topic 2: federal, program, economic, free, tax, drug, budget, individual, act, service
  Topic 3: freedom, democracy, central, democratic, peace, sandinistas, communist, soviet, free, economic
  Topic 4: thing, could, tax, like, yes, kind, question, budget, great, made

LDA | k=10
  Topic 0: thing, could, yes, kind, question, like, budget, trying, believe, made
  Topic 1: iran, north, board, hostage, arm, mcfarlane, meeting, contra, initiative, national
  Topic 2: federal, program, budget, drug, free, individual, tax, service, act, legislation
  Topic 3: soviet, nuclear, union, peace, weapon, arm, defense, missile, war, reduction
  Topic 4: freedom, human, right, great, peace, hope, day, nation, free, life
  Topic 5: freedom, sandinistas, democracy, communist, central, fighter, soviet, democratic, 

# Table listing Document-Topic Distribution
