In [1]:
import json
import pandas as pd

# Read the preprocessed transcripts from the interim data folder.
preprocessed_path = "../data/interim/comedy_preprocessed.json"
with open(preprocessed_path, mode="r", encoding="utf-8") as f:
    data = json.loads(f.read())

# Build the working DataFrame from the parsed JSON and preview it.
df = pd.DataFrame(data)
df.head()

Unnamed: 0,url,comedian,special,transcript,text_clean,text_lower
0,https://scrapsfromtheloft.com/comedy/conan-obr...,Conan O'Brien,,Conan O’Brien: The Kennedy Center Mark Twain P...,Conan O’Brien: The Kennedy Center Mark Twain P...,conan o’brien: the kennedy center mark twain p...
1,https://scrapsfromtheloft.com/comedy/cristela-...,Cristela Alonzo,,Cristela Alonzo: Middle Classy (2022) | Full t...,Cristela Alonzo: Middle Classy (2022) | Full t...,cristela alonzo: middle classy (2022) | full t...
2,https://scrapsfromtheloft.com/comedy/dave-chap...,Dave Chappelle,,Dave Chappelle: The Unstoppable (2025)Release ...,Dave Chappelle: The Unstoppable (2025)Release ...,dave chappelle: the unstoppable (2025)release ...
3,https://scrapsfromtheloft.com/comedy/dave-smit...,Dave Smith,,Part of the ProblemEpisode number:1306Premiere...,Part of the ProblemEpisode number:1306Premiere...,part of the problemepisode number:1306premiere...
4,https://scrapsfromtheloft.com/comedy/david-spa...,David Spade,,David Spade: Dandelion (2025) [cheers and appl...,David Spade: Dandelion (2025) <APPLAUSE> Nice....,david spade: dandelion (2025) <applause> nice....


In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF settings: remove English stop words, ignore terms found in more than
# 95% of the documents or in fewer than 2 documents, and use unigrams + bigrams.
tfidf_options = {
    "stop_words": "english",
    "max_df": 0.95,
    "min_df": 2,
    "ngram_range": (1, 2),
}
vectorizer = TfidfVectorizer(**tfidf_options)

# Learn the vocabulary and build the document-term matrix in a single pass.
X = vectorizer.fit_transform(df["text_lower"])

In [3]:
from sklearn.decomposition import NMF

# Number of topics (NMF components) to extract.
n_topics = 7

# Fixed random_state keeps the factorization reproducible between runs.
nmf = NMF(n_components=n_topics, random_state=42)

# Factorize the TF-IDF matrix.
W = nmf.fit_transform(X)  # document-topic weights
H = nmf.components_       # topic-word weights



- Inspecting topics

In [4]:
def display_topics(model, feature_names, n_top_words=12):
    """Print and return the highest-weighted terms of every topic.

    Parameters
    ----------
    model : object exposing ``components_``
        Fitted decomposition model. ``components_`` is iterated row by row,
        each row holding the term weights of one topic.
    feature_names : sequence of str
        Vocabulary, indexable by the column positions of ``components_``.
    n_top_words : int, default 12
        Number of terms to keep per topic. Values larger than the vocabulary
        simply return every term; values below zero are treated as zero.

    Returns
    -------
    list of list
        One list per topic with its terms ordered from highest to lowest
        weight.
    """
    n_keep = max(n_top_words, 0)
    topics = []
    for topic_idx, topic in enumerate(model.components_):
        # argsort() is ascending, so reverse it before taking the first
        # n_keep positions. The earlier slice `[:-n_top_words - 1:]` had no
        # negative step and therefore selected the lowest-weighted terms.
        top_indices = topic.argsort()[::-1][:n_keep]
        top_words = [feature_names[i] for i in top_indices]
        topics.append(top_words)
        print(f"Topic {topic_idx}: {', '.join(top_words)}")

    return topics

# Vocabulary learned by the vectorizer, aligned with the columns of nmf.components_.
feature_names = vectorizer.get_feature_names_out()

# Print each topic's terms and keep them for later inspection.
topics = display_topics(model=nmf, feature_names=feature_names)



In [5]:
# Document-topic weights from the NMF factorization, one column per topic.
df_topics = pd.DataFrame(
    W,
    columns=[f"Topic_{i}" for i in range(n_topics)]
)

# Drop any Topic_* columns left over from a previous execution of this cell so
# that re-running it replaces the weights instead of appending duplicate
# columns. The index is reset so rows line up positionally with df_topics.
df = pd.concat(
    [
        df.drop(columns=df_topics.columns, errors="ignore").reset_index(drop=True),
        df_topics,
    ],
    axis=1,
)
df.head()

Unnamed: 0,url,comedian,special,transcript,text_clean,text_lower,Topic_0,Topic_1,Topic_2,Topic_3,Topic_4,Topic_5,Topic_6
0,https://scrapsfromtheloft.com/comedy/conan-obr...,Conan O'Brien,,Conan O’Brien: The Kennedy Center Mark Twain P...,Conan O’Brien: The Kennedy Center Mark Twain P...,conan o’brien: the kennedy center mark twain p...,0.08857,0.034919,0.254455,0.0,0.0,0.0,0.0
1,https://scrapsfromtheloft.com/comedy/cristela-...,Cristela Alonzo,,Cristela Alonzo: Middle Classy (2022) | Full t...,Cristela Alonzo: Middle Classy (2022) | Full t...,cristela alonzo: middle classy (2022) | full t...,0.428716,0.0,0.0,0.040196,0.0,0.0,0.001239
2,https://scrapsfromtheloft.com/comedy/dave-chap...,Dave Chappelle,,Dave Chappelle: The Unstoppable (2025)Release ...,Dave Chappelle: The Unstoppable (2025)Release ...,dave chappelle: the unstoppable (2025)release ...,0.329307,0.021553,0.022901,0.0,0.193578,0.0,0.007054
3,https://scrapsfromtheloft.com/comedy/dave-smit...,Dave Smith,,Part of the ProblemEpisode number:1306Premiere...,Part of the ProblemEpisode number:1306Premiere...,part of the problemepisode number:1306premiere...,0.002846,0.232883,0.0,0.0,0.325476,0.003484,0.031567
4,https://scrapsfromtheloft.com/comedy/david-spa...,David Spade,,David Spade: Dandelion (2025) [cheers and appl...,David Spade: Dandelion (2025) <APPLAUSE> Nice....,david spade: dandelion (2025) <applause> nice....,0.3729,0.0,0.0,0.0,0.0,0.157711,0.022028


- The model groups co-occurring words
- Interpretation is subjective
- Results are indicative, not definitive