In [1]:
import os
import re
import warnings
from datetime import datetime

import emoji
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.io as pio
import seaborn as sns
import torch
import tqdm as notebook_tqdm
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from bertopic.representation import MaximalMarginalRelevance
from bertopic.vectorizers import ClassTfidfTransformer
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.metrics.pairwise import cosine_similarity
from umap.umap_ import UMAP

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the previously saved BERTopic model from the relative path "bertopic_model".
# Later cells read `topic_model.topics_` and pair it by position with the comments
# rebuilt from comments_final.csv.
# NOTE(review): depending on how the model was saved, BERTopic.load may unpickle the
# artifact; only load model files from a trusted source. TODO confirm the save format.
topic_model = BERTopic.load("bertopic_model")

In [3]:
# Manually curated themes: each theme name maps to the BERTopic topic ids grouped under it.
themes = {
    "General Diet and Nutrition Discussion": [3, 4, 6, 9, 16, 28],
    "Skepticism Toward Mainstream Health Institutions": [10, 15, 24, 48, 61, 81, 83, 84, 86, 94, 128],
    "Anti-Veganism / Pro-Carnivore Discussion": [1, 2, 9, 13, 26, 35, 60, 69, 85, 107, 119, 127],
    "Broader Political Discussion": [5, 36, 75, 117, 89, 135],
    "Health and Wellness Concerns": [106, 7, 11, 32, 39, 43, 54, 58, 59, 23, 67, 80, 87, 88, 100, 108, 123, 125],
    "Food Choice Discussion": [0, 18, 19, 20, 21, 22, 23, 25, 29, 30, 33, 34, 40, 41, 42, 44, 45, 46, 53, 57, 65, 66, 99, 132, 133],
    "Community Building": [8, 12, 17, 49, 50, 55, 73, 77, 79, 96, 97, 110, 114, 129]
}

# Create reverse lookup: topic -> theme.
# Topics 9 and 23 are each listed under two themes above. A plain dict assignment lets
# the later theme overwrite the earlier one without any signal, so every overlap is
# recorded in `duplicate_topics` (topic id -> theme names in listing order) and reported.
# The last-listed theme is still the one stored, so the resulting mapping is unchanged.
topic_to_theme = {}
duplicate_topics = {}
for theme, topics in themes.items():
    for topic in topics:
        previous_theme = topic_to_theme.get(topic)
        if previous_theme is not None and previous_theme != theme:
            duplicate_topics.setdefault(topic, [previous_theme]).append(theme)
        topic_to_theme[topic] = theme

if duplicate_topics:
    warnings.warn(
        "Topics assigned to more than one theme (the last listed theme is used): "
        + "; ".join(
            f"{topic_id} -> {theme_names}"
            for topic_id, theme_names in sorted(duplicate_topics.items())
        )
    )

In [14]:
# Step 1: Read the raw comments and keep only rows usable for topic modeling:
# non-null text and timestamp, non-blank text, and no "i am a bot" boilerplate.
# Step 2: Parse the epoch-second timestamps and derive the start of each comment's quarter.
data_all = (
    pd.read_csv("comments_final.csv", index_col=0)
    .dropna(subset=["clean_body", "created_utc"])
    .loc[lambda frame: frame["clean_body"].str.strip().str.len() > 0]
    .loc[lambda frame: ~frame["clean_body"].str.contains("i am a bot", case=False, na=False)]
    .assign(created_utc=lambda frame: pd.to_datetime(frame["created_utc"], unit="s"))
    .assign(quarter=lambda frame: frame["created_utc"].dt.to_period("Q").dt.to_timestamp())
)

# Step 4: Restrict to quarters from 2013-01-01 through 2024-12-31 (both ends inclusive).
# Only the number of rows that pass this filter is used below.
data_filtered = data_all[data_all["quarter"].between("2013-01-01", "2024-12-31")]
timestamps = data_filtered["quarter"].tolist()

# Steps 3 and 5: Take the first len(timestamps) rows of the unfiltered frame and
# their cleaned texts.
# NOTE(review): these are the leading rows of `data_all`, not the rows that passed the
# quarter filter, so rows outside the date window can remain and `timestamps` is not
# row-aligned with `texts`. Presumably this mirrors how the model was fitted; confirm
# before changing it, because `topic_model.topics_` is paired with these rows by position.
data_used = data_all.iloc[: len(timestamps)]
texts = data_used["clean_body"].tolist()

In [16]:
# Sanity check: number of documents kept after truncation (shown as the cell output).
len(texts)

360160

In [17]:
# Guard before assembling the table: topic assignments are paired with comments purely
# by position, so all three sequences must have the same length. This verifies length
# only; it cannot detect rows that are in a different order than at fit time.
assert len(texts) == len(topic_model.topics_) == len(data_used), (
    f"Length mismatch: texts={len(texts)}, "
    f"topic_model.topics_={len(topic_model.topics_)}, "
    f"data_used={len(data_used)}"
)

In [18]:
# Step 6: Assemble one row per modelled comment, pairing the comment fields with the
# model's topic assignment by position (hence the raw arrays rather than indexed Series).
# Step 7: Add themes. Topics missing from `topic_to_theme` get NaN in the Theme column.
df = pd.DataFrame(
    {
        "comment_id": data_used["comment_id"].to_numpy(),
        "Document": data_used["clean_body"].to_numpy(),
        "Topic": topic_model.topics_,
        "Timestamp": data_used["created_utc"].to_numpy(),
        "quarter": data_used["quarter"].to_numpy(),
        "score": data_used["score"].to_numpy(),
    }
).assign(Theme=lambda frame: frame["Topic"].map(topic_to_theme))

In [20]:
# Step 8: Save the per-comment topic/theme table as CSV in the working directory.
# index=False leaves the DataFrame's row index out of the file.
df.to_csv("document_topic_theme_assignments_with_ids.csv", index=False)

In [21]:
# Display the assembled table for a visual check (the notebook renders a truncated preview).
df

Unnamed: 0,comment_id,Document,Topic,Timestamp,quarter,score,Theme
0,md8mlm6,"Chill bro, my wife is on here",31,2025-02-17 12:44:54,2025-01-01,78.0,
1,md7zvec,Solid facial gains! How was your diet?,-1,2025-02-17 09:10:25,2025-01-01,73.0,
2,md845a7,"Impressive! I can only hope, that I can presen...",16,2025-02-17 09:54:30,2025-01-01,27.0,General Diet and Nutrition Discussion
3,md93t4g,Stop lying. That's not the same person. Looks ...,8,2025-02-17 14:35:12,2025-01-01,14.0,Community Building
4,md847sx,Bro is a model now. Congrats! Looking good!,-1,2025-02-17 09:55:12,2025-01-01,11.0,
...,...,...,...,...,...,...,...
360155,l3grtl0,"Any truth to this in peat world? ""Research sug...",14,2024-05-10 18:14:09,2024-04-01,1.0,
360156,l3hrxcl,What about cheeses?,42,2024-05-10 21:56:03,2024-04-01,1.0,Food Choice Discussion
360157,m4bmv8u,"Sounds like candida to me, carbs feed it",-1,2024-12-29 07:20:59,2024-10-01,1.0,
360158,l3cytbl,you may want to look into /u/exfatloss 's diet...,14,2024-05-09 23:56:05,2024-04-01,0.0,
