In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
import numpy as np
import tqdm as notebook_tqdm
import emoji
import seaborn as sns
import os
import torch
from umap.umap_ import UMAP
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.vectorizers import ClassTfidfTransformer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from bertopic.representation import KeyBERTInspired
from bertopic.representation import MaximalMarginalRelevance
from sklearn.metrics.pairwise import cosine_similarity
import re
import plotly.io as pio
from datetime import datetime

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
data = pd.read_csv("comments_final.csv", index_col=0)

In [3]:
data = data.dropna(subset=["clean_body", "created_utc"])
data = data[data["clean_body"].str.strip().str.len() > 0]

data = data[~data["clean_body"].str.contains("i am a bot", case=False, na=False)]

In [4]:
# Prepare the texts
texts = data["clean_body"].tolist()

# Convert created_utc from seconds to datetime
data["created_utc"] = pd.to_datetime(data["created_utc"], unit="s")

# Clip time range to 2013–2024 and assign quarter bins
data["quarter"] = data["created_utc"].dt.to_period("Q").dt.to_timestamp()
data = data[(data["quarter"] >= "2013-01-01") & (data["quarter"] <= "2024-12-31")]

# Align timestamps with texts
timestamps = data["quarter"].tolist()

# Make sure lengths match
texts = texts[:len(timestamps)]

In [18]:
len(texts), len(timestamps)  # Check lengths

(360160, 360160)

In [5]:
embeddings = np.load('embeddings_final_original.npy')

In [6]:
seed_topic_list = [["vegans", "vegan", "veganism", "plant-based", "plantbased", "plant based", "plant-based diet", "plantbased diet", "plant based diet"],
                   ["meat", "meat eaters", "meat eater", "carnivore", "carnivore diet" "meat-based", "meatbased", "meat based", "meat-based diet", "meatbased diet", "meat based diet"],
                   ["seed oils", "seed oil"],
                   ["rfk", "robert kennedy", "robert f kennedy", "robert f. kennedy", "rfk jr", "rfk, jr", "rfk, jr.", "kennedy", "trump", "donald trump", "maga"],
                   ["biden", "joe biden", "biden administration", "biden presidency", "biden's presidency", "biden's administration", "kamala harris", "kamala", "kamala harris"],
                   ["covid vaccine", "covid vaccines", "covid-19 vaccine", "covid-19 vaccines", "covid-19", "covid", "vaccines", "vaccine", "vaccination", "vaccinations"],
                   ["soy", "soybean", "soybeans", "soy milk", "soy sauce", "soy protein", "soy protein isolate", "soy protein powder", "soyboy", "soy boy"],
                   ["estrogen", "estrogens", "estrogenic", "estrogenicity", "estrogenic activity", "estrogen receptor", "estrogen receptors", "estrogen dominance", "xenoestrogens"],
                   ["ray peat", "peaty", "ray", "peat", "peating", "peat diet", "ray peat diet", "ray peat's diet", "ray peat's", "peats"],
                   ["testosterone", "testosterone levels", "testosterone replacement therapy", "testosterone therapy", "testosterone boosters", "testosterone booster", "testosterone supplements", "testosterone supplement", "low testosterone", "low t", "low testosterone levels"],
]

In [7]:
model = SentenceTransformer("all-mpnet-base-v2")

In [11]:
# Fine-tuning UMAP parameters
umap_model = UMAP(n_neighbors=5, n_components=15, min_dist=0.0, metric='cosine', random_state=42)

# Fine-tuning HDBSCAN parameters
hdbscan_model = HDBSCAN(min_cluster_size=100, min_samples=15, metric='euclidean', cluster_selection_method="eom", prediction_data=False)

# Use ClassTfidfTransformer for better term-document representation
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=False, bm25_weighting=True)

# Vectorizer model (CountVectorizer or TF-IDF)
vectorizer_model = CountVectorizer(stop_words=list(ENGLISH_STOP_WORDS), min_df=5, binary=False, max_df=0.6, ngram_range=(1, 2))


In [12]:
topic_model = BERTopic(
    embedding_model=model,
    calculate_probabilities=False,
    min_topic_size=100,
    verbose=True,
    nr_topics=150,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    ctfidf_model=ctfidf_model,
    representation_model=MaximalMarginalRelevance(diversity=0.1),
    seed_topic_list=seed_topic_list,
    )

# Fit the model with the embeddings (without recalculating embeddings)
topics = topic_model.fit_transform(texts, embeddings)

2025-05-13 18:09:46,389 - BERTopic - Guided - Find embeddings highly related to seeded topics.
Batches: 100%|██████████| 1/1 [00:00<00:00,  1.92it/s]
2025-05-13 18:09:48,498 - BERTopic - Guided - Completed ✓
2025-05-13 18:09:48,498 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-05-13 18:21:21,323 - BERTopic - Dimensionality - Completed ✓
2025-05-13 18:21:21,338 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-05-13 18:22:05,854 - BERTopic - Cluster - Completed ✓
2025-05-13 18:22:05,854 - BERTopic - Representation - Extracting topics using c-TF-IDF for topic reduction.
2025-05-13 18:22:16,354 - BERTopic - Representation - Completed ✓
2025-05-13 18:22:16,367 - BERTopic - Topic reduction - Reducing number of topics
2025-05-13 18:22:16,822 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-05-13 18:22:29,939 - BERTopic - Representation - Completed ✓
2025-05-13 18:22:29,974 - BERTopic - Topic reduction - R

In [15]:
pd.reset_option('display.max_colwidth')
topic_info = topic_model.get_topic_info()
topic_info.head(17)

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,173420,-1_vegans_carbs_protein_beef,"[vegans, carbs, protein, beef, carnivore, vega...",[These are my current raw meals to give you an...
1,0,14608,0_seed oils_oils_seed oil_olive oil,"[seed oils, oils, seed oil, olive oil, canola ...",[What were you looking at that had seed oils i...
2,1,13370,1_vegans_veganism_vegan_vegan diet,"[vegans, veganism, vegan, vegan diet, animals,...",[I am a recent vegan as well. After a year and...
3,2,12355,2_bacon_raw meat_carnivore diet_carnivores,"[bacon, raw meat, carnivore diet, carnivores, ...","[Cremated bacon, Well done my carnivore friend..."
4,3,6015,3_keto_carbs_ketosis_carb,"[keto, carbs, ketosis, carb, ketogenic, low ca...","[You mentioned keto. Have you seen this group,..."
5,4,5915,4_b12_vitamin_thiamine_supplements,"[b12, vitamin, thiamine, supplements, sodium, ...",[I'm glad you are willing to try thiamine agai...
6,5,5743,5_trump_vote_political_government,"[trump, vote, political, government, election,...","[After Biden beat Trump in 2020, Trump produce..."
7,6,4128,6_fasting_protein_fasts_meal,"[fasting, protein, fasts, meal, low protein, m...","[Up the protein my guy, Any fasting?, Damn aut..."
8,7,3961,7_estrogen_thyroid_progesterone_testosterone,"[estrogen, thyroid, progesterone, testosterone...",[Can you explain more about the options to imp...
9,8,3823,8_congratulations_progress_amazing progress_fe...,"[congratulations, progress, amazing progress, ...","[Congratulations!🤸🏻‍♀️, Congratulations keep i..."


# No seed words

In [16]:
topic_model = BERTopic(
    embedding_model=model,
    calculate_probabilities=False,
    min_topic_size=100,
    verbose=True,
    nr_topics=150,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    ctfidf_model=ctfidf_model,
    representation_model=MaximalMarginalRelevance(diversity=0.1),
    )

# Fit the model with the embeddings (without recalculating embeddings)
topics = topic_model.fit_transform(texts, embeddings)

2025-05-13 18:27:25,803 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-05-13 18:37:06,234 - BERTopic - Dimensionality - Completed ✓
2025-05-13 18:37:06,248 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-05-13 18:37:58,847 - BERTopic - Cluster - Completed ✓
2025-05-13 18:37:58,847 - BERTopic - Representation - Extracting topics using c-TF-IDF for topic reduction.
2025-05-13 18:38:10,171 - BERTopic - Representation - Completed ✓
2025-05-13 18:38:10,190 - BERTopic - Topic reduction - Reducing number of topics
2025-05-13 18:38:10,653 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-05-13 18:38:25,118 - BERTopic - Representation - Completed ✓
2025-05-13 18:38:25,154 - BERTopic - Topic reduction - Reduced number of topics from 450 to 150


In [17]:
pd.reset_option('display.max_colwidth')
topic_info = topic_model.get_topic_info()
topic_info.head(17)

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,169072,-1_vegans_animals_carbs_carnivore,"[vegans, animals, carbs, carnivore, veganism, ...",[Sure! What I did was work out the fat content...
1,0,16137,0_oils_seed oils_seed oil_olive oil,"[oils, seed oils, seed oil, olive oil, canola ...",[They do but they have oil in almost everythin...
2,1,9484,1_keto_fasting_protein_carbs,"[keto, fasting, protein, carbs, carb, ketosis,...","[And r/fasting, Found it! It was on r/Keto, Fa..."
3,2,8812,2_milk_raw milk_dairy_lactose,"[milk, raw milk, dairy, lactose, yogurt, ice c...","[Milk., Except the part that it’s not actually..."
4,3,8372,3_vegans_veganism_vegan diet_vegetarian,"[vegans, veganism, vegan diet, vegetarian, veg...","[Part 2. >""Even partially omnivorous diets ran..."
5,4,6436,4_raw meat_pork_red meat_beef,"[raw meat, pork, red meat, beef, ground beef, ...","[Bacon is life., I could go for some bacon rig..."
6,5,5880,5_carnivores_carnivore_omnivores_herbivores,"[carnivores, carnivore, omnivores, herbivores,...","[Dogs are not carnivores, dawg. Cats are thoug..."
7,6,4996,6_congratulations_congrats_progress_inspiring,"[congratulations, congrats, progress, inspirin...","[Incredible!!! Congratulations!!!, Congratulat..."
8,7,4973,7_cholesterol_ldl_insulin_glucose,"[cholesterol, ldl, insulin, glucose, hdl, stat...","[I have measurements for Cholesterol (total), ..."
9,8,4893,8_government_capitalism_political_election,"[government, capitalism, political, election, ...",[He requested his name be taken off the ballot...


# More topics

In [19]:
topic_model = BERTopic(
    embedding_model=model,
    calculate_probabilities=False,
    min_topic_size=100,
    verbose=True,
    nr_topics=300,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    ctfidf_model=ctfidf_model,
    representation_model=MaximalMarginalRelevance(diversity=0.1),
    seed_topic_list=seed_topic_list,
    )

# Fit the model with the embeddings (without recalculating embeddings)
topics = topic_model.fit_transform(texts, embeddings)

2025-05-13 19:07:01,210 - BERTopic - Guided - Find embeddings highly related to seeded topics.
Batches: 100%|██████████| 1/1 [00:00<00:00, 10.24it/s]
2025-05-13 19:07:03,247 - BERTopic - Guided - Completed ✓
2025-05-13 19:07:03,250 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-05-13 19:14:49,283 - BERTopic - Dimensionality - Completed ✓
2025-05-13 19:14:49,295 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-05-13 19:15:28,673 - BERTopic - Cluster - Completed ✓
2025-05-13 19:15:28,674 - BERTopic - Representation - Extracting topics using c-TF-IDF for topic reduction.
2025-05-13 19:15:43,214 - BERTopic - Representation - Completed ✓
2025-05-13 19:15:43,246 - BERTopic - Topic reduction - Reducing number of topics
2025-05-13 19:15:43,758 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-05-13 19:16:01,971 - BERTopic - Representation - Completed ✓
2025-05-13 19:16:02,025 - BERTopic - Topic reduction - R

In [20]:
pd.reset_option('display.max_colwidth')
topic_info = topic_model.get_topic_info()
topic_info.head(17)

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,149428,-1_veganism_steak_meals_plant based,"[veganism, steak, meals, plant based, vegans, ...",[Fuck veganism. Stop assuming things about peo...
1,0,16790,0_seed oils_seed oil_oils_olive oil,"[seed oils, seed oil, oils, olive oil, avocado...",[You poor Americans. Olive Oil is olive oil in...
2,1,8459,1_covid_vaccine_vaccines_flu,"[covid, vaccine, vaccines, flu, viruses, mrna,...",[Nothing to be baffled about they all know it'...
3,2,8070,2_keto_ketosis_carnivore diet_strict carnivore,"[keto, ketosis, carnivore diet, strict carnivo...","[Keto diet, is this keto or is it carnivore?, ..."
4,3,7525,3_vote_political_election_politics,"[vote, political, election, politics, democrat...",[Check out his full speech. He took himself of...
5,4,7409,4_soy_milk_raw milk_ingredients,"[soy, milk, raw milk, ingredients, pasteurized...","[Soy lecithin is one to avoid, Yes, fermented ..."
6,5,7236,5_carnivores_herbivores_omnivores_plants,"[carnivores, herbivores, omnivores, plants, ca...","[That is pretty unusual take since term ""omniv..."
7,6,5090,6_thyroid_testosterone_hypothyroidism_hormones,"[thyroid, testosterone, hypothyroidism, hormon...",[Right now I’m taking 137mcg t4 and 20mcg t3. ...
8,7,4543,7_plant based_veganism_organic_based diet,"[plant based, veganism, organic, based diet, c...",[Did I say that? I said the masses have been c...
9,8,3838,8_nutrition_doctors_dietitians_dietitian,"[nutrition, doctors, dietitians, dietitian, nu...",[Have you guys heard of Merchants of Doubt? Th...


# Fewer topics

In [21]:
# Fine-tuning UMAP parameters
umap_model = UMAP(n_neighbors=5, n_components=15, min_dist=0.0, metric='cosine', random_state=42)

# Fine-tuning HDBSCAN parameters
hdbscan_model = HDBSCAN(min_cluster_size=125, min_samples=15, metric='euclidean', cluster_selection_method="eom", prediction_data=False)

# Use ClassTfidfTransformer for better term-document representation
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=False, bm25_weighting=True)

# Vectorizer model (CountVectorizer or TF-IDF)
vectorizer_model = CountVectorizer(stop_words=list(ENGLISH_STOP_WORDS), min_df=5, binary=False, max_df=0.6, ngram_range=(1, 2))

In [22]:
topic_model = BERTopic(
    embedding_model=model,
    calculate_probabilities=False,
    min_topic_size=125,
    verbose=True,
    nr_topics=75,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    ctfidf_model=ctfidf_model,
    representation_model=MaximalMarginalRelevance(diversity=0.1),
    seed_topic_list=seed_topic_list,
    )

# Fit the model with the embeddings (without recalculating embeddings)
topics = topic_model.fit_transform(texts, embeddings)

2025-05-13 19:33:37,743 - BERTopic - Guided - Find embeddings highly related to seeded topics.
Batches: 100%|██████████| 1/1 [00:00<00:00,  8.98it/s]
2025-05-13 19:33:39,464 - BERTopic - Guided - Completed ✓
2025-05-13 19:33:39,464 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-05-13 19:44:05,096 - BERTopic - Dimensionality - Completed ✓
2025-05-13 19:44:05,108 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-05-13 19:44:41,003 - BERTopic - Cluster - Completed ✓
2025-05-13 19:44:41,004 - BERTopic - Representation - Extracting topics using c-TF-IDF for topic reduction.
2025-05-13 19:44:54,169 - BERTopic - Representation - Completed ✓
2025-05-13 19:44:54,210 - BERTopic - Topic reduction - Reducing number of topics
2025-05-13 19:44:54,711 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-05-13 19:45:08,788 - BERTopic - Representation - Completed ✓
2025-05-13 19:45:08,820 - BERTopic - Topic reduction - R

In [23]:
pd.reset_option('display.max_colwidth')
topic_info = topic_model.get_topic_info()
topic_info.head(17)

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,147333,-1_vegans_carbs_veganism_keto,"[vegans, carbs, veganism, keto, diets, eat mea...","[""Factory farming is simply an application of ..."
1,0,27783,0_pufa_keto_carbs_fasting,"[pufa, keto, carbs, fasting, insulin, carnivor...",[Experimenting with TCD or the [stearic diet](...
2,1,16680,1_seed oil_olive oil_avocado oil_coconut oil,"[seed oil, olive oil, avocado oil, coconut oil...",[Also some places cut the olive oil with seed ...
3,2,13504,2_vegans_veganism_vegan diet_vegetarian,"[vegans, veganism, vegan diet, vegetarian, ani...",[I'd love to see the proof of this. Seriously....
4,3,11550,3_carnivores_herbivores_omnivores_vegans,"[carnivores, herbivores, omnivores, vegans, ve...","[most carnivores *DO*, They can have *all* the..."
5,4,8715,4_soy_raw milk_ingredients_pasteurized,"[soy, raw milk, ingredients, pasteurized, yogu...","[Soy lecithin 3 times in addition to the oil.,..."
6,5,8492,5_covid_vaccines_vaccine_flu,"[covid, vaccines, vaccine, flu, mrna, covid 19...",[Because you are putting those patients at ris...
7,6,7840,6_vote_political_election_politics,"[vote, political, election, politics, democrat...",[Like now they’re begging for the wars to neve...
8,7,7748,7_congratulations_congrats_progress_proud,"[congratulations, congrats, progress, proud, i...","[Congratulations! That's what we're here for!,..."
9,8,7344,8_bacon_raw meat_red meat_ground beef,"[bacon, raw meat, red meat, ground beef, eat r...","[Kid tried bacon and can't go back., Bacon., b..."


# Dense Clusters

In [24]:
# Fine-tuning UMAP parameters
umap_model = UMAP(n_neighbors=3, n_components=10, min_dist=0.0, metric='cosine', random_state=42)

# Fine-tuning HDBSCAN parameters
hdbscan_model = HDBSCAN(min_cluster_size=100, min_samples=15, metric='euclidean', cluster_selection_method="eom", prediction_data=False)

# Use ClassTfidfTransformer for better term-document representation
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=False, bm25_weighting=True)

# Vectorizer model (CountVectorizer or TF-IDF)
vectorizer_model = CountVectorizer(stop_words=list(ENGLISH_STOP_WORDS), min_df=5, binary=False, max_df=0.6, ngram_range=(1, 2))

In [25]:
topic_model = BERTopic(
    embedding_model=model,
    calculate_probabilities=False,
    min_topic_size=100,
    verbose=True,
    nr_topics=150,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    ctfidf_model=ctfidf_model,
    representation_model=MaximalMarginalRelevance(diversity=0.1),
    seed_topic_list=seed_topic_list,
    )

# Fit the model with the embeddings (without recalculating embeddings)
topics = topic_model.fit_transform(texts, embeddings)

2025-05-13 20:02:01,841 - BERTopic - Guided - Find embeddings highly related to seeded topics.
Batches: 100%|██████████| 1/1 [00:00<00:00,  4.45it/s]
2025-05-13 20:02:03,709 - BERTopic - Guided - Completed ✓
2025-05-13 20:02:03,710 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-05-13 20:07:27,470 - BERTopic - Dimensionality - Completed ✓
2025-05-13 20:07:27,483 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-05-13 20:08:03,956 - BERTopic - Cluster - Completed ✓
2025-05-13 20:08:03,956 - BERTopic - Representation - Extracting topics using c-TF-IDF for topic reduction.
2025-05-13 20:08:17,832 - BERTopic - Representation - Completed ✓
2025-05-13 20:08:17,881 - BERTopic - Topic reduction - Reducing number of topics
2025-05-13 20:08:18,394 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-05-13 20:08:34,434 - BERTopic - Representation - Completed ✓
2025-05-13 20:08:34,483 - BERTopic - Topic reduction - R

In [26]:
pd.reset_option('display.max_colwidth')
topic_info = topic_model.get_topic_info()
topic_info.head(17)

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,161594,-1_vegans_carbs_keto_protein,"[vegans, carbs, keto, protein, veganism, steak...","[Hello all! I'm starting ZC for a ""meatful mar..."
1,0,16379,0_seed oils_oils_seed oil_olive oil,"[seed oils, oils, seed oil, olive oil, avocado...","[Seed oils are an issue, but they're not the l..."
2,1,11489,1_plants_carnivores_animals_herbivores,"[plants, carnivores, animals, herbivores, omni...","[I mean we're omnivores for sure, not carnivor..."
3,2,10205,2_vegans_veganism_animals_vegetarian,"[vegans, veganism, animals, vegetarian, vegan ...",[So I’m in a cult and our main belief is that ...
4,3,8584,3_covid_vaccines_vaccine_vax,"[covid, vaccines, vaccine, vax, flu, viruses, ...","[How are vaccines not toxic, Whether it's abou..."
5,4,8328,4_soy_raw milk_ingredients_protein,"[soy, raw milk, ingredients, protein, pasteuri...",[I know that Nattokinase is a time tested supp...
6,5,7914,5_trump_vote_political_election,"[trump, vote, political, election, politics, d...","[I'll be voting. Trump 2024, You're suggesting..."
7,6,6342,6_raw milk_dairy_lactose_yogurt,"[raw milk, dairy, lactose, yogurt, raw dairy, ...","[I was talking about cheese, How hot do you le..."
8,7,5108,7_thyroid_testosterone_hormones_hypothyroidism,"[thyroid, testosterone, hormones, hypothyroidi...",[I take prescription desiccated thyroid: NP Th...
9,8,4897,8_keto_carb_carbs_zero carb,"[keto, carb, carbs, zero carb, ketosis, weight...","[sposed to have carbs with it, Literally all c..."


# Sparse Clusters

In [8]:
# Fine-tuning UMAP parameters
umap_model = UMAP(n_neighbors=15, n_components=30, min_dist=0.0, metric='cosine', random_state=42)

# Fine-tuning HDBSCAN parameters
hdbscan_model = HDBSCAN(min_cluster_size=100, min_samples=15, metric='euclidean', cluster_selection_method="eom", prediction_data=False)

# Use ClassTfidfTransformer for better term-document representation
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=False, bm25_weighting=True)

# Vectorizer model (CountVectorizer or TF-IDF)
vectorizer_model = CountVectorizer(stop_words=list(ENGLISH_STOP_WORDS), min_df=5, binary=False, max_df=0.6, ngram_range=(1, 2))

In [9]:
topic_model = BERTopic(
    embedding_model=model,
    calculate_probabilities=False,
    min_topic_size=100,
    verbose=True,
    nr_topics=150,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    ctfidf_model=ctfidf_model,
    representation_model=MaximalMarginalRelevance(diversity=0.1),
    seed_topic_list=seed_topic_list,
    )

# Fit the model with the embeddings (without recalculating embeddings)
topics = topic_model.fit_transform(texts, embeddings)

2025-05-13 22:20:39,576 - BERTopic - Guided - Find embeddings highly related to seeded topics.
Batches: 100%|██████████| 1/1 [00:00<00:00,  7.49it/s]
2025-05-13 22:20:41,155 - BERTopic - Guided - Completed ✓
2025-05-13 22:20:41,155 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-05-13 22:35:44,803 - BERTopic - Dimensionality - Completed ✓
2025-05-13 22:35:44,828 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-05-13 22:36:29,908 - BERTopic - Cluster - Completed ✓
2025-05-13 22:36:29,908 - BERTopic - Representation - Extracting topics using c-TF-IDF for topic reduction.
2025-05-13 22:36:40,210 - BERTopic - Representation - Completed ✓
2025-05-13 22:36:40,219 - BERTopic - Topic reduction - Reducing number of topics
2025-05-13 22:36:40,659 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-05-13 22:36:54,766 - BERTopic - Representation - Completed ✓
2025-05-13 22:36:54,799 - BERTopic - Topic reduction - R

In [10]:
pd.reset_option('display.max_colwidth')
topic_info = topic_model.get_topic_info()
topic_info.head(17)

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,188756,-1_vegans_carnivore_carbs_beef,"[vegans, carnivore, carbs, beef, foods, vegani...",[Same. I only feel like eating chicken and fis...
1,0,10205,0_political_election_politics_biden,"[political, election, politics, biden, kennedy...","[Rfk!, Even if you think that's true, that doe..."
2,1,9581,1_vegans_veganism_vegan_vegan diet,"[vegans, veganism, vegan, vegan diet, vegetari...","[Let's unpack this. Vegan claims: compassion, ..."
3,2,7051,2_seed oils_oils_seed oil_olive oil,"[seed oils, oils, seed oil, olive oil, avocado...",[It still has seed oils in it but I would assu...
4,3,5353,3_carnivores_carnivore_omnivores_herbivores,"[carnivores, carnivore, omnivores, herbivores,...","[Yeah, cats definitely can't be vegan although..."
5,4,5075,4_raw milk_dairy_lactose_pasteurized,"[raw milk, dairy, lactose, pasteurized, yogurt...","[Homogenised milk, I do this too, without the ..."
6,5,4735,5_congratulations_progress_proud_transformation,"[congratulations, progress, proud, transformat...","[Congratulations. Good for you. Keep it up!, C..."
7,6,4614,6_insulin_glucose_diabetes_low pufa,"[insulin, glucose, diabetes, low pufa, fasting...","[So, for me, my PSMF (which thankfully ends so..."
8,7,4093,7_pork_steak_ground beef_beef,"[pork, steak, ground beef, beef, sausage, stea...","[Where is the bacon?, But, why would bacon hav..."
9,8,3727,8_keto_ketosis_low carb_ketogenic,"[keto, ketosis, low carb, ketogenic, keto diet...","[What do you call ""some carbs""?, All thats mis..."
