In [1]:
import pandas as pd
import torch
import numpy as np
import tqdm
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import normalize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import text
from hdbscan import HDBSCAN
from umap import UMAP

2025-08-29 11:48:50.225568: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-08-29 11:48:50.540247: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1756460930.642015    1180 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1756460930.676952    1180 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1756460930.906676    1180 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

In [2]:
text_df = pd.read_csv('../../../src/nlp/cleaned_dataset.csv')

In [3]:
default_stopwords = text.ENGLISH_STOP_WORDS

# Defining customized stopwords-like list of words in order to obtain more detailed topic (without noise)
custom_stopwords = [
    # Reddit/platform specific
    "reddit", "subreddit", "sub", "thread", "post", "posts", "comment", "comments",
    "commenters", "upvotes", "upvote", "downvotes", "op", "changemyview",
    "deltaboards", "deltalog", "unpopularopinion",

    # Conversation/structure
    "replying", "paragraphs", "discussion", "debates",

    # Noise
    "omggg", "ella", "becky", "scarlet",

    # Too generic / cross-topic
    "people", "thing", "things", "stuff", "way", "lot", "good", "bad"
]

all_stopwords = list(default_stopwords.union(custom_stopwords))

In [4]:
# Setting cuda environment
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device : {device}")

# Instantiating model embedding
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
embedding_model.to(device)

# Instantiating CountVectorizer (bag-of-words) to upgrade pre-procerssing
vectorizer_model = CountVectorizer(
    stop_words=all_stopwords,
    min_df=0.001,
    max_df=0.7,
    max_features=30000
)

# Customizing HDBSCAN clustering model
hdbscan_model = HDBSCAN(
    min_cluster_size=300,
    min_samples=10,
    cluster_selection_epsilon=0.01,
    prediction_data=True,
    memory='./hdbscan_cache/',
    core_dist_n_jobs=1 
)

umap_model = UMAP(
    n_neighbors=30,
    n_components=5,
    min_dist=0.05,
    metric='cosine',
    low_memory=True
)

# Instantiating a representation model to customize topic labels
representation_model = MaximalMarginalRelevance(diversity=0.3)

# Removing empty texts
text_df['clean_text'] = text_df['clean_text'].fillna('').astype(str)
text_df = text_df[text_df['clean_text'].str.strip() != ''].reset_index(drop=True)

topic_model = BERTopic(
    embedding_model=embedding_model,
    vectorizer_model=vectorizer_model,
    hdbscan_model=hdbscan_model,
    representation_model=representation_model,
    verbose=True,
    calculate_probabilities=True,
    nr_topics= 25
)

topics, probs = topic_model.fit_transform(text_df['clean_text'].astype(str))

Using device : cuda


2025-08-29 11:49:07,094 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/7208 [00:00<?, ?it/s]

2025-08-29 11:51:36,069 - BERTopic - Embedding - Completed ✓
2025-08-29 11:51:36,070 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm

The TBB threading layer requires TBB version 2021 update 6 or later i.e., TBB_INTERFACE_VERSION >= 12060. Found TBB_INTERFACE_VERSION = 12050. The TBB threading layer is disabled.

2025-08-29 11:54:02,241 - BERTopic - Dimensionality - Completed ✓
2025-08-29 11:54:02,258 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-08-29 11:55:54,808 - BERTopic - Cluster - Completed ✓
2025-08-29 11:55:54,810 - BERTopic - Representation - Extracting topics using c-TF-IDF for topic reduction.
2025-08-29 11:55:59,408 - BERTopic - Representation - Completed ✓
2025-08-29 11:55:59,410 - BERTopic - Topic reduction - Reducing number of topics
2025-08-29 11:55:59,573 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-08-29 11:56:04,944 - BERTopic - Representation - Completed ✓
2025-08-29 11:56:04,9

In [5]:
topic_model.visualize_barchart()

In [6]:
topic_model.visualize_hierarchy()

In [7]:
new_topics = topic_model.reduce_outliers(text_df['clean_text'], topics, strategy="probabilities", probabilities=probs)

In [8]:
topic_info = topic_model.get_topic_info()
topic_id_to_name = dict(zip(topic_info['Topic'], topic_info['Name']))

text_df['topic'] = [
    topic_id_to_name.get(topic, f"Topic {topic}") 
    for topic in new_topics
]
text_df['topic_id'] = new_topics

In [9]:
text_df.head()

Unnamed: 0,author,id,type,date,clean_text,lemmatized_text,topic,topic_id
0,Raichu4u,1mv3z5n,post,2025-08-20T03:31:07+00:00,nirvana fallacy people dismiss real option isn...,nirvana fallacy people dismiss real option per...,0_democrats_voters_dems_votes,0
1,KingGhidorah1225,1musrfu,post,2025-08-19T19:41:42+00:00,term fascism used modern politics qualify diff...,term fascism use modern politics qualify diffe...,0_democrats_voters_dems_votes,0
2,Potato_Cat93,1mujuir,post,2025-08-19T14:22:03+00:00,epstein files reported shared starting friday ...,epstein file report shared start friday think ...,0_democrats_voters_dems_votes,0
3,Rong_Liu,1mumwdb,post,2025-08-19T16:11:58+00:00,discussion basic components social democracy t...,discussion basic component social democracy ta...,0_democrats_voters_dems_votes,0
4,Candle-Jolly,1mun37m,post,2025-08-19T16:18:59+00:00,since creation 2003 department homeland securi...,since creation 2003 department homeland securi...,0_democrats_voters_dems_votes,0


#### Computing topic at community-level

In [10]:
community_df = pd.read_csv('../../../src/data/distribuitions/hub_bridge_df.csv')
merged_df = text_df.merge(community_df, left_on='author', right_on='id', how='inner')

In [11]:
community_topic_counts = merged_df.groupby(['community_id', 'topic', 'topic_id']).size().reset_index(name='count')

# Compouting each tipic percentage in each community
community_totals = community_topic_counts.groupby('community_id')['count'].transform('sum')
community_topic_counts['percent'] = community_topic_counts['count'] / community_totals

# Retrieving the dominant topic for each community
top_n_topics = community_topic_counts.groupby('community_id').apply(
    lambda x: x.nlargest(5, 'count')
).reset_index(drop=True)
top_n_topics

Unnamed: 0,community_id,topic,topic_id,count,percent
0,0,1_christianity_muslims_religions_violent,1,36093,0.319982
1,0,0_democrats_voters_dems_votes,0,31431,0.278651
2,0,5_healthcare_tariffs_medicaid_deficit,5,9613,0.085224
3,0,2_genocide_hamas_palestinians_gaza,2,9231,0.081837
4,0,7_childish_reread_olds_distraction,7,4350,0.038565
...,...,...,...,...,...
170,34,6_superman_band_songs_anime,6,9,0.375000
171,34,0_democrats_voters_dems_votes,0,4,0.166667
172,34,7_childish_reread_olds_distraction,7,4,0.166667
173,34,1_christianity_muslims_religions_violent,1,3,0.125000


### Topic-role distribuition

In [12]:
# Hubs
hubs = merged_df.loc[merged_df['is_hub']]
hub_topic_counts = hubs.groupby(['community_id', 'topic']).size().reset_index(name='count')
hub_topic_counts['percent'] = hub_topic_counts.groupby(['community_id'])['count'].transform(lambda x: x/x.sum())

# Bridges
bridges = merged_df.loc[merged_df['is_bridge']]
bridge_topic_counts = bridges.groupby(['community_id', 'topic']).size().reset_index(name='count')
bridge_topic_counts['percent'] = bridge_topic_counts.groupby(['community_id'])['count'].transform(lambda x: x/x.sum())


In [13]:
hub_topic_counts.head()

Unnamed: 0,community_id,topic,count,percent
0,0,0_democrats_voters_dems_votes,4105,0.261348
1,0,10_tuition_loans_gpa_colleges,136,0.008659
2,0,11_paternity_tests_dna_pregnant,198,0.012606
3,0,12_shower_socks_smell_shampoo,16,0.001019
4,0,13_zoos_pets_horses_mice,100,0.006367


In [14]:
bridge_topic_counts.head()

Unnamed: 0,community_id,topic,count,percent
0,0,0_democrats_voters_dems_votes,1611,0.278864
1,0,10_tuition_loans_gpa_colleges,127,0.021984
2,0,11_paternity_tests_dna_pregnant,217,0.037563
3,0,12_shower_socks_smell_shampoo,47,0.008136
4,0,13_zoos_pets_horses_mice,67,0.011598


In [15]:
# Saving final dataframe
text_df.drop(columns=['clean_text'], inplace=True)
text_df.to_csv('../../../src/nlp/topic_modeling/topic_data.csv', index=False)

community_topic_counts.to_csv("../../../src/nlp/topic_modeling/community_topic_counts.csv", index=False)
top_n_topics.to_csv("../../../src/nlp/topic_modeling/top_n_topics.csv", index=False)
hub_topic_counts.to_csv("../../../src/nlp/topic_modeling/hub_topic_counts.csv", index=False)
bridge_topic_counts.to_csv("../../../src/nlp/topic_modeling/bridge_topic_counts.csv", index=False)