In [1]:
import pandas as pd
import torch
import numpy as np
import tqdm
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import normalize
from sklearn.feature_extraction.text import CountVectorizer
from hdbscan import HDBSCAN
from umap import UMAP

2025-08-26 16:46:18.085804: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-08-26 16:46:18.152838: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1756219578.173794    7424 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1756219578.183409    7424 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1756219578.211327    7424 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

In [2]:
text_df = pd.read_csv('../../../src/nlp/cleaned_dataset.csv')

In [3]:
# Setting cuda environment
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device : {device}")

# Instantiating model embedding
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
embedding_model.to(device)

# Instantiating CountVectorizer (bag-of-words) to upgrade pre-procerssing
vectorizer_model = CountVectorizer(
    stop_words='english',
    min_df=0.001,
    max_df=0.80,
    max_features=15000,
    lowercase=True
)

# customizing HDBSCAN clustering model
hdbscan_model = HDBSCAN(
    min_cluster_size=600,
    min_samples=25,
    cluster_selection_epsilon=0.05,
    prediction_data=True,
    memory='./hdbscan_cache/',
    core_dist_n_jobs=1 
)

umap_model = UMAP(
    n_neighbors=20,
    n_components=5,
    min_dist=0.1,
    metric='cosine',
    low_memory=True
)

# Instantiating a representation model to customize topic labels
representation_model = KeyBERTInspired()

# Removing empty texts
text_df['clean_text'] = text_df['clean_text'].fillna('').astype(str)
text_df = text_df[text_df['clean_text'].str.strip() != ''].reset_index(drop=True)

topic_model = BERTopic(
    embedding_model=embedding_model,
    vectorizer_model=vectorizer_model,
    hdbscan_model=hdbscan_model,
    representation_model=representation_model,
    verbose=True,
    calculate_probabilities=True
)

topics, probs = topic_model.fit_transform(text_df['clean_text'].astype(str))

Using device : cuda


2025-08-26 16:46:34,355 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/7289 [00:00<?, ?it/s]

2025-08-26 16:49:04,738 - BERTopic - Embedding - Completed ✓
2025-08-26 16:49:04,740 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm

The TBB threading layer requires TBB version 2021 update 6 or later i.e., TBB_INTERFACE_VERSION >= 12060. Found TBB_INTERFACE_VERSION = 12050. The TBB threading layer is disabled.

2025-08-26 16:51:35,946 - BERTopic - Dimensionality - Completed ✓
2025-08-26 16:51:35,965 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-08-26 16:52:55,683 - BERTopic - Cluster - Completed ✓
2025-08-26 16:52:55,730 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-08-26 16:53:02,512 - BERTopic - Representation - Completed ✓


In [4]:
topic_model.visualize_barchart()

In [5]:
topic_model.visualize_hierarchy()

In [6]:
new_topics = topic_model.reduce_outliers(text_df['clean_text'], topics, strategy="probabilities", probabilities=probs)

In [7]:
topic_info = topic_model.get_topic_info()
topic_id_to_name = dict(zip(topic_info['Topic'], topic_info['Name']))

text_df['topic'] = [
    topic_id_to_name.get(topic, f"Topic {topic}") 
    for topic in new_topics
]
text_df['topic_id'] = new_topics

In [8]:
text_df.head()

Unnamed: 0,author,id,type,clean_text,topic,topic_id
0,Raichu4u,1mv3z5n,post,nirvana fallacy people dismiss real option isn...,0_politicians_elections_gerrymandering_democrats,0
1,KingGhidorah1225,1musrfu,post,term fascism used modern politics qualify diff...,51_crisis_embrace_violence_tendencies,51
2,Potato_Cat93,1mujuir,post,epstein files reported shared starting friday ...,15_democrats_election_democrat_dems,15
3,Rong_Liu,1mumwdb,post,discussion basic components social democracy t...,18_regimes_democracies_dictatorship_democratic,18
4,Candle-Jolly,1mun37m,post,since creation 2003 department homeland securi...,7_democrats_republicans_president_biden,7


#### Computing topic at community-level

In [9]:
community_df = pd.read_csv('../../../src/data/distribuitions/hub_bridge_df.csv')
merged_df = text_df.merge(community_df, left_on='author', right_on='id', how='inner')

In [10]:
community_topic_counts = merged_df.groupby(['community_id', 'topic', 'topic_id']).size().reset_index(name='count')

# Compouting each tipic percentage in each community
community_totals = community_topic_counts.groupby('community_id')['count'].transform('sum')
community_topic_counts['percent'] = community_topic_counts['count'] / community_totals

# Retrieving the dominant topic for each community
top_n_topics = community_topic_counts.groupby('community_id').apply(
    lambda x: x.nlargest(5, 'count')
).reset_index(drop=True)
top_n_topics

Unnamed: 0,community_id,topic,topic_id,count,percent
0,0,0_politicians_elections_gerrymandering_democrats,0,13339,0.117852
1,0,2_debates_conclusions_disinformation_discussions,2,9799,0.086576
2,0,4_outcomes_statistics_roles_expectations,4,7107,0.062792
3,0,5_casualties_attacks_terrorism_hamas,5,7026,0.062076
4,0,45_qualitative_closure_experiences_descriptions,45,4632,0.040925
...,...,...,...,...,...
180,36,7_democrats_republicans_president_biden,7,8,0.160000
181,36,0_politicians_elections_gerrymandering_democrats,0,6,0.120000
182,36,2_debates_conclusions_disinformation_discussions,2,6,0.120000
183,36,16_unrealized_federal_progressive_bezos,16,5,0.100000


### Topic-role distribuition

In [11]:
# Hubs
hubs = merged_df.loc[merged_df['is_hub']]
hub_topic_counts = hubs.groupby(['community_id', 'topic']).size().reset_index(name='count')
hub_topic_counts['percent'] = hub_topic_counts.groupby(['community_id'])['count'].transform(lambda x: x/x.sum())

# Bridges
bridges = merged_df.loc[merged_df['is_bridge']]
bridge_topic_counts = bridges.groupby(['community_id', 'topic']).size().reset_index(name='count')
bridge_topic_counts['percent'] = bridge_topic_counts.groupby(['community_id'])['count'].transform(lambda x: x/x.sum())


In [12]:
hub_topic_counts.head()

Unnamed: 0,community_id,topic,count,percent
0,0,0_politicians_elections_gerrymandering_democrats,1751,0.110564
1,0,10_democratic_voting_voters_dems,464,0.029298
2,0,11_toilets_stalls_jug_soap,41,0.002589
3,0,12_gambling_viewership_titles_entertaining,136,0.008587
4,0,13_infrastructure_crisis_communities_economic,161,0.010166


In [13]:
bridge_topic_counts.head()

Unnamed: 0,community_id,topic,count,percent
0,0,0_politicians_elections_gerrymandering_democrats,535,0.103422
1,0,10_democratic_voting_voters_dems,138,0.026677
2,0,11_toilets_stalls_jug_soap,74,0.014305
3,0,12_gambling_viewership_titles_entertaining,75,0.014498
4,0,13_infrastructure_crisis_communities_economic,74,0.014305


In [15]:
# Saving final dataframe
text_df.drop(columns=['clean_text'], inplace=True)
text_df.to_csv('../../../src/nlp/topic_modeling/topic_data.csv', index=False)

community_topic_counts.to_csv("../../../src/nlp/topic_modeling/community_topic_counts.csv", index=False)
top_n_topics.to_csv("../../../src/nlp/topic_modeling/top_n_topics.csv", index=False)
hub_topic_counts.to_csv("../../../src/nlp/topic_modeling/hub_topic_counts.csv", index=False)
bridge_topic_counts.to_csv("../../../src/nlp/topic_modeling/bridge_topic_counts.csv", index=False)