In [1]:
import pandas as pd
import torch
import numpy as np
import tqdm
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import normalize
from sklearn.feature_extraction.text import CountVectorizer
from hdbscan import HDBSCAN
from umap import UMAP

2025-08-28 10:12:59.727888: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-08-28 10:13:00.113876: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1756368780.258101    1187 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1756368780.301224    1187 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1756368780.625157    1187 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

In [2]:
text_df = pd.read_csv('../../../src/nlp/cleaned_dataset.csv')

In [3]:
# Setting cuda environment
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device : {device}")

# Instantiating model embedding
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
embedding_model.to(device)

# Instantiating CountVectorizer (bag-of-words) to upgrade pre-procerssing
vectorizer_model = CountVectorizer(
    stop_words='english',
    min_df=0.001,
    max_df=0.7,
    max_features=30000
)

# Customizing HDBSCAN clustering model
hdbscan_model = HDBSCAN(
    min_cluster_size=150,
    min_samples=10,
    cluster_selection_epsilon=0.01,
    prediction_data=True,
    memory='./hdbscan_cache/',
    core_dist_n_jobs=1 
)

umap_model = UMAP(
    n_neighbors=30,
    n_components=5,
    min_dist=0.05,
    metric='cosine',
    low_memory=True
)

# Instantiating a representation model to customize topic labels
representation_model = MaximalMarginalRelevance(diversity=0.3)

# Removing empty texts
text_df['clean_text'] = text_df['clean_text'].fillna('').astype(str)
text_df = text_df[text_df['clean_text'].str.strip() != ''].reset_index(drop=True)

topic_model = BERTopic(
    embedding_model=embedding_model,
    vectorizer_model=vectorizer_model,
    hdbscan_model=hdbscan_model,
    representation_model=representation_model,
    verbose=True,
    calculate_probabilities=True,
    nr_topics= 50
)

topics, probs = topic_model.fit_transform(text_df['clean_text'].astype(str))

Using device : cuda


2025-08-28 10:13:14,995 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/7208 [00:00<?, ?it/s]

2025-08-28 10:15:42,149 - BERTopic - Embedding - Completed ✓
2025-08-28 10:15:42,151 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm

The TBB threading layer requires TBB version 2021 update 6 or later i.e., TBB_INTERFACE_VERSION >= 12060. Found TBB_INTERFACE_VERSION = 12050. The TBB threading layer is disabled.

2025-08-28 10:17:46,915 - BERTopic - Dimensionality - Completed ✓
2025-08-28 10:17:46,932 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-08-28 10:23:15,269 - BERTopic - Cluster - Completed ✓
2025-08-28 10:23:15,271 - BERTopic - Representation - Extracting topics using c-TF-IDF for topic reduction.
2025-08-28 10:23:17,337 - BERTopic - Representation - Completed ✓
2025-08-28 10:23:17,339 - BERTopic - Topic reduction - Reducing number of topics
2025-08-28 10:23:17,586 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-08-28 10:23:23,173 - BERTopic - Representation - Completed ✓
2025-08-28 10:23:23,1

In [4]:
topic_model.visualize_barchart()

In [5]:
topic_model.visualize_hierarchy()

In [6]:
new_topics = topic_model.reduce_outliers(text_df['clean_text'], topics, strategy="probabilities", probabilities=probs)

In [7]:
topic_info = topic_model.get_topic_info()
topic_id_to_name = dict(zip(topic_info['Topic'], topic_info['Name']))

text_df['topic'] = [
    topic_id_to_name.get(topic, f"Topic {topic}") 
    for topic in new_topics
]
text_df['topic_id'] = new_topics

In [8]:
text_df.head()

Unnamed: 0,author,id,type,clean_text,topic,topic_id
0,Raichu4u,1mv3z5n,post,nirvana fallacy people dismiss real option isn...,0_vote_epstein_democrats_voters,0
1,KingGhidorah1225,1musrfu,post,term fascism used modern politics qualify diff...,2_genocide_hamas_war_gaza,2
2,Potato_Cat93,1mujuir,post,epstein files reported shared starting friday ...,0_vote_epstein_democrats_voters,0
3,Rong_Liu,1mumwdb,post,discussion basic components social democracy t...,5_healthcare_capitalism_taxes_tariffs,5
4,Candle-Jolly,1mun37m,post,since creation 2003 department homeland securi...,1_immigrants_citizenship_undocumented_deported,1


#### Computing topic at community-level

In [9]:
community_df = pd.read_csv('../../../src/data/distribuitions/hub_bridge_df.csv')
merged_df = text_df.merge(community_df, left_on='author', right_on='id', how='inner')

In [10]:
community_topic_counts = merged_df.groupby(['community_id', 'topic', 'topic_id']).size().reset_index(name='count')

# Compouting each tipic percentage in each community
community_totals = community_topic_counts.groupby('community_id')['count'].transform('sum')
community_topic_counts['percent'] = community_topic_counts['count'] / community_totals

# Retrieving the dominant topic for each community
top_n_topics = community_topic_counts.groupby('community_id').apply(
    lambda x: x.nlargest(5, 'count')
).reset_index(drop=True)
top_n_topics

Unnamed: 0,community_id,topic,topic_id,count,percent
0,0,4_racism_genes_privilege_eugenics,4,18872,0.167309
1,0,0_vote_epstein_democrats_voters,0,15319,0.135810
2,0,5_healthcare_capitalism_taxes_tariffs,5,12267,0.108753
3,0,2_genocide_hamas_war_gaza,2,11008,0.097591
4,0,1_immigrants_citizenship_undocumented_deported,1,6766,0.059984
...,...,...,...,...,...
170,34,14_music_superman_songs_films,14,8,0.333333
171,34,0_vote_epstein_democrats_voters,0,4,0.166667
172,34,16_insult_apology_insults_replying,16,3,0.125000
173,34,12_dating_violence_relationships_romantic,12,2,0.083333


### Topic-role distribuition

In [11]:
# Hubs
hubs = merged_df.loc[merged_df['is_hub']]
hub_topic_counts = hubs.groupby(['community_id', 'topic']).size().reset_index(name='count')
hub_topic_counts['percent'] = hub_topic_counts.groupby(['community_id'])['count'].transform(lambda x: x/x.sum())

# Bridges
bridges = merged_df.loc[merged_df['is_bridge']]
bridge_topic_counts = bridges.groupby(['community_id', 'topic']).size().reset_index(name='count')
bridge_topic_counts['percent'] = bridge_topic_counts.groupby(['community_id'])['count'].transform(lambda x: x/x.sum())


In [12]:
hub_topic_counts.head()

Unnamed: 0,community_id,topic,count,percent
0,0,0_vote_epstein_democrats_voters,1876,0.119437
1,0,10_tipping_tip_wage_tipped,410,0.026103
2,0,11_massage_subjective_qualia_consciousness,643,0.040937
3,0,12_dating_violence_relationships_romantic,556,0.035398
4,0,13_wedding_bride_ceremony_bridesmaid,214,0.013624


In [13]:
bridge_topic_counts.head()

Unnamed: 0,community_id,topic,count,percent
0,0,0_vote_epstein_democrats_voters,834,0.144366
1,0,10_tipping_tip_wage_tipped,300,0.05193
2,0,11_massage_subjective_qualia_consciousness,226,0.039121
3,0,12_dating_violence_relationships_romantic,309,0.053488
4,0,13_wedding_bride_ceremony_bridesmaid,192,0.033235


In [14]:
# Saving final dataframe
text_df.drop(columns=['clean_text'], inplace=True)
text_df.to_csv('../../../src/nlp/topic_modeling/topic_data.csv', index=False)

community_topic_counts.to_csv("../../../src/nlp/topic_modeling/community_topic_counts.csv", index=False)
top_n_topics.to_csv("../../../src/nlp/topic_modeling/top_n_topics.csv", index=False)
hub_topic_counts.to_csv("../../../src/nlp/topic_modeling/hub_topic_counts.csv", index=False)
bridge_topic_counts.to_csv("../../../src/nlp/topic_modeling/bridge_topic_counts.csv", index=False)