In [1]:
import pandas as pd
import torch
import numpy as np
import tqdm
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import normalize
from sklearn.feature_extraction.text import CountVectorizer
from hdbscan import HDBSCAN
from umap import UMAP

In [2]:
text_df = pd.read_csv('../../../src/nlp/cleaned_dataset.csv')

In [3]:
# Setting cuda environment
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device : {device}")

# Instantiating model embedding
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
embedding_model.to(device)

# Instantiating CountVectorizer (bag-of-words) to upgrade pre-procerssing
vectorizer_model = CountVectorizer(
    stop_words='english',
    min_df=0.002, # min number of a word occurance - rare words
    max_df=0.75 # max percentage of occurance - word likely stopword
)

# customizing HDBSCAN clustering model
hdbscan_model = HDBSCAN(
    min_cluster_size=500,
    min_samples=25,
    cluster_selection_epsilon=0.05,
    prediction_data=True,
    memory='./hdbscan_cache/',
    core_dist_n_jobs=1 
    
)

umap_model = UMAP(
    n_neighbors=20,
    n_components=5,
    min_dist=0.1,
    metric='cosine',
    low_memory=True
)

# Instantiating a representation model to customize topic labels
representation_model = KeyBERTInspired()

# Removing empty texts
text_df['clean_text'] = text_df['clean_text'].fillna('').astype(str)
text_df = text_df[text_df['clean_text'].str.strip() != ''].reset_index(drop=True)

topic_model = BERTopic(
    embedding_model=embedding_model,
    vectorizer_model=vectorizer_model,
    hdbscan_model=hdbscan_model,
    representation_model=representation_model,
    verbose=True,
    calculate_probabilities=True
)

topics, probs = topic_model.fit_transform(text_df['clean_text'].astype(str))

Using device : cuda


2025-08-25 11:08:53,775 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/7289 [00:00<?, ?it/s]

2025-08-25 11:11:29,356 - BERTopic - Embedding - Completed ✓
2025-08-25 11:11:29,358 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm

The TBB threading layer requires TBB version 2021 update 6 or later i.e., TBB_INTERFACE_VERSION >= 12060. Found TBB_INTERFACE_VERSION = 12050. The TBB threading layer is disabled.

2025-08-25 11:14:05,398 - BERTopic - Dimensionality - Completed ✓
2025-08-25 11:14:05,418 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-08-25 11:15:36,553 - BERTopic - Cluster - Completed ✓
2025-08-25 11:15:36,607 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-08-25 11:15:45,069 - BERTopic - Representation - Completed ✓


In [4]:
topic_model.visualize_barchart()

In [5]:
topic_model.visualize_hierarchy()

In [6]:
new_topics = topic_model.reduce_outliers(text_df['clean_text'], topics, strategy="probabilities", probabilities=probs)

In [7]:
topic_info = topic_model.get_topic_info()
topic_id_to_name = dict(zip(topic_info['Topic'], topic_info['Name']))

text_df['topic'] = [
    topic_id_to_name.get(topic, f"Topic {topic}") 
    for topic in new_topics
]
text_df['topic_id'] = new_topics

In [8]:
text_df.head()

Unnamed: 0,author,id,type,clean_text,topic,topic_id
0,Raichu4u,1mv3z5n,post,nirvana fallacy people dismiss real option isn...,54_disappearances_apathy_abuses_apathetic,54
1,KingGhidorah1225,1musrfu,post,term fascism used modern politics qualify diff...,58_speech_tendencies_suppression_hierarchy,58
2,Potato_Cat93,1mujuir,post,epstein files reported shared starting friday ...,14_democrats_republicans_election_dems,14
3,Rong_Liu,1mumwdb,post,discussion basic components social democracy t...,27_authoritarianism_democratic_democracies_aut...,27
4,Candle-Jolly,1mun37m,post,since creation 2003 department homeland securi...,7_democrats_republicans_biden_obama,7


#### Computing topic at community-level

In [15]:
community_df = pd.read_csv('../../../src/data/distribuitions/hub_bridge_df.csv')
merged_df = text_df.merge(community_df, left_on='author', right_on='id', how='inner')

In [16]:
community_topic_counts = merged_df.groupby(['community_id', 'topic', 'topic_id']).size().reset_index(name='count')

# Compouting each tipic percentage in each community
community_totals = community_topic_counts.groupby('community_id')['count'].transform('sum')
community_topic_counts['percent'] = community_topic_counts['count'] / community_totals

# Retrieving the dominant topic for each community
top_n_topics = community_topic_counts.groupby('community_id').apply(
    lambda x: x.nlargest(5, 'count')
).reset_index(drop=True)
top_n_topics

Unnamed: 0,community_id,topic,topic_id,count,percent
0,0,0_debates_discussions_moderation_debating,0,10894,0.096193
1,0,2_boundaries_expectations_boundary_roles,2,10250,0.090507
2,0,4_borders_settlements_deaths_territory,4,6058,0.053492
3,0,44_selfish_suicides_placebo_suicide,44,4642,0.040989
4,0,6_sharia_criticize_practices_criticism,6,3995,0.035276
...,...,...,...,...,...
200,40,1_boundaries_plans_bills_trip,1,17,0.515152
201,40,62_stubborn_irresponsible_responsibilities_adu...,62,5,0.151515
202,40,0_debates_discussions_moderation_debating,0,3,0.090909
203,40,30_delays_frontier_planes_overhead,30,3,0.090909


### Topic-role distribuition

In [17]:
# Hubs
hubs = merged_df.loc[merged_df['is_hub']]
hub_topic_counts = hubs.groupby(['community_id', 'topic']).size().reset_index(name='count')
hub_topic_counts['percent'] = hub_topic_counts.groupby(['community_id'])['count'].transform(lambda x: x/x.sum())

# Bridges
bridges = merged_df.loc[merged_df['is_bridge']]
bridge_topic_counts = bridges.groupby(['community_id', 'topic']).size().reset_index(name='count')
bridge_topic_counts['percent'] = bridge_topic_counts.groupby(['community_id'])['count'].transform(lambda x: x/x.sum())


In [18]:
hub_topic_counts.head()

Unnamed: 0,community_id,topic,count,percent
0,0,0_debates_discussions_moderation_debating,3353,0.102432
1,0,10_democratic_voters_dems_democrats,1059,0.032352
2,0,11_viewership_revenue_competition_gambling,341,0.010417
3,0,12_topics_behaviour_deeper_shaming,594,0.018146
4,0,13_corporations_markets_meetings_executives,670,0.020468


In [19]:
bridge_topic_counts.head()

Unnamed: 0,community_id,topic,count,percent
0,0,0_debates_discussions_moderation_debating,1423,0.086949
1,0,10_democratic_voters_dems_democrats,452,0.027618
2,0,11_viewership_revenue_competition_gambling,240,0.014665
3,0,12_topics_behaviour_deeper_shaming,314,0.019186
4,0,13_corporations_markets_meetings_executives,391,0.023891


In [None]:
# Saving final dataframe
text_df.drop(columns=['clean_text'], inplace=True)
text_df.to_csv('../../../src/nlp/topic_modeling/topic_data.csv', index=False)

community_topic_counts.to_csv("../../../src/nlp/topic_modeling/community_topic_counts.csv", index=False)
top_n_topics.to_csv("../../../src/nlp/topic_modeling/top_n_topics.csv", index=False)
hub_topic_counts.to_csv("../../../src/nlp/topic_modeling/hub_topic_counts.csv", index=False)
bridge_topic_counts.to_csv("../../../src/nlp/topic_modeling/bridge_topic_counts.csv", index=False)