In [1]:
import pandas as pd
import torch
import numpy as np
import tqdm
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import normalize
from sklearn.feature_extraction.text import CountVectorizer
from hdbscan import HDBSCAN
from umap import UMAP

2025-08-23 12:18:27.968367: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-08-23 12:18:28.040502: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1755944308.073360   19519 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1755944308.081505   19519 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1755944308.115396   19519 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

In [2]:
text_df = pd.read_csv('../../../src/nlp/cleaned_dataset.csv')

In [3]:
# Setting cuda environment
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device : {device}")

# Instantiating model embedding
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
embedding_model.to(device)

# Instantiating CountVectorizer (bag-of-words) to upgrade pre-procerssing
vectorizer_model = CountVectorizer(
    stop_words='english',
    min_df=0.002, # min number of a word occurance - rare words
    max_df=0.75 # max percentage of occurance - word likely stopword
)

# customizing HDBSCAN clustering model
hdbscan_model = HDBSCAN(
    min_cluster_size=500,
    min_samples=25,
    cluster_selection_epsilon=0.05,
    prediction_data=True
)

umap_model = UMAP(
    n_neighbors=20,
    n_components=5,
    min_dist=0.1,
    metric='cosine',
)

# Instantiating a representation model to customize topic labels
representation_model = KeyBERTInspired()

# Removing empty texts
text_df['clean_text'] = text_df['clean_text'].fillna('').astype(str)
text_df = text_df[text_df['clean_text'].str.strip() != ''].reset_index(drop=True)

topic_model = BERTopic(
    embedding_model=embedding_model,
    vectorizer_model=vectorizer_model,
    hdbscan_model=hdbscan_model,
    representation_model=representation_model,
    verbose=True,
    calculate_probabilities=True
)

topics, probs = topic_model.fit_transform(text_df['clean_text'].astype(str))

Using device : cuda


2025-08-23 12:18:42,576 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/6360 [00:00<?, ?it/s]

2025-08-23 12:21:03,589 - BERTopic - Embedding - Completed ✓
2025-08-23 12:21:03,591 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm

The TBB threading layer requires TBB version 2021 update 6 or later i.e., TBB_INTERFACE_VERSION >= 12060. Found TBB_INTERFACE_VERSION = 12050. The TBB threading layer is disabled.

2025-08-23 12:24:28,451 - BERTopic - Dimensionality - Completed ✓
2025-08-23 12:24:28,468 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-08-23 12:25:39,678 - BERTopic - Cluster - Completed ✓
2025-08-23 12:25:39,722 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-08-23 12:25:47,691 - BERTopic - Representation - Completed ✓


In [4]:
topic_model.visualize_barchart()

In [5]:
topic_model.visualize_hierarchy()

In [6]:
new_topics = topic_model.reduce_outliers(text_df['clean_text'], topics, strategy="probabilities", probabilities=probs)

In [7]:
topic_info = topic_model.get_topic_info()
topic_id_to_name = dict(zip(topic_info['Topic'], topic_info['Name']))

text_df['topic'] = [
    topic_id_to_name.get(topic, f"Topic {topic}") 
    for topic in new_topics
]
text_df['topic_id'] = new_topics

In [8]:
text_df.head()

Unnamed: 0,author,id,type,clean_text,topic,topic_id
0,Raichu4u,1mv3z5n,post,nirvana fallacy people dismiss real option isn...,42_abstain_abstaining_lazy_dislike,42
1,Potato_Cat93,1mujuir,post,epstein files reported shared starting friday ...,9_democrats_election_republicans_dems,9
2,Candle-Jolly,1mun37m,post,since creation 2003 department homeland securi...,6_administration_president_deportations_obama,6
3,PsychLegalMind,1mu59zz,post,white house zelensky meeting followed eu meeti...,0_nato_putin_russia_territory,0
4,Virtual-Orchid3065,1mt26yb,post,democrats care image action jan 6 california c...,14_congressional_partisan_congress_presidential,14


#### Computing topic at community-level

In [9]:
community_df = pd.read_csv('../../../src/data/distribuitions/hub_bridge_df.csv')
merged_df = text_df.merge(community_df, left_on='author', right_on='id', how='inner')

In [None]:
community_topic_counts = merged_df.groupby(['community_id', 'topic', 'topic_id']).size().reset_index(name='count')

# Compouting each tipic percentage in each community
community_totals = community_topic_counts.groupby('community_id')['count'].transform('sum')
community_topic_counts['percent'] = community_topic_counts['count'] / community_totals

# Retrieving the dominant topic for each community
top_n_topics = community_topic_counts.groupby('community_id').apply(
    lambda x: x.nlargest(5, 'count')
).reset_index(drop=True)
top_n_topics

NameError: name 'merged_df' is not defined

### Topic-role distribuition

In [11]:
# Hubs
hubs = merged_df.loc[merged_df['is_hub']]
hub_topic_counts = hubs.groupby(['community_id', 'topic']).size().reset_index(name='count')
hub_topic_counts['percent'] = hub_topic_counts.groupby(['community_id'])['count'].transform(lambda x: x/x.sum())

# Bridges
bridges = merged_df.loc[merged_df['is_bridge']]
bridge_topic_counts = bridges.groupby(['community_id', 'topic']).size().reset_index(name='count')
bridge_topic_counts['percent'] = bridge_topic_counts.groupby(['community_id'])['count'].transform(lambda x: x/x.sum())


In [12]:
hub_topic_counts.head()

Unnamed: 0,community_id,topic,count,percent
0,0,0_nato_putin_russia_territory,1974,0.080254
1,0,10_amendment_campaign_conservatives_liberals,741,0.030126
2,0,11_appeals_incidents_defended_hostile,3,0.000122
3,0,12_nato_planned_regimes_mao,465,0.018905
4,0,13_obamacare_appointments_denials_medicare,356,0.014473


In [13]:
bridge_topic_counts.head()

Unnamed: 0,community_id,topic,count,percent
0,0,0_nato_putin_russia_territory,916,0.071697
1,0,10_amendment_campaign_conservatives_liberals,424,0.033187
2,0,11_appeals_incidents_defended_hostile,4,0.000313
3,0,12_nato_planned_regimes_mao,147,0.011506
4,0,13_obamacare_appointments_denials_medicare,89,0.006966


In [15]:
# Saving final dataframe
text_df.drop(columns=['clean_text'], inplace=True)
text_df.to_csv('../../../src/nlp/topic_modeling/topic_data.csv', index=False)

community_topic_counts.to_csv("../../../src/nlp/topic_modeling/community_topic_counts.csv", index=False)
top_n_topics.to_csv("../../../src/nlp/topic_modeling/top_n_topics.csv", index=False)
hub_topic_counts.to_csv("../../../src/nlp/topic_modeling/hub_topic_counts.csv", index=False)
bridge_topic_counts.to_csv("../../../src/nlp/topic_modeling/bridge_topic_counts.csv", index=False)