In [1]:
import pandas as pd
import torch
import numpy as np
import tqdm
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import normalize
from sklearn.feature_extraction.text import CountVectorizer
from hdbscan import HDBSCAN

2025-08-20 16:12:53.983330: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-08-20 16:12:54.305382: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1755699174.425077    1137 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1755699174.456928    1137 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1755699174.719150    1137 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

In [2]:
# Cleaned texts (one row per post/comment) and per-user community metrics
text_df = pd.read_csv('../../../src/nlp/cleaned_dataset.csv')
community_df = pd.read_csv('../../../src/data/distribuitions/hub_bridge_df.csv')

# Attach community metrics to every text through its author. The inner join keeps
# only texts whose author has a community row; both frames carry an `id` column,
# so pandas suffixes them as `id_x` (text side) and `id_y` (community side).
merged_df = pd.merge(
    text_df,
    community_df,
    how='inner',
    left_on='author',
    right_on='id',
)

In [3]:
# Discard rows without usable text first: missing values become '' and
# whitespace-only strings are filtered out, then the index is rebuilt so that
# row positions line up with the outputs of fit_transform below.
merged_df['clean_text'] = merged_df['clean_text'].fillna('').astype(str)
has_text = merged_df['clean_text'].str.strip().ne('')
merged_df = merged_df.loc[has_text].reset_index(drop=True)

# Use the GPU when torch can see one, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device : {device}")

# Sentence-embedding model, moved onto the selected device
embedding_model = SentenceTransformer('all-mpnet-base-v2')
embedding_model.to(device)

# CountVectorizer (bag-of-words) handed to BERTopic:
#   stop_words - drop English stop words
#   min_df     - minimum number of documents a term must appear in (rare words)
#   max_df     - terms present in more than 90% of documents are treated as stop words
vectorizer_model = CountVectorizer(stop_words='english', min_df=1, max_df=0.9)

# Customised HDBSCAN clustering model
hdbscan_model = HDBSCAN(
    min_cluster_size=10,
    min_samples=3,
    cluster_selection_epsilon=0.2,
    prediction_data=True,
)

# Representation model used to customise the topic labels
representation_model = KeyBERTInspired()

# Assemble the topic model from the components above
topic_model = BERTopic(
    embedding_model=embedding_model,
    vectorizer_model=vectorizer_model,
    hdbscan_model=hdbscan_model,
    representation_model=representation_model,
)

# Fit on the cleaned texts: `topics` holds one topic id per row of merged_df,
# `probs` is the second value returned by fit_transform.
# NOTE(review): no random seed is set anywhere in this cell, so topic ids/names
# may differ between runs - confirm whether reproducibility is required.
topics, probs = topic_model.fit_transform(merged_df['clean_text'].astype(str))

Using device : cuda



The TBB threading layer requires TBB version 2021 update 6 or later i.e., TBB_INTERFACE_VERSION >= 12060. Found TBB_INTERFACE_VERSION = 12050. The TBB threading layer is disabled.



In [4]:
# Lookup table from topic id ('Topic') to topic name ('Name'), built from the
# fitted model's topic summary
topic_info = topic_model.get_topic_info()
topic_id_to_name = topic_info.set_index('Topic')['Name'].to_dict()

# Label every text with its topic name and keep the raw id next to it.
# A missing id raises KeyError, exactly as a plain dict lookup would.
merged_df['topic'] = list(map(topic_id_to_name.__getitem__, topics))
merged_df['topic_id'] = topics

In [5]:
# Encode every cleaned text with the sentence-embedding model.
# NOTE(review): fit_transform was called without precomputed embeddings, so the
# same texts were presumably already encoded once during fitting - confirm, and
# consider encoding once and reusing the result.
clean_texts = merged_df['clean_text'].tolist()
embeddings = embedding_model.encode(clean_texts, show_progress_bar=True)

# Keep one embedding per row as a plain Python list
merged_df['embedding'] = embeddings.tolist()

Batches:   0%|          | 0/3092 [00:00<?, ?it/s]

In [6]:
# Display BERTopic's bar-chart view of the fitted topics (library default arguments)
topic_model.visualize_barchart()

In [7]:
# Display BERTopic's hierarchy view of the fitted topics (library default arguments)
topic_model.visualize_hierarchy()

#### Computing topics at the community level

In [8]:
# Spot-check: all texts written by members of community 74
in_community_74 = merged_df['community_id'].eq(74.0)
base = merged_df[in_community_74]
base

Unnamed: 0,author,id_x,type,clean_text,id_y,indegree,outdegree,degree,internal_degree,community_id,community_type,external_degree,pct_internal,pct_external,is_hub,is_bridge,topic,topic_id,embedding
28188,MagnanimosDesolation,t1_n2hpebz,comment,ok disney hot hectic exhausting often unnecess...,MagnanimosDesolation,7,5,12,4,74.0,Weak community,8,0.333333,0.666667,False,True,-1_trump_government_political_congress,-1,"[0.05375266447663307, 0.02026956155896187, 0.0..."
30908,elocin1985,t1_n36y1va,comment,grandma rose secret didn t find died dad ances...,elocin1985,4,3,7,4,74.0,Weak community,3,0.571429,0.428571,False,False,0_political_republicans_government_democrats,0,"[0.037965159863233566, 0.02537514828145504, -0..."
32637,MagnanimosDesolation,t1_n3e7mz6,comment,scandal administration able simply spin ignore...,MagnanimosDesolation,7,5,12,4,74.0,Weak community,8,0.333333,0.666667,False,True,0_political_republicans_government_democrats,0,"[0.016284022480249405, 0.07522033154964447, -0..."
33789,MediocreSizedDan,t1_n3b8esl,comment,guess curious argument must deport 11 million ...,MediocreSizedDan,2,4,6,1,74.0,Weak community,5,0.166667,0.833333,False,True,0_political_republicans_government_democrats,0,"[-0.021855007857084274, 0.09594006091356277, 0..."
33848,MediocreSizedDan,t1_n3bd97w,comment,sure understand point kicking certain period p...,MediocreSizedDan,2,4,6,1,74.0,Weak community,5,0.166667,0.833333,False,True,0_political_republicans_government_democrats,0,"[-0.043908920139074326, 0.07180030643939972, 0..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
77611,unbelizeable1,t1_mzny97z,comment,yea focus illegal shady shit see come legaliza...,unbelizeable1,6,9,15,6,74.0,Weak community,9,0.400000,0.600000,False,False,0_political_republicans_government_democrats,0,"[-0.061489030718803406, 0.12323072552680969, 0..."
77627,unbelizeable1,t1_mznz5oz,comment,really bummed sex nowhere important sex traffi...,unbelizeable1,6,9,15,6,74.0,Weak community,9,0.400000,0.600000,False,False,0_political_republicans_government_democrats,0,"[0.02317735180258751, 0.04522618278861046, -0...."
89593,unbelizeable1,t1_myogxdl,comment,imo biggest mistakes us history accepting trea...,unbelizeable1,6,9,15,6,74.0,Weak community,9,0.400000,0.600000,False,False,-1_trump_government_political_congress,-1,"[-0.015500646084547043, 0.034452587366104126, ..."
93818,half_way_by_accident,t1_mzdtedk,comment,sorry really wtf trying asked know better syst...,half_way_by_accident,0,5,5,2,74.0,Weak community,3,0.400000,0.600000,False,False,0_political_republicans_government_democrats,0,"[0.018433477729558945, 0.008822071366012096, 0..."


In [9]:
# Number of texts per (community, topic) pair
community_topic_counts = (
    merged_df
    .groupby(['community_id', 'topic', 'topic_id'])
    .size()
    .reset_index(name='count')
)

# Computing each topic's share within its community
community_totals = community_topic_counts.groupby('community_id')['count'].transform('sum')
community_topic_counts['percent'] = community_topic_counts['count'] / community_totals

# Retrieving the dominant topic for each community.
# idxmax returns the row label of the largest count per community (first one on
# ties), so a single .loc lookup replaces the row-wise groupby.apply. This avoids
# applying a function over the grouping column, which is deprecated in recent
# pandas releases and would change the resulting columns.
dominant_rows = community_topic_counts.groupby('community_id')['count'].idxmax()
dominant_topics = (
    community_topic_counts
    .loc[dominant_rows]
    # Same layout as before: community_id is both the index and a regular column
    .set_index('community_id', drop=False)
)
dominant_topics

Unnamed: 0_level_0,community_id,topic,topic_id,count,percent
community_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0.0,0.0,0_political_republicans_government_democrats,0,23126,0.923489
1.0,1.0,0_political_republicans_government_democrats,0,16088,0.695126
2.0,2.0,0_political_republicans_government_democrats,0,2398,0.612203
3.0,3.0,0_political_republicans_government_democrats,0,1224,0.958496
4.0,4.0,0_political_republicans_government_democrats,0,1546,0.893125
...,...,...,...,...,...
165.0,165.0,0_political_republicans_government_democrats,0,51,0.944444
166.0,166.0,0_political_republicans_government_democrats,0,60,0.937500
167.0,167.0,0_political_republicans_government_democrats,0,44,0.977778
168.0,168.0,0_political_republicans_government_democrats,0,50,1.000000


### Computing the mean embedding for each community in the dataframe

In [10]:
# Average the text embeddings of all rows that share a community_id
community_mean_vectors = merged_df.groupby('community_id')['embedding'].apply(
    lambda vectors: np.stack(vectors).mean(axis=0)
)
community_embeddings = community_mean_vectors.reset_index(name='mean_community_embedding')

# Rescale every community vector to unit L2 norm (row-wise), stored back as lists
community_matrix = np.stack(community_embeddings['mean_community_embedding'])
community_embeddings['mean_community_embedding'] = normalize(
    community_matrix, norm='l2', axis=1
).tolist()
community_embeddings


Unnamed: 0,community_id,mean_community_embedding
0,0.0,"[-0.027600232638943952, 0.124464299879511, 0.0..."
1,1.0,"[0.04260249596115194, 0.0924507493347344, 0.00..."
2,2.0,"[0.07687575150118457, 0.046913692320080656, -0..."
3,3.0,"[-0.00619382628680516, 0.08439805578237626, 0...."
4,4.0,"[-0.03665238954230591, 0.11616745677609945, 0...."
...,...,...
165,165.0,"[0.010630359179901619, 0.13186817481581545, 0...."
166,166.0,"[0.037806238709498195, 0.07690723935570926, 0...."
167,167.0,"[0.036021501511538975, -0.003675442877892489, ..."
168,168.0,"[0.010211729357658694, 0.04051775931994275, 0...."


In [11]:
# Sanity check: per-text and per-community vectors should have the same length
text_dim = len(merged_df['embedding'].iloc[0])
community_dim = len(community_embeddings['mean_community_embedding'].iloc[0])

print("=== Embeddings Dimensions ===")
print("Dimension embedding texts (BERT):", text_dim)
print("Dimension embedding community (mean):", community_dim)

# Row count of the community table next to the number of distinct authors
n_communities = community_embeddings.shape[0]
n_unique_users = merged_df['author'].nunique()

print("\n=== Final Shape ===")
print("Community num:", n_communities)
print("Unique users num:", n_unique_users)

=== Embeddings Dimensions ===
Dimension embedding texts (BERT): 768
Dimension embedding community (mean): 768

=== Final Shape ===
Community num: 170
Unique users num: 3348


In [12]:
# Average the text embeddings of all rows written by the same author
user_mean_vectors = merged_df.groupby('author')['embedding'].apply(
    lambda vectors: np.stack(vectors).mean(axis=0)
)
user_embeddings = user_mean_vectors.reset_index(name='mean_user_embedding')

# Rescale every user vector to unit L2 norm (row-wise), stored back as lists
user_matrix = np.stack(user_embeddings['mean_user_embedding'])
user_embeddings['mean_user_embedding'] = normalize(
    user_matrix, norm='l2', axis=1
).tolist()
user_embeddings

Unnamed: 0,author,mean_user_embedding
0,---AI---,"[0.06015991267402478, 0.06129952461877571, -0...."
1,--o,"[0.02817598359652301, 0.09259338883385021, 0.0..."
2,-Ch4s3-,"[-0.02392391131765903, 0.08075527522377651, 0...."
3,-Foxer,"[-0.009595146311128198, 0.06259979061566182, 0..."
4,-ReadingBug-,"[-0.034770284511081564, 0.13370757970011826, 0..."
...,...,...
3343,zayelion,"[-0.012365274927683013, 0.11976527059508234, 0..."
3344,zeussays,"[-0.03516050167924896, 0.040504033027540735, -..."
3345,zookeepier,"[-0.033230956343177, 0.08359358144134907, -0.0..."
3346,zxxQQz,"[0.01440717608256693, 0.08212747409928399, 0.0..."


In [13]:
# Sanity check: per-text and per-user vectors should have the same length
text_dim = len(merged_df['embedding'].iloc[0])
user_dim = len(user_embeddings['mean_user_embedding'].iloc[0])

print("=== User Embeddings Dimensions ===")
print("Dimension embedding texts (BERT):", text_dim)
print("Dimension embedding user (mean):", user_dim)

# The user table should contain exactly one row per distinct author
n_user_rows = user_embeddings.shape[0]
n_unique_users = merged_df['author'].nunique()

print("\n=== Final Shape ===")
print("Users num:", n_user_rows)
print("Unique users num:", n_unique_users)

=== User Embeddings Dimensions ===
Dimension embedding texts (BERT): 768
Dimension embedding user (mean): 768

=== Final Shape ===
Users num: 3348
Unique users num: 3348


In [14]:
# Saving embeddings separately as NumPy files.
# NOTE(review): `community_embeddings` and `user_embeddings` are DataFrames whose
# embedding column holds Python lists, so np.save presumably writes them as
# pickled object arrays (readers would need np.load(..., allow_pickle=True)).
# Confirm with the consumers of these files before changing the on-disk format.
np.save('../../../src/nlp/topic_modeling/text_embeddings.npy', embeddings)
np.save('../../../src/nlp/topic_modeling/community_embeddings.npy', community_embeddings)
np.save('../../../src/nlp/topic_modeling/user_embeddings.npy', user_embeddings)

# Saving the final dataframe without the embedding, raw text and degree columns.
# The drop rebinds `merged_df` instead of mutating it in place, and ignores
# columns that are already gone, so re-running this cell no longer fails with
# KeyError after the first execution.
columns_to_drop = [
    'embedding', 'clean_text', 'id_y',
    'indegree', 'outdegree', 'degree',
    'internal_degree', 'external_degree',
    'pct_internal', 'pct_external',
]
merged_df = merged_df.drop(columns=columns_to_drop, errors='ignore')
merged_df.to_csv('../../../src/nlp/topic_modeling/global_metadata.csv', index=False)