In [1]:
import pandas as pd
import torch
import numpy as np
import tqdm
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import normalize

2025-08-13 15:00:53.001072: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-08-13 15:00:53.035734: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1755090053.061942    5302 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1755090053.071590    5302 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1755090053.106278    5302 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

In [2]:
# Read the cleaned texts and the table stored in hub_bridge_df.csv, then keep
# only the text rows whose `author` matches an `id` in that table (inner join).
text_df = pd.read_csv('../../../src/nlp/cleaned_dataset.csv')
community_df = pd.read_csv('../../../src/data/distribuitions/hub_bridge_df.csv')
merged_df = pd.merge(
    text_df,
    community_df,
    how='inner',
    left_on='author',
    right_on='id',
)

In [3]:
# Run on the GPU when torch can see one, otherwise on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device : {device}")

# Sentence encoder shared by BERTopic and by the later embedding step.
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
embedding_model.to(device)

# Turn missing texts into empty strings, force everything to `str`, then drop
# rows that are blank after stripping whitespace. The index is rebuilt so it
# runs 0..n-1 over the surviving rows.
merged_df['clean_text'] = merged_df['clean_text'].fillna('').astype(str)
has_text = merged_df['clean_text'].str.strip().ne('')
merged_df = merged_df[has_text].reset_index(drop=True)

# Fit the topic model on the cleaned texts; `topics` holds one topic id per row.
topic_model = BERTopic(embedding_model=embedding_model)
documents = merged_df['clean_text'].astype(str)
topics, probs = topic_model.fit_transform(documents)

Using device : cuda



The TBB threading layer requires TBB version 2021 update 6 or later i.e., TBB_INTERFACE_VERSION >= 12060. Found TBB_INTERFACE_VERSION = 12050. The TBB threading layer is disabled.



In [4]:
# Build a lookup from topic id to the name listed in BERTopic's summary table.
topic_info = topic_model.get_topic_info()
topic_ids = topic_info['Topic']
topic_names = topic_info['Name']
topic_id_to_name = dict(zip(topic_ids, topic_names))

# Attach the readable name first and the raw id second, one value per row.
merged_df['topic'] = list(map(topic_id_to_name.__getitem__, topics))
merged_df['topic_id'] = topics

In [5]:
# Encode every cleaned text with the sentence encoder (progress bar enabled).
texts = merged_df['clean_text'].tolist()
embeddings = embedding_model.encode(texts, show_progress_bar=True)

# Keep one vector per row in the frame, stored as a plain Python list.
merged_df['embedding'] = embeddings.tolist()

Batches:   0%|          | 0/3092 [00:00<?, ?it/s]

### Computing the mean embedding for each community in the dataframe

In [6]:
# Average the text vectors of each community into a single vector per community.
per_community_mean = merged_df.groupby('community_id')['embedding'].apply(
    lambda vectors: np.stack(vectors).mean(axis=0)
)
community_embeddings = per_community_mean.reset_index(name='mean_community_embedding')

# Rescale each community vector to unit length (row-wise, sklearn's default norm)
# and store it back as a plain list.
community_matrix = np.stack(community_embeddings['mean_community_embedding'])
community_embeddings['mean_community_embedding'] = normalize(community_matrix, axis=1).tolist()
community_embeddings


Unnamed: 0,community_id,mean_community_embedding
0,0.0,"[0.020105618885145216, -0.02229900076280347, 0..."
1,1.0,"[-0.0321354957364867, 0.007784762423196177, 0...."
2,2.0,"[-0.0656525252450862, -0.03643717584053105, 0...."
3,3.0,"[0.04115581079380223, 0.05581539112406382, -0...."
4,4.0,"[0.01561084731977141, -0.025452655445941295, 0..."
...,...,...
73,73.0,"[0.01594564734294614, -0.04022783580273586, 0...."
74,74.0,"[-0.023686220419980374, 0.003559193800497408, ..."
75,75.0,"[0.005802778332151095, 0.019718704042567314, -..."
76,76.0,"[-0.0589436785057696, 0.054964585790700314, 0...."


In [7]:
# Sanity check: per-text and per-community vectors should have the same length.
print("=== Embeddings Dimensions ===")
text_dim = len(merged_df['embedding'].iloc[0])
print("Dimension embedding texts (BERT):", text_dim)
community_dim = len(community_embeddings['mean_community_embedding'].iloc[0])
print("Dimension embedding community (mean):", community_dim)

# Row count of the community table and number of distinct authors in the frame.
print("\n=== Final Shape ===")
n_communities = len(community_embeddings)
print("Community num:", n_communities)
n_authors = merged_df['author'].nunique()
print("Unique users num:", n_authors)

=== Embeddings Dimensions ===
Dimension embedding texts (BERT): 384
Dimension embedding community (mean): 384

=== Final Shape ===
Community num: 78
Unique users num: 3348


In [12]:
# Keep only the rows where `is_hub` or `is_bridge` is true. `merged_df` is
# rebound to this subset, so every later cell sees the filtered frame.
hub_or_bridge = merged_df['is_hub'] | merged_df['is_bridge']
merged_df = merged_df.loc[hub_or_bridge]

# Average the text vectors of each author into a single vector per author.
per_user_mean = merged_df.groupby('author')['embedding'].apply(
    lambda vectors: np.stack(vectors).mean(axis=0)
)
user_embeddings = per_user_mean.reset_index(name='mean_user_embedding')

# Rescale each user vector to unit length (row-wise, sklearn's default norm)
# and store it back as a plain list.
user_matrix = np.stack(user_embeddings['mean_user_embedding'])
user_embeddings['mean_user_embedding'] = normalize(user_matrix, axis=1).tolist()
user_embeddings

Unnamed: 0,author,mean_user_embedding
0,---AI---,"[-0.0639383406968702, 0.028263391483561412, -0..."
1,-SOFA-KING-VOTE-,"[0.02537594980285635, 0.020655424394617494, 0...."
2,-spicychilli-,"[0.05242311004935476, 0.011509657517813153, -0..."
3,00rb,"[0.022140607800075204, 0.011875626579590515, 0..."
4,1-objective-opinion,"[-0.003780060558322122, -0.05633407886506631, ..."
...,...,...
1015,yeahmanbombclaut,"[0.04260721291476764, 0.019757095786873232, 0...."
1016,ygmc8413,"[0.0016344143592275596, 0.02063230530266495, 0..."
1017,yyzjertl,"[0.01365876545455135, 0.04090400073064282, 0.0..."
1018,zaoldyeck,"[0.019149896450205813, -0.010847117660616754, ..."


In [14]:
# Sanity check: per-text and per-user vectors should have the same length.
print("=== User Embeddings Dimensions ===")
text_dim = len(merged_df['embedding'].iloc[0])
print("Dimension embedding texts (BERT):", text_dim)
user_dim = len(user_embeddings['mean_user_embedding'].iloc[0])
print("Dimension embedding user (mean):", user_dim)

# One row per author is expected, so both counts below should agree.
print("\n=== Final Shape ===")
n_user_rows = len(user_embeddings)
print("Users num:", n_user_rows)
n_authors = merged_df['author'].nunique()
print("Unique users num:", n_authors)

=== User Embeddings Dimensions ===
Dimension embedding texts (BERT): 384
Dimension embedding user (mean): 384

=== Final Shape ===
Users num: 1020
Unique users num: 1020


In [None]:
# Saving embeddings separately as NumPy files
# `embeddings` is the array returned by the encoder, one row per encoded text.
np.save('../../../src/nlp/topic_modeling/text_embeddings.npy', embeddings)
# NOTE(review): the next two objects are DataFrames, not arrays. np.save has to
# convert them first; with a list-valued column this presumably produces a
# pickled object array without column names, which must be read back with
# np.load(..., allow_pickle=True) - confirm that downstream code expects this.
np.save('../../../src/nlp/topic_modeling/community_embeddings.npy', community_embeddings)
np.save('../../../src/nlp/topic_modeling/user_embeddings.npy', user_embeddings)

# NOTE(review): `embeddings` was computed before merged_df was narrowed to the
# hub/bridge rows, so the rows of text_embeddings.npy probably do not line up
# one-to-one with the rows of the CSV written below - confirm this is intended.

# Saving final dataframe
# The drop builds a new frame instead of using inplace=True: merged_df keeps its
# columns, and re-running this cell no longer fails with a KeyError on columns
# that a previous run already removed. The CSV content is unchanged.
columns_to_drop = [
    'embedding', 'clean_text', 'id_y',
    'indegree', 'outdegree', 'degree',
    'internal_degree', 'external_degree',
    'pct_internal', 'pct_external',
]
global_metadata = merged_df.drop(columns=columns_to_drop)
global_metadata.to_csv('../../../src/nlp/topic_modeling/global_metadata.csv', index=False)