In [1]:
import pandas as pd
import torch
import numpy as np
import tqdm
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer

2025-08-12 15:25:17.965682: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-08-12 15:25:18.391800: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1755005118.576901    9786 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1755005118.629458    9786 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1755005119.042616    9786 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

In [2]:
# Read the cleaned text table and the hub/bridge community table, then keep
# only the text rows whose `author` matches an `id` in the community table.
text_df = pd.read_csv('../../../src/nlp/cleaned_dataset.csv')
community_df = pd.read_csv('../../../src/data/distribuitions/hub_bridge_df.csv')
merged_df = pd.merge(
    text_df,
    community_df,
    how='inner',
    left_on='author',
    right_on='id',
)

In [3]:
# Use the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device : {device}")

# Load the sentence-embedding model and move it onto the selected device.
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
embedding_model.to(device)

# Coerce clean_text to strings (missing values become ''), then discard rows
# that are empty or whitespace-only. The index is rebuilt so that row
# positions stay aligned with the per-document results of the fit below.
merged_df['clean_text'] = merged_df['clean_text'].fillna('').astype(str)
merged_df = merged_df.loc[
    lambda frame: frame['clean_text'].str.strip() != ''
].reset_index(drop=True)

# Fit the topic model on the remaining texts using the embedding model above.
# NOTE(review): no random seed is configured here, so topic assignments may
# vary between runs — confirm whether reproducibility is required.
topic_model = BERTopic(embedding_model=embedding_model)
topics, probs = topic_model.fit_transform(merged_df['clean_text'].astype(str))

Using device : cuda



The TBB threading layer requires TBB version 2021 update 6 or later i.e., TBB_INTERFACE_VERSION >= 12060. Found TBB_INTERFACE_VERSION = 12050. The TBB threading layer is disabled.



In [4]:
# Build a lookup from numeric topic id to the topic name reported by the model.
topic_info = topic_model.get_topic_info()
topic_id_to_name = {
    topic_id: topic_name
    for topic_id, topic_name in zip(topic_info['Topic'], topic_info['Name'])
}

# Attach the readable topic name and the raw topic id to every row; `topics`
# is positionally aligned with merged_df.
merged_df['topic'] = list(map(topic_id_to_name.__getitem__, topics))
merged_df['topic_id'] = topics

In [6]:
# Encode every cleaned text with the sentence-embedding model.
# NOTE(review): the topic model was fitted with this same embedding model in an
# earlier cell, so the texts are presumably encoded twice — consider reusing
# one set of embeddings.
embeddings = embedding_model.encode(
    merged_df['clean_text'].to_list(), show_progress_bar=True
)

# Keep one embedding vector per row, stored as a plain Python list.
merged_df['embedding'] = embeddings.tolist()

Batches:   0%|          | 0/3092 [00:00<?, ?it/s]

### Computing the mean embedding for each community in the dataframe

In [7]:
# Average the per-row embedding vectors within each community_id group and
# return one row per community with its mean vector.
embeddings_by_community = merged_df.groupby('community_id')['embedding']
community_embeddings = embeddings_by_community.apply(
    lambda vectors: np.stack(vectors).mean(axis=0)
).reset_index(name='mean_community_embedding')

community_embeddings

Unnamed: 0,community_id,mean_community_embedding
0,0.0,"[0.008139040812967555, -0.009026953029083103, ..."
1,1.0,"[-0.012794963880515417, 0.0030995555456795458,..."
2,2.0,"[-0.03335555065166391, -0.018512342972571094, ..."
3,3.0,"[0.019291695096525827, 0.026163340886505567, -..."
4,4.0,"[0.006346587028317098, -0.010347772262486067, ..."
...,...,...
73,73.0,"[0.007042493369804922, -0.017766871474659406, ..."
74,74.0,"[-0.009027910028952498, 0.0013565727598900193,..."
75,75.0,"[0.0024505718904084717, 0.008327407851212252, ..."
76,76.0,"[-0.02263737464518323, 0.02110920038082805, 0...."


In [8]:
# Save the embeddings separately as NumPy files.
np.save('../../../src/nlp/topic_modeling/user_embeddings.npy', embeddings)
# NOTE(review): community_embeddings is a DataFrame whose vector column holds
# array objects, so np.save stores it as a pickled object array and readers
# need np.load(..., allow_pickle=True). The call is left unchanged so the file
# format stays compatible with any existing consumer.
np.save('../../../src/nlp/topic_modeling/global_embeddings.npy', community_embeddings)

# Save the final dataframe without the embedding, raw text and graph-metric
# columns. The drop reassigns instead of using inplace=True and passes
# errors='ignore', so re-running this cell no longer raises KeyError on
# columns that a previous run already removed.
merged_df = merged_df.drop(
    columns=[
        'embedding',
        'clean_text',
        'id_y',
        'indegree',
        'outdegree',
        'degree',
        'internal_degree',
        'external_degree',
        'pct_internal',
        'pct_external',
    ],
    errors='ignore',
)
merged_df.to_csv('../../../src/nlp/topic_modeling/global_metadata.csv', index=False)