In [1]:
import pandas as pd
import torch
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import normalize

2025-08-25 11:19:49.554393: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-08-25 11:19:50.037373: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1756113590.192763    4481 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1756113590.249478    4481 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1756113590.647057    4481 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

In [2]:
# Load the cleaned texts and the per-user community assignments from disk.
text_df = pd.read_csv('../../src/nlp/cleaned_dataset.csv')
community_df = pd.read_csv('../../src/data/distribuitions/hub_bridge_df.csv')

# Inner join: only rows whose `author` matches an `id` in the community table
# are kept, so texts from authors without a community row are dropped.
merged_df = pd.merge(
    text_df,
    community_df,
    how='inner',
    left_on='author',
    right_on='id',
)

In [3]:
# Use the GPU when PyTorch can see a CUDA device; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device : {device}")

# Load the pretrained sentence encoder and move it to the selected device.
# The trailing expression is left bare so the notebook displays the model layout.
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
embedding_model.to(device)

Using device : cuda


SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False, 'architecture': 'BertModel'})
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)

In [4]:
# Encode every cleaned text with the sentence encoder, showing a progress bar.
embeddings = embedding_model.encode(
    sentences=merged_df['clean_text'].to_list(),
    show_progress_bar=True,
)

# Attach each row's vector to its text as a plain Python list (one list per row),
# so later cells can group and average them per community / per author.
merged_df['embedding'] = embeddings.tolist()

Batches:   0%|          | 0/7296 [00:00<?, ?it/s]

In [5]:
# Average the per-text vectors of each community into a single vector per community_id.
community_groups = merged_df.groupby('community_id')['embedding']
community_embeddings = community_groups.apply(
    lambda vectors: np.stack(vectors).mean(axis=0)
).reset_index(name='mean_community_embedding')

# Rescale every community vector to unit L2 length (row-wise), then store the
# rows back in the frame as Python lists.
community_matrix = np.stack(community_embeddings['mean_community_embedding'])
community_embeddings['mean_community_embedding'] = normalize(
    community_matrix, norm='l2', axis=1
).tolist()
community_embeddings


Unnamed: 0,community_id,mean_community_embedding
0,0,"[0.00878261319496715, -0.0016162803461459067, ..."
1,1,"[-0.026222462309150825, -0.008213285717627689,..."
2,2,"[-0.06378324591227912, 0.04083870989192552, 0...."
3,3,"[0.015568125775501394, -0.0289841665404282, 0...."
4,4,"[-0.07016679104256716, 0.02509450006785376, 0...."
5,5,"[6.501510459230815e-05, -0.0158215701100829, 0..."
6,6,"[-0.019953043890167808, 0.017036462758824665, ..."
7,7,"[0.01725681034941588, -0.001611782874778222, -..."
8,8,"[0.001840263030001201, 0.055733522412306044, 0..."
9,9,"[0.0002876983190575259, 0.0033747137398717673,..."


In [6]:
# Vector lengths: one per-text embedding and one averaged community embedding.
text_embedding_dim = len(merged_df['embedding'].iat[0])
community_embedding_dim = len(community_embeddings['mean_community_embedding'].iat[0])

print("=== Embeddings Dimensions ===")
print("Dimension embedding texts (BERT):", text_embedding_dim)
print("Dimension embedding community (mean):", community_embedding_dim)

# Row count of the community table and number of distinct authors in the merged data.
print("\n=== Final Shape ===")
print("Community num:", community_embeddings.shape[0])
print("Unique users num:", merged_df['author'].nunique())

=== Embeddings Dimensions ===
Dimension embedding texts (BERT): 384
Dimension embedding community (mean): 384

=== Final Shape ===
Community num: 41
Unique users num: 7872


In [7]:
# Average the per-text vectors of each author into a single vector per author.
author_groups = merged_df.groupby('author')['embedding']
user_embeddings = author_groups.apply(
    lambda vectors: np.stack(vectors).mean(axis=0)
).reset_index(name='mean_user_embedding')

# Rescale every author vector to unit L2 length (row-wise), then store the
# rows back in the frame as Python lists.
user_matrix = np.stack(user_embeddings['mean_user_embedding'])
user_embeddings['mean_user_embedding'] = normalize(
    user_matrix, norm='l2', axis=1
).tolist()
user_embeddings

Unnamed: 0,author,mean_user_embedding
0,---AI---,"[-0.11233255038217432, 0.002103750300523153, -..."
1,---why-so-serious---,"[-0.005073963428776478, 0.014678470203448066, ..."
2,--o,"[-0.005540001656924998, 0.02029038711938278, 0..."
3,-C4-,"[0.005107945403411477, -0.023586249655908293, ..."
4,-Ch4s3-,"[-0.04254239356714327, -0.005530797121095537, ..."
...,...,...
7867,zupobaloop,"[0.0037880761863261793, -0.03283913947585479, ..."
7868,zuriel45,"[0.007112056218372102, 0.0063828485978313495, ..."
7869,zxc999,"[0.06414997352131481, -0.026140493759473873, 0..."
7870,zxxQQz,"[-0.05887795172817127, -4.090400717071015e-05,..."


In [8]:
# Vector lengths: one per-text embedding and one averaged author embedding.
per_text_dim = len(merged_df['embedding'].iat[0])
per_user_dim = len(user_embeddings['mean_user_embedding'].iat[0])

print("=== User Embeddings Dimensions ===")
print("Dimension embedding texts (BERT):", per_text_dim)
print("Dimension embedding user (mean):", per_user_dim)

# The two counts below should agree: one output row per distinct author.
print("\n=== Final Shape ===")
print("Users num:", user_embeddings.shape[0])
print("Unique users num:", merged_df['author'].nunique())

=== User Embeddings Dimensions ===
Dimension embedding texts (BERT): 384
Dimension embedding user (mean): 384

=== Final Shape ===
Users num: 7872
Unique users num: 7872


In [9]:
# Saving embeddings separately as NumPy files.
#
# `community_embeddings` and `user_embeddings` are DataFrames (an id column plus
# a column of Python lists). Passing a DataFrame straight to np.save() coerces it
# to a 2-column object array that NumPy has to pickle, so the resulting file is
# not a numeric matrix and cannot be read back with a default np.load() call.
# Each embedding column is therefore stacked into a 2-D float matrix first, and
# the ids are written to a companion file whose row order matches the matrix.
# allow_pickle=False makes any accidental object array fail loudly at save time.
#
# NOTE(review): loaders that relied on the old pickled layout (id in column 0,
# list in column 1) must switch to the matrix + ids files — confirm downstream use.

# One row per input text, in `merged_df` row order.
np.save('../../src/nlp/text_embeddings.npy', embeddings, allow_pickle=False)

# One unit-length row per community; row i belongs to community_ids[i].
community_matrix = np.stack(community_embeddings['mean_community_embedding'])
np.save('../../src/nlp/community_embeddings.npy', community_matrix, allow_pickle=False)
np.save(
    '../../src/nlp/community_ids.npy',
    community_embeddings['community_id'].to_numpy(),
    allow_pickle=False,
)

# One unit-length row per author; row i belongs to user_ids[i].
user_matrix = np.stack(user_embeddings['mean_user_embedding'])
np.save('../../src/nlp/user_embeddings.npy', user_matrix, allow_pickle=False)
np.save(
    '../../src/nlp/user_ids.npy',
    # Author names are cast to a fixed-width unicode array so no pickling is needed.
    user_embeddings['author'].to_numpy(dtype=str),
    allow_pickle=False,
)