In [1]:
import pandas as pd
import torch
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import normalize

2025-08-28 10:26:31.726392: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-08-28 10:26:32.235454: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1756369592.435684    4800 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1756369592.494330    4800 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1756369592.889064    4800 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

In [2]:
# Read the two CSV inputs used by the rest of the notebook.
text_df = pd.read_csv('../../src/nlp/cleaned_dataset.csv')
community_df = pd.read_csv('../../src/data/distribuitions/hub_bridge_df.csv')

# Inner join: only rows of text_df whose `author` equals an `id` in
# community_df survive; unmatched rows on either side are dropped.
merged_df = pd.merge(
    text_df,
    community_df,
    how='inner',
    left_on='author',
    right_on='id',
)

In [3]:
# Use the GPU when torch can see one; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device : {device}")

# Load the pretrained sentence-embedding model and move it onto the chosen
# device. `.to()` hands back the model, so the name is bound to the moved model.
embedding_model = SentenceTransformer('all-MiniLM-L6-v2').to(device)

# Last expression of the cell: display the model architecture.
embedding_model

Using device : cuda


SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False, 'architecture': 'BertModel'})
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)

In [4]:
# Encode every entry of the `clean_text` column with the embedding model
# (one vector per row of merged_df), showing a progress bar while it runs.
embeddings = embedding_model.encode(
    merged_df['clean_text'].to_list(),
    show_progress_bar=True,
)

# Store each row's vector next to its text as a plain Python list.
merged_df['embedding'] = [vector.tolist() for vector in embeddings]

Batches:   0%|          | 0/7215 [00:00<?, ?it/s]

In [5]:
def _weighted_community_mean(group):
    """Return the weighted average embedding of one community's rows.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows belonging to a single community; must provide the columns
        'author' and 'embedding' (one equal-length vector per row).

    Returns
    -------
    numpy.ndarray
        1-D array: the average of the row embeddings, where each row is
        weighted by how many rows its author has inside this group.
    """
    row_vectors = np.stack(group['embedding'])
    rows_per_author = group['author'].value_counts()
    # Every row receives its author's row count as weight, so authors with
    # many rows in the community influence the average more than
    # proportionally to their number of rows.
    row_weights = group['author'].map(rows_per_author).to_numpy()
    return np.average(row_vectors, axis=0, weights=row_weights)


# Select only the columns the helper needs before calling `.apply`: applying
# over the full frame also hands the grouping column to the function, which
# pandas has deprecated (it emits a warning on every run).
community_embeddings = (
    merged_df
    .groupby('community_id')[['author', 'embedding']]
    .apply(_weighted_community_mean)
    .reset_index(name='weighted_community_embedding')
)

# Rescale every community vector (row-wise) with sklearn's `normalize`,
# then store the vectors back as plain Python lists.
community_embeddings['weighted_community_embedding'] = normalize(
    np.stack(community_embeddings['weighted_community_embedding']),
    axis=1,
).tolist()

# Last expression of the cell: display the resulting table.
community_embeddings

  .apply(lambda df: np.average(np.stack(df['embedding']), axis=0, weights=df['author'].map(df['author'].value_counts())))


Unnamed: 0,community_id,weighted_community_embedding
0,0,"[-0.03174944504430175, 9.456400530341377e-05, ..."
1,1,"[-0.023215748688427282, 0.018855841962647446, ..."
2,2,"[-0.07201282766770374, 0.045006119540975595, 0..."
3,3,"[0.012736148951822436, -0.020084389952517545, ..."
4,4,"[-0.08430956569403694, 0.02955804556314459, 0...."
5,5,"[-0.007467192071303748, -0.013673752978067557,..."
6,6,"[-0.01995521238340613, 0.011183031202999363, 0..."
7,7,"[0.0019133019587764019, 0.061068003321766824, ..."
8,8,"[-0.0047000162077750615, 0.004532813523014483,..."
9,9,"[0.021230122640409306, -0.0028823499521320804,..."


In [6]:
# Sanity check: compare the length of one per-text vector with the length
# of one per-community vector (first row of each table).
print("=== Embeddings Dimensions ===")
print("Dimension embedding texts (BERT):", len(merged_df['embedding'].iat[0]))
print("Dimension embedding community (weighted):", len(community_embeddings['weighted_community_embedding'].iat[0]))

# Row count of the community table and number of distinct authors in the merged data.
print("\n=== Final Shape ===")
print("Community num:", community_embeddings.shape[0])
print("Unique users num:", merged_df['author'].nunique())

=== Embeddings Dimensions ===
Dimension embedding texts (BERT): 384
Dimension embedding community (weighted): 384

=== Final Shape ===
Community num: 35
Unique users num: 7787


In [7]:
# Build one vector per author in a single chain:
#   1. average all of the author's row embeddings (unweighted mean),
#   2. turn the grouped result into a two-column table,
#   3. rescale every vector row-wise with sklearn's `normalize` and store
#      the vectors back as plain Python lists.
user_embeddings = (
    merged_df
    .groupby('author')['embedding']
    .apply(lambda vectors: np.stack(vectors).mean(axis=0))
    .reset_index(name='mean_user_embedding')
    .assign(
        mean_user_embedding=lambda frame: normalize(
            np.stack(frame['mean_user_embedding']), axis=1
        ).tolist()
    )
)

# Last expression of the cell: display the resulting table.
user_embeddings

Unnamed: 0,author,mean_user_embedding
0,---AI---,"[-0.11233255379432303, 0.002103749525428783, -..."
1,---why-so-serious---,"[-0.005073964089189215, 0.014678458098814027, ..."
2,--o,"[0.0002577067735751773, 0.02312159432734466, 0..."
3,-C4-,"[0.005107942593273136, -0.023586260724990236, ..."
4,-Ch4s3-,"[-0.04254238919181533, -0.005530801973817223, ..."
...,...,...
7782,zupobaloop,"[0.003788088102603537, -0.03283912411101916, -..."
7783,zuriel45,"[0.007112084310772595, 0.0063828685856433766, ..."
7784,zxc999,"[0.06414996757788313, -0.026140490378372884, 0..."
7785,zxxQQz,"[-0.05887796116424575, -4.0908086648957624e-05..."


In [8]:
# Sanity check: compare the length of one per-text vector with the length
# of one per-user vector (first row of each table).
print("=== User Embeddings Dimensions ===")
print("Dimension embedding texts (BERT):", len(merged_df['embedding'].iat[0]))
print("Dimension embedding user (mean):", len(user_embeddings['mean_user_embedding'].iat[0]))

# The user table should hold one row per distinct author of the merged data.
print("\n=== Final Shape ===")
print("Users num:", user_embeddings.shape[0])
print("Unique users num:", merged_df['author'].nunique())

=== User Embeddings Dimensions ===
Dimension embedding texts (BERT): 384
Dimension embedding user (mean): 384

=== Final Shape ===
Users num: 7787
Unique users num: 7787


In [9]:
# Persist each embedding collection to its own .npy file.
# NOTE(review): `community_embeddings` and `user_embeddings` are DataFrames,
# so np.save first converts them to arrays. Column labels are not stored, and
# because the vector columns hold Python lists the arrays are expected to be
# object-dtype and written via pickle -- loading them back would then need
# np.load(..., allow_pickle=True). Confirm this is what the consumers expect.
np.save(file='../../src/nlp/text_embeddings.npy', arr=embeddings, allow_pickle=True)
np.save(file='../../src/nlp/community_embeddings.npy', arr=community_embeddings, allow_pickle=True)
np.save(file='../../src/nlp/user_embeddings.npy', arr=user_embeddings, allow_pickle=True)