In [None]:
import pandas as pd
import torch
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import normalize

In [None]:
# Read the cleaned texts and the community table from their CSV files.
text_df = pd.read_csv('../../src/nlp/cleaned_dataset.csv')
community_df = pd.read_csv('../../src/data/distribuitions/hub_bridge_df.csv')

# Inner join: only rows whose text `author` matches an `id` in the
# community table are kept.
merged_df = pd.merge(
    text_df,
    community_df,
    how='inner',
    left_on='author',
    right_on='id',
)

In [None]:
# Pick the GPU when torch reports one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device : {device}")

# Load the sentence-embedding model and move it onto the selected device.
# The `.to(...)` call is left as the last expression so the cell still
# displays its return value.
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
embedding_model.to(device)

In [None]:
# Encode every cleaned text; `embeddings` keeps the raw encoder output so it
# can be saved later.
sentences = merged_df['clean_text'].tolist()
embeddings = embedding_model.encode(sentences, show_progress_bar=True)

# Store one embedding (as a plain Python list) per row of the merged frame.
# NOTE(review): assumes `encode` returns one vector per input text, in input
# order — confirm against the sentence-transformers documentation.
merged_df['embedding'] = embeddings.tolist()

In [None]:
def _weighted_mean_embedding(group):
    """Return the weighted average embedding of one community's rows.

    Each row is weighted by the number of rows its author has inside the
    same group, so authors with more rows pull the average towards their
    embeddings.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of ``merged_df`` for a single ``community_id``; must contain
        the ``author`` and ``embedding`` columns.

    Returns
    -------
    numpy.ndarray
        Weighted average of the group's embeddings, taken along axis 0.
    """
    rows_per_author = group['author'].value_counts()
    row_weights = group['author'].map(rows_per_author)
    return np.average(np.stack(group['embedding']), axis=0, weights=row_weights)


# One aggregated embedding per community. The result column must be called
# 'mean_community_embedding': the normalisation below and the reporting cell
# both read that name (the previous 'weighted_community_embedding' name made
# the next statement fail with a KeyError).
community_embeddings = (
    merged_df.groupby('community_id')
    .apply(_weighted_mean_embedding)
    .reset_index(name='mean_community_embedding')
)

# Rescale every community vector row-wise (axis=1) with sklearn's
# `normalize`, then store the rows back as plain Python lists.
community_embeddings['mean_community_embedding'] = normalize(
    np.stack(community_embeddings['mean_community_embedding']),
    axis=1,
).tolist()
community_embeddings


In [None]:
# Sanity report: vector lengths of a text embedding and of a community
# embedding, taken from the first row of each frame.
print("=== Embeddings Dimensions ===")
first_text_vector = merged_df['embedding'].iloc[0]
print("Dimension embedding texts (BERT):", len(first_text_vector))
first_community_vector = community_embeddings['mean_community_embedding'].iloc[0]
print("Dimension embedding community (mean):", len(first_community_vector))

# Row count of the community table and number of distinct authors.
print("\n=== Final Shape ===")
community_count = len(community_embeddings)
print("Community num:", community_count)
unique_author_count = merged_df['author'].nunique()
print("Unique users num:", unique_author_count)

In [None]:
def _mean_vector(vectors):
    """Return the element-wise mean (axis 0) of a group's embedding vectors."""
    return np.mean(np.stack(vectors), axis=0)


# One averaged embedding per author.
embeddings_by_author = merged_df.groupby('author')['embedding']
user_embeddings = embeddings_by_author.apply(_mean_vector).reset_index(
    name='mean_user_embedding'
)

# Rescale every user vector row-wise (axis=1) with sklearn's `normalize`,
# then store the rows back as plain Python lists.
user_matrix = np.stack(user_embeddings['mean_user_embedding'])
user_embeddings['mean_user_embedding'] = normalize(user_matrix, axis=1).tolist()
user_embeddings

In [None]:
# Sanity report: vector lengths of a text embedding and of a user embedding,
# taken from the first row of each frame.
print("=== User Embeddings Dimensions ===")
first_text_vector = merged_df['embedding'].iloc[0]
print("Dimension embedding texts (BERT):", len(first_text_vector))
first_user_vector = user_embeddings['mean_user_embedding'].iloc[0]
print("Dimension embedding user (mean):", len(first_user_vector))

# Row count of the user table next to the number of distinct authors.
print("\n=== Final Shape ===")
user_count = len(user_embeddings)
print("Users num:", user_count)
unique_author_count = merged_df['author'].nunique()
print("Unique users num:", unique_author_count)

In [None]:
# Save each set of embeddings to its own .npy file.
# NOTE(review): `community_embeddings` and `user_embeddings` are DataFrames,
# so `np.save` presumably writes them as pickled object arrays without column
# names (loading would then need `allow_pickle=True`) — confirm this is the
# format downstream readers expect.
save_targets = [
    ('../../src/nlp/text_embeddings.npy', embeddings),
    ('../../src/nlp/community_embeddings.npy', community_embeddings),
    ('../../src/nlp/user_embeddings.npy', user_embeddings),
]
for target_path, payload in save_targets:
    np.save(target_path, payload)