In [24]:
import pandas as pd
import numpy as np
from tqdm import tqdm

In [25]:
# Metadata table that will receive one embedding vector per row (see the assignment below).
embeddings_df = pd.read_csv('../../../src/nlp/topic_modeling/global_metadata.csv')
# Embedding array; presumably one row per row of global_metadata.csv, in the same order -- TODO confirm.
embeddings_npy = np.load('../../../src/nlp/topic_modeling/user_embeddings.npy')
# Per-user roles table; the cells below read its 'id', 'is_hub' and 'is_bridge' columns.
roles_df = pd.read_csv('../../../src/data/distribuitions/hub_bridge_df.csv')

# Split the array along its first axis and attach the pieces positionally: the i-th piece goes to the i-th row.
embeddings_df['embedding'] = list(embeddings_npy) 

In [26]:
# Preview the first rows to check that the new 'embedding' column was attached.
embeddings_df.head()

Unnamed: 0,author,id_x,type,clean_text,id_y,indegree,outdegree,degree,internal_degree,community_id,community_type,external_degree,pct_internal,pct_external,is_hub,is_bridge,topic,embedding
0,PsychLegalMind,t3_1lzrb15,post,trump announced new weapons ukraine monday thr...,PsychLegalMind,0,232,232,224,0.0,Strong community,8,0.965517,0.034483,True,False,17,"[-0.015391379, 0.005025496, 0.03813486, 0.0105..."
1,the_original_Retro,t3_1lys6tq,post,recent days maga outspoken influencers rushing...,the_original_Retro,6,65,71,69,0.0,Strong community,2,0.971831,0.028169,True,False,53,"[0.006398821, -0.040616985, -0.045154493, -0.0..."
2,Time_Minute_6036,t3_1lyzum1,post,might naive question hear donald trump impleme...,Time_Minute_6036,3,267,270,256,0.0,Strong community,14,0.948148,0.051852,True,False,-1,"[-0.053852115, -0.02925583, 0.08934749, -0.011..."
3,najumobi,t3_1lyd2ym,post,biden rhetoric greatly diverged trump posture ...,najumobi,0,197,197,185,0.0,Strong community,12,0.939086,0.060914,True,False,26,"[-0.035550196, 0.0013031296, 0.08044617, -0.03..."
4,Awesomeuser90,t3_1lx5svi,post,kinda hard come something nature perhaps might...,Awesomeuser90,2,170,172,166,0.0,Strong community,6,0.965116,0.034884,True,False,-1,"[0.0059204116, -0.020216445, 0.063375555, -0.0..."


In [27]:
# Calculating mean embedding for hub and bridge users
def calculate_roles_embeddings(roles_df, embeddings_df):
    """Attach a per-user mean embedding to every hub or bridge user.

    Parameters
    ----------
    roles_df : pandas.DataFrame
        One row per user. Must provide the columns ``id``, ``is_hub`` and
        ``is_bridge`` (the last two are combined with ``|`` as a boolean mask).
    embeddings_df : pandas.DataFrame
        Must provide ``author`` and ``embedding``. All vectors belonging to
        the same author must have the same shape so they can be stacked.

    Returns
    -------
    pandas.DataFrame
        A copy of ``roles_df`` with an extra ``mean_embedding`` column. The
        value is the element-wise mean of the author's vectors, or NaN for
        users that are neither hub nor bridge or that have no rows in
        ``embeddings_df``. The input frame is left untouched, so re-running
        the calling cell always starts from the same data.
    """
    # Select hub/bridge users with a single .loc lookup instead of chained indexing.
    role_mask = roles_df['is_hub'] | roles_df['is_bridge']
    users = roles_df.loc[role_mask, 'id'].tolist()

    print(f"Total users to process: {len(users)}")
    print(f"Match with embeddings_df: {len(set(users) & set(embeddings_df['author']))}")

    # One mean vector per author: stack that author's vectors into a 2-D
    # array and average over the first axis.
    mean_embeddings = (
        embeddings_df[embeddings_df['author'].isin(users)]
        .groupby('author')['embedding']
        .apply(lambda x: np.mean(np.stack(x.tolist()), axis=0))
    )

    # Work on a copy so the caller's DataFrame is not modified as a side effect.
    result = roles_df.copy()
    # Series.map looks each id up in the index of mean_embeddings; ids without
    # an entry become NaN.
    result['mean_embedding'] = result['id'].map(mean_embeddings)
    return result


In [28]:
# Rebind roles_df to the returned frame, which carries the 'mean_embedding' column.
roles_df = calculate_roles_embeddings(roles_df=roles_df, embeddings_df=embeddings_df)
# Preview the first hub rows to eyeball their mean embeddings.
roles_df[roles_df['is_hub']].head()

Total users to process: 1020
Match with embeddings_df: 1020


Unnamed: 0,id,indegree,outdegree,degree,internal_degree,community_id,community_type,external_degree,pct_internal,pct_external,is_hub,is_bridge,mean_embedding
115,AlexandrTheTolerable,6,48,54,50,0.0,Strong community,4,0.925926,0.074074,True,False,"[0.040094446, -0.014856148, 0.006937535, -0.03..."
118,Ashmedai,27,31,58,57,0.0,Strong community,1,0.982759,0.017241,True,False,"[-0.0070319534, -0.0030622976, 0.0030205164, -..."
129,Interrophish,26,37,63,61,0.0,Strong community,2,0.968254,0.031746,True,False,"[-0.0028898586, -0.00849698, 0.017251326, -0.0..."
134,ManBearScientist,23,21,44,43,0.0,Strong community,1,0.977273,0.022727,True,False,"[0.00807288, -0.017616635, 0.0152864605, -0.00..."
135,Moccus,34,37,71,66,0.0,Strong community,5,0.929577,0.070423,True,False,"[-0.0011246337, -0.01864815, 0.017223574, -0.0..."


In [29]:
# Keep only the valid mean embeddings (drop NaN/None entries) and save them as one stacked array.
# NOTE(review): no user ids are saved with the array; row i corresponds to the i-th
# roles_df row whose 'mean_embedding' is not null.
valid_embeddings = roles_df['mean_embedding'].dropna().tolist()
np.save('../../../src/data/distribuitions/role_embeddings.npy', np.stack(valid_embeddings))