In [1]:
!pip install graphrole



In [2]:
import pandas as pd
import snap
from src.load_and_save import load_networkx_directed_graph, save_list, load_list
from graphrole import RecursiveFeatureExtractor, RoleExtractor
import numpy as np
import os.path
import networkx as nx

In [3]:
followers_reduced_path = "data/followers_reduced.edgelist"
feature_extraction_path = "data/feature_extractions.csv"

In [4]:
followers_reduced_graph = load_networkx_directed_graph(followers_reduced_path, weighted=True)

# Extracting features

In [5]:
followers_feature_extractor = RecursiveFeatureExtractor(followers_reduced_graph)
followers_features = followers_feature_extractor.extract_features()
followers_features

Unnamed: 0,in_degree(mean)(mean)(mean)(mean)(mean)(mean),in_degree(mean)(mean)(mean)(mean)(mean),external_edges(mean)(mean)(mean)(mean),in_degree(mean)(mean)(mean)(mean),in_degree(sum)(mean)(mean)(mean),external_edges(mean)(mean)(mean),external_edges(sum)(mean)(mean),in_degree(mean)(mean)(mean),in_degree(sum)(mean)(mean),total_degree(sum)(mean)(mean),...,internal_edges(mean),internal_edges(sum),out_degree(mean),total_degree(mean),total_degree(sum),external_edges,in_degree,internal_edges,out_degree,total_degree
2,378.806888,386.939511,3946.214369,398.161717,24233.771668,3841.426413,297102.043671,421.710482,25385.205366,30047.132487,...,1026.105263,58488.0,66.350877,660.877193,37670.0,3149,654,690,57,711
3,379.050907,383.313239,3984.477170,391.563404,23576.126641,3869.961766,262825.819781,392.751203,21392.758349,25606.420354,...,692.833333,12471.0,56.000000,488.388889,8791.0,902,41,124,18,59
4,377.329381,379.375909,4233.439899,381.143598,24401.884456,4252.216193,344096.564927,382.548051,24458.767015,29738.818392,...,862.250980,219874.0,65.603922,373.635294,95277.0,11852,1494,5132,255,1749
5,384.391760,389.687407,4439.565723,396.509063,26853.931382,4557.419059,421048.376910,407.883068,29373.616713,35644.775406,...,1311.666667,55090.0,80.047619,653.285714,27438.0,2914,445,490,42,487
6,384.197811,386.219248,4319.218987,386.600495,25061.801511,4374.431622,362984.602354,391.524500,25824.583046,31399.966771,...,1345.264151,71299.0,90.132075,524.622642,27805.0,4298,2256,532,53,2309
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
444546,222.692949,196.722668,1957.281913,165.964317,8506.772603,1761.431556,100632.312500,120.911233,7017.925000,9583.837500,...,1957.000000,1957.0,80.000000,305.000000,305.0,80,0,1,1,1
446560,436.585144,458.978599,4930.984043,497.973987,30389.125819,4765.589368,334776.867479,559.365943,29412.638908,34368.082972,...,370.833333,2225.0,29.000000,1108.500000,6651.0,167,0,13,6,6
449779,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.0,0.000000,5.000000,5.0,0,0,1,1,1
456041,379.617720,399.456244,5694.731442,434.618310,27990.319065,5555.092541,450854.313589,533.870885,24679.291812,30724.012195,...,851.000000,1702.0,48.000000,1466.500000,2933.0,94,0,4,2,2


In [6]:
# DataFrame to manipulate
followers_features_clustering = followers_features[["external_edges", "in_degree", "internal_edges", "out_degree", "total_degree"]].copy()

In [7]:
followers_features_clustering

Unnamed: 0,external_edges,in_degree,internal_edges,out_degree,total_degree
2,3149,654,690,57,711
3,902,41,124,18,59
4,11852,1494,5132,255,1749
5,2914,445,490,42,487
6,4298,2256,532,53,2309
...,...,...,...,...,...
444546,80,0,1,1,1
446560,167,0,13,6,6
449779,0,0,1,1,1
456041,94,0,4,2,2


# Adding extra features

## Betweeness centrality

In [8]:
# Loading graph with snap
snap_followers_graph = snap.LoadEdgeList(snap.TNGraph, followers_reduced_path, 0, 1)

In [9]:
# Extrancting betweeness centrality
Nodes, _ = snap_followers_graph.GetBetweennessCentr(1.0, False)

In [10]:
# Collecting node number and betweeness centrality
list_nodes = []
list_betwCentrality = []
for node in Nodes:
    list_nodes.append(node)
    list_betwCentrality.append(Nodes[node])
    
followers_features_clustering["Betweeness Centrality"] = list_betwCentrality
followers_features_clustering

Unnamed: 0,external_edges,in_degree,internal_edges,out_degree,total_degree,Betweeness Centrality
2,3149,654,690,57,711,2.271158e+05
3,902,41,124,18,59,1.028602e+05
4,11852,1494,5132,255,1749,3.156500e+06
5,2914,445,490,42,487,1.841585e+04
6,4298,2256,532,53,2309,1.223231e+04
...,...,...,...,...,...,...
444546,80,0,1,1,1,1.274079e+02
446560,167,0,13,6,6,1.694695e+03
449779,0,0,1,1,1,1.029845e+01
456041,94,0,4,2,2,7.775289e+01


## K-core

In [11]:
followers_reduced_noloops_graph = followers_reduced_graph.copy()
followers_reduced_graph.remove_edges_from(nx.selfloop_edges(followers_reduced_graph))
followers_features_clustering["k-core"] = nx.core_number(followers_reduced_graph)
followers_features_clustering

Unnamed: 0,external_edges,in_degree,internal_edges,out_degree,total_degree,Betweeness Centrality,k-core
2,3149,654,690,57,711,2.271158e+05,2
3,902,41,124,18,59,1.028602e+05,5
4,11852,1494,5132,255,1749,3.156500e+06,6
5,2914,445,490,42,487,1.841585e+04,14
6,4298,2256,532,53,2309,1.223231e+04,25
...,...,...,...,...,...,...,...
444546,80,0,1,1,1,1.274079e+02,22295
446560,167,0,13,6,6,1.694695e+03,22297
449779,0,0,1,1,1,1.029845e+01,22343
456041,94,0,4,2,2,7.775289e+01,414595


# Extracting roles

In [12]:
# Using RoleX to extract roles
role_extractor = RoleExtractor()
role_extractor.extract_role_factors(followers_features_clustering)
roles = role_extractor.roles

  kl_div = np.sum(np.where(vec1 != 0, vec1 * np.log(vec1 / vec2) - vec1 + vec2, 0))
  kl_div = np.sum(np.where(vec1 != 0, vec1 * np.log(vec1 / vec2) - vec1 + vec2, 0))
  kl_div = np.sum(np.where(vec1 != 0, vec1 * np.log(vec1 / vec2) - vec1 + vec2, 0))
  kl_div = np.sum(np.where(vec1 != 0, vec1 * np.log(vec1 / vec2) - vec1 + vec2, 0))
  kl_div = np.sum(np.where(vec1 != 0, vec1 * np.log(vec1 / vec2) - vec1 + vec2, 0))
  kl_div = np.sum(np.where(vec1 != 0, vec1 * np.log(vec1 / vec2) - vec1 + vec2, 0))
  kl_div = np.sum(np.where(vec1 != 0, vec1 * np.log(vec1 / vec2) - vec1 + vec2, 0))
  kl_div = np.sum(np.where(vec1 != 0, vec1 * np.log(vec1 / vec2) - vec1 + vec2, 0))
  kl_div = np.sum(np.where(vec1 != 0, vec1 * np.log(vec1 / vec2) - vec1 + vec2, 0))
  kl_div = np.sum(np.where(vec1 != 0, vec1 * np.log(vec1 / vec2) - vec1 + vec2, 0))
  kl_div = np.sum(np.where(vec1 != 0, vec1 * np.log(vec1 / vec2) - vec1 + vec2, 0))
  kl_div = np.sum(np.where(vec1 != 0, vec1 * np.log(vec1 / vec2) - vec1 + ve

  kl_div = np.sum(np.where(vec1 != 0, vec1 * np.log(vec1 / vec2) - vec1 + vec2, 0))
  kl_div = np.sum(np.where(vec1 != 0, vec1 * np.log(vec1 / vec2) - vec1 + vec2, 0))
  kl_div = np.sum(np.where(vec1 != 0, vec1 * np.log(vec1 / vec2) - vec1 + vec2, 0))
  kl_div = np.sum(np.where(vec1 != 0, vec1 * np.log(vec1 / vec2) - vec1 + vec2, 0))
  kl_div = np.sum(np.where(vec1 != 0, vec1 * np.log(vec1 / vec2) - vec1 + vec2, 0))
  kl_div = np.sum(np.where(vec1 != 0, vec1 * np.log(vec1 / vec2) - vec1 + vec2, 0))
  kl_div = np.sum(np.where(vec1 != 0, vec1 * np.log(vec1 / vec2) - vec1 + vec2, 0))
  kl_div = np.sum(np.where(vec1 != 0, vec1 * np.log(vec1 / vec2) - vec1 + vec2, 0))
  kl_div = np.sum(np.where(vec1 != 0, vec1 * np.log(vec1 / vec2) - vec1 + vec2, 0))
  kl_div = np.sum(np.where(vec1 != 0, vec1 * np.log(vec1 / vec2) - vec1 + vec2, 0))
  kl_div = np.sum(np.where(vec1 != 0, vec1 * np.log(vec1 / vec2) - vec1 + vec2, 0))
  kl_div = np.sum(np.where(vec1 != 0, vec1 * np.log(vec1 / vec2) - vec1 + ve

In [13]:
# List of clusters
list_roles = list(roles.values())

In [14]:
# Counting number of nodes in each cluster
clusters, counts = np.unique(list_roles, return_counts=True)
for cluster, count in zip(clusters, counts):
    print(cluster, count)

role_0 3351
role_1 337
role_2 438
role_3 5552
role_4 736
role_5 159
role_6 5926


In [15]:
# Adding cluster column
cluster_df = followers_features_clustering.copy()
cluster_df['cluster'] = list_roles

In [16]:
followers_features_clustering = cluster_df.copy()

In [17]:
followers_features_clustering

Unnamed: 0,external_edges,in_degree,internal_edges,out_degree,total_degree,Betweeness Centrality,k-core,cluster
2,3149,654,690,57,711,2.271158e+05,2,role_1
3,902,41,124,18,59,1.028602e+05,5,role_1
4,11852,1494,5132,255,1749,3.156500e+06,6,role_1
5,2914,445,490,42,487,1.841585e+04,14,role_1
6,4298,2256,532,53,2309,1.223231e+04,25,role_3
...,...,...,...,...,...,...,...,...
444546,80,0,1,1,1,1.274079e+02,22295,role_6
446560,167,0,13,6,6,1.694695e+03,22297,role_6
449779,0,0,1,1,1,1.029845e+01,22343,role_6
456041,94,0,4,2,2,7.775289e+01,414595,role_6


# Save to file

In [18]:
# Saving data frame with roles/clusters included
followers_features_clustering.to_csv(feature_extraction_path)