In [1]:
!pip install graphrole



In [2]:
import pandas as pd
import snap
from src.load_and_save import load_networkx_directed_graph, save_list, load_list
from graphrole import RecursiveFeatureExtractor, RoleExtractor
import numpy as np

In [3]:
spreading_path = "data/spreading.edgelist"
spreading_features_path = "data/spreading_features.csv"
followers_features_path = "data/followers_features.csv"
feature_extraction_path = "data/feature_extractions.csv"

In [4]:
spreading_graph = load_networkx_directed_graph(spreading_path, weighted=True)

# Extracting features

In [5]:
# Using RoleX to extract feautures
spreading_feature_extractor = RecursiveFeatureExtractor(spreading_graph)
spreading_features = spreading_feature_extractor.extract_features()
spreading_features.to_csv(spreading_features_path)

In [6]:
# DataFrame to manipulate
feature_df = spreading_features.copy()

In [7]:
# Adding node column
feature_df["node"] = list(feature_df.index)

In [8]:
# Updating index
#feature_df.index = range(0,(len(feature_df)))

In [9]:
feature_df

Unnamed: 0,external_edges(mean)(mean)(mean)(mean)(mean)(mean)(mean)(mean),external_edges(mean)(mean)(mean)(mean)(mean)(mean)(mean),external_edges(mean)(mean)(mean)(mean)(mean)(mean),external_edges(mean)(mean)(mean)(mean)(mean),in_degree(mean)(mean)(mean)(mean)(mean),external_edges(mean)(mean)(mean)(mean),external_edges(mean)(mean)(mean)(sum),in_degree(mean)(mean)(mean)(mean),in_degree(sum)(mean)(mean)(mean),internal_edges(mean)(mean)(mean)(mean),...,internal_edges(sum),out_degree(sum),total_degree(mean),total_degree(sum),external_edges,in_degree,internal_edges,out_degree,total_degree,node
1,0.370745,0.559823,0.847296,1.224214,1.219500,1.900549,22.806586,2.233415,16.937434,0.519168,...,26.640230,18.120690,4.672605,56.071264,17.820690,2.411494,2.150575,1.850575,4.262069,1
2,0.404920,0.620920,0.978132,1.455130,1.533108,2.229373,71.339931,2.267897,16.122920,0.581757,...,38.564368,27.449425,3.342170,106.949425,25.349425,3.167816,5.300000,3.200000,6.367816,2
3,0.348013,0.542532,0.821582,1.243467,1.400035,1.848926,5.546778,2.051266,17.060422,0.457335,...,1.000000,0.700000,3.500000,10.500000,0.700000,0.200000,0.300000,0.300000,0.500000,3
4,0.327690,0.499998,0.734351,1.121674,1.276565,1.648318,105.492354,1.692854,13.528869,0.409399,...,77.752874,50.422989,4.203538,269.026437,45.148276,14.882759,12.363218,7.088506,21.971264,4
5,0.152267,0.219351,0.331582,0.515192,0.546967,0.753334,3.766671,0.680317,7.885884,0.189435,...,12.760920,6.254023,16.437471,82.187356,6.154023,1.200000,0.600000,0.500000,1.700000,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
422665,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.100000,0.000000,0.000000,0.100000,422665
428003,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.100000,0.000000,0.000000,0.100000,428003
433090,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.100000,0.000000,0.000000,0.100000,433090
444546,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.100000,0.000000,0.000000,0.100000,444546


# Adding extra features

## Betweeness centrality

In [10]:
# Loading graph with snap
snap_spreading_graph = snap.LoadEdgeList(snap.TNGraph, spreading_path, 0, 1)

In [11]:
# Extrancting betweeness centrality
Nodes, _ = snap_spreading_graph.GetBetweennessCentr(1.0, False)

In [12]:
# Collecting node number and betweeness centrality
list_nodes = []
list_betwCentrality = []
for node in Nodes:
    list_nodes.append(node)
    list_betwCentrality.append([Nodes[node]])    

In [13]:
# Test to see matches cluster_df node numbers 
cluster_list_nodes = list(feature_df['node'])
print(len(set(list_nodes) & set(cluster_list_nodes))/len(list_nodes)*100, "%")

100.0 %


In [14]:
# construncting data frame with node and corr. betweeness centrality
data = {"node": list_nodes, "Betweeness Centrality": list_betwCentrality}
betweeness_centrality_df = pd.DataFrame.from_dict(data = data)
betweeness_centrality_df 

Unnamed: 0,node,Betweeness Centrality
0,2,[1833689.6614537227]
1,314,[0.0]
2,77,[3060871.3571301405]
3,291,[460113.467320175]
4,290,[736090.5252858942]
...,...,...
19220,6043,[0.0]
19221,22399,[97072.81523580353]
19222,5987,[0.0]
19223,22401,[0.0]


In [15]:
# adding feature to origin DF
added_feature_df = feature_df.copy()
added_feature_df = pd.merge(added_feature_df, betweeness_centrality_df , on='node', how='left')
added_feature_df

Unnamed: 0,external_edges(mean)(mean)(mean)(mean)(mean)(mean)(mean)(mean),external_edges(mean)(mean)(mean)(mean)(mean)(mean)(mean),external_edges(mean)(mean)(mean)(mean)(mean)(mean),external_edges(mean)(mean)(mean)(mean)(mean),in_degree(mean)(mean)(mean)(mean)(mean),external_edges(mean)(mean)(mean)(mean),external_edges(mean)(mean)(mean)(sum),in_degree(mean)(mean)(mean)(mean),in_degree(sum)(mean)(mean)(mean),internal_edges(mean)(mean)(mean)(mean),...,out_degree(sum),total_degree(mean),total_degree(sum),external_edges,in_degree,internal_edges,out_degree,total_degree,node,Betweeness Centrality
0,0.370745,0.559823,0.847296,1.224214,1.219500,1.900549,22.806586,2.233415,16.937434,0.519168,...,18.120690,4.672605,56.071264,17.820690,2.411494,2.150575,1.850575,4.262069,1,[481208.1487228568]
1,0.404920,0.620920,0.978132,1.455130,1.533108,2.229373,71.339931,2.267897,16.122920,0.581757,...,27.449425,3.342170,106.949425,25.349425,3.167816,5.300000,3.200000,6.367816,2,[1833689.6614537227]
2,0.348013,0.542532,0.821582,1.243467,1.400035,1.848926,5.546778,2.051266,17.060422,0.457335,...,0.700000,3.500000,10.500000,0.700000,0.200000,0.300000,0.300000,0.500000,3,[33090.95583629351]
3,0.327690,0.499998,0.734351,1.121674,1.276565,1.648318,105.492354,1.692854,13.528869,0.409399,...,50.422989,4.203538,269.026437,45.148276,14.882759,12.363218,7.088506,21.971264,4,[8116414.438216583]
4,0.152267,0.219351,0.331582,0.515192,0.546967,0.753334,3.766671,0.680317,7.885884,0.189435,...,6.254023,16.437471,82.187356,6.154023,1.200000,0.600000,0.500000,1.700000,5,[597805.2075550914]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19220,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.100000,0.000000,0.000000,0.100000,422665,[0.0]
19221,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.100000,0.000000,0.000000,0.100000,428003,[0.0]
19222,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.100000,0.000000,0.000000,0.100000,433090,[0.0]
19223,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.100000,0.000000,0.000000,0.100000,444546,[0.0]


In [16]:
# Updating manipulating DF
feature_df = added_feature_df.copy()

In [17]:
feature_df

Unnamed: 0,external_edges(mean)(mean)(mean)(mean)(mean)(mean)(mean)(mean),external_edges(mean)(mean)(mean)(mean)(mean)(mean)(mean),external_edges(mean)(mean)(mean)(mean)(mean)(mean),external_edges(mean)(mean)(mean)(mean)(mean),in_degree(mean)(mean)(mean)(mean)(mean),external_edges(mean)(mean)(mean)(mean),external_edges(mean)(mean)(mean)(sum),in_degree(mean)(mean)(mean)(mean),in_degree(sum)(mean)(mean)(mean),internal_edges(mean)(mean)(mean)(mean),...,out_degree(sum),total_degree(mean),total_degree(sum),external_edges,in_degree,internal_edges,out_degree,total_degree,node,Betweeness Centrality
0,0.370745,0.559823,0.847296,1.224214,1.219500,1.900549,22.806586,2.233415,16.937434,0.519168,...,18.120690,4.672605,56.071264,17.820690,2.411494,2.150575,1.850575,4.262069,1,[481208.1487228568]
1,0.404920,0.620920,0.978132,1.455130,1.533108,2.229373,71.339931,2.267897,16.122920,0.581757,...,27.449425,3.342170,106.949425,25.349425,3.167816,5.300000,3.200000,6.367816,2,[1833689.6614537227]
2,0.348013,0.542532,0.821582,1.243467,1.400035,1.848926,5.546778,2.051266,17.060422,0.457335,...,0.700000,3.500000,10.500000,0.700000,0.200000,0.300000,0.300000,0.500000,3,[33090.95583629351]
3,0.327690,0.499998,0.734351,1.121674,1.276565,1.648318,105.492354,1.692854,13.528869,0.409399,...,50.422989,4.203538,269.026437,45.148276,14.882759,12.363218,7.088506,21.971264,4,[8116414.438216583]
4,0.152267,0.219351,0.331582,0.515192,0.546967,0.753334,3.766671,0.680317,7.885884,0.189435,...,6.254023,16.437471,82.187356,6.154023,1.200000,0.600000,0.500000,1.700000,5,[597805.2075550914]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19220,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.100000,0.000000,0.000000,0.100000,422665,[0.0]
19221,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.100000,0.000000,0.000000,0.100000,428003,[0.0]
19222,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.100000,0.000000,0.000000,0.100000,433090,[0.0]
19223,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.100000,0.000000,0.000000,0.100000,444546,[0.0]


## Kcoreness

In [18]:
#KCorenessList = getKcorenessForEachNode()

# Extracting roles

In [19]:
# Using RoleX to extract roles
role_extractor = RoleExtractor()
role_extractor.extract_role_factors(feature_df)
roles = role_extractor.roles

  min_cost = np.nanmin(costs)


IndexError: index 0 is out of bounds for axis 0 with size 0

In [None]:
# List of clusters
list_roles = list(roles.values())

In [None]:
# Counting number of nodes in each cluster
clusters, counts = np.unique(list_roles, return_counts=True)
for cluster, count in zip(clusters, counts):
    print(cluster, count)

In [None]:
# Adding cluster column
cluster_df = feature_df.copy()
cluster_df['cluster'] = list_roles

In [None]:
feature_df = cluster_df.copy()

In [None]:
feature_df

# Save to file

In [None]:
# Saving data frame with roles/clusters included
feature_df.to_csv(feature_extraction_path)

# Documentation

In [None]:
graphrole.__file__