In [1]:
!pip install graphrole



You should consider upgrading via the 'c:\users\98ric\appdata\local\programs\python\python38\python.exe -m pip install --upgrade pip' command.


In [1]:
import pandas as pd
import snap
from src.load_and_save import load_networkx_directed_graph, save_list, load_list
from graphrole import RecursiveFeatureExtractor, RoleExtractor
import numpy as np
import os.path
import networkx as nx

In [2]:
# Locations used throughout the notebook: the reduced followers edge list that
# is read as input, and the CSV used to cache / persist the feature table.
followers_reduced_path, feature_extraction_path = (
    "data/followers_reduced.edgelist",
    "data/feature_extractions.csv",
)

In [3]:
# Load the reduced followers edge list through the project helper, asking for
# edge weights. The result is later handled with NetworkX calls (copy,
# remove_edges_from, core_number); presumably a DiGraph, per the helper's name.
followers_reduced_graph = load_networkx_directed_graph(followers_reduced_path, weighted=True)

# Reading from file

In [4]:
# Reload the cached feature table and strip the derived columns; they are
# recomputed in the sections that follow.
derived_columns = ["cluster", "Betweeness Centrality", "k-core"]
followers_features_clustering = pd.read_csv(feature_extraction_path, index_col=0).drop(columns=derived_columns)
followers_features_clustering

Unnamed: 0,external_edges,in_degree,internal_edges,out_degree,total_degree
2,3149,654,690,57,711
3,902,41,124,18,59
4,11852,1494,5132,255,1749
5,2914,445,490,42,487
6,4298,2256,532,53,2309
...,...,...,...,...,...
444546,80,0,1,1,1
446560,167,0,13,6,6
449779,0,0,1,1,1
456041,94,0,4,2,2


# Extracting features (skip this section if the features were read from file above)

In [21]:
# Run graphrole's recursive feature extraction on the followers graph.
# NOTE(review): the recorded run of this cell ended in a KeyboardInterrupt, so
# the step appears to be slow; prefer loading the cached CSV when it exists.
followers_feature_extractor = RecursiveFeatureExtractor(followers_reduced_graph)
followers_features = followers_feature_extractor.extract_features()
followers_features

KeyboardInterrupt: 

In [6]:
# Working copy limited to the base structural features, so that later column
# additions do not touch `followers_features` itself.
base_feature_columns = [
    "external_edges",
    "in_degree",
    "internal_edges",
    "out_degree",
    "total_degree",
]
followers_features_clustering = followers_features.loc[:, base_feature_columns].copy()

In [20]:
# Show the working feature frame as a quick visual check.
followers_features_clustering

Unnamed: 0,external_edges,in_degree,internal_edges,out_degree,total_degree,Betweeness Centrality,k-core
2,960,134,318,40,174,5308.949046,38
3,272,17,64,13,30,189.392262,23
4,1845,184,1623,107,291,18148.062181,38
8,898,230,243,26,256,16328.031205,38
9,290,21,40,13,34,375.875398,23
...,...,...,...,...,...,...,...
264163,234,0,17,8,8,17.471860,8
288958,406,0,158,22,22,97.893243,20
312038,54,0,20,5,5,0.974793,5
385575,134,0,24,6,6,1.209667,6


# Adding extra features

## Betweenness centrality

In [5]:
# Load the same edge list a second time, now as a snap TNGraph, so that snap's
# betweenness routine can be used on it.
# The trailing 0 and 1 are presumably the source / destination column
# positions in the file -- confirm against snap's LoadEdgeList documentation.
snap_followers_graph = snap.LoadEdgeList(snap.TNGraph, followers_reduced_path, 0, 1)

In [6]:
# Extracting betweenness centrality. Only the first returned value (indexed by
# node in the next cell) is kept; the second is discarded.
# NOTE(review): (1.0, True) is presumably "use all nodes" and "treat the graph
# as directed" -- confirm against snap's GetBetweennessCentr documentation.
Nodes, _ = snap_followers_graph.GetBetweennessCentr(1.0, True)

In [7]:
# Collect node ids and their betweenness centrality from the snap result.
# `Nodes` is iterated by key and indexed by that key.
list_nodes = [node for node in Nodes]
list_betwCentrality = [Nodes[node] for node in list_nodes]

# Attach all scores in a single index-aligned assignment instead of one `.at`
# write per node. Rows without a score get NaN. Nodes that are absent from the
# frame are ignored; the per-row writes would have appended an all-NaN row.
followers_features_clustering["Betweeness Centrality"] = pd.Series(
    list_betwCentrality, index=list_nodes, dtype="float64"
)
followers_features_clustering

Unnamed: 0,external_edges,in_degree,internal_edges,out_degree,total_degree,Betweeness Centrality
2,3149,654,690,57,711,1.263438e+05
3,902,41,124,18,59,4.439361e+03
4,11852,1494,5132,255,1749,2.204792e+06
5,2914,445,490,42,487,5.748678e+04
6,4298,2256,532,53,2309,1.502223e+06
...,...,...,...,...,...,...
444546,80,0,1,1,1,0.000000e+00
446560,167,0,13,6,6,0.000000e+00
449779,0,0,1,1,1,0.000000e+00
456041,94,0,4,2,2,0.000000e+00


## K-core

In [8]:
# networkx's core_number does not accept graphs with self-loops, so strip them
# from a COPY. Previously the loops were removed from `followers_reduced_graph`
# itself, while the copy named "noloops" still contained them; that silently
# changed the graph used by every other cell.
followers_reduced_noloops_graph = followers_reduced_graph.copy()
followers_reduced_noloops_graph.remove_edges_from(
    list(nx.selfloop_edges(followers_reduced_noloops_graph))
)

# One row per node with its core number in a single "score" column; this is
# the frame the next cell reads to fill the "k-core" feature.
core_df = pd.DataFrame.from_dict(
    nx.core_number(followers_reduced_noloops_graph),
    orient='index',
    columns=["score"],
)


In [10]:
# Assign the "score" Series rather than the whole one-column DataFrame.
# Column assignment aligns on the index (node id), so no sorting is required
# and the result no longer depends on pandas' frame-to-column special case.
followers_features_clustering["k-core"] = core_df["score"]

In [11]:
# Show the feature frame now that the "k-core" column has been added.
followers_features_clustering

Unnamed: 0,external_edges,in_degree,internal_edges,out_degree,total_degree,Betweeness Centrality,k-core
2,3149,654,690,57,711,1.263438e+05,88
3,902,41,124,18,59,4.439361e+03,48
4,11852,1494,5132,255,1749,2.204792e+06,89
5,2914,445,490,42,487,5.748678e+04,89
6,4298,2256,532,53,2309,1.502223e+06,89
...,...,...,...,...,...,...,...
444546,80,0,1,1,1,0.000000e+00,1
446560,167,0,13,6,6,0.000000e+00,6
449779,0,0,1,1,1,0.000000e+00,1
456041,94,0,4,2,2,0.000000e+00,2


# Extracting roles

In [12]:
# Use graphrole's RoleExtractor to assign a role to every row of the feature
# frame. `roles` is a mapping whose values are labels such as 'role_0';
# presumably keyed by node id -- verify against graphrole.
# NOTE(review): the recorded output shows many repeated warnings pointing at a
# KL-divergence line (np.log(vec1 / vec2)) inside the extractor.
role_extractor = RoleExtractor()
role_extractor.extract_role_factors(followers_features_clustering)
roles = role_extractor.roles

  kl_div = np.sum(np.where(vec1 != 0, vec1 * np.log(vec1 / vec2) - vec1 + vec2, 0))
  kl_div = np.sum(np.where(vec1 != 0, vec1 * np.log(vec1 / vec2) - vec1 + vec2, 0))
  kl_div = np.sum(np.where(vec1 != 0, vec1 * np.log(vec1 / vec2) - vec1 + vec2, 0))
  kl_div = np.sum(np.where(vec1 != 0, vec1 * np.log(vec1 / vec2) - vec1 + vec2, 0))
  kl_div = np.sum(np.where(vec1 != 0, vec1 * np.log(vec1 / vec2) - vec1 + vec2, 0))
  kl_div = np.sum(np.where(vec1 != 0, vec1 * np.log(vec1 / vec2) - vec1 + vec2, 0))
  kl_div = np.sum(np.where(vec1 != 0, vec1 * np.log(vec1 / vec2) - vec1 + vec2, 0))
  kl_div = np.sum(np.where(vec1 != 0, vec1 * np.log(vec1 / vec2) - vec1 + vec2, 0))
  kl_div = np.sum(np.where(vec1 != 0, vec1 * np.log(vec1 / vec2) - vec1 + vec2, 0))
  kl_div = np.sum(np.where(vec1 != 0, vec1 * np.log(vec1 / vec2) - vec1 + vec2, 0))
  kl_div = np.sum(np.where(vec1 != 0, vec1 * np.log(vec1 / vec2) - vec1 + vec2, 0))
  kl_div = np.sum(np.where(vec1 != 0, vec1 * np.log(vec1 / vec2) - vec1 + ve

  kl_div = np.sum(np.where(vec1 != 0, vec1 * np.log(vec1 / vec2) - vec1 + vec2, 0))
  kl_div = np.sum(np.where(vec1 != 0, vec1 * np.log(vec1 / vec2) - vec1 + vec2, 0))
  kl_div = np.sum(np.where(vec1 != 0, vec1 * np.log(vec1 / vec2) - vec1 + vec2, 0))
  kl_div = np.sum(np.where(vec1 != 0, vec1 * np.log(vec1 / vec2) - vec1 + vec2, 0))
  kl_div = np.sum(np.where(vec1 != 0, vec1 * np.log(vec1 / vec2) - vec1 + vec2, 0))
  kl_div = np.sum(np.where(vec1 != 0, vec1 * np.log(vec1 / vec2) - vec1 + vec2, 0))
  kl_div = np.sum(np.where(vec1 != 0, vec1 * np.log(vec1 / vec2) - vec1 + vec2, 0))
  kl_div = np.sum(np.where(vec1 != 0, vec1 * np.log(vec1 / vec2) - vec1 + vec2, 0))
  kl_div = np.sum(np.where(vec1 != 0, vec1 * np.log(vec1 / vec2) - vec1 + vec2, 0))
  kl_div = np.sum(np.where(vec1 != 0, vec1 * np.log(vec1 / vec2) - vec1 + vec2, 0))
  kl_div = np.sum(np.where(vec1 != 0, vec1 * np.log(vec1 / vec2) - vec1 + vec2, 0))
  kl_div = np.sum(np.where(vec1 != 0, vec1 * np.log(vec1 / vec2) - vec1 + ve

In [20]:
# Role label of every entry in `roles`, in the mapping's iteration order.
list_roles = list(roles.values())

In [14]:
# Tally how many nodes were assigned to each role and print one
# "<role> <count>" line per role.
clusters, counts = np.unique(list_roles, return_counts=True)
for idx, cluster in enumerate(clusters):
    count = counts[idx]
    print(cluster, count)

role_0 9209
role_1 352
role_2 2040
role_3 1305
role_4 2942
role_5 46
role_6 605


In [15]:
# Copy of the feature frame with the role labels attached as a 'cluster'
# column. The labels are matched to rows by position, not by index.
cluster_df = followers_features_clustering.assign(cluster=list_roles)

In [16]:
# Rebind the working name to a copy of the labelled frame, so the frame that
# gets saved at the end includes the 'cluster' column.
followers_features_clustering = cluster_df.copy()

In [17]:
# Show the final feature frame, including the 'cluster' column.
followers_features_clustering

Unnamed: 0,external_edges,in_degree,internal_edges,out_degree,total_degree,Betweeness Centrality,k-core,cluster
2,3149,654,690,57,711,1.263438e+05,88,role_2
3,902,41,124,18,59,4.439361e+03,48,role_0
4,11852,1494,5132,255,1749,2.204792e+06,89,role_1
5,2914,445,490,42,487,5.748678e+04,89,role_2
6,4298,2256,532,53,2309,1.502223e+06,89,role_1
...,...,...,...,...,...,...,...,...
444546,80,0,1,1,1,0.000000e+00,1,role_0
446560,167,0,13,6,6,0.000000e+00,6,role_0
449779,0,0,1,1,1,0.000000e+00,1,role_0
456041,94,0,4,2,2,0.000000e+00,2,role_0


# Save to file

In [21]:
# Save the feature frame, with roles/clusters included, to the same CSV path
# that the "Reading from file" section loads from (the file is overwritten).
followers_features_clustering.to_csv(feature_extraction_path)