In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
import json

In [2]:
# 1. Data loading and preparation

In [3]:
# data source: https://snap.stanford.edu/data/facebook-large-page-page-network.html

nodes = pd.read_csv("musae_facebook_target.csv")
edges = pd.read_csv("musae_facebook_edges.csv")
edges = edges.astype("int")

In [4]:
nodes.head()

Unnamed: 0,id,facebook_id,page_name,page_type
0,0,145647315578475,The Voice of China 中国好声音,tvshow
1,1,191483281412,U.S. Consulate General Mumbai,government
2,2,144761358898518,ESET,company
3,3,568700043198473,Consulate General of Switzerland in Montreal,government
4,4,1408935539376139,Mark Bailey MP - Labor for Miller,politician


In [5]:
with open("musae_facebook_features.json") as json_data:
    data = json.load(json_data)

col_names = []
for i in range(31):
    a = '{}{}'.format('f', i)
    col_names.append(a)

features = pd.DataFrame.from_dict(data=data, orient='index', columns=col_names)
features['id'] = features.index
features.head()

Unnamed: 0,f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f22,f23,f24,f25,f26,f27,f28,f29,f30,id
554,2835,4518,2198,2014.0,3818.0,3045.0,4570.0,4585.0,4596.0,4599.0,...,,,,,,,,,,554
10772,3293,2534,979,2014.0,2317.0,,,,,,...,,,,,,,,,,10772
6350,4477,2895,4039,3282.0,765.0,2669.0,,,,,...,,,,,,,,,,6350
9218,3133,2052,979,3355.0,107.0,1878.0,1105.0,4571.0,4582.0,4593.0,...,4713.0,,,,,,,,,9218
7787,1151,631,2613,2791.0,648.0,1684.0,3818.0,107.0,1878.0,1105.0,...,4678.0,4695.0,4696.0,4708.0,,,,,,7787


In [6]:
# impute null values with mean values

features_name = []
for i in range(31):
    a = '{}{}'.format('f',i)
    features_name.append(a)

from sklearn.impute import SimpleImputer

imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
features_tf = imputer.fit_transform(features.iloc[:, :-1])
features_tf = pd.DataFrame(features_tf, columns=features_name, index = features.index)
features_tf['id'] = features_tf.index
features_tf = features_tf.astype('int')
features_tf.head()

Unnamed: 0,f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f22,f23,f24,f25,f26,f27,f28,f29,f30,id
554,2835,4518,2198,2014,3818,3045,4570,4585,4596,4599,...,4699,4702,4704,4705,4706,4705,4707,4707,4708,554
10772,3293,2534,979,2014,2317,2397,2530,2890,3440,4055,...,4699,4702,4704,4705,4706,4705,4707,4707,4708,10772
6350,4477,2895,4039,3282,765,2669,2530,2890,3440,4055,...,4699,4702,4704,4705,4706,4705,4707,4707,4708,6350
9218,3133,2052,979,3355,107,1878,1105,4571,4582,4593,...,4713,4702,4704,4705,4706,4705,4707,4707,4708,9218
7787,1151,631,2613,2791,648,1684,3818,107,1878,1105,...,4678,4695,4696,4708,4706,4705,4707,4707,4708,7787


In [7]:
# merge features into nodes dataframe

nodes = nodes.merge(features_tf, how ='inner', on='id')
nodes.head()

Unnamed: 0,id,facebook_id,page_name,page_type,f0,f1,f2,f3,f4,f5,...,f21,f22,f23,f24,f25,f26,f27,f28,f29,f30
0,0,145647315578475,The Voice of China 中国好声音,tvshow,3133,3825,236,874,1072,143,...,4693,4699,4702,4704,4705,4706,4705,4707,4707,4708
1,1,191483281412,U.S. Consulate General Mumbai,government,3399,597,979,2014,2327,2397,...,4693,4699,4702,4704,4705,4706,4705,4707,4707,4708
2,2,144761358898518,ESET,company,3383,3832,2035,765,3972,3364,...,4688,4697,4713,4704,4705,4706,4705,4707,4707,4708
3,3,568700043198473,Consulate General of Switzerland in Montreal,government,2710,1960,1940,4514,4339,761,...,4693,4699,4702,4704,4705,4706,4705,4707,4707,4708
4,4,1408935539376139,Mark Bailey MP - Labor for Miller,politician,2873,4518,4535,1602,3500,4457,...,4693,4699,4702,4704,4705,4706,4705,4707,4707,4708


In [8]:
# 2. Graph Creation

In [9]:
# remove id from edges which are not in nodes

node_id_unique = list(nodes.id.unique())
edges = edges[edges['id_1'].isin(node_id_unique)]
edges = edges[edges['id_2'].isin(node_id_unique)]

In [10]:
import networkx as nx 

G = nx.Graph()

# add nodes
for node in nodes.iloc[:,0]:
    G.add_node(int(node))

# add edges
edge_1 = list(edges['id_1'])
edge_2 = list(edges['id_2'])

for i, j in zip(edge_1, edge_2):
    G.add_edge(int(i), int(j))

In [11]:
G.number_of_nodes()

22470

In [12]:
G.number_of_edges()

171002

In [13]:
# set node attributes (feature variables)
for i in range(0,31):
    nx.set_node_attributes(G, nodes.iloc[:, i+4], features_name[i])

In [14]:
nx.info(G)

'Name: \nType: Graph\nNumber of nodes: 22470\nNumber of edges: 171002\nAverage degree:  15.2205'

In [15]:
# 2. Graph Network Metrics
# 2.1. Connectivity: Density, Clustering Coefficient, Distance
# 2.2. Centrality: Degree, Betweenness, Eigenvector, Closeness

In [16]:
# 2.1. Connectivity

# density
density = nx.density(G)
print(density)

# clustering
avg_cluster_coef = nx.average_clustering(G)
print(avg_cluster_coef)

# distance
shortest_path = nx.shortest_path(G, G.nodes[0]['f0'], G.nodes[1]['f0'])
print(shortest_path)

shortest_path_len = nx.shortest_path_length(G, G.nodes[0]['f0'], G.nodes[1]['f0'])
print(shortest_path_len)

0.000677398715568023
0.3597383824426918
[3133, 3199, 14577, 12203, 17126, 9254, 3399]
6


In [28]:
# 2.2. Centrality

# calculate centrality values
degree_centrality = nx.degree_centrality(G) # nunber of edges a node has
betweenness_centrality = nx.betweenness_centrality(G, normalized=True) # frequency of a node appearing in the shortest paths / all shortest paths
eigenvector_centrality = nx.eigenvector_centrality(G) # the centrality of a node based on that of its neighbors
closeness_centrality = nx.closeness_centrality(G, normalized=True) # the more central a node is, the closer it is to all other nodes

# set centrality as node attributes
set_node_attributes(G, degree_centrality, 'degree_centrality')
set_node_attributes(G, betweenness_centrality, 'betweenness_centrality')
set_node_attributes(G, eigenvector_centrality, 'eigenvector_centrality')
set_node_attributes(G, closeness_centrality, 'closeness_centrality')

In [None]:
# convert nodes to dataframe for quick reference
network_df = nx.to_pandas_dataframe(G)
network_df.head()

In [None]:
# 3. Graph Visualization

In [39]:
pos = nx.spring_layout(G) # force a layout to the network before drawing
node_size = [v*1000 for v in betweenness_centrality]
node_color = [2000*G.degree(v) for v in G]

plt.figure(figsize=(20,20))
nx.draw_networkx(G, pos=pos, with_labels=False, node_color=node_color, node_size=node_size)
plt.axis('off')
plt.show()

KeyboardInterrupt: 