In [37]:
# http://holoviews.org/user_guide/Network_Graphs.html
import numpy as np
import pandas as pd
import holoviews as hv
import networkx as nx

hv.extension('bokeh')

In [38]:
%opts Nodes Graph [width=800 height=800 xaxis=None yaxis=None]


In [39]:
%%opts Graph [color_index='circle']
%%opts Graph (node_size=10 edge_line_width=1)
colors = ['#000000']+hv.Cycle('Category20').values
edges_df = pd.read_csv('../data/processed_network/fb_edges.csv')
# they do the nodes in one step, using spcial hv.Nodes command
fb_nodes = pd.read_csv('../data/processed_network/fb_nodes.csv')

edges_df.head()

Unnamed: 0,start,end
0,236,186
1,236,84
2,236,62
3,236,142
4,236,252


In [40]:
fb_nodes.head()

Unnamed: 0,x,y,index,circle
0,0.346231,0.294644,1,circle15
1,0.754652,0.903146,2,circle10
2,0.248924,0.535731,3,circle15
3,0.286255,0.033878,4,
4,0.385894,0.26404,5,circle16


I'm not sure what the x and y mean, probably jsut random variables? Or do they provide position?

In [41]:
# we add this extra step here, to make it two steps, so we can view head above
fb_nodes = hv.Nodes(fb_nodes).sort()
fb_graph = hv.Graph((edges_df, fb_nodes), label='Facebook Circles')
fb_graph = fb_graph.redim.range(x=(-0.05, 1.05), y=(-0.05, 1.05)).options(cmap=colors)
fb_graph

## Can we repeat the above with out data?

In [89]:
# convert from csv to pandas df, can read compressed also if required
df_nodes_file = pd.read_csv('../data/processed_network/test_nodes.csv', names = ["page"])
df_edges_file = pd.read_csv('../data/processed_network/test_edges.csv', usecols=[0, 1, 2])
# fix messy names
df_edges_file.columns = df_edges_file.columns.str.strip().str.lower().str.replace(' ', '_').str.replace('(', '').str.replace(')', '')

In [90]:
nodes = df_nodes_file.reset_index().set_index('page').squeeze() 
# remove duplicates
nodes = nodes.loc[~nodes.index.duplicated(keep='first')]

edges = df_edges_file
# need indices
edges['source'] = df_edges_file.source_node.map(nodes)
edges['destination'] = df_edges_file.destination_node.map(nodes)

In [91]:
# remove duplicates then convert to integer
# create empty list to hold indices
index_to_drop = []
# append list with indices
for index, row in edges['destination'].iteritems():
    if pd.isnull(row):
        index_to_drop.append(index)


edges.drop(edges.index[[index_to_drop]], inplace=True)
# now we should be able to convert destination into int
edges['destination'].astype(int)

edges.head()

Unnamed: 0,source_node,destination_node,weight,source,destination
0,/tax-codes/updating-tax-code,/check-income-tax-current-year,50181,20741,2779.0
1,/check-income-tax-current-year,/check-income-tax-current-year/sign-in/prove-i...,2612646,2779,2781.0
2,/penalty-points-endorsements,/view-driving-licence,27756,17003,21575.0
3,/government/organisations/companies-house,/get-information-about-a-company,3787239753,7758,5896.0
4,/council-tax-bands,/council-tax,19527,3618,3615.0


In [92]:
# need to convert from pandas series to pandas dataframe, to pass to holoviews later
nodes = nodes.to_frame()
# rename to avoid confusion over index
nodes.columns = ["index"]
# page is currently the index, let's make a new column that contains the info for holoviews
nodes['label'] = pd.Series(nodes.index, index = nodes.index)

# the only difference is now our rows are indexed by "page", let's change that to "index" column


nodes.head()

Unnamed: 0_level_0,index,label
page,Unnamed: 1_level_1,Unnamed: 2_level_1
/,0,/
/1619-bursary-fund,1,/1619-bursary-fund
/1619-bursary-fund/eligibility,2,/1619-bursary-fund/eligibility
/1619-bursary-fund/further-information,3,/1619-bursary-fund/further-information
/1619-bursary-fund/how-to-claim,4,/1619-bursary-fund/how-to-claim


Now let's drop the variables we don't need so our pandas look like those in the facebook example above.


In [93]:
nodes = nodes[['index', 'label']]
nodes.head()
# the above is not accepted by hv.Nodes() and errors
nodes.size

44142

In [95]:
# let's try convverting to a table first and then into nodes
# this still fails, is it because we are missing x and y values from fbook example?
# error says it expects 3kdims, 3 columns, will adding one help?
prep_nodes = nodes
# get length, so can generate random number Series to add as new columns
sLength = len(nodes['label'])
prep_nodes['x'] = pd.Series(np.random.randn(sLength), index = nodes.index)
prep_nodes['y'] = pd.Series(np.random.randn(sLength), index = nodes.index)

prep_nodes = hv.Table(prep_nodes)
prep_nodes

In [102]:
prep_nodes.kdims


[Dimension('index'), Dimension('label'), Dimension('x'), Dimension('y')]

In [96]:
edges = edges[['source', 'destination', 'weight']]

edges.head()

Unnamed: 0,source,destination,weight
0,20741,2779.0,50181
1,2779,2781.0,2612646
2,17003,21575.0,27756
3,7758,5896.0,3787239753
4,3618,3615.0,19527


In [97]:
# create some mroe space for the plot
%opts Nodes Graph [width=1000 height=1000 xaxis=None yaxis=None]


In [101]:
# we seperate this line as it keeps erroring
gov_nodes = hv.Nodes(prep_nodes).sort()


ValueError: kdims: list length must be between 3 and 3 (inclusive)

In [98]:
# we ignore some of the facebook example specific quirks, by commenting it out
gov_graph = hv.Graph((edges, gov_nodes), label='GOV.UK user journeys')
#fb_graph = fb_graph.redim.range(x=(-0.05, 1.05), y=(-0.05, 1.05))
gov_graph

ValueError: kdims: list length must be between 3 and 3 (inclusive)

Bundling it, to make it less hairballey

In [33]:
from holoviews.operation.datashader import datashade, bundle_graph
bundled = bundle_graph(gov_graph)
bundled

Detailed substructure of this graph becomes visible after bundling edges using a variant of Hurter, Ersoy, & Telea (ECV-2012), which takes several minutes even using multiple cores with Dask:

In [None]:
# drop all variables not in use
edges

In [None]:
%time r_bundled = hv.Curve(hammer_bundle(r_nodes.data, r_edges.data),label="Bundled")
