In [48]:
import networkx as nx
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os.path
import pathpy as pp
import my_functions
from my_functions import matprint
import igraph
import csv
from tabulate import tabulate

from IPython.display import * # no idea what this does
from IPython.display import HTML # no idea what this does

**csh tutorial:** https://ingoscholtes.github.io/pathpy/tutorial.html#data

# 1. Flight Data
Leaving out departure data and focusing on pathway data

    # from departures dataset
    dep_net = pp.Network.read_file("Data/US flights 2011/US flights 2011.edges",directed=True)

    # network statistics
    print('Network Statistics:')
    print("number of nodes: {}".format(dep_net.ncount()))
    print("number of edges: {}".format(dep_net.ecount()))
    # print("total edge weight: {}".format(dep_net.transition_matrix())) # not what I expected it to do

    print("average clustering coefficient: {}".format(pp.algorithms.statistics.avg_clustering_coefficient(dep_net)))
    print("network diameter: {}".format(pp.algorithms.shortest_paths.diameter(dep_net)))

In [3]:
# from paths dataset - use this instead
flight_paths = pp.Paths.read_file("Data/US flights 2011/US flights od.ngram", separator=',', frequency=True)
print(flight_paths)

2021-04-01 20:46:44 [Severity.INFO]	Reading ngram data ... 
2021-04-01 20:46:44 [Severity.INFO]	finished. Read 35801 paths with maximum length 8
2021-04-01 20:46:44 [Severity.INFO]	Calculating sub path statistics ... 
2021-04-01 20:46:44 [Severity.INFO]	finished.
Total path count: 		1470628.0 
[Unique / Sub paths / Total]: 	[35801.0 / 6597817.0 / 8068445.0]
Nodes:				185 
Edges:				2115
Max. path length:		8
Avg path length:		1.7830097074175115 
Paths of length k = 0		0.0 [ 0.0 / 4092772.0 / 4092772.0 ]
Paths of length k = 1		467906.0 [ 1333.0 / 2154238.0 / 2622144.0 ]
Paths of length k = 2		904505.0 [ 13274.0 / 247011.0 / 1151516.0 ]
Paths of length k = 3		49828.0 [ 10905.0 / 98966.0 / 148794.0 ]
Paths of length k = 4		46616.0 [ 8909.0 / 3961.0 / 50577.0 ]
Paths of length k = 5		1390.0 [ 1092.0 / 798.0 / 2188.0 ]
Paths of length k = 6		358.0 [ 265.0 / 57.0 / 415.0 ]
Paths of length k = 7		18.0 [ 16.0 / 14.0 / 32.0 ]
Paths of length k = 8		7.0 [ 7.0 / 0.0 / 7.0 ]



In [4]:
# help(pp.visualisation.alluvial) 

## Time-aggregated Network analysis

In [16]:
dep_net = pp.Network.from_paths(flight_paths)

# network statistics
print('Static Network Statistics:')
print("number of nodes: {}".format(dep_net.ncount()))
print("number of edges: {}".format(dep_net.ecount()))
# print("total edge weight: {}".format(dep_net.transition_matrix())) # not what I expected it to do

print("average clustering coefficient: {}".format(pp.algorithms.statistics.avg_clustering_coefficient(dep_net)))
print("network diameter: {}".format(pp.algorithms.shortest_paths.diameter(dep_net)))

Static Network Statistics:
number of nodes: 185
number of edges: 2115
average clustering coefficient: 0.6167263568907494
network diameter: inf


In [None]:
# degree distribution + histogram


In [36]:
# centrality analyses - can use on paths or higher-order networks or paths
# NEED TO MAKE SURE M1 NETWORK IS ERGODIC
# node traversals / visitation probabilities for paths
# closeness for higher-order because projects back down
eig_cents = pp.algorithms.centralities.eigenvector(dep_net)
betw_cents = pp.algorithms.centralities.betweenness(dep_net)
close_cents = pp.algorithms.centralities.closeness(dep_net)
pr_cents = pp.algorithms.centralities.pagerank(dep_net)

# top 10 for each centrality
eig_ranks10 = pp.algorithms.centralities.rank_centralities(eig_cents)[:10]
pr_cents10 = pp.algorithms.centralities.rank_centralities(pr_cents)[:10]
close_cents10 = pp.algorithms.centralities.rank_centralities(close_cents)[:10]
betw_cents10 = pp.algorithms.centralities.rank_centralities(betw_cents)[:10]


2021-04-01 21:14:39 [Severity.INFO]	Calculating betweenness centralities ...
2021-04-01 21:14:42 [Severity.INFO]	Calculating closeness in network ...
2021-04-01 21:14:42 [Severity.INFO]	finished.


In [124]:
# measure the correlation between the centrality measures
cent_vals  = pd.DataFrame(list(eig_cents.values()))
cent_vals.columns = ["eigenvector"]
cent_vals["PageRank"]  = pd.DataFrame(list(pr_cents.values()))
cent_vals["betweenness"]  = pd.DataFrame(list(betw_cents.values()))
cent_vals["closeness"]  = pd.DataFrame(list(close_cents.values()))
cent_vals.index = list(eig_cents.keys())
print("Correlations:")
print(cent_vals.corr(method='pearson'))
cent_vals

# eigenvector and PageRank are pretty highly correlated which is too 
# be expected but betweenness and closeness are both very different
# because they measure quite different aspects of the nodes

Correlations:
             eigenvector  PageRank  betweenness  closeness
eigenvector     1.000000  0.865487     0.553552   0.515169
PageRank        0.865487  1.000000     0.722728   0.352960
betweenness     0.553552  0.722728     1.000000   0.282024
closeness       0.515169  0.352960     0.282024   1.000000


Unnamed: 0,eigenvector,PageRank,betweenness,closeness
ABQ,0.008229,0.004891,1495.328550,0.669091
BUR,0.021477,0.029383,6256.746006,0.888889
LAX,0.028792,0.082955,9833.070837,0.689139
OAK,0.022715,0.034725,1089.970655,0.671533
ONT,0.021328,0.032689,3482.950640,0.741935
...,...,...,...,...
CRW,0.000854,0.001348,0.000000,0.000000
VIS,0.000666,0.001206,0.000000,0.000000
SJT,0.000854,0.001348,0.000000,0.000000
MBS,0.000900,0.001390,0.000000,0.000000


In [100]:
# print top 10 centralities by each measure
centralities = pd.DataFrame(list(zip(*eig_ranks10))[0])
centralities.columns = ["eigenvector"]
centralities["betweenness"] = list(zip(*betw_cents10))[0]
centralities["closeness"] = list(zip(*close_cents10))[0]
centralities["PageRank"] = list(zip(*pr_cents10))[0]
centralities

# agree on top 4 airports, then betweenness places SNA one above all the others and ranks SJC lower
# overall betweenness diverges the most from consenus

Unnamed: 0,eigenvector,betweenness,closeness,PageRank
0,LAX,LAX,LAX,LAX
1,SFO,SFO,SFO,SFO
2,SAN,SAN,SAN,SAN
3,SMF,SMF,SMF,SMF
4,SJC,SNA,SJC,SJC
5,SNA,ONT,SNA,SNA
6,OAK,SJC,OAK,OAK
7,BUR,OAK,ONT,ONT
8,ONT,BUR,BUR,BUR
9,PSP,LGB,PSP,PSP


In [65]:
print("Eigenvector centralities")
print(tabulate(eig_ranks10))
print("PageRank centralities")
print(tabulate(pr_cents10))
print("Closeness centralities")
print(tabulate(close_cents10))
print("Betweenness centralities")
print(tabulate(betw_cents10))

Eigenvector centralities
---  ---------
LAX  0.0287917
SFO  0.0273284
SAN  0.02522
SMF  0.0237206
SJC  0.0236592
SNA  0.0227202
OAK  0.0227153
BUR  0.0214773
ONT  0.0213285
PSP  0.0153914
---  ---------
PageRank centralities
---  ---------
LAX  0.0829554
SFO  0.0672831
SAN  0.0459811
SMF  0.0373484
SJC  0.0353708
SNA  0.0353609
OAK  0.0347253
ONT  0.0326894
BUR  0.0293832
PSP  0.01839
---  ---------
Closeness centralities
---  --------
LAX  0.888889
SFO  0.807018
SAN  0.741935
SMF  0.710425
SJC  0.69962
SNA  0.69697
OAK  0.689139
ONT  0.671533
BUR  0.669091
PSP  0.609272
---  --------
Betweenness centralities
---  --------
LAX  9833.07
SFO  6256.75
SAN  3482.95
SMF  2190.39
SNA  1573.92
ONT  1519.75
SJC  1495.33
OAK  1089.97
BUR   887.7
LGB   850.528
---  --------


In [64]:
# visualise
#dep_net # use this instead

## Multi-order models

    help(pp.MultiOrderModel)
`estimate_order` estimates the optimal **maximum** order of the hierarchical model. `layer_likelihood` calculates the likelhood of the **first** l layers of a multi-order network model. Likelihood calculates the likelihood of the whole model. Try `test_network_hypothesis(self, paths, method='AIC')` to see if it's a network, doesn't consider nested models.

In [8]:
m1 = pp.MultiOrderModel(flight_paths, max_order=1) 
m2 = pp.MultiOrderModel(flight_paths, max_order=2)
m3 = pp.MultiOrderModel(flight_paths, max_order=3)

2021-04-01 20:48:31 [Severity.INFO]	Generating 0-th order layer ...
2021-04-01 20:48:31 [Severity.INFO]	Generating 1-th order layer ...
2021-04-01 20:48:31 [Severity.INFO]	finished.
2021-04-01 20:48:31 [Severity.INFO]	Generating 0-th order layer ...
2021-04-01 20:48:31 [Severity.INFO]	Generating 1-th order layer ...
2021-04-01 20:48:31 [Severity.INFO]	Generating 2-th order layer ...
2021-04-01 20:48:33 [Severity.INFO]	finished.
2021-04-01 20:48:33 [Severity.INFO]	Generating 0-th order layer ...
2021-04-01 20:48:33 [Severity.INFO]	Generating 1-th order layer ...
2021-04-01 20:48:33 [Severity.INFO]	Generating 2-th order layer ...
2021-04-01 20:48:34 [Severity.INFO]	Generating 3-th order layer ...
2021-04-01 20:48:36 [Severity.INFO]	finished.


In [9]:
print(m1.summary)
print(m2.summary)
print(m3.summary)

<bound method MultiOrderModel.summary of <pathpy.classes.multi_order_model.MultiOrderModel object at 0x166f85850>>
<bound method MultiOrderModel.summary of <pathpy.classes.multi_order_model.MultiOrderModel object at 0x165b036d0>>
<bound method MultiOrderModel.summary of <pathpy.classes.multi_order_model.MultiOrderModel object at 0x105251ac0>>


In [10]:
# Calculates the likelihood of a multi-order network model up to a maximum order max_order based on all path statistics
print('Likelihood = ', m1.likelihood(flight_paths, log=False))
print('Log-likelihood = ', m1.likelihood(flight_paths, log=True))

  return likelihood if log else np.exp(likelihood)


Likelihood =  0.0
Log-likelihood =  -12222217.04195787


In [11]:
print('Likelihood = ', m2.likelihood(flight_paths, log=False))
print('Log-likelihood = ', m2.likelihood(flight_paths, log=True))

Likelihood =  0.0
Log-likelihood =  -10417727.35236143


In [12]:
print('Optimal maximum order = ', m2.estimate_order(flight_paths))
# Optimal order is at least 2, but may be higher.Try to increase `stop_at_order`

2021-04-01 20:49:36 [Severity.INFO]	Likelihood ratio test for K_opt = 2, x = 3608979.3791928813
2021-04-01 20:49:36 [Severity.INFO]	Likelihood ratio test, d_1-d_0 = 106829
2021-04-01 20:49:36 [Severity.INFO]	Likelihood ratio test, p = 0.0
Optimal maximum order =  2


In [13]:
print('Optimal maximum order = ', m3.estimate_order(flight_paths))
# Optimal order is at 2

2021-04-01 20:49:42 [Severity.INFO]	Likelihood ratio test for K_opt = 2, x = 3608979.3791928813
2021-04-01 20:49:42 [Severity.INFO]	Likelihood ratio test, d_1-d_0 = 106829
2021-04-01 20:49:42 [Severity.INFO]	Likelihood ratio test, p = 0.0
2021-04-01 20:49:49 [Severity.INFO]	Likelihood ratio test for K_opt = 3, x = 385091.24200374633
2021-04-01 20:49:49 [Severity.INFO]	Likelihood ratio test, d_1-d_0 = 2616772
2021-04-01 20:49:49 [Severity.INFO]	Likelihood ratio test, p = 1.0
Optimal maximum order =  2


# Comparing memory network with time-aggregated network
**Basics:** same number of nodes? edges? <br>

**Centrality measures:** what nodes are identified as most important in each model? <br>

| Memory model | Time-aggregated model |
| :----------- | :-------------------- |
|  | PageRank |
|  | eigenvector |
|  | betweenness|

**Clustering:** How do clusters differ? How to compare clustering performance? <br>
**Graph distances** Are there ways of comparing?

**Predictive performance?:**
Look at conditional entropy