In [108]:
import networkx as nx
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os.path
import pathpy as pp
import my_functions
from my_functions import matprint
import igraph

from IPython.display import *
from IPython.display import HTML

# Preliminary network set up (getting the feel for it)
## Use data to copy Rosvall (2014) first (Cali only)
https://www.transtats.bts.gov/DL_SelectFields.asp?gnoyr_VQ=FIM
https://ingoscholtes.github.io/pathpy/tutorial.html

Need a `.edges` a `.tedges` and a `.ngram` file now

In [109]:
# assemble data
dep_df = pd.read_csv("Data/US flights 2011/carrier departures.csv")
dep_df = dep_df[dep_df.DEPARTURES_PERFORMED>0]
dep_df.PASSENGERS = pd.to_numeric(dep_df.PASSENGERS,downcast="integer")
dep_df = dep_df.sample(frac=0.1)
dep_df = dep_df.sort_values(by=['MONTH'])
dep_df = dep_df.reset_index()

In [110]:
dep_df = dep_df[["ORIGIN_CITY_NAME","DEST_CITY_NAME","PASSENGERS","MONTH"]]
dep_df.columns = ["source","target","frequency","time"]
dep_df

Unnamed: 0,source,target,frequency,time
0,"Las Vegas, NV","San Francisco, CA",158,1
1,"Salt Lake City, UT","Long Beach, CA",5089,1
2,"Las Vegas, NV","San Diego, CA",16078,1
3,"Long Beach, CA","Memphis, TN",0,1
4,"Detroit, MI","San Jose, CA",0,1
...,...,...,...,...
3576,"San Francisco, CA","New York, NY",12427,12
3577,"Los Angeles, CA","Denver, CO",11702,12
3578,"Austin, TX","Ontario, CA",119,12
3579,"Teterboro, NJ","Burbank, CA",7,12


### Make time-aggregated weighted network .edges file
Ok, this is a little tricky because we have frequencies per month as well... check is ok this later

In [111]:
dep_wn = dep_df[["source","target","frequency"]] # leave out time
# aggregate all the same paths over the year
dep_wn = dep_wn.groupby(['source','target'],as_index=False)['frequency'].sum()
dep_wn.to_csv("Data/US flights 2011/carrier departures.edges",sep='\t',index=False)

In [112]:
dep_wn

Unnamed: 0,source,target,frequency
0,"Albuquerque, NM","Los Angeles, CA",17106
1,"Albuquerque, NM","Oakland, CA",9809
2,"Albuquerque, NM","Ontario, CA",0
3,"Albuquerque, NM","San Diego, CA",4624
4,"Albuquerque, NM","San Francisco, CA",5054
...,...,...,...
1024,"Wendover, UT","Modesto, CA",145
1025,"Wendover, UT","Redding, CA",141
1026,"Wichita, KS","Los Angeles, CA",2005
1027,"Yuma, AZ","El Centro, CA",20


### Make .tedges file
Problem that we lose monthly weightings here

In [113]:
dep_tn = dep_df[["source","target","time"]]
dep_tn.to_csv("Data/US flights 2011/carrier departures.tedges",sep='\t',index=False)
dep_tn

Unnamed: 0,source,target,time
0,"Las Vegas, NV","San Francisco, CA",1
1,"Salt Lake City, UT","Long Beach, CA",1
2,"Las Vegas, NV","San Diego, CA",1
3,"Long Beach, CA","Memphis, TN",1
4,"Detroit, MI","San Jose, CA",1
...,...,...,...
3576,"San Francisco, CA","New York, NY",12
3577,"Los Angeles, CA","Denver, CO",12
3578,"Austin, TX","Ontario, CA",12
3579,"Teterboro, NJ","Burbank, CA",12


In [114]:
dep_tn.shape

(3581, 3)

# Temporal Network

File format will be `.tedges` and will be like:

    time source target
      
    
Using tutorial from _When is Network a Network?_ paper https://ingoscholtes.github.io/pathpy/tutorial.html#paths

My comment: This network ignores frequencies so an edge exists if there was a flight that month hence it's not the best form of pathway data because you can't see whether flights are consecutive... I will have to do this again with a network with finer grain time resolution.

In [115]:
dep_tnet = pp.TemporalNetwork.read_file("Data/US flights 2011/carrier departures.edges",separator='\t',directed=True)

2021-03-27 19:24:02 [Severity.INFO]	Reading directed time-stamped links ...
2021-03-27 19:24:02 [Severity.INFO]	Building index data structures ...
2021-03-27 19:24:02 [Severity.INFO]	Sorting time stamps ...
2021-03-27 19:24:02 [Severity.INFO]	finished.


In [116]:
# basic network info
print("total number of edges: {}".format(dep_tnet.ecount()))
print("total number of vertices: {}".format(dep_tnet.vcount()))
print("number of months of observation: {}".format(dep_tnet.observation_length()))


total number of edges: 1029
total number of vertices: 188
number of months of observation: 1028


In [117]:
# this isn't working, time isn't incrementing
#dep_tnet

In [118]:
# Create time-aggregated network
dep_agg = pp.Network.from_temporal_network(dep_tnet, directed=True)

In [119]:
#dep_agg # expensive

# Time-aggregated network

In [120]:
#help(pp.Network)
#https://github.com/uzhdag/pathpy/issues/46

In [121]:
dep_net  = pp.Network.read_file("Data/US flights 2011/carrier departures.edges", separator='\t',weighted=True,directed=True,header=True)

2021-03-27 19:24:07 [Severity.INFO]	Reading edge list ... 
2021-03-27 19:24:07 [Severity.INFO]	finished.


In [193]:
dep_net.edges.values()[['weight']]

TypeError: 'dict_values' object is not subscriptable

In [174]:
max_w = max(dep_net.edges.values())
min_w = min(dep_net.edges.values())
range_w = max_w - min_w

TypeError: '>' not supported between instances of 'dict' and 'dict'

In [162]:
c = pp.algorithms.centralities.eigenvector(dep_net)

maxc = max(c.values())
minc = min(c.values())
rang =  maxc -  minc

# to scale nodes by their betweenness centrality
# this isn't working...
style = {}
style['node_size'] = {v:10*(u-minc)/rang for v,u in c.items()}
pp.visualisation.plot(dep_net,**style)

In [122]:
# visualise network
# dep_net

In [123]:
eig_cent = pp.algorithms.centralities.eigenvector(dep_net)

In [124]:
# do some visuals here

def Network2igraph(network):
    """ 
    Returns an igraph Graph object which represents 
    the k-th layer of a multi-order graphical model.
    """
    g = igraph.Graph(directed=True)

    for e in network.edges:
        if g.vcount()== 0 or e[0] not in g.vs()["name"]:
            g.add_vertex(e[0])
        if g.vcount()== 0 or e[1] not in g.vs()["name"]:
            g.add_vertex(e[1])
        g.add_edge(e[0], e[1], weight=network.edges[e].sum())
    return g

g1 = Network2igraph(dep_net)
igraph.plot(g1)

visual_style = {}
visual_style["bbox"] = (600, 400)
visual_style["margin"] = 60
visual_style["vertex_size"] = 80
visual_style["vertex_label_size"] = 24
visual_style["vertex_color"] = "lightblue"
visual_style["edge_curved"] = 0.2
visual_style["edge_width"] = 1
visual_style["edge_arrow_size"] = 2

visual_style["layout"] = g1.layout_auto()
visual_style["vertex_label"] = g1.vs["name"]
visual_style["edge_label"] = g1.es["weight"]

igraph.plot(g1, 'pathpy_tutorial/g1.png', **visual_style)
#display(Image(filename='pathpy_tutorial/g1.png'))

AttributeError: 'dict' object has no attribute 'sum'

In [77]:
# basic time-agg network statistics
A = pp.Network.adjacency_matrix(dep_net,weighted=True)
L = pp.Network.laplacian_matrix(dep_net,weighted=True) # transposed
T = pp.Network.transition_matrix(dep_net) # transposed

print(pp.Network.summary(dep_net))

# degree distribution
# clustering coefficient
# spectral properties
v1 = pp.Network.leading_eigenvector(A, normalized=True, lanczos_vecs=None, maxiter=None)
# motifs?

# dynamical node centralities: 

# community structures


Directed network
Nodes:				193
Links:				1013



In [76]:
# network statistics
print('Network Statistics:')
print("number of nodes: {}".format(dep_net.ncount()))
print("number of edges: {}".format(dep_net.ecount()))
# print("total edge weight: {}".format(dep_net.transition_matrix())) # not what I expected it to do

print("average clustering coefficient: {}".format(pp.algorithms.statistics.avg_clustering_coefficient(dep_net)))
print("network diameter: {}".format(pp.algorithms.shortest_paths.diameter(dep_net)))

Network Statistics:
number of nodes: 193
number of edges: 1013
average clustering coefficient: 0.39662485286387117
network diameter: inf


In [72]:
# help(pp.algorithms.statistics.degree_dist(dep_net, degree='degree')) # not working

In [62]:
# node statistics
c_cents = pp.algorithms.centralities.closeness(dep_net)
b_cents = pp.algorithms.centralities.betweenness(dep_net)
eig_cents = pp.algorithms.centralities.eigenvector(dep_net)
pr_cents = pp.algorithms.centralities.pagerank(dep_net) # PageRank based on power method
#def rank_centralities()

2021-03-27 16:01:56 [Severity.INFO]	Calculating closeness in network ...
2021-03-27 16:01:56 [Severity.INFO]	finished.
2021-03-27 16:01:56 [Severity.INFO]	Calculating betweenness centralities ...


# Start looking at memory network

# Pathpy Network Viz
https://ingoscholtes.github.io/pathpy/tutorial.html#5.-Multi-Order-Graphical-Models-of-Pathways-and-Temporal-Networks

Weightings will be different on this because tnet ignored monthly frequencies and I don't know if wnet aggrgates frequencies for same node. I think I'll have to manually do time-aggregation and add path frequencies. 

Save the modified df to a tab/comma-delimited `.ngram` file with the last column being frequencies.

    pp.Paths.readFile('pathpy_tutorial/tube_paths.ngram', separator=',', pathFrequency=True)

To extract paths from a temporal network you need to choose a maximum time difference $\delta$ and `paths = pp.Paths.fromTemporalNetwork(t, delta=180)`

### Theory
Want to know if we can justify representing this data as a network with the Markov property,

### How to get paths
**Get network:** `pp.Network.read_file(PATH,sepatator="'')`

**Get origin, destination info:** `...read_origin_destination(filename, separator=',')` of form: `origin1,destination1,weight`

**Make paths from network:** `paths_from_origin_destination(origin_destination_list, network, distribute_weight=True)`

Need to read over `help(pp.path_extraction.origin_destination_stats)`

In [43]:
# construct paths from a temporal network etc as done in tutorial

# make a memory network
#help(pp.visualisation.alluvial)
#generate_memory_net(paths, node, self_loops=True)
#generate_memory_net_markov(network, focal_node, self_loops=True)

In [None]:
# assemble weighted data

In [13]:
# construct bigrams and trigrams

In [14]:
# construct M1 network

In [15]:
# construct M2 network