In [51]:
# Note: every measure built on shortest paths must use 'distance' (= 1/flow) as its weight parameter,
# because a larger flow of people along a path should make that path easier (shorter) to traverse, not harder.

In [52]:
import networkx as nx
import pandas as pd
from functools import reduce

In [53]:
# Directed graph built from the workbook: every kept spreadsheet row adds
# one weighted edge.
G = nx.DiGraph()

x = pd.ExcelFile('data/metro-to-metro-2011-2015.xlsx')
print(x.sheet_names)

# Auto-generated pandas column labels read from each row.
# NOTE(review): judging by the variable names, 'previous' is the place people
# moved from and 'current' the place they moved to -- confirm against the sheet.
previous = 'Unnamed: 15'  # used as the edge source
current = 'Unnamed: 2'    # used as the edge target
count = 'Unnamed: 26'     # converted with int() and stored as the edge weight

# Rows whose count is below this value are not added to the graph.
thresh = 0
# Raw (unconverted) count cell of every row that was added.
migrate = []

# Row positions that carry data: rows 0-2 are header rows, and everything
# from row 53724 onward is footer.
first_data_row = 3
footer_start_row = 53724

for state in x.sheet_names:
    df = x.parse(state)

    # parse() is called without an index column, so row labels are plain
    # positions and a positional slice picks out exactly the data rows.
    for index, row in df.iloc[first_data_row:footer_start_row].iterrows():
        flow = int(row[count])
        if flow >= thresh:
            migrate.append(row[count])
            G.add_edge(row[previous], row[current], weight=flow)

['Metro-to-Metro 2011-2015']


In [54]:
# Size of the graph: node count first, then directed-edge count.
print(G.number_of_nodes())
print(G.number_of_edges())

398
53721


In [55]:
# Degree centrality -- no weight argument is passed, so edge weights play no part.
degree_scores = nx.degree_centrality(G)

# One row per node: the node label moves out of the index into a 'Metro'
# column, and the remaining positional row labels are converted to strings.
deg_cent = (
    pd.DataFrame.from_dict(degree_scores, orient='index', columns=['Degree'])
    .reset_index(level=0)
    .rename(index=str, columns={'index': 'Metro'})
)

deg_ranked = deg_cent.sort_values(by='Degree', ascending=False)
print(deg_ranked)

                                                 Metro    Degree
109      Outside Metro Area within U.S. or Puerto Rico  1.977330
62    New York-Newark-Jersey City, NY-NJ-PA Metro Area  1.788413
19       Chicago-Naperville-Elgin, IL-IN-WI Metro Area  1.785894
25          Dallas-Fort Worth-Arlington, TX Metro Area  1.773300
105  Washington-Arlington-Alexandria, DC-VA-MD-WV M...  1.770781
53       Los Angeles-Long Beach-Anaheim, CA Metro Area  1.750630
72              Phoenix-Mesa-Scottsdale, AZ Metro Area  1.707809
5         Atlanta-Sandy Springs-Roswell, GA Metro Area  1.667506
38     Houston-The Woodlands-Sugar Land, TX Metro Area  1.617128
95      Tampa-St. Petersburg-Clearwater, FL Metro Area  1.614610
88              Seattle-Tacoma-Bellevue, WA Metro Area  1.591940
58   Miami-Fort Lauderdale-West Palm Beach, FL Metr...  1.579345
102  Virginia Beach-Norfolk-Newport News, VA-NC Met...  1.579345
85                   San Diego-Carlsbad, CA Metro Area  1.574307
27               Denver-A

In [56]:
# Eigenvector centrality, weighted by the 'weight' edge attribute (the flow).
eigenvector_scores = nx.eigenvector_centrality(G, weight='weight')

# One row per node: the node label moves out of the index into a 'Metro'
# column, and the remaining positional row labels are converted to strings.
eig_cent = (
    pd.DataFrame.from_dict(eigenvector_scores, orient='index', columns=['Eigenvector'])
    .reset_index(level=0)
    .rename(index=str, columns={'index': 'Metro'})
)

eig_ranked = eig_cent.sort_values(by='Eigenvector', ascending=False)
print(eig_ranked)

                                                 Metro    Eigenvector
109      Outside Metro Area within U.S. or Puerto Rico   6.180784e-01
25          Dallas-Fort Worth-Arlington, TX Metro Area   2.199228e-01
38     Houston-The Woodlands-Sugar Land, TX Metro Area   1.897061e-01
53       Los Angeles-Long Beach-Anaheim, CA Metro Area   1.724938e-01
5         Atlanta-Sandy Springs-Roswell, GA Metro Area   1.724092e-01
72              Phoenix-Mesa-Scottsdale, AZ Metro Area   1.627624e-01
62    New York-Newark-Jersey City, NY-NJ-PA Metro Area   1.508124e-01
79     Riverside-San Bernardino-Ontario, CA Metro Area   1.503959e-01
19       Chicago-Naperville-Elgin, IL-IN-WI Metro Area   1.459562e-01
105  Washington-Arlington-Alexandria, DC-VA-MD-WV M...   1.423062e-01
88              Seattle-Tacoma-Bellevue, WA Metro Area   1.277689e-01
60   Minneapolis-St. Paul-Bloomington, MN-WI Metro ...   1.192056e-01
6                     Austin-Round Rock, TX Metro Area   1.168166e-01
27               Den

In [57]:
# Closeness centrality over distance = 1 / flow.
#
# When the `distance` keyword names an edge attribute, shortest-path lengths
# are computed with Dijkstra's algorithm using that attribute as the edge weight.
#
# This cell also stores a 'distance' attribute on every edge of G; the later
# cells that pass 'distance' to networkx read that attribute.
# NOTE: an edge whose weight is 0 would raise ZeroDivisionError here.
g_distance_dict = {
    (src, dst): 1 / flow
    for src, dst, flow in G.edges(data='weight')
}
nx.set_edge_attributes(G, g_distance_dict, 'distance')

closeness_scores = nx.closeness_centrality(G, distance='distance')

# One row per node: the node label moves out of the index into a 'Metro'
# column, and the remaining positional row labels are converted to strings.
close_cent = (
    pd.DataFrame.from_dict(closeness_scores, orient='index', columns=['Closeness'])
    .reset_index(level=0)
    .rename(index=str, columns={'index': 'Metro'})
)

close_ranked = close_cent.sort_values(by='Closeness', ascending=False)
print(close_ranked)

                                                 Metro    Closeness
109      Outside Metro Area within U.S. or Puerto Rico  2575.988324
111                                               Asia  2559.986500
62    New York-Newark-Jersey City, NY-NJ-PA Metro Area  2479.607545
112                                    Central America  2437.910783
19       Chicago-Naperville-Elgin, IL-IN-WI Metro Area  2427.013408
25          Dallas-Fort Worth-Arlington, TX Metro Area  2423.670840
5         Atlanta-Sandy Springs-Roswell, GA Metro Area  2403.118574
113                                             Europe  2393.833080
38     Houston-The Woodlands-Sugar Land, TX Metro Area  2384.392295
60   Minneapolis-St. Paul-Bloomington, MN-WI Metro ...  2356.634379
88              Seattle-Tacoma-Bellevue, WA Metro Area  2334.608725
105  Washington-Arlington-Alexandria, DC-VA-MD-WV M...  2313.631949
53       Los Angeles-Long Beach-Anaheim, CA Metro Area  2306.492494
81                         St. Louis, MO-IL Metr

In [58]:
# Betweenness centrality; shortest paths are weighted by the 'distance'
# edge attribute (1 / flow).
betweenness_scores = nx.betweenness_centrality(G, weight='distance')

# One row per node: the node label moves out of the index into a 'Metro'
# column, and the remaining positional row labels are converted to strings.
bet_cent = (
    pd.DataFrame.from_dict(betweenness_scores, orient='index', columns=['Betweenness'])
    .reset_index(level=0)
    .rename(index=str, columns={'index': 'Metro'})
)

bet_ranked = bet_cent.sort_values(by='Betweenness', ascending=False)
print(bet_ranked)

                                                 Metro  Betweenness
109      Outside Metro Area within U.S. or Puerto Rico     0.947027
62    New York-Newark-Jersey City, NY-NJ-PA Metro Area     0.088765
53       Los Angeles-Long Beach-Anaheim, CA Metro Area     0.054710
105  Washington-Arlington-Alexandria, DC-VA-MD-WV M...     0.050639
127  Philadelphia-Camden-Wilmington, PA-NJ-DE-MD Me...     0.030443
207       San Francisco-Oakland-Hayward, CA Metro Area     0.030080
13           Boston-Cambridge-Newton, MA-NH Metro Area     0.027409
133            San Juan-Carolina-Caguas, PR Metro Area     0.024845
72              Phoenix-Mesa-Scottsdale, AZ Metro Area     0.022333
67            Orlando-Kissimmee-Sanford, FL Metro Area     0.019979
73      Portland-Vancouver-Hillsboro, OR-WA Metro Area     0.019827
19       Chicago-Naperville-Elgin, IL-IN-WI Metro Area     0.017486
88              Seattle-Tacoma-Bellevue, WA Metro Area     0.017416
8             Baltimore-Columbia-Towson, MD Metr

In [59]:
# Harmonic centrality over the 'distance' edge attribute (1 / flow).
# Background reading: 'Can Harmonic Centrality Be the New PageRank?'
# https://www.searchenginejournal.com/harmonic-centrality-pagerank/283985/#close
harmonic_scores = nx.harmonic_centrality(G, distance='distance')

# One row per node: the node label moves out of the index into a 'Metro'
# column, and the remaining positional row labels are converted to strings.
harm_cent = (
    pd.DataFrame.from_dict(harmonic_scores, orient='index', columns=['Harmonic'])
    .reset_index(level=0)
    .rename(index=str, columns={'index': 'Metro'})
)

harm_ranked = harm_cent.sort_values(by='Harmonic', ascending=False)
print(harm_ranked)

                                                 Metro      Harmonic
109      Outside Metro Area within U.S. or Puerto Rico  1.952652e+06
25          Dallas-Fort Worth-Arlington, TX Metro Area  1.546762e+06
38     Houston-The Woodlands-Sugar Land, TX Metro Area  1.486132e+06
5         Atlanta-Sandy Springs-Roswell, GA Metro Area  1.482701e+06
72              Phoenix-Mesa-Scottsdale, AZ Metro Area  1.452602e+06
60   Minneapolis-St. Paul-Bloomington, MN-WI Metro ...  1.433538e+06
19       Chicago-Naperville-Elgin, IL-IN-WI Metro Area  1.400016e+06
62    New York-Newark-Jersey City, NY-NJ-PA Metro Area  1.386140e+06
88              Seattle-Tacoma-Bellevue, WA Metro Area  1.356478e+06
65                        Oklahoma City, OK Metro Area  1.340853e+06
105  Washington-Arlington-Alexandria, DC-VA-MD-WV M...  1.337395e+06
162                            Columbus, OH Metro Area  1.306825e+06
27               Denver-Aurora-Lakewood, CO Metro Area  1.304166e+06
81                         St. Lou

In [60]:
# Combine the per-node tables (deg_cent, eig_cent, close_cent, bet_cent,
# harm_cent) into one table keyed on 'Metro', then save it as CSV.
nodes = [deg_cent, eig_cent, close_cent, bet_cent, harm_cent]

# Fold the list left to right with outer joins, so a metro that is missing
# from one of the tables still keeps its row.
nodes_merged = nodes[0]
for measure in nodes[1:]:
    nodes_merged = pd.merge(nodes_merged, measure, on=['Metro'], how='outer')

print(nodes_merged)
nodes_merged.to_csv('node_centrality_measures.csv', index=False)

                                                Metro    Degree  Eigenvector  \
0                          Albuquerque, NM Metro Area  1.060453     0.043337   
1                              Abilene, TX Metro Area  0.521411     0.020280   
2        Allentown-Bethlehem-Easton, PA-NJ Metro Area  0.881612     0.020750   
3                             Amarillo, TX Metro Area  0.526448     0.025437   
4                            Anchorage, AK Metro Area  1.173804     0.033434   
5        Atlanta-Sandy Springs-Roswell, GA Metro Area  1.667506     0.172409   
6                    Austin-Round Rock, TX Metro Area  1.347607     0.116817   
7                          Bakersfield, CA Metro Area  0.798489     0.031509   
8            Baltimore-Columbia-Towson, MD Metro Area  1.377834     0.064786   
9                          Baton Rouge, LA Metro Area  0.851385     0.026283   
10                Beaumont-Port Arthur, TX Metro Area  0.594458     0.025477   
11                   Birmingham-Hoover, 

In [64]:
# Edge betweenness centrality; shortest paths are weighted by the 'distance'
# edge attribute (1 / flow).
edge_betweenness_scores = nx.edge_betweenness_centrality(G, weight='distance')

# One row per edge: the edge key moves out of the index into a 'Metros'
# column, and the remaining positional row labels are converted to strings.
edge_bet_cent = (
    pd.DataFrame.from_dict(edge_betweenness_scores, orient='index', columns=['Edge Betweenness'])
    .reset_index(level=0)
    .rename(index=str, columns={'index': 'Metros'})
)

edge_ranked = edge_bet_cent.sort_values(by='Edge Betweenness', ascending=False)
print(edge_ranked)

                                                  Metros  Edge Betweenness
21099  (Outside Metro Area within U.S. or Puerto Rico...          0.054783
21059  (Outside Metro Area within U.S. or Puerto Rico...          0.038891
21216  (Outside Metro Area within U.S. or Puerto Rico...          0.027607
20432  (Washington-Arlington-Alexandria, DC-VA-MD-WV ...          0.023619
10637  (Los Angeles-Long Beach-Anaheim, CA Metro Area...          0.019765
12437  (New York-Newark-Jersey City, NY-NJ-PA Metro A...          0.019221
13360  (Orlando-Kissimmee-Sanford, FL Metro Area, Out...          0.018759
12326  (New York-Newark-Jersey City, NY-NJ-PA Metro A...          0.017550
20891  (Outside Metro Area within U.S. or Puerto Rico...          0.015942
20917  (Outside Metro Area within U.S. or Puerto Rico...          0.014879
21120  (Outside Metro Area within U.S. or Puerto Rico...          0.014835
25625  (San Juan-Carolina-Caguas, PR Metro Area, Orla...          0.014708
10706  (Los Angeles-Long 

In [65]:
# Combine the per-edge tables (currently only edge_bet_cent) into one table
# keyed on 'Metros', then save it as CSV.
links = [edge_bet_cent]

# Fold the list left to right with outer joins. With a single table the loop
# body never runs and the result is that table itself.
remaining_links = iter(links)
links_merged = next(remaining_links)
for measure in remaining_links:
    links_merged = pd.merge(links_merged, measure, on=['Metros'], how='outer')

print(links_merged)
links_merged.to_csv('link_centrality_measures.csv', index=False)

                                                  Metros  Edge Betweenness
0      (Albuquerque, NM Metro Area, Abilene, TX Metro...          0.000000
1      (Albuquerque, NM Metro Area, Aguadilla-Isabela...          0.000000
2      (Albuquerque, NM Metro Area, Albany-Schenectad...          0.000000
3      (Albuquerque, NM Metro Area, Altoona, PA Metro...          0.000000
4      (Albuquerque, NM Metro Area, Amarillo, TX Metr...          0.000000
5      (Albuquerque, NM Metro Area, Anchorage, AK Met...          0.000000
6      (Albuquerque, NM Metro Area, Ann Arbor, MI Met...          0.000000
7      (Albuquerque, NM Metro Area, Appleton, WI Metr...          0.000000
8      (Albuquerque, NM Metro Area, Athens-Clarke Cou...          0.000000
9      (Albuquerque, NM Metro Area, Atlanta-Sandy Spr...          0.000000
10     (Albuquerque, NM Metro Area, Atlantic City-Ham...          0.000000
11     (Albuquerque, NM Metro Area, Auburn-Opelika, A...          0.000000
12     (Albuquerque, NM M