In [33]:
# Note: any measure that relies on shortest paths should use the 'distance' edge attribute (1/flow)
# as its edge cost (passed as `distance=` or `weight=`, depending on the networkx function),
# since more people moving along a path should make that path easier to traverse.

In [34]:
import networkx as nx
import pandas as pd
from functools import reduce

In [35]:
# Labels that appear in the metro columns but are not metro areas; rows whose
# previous or current location is one of these are excluded from the graph.
non_cities = [
    'Outside Metro Area within U.S. or Puerto Rico',
    'Africa',
    'Asia',
    'Central America',
    'Caribbean',
    'Europe',
    'U.S. Island Areas',
    'Northern America',
    'Oceania and At Sea',
    'South America',
]

In [36]:
# Build a directed graph with one edge per (previous location -> current location)
# row of the spreadsheet, weighted by that row's mover count.
G = nx.DiGraph()

# Column labels as they come out of pandas' parse of the sheet.
previous = 'Unnamed: 15'  # edge source
current = 'Unnamed: 2'    # edge target
count = 'Unnamed: 26'     # flow, stored as the edge 'weight'

thresh = 0
migrate = []

# Row positions that bound the data rows in each sheet: the first three rows
# are header rows and everything from FOOTER_START on is footer.
HEADER_ROWS = 3
FOOTER_START = 53724

# Set membership is equivalent to the list lookup and avoids rescanning the list per row.
non_city_names = set(non_cities)

# The context manager closes the workbook handle once every sheet has been read.
with pd.ExcelFile('../data/metro-to-metro-2011-2015.xlsx') as x:
    print(x.sheet_names)

    for state in x.sheet_names:
        df = x.parse(state)

        # parse() yields a default 0..n-1 index, so slicing by position skips
        # the same header/footer rows as comparing the index label would.
        for index, row in df.iloc[HEADER_ROWS:FOOTER_START].iterrows():
            flow = int(row[count])

            # Drop flows below the threshold. Non-positive flows are dropped
            # too: later cells derive distance = 1 / weight, which is
            # undefined for a zero flow.
            if flow < thresh or flow <= 0:
                continue

            # exclude non-cities
            if row[previous] in non_city_names or row[current] in non_city_names:
                continue

            migrate.append(row[count])
            G.add_edge(row[previous], row[current], weight=flow)

['Metro-to-Metro 2011-2015']


In [37]:
# Size of the graph: number of nodes, then number of directed edges.
print(G.number_of_nodes())
print(G.number_of_edges())

388
50171


In [38]:
# Degree centrality (edge weights are not used).
# Result: one row per metro, sorted from highest to lowest score, with a 1-based rank.
deg_cent = (
    pd.DataFrame.from_dict(
        nx.degree_centrality(G),
        orient='index',
        columns=['Degree'],
    )
    .reset_index(level=0)
    .rename(index=str, columns={'index': 'Metro'})
    .sort_values(by='Degree', ascending=False)
    .reset_index(drop=True)
)
deg_cent['Degree Rank'] = deg_cent.index + 1
print(deg_cent)

                                                 Metro    Degree  Degree Rank
0     New York-Newark-Jersey City, NY-NJ-PA Metro Area  1.806202            1
1        Chicago-Naperville-Elgin, IL-IN-WI Metro Area  1.803618            2
2           Dallas-Fort Worth-Arlington, TX Metro Area  1.790698            3
3    Washington-Arlington-Alexandria, DC-VA-MD-WV M...  1.788114            4
4        Los Angeles-Long Beach-Anaheim, CA Metro Area  1.767442            5
5               Phoenix-Mesa-Scottsdale, AZ Metro Area  1.723514            6
6         Atlanta-Sandy Springs-Roswell, GA Metro Area  1.682171            7
7      Houston-The Woodlands-Sugar Land, TX Metro Area  1.630491            8
8       Tampa-St. Petersburg-Clearwater, FL Metro Area  1.627907            9
9               Seattle-Tacoma-Bellevue, WA Metro Area  1.604651           10
10   Virginia Beach-Norfolk-Newport News, VA-NC Met...  1.591731           11
11   Miami-Fort Lauderdale-West Palm Beach, FL Metr...  1.591731

In [39]:
# Eigenvector centrality, weighted by the edge 'weight' attribute (the flow).
# Result: one row per metro, sorted from highest to lowest score, with a 1-based rank.
eig_cent = (
    pd.DataFrame.from_dict(
        nx.eigenvector_centrality(G, weight='weight'),
        orient='index',
        columns=['Eigenvector'],
    )
    .reset_index(level=0)
    .rename(index=str, columns={'index': 'Metro'})
    .sort_values(by='Eigenvector', ascending=False)
    .reset_index(drop=True)
)
eig_cent['Eigenvector Rank'] = eig_cent.index + 1
print(eig_cent)

                                                 Metro  Eigenvector  \
0      Riverside-San Bernardino-Ontario, CA Metro Area     0.496242   
1        Los Angeles-Long Beach-Anaheim, CA Metro Area     0.466785   
2         San Francisco-Oakland-Hayward, CA Metro Area     0.222380   
3                    San Diego-Carlsbad, CA Metro Area     0.218058   
4     New York-Newark-Jersey City, NY-NJ-PA Metro Area     0.211017   
5    Washington-Arlington-Alexandria, DC-VA-MD-WV M...     0.172725   
6           Dallas-Fort Worth-Arlington, TX Metro Area     0.164204   
7               Phoenix-Mesa-Scottsdale, AZ Metro Area     0.160793   
8          Las Vegas-Henderson-Paradise, NV Metro Area     0.146033   
9      Houston-The Woodlands-Sugar Land, TX Metro Area     0.145504   
10   Miami-Fort Lauderdale-West Palm Beach, FL Metr...     0.129009   
11              Seattle-Tacoma-Bellevue, WA Metro Area     0.127136   
12   Philadelphia-Camden-Wilmington, PA-NJ-DE-MD Me...     0.121874   
13    

In [40]:
# Closeness centrality, with path length measured in 'distance' = 1/flow.
#
# When closeness_centrality's `distance` keyword names an edge attribute, shortest-path
# lengths are computed with Dijkstra's algorithm using that attribute as the edge weight.

# Attach distance = 1 / weight to every edge, keyed by the (source, target) pair.
g_distance_dict = {
    edge: 1 / flow
    for edge, flow in nx.get_edge_attributes(G, 'weight').items()
}
nx.set_edge_attributes(G, g_distance_dict, 'distance')

# Result: one row per metro, sorted from highest to lowest score, with a 1-based rank.
close_cent = (
    pd.DataFrame.from_dict(
        nx.closeness_centrality(G, distance='distance'),
        orient='index',
        columns=['Closeness'],
    )
    .reset_index(level=0)
    .rename(index=str, columns={'index': 'Metro'})
    .sort_values(by='Closeness', ascending=False)
    .reset_index(drop=True)
)
close_cent['Closeness Rank'] = close_cent.index + 1
print(close_cent)

                                                 Metro   Closeness  \
0     New York-Newark-Jersey City, NY-NJ-PA Metro Area  990.902186   
1        Chicago-Naperville-Elgin, IL-IN-WI Metro Area  946.383097   
2        Los Angeles-Long Beach-Anaheim, CA Metro Area  945.949495   
3    Philadelphia-Camden-Wilmington, PA-NJ-DE-MD Me...  943.173280   
4      Riverside-San Bernardino-Ontario, CA Metro Area  928.528250   
5    Washington-Arlington-Alexandria, DC-VA-MD-WV M...  910.323062   
6    Miami-Fort Lauderdale-West Palm Beach, FL Metr...  910.238983   
7            Boston-Cambridge-Newton, MA-NH Metro Area  909.497612   
8         San Francisco-Oakland-Hayward, CA Metro Area  893.762466   
9                    San Diego-Carlsbad, CA Metro Area  893.478299   
10                              Trenton, NJ Metro Area  888.413338   
11        Atlanta-Sandy Springs-Roswell, GA Metro Area  886.918418   
12            Baltimore-Columbia-Towson, MD Metro Area  874.406156   
13       San Jose-Su

In [41]:
# Betweenness centrality, with shortest paths weighted by the 'distance' edge attribute (1/flow).
# Result: one row per metro, sorted from highest to lowest score, with a 1-based rank.
bet_cent = (
    pd.DataFrame.from_dict(
        nx.betweenness_centrality(G, weight='distance'),
        orient='index',
        columns=['Betweenness'],
    )
    .reset_index(level=0)
    .rename(index=str, columns={'index': 'Metro'})
    .sort_values(by='Betweenness', ascending=False)
    .reset_index(drop=True)
)
bet_cent['Betweenness Rank'] = bet_cent.index + 1
print(bet_cent)

                                                 Metro  Betweenness  \
0     New York-Newark-Jersey City, NY-NJ-PA Metro Area     0.511233   
1        Chicago-Naperville-Elgin, IL-IN-WI Metro Area     0.378024   
2        Los Angeles-Long Beach-Anaheim, CA Metro Area     0.289366   
3         Atlanta-Sandy Springs-Roswell, GA Metro Area     0.192948   
4           Dallas-Fort Worth-Arlington, TX Metro Area     0.160910   
5    Washington-Arlington-Alexandria, DC-VA-MD-WV M...     0.103895   
6               Seattle-Tacoma-Bellevue, WA Metro Area     0.102676   
7    Philadelphia-Camden-Wilmington, PA-NJ-DE-MD Me...     0.093552   
8      Houston-The Woodlands-Sugar Land, TX Metro Area     0.088980   
9               Phoenix-Mesa-Scottsdale, AZ Metro Area     0.074547   
10   Miami-Fort Lauderdale-West Palm Beach, FL Metr...     0.071501   
11   Minneapolis-St. Paul-Bloomington, MN-WI Metro ...     0.064298   
12        San Francisco-Oakland-Hayward, CA Metro Area     0.063890   
13    

In [42]:
# Harmonic centrality, with path length measured by the 'distance' edge attribute (1/flow).
# Background reading: 'Can Harmonic Centrality Be the New PageRank?'
# https://www.searchenginejournal.com/harmonic-centrality-pagerank/283985/#close
# Result: one row per metro, sorted from highest to lowest score, with a 1-based rank.
harm_cent = (
    pd.DataFrame.from_dict(
        nx.harmonic_centrality(G, distance='distance'),
        orient='index',
        columns=['Harmonic'],
    )
    .reset_index(level=0)
    .rename(index=str, columns={'index': 'Metro'})
    .sort_values(by='Harmonic', ascending=False)
    .reset_index(drop=True)
)
harm_cent['Harmonic Rank'] = harm_cent.index + 1
print(harm_cent)

                                                 Metro       Harmonic  \
0      Riverside-San Bernardino-Ontario, CA Metro Area  649621.621497   
1        Los Angeles-Long Beach-Anaheim, CA Metro Area  629291.037806   
2     New York-Newark-Jersey City, NY-NJ-PA Metro Area  588508.725168   
3    Miami-Fort Lauderdale-West Palm Beach, FL Metr...  555646.044663   
4         San Francisco-Oakland-Hayward, CA Metro Area  554782.004680   
5    Philadelphia-Camden-Wilmington, PA-NJ-DE-MD Me...  553107.398614   
6    Washington-Arlington-Alexandria, DC-VA-MD-WV M...  541491.703264   
7           Dallas-Fort Worth-Arlington, TX Metro Area  525353.382035   
8            Boston-Cambridge-Newton, MA-NH Metro Area  524881.005405   
9                    San Diego-Carlsbad, CA Metro Area  518320.023672   
10        Atlanta-Sandy Springs-Roswell, GA Metro Area  513106.319430   
11     Houston-The Woodlands-Sugar Land, TX Metro Area  510921.828559   
12            Orlando-Kissimmee-Sanford, FL Metro A

In [43]:
# Combine every per-node measure into one table, outer-joined on 'Metro',
# then print it and write it to the results directory.
nodes = [deg_cent, eig_cent, close_cent, bet_cent, harm_cent]
nodes_merged = reduce(
    lambda merged, measure: merged.merge(measure, on=['Metro'], how='outer'),
    nodes,
)
print(nodes_merged)
nodes_merged.to_csv('../results/node_centrality_measures.csv', index=False)

                                                 Metro    Degree  Degree Rank  \
0     New York-Newark-Jersey City, NY-NJ-PA Metro Area  1.806202            1   
1        Chicago-Naperville-Elgin, IL-IN-WI Metro Area  1.803618            2   
2           Dallas-Fort Worth-Arlington, TX Metro Area  1.790698            3   
3    Washington-Arlington-Alexandria, DC-VA-MD-WV M...  1.788114            4   
4        Los Angeles-Long Beach-Anaheim, CA Metro Area  1.767442            5   
5               Phoenix-Mesa-Scottsdale, AZ Metro Area  1.723514            6   
6         Atlanta-Sandy Springs-Roswell, GA Metro Area  1.682171            7   
7      Houston-The Woodlands-Sugar Land, TX Metro Area  1.630491            8   
8       Tampa-St. Petersburg-Clearwater, FL Metro Area  1.627907            9   
9               Seattle-Tacoma-Bellevue, WA Metro Area  1.604651           10   
10   Virginia Beach-Norfolk-Newport News, VA-NC Met...  1.591731           11   
11   Miami-Fort Lauderdale-W

In [44]:
# Edge betweenness centrality, with shortest paths weighted by the 'distance' edge attribute (1/flow).
# The table itself is left in its original order; only the printed view is sorted.
edge_bet_cent = (
    pd.DataFrame.from_dict(
        nx.edge_betweenness_centrality(G, weight='distance'),
        orient='index',
        columns=['Edge Betweenness'],
    )
    .reset_index(level=0)
    .rename(index=str, columns={'index': 'Metros'})
)
print(edge_bet_cent.sort_values(by='Edge Betweenness', ascending=False))

                                                  Metros  Edge Betweenness
10527  (Los Angeles-Long Beach-Anaheim, CA Metro Area...          0.078958
3947   (Chicago-Naperville-Elgin, IL-IN-WI Metro Area...          0.078192
12207  (New York-Newark-Jersey City, NY-NJ-PA Metro A...          0.071379
17520  (Seattle-Tacoma-Bellevue, WA Metro Area, Los A...          0.059718
12076  (New York-Newark-Jersey City, NY-NJ-PA Metro A...          0.055682
22238  (Philadelphia-Camden-Wilmington, PA-NJ-DE-MD M...          0.052499
1076   (Atlanta-Sandy Springs-Roswell, GA Metro Area,...          0.051393
12356  (New York-Newark-Jersey City, NY-NJ-PA Metro A...          0.045599
20203  (Washington-Arlington-Alexandria, DC-VA-MD-WV ...          0.045273
12264  (New York-Newark-Jersey City, NY-NJ-PA Metro A...          0.043262
11460  (Miami-Fort Lauderdale-West Palm Beach, FL Met...          0.043148
12030  (New York-Newark-Jersey City, NY-NJ-PA Metro A...          0.040751
3911   (Chicago-Napervill

In [45]:
# Combine every per-link measure into one table, outer-joined on 'Metros'
# (currently only edge betweenness), then print it and write it to the results directory.
links = [edge_bet_cent]
links_merged = reduce(
    lambda merged, measure: merged.merge(measure, on=['Metros'], how='outer'),
    links,
)
print(links_merged)
links_merged.to_csv('../results/link_centrality_measures.csv', index=False)

                                                  Metros  Edge Betweenness
0      (Albuquerque, NM Metro Area, Abilene, TX Metro...               0.0
1      (Albuquerque, NM Metro Area, Aguadilla-Isabela...               0.0
2      (Albuquerque, NM Metro Area, Albany-Schenectad...               0.0
3      (Albuquerque, NM Metro Area, Altoona, PA Metro...               0.0
4      (Albuquerque, NM Metro Area, Amarillo, TX Metr...               0.0
5      (Albuquerque, NM Metro Area, Anchorage, AK Met...               0.0
6      (Albuquerque, NM Metro Area, Ann Arbor, MI Met...               0.0
7      (Albuquerque, NM Metro Area, Appleton, WI Metr...               0.0
8      (Albuquerque, NM Metro Area, Athens-Clarke Cou...               0.0
9      (Albuquerque, NM Metro Area, Atlanta-Sandy Spr...               0.0
10     (Albuquerque, NM Metro Area, Atlantic City-Ham...               0.0
11     (Albuquerque, NM Metro Area, Auburn-Opelika, A...               0.0
12     (Albuquerque, NM M