In [2]:
import networkx as nx
import pandas as pd

In [3]:
# Build a lookup from the 'msa' column to an integer 'pcpi' value
# (presumably per-capita personal income -- confirm against the data source).
income_csv = pd.read_csv('../data/income_by_msa_simplified.csv')

# 'pcpi' is read as text containing commas; drop the commas and cast the
# cleaned strings to int in a single pass.
income_csv['pcpi'] = income_csv['pcpi'].str.replace(',', '').astype(int)

# msa -> pcpi mapping used when weighting the graph edges.
income_by_msa = income_csv.set_index('msa')['pcpi'].to_dict()

In [4]:
# Build a directed graph with one edge per (previous place -> current place)
# row of the metro-to-metro workbook, weighted by the source's income figure
# multiplied by the row's count.

# Labels that appear in the place columns but are not individual metro areas;
# any row that mentions one of them is dropped.
non_cities = [
    'Outside Metro Area within U.S. or Puerto Rico',
    'Africa',
    'Asia',
    'Central America',
    'Caribbean',
    'Europe',
    'U.S. Island Areas',
    'Northern America',
    'Oceania and At Sea',
    'South America',
]
G = nx.DiGraph()

x = pd.ExcelFile('../data/metro-to-metro-2011-2015.xlsx')
print(x.sheet_names)

# Auto-generated column labels of the parsed sheet.
# NOTE(review): presumably previous-residence name, current-residence name and
# the flow estimate respectively -- confirm against the workbook header rows.
previous = 'Unnamed: 15'
current = 'Unnamed: 2'
count = 'Unnamed: 26'

# Rows whose count is below this value are ignored (0 keeps every row).
thresh = 0

for state in x.sheet_names:
    df = x.parse(state)

    for index, row in df.iterrows():
        # rows 0-2 are header rows
        if index < 3:
            continue
        # everything from row 53724 onwards is footer material
        if index >= 53724:
            break

        if int(row[count]) < thresh:
            continue

        origin = row[previous]
        destination = row[current]

        # exclude non-cities on either end of the flow
        if origin in non_cities or destination in non_cities:
            continue

        # The income table is keyed by the name without its last 11 characters
        # (the ' Metro Area' suffix visible in the node names).
        source_income = income_by_msa.get(origin[:-11])

        # no income figure is available for many places in PR
        if source_income is None:
            continue

        # compute weight
        source_count = row[count]
        weight = source_income * source_count

        G.add_edge(origin, destination, weight=weight)

print(len(G.nodes()))
print(len(G.edges()))

['Metro-to-Metro 2011-2015']
388
49521


In [5]:
# pagerank
# Rank every node of G; networkx uses the 'weight' edge attribute by default.
# max_iter is raised to 1000 to give the power iteration more room to converge.
pr = nx.pagerank(G, max_iter=1000)

# One row per metro, highest PageRank first.
pr_df = (
    pd.DataFrame.from_dict(pr, orient='index', columns=['pr'])
    .reset_index()
    .rename(columns={'index': 'metro'})
    .sort_values(by='pr', ascending=False)
)
print(pr_df)
pr_df.to_csv('../results/weighted_income_pagerank.csv', index=False)

                                                 metro        pr
53       Los Angeles-Long Beach-Anaheim, CA Metro Area  0.021176
62    New York-Newark-Jersey City, NY-NJ-PA Metro Area  0.019864
25          Dallas-Fort Worth-Arlington, TX Metro Area  0.019688
105  Washington-Arlington-Alexandria, DC-VA-MD-WV M...  0.018384
38     Houston-The Woodlands-Sugar Land, TX Metro Area  0.017473
5         Atlanta-Sandy Springs-Roswell, GA Metro Area  0.017056
19       Chicago-Naperville-Elgin, IL-IN-WI Metro Area  0.015999
72              Phoenix-Mesa-Scottsdale, AZ Metro Area  0.015701
88              Seattle-Tacoma-Bellevue, WA Metro Area  0.013660
79     Riverside-San Bernardino-Ontario, CA Metro Area  0.013246
192       San Francisco-Oakland-Hayward, CA Metro Area  0.013170
58   Miami-Fort Lauderdale-West Palm Beach, FL Metr...  0.012076
27               Denver-Aurora-Lakewood, CO Metro Area  0.011797
117  Philadelphia-Camden-Wilmington, PA-NJ-DE-MD Me...  0.011485
95      Tampa-St. Petersb

In [8]:
# edge betweenness centrality
# use distance = 1/flow: shortest-path algorithms treat the edge attribute as a
# cost, so a heavier flow must translate into a shorter distance.
# An edge whose weight is exactly 0 gets an infinite distance instead of
# raising ZeroDivisionError, so it is never preferred on a shortest path.
g_distance_dict = {
    (e1, e2): (1 / weight if weight != 0 else float('inf'))
    for e1, e2, weight in G.edges(data='weight')
}
nx.set_edge_attributes(G, g_distance_dict, 'distance')

# Maps each (source, target) edge tuple to its betweenness score.
edge_betweenness = nx.edge_betweenness_centrality(G, weight='distance')

# Move the edge tuples out of the index into a 'Metros' column; the remaining
# row labels are converted to strings, as before.
edge_bet_cent = (
    pd.DataFrame.from_dict(edge_betweenness, orient='index', columns=['Edge Betweenness'])
    .reset_index(level=0)
    .rename(index=str, columns={'index': 'Metros'})
)

# Keep only edges that lie on at least one shortest path, most central first.
edge_bet_cent = edge_bet_cent[edge_bet_cent['Edge Betweenness'] > 0]
edge_bet_cent = edge_bet_cent.sort_values(by='Edge Betweenness', ascending=False)
print(edge_bet_cent)
edge_bet_cent.to_csv('../results/weighted_income_edge_centrality.csv', index=False)

                                                  Metros  Edge Betweenness
10527  (Los Angeles-Long Beach-Anaheim, CA Metro Area...          0.107561
12076  (New York-Newark-Jersey City, NY-NJ-PA Metro A...          0.091911
3947   (Chicago-Naperville-Elgin, IL-IN-WI Metro Area...          0.084452
12207  (New York-Newark-Jersey City, NY-NJ-PA Metro A...          0.062975
17520  (Seattle-Tacoma-Bellevue, WA Metro Area, Los A...          0.059844
1076   (Atlanta-Sandy Springs-Roswell, GA Metro Area,...          0.058359
12356  (New York-Newark-Jersey City, NY-NJ-PA Metro A...          0.054297
22037  (Philadelphia-Camden-Wilmington, PA-NJ-DE-MD M...          0.053598
20203  (Washington-Arlington-Alexandria, DC-VA-MD-WV ...          0.053105
11460  (Miami-Fort Lauderdale-West Palm Beach, FL Met...          0.045679
12264  (New York-Newark-Jersey City, NY-NJ-PA Metro A...          0.042489
3911   (Chicago-Naperville-Elgin, IL-IN-WI Metro Area...          0.041270
11701  (Minneapolis-St. P