In [1]:
# The codes for race are:
# 01 = White alone,
# 02 = Black or African American alone,
# 03 = Asian alone,
# 04 = Some other race alone or Two or more races.

In [2]:
import networkx as nx
import pandas as pd
from functools import reduce

In [3]:
def load_data(path='../data/metro-to-metro-by-race-2011-2015.xlsx', thresh=0):
    """Build one directed, weighted flow graph per race code from the workbook.

    Every sheet of the workbook is scanned row by row. A data row contributes
    an edge ``previous -> current`` with ``weight`` equal to its integer count
    to the graph matching the row's race code ('01', '02', '03' or '04').
    Rows with any other race code are ignored.

    Parameters
    ----------
    path : str
        Location of the Excel workbook. Defaults to the path this notebook
        has always read from.
    thresh : int
        Rows whose count is strictly below this value are skipped. The
        default of 0 keeps every non-negative count.

    Returns
    -------
    list of networkx.DiGraph
        Four graphs in race-code order: ['01', '02', '03', '04'].

    Raises
    ------
    ValueError, TypeError
        If a scanned row's count cell cannot be converted with ``int()``
        (for example a blank cell inside the data range).
    """
    # Origins/destinations that are not individual metro areas; any row that
    # touches one of them is dropped.
    non_cities = frozenset([
        'Outside Metro Area within U.S. or Puerto Rico',
        'Africa',
        'Asia',
        'Central America',
        'Caribbean',
        'Europe',
        'U.S. Island Areas',
        'Northern America',
        'Oceania and At Sea',
        'South America',
    ])

    # Column labels as pandas auto-names them when the sheet is parsed.
    race = 'Unnamed: 2'
    previous = 'Unnamed: 16'
    current = 'Unnamed: 3'
    count = 'Unnamed: 27'

    # Row-index window that holds data: the first three rows are headers and
    # everything from index 54971 on is footer text.
    # NOTE(review): the footer cut-off is specific to this one workbook layout
    # and is applied to every sheet -- confirm before reusing on other files.
    header_rows = 3
    footer_start = 54971

    # One graph per race code; insertion order defines the return order.
    graphs = {code: nx.DiGraph() for code in ('01', '02', '03', '04')}

    # The context manager closes the workbook handle even if parsing fails.
    with pd.ExcelFile(path) as x:
        print(x.sheet_names)

        for state in x.sheet_names:
            df = x.parse(state)

            for index, row in df.iterrows():
                if index < header_rows:
                    continue
                if index >= footer_start:
                    break

                flow = int(row[count])
                if flow < thresh:
                    continue

                if row[previous] in non_cities or row[current] in non_cities:
                    continue

                graph = graphs.get(row[race])
                # Must compare against None: an empty DiGraph is falsy.
                if graph is not None:
                    graph.add_edge(row[previous], row[current], weight=flow)

    return list(graphs.values())

In [4]:
race_graphs = load_data()

# Sanity check: one "<nodes> , <edges>" line per race graph, in load order.
for g in race_graphs:
    print("{} , {}".format(g.number_of_nodes(), g.number_of_edges()))

['Metro-to-Metro by Race 2011-15']
388 , 21827
387 , 10129
385 , 6284
388 , 8887


In [5]:
def pagerank(filename, graph):
    """Return a PageRank table for ``graph``, sorted from highest score down.

    The result has three columns: ``metro`` (the node), ``<filename>_value``
    (the score returned by ``nx.pagerank``) and ``<filename>_rank`` (1 for the
    highest score, then 2, 3, ... following the sorted row order).
    """
    value_col = filename + '_value'
    rank_col = filename + '_rank'

    # max_iter is raised to 1000 for the power iteration.
    scores = nx.pagerank(graph, max_iter=1000)

    # Nodes arrive as the index; move them into a regular 'metro' column.
    table = pd.DataFrame.from_dict(scores, orient='index', columns=[value_col])
    table = table.reset_index()
    table = table.rename(columns={'index': 'metro'})

    # After sorting, the fresh 0-based index is the position in the ranking.
    ranked = table.sort_values(by=value_col, ascending=False)
    ranked = ranked.reset_index(drop=True)

    return ranked.assign(**{rank_col: ranked.index + 1})

In [6]:
filenames = ['white', 'black', 'asian', 'other']

# One PageRank table per race graph; labels and graphs are paired by position.
pr_dfs = [pagerank(label, graph) for label, graph in zip(filenames, race_graphs)]

# Fold the tables together left to right on the shared 'metro' column.
# The outer join keeps metros that are missing from some of the graphs.
pr_merged = reduce(
    lambda merged, nxt: pd.merge(merged, nxt, on=['metro'], how='outer'),
    pr_dfs,
)
print(pr_merged)

                                                 metro  white_value  \
0           Dallas-Fort Worth-Arlington, TX Metro Area     0.022978   
1     New York-Newark-Jersey City, NY-NJ-PA Metro Area     0.021955   
2    Washington-Arlington-Alexandria, DC-VA-MD-WV M...     0.021485   
3        Los Angeles-Long Beach-Anaheim, CA Metro Area     0.019803   
4      Houston-The Woodlands-Sugar Land, TX Metro Area     0.018904   
5        Chicago-Naperville-Elgin, IL-IN-WI Metro Area     0.018348   
6               Phoenix-Mesa-Scottsdale, AZ Metro Area     0.017531   
7         Atlanta-Sandy Springs-Roswell, GA Metro Area     0.016062   
8                Denver-Aurora-Lakewood, CO Metro Area     0.014545   
9    Miami-Fort Lauderdale-West Palm Beach, FL Metr...     0.014074   
10              Seattle-Tacoma-Bellevue, WA Metro Area     0.013586   
11   Philadelphia-Camden-Wilmington, PA-NJ-DE-MD Me...     0.013121   
12     Riverside-San Bernardino-Ontario, CA Metro Area     0.012585   
13    

In [7]:
# Save the merged per-race PageRank table; the row index is not written.
pr_merged.to_csv(
    path_or_buf='../results/race_pagerank.csv',
    index=False,
)