In [12]:
# The codes for Hispanic origin are: 
# 01 = White alone, not Hispanic or Latino, 
# 02 = Other race, not Hispanic or Latino,
# 03 = Hispanic or Latino.

In [13]:
import networkx as nx
import pandas as pd
from functools import reduce

In [16]:
def load_data(path='../data/metro-to-metro-by-hispanic-origin-2011-2015.xlsx',
              thresh=0, footer_start=51533):
    """Build one directed graph per Hispanic-origin code from an Excel workbook.

    Every sheet of the workbook at ``path`` is parsed and walked row by row.
    For each data row an edge ``previous -> current`` carrying the row's count
    as ``weight`` is added to the graph that matches the row's origin code
    ('01', '02' or '03'). Rows with any other code add nothing.

    Parameters
    ----------
    path : str
        Location of the workbook. Defaults to the file this notebook uses.
    thresh : int
        Rows whose count is strictly below this value are skipped.
    footer_start : int
        First row index treated as footer; scanning of a sheet stops there.

    Returns
    -------
    list of nx.DiGraph
        ``[G_01, G_02, G_03]``, in origin-code order.

    Notes
    -----
    The sheet names are printed as a side effect. ``int()`` is applied to the
    count of every row between the header and the footer, so a non-numeric
    count in that range raises.
    """
    non_cities = {
        'Outside Metro Area within U.S. or Puerto Rico', 'Africa', 'Asia',
        'Central America', 'Caribbean', 'Europe', 'U.S. Island Areas',
        'Northern America', 'Oceania and At Sea', 'South America',
    }

    # One graph per origin code; dict insertion order fixes the return order.
    graphs = {code: nx.DiGraph() for code in ('01', '02', '03')}

    # Positional column labels assigned by pandas to the sheet.
    # NOTE(review): the meaning of each column is taken from the variable
    # names used here -- confirm against the workbook layout.
    hispanic = 'Unnamed: 2'
    current = 'Unnamed: 3'
    previous = 'Unnamed: 16'
    count = 'Unnamed: 27'

    header_rows = 3  # leading rows that hold headers, not data

    # The context manager closes the workbook handle even if parsing fails.
    with pd.ExcelFile(path) as x:
        print(x.sheet_names)

        for state in x.sheet_names:
            df = x.parse(state)

            for index, row in df.iterrows():
                # skip the header rows
                if index < header_rows:
                    continue
                # stop at the footer rows
                if index >= footer_start:
                    break

                weight = int(row[count])
                if weight < thresh:
                    continue

                # exclude non-cities
                if row[previous] in non_cities or row[current] in non_cities:
                    continue

                # route the edge to the graph for this row's origin code
                graph = graphs.get(row[hispanic])
                if graph is not None:
                    graph.add_edge(row[previous], row[current], weight=weight)

    return list(graphs.values())

In [17]:
# Build the per-origin graphs and report "<node count> , <edge count>" for each.
hisp_graphs = load_data()
for g in hisp_graphs:
    print("{} , {}".format(len(g.nodes()), len(g.edges())))

['Metro-to-Metro by Hispanic Orig']
388 , 21316
388 , 13607
388 , 9938


In [19]:
def pagerank(filename, graph):
    """Return a PageRank table for ``graph`` with columns prefixed by ``filename``.

    The result has one row per node and three columns: ``metro`` (the node),
    ``<filename>_value`` (the score from ``nx.pagerank`` with ``max_iter=1000``)
    and ``<filename>_rank`` (1-based position after sorting by score, highest
    first).
    """
    value_col = filename + '_value'
    rank_col = filename + '_rank'

    scores = nx.pagerank(graph, max_iter=1000)

    # Nodes arrive as the index; move them into a regular 'metro' column.
    table = pd.DataFrame.from_dict(scores, orient='index', columns=[value_col])
    table = table.reset_index()
    table = table.rename(columns={'index': 'metro'})

    # Highest score first, then renumber so the index reflects the new order.
    table = table.sort_values(by=value_col, ascending=False)
    table = table.reset_index(drop=True)

    # Index is 0-based after the reset; ranks start at 1.
    table[rank_col] = table.index + 1

    return table

In [20]:
# Column-name prefixes, paired positionally with the graphs in hisp_graphs.
filenames = ['white_nonhisp', 'other_nonhisp', 'hisp']

# One PageRank table per (prefix, graph) pair.
pr_dfs = [pagerank(prefix, graph) for prefix, graph in zip(filenames, hisp_graphs)]

# Outer-join the tables on 'metro' so a metro missing from one table is kept.
pr_merged = reduce(
    lambda left, right: left.merge(right, on=['metro'], how='outer'),
    pr_dfs,
)
print(pr_merged)

                                                 metro  white_nonhisp_value  \
0    Washington-Arlington-Alexandria, DC-VA-MD-WV M...             0.022388   
1     New York-Newark-Jersey City, NY-NJ-PA Metro Area             0.021902   
2           Dallas-Fort Worth-Arlington, TX Metro Area             0.021891   
3        Chicago-Naperville-Elgin, IL-IN-WI Metro Area             0.018698   
4        Los Angeles-Long Beach-Anaheim, CA Metro Area             0.018083   
5               Phoenix-Mesa-Scottsdale, AZ Metro Area             0.017331   
6      Houston-The Woodlands-Sugar Land, TX Metro Area             0.016518   
7         Atlanta-Sandy Springs-Roswell, GA Metro Area             0.016495   
8                Denver-Aurora-Lakewood, CO Metro Area             0.015270   
9               Seattle-Tacoma-Bellevue, WA Metro Area             0.014186   
10   Philadelphia-Camden-Wilmington, PA-NJ-DE-MD Me...             0.013511   
11           Boston-Cambridge-Newton, MA-NH Metro Ar

In [21]:
# Write the merged PageRank table to disk without the row index.
pr_merged.to_csv(path_or_buf='../results/hisp_pagerank.csv', index=False)