In [1]:
# The codes for sex are:
# 01 = Male,
# 02 = Female.

In [2]:
import networkx as nx
import pandas as pd
from functools import reduce

In [3]:
def load_data(path='../data/metro-to-metro-by-sex-2011-2015.xlsx', thresh=0):
    """Build one directed, weighted migration graph per sex code.

    Every sheet of the workbook is parsed and walked row by row. For each
    kept row an edge ``previous -> current`` is added to the graph that
    matches the row's sex code ('01' = Male, '02' = Female), with the
    row's count stored as the edge attribute ``weight``.

    The sheet names are printed once as a quick sanity check.

    Parameters
    ----------
    path : str, optional
        Location of the Excel workbook. Defaults to the file the notebook
        was originally written against.
    thresh : int, optional
        Rows whose count is strictly below this value are skipped.
        The default of 0 keeps every non-negative count.

    Returns
    -------
    list
        ``[G_01, G_02]`` -- two ``nx.DiGraph`` objects, for sex codes
        '01' and '02' respectively.

    Raises
    ------
    ValueError, TypeError
        Propagated from ``int(...)`` if a data row's count cell cannot be
        converted to an integer.
    """
    # Places that are not metro areas; a row naming one of them at either
    # end of the move is dropped.
    non_cities = frozenset([
        'Outside Metro Area within U.S. or Puerto Rico',
        'Africa',
        'Asia',
        'Central America',
        'Caribbean',
        'Europe',
        'U.S. Island Areas',
        'Northern America',
        'Oceania and At Sea',
        'South America',
    ])

    # Column labels of the parsed sheet that hold each field.
    # NOTE(review): these positional 'Unnamed: N' labels are tied to this
    # particular workbook layout -- confirm before pointing `path` elsewhere.
    gender = 'Unnamed: 2'
    previous = 'Unnamed: 16'
    current = 'Unnamed: 3'
    count = 'Unnamed: 27'

    # Row-index window that contains data: rows before `header_rows` are
    # header rows, rows from `footer_start` onwards are footer rows.
    header_rows = 3
    footer_start = 47521

    G_01 = nx.DiGraph()
    G_02 = nx.DiGraph()

    # The context manager guarantees the workbook handle is closed even if
    # parsing raises part-way through.
    with pd.ExcelFile(path) as x:
        print(x.sheet_names)

        for state in x.sheet_names:
            df = x.parse(state)

            for index, row in df.iterrows():
                if index < header_rows:
                    continue
                # Footer reached: stop this sheet (the next sheet, if any,
                # is still processed).
                if index >= footer_start:
                    break

                flow = int(row[count])
                if flow < thresh:
                    continue

                # exclude non-cities
                if row[previous] in non_cities or row[current] in non_cities:
                    continue

                # Route the edge to the graph for this row's sex code; rows
                # with any other code are ignored.
                if row[gender] == '01':
                    G_01.add_edge(row[previous], row[current], weight=flow)
                elif row[gender] == '02':
                    G_02.add_edge(row[previous], row[current], weight=flow)

    return [G_01, G_02]

In [4]:
# Build the per-sex migration graphs, then report "<nodes> , <edges>" for each.
gender_graphs = load_data()
for g in gender_graphs:
    print(f"{g.number_of_nodes()} , {g.number_of_edges()}")

['Metro-to-Metro by Sex 2011-2015']
388 , 21365
388 , 20764


In [5]:
def pagerank(filename, graph):
    """Return a PageRank table for ``graph``, ordered from highest score down.

    Parameters
    ----------
    filename : str
        Prefix used to name the two result columns:
        ``<filename>_value`` and ``<filename>_rank``.
    graph : networkx graph
        Graph passed to ``nx.pagerank`` (run with ``max_iter=1000``).

    Returns
    -------
    pandas.DataFrame
        Columns ``metro`` (the node), ``<filename>_value`` (its PageRank
        score) and ``<filename>_rank`` (1 for the highest score, counting
        upwards), sorted by descending score.
    """
    value_col = filename + '_value'
    rank_col = filename + '_rank'

    scores = nx.pagerank(graph, max_iter=1000)

    # One row per node; the node label starts as the index and is then
    # moved into a regular column called 'metro'.
    table = pd.DataFrame.from_dict(scores, orient='index', columns=[value_col])
    table = table.reset_index()
    table = table.rename(columns={'index': 'metro'})

    # Highest score first, with a fresh 0..n-1 index so that the rank is
    # simply the row position plus one.
    table = table.sort_values(by=value_col, ascending=False)
    table = table.reset_index(drop=True)
    table[rank_col] = table.index + 1

    return table

In [6]:
# Column-name prefixes, in the same order as the graphs in `gender_graphs`.
filenames = ['male', 'female']

# One PageRank table per sex.
pr_dfs = [pagerank(f, g) for f, g in zip(filenames, gender_graphs)]

# Fold the tables into a single frame with an outer join on the metro name,
# so a metro present in only one table is still kept.
pr_merged = reduce(
    lambda left, right: left.merge(right, on=['metro'], how='outer'),
    pr_dfs,
)
print(pr_merged)

                                                 metro  male_value  male_rank  \
0     New York-Newark-Jersey City, NY-NJ-PA Metro Area    0.024483          1   
1           Dallas-Fort Worth-Arlington, TX Metro Area    0.023305          2   
2    Washington-Arlington-Alexandria, DC-VA-MD-WV M...    0.022496          3   
3        Los Angeles-Long Beach-Anaheim, CA Metro Area    0.022256          4   
4         Atlanta-Sandy Springs-Roswell, GA Metro Area    0.020356          5   
5      Houston-The Woodlands-Sugar Land, TX Metro Area    0.019893          6   
6        Chicago-Naperville-Elgin, IL-IN-WI Metro Area    0.019321          7   
7               Phoenix-Mesa-Scottsdale, AZ Metro Area    0.016748          8   
8      Riverside-San Bernardino-Ontario, CA Metro Area    0.015484          9   
9               Seattle-Tacoma-Bellevue, WA Metro Area    0.013688         10   
10   Miami-Fort Lauderdale-West Palm Beach, FL Metr...    0.013667         11   
11   Philadelphia-Camden-Wil

In [7]:
# Write the merged PageRank table to disk, leaving out the row index.
pr_merged.to_csv(path_or_buf='../results/gender_pagerank.csv', index=False)