In [19]:
import pandas as pd
import geopandas as gpd
import numpy as np
import networkx as nx

Load dataframes with zip code data for each city

In [35]:
# NYC zip code data, read with geopandas from the zipped dataset under data/
df_nyc = gpd.read_file('data/nyc zip codes with data.zip')

In [36]:
# Atlanta zip code data, read with geopandas from the zipped dataset under data/
df_atl = gpd.read_file('data/atl zip codes with data.zip')

Create GeoJSON files with the polling place locations for NYC and Atlanta

In [15]:
# Master table of polling places. create_pollplace_shapefile below reads this as a
# module-level global and uses its 'address.state', 'longitude' and 'latitude' columns.
df_poll = pd.read_csv('data/polling_pk_master_post.csv')

def create_pollplace_shapefile(city_df, city_name, state_abbv):
    '''
    Write the polling places that lie inside a city to a GeoJSON file.

    Input:
    city_df is the geopandas dataframe holding the city's zip code geometries (e.g. df_nyc),
    city_name is a string used as the prefix of the output file name (e.g. 'nyc'),
    state_abbv is the state abbreviation matched against the 'address.state' column of df_poll (e.g. 'NY')

    Output: nothing is returned. The polls found inside the city are saved to
    '<city_name> polling locations.geojson'.
    '''
    # Narrow the master poll table (module-level df_poll) down to the requested state
    in_state = df_poll[df_poll['address.state'] == state_abbv]
    poll_points = gpd.points_from_xy(in_state.longitude, in_state.latitude)
    state_polls = gpd.GeoDataFrame(in_state, geometry=poll_points)

    # A poll is kept when at least one geometry of city_df contains its point
    inside_city = [city_df.contains(point).any() for point in state_polls['geometry']]
    city_polls = state_polls.iloc[inside_city]

    out_name = city_name + ' polling locations.geojson'
    city_polls.to_file(out_name, driver='GeoJSON')

In [16]:
# Writes 'nyc polling locations.geojson' with the NY polls located inside df_nyc
create_pollplace_shapefile(df_nyc, 'nyc', 'NY')

In [17]:
# Writes 'atl polling locations.geojson' with the GA polls located inside df_atl
create_pollplace_shapefile(df_atl, 'atl', 'GA')

Calculate t_car matrix

In [16]:
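# NOTE: the functions below are untested because OSMnx is not working yet.
# They reference `ox`, which is never imported in this notebook: add `import osmnx as ox`
# to the import cell before running them.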

def calc_shortest_time(G, a, b):
    '''
    Estimate the travel time between two points over the road network.

    Input:
    G is digraph representing the road network; its edges must already carry a 'travel_time' attribute.
    a, b are Points (see https://shapely.readthedocs.io/en/stable/manual.html#points). a.x / a.y are
    passed to nearest_nodes as X / Y, so x is expected to hold the longitude and y the latitude.

    Returns: the length of the shortest a -> b path weighted by 'travel_time', measured between the
    network nodes nearest to a and nearest to b.
    '''
    # Snap both endpoints onto the graph (a first, then b)
    origin, destination = (ox.distance.nearest_nodes(G, p.x, p.y) for p in (a, b))
    return nx.shortest_path_length(G, origin, destination, weight = 'travel_time')

def calc_tcar_matrix(G, polls, fname = None):
    '''
    Build the matrix of estimated car travel times between every ordered pair of polling places.

    Input:
    G is nx.Digraph representing road network, with edge travel times already added as an attribute.
    polls is geopandas dataframe with a 'geometry' column that stores the polling locations as Points.
        Rows are addressed by position, so the index of polls may hold any labels.
    fname (optional) is string for file name to save to.

    Returns: Numpy array t_car of shape (N, N) s.t. t_car[i, j] = estimate of travel time by car from
    the ith poll to the jth poll, where i and j are row positions in polls. The diagonal is 0 and
    t_car is not symmetric in general.
    If fname is given, saves t_car as .npy file inside "Distance_Marix_Files/" (the directory is not
    created here and has to exist already).
    '''
    # Work on the plain list of points: positions 0..N-1 are the matrix indices.
    # (The index labels yielded by iterrows() are not positions once polls has been filtered.)
    locations = list(polls['geometry'])
    N = len(locations)
    t_car = np.zeros((N, N))
    for i, a in enumerate(locations):
        for j, b in enumerate(locations):
            if i != j:
                t_car[i, j] = calc_shortest_time(G, a, b)
    if fname is not None:
        # np.save expects the target file first and the array second
        np.save("Distance_Marix_Files/" + fname, t_car)
    return t_car

Calculate t_pub matrix

In [57]:
# NOT TESTED. WE NEED TO STANDARDIZE THE NAMES OF OUR COLUMNS FIRST

def calc_tpub_matrix(t_car, city_df, polls, sigma):
    '''
    Build the matrix of estimated public-transportation travel times between polling places.

    Input:
    Numpy array t_car calculated as above; t_car[i, j] is read for every pair of row positions i != j of polls.
    dataframe city_df with columns 'zipcode' (name of zipcode), 'n_stops' (number of public trans locations in the zipcode), 'stop_per_h' (total number of stops/hour in the zipcode, across all pub trans locs), 'area' (geographic area).
    dataframe polls with column 'address.zip' that gives zip code of poll. Rows are addressed by
        position, so the index of polls may hold any labels.
    sigma (float)- the sigma variable in our report that accounts for slower travel speeds on pub trans vs. by car

    Returns: Numpy array t_pub of shape (N, N) s.t. t_pub[i, j] = estimate of travel time by public
    transportation from ith poll to jth poll (see the overleaf for formula):
        t_pub[i, j] = sigma*t_car[i, j] + gamma_walk[zip_i] + gamma_wait[zip_i] + gamma_walk[zip_j]
    The diagonal is left at 0.

    Raises: KeyError if the zip code of a poll (for N > 1) does not appear in city_df['zipcode'].
    '''
    # NOTE(review): the constants 30 / 20 / 40 below carry the units of the gamma terms.
    # Confirm that they match the unit of t_car before trusting the sum.
    gamma_wait = {}
    gamma_walk = {}
    for zipcode, n_stops, stop_per_h, area in zip(
            city_df['zipcode'], city_df['n_stops'], city_df['stop_per_h'], city_df['area']):
        # Waiting term: 30 divided by the stops/hour per transit location
        if stop_per_h * n_stops != 0:
            gamma_wait[zipcode] = 30 / (stop_per_h / n_stops)
        else:
            # Default for a zip code with no service. TODO: an earlier version
            # (Michael's code) may have used 0 here; confirm which is intended.
            gamma_wait[zipcode] = 30

        # Walking term: grows with the square root of the area served per transit location
        if n_stops != 0:
            gamma_walk[zipcode] = 20 * np.sqrt(area / n_stops)
        else:
            # Default for a zip code without any stop (same default as Michael's code)
            gamma_walk[zipcode] = 40 * np.sqrt(area)

    # Positions 0..N-1 are the matrix indices; the index labels yielded by iterrows()
    # are not positions once polls has been filtered.
    poll_zips = list(polls['address.zip'])
    N = len(poll_zips)
    t_pub = np.zeros((N, N))
    for i, zip_a in enumerate(poll_zips):
        for j, zip_b in enumerate(poll_zips):
            if i != j:
                t_pub[i, j] = sigma*t_car[i, j] + gamma_walk[zip_a] + gamma_wait[zip_a] + gamma_walk[zip_b]
    return t_pub