In [1]:
import pandas as pd
import geopandas as gpd

from siuba import *
from utils import *



In [2]:
# Block-level frame; passed to calculate_access_proportion for the
# non-jobs rows when the summary table is built below.
block_population_path = f"{GCS_FILE_PATH}block_population_joined.parquet"
ca_block_joined = gpd.read_parquet(block_population_path)

# Tract-level frame; passed to calculate_access_proportion for the
# jobs row when the summary table is built below.
tract_employment_path = f"{GCS_FILE_PATH}tract_pop_employ_filtered.parquet"
tract_pop_employ_filtered = gpd.read_parquet(tract_employment_path)

In [3]:
# Maps the key used in `sjoin_blocks` to the parquet file stem found
# under GCS_FILE_PATH.
rename_block_files = {
    #"block_all_stops": "block_level_static",
    "block_accessible_stops": "block_level_accessible",
    "block_all_stops_rt": "all_stops_rt",
    "block_accessible_stops_rt": "accessible_stops_trips_rt",
}

sjoin_blocks = {}

for block_key in rename_block_files:
    # Echo the key so progress is visible while each file loads.
    print(block_key)
    block_path = f"{GCS_FILE_PATH}{rename_block_files[block_key]}.parquet"
    sjoin_blocks[block_key] = gpd.read_parquet(block_path)

# block_level_static is deliberately kept as a standalone frame instead of
# being stored in `sjoin_blocks`: per the original note, reading it into
# the dict crashes the kernel.
block_level_static_path = f"{GCS_FILE_PATH}block_level_static.parquet"
block_level_static = gpd.read_parquet(block_level_static_path)

block_accessible_stops
block_all_stops_rt
block_accessible_stops_rt


In [4]:
# Parquet file stems under GCS_FILE_PATH; each stem doubles as the key
# in `sjoin_tracts`.
tract_files = [
    "tract_all_stops",
    "tract_all_stops_rt",
    "tract_accessible_stops",
    "tract_accessible_stops_rt",
]

sjoin_tracts = {}

for tract_name in tract_files:
    # Echo the name so progress is visible while each file loads.
    print(tract_name)
    tract_path = f"{GCS_FILE_PATH}{tract_name}.parquet"
    sjoin_tracts[tract_name] = gpd.read_parquet(tract_path)

tract_all_stops
tract_all_stops_rt
tract_accessible_stops
tract_accessible_stops_rt


In [5]:
# Summary-table row label -> column name handed to
# calculate_access_proportion for that row.
row_metrics = {
    'Population': 'block_pop',
    'Land Area': 'area',
    'Jobs (<4sq km tracts only)': 'num_jobs',
}

# Summary-table column label -> (block-level frame, tract-level frame)
# before de-duplication.
_geography_sources = {
    'GTFS Static': (
        block_level_static,
        sjoin_tracts["tract_all_stops"],
    ),
    'Accessible Static': (
        sjoin_blocks["block_accessible_stops"],
        sjoin_tracts["tract_accessible_stops"],
    ),
    'GTFS RT': (
        sjoin_blocks["block_all_stops_rt"],
        sjoin_tracts["tract_all_stops_rt"],
    ),
    'Accessible RT': (
        sjoin_blocks["block_accessible_stops_rt"],
        sjoin_tracts["tract_accessible_stops_rt"],
    ),
}

# Each value is a two-item list: [0] is the block-level frame with one row
# per 'geo_id', [1] is the tract-level frame with one row per 'Tract'.
col_geographies = {
    label: [
        block_frame.drop_duplicates(subset=['geo_id']),
        tract_frame.drop_duplicates(subset=['Tract']),
    ]
    for label, (block_frame, tract_frame) in _geography_sources.items()
}

In [6]:
# Build the summary table: one row per entry in `row_metrics`, one column
# per entry in `col_geographies`.
#
# Rows are collected as plain dicts and turned into a DataFrame once at the
# end. The previous `summary_df.append(...)` pattern relied on
# DataFrame.append, which was deprecated in pandas 1.4 and removed in 2.0.
summary_rows = []

for row, metric_col in row_metrics.items():
    # The jobs row uses the tract-level frames (index 1 of each
    # col_geographies entry) against the tract frame; every other row uses
    # the block-level frames (index 0) against the block frame.
    if row == 'Jobs (<4sq km tracts only)':
        geo_index, comparison_frame = 1, tract_pop_employ_filtered
    else:
        geo_index, comparison_frame = 0, ca_block_joined

    summary_rows.append({
        col: calculate_access_proportion(
            geographies[geo_index], comparison_frame, metric_col)
        for col, geographies in col_geographies.items()
    })

# An explicit index keeps the row labels in `row_metrics` order.
summary_df = pd.DataFrame(summary_rows, index=list(row_metrics))

## Summary of all Metrics

* Population metrics: percent of CA population within a block group near a qualifying transit stop, excluding block groups > 4 sq km
* Land Area metrics: percent of CA land area made up of block groups near a qualifying transit stop, excluding block groups > 4 sq km
* Employment metrics: percent of CA jobs in a census tract near a qualifying transit stop, excluding tracts > 4 sq km
    * this currently means the analysis only looks at about 60% of CA jobs; it could be made more precise by pulling finer-grained employment data if desired
    * this restriction likely makes transit look "better" than an analysis of all jobs would, since these urban/suburban tracts are probably more likely to have at least some transit service

In [7]:
# Render the finished summary table as this cell's output.
summary_df

Unnamed: 0,GTFS Static,Accessible Static,GTFS RT,Accessible RT
Population,85.95,10.02,59.65,3.67
Land Area,11.58,0.93,5.33,0.42
Jobs (<4sq km tracts only),99.32,18.61,82.63,11.61
