In [1]:
import pandas as pd
import geopandas as gpd

import shared_utils
from utils import *
import intake
import gcsfs

import calitp
from calitp.tables import tbl
from siuba import *

from ipyleaflet import Map, GeoJSON, projections, basemaps, GeoData, LayersControl, WidgetControl, GeoJSON, LegendControl
from ipywidgets import Text, HTML



In [2]:
# Open the intake data catalog that sits next to this notebook (relative path).
catalog = intake.open_catalog('./catalog.yml')

# Accessible Transit Metrics

### New Accessibility Metric (Area)

The % of non-water area of California that is within 1/4 mi of a bus stop or 1 mi of a ferry/rail stop that is explicitly wheelchair accessible (and, if in a station, that station has explicit pathways coding), and that is served by a publicly funded transit service, open to the general public, that has GTFS Schedule data and is explicitly wheelchair accessible

### New Accessibility Metric (Population)

The % of Californians that are within 1/4 mi of a bus stop or 1 mi of a ferry/rail stop that is explicitly wheelchair accessible (and, if in a station, that station has explicit pathways coding), and that is served by a publicly funded transit service, open to the general public, that has GTFS Schedule data and is explicitly wheelchair accessible

### Notes and Caveats:

* The vast majority of accessible services don't appear to provide accessibility data in GTFS
* Pathways appears to be empty in data warehouse for now...

In [3]:
# ACS variable ID; in this notebook it is only referenced by the commented-out Census API fetch.
total_pop_var = 'B01001_001E'

In [4]:
## can hit Census API limits with frequent runs?

# blockgrp_pop = get_census_ca_counties(total_pop_var, 'block%20group')
# blockgrp_pop = blockgrp_pop.rename(columns={'B01001_001E': 'block_grp_pop',
#                                            'block group': 'block_grp'})

In [5]:
## read pre-fetched 2019 ACS data from catalog
## (used instead of the commented-out Census API call, which can hit rate limits)
blockgrp_pop = (catalog.ca_bg_population.read()
                # space-free column names that are easier to reference
                .rename(columns={'block group': 'block_grp',
                                'population': 'block_grp_pop'})
                # integer identifiers, so they can be matched against the
                # integer-cast FIPS columns of the block group geography
                .astype({'block_grp_pop': 'int64', 'state': 'int64',
                        'county': 'int64', 'tract': 'int64',
                        'block_grp': 'int64'})
               )

In [6]:
# blockgrp_pop.head(3)

In [7]:
# Stops/trips restricted to accessible ones via the helper's filter_accessible flag.
# NOTE(review): get_stops_and_trips is defined outside this notebook (presumably in utils);
# the exact accessibility criteria and the CRS of the returned geometry are not visible here.
accessible_stops_trips = get_stops_and_trips(filter_accessible = True)



In [8]:
# accessible_stops_trips.head(3)

In [9]:
# GTFS route_type codes treated as "bus" for buffering purposes, stored as strings.
bus_route_types = ['3', '11']

def buffer_by_route_type(row, bus_buffer=400, other_buffer=1600):
    '''
    Buffer bus stops by 400 meters (.25mi),
    rail/ferry by 1600 meters (1mi)

    Intended for use with ``DataFrame.apply(..., axis=1)``.

    Parameters
    ----------
    row : object with ``route_type`` and ``geometry`` attributes
        ``geometry`` must expose a ``buffer(distance)`` method. The row is
        modified in place and also returned.
    bus_buffer : number, default 400
        Buffer distance applied when ``route_type`` is one of ``bus_route_types``.
    other_buffer : number, default 1600
        Buffer distance applied to every other route type.

    Returns
    -------
    The same ``row``, with ``geometry`` replaced by its buffer.

    Notes
    -----
    Distances are passed straight to ``geometry.buffer`` and are therefore in
    the units of the geometry's CRS; the defaults assume meters.
    '''
    # route_type may arrive as a string ('3'), an int (3) or a float (3.0)
    # depending on how the table was loaded; normalize to the string form used
    # in bus_route_types so integer-typed bus stops are not silently treated
    # as rail/ferry.
    route_type = row.route_type
    try:
        route_type = str(int(route_type))
    except (TypeError, ValueError):
        # Missing or non-numeric values fall through and get the non-bus buffer.
        route_type = str(route_type).strip()

    distance = bus_buffer if route_type in bus_route_types else other_buffer
    row.geometry = row.geometry.buffer(distance)
    return row

In [10]:
# Replace each stop's geometry with its buffer, one row at a time.
# NOTE(review): buffer distances are in CRS units; this assumes the stops are already in a
# meter-based projection — confirm in get_stops_and_trips.
accessible_stops_trips = accessible_stops_trips.apply(buffer_by_route_type, axis=1)

In [11]:
# ca_block_geo = get_ca_block_group_geo()

In [12]:
# shared_utils.utils.geoparquet_gcs_export(ca_block_geo, GCS_FILE_PATH,
#                                          'ca_block_grp_clipped_prj')

In [13]:
# Block group geometries read from a GeoParquet file under GCS_FILE_PATH
# (GCS_FILE_PATH is not defined in this notebook; presumably it comes from `utils`).
ca_block_geo = gpd.read_parquet(f'{GCS_FILE_PATH}ca_block_grp_clipped_prj.parquet')
# Area in squared CRS units; the 4e+06 threshold applied later treats these as square meters.
ca_block_geo['area'] = ca_block_geo.geometry.area

In [14]:
# Cast the FIPS identifier columns to integers, then rename them to the key
# names used by blockgrp_pop so the two tables can be joined on them.
ca_block_geo = (
    ca_block_geo
    .astype({'STATEFP': 'int64', 'COUNTYFP': 'int64',
             'TRACTCE': 'int64', 'BLKGRPCE': 'int64'})
    .rename(columns={'STATEFP': 'state', 'COUNTYFP': 'county',
                     'TRACTCE': 'tract', 'BLKGRPCE': 'block_grp'})
)

In [15]:
# Attach population to each block group; the inner join keeps only block groups present in both tables.
ca_block_geo = ca_block_geo >> inner_join(_, blockgrp_pop, on = ['state', 'county', 'tract', 'block_grp'])

In [16]:
## drop large block groups (not useful access data...)
## 4 sq km threshold
## (4e+06 assumes areas are in square meters — TODO confirm the CRS of the parquet)
ca_block_geo['under_4_sq_km'] = ca_block_geo.area < 4e+06
## the flag stays on ca_block_geo (the maps use it); the filtered copy is the analysis scope
ca_block_geo_filtered = ca_block_geo >> filter(_.under_4_sq_km)

In [17]:
# Spatial join: one row per (block group, buffered accessible stop) pair whose geometries intersect.
accessible_geo_joined = ca_block_geo_filtered.sjoin(accessible_stops_trips, how='inner', predicate='intersects')

In [18]:
# Collapse to one row per block group; the stop columns that remain belong to the first matching stop only.
accessible_geo_joined = accessible_geo_joined.drop_duplicates(subset=['GEOID'])

In [19]:
accessible_geo_joined.head(3)

Unnamed: 0,state,county,tract,block_grp,GEOID,NAMELSAD,MTFCC,FUNCSTAT,ALAND,AWATER,...,under_4_sq_km,index_right,stop_id,route_type,stop_lon,stop_lat,calitp_itp_id,calitp_url_number,wheelchair_boarding,wheelchair_accessible
0,6,73,10009,1,60730100091,Block Group 1,G5030,S,1759046,528681,...,True,6342,60594,3,-117.0483,32.553833,278,0,1,1
3,6,73,10009,3,60730100093,Block Group 3,G5030,S,707892,0,...,True,6342,60594,3,-117.0483,32.553833,278,0,1,1
4,6,73,10013,1,60730100131,Block Group 1,G5030,S,491924,0,...,True,6342,60594,3,-117.0483,32.553833,278,0,1,1


In [20]:
def calculate_access_proportion(num_df, denom_df, col):
    '''
    Share of a column's total captured by a subset, as a percentage.

    Sums `col` in `num_df` (the covered rows) and in `denom_df` (all rows in
    scope), divides the former by the latter and returns the result times
    100, rounded to two decimal places.
    '''
    covered_total = num_df[col].sum()
    scope_total = denom_df[col].sum()
    share = covered_total / scope_total
    return (share * 100).round(2)

#### Current Percentage of CA population in block groups within .25mi (bus) or 1mi (rail/ferry) of accessible transit

In [21]:
# numerator: block groups intersecting a buffered accessible stop; denominator: all block groups under 4 sq km
calculate_access_proportion(accessible_geo_joined, ca_block_geo_filtered, 'block_grp_pop')

11.47

#### Current Percentage of CA land area in block groups within .25mi (bus) or 1mi (rail/ferry) of accessible transit

In [22]:
# same numerator/denominator frames as the population metric, but summing block group area instead
calculate_access_proportion(accessible_geo_joined, ca_block_geo_filtered, 'area')

10.9

### Map

In [23]:
# simple_map(accessible_geo_joined, 'calitp_itp_id')

#### Static Map (block groups near accessible transit, Bay Area)

![bay area accessibility](accessible.png)

# General Transit Metrics

### New General Metric (by area):

The % of non-water area of California that is within 1/4 mi of a bus stop or 1 mi of a ferry/rail stop that is served by a publicly funded transit service, open to the general public, with GTFS Schedule data


### New General Metric (by population):

The % of Californians that live within 1/4 mi of a bus stop or 1 mi of a ferry/rail stop that is served by a publicly funded transit service, open to the general public, with GTFS Schedule data


In [24]:
# All stops/trips, without the accessibility filter (helper defined outside this notebook).
all_stops = get_stops_and_trips(filter_accessible = False)
# Buffer each stop by route type, row by row (400 for bus route types, 1600 otherwise).
# NOTE(review): assumes the stop geometries are in a meter-based CRS — confirm.
all_stops = all_stops.apply(buffer_by_route_type, axis=1)

In [1]:
# Pair every in-scope block group with the buffered stops it intersects,
# then keep a single row per block group.
all_stops_joined = ca_block_geo_filtered.sjoin(all_stops, how='inner', predicate='intersects')
all_stops_joined = all_stops_joined.drop_duplicates(subset=['GEOID'])

NameError: name 'ca_block_geo_filtered' is not defined

In [None]:
# all_stops_pop_joined = all_stops_joined >> inner_join(_, blockgrp_pop, on=['state', 'county', 'tract', 'block_grp'])

#### Current Percentage of CA population in block groups near transit with GTFS Schedule data

In [None]:
# Denominator is ca_block_geo_filtered (block groups under 4 sq km), the same scope the
# numerator was built from and the one used by the accessible-transit metric and the
# summary table. Using blockgrp_pop here would divide by the population of every block
# group, including the large ones excluded from the analysis.
calculate_access_proportion(all_stops_joined, ca_block_geo_filtered, 'block_grp_pop')

#### Current Percentage of CA land area in block groups near transit with GTFS Schedule data

In [None]:
# share of in-scope block group area that lies in block groups intersecting any buffered stop
calculate_access_proportion(all_stops_joined, ca_block_geo_filtered, 'area')

## Employment Metrics

In [None]:
# GCS folder holding outputs of the bus_service_increase analysis (read below)
service_path = 'gs://calitp-analytics-data/data-analyses/bus_service_increase/'

In [None]:
## Read in processed df from bus_service_increase/B1
tract_pop_employ = gpd.read_parquet(f'{service_path}bus_stop_times_by_tract.parquet')
## exclude these three columns from the tract table (siuba select with `-` drops columns)
tract_pop_employ = tract_pop_employ >> select(-_.num_arrivals, -_.stop_id, -_.itp_id)

In [None]:
tract_pop_employ.head(3)

In [None]:
# Reproject to the CA_NAD83Albers CRS constant from shared_utils before measuring area.
tract_pop_employ = tract_pop_employ.to_crs(
                        shared_utils.geography_utils.CA_NAD83Albers)
# Area in squared CRS units (presumably square meters, given the 4e+06 threshold used next — confirm).
tract_pop_employ['area'] = tract_pop_employ.geometry.area

In [None]:
## option to filter out large tracts (not useful access data...)
## 4 sq km threshold
## (this only flags the tracts; the actual filtering happens in a later cell)
tract_pop_employ['under_4_sq_km'] = tract_pop_employ.area < 4e+06

In [None]:
# Total jobs in tracts under vs. over the 4 sq km threshold (one row per flag value).
job_density = tract_pop_employ >> group_by('under_4_sq_km') >> summarize(jobs = _.num_jobs.sum())
job_density

In [None]:
## about 60% of CA jobs are in tracts under 4 sq km (scope for this analysis)
## computed as: jobs in tracts under the threshold / jobs in all tracts
(job_density >> filter(_.under_4_sq_km) >> select(_.jobs)).sum() / (job_density >> select(_.jobs)).sum()

In [None]:
## filter out large tracts
## (the filtered frame is the denominator for every jobs metric below)
tract_pop_employ_filtered = tract_pop_employ >> filter(_.under_4_sq_km)

In [None]:
# In-scope tracts that intersect any buffered stop, one row per tract.
all_employment_joined = tract_pop_employ_filtered.sjoin(all_stops, how='inner', predicate='intersects')
all_employment_joined = all_employment_joined.drop_duplicates(subset=['Tract'])

In [None]:
# In-scope tracts that intersect a buffered accessible stop, one row per tract.
accessible_employment_joined = tract_pop_employ_filtered.sjoin(accessible_stops_trips, how='inner', predicate='intersects')
accessible_employment_joined = accessible_employment_joined.drop_duplicates(subset=['Tract'])

In [None]:
## all stops employment access, jobs
## (% of jobs in tracts under 4 sq km that are in tracts intersecting any buffered stop)
calculate_access_proportion(all_employment_joined, tract_pop_employ_filtered, 'num_jobs')

In [None]:
## accessible stops employment access, jobs
## (% of jobs in tracts under 4 sq km that are in tracts intersecting a buffered accessible stop)
calculate_access_proportion(accessible_employment_joined, tract_pop_employ_filtered, 'num_jobs')

## Adding RT Availability

In [None]:
import calitp.magics

In [None]:
%%capture
%%sql -o feed_extract_date

-- Select every raw feed row and add extract_date, parsed from the
-- digits-dash-digits-dash-digits segment that follows the last slash
-- matched in the source file path (_FILE_NAME).
SELECT
    *,
    PARSE_DATE(
      '%Y-%m-%d',
      REGEXP_EXTRACT(_FILE_NAME, ".*/([0-9]+-[0-9]+-[0-9]+)")
    ) AS extract_date
FROM gtfs_schedule_history.calitp_feeds_raw
In [None]:
# keep only the rows from the most recent extract date
latest = feed_extract_date >> filter(_.extract_date == _.extract_date.max())

In [None]:
# Feeds counted as "RT complete": none of the three GTFS-RT URL fields is missing.
rt_complete = latest >> filter(
    ~_.gtfs_rt_vehicle_positions_url.isna(),
    ~_.gtfs_rt_service_alerts_url.isna(),
    ~_.gtfs_rt_trip_updates_url.isna(),
)

In [None]:
# Keep only the two feed identifier columns, renamed (siuba `new == old` syntax inside select)
# to the calitp_-prefixed names used as join keys in the following cells.
rt_complete = (rt_complete >> select(_.calitp_itp_id == _.itp_id, _.calitp_url_number == _.url_number))

In [None]:
rt_complete.head(3)

In [None]:
# Block groups whose retained stop belongs to a feed with all three RT URLs.
# NOTE(review): all_stops_joined was already de-duplicated to one stop per block group, so a
# block group served by an RT feed is dropped here if the stop that was kept belongs to a
# non-RT feed; joining before de-duplicating would avoid this — confirm intended behavior.
all_stops_rt = all_stops_joined >> inner_join(_, rt_complete, on =['calitp_itp_id', 'calitp_url_number'])

In [None]:
# Remove the sjoin bookkeeping column so this frame can be used as the right-hand side of
# another sjoin. errors='ignore' keeps the cell idempotent: re-running it after the column
# is already gone no longer raises KeyError.
accessible_geo_joined = accessible_geo_joined.drop(columns=['index_right'], errors='ignore')

In [None]:
# Block groups near accessible stops whose retained stop belongs to a feed with all three RT URLs.
# NOTE(review): despite the name, this frame carries block group geometries (it derives from
# accessible_geo_joined), and the one-stop-per-block-group de-duplication happened before this
# join, so RT coverage may be undercounted — confirm.
accessible_stops_trips_rt = accessible_geo_joined >> inner_join(_, rt_complete, on =['calitp_itp_id', 'calitp_url_number'])

In [None]:
# Remove the sjoin bookkeeping column so this frame can be used as the right-hand side of
# another sjoin. errors='ignore' keeps the cell idempotent on re-run.
all_stops_rt = all_stops_rt.drop(columns=['index_right'], errors='ignore')

In [None]:
## Use the filtered tracts (under 4 sq km) so the numerator covers the same scope as the
## tract_pop_employ_filtered denominator, matching the static jobs metrics. With the
## unfiltered frame, jobs in large tracts were counted on top but not on the bottom.
## NOTE(review): accessible_stops_trips_rt carries block group geometries rather than stop
## buffers, so this matches tracts to nearby served block groups — confirm this is intended.
acc_rt_employ = (tract_pop_employ_filtered
                    .sjoin(accessible_stops_trips_rt, how='inner', predicate='intersects')
                    .drop_duplicates(subset=['Tract'])
                   )

In [None]:
## accessible with RT stops employment access, jobs
## (the column summed is num_jobs, not population)
calculate_access_proportion(acc_rt_employ, tract_pop_employ_filtered, 'num_jobs')

In [None]:
## Use the filtered tracts (under 4 sq km) so the numerator covers the same scope as the
## tract_pop_employ_filtered denominator, matching the static jobs metrics. With the
## unfiltered frame, jobs in large tracts were counted on top but not on the bottom.
## NOTE(review): all_stops_rt carries block group geometries rather than stop buffers, so
## this matches tracts to nearby served block groups — confirm this is intended.
all_rt_employ = (tract_pop_employ_filtered
                    .sjoin(all_stops_rt, how='inner', predicate='intersects')
                    .drop_duplicates(subset=['Tract'])
                   )

In [None]:
## all RT stops employment access, jobs
## (the column summed is num_jobs, not population)
calculate_access_proportion(all_rt_employ, tract_pop_employ_filtered, 'num_jobs')

#### Very few agencies with both RT and accessibility info

In [None]:
# List the distinct feed IDs left after the accessible + RT filters, joined to the warehouse
# agency table (collected into pandas first) to show which agencies they are.
accessible_stops_trips_rt >> distinct(_.calitp_itp_id) >> inner_join(_, tbl.gtfs_schedule.agency() >> collect(), on = 'calitp_itp_id')

In [None]:
# Summary table layout.
# row_metrics: row label -> column summed for that metric.
row_metrics = {'Population': 'block_grp_pop', 'Land Area': 'area', 'Jobs': 'num_jobs'}
# col_geographies: column label -> [block-group-level frame, tract-level frame];
# index 0 is used for Population/Land Area, index 1 for Jobs.
col_geographies = {'GTFS Static': [all_stops_joined, all_employment_joined],
                   'Accessible Static': [accessible_geo_joined, accessible_employment_joined],
                   'GTFS RT': [all_stops_rt, all_rt_employ],
                   'Accessible RT': [accessible_stops_trips_rt, acc_rt_employ]}

In [None]:
# Build the summary table: one row per metric, one column per service category.
# Rows are collected first and the DataFrame is constructed once, because
# DataFrame.append is deprecated and was removed in pandas 2.0.
summary_records = []
for metric_label, metric_col in row_metrics.items():
    # Jobs are measured on census tracts (second frame of each pair, tract denominator);
    # population and land area are measured on block groups (first frame).
    uses_tracts = metric_label == 'Jobs'
    frame_idx = 1 if uses_tracts else 0
    denom_df = tract_pop_employ_filtered if uses_tracts else ca_block_geo_filtered
    summary_records.append({
        geo_label: calculate_access_proportion(frames[frame_idx], denom_df, metric_col)
        for geo_label, frames in col_geographies.items()
    })

summary_df = pd.DataFrame(summary_records,
                          index=list(row_metrics),
                          columns=list(col_geographies))

## Summary of all Metrics

* Population metrics: percent of CA population within a block group near a qualifying transit stop, excluding block groups > 4 sq km
* Land Area metrics: percent of CA land area made up of block groups near a qualifying transit stop, excluding block groups > 4 sq km
* Employment metrics: percent of CA jobs in a census tract near a qualifying transit stop, excluding tracts > 4 sq km
    * this currently means the analysis only looks at about 60% of CA jobs, it could be made more precise by pulling finer-grained employment data if desired
    * likely makes transit look "better" than a job analysis of all jobs, since these urban/suburban tracts are probably more likely to have at least some transit service

In [None]:
summary_df

### Maps and Charts

* maps should show for each metric:
    * covered regions
    * non-covered regions
    * regions outside analysis (large block groups and tracts)

In [None]:
# Base map of the block groups near any stop, dissolved into a single shape first.
# simple_map is defined outside this notebook (presumably in utils); the trailing `;`
# suppresses the cell output.
m = simple_map(all_stops_joined.dissolve());

In [None]:
def make_geo_data(gdf, color, name='data'):
    '''
    Wrap a GeoDataFrame in an ipyleaflet GeoData layer with a fixed style.

    Parameters
    ----------
    gdf : GeoDataFrame
        Shapes to draw; reprojected to EPSG:4326 before being handed to GeoData.
    color : str
        Fill color of the shapes (e.g. a hex string).
    name : str, default 'data'
        Layer name. Give each layer its own name so that layers added to the
        same map can be told apart (e.g. in a layers control).

    Returns
    -------
    GeoData
        A layer ready to be passed to ``Map.add_layer``.
    '''
    # Thin dashed black outline with a translucent fill in the requested color.
    layer_style = {'color': 'black', 'fillColor': color,
                   'opacity': 0.2, 'weight': .5, 'dashArray': '2', 'fillOpacity': 0.3}
    # Shapes turn red while hovered.
    layer_hover_style = {'fillColor': 'red', 'fillOpacity': 0.3}
    return GeoData(geo_dataframe=gdf.to_crs('EPSG:4326'),
                   style=layer_style,
                   hover_style=layer_hover_style,
                   name=name)

In [None]:
## dissolve geos first? (for speed)
## quick function to append as separate layers

In [None]:
## in analysis, not served
unserved = (ca_block_geo
            >> anti_join(_, all_stops_joined, on = 'GEOID')
            >> filter(_.under_4_sq_km))
## out of analysis scope (geo too large) #999999
out = ca_block_geo >> filter(~_.under_4_sq_km)

In [None]:
# in-scope block groups with no stop nearby
m.add_layer(make_geo_data(unserved.dissolve(), '#f1a340'))
# block groups excluded from the analysis (over 4 sq km)
m.add_layer(make_geo_data(out.dissolve(), '#999999'))

### GTFS Schedule only

![GTFS Schedule coverage](img/schedule.jpg)

In [None]:
m

In [None]:
# Base map: block groups near accessible stops, dissolved into one shape.
m_accessible = simple_map(accessible_geo_joined.dissolve());
# NOTE: rebinds `unserved`, which an earlier cell defined relative to all stops.
unserved = ca_block_geo >> anti_join(_, accessible_geo_joined, on = 'GEOID') >> filter(_.under_4_sq_km) ## in analysis, not served
m_accessible.add_layer(make_geo_data(unserved.dissolve(), '#f1a340'))
# `out` (block groups over 4 sq km) comes from an earlier cell.
m_accessible.add_layer(make_geo_data(out.dissolve(), '#999999'))

### Accessible Trips (static)

![accessible GTFS coverage](img/accessible.jpg)

In [None]:
m_accessible

In [None]:
# Base map: block groups near stops of feeds with complete GTFS-RT, dissolved into one shape.
m_rt = simple_map(all_stops_rt.dissolve());
# NOTE: rebinds `unserved` again, this time relative to the RT frame.
unserved = ca_block_geo >> anti_join(_, all_stops_rt, on = 'GEOID') >> filter(_.under_4_sq_km) ## in analysis, not served
m_rt.add_layer(make_geo_data(unserved.dissolve(), '#f1a340'))
# `out` (block groups over 4 sq km) comes from an earlier cell.
m_rt.add_layer(make_geo_data(out.dissolve(), '#999999'))

### Trips by operators with GTFS-RT

![rt operators](img/rt.jpg)

In [None]:
m_rt

In [None]:
# Base map: block groups near accessible stops of feeds with complete GTFS-RT, dissolved into one shape.
m_rt_access = simple_map(accessible_stops_trips_rt.dissolve());
# NOTE: rebinds `unserved` again, this time relative to the accessible + RT frame.
unserved = ca_block_geo >> anti_join(_, accessible_stops_trips_rt, on = 'GEOID') >> filter(_.under_4_sq_km) ## in analysis, not served
m_rt_access.add_layer(make_geo_data(unserved.dissolve(), '#f1a340'))
# `out` (block groups over 4 sq km) comes from an earlier cell.
m_rt_access.add_layer(make_geo_data(out.dissolve(), '#999999'))

### Accessible Trips by operators with GTFS-RT

![accessible trips, RT operator](img/rt_accessible.jpg)

In [None]:
m_rt_access