In [24]:
import pandas as pd
import geopandas as gpd

import shared_utils
from utils import *
import intake
import gcsfs

import calitp
from calitp.tables import tbl
from siuba import *

from ipyleaflet import Map, GeoJSON, projections, basemaps, GeoData, LayersControl, WidgetControl, GeoJSON, LegendControl
from ipywidgets import Text, HTML

In [25]:
def calculate_access_proportion(num_df, denom_df, col):
    proportion = num_df[col].sum() / denom_df[col].sum()
    percentage = (proportion * 100).round(2)
    return percentage

In [26]:
bus_route_types = ['3', '11']

def buffer_by_route_type(row):
    '''
    Buffer bus stops by 800 meters (.5mi),
    rail/ferry by 1600 meters (1mi)
    '''
    if row.route_type in bus_route_types:
        row.geometry = row.geometry.buffer(800)
    else:
        row.geometry = row.geometry.buffer(1600)
    return row

In [27]:
catalog = intake.open_catalog('./catalog.yml')

In [28]:
# shoreline_proj = catalog.stanford_shorelines.read().to_crs(shared_utils.geography_utils.CA_NAD83Albers)

In [29]:
ca_blocks = gpd.read_parquet(f'{GCS_FILE_PATH}2020_tiger_block_geo.parquet')
ca_blocks = (ca_blocks >> filter(_.ALAND20 > 10) ## remove water
             >> select(_.county == _.COUNTYFP20, _.tract == _.TRACTCE20, _.block == _.BLOCKCE20,
                               _.geo_id == _.GEOID20, _.geometry))
ca_blocks = ca_blocks.to_crs(shared_utils.geography_utils.CA_NAD83Albers)

In [30]:
ca_blocks >> head(3)

Unnamed: 0,county,tract,block,geo_id,geometry
0,17,30706,3012,60170307063012,"POLYGON ((-93795.217 73341.696, -93736.960 734..."
1,17,30601,3006,60170306013006,"POLYGON ((-88050.178 97425.705, -88049.410 974..."
2,65,42628,4008,60650426284008,"POLYGON ((257831.965 -463783.472, 257867.237 -..."


In [31]:
ca_block_pop = catalog.ca_block_population.read()

In [32]:
ca_block_pop['GEO_ID'] = ca_block_pop.GEO_ID.apply(lambda x: x.split('US')[1])
ca_block_pop = ca_block_pop >> select(_.geo_id == _.GEO_ID, _.block_pop == _.P1_001N)

In [33]:
ca_block_pop >> arrange(-_.block_pop) >> head(3)

Unnamed: 0,geo_id,block_pop
134075,60372653011006,8727
301621,60659810001003,5972
365951,60730094001036,5917


In [34]:
ca_block_joined = ca_blocks >> inner_join(_, ca_block_pop, on='geo_id')

In [35]:
ca_block_joined >> head(3)

Unnamed: 0,county,tract,block,geo_id,geometry,block_pop
0,17,30706,3012,60170307063012,"POLYGON ((-93795.217 73341.696, -93736.960 734...",363
1,17,30601,3006,60170306013006,"POLYGON ((-88050.178 97425.705, -88049.410 974...",64
2,65,42628,4008,60650426284008,"POLYGON ((257831.965 -463783.472, 257867.237 -...",103


In [36]:
# sf = ca_block_joined >> filter(_.county == '075')

In [37]:
# simple_map(sf, 'block')

In [38]:
ca_block_joined = ca_block_joined.astype({'county':'int64', 'tract':'int64', 'block':'int64', 'geo_id':'int64'})
ca_block_joined['area'] = ca_block_joined.geometry.area
ca_block_joined['geometry'] = ca_block_joined.geometry.simplify(tolerance=100)

In [39]:
all_stops = get_stops_and_trips(filter_accessible = False)
all_stops = all_stops.apply(buffer_by_route_type, axis=1)

In [40]:
accessible_stops_trips = get_stops_and_trips(filter_accessible = True)
accessible_stops_trips = accessible_stops_trips.apply(buffer_by_route_type, axis=1)

## Adding RT Availability

In [41]:
import calitp.magics

In [42]:
%%capture
%%sql -o feed_extract_date

SELECT
    *,
    PARSE_DATE(
      '%Y-%m-%d',
      REGEXP_EXTRACT(_FILE_NAME, ".*/([0-9]+-[0-9]+-[0-9]+)")
    ) AS extract_date
FROM gtfs_schedule_history.calitp_feeds_raw

In [43]:
latest = feed_extract_date >> filter(_.extract_date == _.extract_date.max())

In [44]:
rt_complete = latest >> filter(-_.gtfs_rt_vehicle_positions_url.isna(),
                              -_.gtfs_rt_service_alerts_url.isna(),
                              -_.gtfs_rt_trip_updates_url.isna())

In [45]:
rt_complete = (rt_complete >> select(_.calitp_itp_id == _.itp_id, _.calitp_url_number == _.url_number))

In [46]:
rt_complete.head(3)

Unnamed: 0,calitp_itp_id,calitp_url_number
1009,183,0
1016,200,0
1534,282,1


# Accessible Transit Metrics

### New Accessibilty Metric (Area)

The % of non-water area of California that is within 1/2mi of a bus stop or 1 mi of a ferry/rail stop that is explicitly wheelchair accessible (and if in a station, that station has explicit pathways coding), and that has is served by a public-funded, open to the general public transit service with GTFS Schedule data that is served by a service that is explicitly wheelchair accessible

### New Accessibility Metric (Population)

 The % of Californians that are within 1/2 mi of a bus stop or 1 mi of a ferry/rail stop that is explicitly wheelchair accessible (and if in a station, that station has explicit pathways coding), and that has is served by a public-funded, open to the general public transit service with GTFS Schedule data that is served by a service that is explicitly wheelchair accessible

### Notes and Caveats:

* The vast majority of accessible services don't appear to provide accessibility data in GTFS
* Pathways appears to be empty in data warehouse for now...

## Block Level Metrics

### Accessible Static

In [47]:
block_level_accessible = (ca_block_joined
                          .sjoin(accessible_stops_trips, how='inner', predicate='intersects')
                        .drop_duplicates(subset=['geo_id'])) ##important at block level to avoid double counts

In [48]:
calculate_access_proportion(block_level_accessible, ca_block_joined, 'block_pop')

10.02

In [49]:
calculate_access_proportion(block_level_accessible, ca_block_joined, 'area')

0.93

In [50]:
# simple_map(accessible_stops_trips, 'calitp_itp_id')

### All Static

#### New General Metric (by area):

The % of non-water area of Californian that is within 1/2 mi of a bus stop or 1 mi of a ferry/rail stop that has is served by a public-funded, open to the general public transit service with GTFS Schedule data


#### New General Metric (by population):

The % of Californians that live within 1/2 mi of a bus stop or 1 mi of a ferry/rail stop that has is served by a public-funded, open to the general public transit service with GTFS Schedule data


In [51]:
block_level_static = (ca_block_joined
                          .sjoin(all_stops, how='inner', predicate='intersects')
                        # .drop_duplicates(subset=['geo_id'])
                     ) ## not dropping enables correct feed aggregations...

In [52]:
calculate_access_proportion(block_level_static.drop_duplicates(subset=['geo_id']),
                            ca_block_joined, 'block_pop')

85.95

In [53]:
calculate_access_proportion(block_level_static.drop_duplicates(subset=['geo_id']),
                            ca_block_joined, 'area')

11.58

### All RT

In [54]:
all_stops_rt = block_level_static >> inner_join(_, rt_complete, on =['calitp_itp_id', 'calitp_url_number'])

In [55]:
calculate_access_proportion(all_stops_rt.drop_duplicates(subset=['geo_id']),
                            ca_block_joined, 'block_pop')

59.65

In [56]:
calculate_access_proportion(all_stops_rt.drop_duplicates(subset=['geo_id']),
                            ca_block_joined, 'area')

5.33

### Accessible RT

In [57]:
accessible_stops_trips_rt = block_level_accessible >> inner_join(_, rt_complete, on =['calitp_itp_id', 'calitp_url_number'])

In [58]:
calculate_access_proportion(accessible_stops_trips_rt.drop_duplicates(subset=['geo_id']),
                            ca_block_joined, 'block_pop')

3.67

In [59]:
calculate_access_proportion(accessible_stops_trips_rt.drop_duplicates(subset=['geo_id']),
                            ca_block_joined, 'area')

0.42

### RT Milestone (Partial)

In [60]:
milestone_ids = [13, 121, 323, 228, 325, 56, 93, 254, 251, 81, 122, 76, 199,
 312, 344, 372, 280, 327, 112, 17, 118, 289, 120, 33, 356,
 251, 351, 232, 6, 14, 271, 296, 389, 256]

In [65]:
milestone_ids = [mid for mid in milestone_ids if mid not in all_stops_rt.calitp_itp_id.unique()]

### Export for Mapping

In [52]:
accessible_stops_trips_rt >> head(3)

Unnamed: 0,county,tract,block,geo_id,geometry,block_pop,area,index_right,stop_id,route_type,stop_lon,stop_lat,calitp_itp_id,calitp_url_number,wheelchair_boarding,wheelchair_accessible
0,113,10610,1013,61130106101013,"POLYGON ((-150269.150 60281.940, -149620.149 6...",0,86977.6478,387,DAV,2,-121.737758,38.543494,4,1,1,1
1,113,10701,4024,61130107014024,"POLYGON ((-151430.154 60140.398, -151380.346 5...",0,7134.524478,387,DAV,2,-121.737758,38.543494,4,1,1,1
2,113,10610,2012,61130106102012,"POLYGON ((-150172.028 60949.016, -150107.449 6...",58,16243.747689,387,DAV,2,-121.737758,38.543494,4,1,1,1


In [67]:
rt_acc_status = accessible_stops_trips_rt.drop_duplicates(subset=['geo_id', 'calitp_itp_id']).copy()
rt_acc_status  = rt_acc_status >> select(_.geo_id, _.calitp_itp_id) >> mutate(rt_accessible = True)

rt_status = all_stops_rt.drop_duplicates(subset=['geo_id', 'calitp_itp_id']).copy()
rt_status  = rt_status >> select(_.geo_id, _.calitp_itp_id) >> mutate(rt = True)

acc_status = block_level_accessible.drop_duplicates(subset=['geo_id', 'calitp_itp_id']).copy()
acc_status  = acc_status >> select(_.geo_id, _.calitp_itp_id) >> mutate(accessible = True)

static_status = block_level_static.drop_duplicates(subset=['geo_id', 'calitp_itp_id']).copy()
static_status  = static_status >> select(_.geo_id, _.calitp_itp_id) >> mutate(static = True)

rt_milestones = (block_level_static.copy()
                 >> filter(_.calitp_itp_id.isin(milestone_ids))
                ).drop_duplicates(subset=['geo_id', 'calitp_itp_id'])
rt_milestones = rt_milestones >> select(_.geo_id, _.calitp_itp_id) >> mutate(rt_milestone = True)

In [68]:
all_df = pd.concat([rt_acc_status, rt_status, acc_status, static_status, rt_milestones])

In [69]:
all_df = all_df.fillna(False)
all_df = all_df.groupby(['geo_id', 'calitp_itp_id']).any().reset_index()

In [70]:
for_mapping = (ca_block_joined.astype({'geo_id':'int64'})
 >> inner_join(_, all_df, on = ['geo_id'])
 # >> left_join(_, rt_status, on = ['geo_id', 'calitp_itp_id'])
 # >> left_join(_, acc_status, on = ['geo_id', 'calitp_itp_id'])
 # >> left_join(_, static_status, on = ['geo_id', 'calitp_itp_id'])
)
for_mapping['calitp_itp_id'] = for_mapping.calitp_itp_id.fillna(0).astype('int64')
# for_mapping = for_mapping.fillna(False)

In [71]:
for_mapping.head(3)

Unnamed: 0,county,tract,block,geo_id,geometry,block_pop,area,calitp_itp_id,rt_accessible,rt,accessible,static,rt_milestone
0,65,42628,4008,60650426284008,"POLYGON ((257831.965 -463783.472, 258008.040 -...",103,15986.958897,269,False,True,False,True,False
1,65,42711,3025,60650427113025,"POLYGON ((259577.302 -476173.973, 259651.066 -...",42,29609.218158,269,False,True,False,True,False
2,65,43401,1023,60650434011023,"POLYGON ((281412.631 -469351.296, 281613.718 -...",21,19971.850439,13,False,False,False,True,True


In [72]:
shared_utils.utils.geoparquet_gcs_export(for_mapping, GCS_FILE_PATH, 'block_accessibility_metrics')


This metadata specification does not yet make stability promises.  We do not yet recommend using this in a production setting unless you are able to rewrite your Parquet/Feather files.



## Employment Metrics (Tract Level)

In [42]:
service_path = 'gs://calitp-analytics-data/data-analyses/bus_service_increase/'

In [43]:
## Read in processed df from bus_service_increase/B1
tract_pop_employ = gpd.read_parquet(f'{service_path}bus_stop_times_by_tract.parquet')
tract_pop_employ = tract_pop_employ >> select(-_.num_arrivals, -_.stop_id, -_.itp_id)

In [44]:
tract_pop_employ.head(3)

Unnamed: 0,Tract,ZIP,Population,sq_mi,pop_sq_mi,overall_ptile,pollution_ptile,popchar_ptile,equity_group,pollution_group,popchar_group,County,City,geometry,num_jobs,jobs_sq_mi,num_pop_jobs,popjobs_sq_mi
0,6001400100,94704,3120,2.655917,1174.735672,2.79879,26.621033,1.525466,1.0,1,1.0,Alameda,Oakland,"POLYGON ((-122.24408 37.88322, -122.24198 37.8...",936,352.420702,4056,1527.156374
1,6001400200,94618,2007,0.229901,8729.842564,2.874433,24.181705,1.651538,1.0,1,1.0,Alameda,Oakland,"POLYGON ((-122.24191 37.85181, -122.24202 37.8...",1357,5902.539292,3364,14632.381857
2,6001400300,94618,5051,0.427356,11819.185546,15.935451,33.366521,12.266768,1.0,2,1.0,Alameda,Oakland,"POLYGON ((-122.24590 37.84500, -122.25241 37.8...",1978,4628.459515,7029,16447.645061


In [45]:
tract_pop_employ = tract_pop_employ.to_crs(
                        shared_utils.geography_utils.CA_NAD83Albers)
tract_pop_employ['area'] = tract_pop_employ.geometry.area

In [46]:
## option to filter out large tracts (not useful access data...)
## 4 sq km threshold
tract_pop_employ['under_4_sq_km'] = tract_pop_employ.area < 4e+06

In [47]:
job_density = tract_pop_employ >> group_by('under_4_sq_km') >> summarize(jobs = _.num_jobs.sum())
job_density

Unnamed: 0,under_4_sq_km,jobs
0,False,6770495
1,True,10386412


In [48]:
## about 60% of CA jobs are in tracts under 4 sq km (scope for this analysis)
(job_density >> filter(_.under_4_sq_km) >> select(_.jobs)).sum() / (job_density >> select(_.jobs)).sum()

jobs    0.605378
dtype: float64

In [49]:
## filter out large tracts
tract_pop_employ_filtered = tract_pop_employ >> filter(_.under_4_sq_km)

In [50]:
all_employment_joined = (tract_pop_employ_filtered
                    .sjoin(all_stops, how='inner', predicate='intersects')
                    # .drop_duplicates(subset=['Tract'])
                   ) >> select(-_.index_right, -_.index_left)

In [51]:
accessible_employment_joined = (tract_pop_employ_filtered
                    .sjoin(accessible_stops_trips, how='inner', predicate='intersects')
                    # .drop_duplicates(subset=['Tract'])
                   ) >> select(-_.index_right, -_.index_left)

In [52]:
## all stops employment access, jobs
calculate_access_proportion(all_employment_joined.drop_duplicates(subset=['Tract']), tract_pop_employ_filtered, 'num_jobs')

99.32

In [53]:
## accessible stops employment access, jobs
calculate_access_proportion(accessible_employment_joined.drop_duplicates(subset=['Tract']), tract_pop_employ_filtered, 'num_jobs')

18.61

In [54]:
accessible_stops_trips_rt = accessible_stops_trips_rt >> select(-_.index_right, -_.index_left)

In [55]:
acc_rt_employ = (tract_pop_employ_filtered
                    .sjoin(accessible_stops_trips_rt, how='inner', predicate='intersects')
                    .drop_duplicates(subset=['Tract'])
                   )

In [56]:
## accessible with RT stops employment access, population
calculate_access_proportion(acc_rt_employ, tract_pop_employ_filtered, 'num_jobs')

11.79

In [57]:
all_stops_rt = all_stops_rt >> select(-_.index_right, -_.index_left)

In [None]:
all_rt_employ = (tract_pop_employ_filtered
                    .sjoin(all_stops_rt, how='inner', predicate='intersects')
                    .drop_duplicates(subset=['Tract'])
                   )

In [None]:
## all RT stops employment access, population
calculate_access_proportion(all_rt_employ, tract_pop_employ_filtered, 'num_jobs')

### Summarizing Coverage

In [None]:
## since employment data is tract-level, only includes tracts < 4 sq km (~60% of jobs)
employment_summary = (all_employment_joined
 >> group_by(_.calitp_itp_id)
 >> distinct(_.Tract, _keep_all=True) ##geo_id must be distinct per feed...
 >> summarize(total_jobs = _.num_jobs.sum())
)

In [None]:
SQ_MI_PER_SQ_M = 3.86e-7

coverage_summary = (block_level_static
 >> group_by(_.calitp_itp_id)
 >> distinct(_.geo_id, _keep_all=True) ##geo_id must be distinct per feed...
 >> summarize(total_sq_mi = (_.area.sum() * SQ_MI_PER_SQ_M).astype('int64'),
             total_pop = _.block_pop.sum(),
             ))

In [None]:
coverage_summary = coverage_summary >> inner_join(_, employment_summary, on ='calitp_itp_id')

In [None]:
coverage_summary['any_accessible_stops_trips'] = coverage_summary.apply(lambda x: x.calitp_itp_id in (
                             block_level_accessible.calitp_itp_id.unique()), axis=1)
coverage_summary['any_rt'] = coverage_summary.apply(lambda x: x.calitp_itp_id in (
                             rt_complete.calitp_itp_id.unique()), axis=1)

In [None]:
with_name = coverage_summary >> inner_join(_, tbl.gtfs_schedule.agency() >> distinct(_.agency_name, _.calitp_itp_id) >> collect(), on = 'calitp_itp_id')

In [None]:
# with_name = with_name >> filter(_.calitp_itp_id.apply(lamb) != 1 0, _.calitp_itp_id != 8,
#                                 _.calitp_itp_id != 2, _.calitp_itp_id != 3) ## see issue 988
coverage_summary = coverage_summary >> filter(_.calitp_itp_id.apply(lambda x: x not in [0,1,2,3,8,6])) ## see issue 988

In [None]:
coverage_summary.to_csv('draft_feed_metrics.csv')

In [None]:
row_metrics = {'Population': 'block_pop', 'Land Area': 'area', 'Jobs (<4sq km tracts only)': 'num_jobs'}
col_geographies = {'GTFS Static': [block_level_static.drop_duplicates(subset=['geo_id']),
                                   all_employment_joined.drop_duplicates(subset=['Tract'])],
                   'Accessible Static': [block_level_accessible.drop_duplicates(subset=['geo_id']),
                                         accessible_employment_joined.drop_duplicates(subset=['Tract'])],
                   'GTFS RT': [all_stops_rt.drop_duplicates(subset=['geo_id']), all_rt_employ],
                   'Accessible RT': [accessible_stops_trips_rt.drop_duplicates(subset=['geo_id']),
                                     acc_rt_employ.drop_duplicates(subset=['Tract'])]}

In [None]:
summary_df = pd.DataFrame()

for row in row_metrics.keys():
    to_append = {}
    if row == 'Jobs (<4sq km tracts only)':
        for col in col_geographies.keys():
            metric = calculate_access_proportion(col_geographies[col][1], tract_pop_employ_filtered, row_metrics[row])
            to_append[col] = metric
    else:
        for col in col_geographies.keys():
            metric = calculate_access_proportion(col_geographies[col][0], ca_block_joined, row_metrics[row])
            to_append[col] = metric
    to_append = pd.DataFrame(to_append, index = [row])
    summary_df = summary_df.append(to_append)    

## Summary of all Metrics

* Population metrics: percent of CA population within a block group near a qualifying transit stop, excluding block groups > 4 sq km
* Land Area metrics: percent of CA land area made up of block groups near a qualifying transit stop, excluding block groups > 4 sq km
* Employment metrics: percent of CA jobs in a census tract near a qualifying transit stop, excluding tracts > 4 sq km
    * this currently means the analysis only looks at about 60% of CA jobs, it could be made more precise by pulling finer-grained employment data if desired
    * likely makes transit look "better" than a job analysis of all jobs, since these urban/suburban tracts are probably more likely to have at least some transit service

In [None]:
summary_df