In [63]:
import pandas as pd
import geopandas as gpd

import shared_utils
from utils import *
import intake
import gcsfs

import calitp
from calitp.tables import tbl
from siuba import *

from ipyleaflet import Map, GeoJSON, projections, basemaps, GeoData, LayersControl, WidgetControl, GeoJSON, LegendControl
from ipywidgets import Text, HTML

In [2]:
def calculate_access_proportion(num_df, denom_df, col):
    proportion = num_df[col].sum() / denom_df[col].sum()
    percentage = (proportion * 100).round(2)
    return percentage

In [3]:
bus_route_types = ['3', '11']

def buffer_by_route_type(row):
    '''
    Buffer bus stops by 400 meters (.25mi),
    rail/ferry by 1600 meters (1mi)
    '''
    if row.route_type in bus_route_types:
        row.geometry = row.geometry.buffer(400)
    else:
        row.geometry = row.geometry.buffer(1600)
    return row

In [4]:
catalog = intake.open_catalog('./catalog.yml')

In [5]:
shoreline_proj = catalog.stanford_shorelines.read().to_crs(shared_utils.geography_utils.CA_NAD83Albers)

In [6]:
ca_blocks = gpd.read_parquet(f'{GCS_FILE_PATH}/2020_tiger_block_geo.parquet')
ca_blocks = (ca_blocks >> filter(_.ALAND20 > 10) ## remove water
             >> select(_.county == _.COUNTYFP20, _.tract == _.TRACTCE20, _.block == _.BLOCKCE20,
                               _.geo_id == _.GEOID20, _.geometry))
ca_blocks = ca_blocks.to_crs(shared_utils.geography_utils.CA_NAD83Albers)

In [7]:
ca_blocks >> head(3)

Unnamed: 0,county,tract,block,geo_id,geometry
0,17,30706,3012,60170307063012,"POLYGON ((-93795.217 73341.696, -93736.960 734..."
1,17,30601,3006,60170306013006,"POLYGON ((-88050.178 97425.705, -88049.410 974..."
2,65,42628,4008,60650426284008,"POLYGON ((257831.965 -463783.472, 257867.237 -..."


In [8]:
ca_block_pop = catalog.ca_block_population.read()

In [9]:
ca_block_pop['GEO_ID'] = ca_block_pop.GEO_ID.apply(lambda x: x.split('US')[1])
ca_block_pop = ca_block_pop >> select(_.geo_id == _.GEO_ID, _.block_pop == _.P1_001N)

In [10]:
ca_block_pop >> arrange(-_.block_pop) >> head(3)

Unnamed: 0,geo_id,block_pop
134075,60372653011006,8727
301621,60659810001003,5972
365951,60730094001036,5917


In [11]:
ca_block_joined = ca_blocks >> inner_join(_, ca_block_pop, on='geo_id')

In [12]:
ca_block_joined >> head(3)

Unnamed: 0,county,tract,block,geo_id,geometry,block_pop
0,17,30706,3012,60170307063012,"POLYGON ((-93795.217 73341.696, -93736.960 734...",363
1,17,30601,3006,60170306013006,"POLYGON ((-88050.178 97425.705, -88049.410 974...",64
2,65,42628,4008,60650426284008,"POLYGON ((257831.965 -463783.472, 257867.237 -...",103


In [13]:
# sf = ca_block_joined >> filter(_.county == '075')

In [14]:
# simple_map(sf, 'block')

In [15]:
ca_block_joined = ca_block_joined.astype({'county':'int64', 'tract':'int64', 'block':'int64', 'geo_id':'int64'})
ca_block_joined['area'] = ca_block_joined.geometry.area
ca_block_joined['geometry'] = ca_block_joined.geometry.simplify(tolerance=100)

In [16]:
all_stops = get_stops_and_trips(filter_accessible = False)
all_stops = all_stops.apply(buffer_by_route_type, axis=1)



In [17]:
accessible_stops_trips = get_stops_and_trips(filter_accessible = True)
accessible_stops_trips = accessible_stops_trips.apply(buffer_by_route_type, axis=1)

## Adding RT Availability

In [18]:
import calitp.magics

In [19]:
%%capture
%%sql -o feed_extract_date

SELECT
    *,
    PARSE_DATE(
      '%Y-%m-%d',
      REGEXP_EXTRACT(_FILE_NAME, ".*/([0-9]+-[0-9]+-[0-9]+)")
    ) AS extract_date
FROM gtfs_schedule_history.calitp_feeds_raw

In [20]:
latest = feed_extract_date >> filter(_.extract_date == _.extract_date.max())

In [21]:
rt_complete = latest >> filter(-_.gtfs_rt_vehicle_positions_url.isna(),
                              -_.gtfs_rt_service_alerts_url.isna(),
                              -_.gtfs_rt_trip_updates_url.isna())

In [22]:
rt_complete = (rt_complete >> select(_.calitp_itp_id == _.itp_id, _.calitp_url_number == _.url_number))

In [23]:
rt_complete.head(3)

Unnamed: 0,calitp_itp_id,calitp_url_number
3,183,0
10,200,0
1245,282,1


# Accessible Transit Metrics

### New Accessibilty Metric (Area)

The % of non-water area of California that is within 1/4 mi of a bus stop or 1 mi of a ferry/rail stop that is explicitly wheelchair accessible (and if in a station, that station has explicit pathways coding), and that has is served by a public-funded, open to the general public transit service with GTFS Schedule data that is served by a service that is explicitly wheelchair accessible

### New Accessibility Metric (Population)

 The % of Californians that are within 1/4 mi of a bus stop or 1 mi of a ferry/rail stop that is explicitly wheelchair accessible (and if in a station, that station has explicit pathways coding), and that has is served by a public-funded, open to the general public transit service with GTFS Schedule data that is served by a service that is explicitly wheelchair accessible

### Notes and Caveats:

* The vast majority of accessible services don't appear to provide accessibility data in GTFS
* Pathways appears to be empty in data warehouse for now...

## Block Level Metrics

### Accessible Static

In [24]:
block_level_accessible = (ca_block_joined
                          .sjoin(accessible_stops_trips, how='inner', predicate='intersects')
                        .drop_duplicates(subset=['geo_id'])) ##important at block level to avoid double counts

In [25]:
calculate_access_proportion(block_level_accessible, ca_block_joined, 'block_pop')

8.49

In [26]:
calculate_access_proportion(block_level_accessible, ca_block_joined, 'area')

0.74

In [27]:
# simple_map(accessible_stops_trips, 'calitp_itp_id')

### All Static

#### New General Metric (by area):

The % of non-water area of Californian that is within 1/4 mi of a bus stop or 1 mi of a ferry/rail stop that has is served by a public-funded, open to the general public transit service with GTFS Schedule data


#### New General Metric (by population):

The % of Californians that live within 1/4 mi of a bus stop or 1 mi of a ferry/rail stop that has is served by a public-funded, open to the general public transit service with GTFS Schedule data


In [28]:
block_level_static = (ca_block_joined
                          .sjoin(all_stops, how='inner', predicate='intersects')
                        # .drop_duplicates(subset=['geo_id'])
                     ) ## not dropping enables correct feed aggregations...

In [29]:
calculate_access_proportion(block_level_static.drop_duplicates(subset=['geo_id']),
                            ca_block_joined, 'block_pop')

76.87

In [30]:
calculate_access_proportion(block_level_static.drop_duplicates(subset=['geo_id']),
                            ca_block_joined, 'area')

9.04

### All RT

In [31]:
all_stops_rt = block_level_static >> inner_join(_, rt_complete, on =['calitp_itp_id', 'calitp_url_number'])

In [32]:
calculate_access_proportion(all_stops_rt.drop_duplicates(subset=['geo_id']),
                            ca_block_joined, 'block_pop')

53.05

In [33]:
calculate_access_proportion(all_stops_rt.drop_duplicates(subset=['geo_id']),
                            ca_block_joined, 'area')

4.23

### Accessible RT

In [34]:
accessible_stops_trips_rt = block_level_accessible >> inner_join(_, rt_complete, on =['calitp_itp_id', 'calitp_url_number'])

In [35]:
calculate_access_proportion(accessible_stops_trips_rt.drop_duplicates(subset=['geo_id']),
                            ca_block_joined, 'block_pop')

3.11

In [36]:
calculate_access_proportion(accessible_stops_trips_rt.drop_duplicates(subset=['geo_id']),
                            ca_block_joined, 'area')

0.29

## Employment Metrics (Tract Level)

In [37]:
service_path = 'gs://calitp-analytics-data/data-analyses/bus_service_increase/'

In [38]:
## Read in processed df from bus_service_increase/B1
tract_pop_employ = gpd.read_parquet(f'{service_path}bus_stop_times_by_tract.parquet')
tract_pop_employ = tract_pop_employ >> select(-_.num_arrivals, -_.stop_id, -_.itp_id)

In [39]:
tract_pop_employ.head(3)

Unnamed: 0,Tract,ZIP,Population,sq_mi,pop_sq_mi,overall_ptile,pollution_ptile,popchar_ptile,equity_group,pollution_group,popchar_group,County,City,geometry,num_jobs,jobs_sq_mi,num_pop_jobs,popjobs_sq_mi
0,6001400100,94704,3120,2.655917,1174.735672,2.79879,26.621033,1.525466,1.0,1,1.0,Alameda,Oakland,"POLYGON ((-122.24408 37.88322, -122.24198 37.8...",936,352.420702,4056,1527.156374
1,6001400200,94618,2007,0.229901,8729.842564,2.874433,24.181705,1.651538,1.0,1,1.0,Alameda,Oakland,"POLYGON ((-122.24191 37.85181, -122.24202 37.8...",1357,5902.539292,3364,14632.381857
2,6001400300,94618,5051,0.427356,11819.185546,15.935451,33.366521,12.266768,1.0,2,1.0,Alameda,Oakland,"POLYGON ((-122.24590 37.84500, -122.25241 37.8...",1978,4628.459515,7029,16447.645061


In [40]:
tract_pop_employ = tract_pop_employ.to_crs(
                        shared_utils.geography_utils.CA_NAD83Albers)
tract_pop_employ['area'] = tract_pop_employ.geometry.area

In [41]:
## option to filter out large tracts (not useful access data...)
## 4 sq km threshold
tract_pop_employ['under_4_sq_km'] = tract_pop_employ.area < 4e+06

In [42]:
job_density = tract_pop_employ >> group_by('under_4_sq_km') >> summarize(jobs = _.num_jobs.sum())
job_density

Unnamed: 0,under_4_sq_km,jobs
0,False,6770495
1,True,10386412


In [43]:
## about 60% of CA jobs are in tracts under 4 sq km (scope for this analysis)
(job_density >> filter(_.under_4_sq_km) >> select(_.jobs)).sum() / (job_density >> select(_.jobs)).sum()

jobs    0.605378
dtype: float64

In [44]:
## filter out large tracts
tract_pop_employ_filtered = tract_pop_employ >> filter(_.under_4_sq_km)

In [45]:
all_employment_joined = (tract_pop_employ_filtered
                    .sjoin(all_stops, how='inner', predicate='intersects')
                    # .drop_duplicates(subset=['Tract'])
                   ) >> select(-_.index_right, -_.index_left)

In [46]:
accessible_employment_joined = (tract_pop_employ_filtered
                    .sjoin(accessible_stops_trips, how='inner', predicate='intersects')
                    # .drop_duplicates(subset=['Tract'])
                   ) >> select(-_.index_right, -_.index_left)

In [47]:
## all stops employment access, jobs
calculate_access_proportion(all_employment_joined.drop_duplicates(subset=['Tract']), tract_pop_employ_filtered, 'num_jobs')

99.21

In [48]:
## accessible stops employment access, jobs
calculate_access_proportion(accessible_employment_joined.drop_duplicates(subset=['Tract']), tract_pop_employ_filtered, 'num_jobs')

17.84

In [49]:
accessible_stops_trips_rt = accessible_stops_trips_rt >> select(-_.index_right, -_.index_left)

In [50]:
acc_rt_employ = (tract_pop_employ_filtered
                    .sjoin(accessible_stops_trips_rt, how='inner', predicate='intersects')
                    .drop_duplicates(subset=['Tract'])
                   )

In [51]:
## accessible with RT stops employment access, population
calculate_access_proportion(acc_rt_employ, tract_pop_employ_filtered, 'num_jobs')

11.32

In [52]:
all_stops_rt = all_stops_rt >> select(-_.index_right, -_.index_left)

In [53]:
all_rt_employ = (tract_pop_employ_filtered
                    .sjoin(all_stops_rt, how='inner', predicate='intersects')
                    .drop_duplicates(subset=['Tract'])
                   )

In [54]:
## all RT stops employment access, population
calculate_access_proportion(all_rt_employ, tract_pop_employ_filtered, 'num_jobs')

82.34

### Summarizing Coverage

In [55]:
## since employment data is tract-level, only includes tracts < 4 sq km (~60% of jobs)
employment_summary = (all_employment_joined
 >> group_by(_.calitp_itp_id)
 >> distinct(_.Tract, _keep_all=True) ##geo_id must be distinct per feed...
 >> summarize(total_jobs = _.num_jobs.sum())
)

In [56]:
SQ_MI_PER_SQ_M = 3.86e-7

coverage_summary = (block_level_static
 >> group_by(_.calitp_itp_id)
 >> distinct(_.geo_id, _keep_all=True) ##geo_id must be distinct per feed...
 >> summarize(total_sq_mi = (_.area.sum() * SQ_MI_PER_SQ_M).astype('int64'),
             total_pop = _.block_pop.sum(),
             ))

In [57]:
coverage_summary = coverage_summary >> inner_join(_, employment_summary, on ='calitp_itp_id')

In [58]:
coverage_summary['any_accessible_stops_trips'] = coverage_summary.apply(lambda x: x.calitp_itp_id in (
                             block_level_accessible.calitp_itp_id.unique()), axis=1)
coverage_summary['any_rt'] = coverage_summary.apply(lambda x: x.calitp_itp_id in (
                             rt_complete.calitp_itp_id.unique()), axis=1)

In [59]:
with_name = coverage_summary >> inner_join(_, tbl.gtfs_schedule.agency() >> distinct(_.agency_name, _.calitp_itp_id) >> collect(), on = 'calitp_itp_id')

In [60]:
# with_name = with_name >> filter(_.calitp_itp_id.apply(lamb) != 1 0, _.calitp_itp_id != 8,
#                                 _.calitp_itp_id != 2, _.calitp_itp_id != 3) ## see issue 988
coverage_summary = coverage_summary >> filter(_.calitp_itp_id.apply(lambda x: x not in [0,1,2,3,8,6])) ## see issue 988

In [61]:
coverage_summary.to_csv('draft_feed_metrics.csv')

In [73]:
row_metrics = {'Population': 'block_pop', 'Land Area': 'area', 'Jobs (<4sq km tracts only)': 'num_jobs'}
col_geographies = {'GTFS Static': [block_level_static.drop_duplicates(subset=['geo_id']),
                                   all_employment_joined.drop_duplicates(subset=['Tract'])],
                   'Accessible Static': [block_level_accessible.drop_duplicates(subset=['geo_id']),
                                         accessible_employment_joined.drop_duplicates(subset=['Tract'])],
                   'GTFS RT': [all_stops_rt.drop_duplicates(subset=['geo_id']), all_rt_employ],
                   'Accessible RT': [accessible_stops_trips_rt.drop_duplicates(subset=['geo_id']),
                                     acc_rt_employ.drop_duplicates(subset=['Tract'])]}

In [75]:
summary_df = pd.DataFrame()

for row in row_metrics.keys():
    to_append = {}
    if row == 'Jobs (<4sq km tracts only)':
        for col in col_geographies.keys():
            metric = calculate_access_proportion(col_geographies[col][1], tract_pop_employ_filtered, row_metrics[row])
            to_append[col] = metric
    else:
        for col in col_geographies.keys():
            metric = calculate_access_proportion(col_geographies[col][0], ca_block_joined, row_metrics[row])
            to_append[col] = metric
    to_append = pd.DataFrame(to_append, index = [row])
    summary_df = summary_df.append(to_append)    

## Summary of all Metrics

* Population metrics: percent of CA population within a block group near a qualifying transit stop, excluding block groups > 4 sq km
* Land Area metrics: percent of CA land area made up of block groups near a qualifying transit stop, excluding block groups > 4 sq km
* Employment metrics: percent of CA jobs in a census tract near a qualifying transit stop, excluding tracts > 4 sq km
    * this currently means the analysis only looks at about 60% of CA jobs, it could be made more precise by pulling finer-grained employment data if desired
    * likely makes transit look "better" than a job analysis of all jobs, since these urban/suburban tracts are probably more likely to have at least some transit service

In [76]:
summary_df

Unnamed: 0,GTFS Static,Accessible Static,GTFS RT,Accessible RT
Population,76.87,8.49,53.05,3.11
Land Area,9.04,0.74,4.23,0.29
Jobs (<4sq km tracts only),99.21,17.84,82.34,11.32


### Maps and Charts

* maps should show for each metric:
    * covered regions
    * non-covered regions
    * _regions outsde analysis (large block groups and tracts)_ non-issue with block data

In [273]:
all_blocks_basemap = block_level_static.copy()
all_blocks_basemap.geometry = all_blocks_basemap.buffer(0)

In [None]:
m = simple_map(all_blocks_basemap);

In [None]:
def make_geo_data(gdf, color):
    geo_data = GeoData(geo_dataframe = gdf.to_crs('EPSG:4326'),
                   style={'color': 'black', 'fillColor': color,
                                'opacity':0.2, 'weight':.5, 'dashArray':'2', 'fillOpacity':0.3},
                   hover_style={'fillColor': 'red' , 'fillOpacity': 0.3},
                   name = 'data')
    return geo_data

In [None]:
## dissolve geos first? (for speed)
## quick function to append as seperate layers

In [None]:
unserved = ca_blocks_joined >> anti_join(_, block_level_static, on = 'geo_id') ## in analysis, not served
# out = ca_bg_geo >> filter(-_.under_4_sq_km) ## out of analysis scope (geo too large) #999999

In [None]:
m.add_layer(make_geo_data(unserved.dissolve(), '#f1a340'))
# m.add_layer(make_geo_data(out.dissolve(), '#999999'))

### GTFS Schedule only

![GTFS Schedule coverage](img/schedule.jpg)

In [None]:
m

In [65]:
m_accessible = simple_map(accessible_geo_joined.dissolve());
unserved = ca_bg_geo >> anti_join(_, accessible_geo_joined, on = 'GEOID') >> filter(_.under_4_sq_km) ## in analysis, not served
m_accessible.add_layer(make_geo_data(unserved.dissolve(), '#f1a340'))
m_accessible.add_layer(make_geo_data(out.dissolve(), '#999999'))

### Accessible Trips (static)

![accessible GTFS coverage](img/accessible.jpg)

In [74]:
# m_accessible

In [67]:
m_rt = simple_map(all_stops_rt.dissolve());
unserved = ca_bg_geo >> anti_join(_, all_stops_rt, on = 'GEOID') >> filter(_.under_4_sq_km) ## in analysis, not served
m_rt.add_layer(make_geo_data(unserved.dissolve(), '#f1a340'))
m_rt.add_layer(make_geo_data(out.dissolve(), '#999999'))

### Trips by operators with GTFS-RT

![rt operators](img/rt.jpg)

In [73]:
# m_rt

In [69]:
m_rt_access = simple_map(accessible_stops_trips_rt.dissolve());
unserved = ca_bg_geo >> anti_join(_, accessible_stops_trips_rt, on = 'GEOID') >> filter(_.under_4_sq_km) ## in analysis, not served
m_rt_access.add_layer(make_geo_data(unserved.dissolve(), '#f1a340'))
m_rt_access.add_layer(make_geo_data(out.dissolve(), '#999999'))

### Accessible Trips by operators with GTFS-RT

![accessible trips, RT operator](img/rt_accessible.jpg)

## Block Group Metrics

In [16]:
total_pop_var = 'B01001_001E'

In [17]:
## can hit Census API limits with frequent runs?

# blockgrp_pop = get_census_ca_counties(total_pop_var, 'block%20group')
# blockgrp_pop = blockgrp_pop.rename(columns={'B01001_001E': 'block_grp_pop',
#                                            'block group': 'block_grp'})

In [18]:
## read pre-fetched 2019 ACS data from catalog
blockgrp_pop = (catalog.ca_bg_population.read()
                .rename(columns={'block group': 'block_grp',
                                'population': 'block_grp_pop'})
                .astype({'block_grp_pop': 'int64', 'state': 'int64',
                        'county': 'int64', 'tract': 'int64',
                        'block_grp': 'int64'})
               )

In [19]:
# blockgrp_pop.head(3)

In [24]:
# ca_bg_geo = get_ca_block_group_geo()

In [25]:
# shared_utils.utils.geoparquet_gcs_export(ca_bg_geo, GCS_FILE_PATH,
#                                          'ca_block_grp_clipped_prj')

In [26]:
ca_bg_geo = gpd.read_parquet(f'{GCS_FILE_PATH}ca_block_grp_clipped_prj.parquet')
ca_bg_geo['area'] = ca_bg_geo.geometry.area

In [27]:
ca_bg_geo = ca_bg_geo.astype({'STATEFP': 'int64',
                                            'COUNTYFP': 'int64',
                                            'TRACTCE': 'int64',
                                            'BLKGRPCE': 'int64'})
ca_bg_geo = ca_bg_geo.rename(columns={'STATEFP': 'state',
                                                   'COUNTYFP': 'county',
                                                   'TRACTCE': 'tract',
                                                   'BLKGRPCE': 'block_grp'})

In [28]:
ca_bg_geo = ca_bg_geo >> inner_join(_, blockgrp_pop, on = ['state', 'county', 'tract', 'block_grp'])

In [29]:
## drop large block groups (not useful access data...)
## 4 sq km threshold
ca_bg_geo['under_4_sq_km'] = ca_bg_geo.area < 4e+06
ca_bg_geo_filtered = ca_bg_geo >> filter(_.under_4_sq_km)

In [51]:
accessible_geo_joined = ca_bg_geo_filtered.sjoin(accessible_stops_trips, how='inner', predicate='intersects')
accessible_geo_joined = accessible_geo_joined.drop_duplicates(subset=['GEOID'])

#### Current Percentage of CA population in block groups within .25mi of accessible transit

In [54]:
calculate_access_proportion(accessible_geo_joined, ca_bg_geo_filtered, 'block_grp_pop')

11.47

#### Current Percentage of CA land area in block groups within .25mi of accessible transit

In [35]:
calculate_access_proportion(accessible_geo_joined, ca_bg_geo_filtered, 'area')

10.9

In [24]:
all_stops = get_stops_and_trips(filter_accessible = False)
all_stops = all_stops.apply(buffer_by_route_type, axis=1)

In [25]:
all_stops_joined = (ca_bg_geo_filtered
                    .sjoin(all_stops, how='inner', predicate='intersects')
                    .drop_duplicates(subset=['GEOID'])
                   )

In [26]:
# all_stops_pop_joined = all_stops_joined >> inner_join(_, blockgrp_pop, on=['state', 'county', 'tract', 'block_grp'])

#### Current Percentage of CA population in block groups near transit with GTFS Schedule data

In [27]:
calculate_access_proportion(all_stops_joined, blockgrp_pop, 'block_grp_pop')

79.27

#### Current Percentage of CA land area in block groups near transit with GTFS Schedule data

In [28]:
calculate_access_proportion(all_stops_joined, ca_bg_geo_filtered, 'area')

90.09