In [1]:
import calitp
from calitp.tables import tbl
from siuba import *

import pandas as pd
import numpy as np
import geopandas as gpd
import fiona

from ipyleaflet import Map, GeoJSON, projections, basemaps, GeoData, LayersControl, WidgetControl, GeoJSON
from ipywidgets import Text, HTML

import shapely
from shapely.geometry import LineString, MultiPoint
from shapely.ops import split, substring

import zlib
import datetime as dt



### High Quality Transit Areas Relevant Statutes

[PRC 21155](https://leginfo.legislature.ca.gov/faces/codes_displaySection.xhtml?sectionNum=21155.&lawCode=PRC)
* Major transit stop definition: _A major transit stop is as defined in Section 21064.3, except that, for purposes of this section, it also includes major transit stops that are included in the applicable regional transportation plan_
* High-quality transit corridor definition: _For purposes of this section, a high-quality transit corridor means a corridor with fixed route bus service with service intervals no longer than 15 minutes during peak commute hours._
    * Unable to locate definition of "peak commute hours"

[PRC 21064.3](https://leginfo.legislature.ca.gov/faces/codes_displaySection.xhtml?sectionNum=21064.3.&lawCode=PRC)
* _Major transit stop means a site containing any of the following:
(a) An existing rail or bus rapid transit station.
(b) A ferry terminal served by either a bus or rail transit service.
(c) The intersection of two or more major bus routes with a frequency of service interval of 15 minutes or less during the morning and afternoon peak commute periods._
    * "Intersection" may not be sufficiently well-defined for this analysis

[PRC 21060.2](https://leginfo.legislature.ca.gov/faces/codes_displaySection.xhtml?lawCode=PRC&sectionNum=21060.2.&highlight=true&keyword=bus%20rapid%20transit)
* _(a) “Bus rapid transit” means a public mass transit service provided by a public agency or by a public-private partnership that includes all of the following features:
(1) Full-time dedicated bus lanes or operation in a separate right-of-way dedicated for public transportation with a frequency of service interval of 15 minutes or less during the morning and afternoon peak commute periods.
(2) Transit signal priority.
(3) All-door boarding.
(4) Fare collection system that promotes efficiency.
(5) Defined stations._
    * Unlikely to determine if a service qualifies as BRT under this definition using GTFS alone

In [2]:
## The definition of "peak" is still pending. For now the AM peak covers
## hours 6-8 and the PM peak covers hours 16-18 (range end is exclusive).
am_peak, pm_peak = range(6, 9), range(16, 19)

In [3]:
## Expand the peak ranges into explicit hour lists; commute_hours holds the
## AM hours followed by the PM hours.
am_commute_hours, pm_commute_hours = list(am_peak), list(pm_peak)
commute_hours = [*am_commute_hours, *pm_commute_hours]

In [4]:
commute_hours

[6, 7, 8, 16, 17, 18]

In [22]:
def create_segments(geometry, prior_operators_hqta, segment_distance_meters=1500):
    '''Splits a Shapely LineString into consecutive, shorter LineStrings. If a
    MultiLineString is passed, each LineString in that collection is split
    separately.

    Parameters
    ----------
    geometry
        Positionally indexable collection of geometries (anything exposing
        ``.iloc``). Only the first element, ``geometry.iloc[0]``, is split.
    prior_operators_hqta
        Currently unused; reserved for the TODOs below.
    segment_distance_meters : int, default 1500
        Distance between consecutive segment start offsets, in the units of
        the geometry's coordinate system (callers in this notebook pass
        EPSG:6414 geometries, i.e. meters).

    Returns
    -------
    list
        One ``substring`` result per start offset, in order along each line.
        Start offsets run from 0 up to (but excluding) ``int(line.length)``,
        so a line shorter than one unit yields no segments.

    Raises
    ------
    ValueError
        If ``segment_distance_meters`` is not positive.
    '''
    ##TODO clip prior operator geometry to this linestring, preserve those geoms+ids before calculating new segments
    ##TODO keep full prior operator segments only, otherwise calculate new

    segment_distance_meters = int(segment_distance_meters)
    if segment_distance_meters <= 0:
        raise ValueError('segment_distance_meters must be a positive integer')

    geometry = geometry.iloc[0]
    ## multi-part geometries expose their parts through .geoms
    linestrings = geometry.geoms if hasattr(geometry, 'geoms') else [geometry]

    lines = []
    for linestring in linestrings:
        for start in range(0, int(linestring.length), segment_distance_meters):
            ## NOTE(review): the end offset of the last segment may exceed the line
            ## length; this relies on substring() handling that -- confirm
            lines.append(substring(linestring, start, start + segment_distance_meters))
    return lines

In [6]:
def find_stop_with_most_routes(segment, stops, route_count_by_stop):
    '''Given a shape segment, finds the stop serving the most routes within that segment.
    Adds that stop's stop_id and n_routes to the segment data (a row).

    Parameters
    ----------
    segment
        A single row with a ``geometry`` attribute; stops are clipped to it.
    stops
        GeoDataFrame of stops with a ``stop_id`` column.
    route_count_by_stop
        Table with ``stop_id`` and ``n_routes`` columns.

    Returns
    -------
    The same ``segment`` row. ``stop_id`` and ``n_routes`` are only set when at
    least one stop inside the segment also appears in ``route_count_by_stop``;
    otherwise the row is returned unchanged.
    '''
    stops_in_seg = gpd.clip(stops, segment.geometry)
    if stops_in_seg.empty:
        return segment

    ## debug_dict is a module-level global that only exists once
    ## single_operator_hqta has run; record to it only when it is available
    debug_store = globals().get('debug_dict')
    if debug_store is not None:
        debug_store['stops_in_seg'] = stops_in_seg

    candidates = stops_in_seg >> inner_join(_, route_count_by_stop, on = "stop_id")
    if candidates.empty:
        ## stops fall inside the segment but none has a route count;
        ## treat this the same as "no stops" instead of raising IndexError
        return segment

    ## first row among the stops tied for the highest route count
    busiest = candidates[candidates['n_routes'] == candidates['n_routes'].max()].iloc[0]
    segment['stop_id'] = busiest['stop_id']
    segment['n_routes'] = busiest['n_routes']
    return segment

In [7]:
def find_trips_in_segments(segments, stops, stop_times):
    '''Finds the trips serving the three busiest stops that fall within the segments.

    Stops are spatially joined to ``segments``, matched to ``stop_times`` on
    ``stop_id``, and ranked by their number of stop_times rows. Only rows for
    the three highest-ranked stops are kept (the ranking is across all segments
    passed in, not per segment).

    Returns
    -------
    The joined segment/stop/stop_times rows for those stops, one row per
    ``trip_id``; or ``None`` (after printing a message) when no stop falls
    within any segment.
    '''
    stops_in_seg = gpd.sjoin(segments, stops, how='inner')
    if stops_in_seg.empty:
        print('uh-oh')
        return

    stop_times_in_seg = stops_in_seg >> inner_join(_, stop_times, on = 'stop_id')
    trip_count_by_stop = (stop_times_in_seg
                          >> count(_.stop_id)
                          >> arrange(-_.n)
                          >> rename(n_trips = _.n))
    top_3_stops = trip_count_by_stop.stop_id.iloc[:3]

    ## the original computed this filter but discarded the result
    stop_times_in_seg = stop_times_in_seg[stop_times_in_seg['stop_id'].isin(top_3_stops)]
    return stop_times_in_seg.drop_duplicates(subset=['trip_id'])

In [8]:
def get_operator_views(itp_id):
    '''Returns relevant views from the data warehouse for a single transit operator.

    Parameters
    ----------
    itp_id
        The operator's calitp_itp_id; anything ``int()`` accepts (e.g. 300 or '300').

    Returns
    -------
    tuple
        ``(shapes, trips, stop_times, stops)`` where

        * ``shapes`` is a GeoDataFrame of shape points projected to EPSG:6414,
        * ``trips`` contains only trips on bus/trolleybus routes whose service_id
          has ``wednesday == '1'`` in the calendar table,
        * ``stop_times`` contains only the stop times of those trips,
        * ``stops`` is a GeoDataFrame of stop points projected to EPSG:6414.
    '''
    ## normalize once so every query below filters on the same integer id
    itp_id = int(itp_id)

    shapes = tbl.gtfs_schedule.shapes() >> filter(_.calitp_itp_id == itp_id) >> collect()
    shapes = gpd.GeoDataFrame(shapes,
                              geometry = gpd.points_from_xy(shapes.shape_pt_lon, shapes.shape_pt_lat),
                              crs = 'EPSG:4326').to_crs('EPSG:6414') ## https://epsg.io/6414 (meters)
    print('loaded shapes')

    cal_wednesdays = (tbl.gtfs_schedule.calendar()
                      >> filter(_.calitp_itp_id == itp_id)
                      >> filter(_.wednesday == '1')
                      >> collect())
    bus_routes = (tbl.gtfs_schedule.routes()
                    >> filter(_.calitp_itp_id == itp_id)
                    >> filter(_.route_type.isin(['3', '11'])) ## bus and trolleybus
                    >> select(_.route_id) >> collect())
    print('loaded bus routes')
    trips = (tbl.gtfs_schedule.trips()
             >> filter(_.calitp_itp_id == itp_id)
             >> filter(_.service_id.isin(cal_wednesdays.service_id))
             >> filter(_.route_id.isin(bus_routes.route_id))
             >> collect())
    print('loaded trips')
    ## all of the operator's stop times are collected first, then narrowed
    ## locally to the selected trips
    stop_times = (tbl.gtfs_schedule.stop_times()
                  >> filter(_.calitp_itp_id == itp_id)
                  >> collect())
    stop_times =  (stop_times >> filter(_.trip_id.isin(trips.trip_id))
                  >> select(-_.calitp_itp_id, -_.calitp_extracted_at))
    print('loaded stop times')

    stops = (tbl.gtfs_schedule.stops()
             >> filter(_.calitp_itp_id == itp_id)
             >> select(_.stop_id, _.stop_lat, _.stop_lon)
             >> collect())
    stops = gpd.GeoDataFrame(stops,
                     geometry = gpd.points_from_xy(stops.stop_lon, stops.stop_lat),
                     crs = 'EPSG:4326').to_crs('EPSG:6414') ## https://epsg.io/6414 (meters)
    print('loaded stops')

    return shapes, trips, stop_times, stops

In [9]:
def fix_arrival_time(gtfs_timestring):
    '''Reformats a GTFS timestamp (which allows the hour to exceed 24 to mark service day continuity)
    to standard 24-hour time.

    Parameters
    ----------
    gtfs_timestring : str
        Colon-separated time whose first field is the hour, e.g. '25:10:00'.

    Returns
    -------
    str
        The input unchanged when its hour is below 24; otherwise the same
        string with the hour wrapped into 0-23 and zero-padded to two digits
        (e.g. '25:10:00' -> '01:10:00', '49:30:00' -> '01:30:00').
    '''
    parts = gtfs_timestring.split(':')
    hour = int(parts[0])
    if hour < 24:
        return gtfs_timestring
    ## modulo (rather than a single subtraction) also handles hours of 48 and above
    parts[0] = f'{hour % 24:02d}'
    return ':'.join(parts)

In [10]:
def single_shape_hqta(shapes, trips, stop_times, stops, route_count_by_stop, shape_id, hqta, prior_operators_hqta=None):
    '''Starting with a single GTFS shape, split that shape into segments and determine if each segment qualifies
    as an HQTA. Existing segments within a shape are dropped for that shape, since their peak frequency and
    HQTA status would have already been calculated for a previous shape.

    Parameters
    ----------
    shapes
        GeoDataFrame of shape points (EPSG:6414) with shape_id, shape_pt_sequence,
        calitp_itp_id and calitp_extracted_at columns.
    trips
        Not used by this function; kept so existing callers keep working.
    stop_times
        Table with stop_id, trip_id and departure_time columns.
    stops, route_count_by_stop
        Passed through to find_stop_with_most_routes for each segment.
    shape_id
        The shape to process.
    hqta
        Segments already calculated for this operator (may be empty); their
        area is removed from this shape before it is segmented.
    prior_operators_hqta
        Passed through to create_segments.

    Returns
    -------
    One row per segment that has a key stop with commute-hour departures,
    including am_peak / pm_peak (trips per hour) and the boolean
    hq_transit_corr flag. Returns None (after printing the reason) when the
    shape cannot be evaluated.

    Notes
    -----
    Reads the module-level globals debug_ids, debug_dict, commute_hours and
    am_commute_hours / pm_commute_hours.
    '''
    debugging = shape_id in debug_ids

    single_shape = (shapes
         >> filter(_.shape_id == shape_id)
         >> mutate(shape_pt_sequence = _.shape_pt_sequence.astype('int64'))
         >> arrange(_.shape_pt_sequence) ##arrange, then convert to line to preserve order...
        )

    if len(single_shape) < 2:
        ## a LineString needs at least two points
        print(f'not enough shape points for shape {shape_id}')
        return

    route_line = LineString(list(single_shape['geometry']))
    line_info = pd.DataFrame(
        single_shape[['calitp_itp_id', 'shape_id', 'calitp_extracted_at']].iloc[[0]]).copy() ##preserve info cols
    single_line = gpd.GeoDataFrame(line_info, geometry=[route_line], crs='EPSG:6414')

    if debugging:
        print(f'***debug shape*** {shape_id}')
        debug_dict[f'{shape_id}_single_line'] = single_line
        debug_dict[f'{shape_id}_hqta'] = hqta

    if not hqta.empty:
        already_calculated = hqta.dissolve(by='calitp_itp_id') ## get single polygon of HQTA calculation complete area
        single_line = single_line.overlay(already_calculated, how='difference') ## drop calculation complete area from current shape
        if single_line.empty:
            print(f'no line for shape {shape_id}')
            return

    segment_geoms = list(create_segments(single_line.geometry, prior_operators_hqta))
    if not segment_geoms:
        print(f'no segments for shape {shape_id}')
        return

    ## build all segment rows at once: the info columns repeated once per segment.
    ## reset_index() (without drop) keeps the 'index' column the row-by-row
    ## construction used to produce.
    info_row = pd.DataFrame(single_line.drop(columns=['geometry']))
    segment_rows = info_row.iloc[[0] * len(segment_geoms)].reset_index()
    segmented = gpd.GeoDataFrame(segment_rows, geometry=segment_geoms, crs='EPSG:6414')
    segmented['segment_sequence'] = segmented.index.astype(str)
    segmented['calitp_itp_id'] = segmented['calitp_itp_id'].astype(str)

    ## compute (hopefully unique) hash of segment id that can be used across routes/operators
    segmented['hqta_segment_id'] = [
        zlib.crc32(f'{itp}{shp}{seq}'.encode('utf-8'))
        for itp, shp, seq in zip(segmented['calitp_itp_id'],
                                 segmented['shape_id'],
                                 segmented['segment_sequence'])]

    segmented.geometry = segmented.geometry.buffer(50) ##generous buffer for street/sidewalk width? Required to spatially find stops within each segment

    if debugging:
        debug_dict[f'{shape_id}_segmented'] = segmented
        debug_dict[f'{shape_id}_stops'] = stops
        debug_dict[f'{shape_id}_route_ct_by_stop'] = route_count_by_stop

    segments_with_max_stop = segmented.apply(find_stop_with_most_routes, axis=1,
                                             args = (stops, route_count_by_stop))
    if 'stop_id' not in segments_with_max_stop.columns:
        print(f'no stops for shape {shape_id}')
        return ## no stops within any segment

    ## filter stop_times to the key stop of each segment; segments without a key
    ## stop are left out so missing stop_ids can never match each other
    key_stop_segments = segments_with_max_stop.dropna(subset=['stop_id'])
    max_stop_times = (stop_times
                 >> select(_.stop_id, _.trip_id, _.departure_time)
                 >> inner_join(_, key_stop_segments, on = "stop_id")
                )
    ## rows without a departure time cannot be assigned to an hour
    max_stop_times = max_stop_times.dropna(subset=['departure_time']).copy()

    if debugging:
        debug_dict[f'{shape_id}_max_stop0'] = max_stop_times

    max_stop_times['departure_time'] = max_stop_times['departure_time'].apply(fix_arrival_time) ## reformat GTFS time to a format datetime can ingest
    max_stop_times['departure_hour'] = max_stop_times['departure_time'].apply(
        lambda x: dt.datetime.strptime(x, '%H:%M:%S').hour)

    max_stop_times = max_stop_times[max_stop_times['departure_hour'].isin(commute_hours)].copy()

    if max_stop_times.empty:
        print(f'no commute hour trips for shape {shape_id}')
        return

    if debugging:
        debug_dict[f'{shape_id}_max_stop'] = max_stop_times

    ## only commute-hour rows remain, so anything that is not AM peak is PM peak
    max_stop_times['am_peak'] = max_stop_times['departure_hour'].isin(am_commute_hours)
    max_stop_times['pm_peak'] = ~max_stop_times['am_peak']

    segment_peak_service = max_stop_times.groupby(['hqta_segment_id'])[['am_peak', 'pm_peak']].sum() ## count total trips at stop during each peak period

    ## convert to trips per hour
    segment_peak_service['am_peak'] = (segment_peak_service['am_peak'] / len(am_commute_hours)).round(1)
    segment_peak_service['pm_peak'] = (segment_peak_service['pm_peak'] / len(pm_commute_hours)).round(1)

    if debugging:
        debug_dict[f'{shape_id}_segment_peak'] = segment_peak_service

    ## consider segment HQTA if stop within segment has at least 4 trips/hour in both peaks
    min_trips_per_hour = 4
    segment_peak_service['hq_transit_corr'] = (
        (segment_peak_service['am_peak'] >= min_trips_per_hour)
        & (segment_peak_service['pm_peak'] >= min_trips_per_hour))

    segment_peak_service = segment_peak_service.reset_index()

    ## groupby orders rows by the hashed segment id, which says nothing about
    ## position along the line; restore segment order before comparing neighbors
    sequence_by_segment = (segments_with_max_stop
                           .drop_duplicates(subset=['hqta_segment_id'])
                           .set_index('hqta_segment_id')['segment_sequence']
                           .astype(int))
    segment_peak_service['_sequence'] = segment_peak_service['hqta_segment_id'].map(sequence_by_segment)
    segment_peak_service = (segment_peak_service
                            .sort_values('_sequence')
                            .drop(columns=['_sequence'])
                            .reset_index(drop=True))

    ## consider segment not HQTA if both the prior and next segments are not HQTAs.
    ## A missing neighbor (first/last row) counts in the segment's favor. Only
    ## segments with peak-service data are compared, so "prior"/"next" skip any
    ## segment without a key stop or commute-hour trips.
    is_hq = segment_peak_service['hq_transit_corr'].astype(bool)
    prior_is_hq = is_hq.shift(1, fill_value=True)
    next_is_hq = is_hq.shift(-1, fill_value=True)
    segment_peak_service['hq_transit_corr'] = is_hq & (prior_is_hq | next_is_hq)

    single_hqta = segments_with_max_stop >> inner_join(_, segment_peak_service, on = 'hqta_segment_id')
    single_hqta = single_hqta.drop(columns=['calitp_extracted_at'])

    if debugging:
        debug_dict[f'{shape_id}_single_hqta'] = single_hqta

    return single_hqta

In [21]:
def single_operator_hqta(views, hqta_df=None):
    '''Calculates HQTA segments for every shape of a single operator.

    Parameters
    ----------
    views : tuple
        ``(shapes, trips, stop_times, stops)`` as returned by get_operator_views.
    hqta_df : optional
        HQTA results from previously processed operators. ``None`` or an empty
        frame means there are none.

    Returns
    -------
    geopandas.GeoDataFrame
        All segments calculated for the operator, in EPSG:6414. Empty when no
        shape produced a result.

    Notes
    -----
    Resets the module-level ``debug_dict`` global, which the per-shape helpers
    write to.
    '''
    global debug_dict
    debug_dict = {}

    shapes, trips, stop_times, stops = views

    ## a DataFrame has no unambiguous truth value, so test for None/empty explicitly
    if hqta_df is not None and not hqta_df.empty:
        prior_operators_hqta = hqta_df
    else:
        prior_operators_hqta = None

    distinct_routes = (trips
                   >> distinct(_.route_id, _.shape_id, _.direction_id, _keep_all = True)
                   >> select(_.calitp_itp_id, _.route_id, _.shape_id, _.direction_id, _.trip_id))

    ## one representative trip per route/shape/direction, counted per stop
    route_count_by_stop = (stop_times
                         >> select(_.stop_id, _.trip_id)
                         >> inner_join(_, distinct_routes, on = "trip_id")
                         >> count(_.stop_id)
                         >> rename(n_routes = _.n)
                         >> arrange(-_.n_routes))

    hqta = gpd.GeoDataFrame()
    ## start with shapes including the highest number of trips
    shapes_sorted = pd.Series(
        trips.groupby('shape_id').count().sort_values(by='trip_id', ascending=False).index)
    total_shapes = len(shapes_sorted)
    print(f'there are {total_shapes} shapes total')
    for ix, shape_id in shapes_sorted.items():
        print(f'calculating for shape_id {shape_id}')
        if ix % 25 == 0:
            print(f'progress: {ix}/{total_shapes} shapes ({round(((ix/total_shapes)*100), 2)}%)')
        shape_result = single_shape_hqta(shapes, trips, stop_times, stops, route_count_by_stop,
                                         shape_id, hqta, prior_operators_hqta)
        if shape_result is None or shape_result.empty:
            continue ## nothing to add for this shape
        ## the running result is passed to the next shape, so it is extended here
        ## rather than collected and concatenated once at the end
        combined = shape_result if hqta.empty else pd.concat([hqta, shape_result])
        hqta = gpd.GeoDataFrame(combined, geometry='geometry', crs='EPSG:6414')

    return hqta

In [12]:
debug_ids = ['shp-6-21']

In [13]:
bbb_views = get_operator_views(300)

loaded shapes
loaded bus routes
loaded trips
loaded stop times
loaded stops


In [18]:
bbb_hqta = single_operator_hqta(bbb_views)

there are 66 shapes total
calculating for shape_id 25351
progress: 0/66 shapes (0.0%)
calculating for shape_id 25352
calculating for shape_id 25315
calculating for shape_id 25325
calculating for shape_id 25331
calculating for shape_id 25314
calculating for shape_id 25326
calculating for shape_id 25333
calculating for shape_id 25354
calculating for shape_id 25358
calculating for shape_id 25365
calculating for shape_id 25368
calculating for shape_id 25382
calculating for shape_id 25400
no stops for shape 25400
calculating for shape_id 25399
no stops for shape 25399
calculating for shape_id 25320
calculating for shape_id 25323
calculating for shape_id 25376
no stops for shape 25376
calculating for shape_id 25380
no stops for shape 25380
calculating for shape_id 25340
calculating for shape_id 25337
calculating for shape_id 25363
calculating for shape_id 25369
calculating for shape_id 25372
no stops for shape 25372
calculating for shape_id 25343
calculating for shape_id 25346
progress: 25/6

In [15]:
# ccb_views = get_operator_views(87)

In [16]:
# ccb_hqta = single_operator_hqta(ccb_views)

### Mapping

In [15]:
def map_hqta(hqta):
    '''Draws HQTA results on an ipyleaflet map.

    The map is centered on the centroid of the first geometry in ``hqta`` and
    gets two layers: rows where ``hq_transit_corr`` is true ('HQTA') and the
    remaining rows ('non-HQTA'). Returns the Map.
    '''
    wgs84 = 'EPSG:4326'
    first_centroid = hqta['geometry'].to_crs(wgs84).iloc[0].centroid

    hqta_map = Map(basemap=basemaps.CartoDB.Positron,
                   center=[first_centroid.y, first_centroid.x], zoom=11)

    is_hq = hqta['hq_transit_corr']
    layer_specs = (
        ('HQTA', hqta[is_hq], '#3366cc'),
        ('non-HQTA', hqta[~is_hq], '#fec44f'),
    )
    for layer_name, subset, fill_color in layer_specs:
        layer = GeoData(geo_dataframe = subset.to_crs(wgs84),
                        style={'color': 'black', 'fillColor': fill_color,
                               'opacity':0.3, 'weight':.5, 'dashArray':'2', 'fillOpacity':0.3},
                        hover_style={'fillColor': 'red' , 'fillOpacity': 0.2},
                        name = layer_name)
        hqta_map.add_layer(layer)

    hqta_map.add_control(LayersControl())

    return hqta_map

In [19]:
map_hqta(bbb_hqta)

Map(center=[34.024461994324184, -118.41142785170783], controls=(ZoomControl(options=['position', 'zoom_in_text…

In [30]:
# act_hqta.to_file('./ac_transit_test.geojson', driver='GeoJSON')
# bbb_hqta.to_file('./smbbb_test.geojson', driver='GeoJSON')
# lbt.to_file('./lbt_test.geojson', driver='GeoJSON')
# lacmta.to_file('./lacmta_test.geojson', driver='GeoJSON')

## AC Transit

In [17]:
map_hqta(act_hqta)

Map(center=[37.80464236021857, -122.27063925338948], controls=(ZoomControl(options=['position', 'zoom_in_text'…

### AC Transit Map Image

![map](img/act.png)

## Santa Monica Big Blue Bus

In [18]:
map_hqta(bbb_hqta)

Map(center=[34.01444675024602, -118.49077832945909], controls=(ZoomControl(options=['position', 'zoom_in_text'…

### Santa Monica Big Blue Bus Map Image

![map](img/bbb.png)

## Long Beach Transit

In [28]:
map_hqta(lbt)

Map(center=[33.767382163422255, -118.18711911176604], controls=(ZoomControl(options=['position', 'zoom_in_text…

### Long Beach Transit Map Image

![map](img/lbt.png)

## Result

* Tested on several operators; the method approximates high-quality transit corridors but is still incomplete:
    * Segments that contain no stops never appear as HQTA -- these may need to be interpolated (e.g., freeway segments)
    * Various questionable short segments
    * HQTA classification is questionable for some routes; will investigate
* Working on improving/documenting the code, and towards a statewide proof of concept

## Multiple Operators

In [None]:
def multiple_operator_hqta(itp_ids):
    '''Calculates HQTA segments for several operators, one after another.

    Each operator is processed with single_operator_hqta, which also receives
    the results accumulated from the operators before it (``None`` for the
    first operator).

    Parameters
    ----------
    itp_ids
        Iterable of operator ids accepted by get_operator_views.

    Returns
    -------
    The combined results of all operators; an empty DataFrame when no operator
    produced any segments.
    '''
    hqta = pd.DataFrame()
    for itp_id in itp_ids:
        prior_hqta = None if hqta.empty else hqta
        operator_hqta = single_operator_hqta(get_operator_views(itp_id), prior_hqta)
        if operator_hqta is None or operator_hqta.empty:
            continue
        hqta = operator_hqta if hqta.empty else pd.concat([hqta, operator_hqta])

    return hqta

## Sandbox

shapes, trips, stop_times, stops

In [49]:
def simple_map(gdf, mouseover=None):
    '''Draws a GeoDataFrame on an ipyleaflet map with a fixed center ([34, -118.34]).

    If ``gdf`` has an ``hq_transit_corr`` column, it is drawn as two layers
    ('HQTA' and 'non-HQTA'); otherwise as a single layer named 'gdf'.

    Parameters
    ----------
    gdf
        GeoDataFrame to draw. The caller's frame is not modified.
    mouseover : str, optional
        Name of a feature property to show in a widget while hovering over a
        feature of any layer.

    Returns
    -------
    The Map.
    '''
    if 'calitp_extracted_at' in gdf.columns:
        gdf = gdf.drop(columns='calitp_extracted_at')
        ## NOTE(review): the 50 m buffer is only applied to frames that carry
        ## calitp_extracted_at -- confirm that coupling is intended
        gdf.geometry = gdf.geometry.buffer(50)

    m = Map(basemap=basemaps.CartoDB.Positron, center=[34, -118.34], zoom=11)

    if mouseover:
        html = HTML(f'hover to see {mouseover}')
        html.layout.margin = '0px 20px 20px 20px'
        control = WidgetControl(widget=html, position='topright')
        m.add_control(control)

        def update_html(feature,  **kwargs):
            html.value = '''
                <h3><b>{}</b></h3>
            '''.format(feature['properties'][mouseover])

    def make_layer(frame, fill_color, name):
        ## one styled layer, reprojected to lat/lon for display
        return GeoData(geo_dataframe = frame.to_crs('EPSG:4326'),
                       style={'color': 'black', 'fillColor': fill_color,
                              'opacity':0.3, 'weight':.5, 'dashArray':'2', 'fillOpacity':0.3},
                       hover_style={'fillColor': 'red' , 'fillOpacity': 0.2},
                       name = name)

    if 'hq_transit_corr' in gdf.columns:
        is_hq = gdf['hq_transit_corr']
        layers = [make_layer(gdf[is_hq], '#3366cc', 'HQTA'),
                  make_layer(gdf[~is_hq], '#fec44f', 'non-HQTA')]
    else:
        layers = [make_layer(gdf, '#3366cc', 'gdf')]

    for layer in layers:
        m.add_layer(layer)
        if mouseover:
            ## previously only the HQTA layer received the hover callback
            layer.on_hover(update_html)

    m.add_control(LayersControl())

    return m

In [50]:
simple_map(debug_dict['shp-3-01_single_hqta'], 'stop_id')

Map(center=[34, -118.34], controls=(ZoomControl(options=['position', 'zoom_in_text', 'zoom_in_title', 'zoom_ou…