In [32]:
!pip install -r requirements.txt

Collecting pygeos
  Downloading pygeos-0.11.1-cp39-cp39-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (2.1 MB)
[K     |████████████████████████████████| 2.1 MB 7.7 MB/s eta 0:00:01
Installing collected packages: pygeos
Successfully installed pygeos-0.11.1


In [1]:
import calitp
from calitp.tables import tbl
from siuba import *

import pandas as pd
import numpy as np
import geopandas as gpd
import fiona

from ipyleaflet import Map, GeoJSON, projections, basemaps, GeoData, LayersControl, WidgetControl, GeoJSON
from ipywidgets import Text, HTML

import shapely
from shapely.geometry import LineString, MultiPoint
from shapely.ops import split, substring

import zlib
import datetime as dt



### High Quality Transit Areas Relevant Statutes

[PRC 21155](https://leginfo.legislature.ca.gov/faces/codes_displaySection.xhtml?sectionNum=21155.&lawCode=PRC)
* Major transit stop definition: _A major transit stop is as defined in Section 21064.3, except that, for purposes of this section, it also includes major transit stops that are included in the applicable regional transportation plan_
* High-quality transit corridor definition: _For purposes of this section, a high-quality transit corridor means a corridor with fixed route bus service with service intervals no longer than 15 minutes during peak commute hours._
    * Unable to locate definition of "peak commute hours"

[PRC 21064.3](https://leginfo.legislature.ca.gov/faces/codes_displaySection.xhtml?sectionNum=21064.3.&lawCode=PRC)
* _Major transit stop means a site containing any of the following:
(a) An existing rail or bus rapid transit station.
(b) A ferry terminal served by either a bus or rail transit service.
(c) The intersection of two or more major bus routes with a frequency of service interval of 15 minutes or less during the morning and afternoon peak commute periods._
    * "Intersection" may not be sufficiently well-defined for this analysis

[PRC 21060.2](https://leginfo.legislature.ca.gov/faces/codes_displaySection.xhtml?lawCode=PRC&sectionNum=21060.2.&highlight=true&keyword=bus%20rapid%20transit)
* _(a) “Bus rapid transit” means a public mass transit service provided by a public agency or by a public-private partnership that includes all of the following features:
(1) Full-time dedicated bus lanes or operation in a separate right-of-way dedicated for public transportation with a frequency of service interval of 15 minutes or less during the morning and afternoon peak commute periods.
(2) Transit signal priority.
(3) All-door boarding.
(4) Fare collection system that promotes efficiency.
(5) Defined stations._
    * Unlikely to determine if a service qualifies as BRT under this definition using GTFS alone

In [2]:
##definition of "peak" pending
am_peak = range(6, 9)
pm_peak = range(16, 19)

In [3]:
am_commute_hours = list(am_peak)
pm_commute_hours = list(pm_peak)
commute_hours = am_commute_hours + pm_commute_hours

In [4]:
commute_hours

[6, 7, 8, 16, 17, 18]

In [5]:
def create_segments(geometry):
    '''Splits a Shapely LineString into smaller LineStrings. If a MultiLineString passed,
    splits each LineString in that collection. 
    '''
    
    lines = []
    segment_distance_meters = 1250
    geometry = geometry.iloc[0]
    if hasattr(geometry, 'geoms'): ##check if MultiLineString
        linestrings = geometry.geoms
    else:
        linestrings = [geometry]
    for linestring in linestrings:
        for i in range(0, int(linestring.length), segment_distance_meters):
            lines.append(substring(linestring, i, i+segment_distance_meters))
    return lines

In [6]:
def find_stop_with_high_trip_count(segment, stops, stop_times, rank, calculated_stops):
    '''Given a shape segment, finds the stop serving the most (or other rank) trips within that segment.
    Adds that stop's stop_id to segment data (a row).
    '''
    
    stops_in_seg = gpd.clip(stops, segment.geometry)
    if stops_in_seg.size == 0:
        return segment
    
    stop_times_in_seg = stops_in_seg >> inner_join(_, stop_times, on = 'stop_id')
    trip_count_by_stop = stop_times_in_seg >> count(_.stop_id) >> arrange(-_.n) >> rename(n_trips = _.n)
    try:
        stop_id = trip_count_by_stop['stop_id'].iloc[rank - 1]

        if stop_id in list(calculated_stops):
            return segment
        segment['stop_id'] = stop_id
        segment['n_trips'] = trip_count_by_stop['n_trips'].iloc[rank - 1]
        return segment
    except IndexError:
        return segment

In [7]:
def get_operator_views(itp_id):
    '''Returns relevant views from the data warehouse for a single transit operator.
    '''
    shapes = tbl.gtfs_schedule.shapes() >> filter(_.calitp_itp_id == int(itp_id)) >> collect()
    shapes = gpd.GeoDataFrame(shapes, 
                              geometry = gpd.points_from_xy(shapes.shape_pt_lon, shapes.shape_pt_lat),
                              crs = 'EPSG:4326').to_crs('EPSG:6414') ## https://epsg.io/6414 (meters)
    print('loaded shapes')
    if itp_id != 273:
        wednesday = (tbl.views.gtfs_schedule_fact_daily_service() 
                 >> filter(_.calitp_itp_id == int(itp_id)) >> collect())
        wednesday = wednesday >> arrange(-_.service_date)
        wednesday = wednesday[wednesday['service_date'].apply(lambda x: x.weekday() == 2)]
        wednesday = wednesday[wednesday['service_date'].apply(lambda x: x.month < 11)]
        wednesday = wednesday[wednesday['service_date'] == wednesday['service_date'].iloc[0]] ## pick most recent Wednesday from Oct or earlier
    
    bus_routes = (tbl.gtfs_schedule.routes()
                    >> filter(_.calitp_itp_id == int(itp_id))
                    >> filter(_.route_type.isin(['3', '11'])) ## bus and trolleybus
                    >> select(_.route_id) >> collect())
    print('loaded bus routes')
    
    if itp_id == 273:
        trips = (tbl.gtfs_schedule.trips()
         >> filter(_.calitp_itp_id == int(itp_id))
         >> filter(_.service_id.isin(['1'])) ## temporary hardcode for SacRT
         >> filter(_.route_id.isin(bus_routes.route_id))
         >> collect())
    else:
        trips = (tbl.gtfs_schedule.trips()
                 >> filter(_.calitp_itp_id == int(itp_id))
                 >> filter(_.service_id.isin(wednesday.service_id))
                 >> filter(_.route_id.isin(bus_routes.route_id))
                 >> collect())
    print('loaded trips')
    stop_times = (tbl.gtfs_schedule.stop_times()
                  >> filter(_.calitp_itp_id == int(itp_id))
                  >> collect())
    stop_times =  (stop_times >> filter(_.trip_id.isin(trips.trip_id))
                  >> select(-_.calitp_itp_id, -_.calitp_extracted_at))
    print('loaded stop times')

    stops = (tbl.gtfs_schedule.stops() 
             >> filter(_.calitp_itp_id == itp_id)
             >> select(_.stop_id, _.stop_lat, _.stop_lon)
             >> collect())
    stops = gpd.GeoDataFrame(stops,
                     geometry = gpd.points_from_xy(stops.stop_lon, stops.stop_lat),
                     crs = 'EPSG:4326').to_crs('EPSG:6414') ## https://epsg.io/6414 (meters)
    print('loaded stops')

    return shapes, trips, stop_times, stops

In [8]:
def fix_arrival_time(gtfs_timestring):
    '''Reformats a GTFS timestamp (which allows the hour to exceed 24 to mark service day continuity)
    to standard 24-hour time.
    '''
    split = gtfs_timestring.split(':')
    hour = int(split[0])
    if hour >= 24:
        split[0] = str(hour - 24)
        corrected = (':').join(split)
        return corrected.strip()
    else:
        return gtfs_timestring.strip()

In [18]:
def single_shape_hqta(shapes, trips, stop_times, stops, route_count_by_stop, shape_id, hqta):
    '''Starting with a single GTFS shape, split that shape into segments and determine if each segment qualifies
    as an HQTA. Existing segments within a shape are dropped for that shape, since their peak frequency and 
    HQTA status would have already been calculated for a previous shape.
    '''
    
    single_shape = (shapes
         >> filter(_.shape_id == shape_id)
         >> mutate(shape_pt_sequence = _.shape_pt_sequence.astype('int64'))
         >> arrange(_.shape_pt_sequence) ##arrange, then convert to line to preserve order...
        )
    
    route_line = LineString(list(single_shape['geometry']))
    single_line = single_shape[['calitp_itp_id', 'shape_id', 'calitp_extracted_at']].iloc[[0]] ##preserve info cols
    single_line['geometry'] = route_line
    single_line = gpd.GeoDataFrame(single_line, crs='EPSG:6414')
    
    if shape_id in debug_ids:
        print(f'***debug shape*** {shape_id}')
        debug_dict[f'{shape_id}_single_line'] = single_line
        debug_dict[f'{shape_id}_hqta'] = hqta

    calculated_stops = []
    if hqta.size != 0:
        already_calculated = hqta.dissolve(by='calitp_itp_id') ## get single polygon of HQTA calculation complete area
        single_line = single_line.overlay(already_calculated, how='difference') ## drop calculation complete area from current shape
        if single_line.size == 0:
            segments_with_max_stop = None
            print(f'no line for shape {shape_id}')
            return

    segmented = pd.DataFrame()
    for segment in create_segments(single_line.geometry):
        to_append = single_line.drop(columns=['geometry'])
        to_append['geometry'] = segment
        segmented = segmented.append(to_append)
        
    segmented = segmented.reset_index()
    segmented['segment_sequence'] = segmented.index.astype(str)
    segmented = segmented.astype({'calitp_itp_id': str})
    
    ## compute (hopefully unique) hash of segment id that can be used across routes/operators
    segmented['hqta_segment_id'] = segmented.apply(lambda x:
                zlib.crc32((x.calitp_itp_id + x.shape_id + x.segment_sequence).encode('utf-8')), axis=1)
    
    segmented.geometry = segmented.buffer(50) ##generous buffer for street/sidewalk width? Required to spatially find stops within each segment

    if shape_id in debug_ids:
        debug_dict[f'{shape_id}_segmented'] = segmented
        debug_dict[f'{shape_id}_stops'] = stops
        debug_dict[f'{shape_id}_route_ct_by_stop'] = route_count_by_stop
     
    segments_with_max_stop = segmented.apply(find_stop_with_high_trip_count, axis=1,
                                             args = (stops, stop_times, 1, calculated_stops))
    
    if not 'stop_id' in segments_with_max_stop.columns:
        segments_with_top2_stops = None
        print(f'no stops for shape {shape_id}')
        return ## no stops within segment

    max_stop_times = (stop_times 
                 >> select(_.stop_id, _.trip_id, _.departure_time)
                 >> inner_join(_, segments_with_max_stop, on = "stop_id")
                ) ## filter stop_times to the key stops in each segment
    max_stop_times = max_stop_times.dropna(subset=['departure_time'])
    max_stop_times = max_stop_times.drop_duplicates(subset=['trip_id','hqta_segment_id']) ## filter duplicates for top2 approach
    
    if shape_id in debug_ids:
        debug_dict[f'{shape_id}_max_stop0'] = max_stop_times
    
    max_stop_times['departure_time'] = max_stop_times['departure_time'].apply(fix_arrival_time) ## reformat GTFS time to a format datetime can ingest
    max_stop_times['departure_dt'] = max_stop_times['departure_time'].apply(lambda x:
                                                                    dt.datetime.strptime(x, '%H:%M:%S'))
    max_stop_times['departure_hour'] = max_stop_times['departure_dt'].apply(lambda x: x.hour)
    
    # max_stop_times = max_stop_times >> filter(_.departure_hour.isin(commute_hours))
    
    if max_stop_times.size == 0:
        print(f'no commute hour trips for shape {shape_id}')
        return
    
    if shape_id in debug_ids:
        debug_dict[f'{shape_id}_max_stop'] = max_stop_times
        
    max_stop_times['am_peak'] = max_stop_times['departure_hour'].apply(lambda x: x in am_commute_hours)
    max_stop_times['pm_peak'] = ~max_stop_times['am_peak']
    
    max_stop_times.drop_duplicates(subset = ['stop_id', 'departure_dt'], inplace=True) ## drop duplicates if duplicitous service_ids sneak through
    max_stop_times.drop(columns=['departure_dt'], inplace=True)
    
    segment_peak_service = max_stop_times.groupby(['hqta_segment_id'])[['am_peak', 'pm_peak']].sum() ## count total trips at stop during each peak period
    
    ## convert to trips per hour
    segment_peak_service['am_peak'] = (segment_peak_service['am_peak'] / len(am_commute_hours)).round(1)
    segment_peak_service['pm_peak'] = (segment_peak_service['pm_peak'] / len(pm_commute_hours)).round(1)
    
    if shape_id in debug_ids:
        debug_dict[f'{shape_id}_segment_peak'] = segment_peak_service

    ## consider segment HQTA if stop within segment has at least 4 trips/hour in both peaks
    segment_peak_service['hq_transit_corr'] = segment_peak_service.apply(lambda x:
                              True if x.am_peak >= 4 and x.pm_peak >= 4 else False, axis=1)
    
    segment_peak_service = segment_peak_service.reset_index()
    segment_peak_service['last_seg_hqta'] = segment_peak_service['hq_transit_corr'].shift(1)
    segment_peak_service['next_seg_hqta'] = segment_peak_service['hq_transit_corr'].shift(-1)
    
    ## consider segment not HQTA if both the prior and next segements are not HQTAs
    segment_peak_service['hq_transit_corr'] = segment_peak_service.apply(
            lambda x: False if x.hq_transit_corr == False else x.last_seg_hqta or x.next_seg_hqta, axis = 1)
    segment_peak_service['hq_transit_corr'] = segment_peak_service['hq_transit_corr'].fillna(True)
    
    single_hqta = segments_with_max_stop >> inner_join(_, segment_peak_service, on = 'hqta_segment_id')
    single_hqta.drop(columns=['calitp_extracted_at', 'next_seg_hqta', 'last_seg_hqta'], inplace=True)
    
    if shape_id in debug_ids:
        debug_dict[f'{shape_id}_single_hqta'] = single_hqta
    
    return single_hqta

In [19]:
def single_operator_hqta(views):  
    
    global debug_dict
    debug_dict = {}
    
    shapes, trips, stop_times, stops = views
    
    
    distinct_routes = (trips
                   >> distinct(_.route_id, _.shape_id, _.direction_id, _keep_all = True)
                   >> select(_.calitp_itp_id, _.route_id, _.shape_id, _.direction_id, _.trip_id))
    
    route_count_by_stop = (stop_times
                         >> select(_.stop_id, _.trip_id)
                         >> inner_join(_, distinct_routes, on = "trip_id")
                         >> count(_.stop_id)
                         >> rename(n_routes = _.n)
                         >> arrange(-_.n_routes))    
    
    hqta = gpd.GeoDataFrame()
    ## start with shapes including the highest number of trips
    shapes_sorted = trips.groupby('shape_id').count().sort_values(by='trip_id', ascending=False).index
    shapes_sorted = pd.Series(shapes_sorted)
    total_shapes = len(shapes_sorted)
    print(f'there are {total_shapes} shapes total')
    for ix, shape_id in shapes_sorted.items():
        print(f'calculating for shape_id {shape_id}')
        if ix % 25 == 0:
            print(f'progress: {ix}/{total_shapes} shapes ({round(((ix/total_shapes)*100), 2)}%)')
        # try:
        result = single_shape_hqta(shapes, trips, stop_times, stops, route_count_by_stop,
                                shape_id, hqta)
        hqta = hqta.append(result)
        # except:
            # print(f'unable to calculate HQTA for shape_id {shape_id}')
        try:
            hqta = hqta.set_crs('EPSG:6414')
        except:
            continue
        
    return hqta

In [20]:
debug_ids = ['25351']

In [12]:
bbb_views = get_operator_views(300)

loaded shapes
loaded bus routes
loaded trips
loaded stop times
loaded stops


In [21]:
bbb_hqta = single_operator_hqta(bbb_views)

there are 70 shapes total
calculating for shape_id 25351
progress: 0/70 shapes (0.0%)
***debug shape*** 25351
calculating for shape_id 25315
calculating for shape_id 25352
calculating for shape_id 25325
calculating for shape_id 25314
calculating for shape_id 25331
calculating for shape_id 25326
calculating for shape_id 25333
calculating for shape_id 25387
calculating for shape_id 25386
calculating for shape_id 25354
calculating for shape_id 25358
calculating for shape_id 25365
calculating for shape_id 25368
calculating for shape_id 25382
calculating for shape_id 25323
calculating for shape_id 25320
calculating for shape_id 25399
calculating for shape_id 25400
no stops for shape 25400
calculating for shape_id 25376
no line for shape 25376
calculating for shape_id 25380
no stops for shape 25380
calculating for shape_id 25340
calculating for shape_id 25337
calculating for shape_id 25363
calculating for shape_id 25346
calculating for shape_id 25343
progress: 25/70 shapes (35.71%)
calculati

In [23]:
debug_dict.keys()

dict_keys(['25351_single_line', '25351_hqta', '25351_segmented', '25351_stops', '25351_route_ct_by_stop', '25351_max_stop0', '25351_max_stop', '25351_segment_peak', '25351_single_hqta'])

In [25]:
max_stop = debug_dict['25351_max_stop']

In [26]:
max_stop.head(3)

Unnamed: 0,stop_id,trip_id,departure_time,index,calitp_itp_id,shape_id,calitp_extracted_at,geometry,segment_sequence,hqta_segment_id,n_trips,departure_hour,am_peak,pm_peak
0,1409,859796,10:59:00,11214,300,25351,2021-10-21,"POLYGON ((147058.435 -442840.043, 147038.878 -...",0,3704801516,153,10,False,True
1,1409,859924,07:26:00,11214,300,25351,2021-10-21,"POLYGON ((147058.435 -442840.043, 147038.878 -...",0,3704801516,153,7,True,False
2,1409,859814,14:35:00,11214,300,25351,2021-10-21,"POLYGON ((147058.435 -442840.043, 147038.878 -...",0,3704801516,153,14,False,True


In [49]:
segment_am_max = (max_stop
                  >> count(_.hqta_segment_id, _.departure_hour)
                  >> filter(_.departure_hour < 12)
                  >> group_by(_.hqta_segment_id)
                  >> summarize(am_max_trips = _.n.max())
                 )
segment_am_max.head(2)

Unnamed: 0,hqta_segment_id,am_max_trips
0,853322176,9
1,1172019542,14


In [50]:
segment_pm_max = (max_stop
                  >> count(_.hqta_segment_id, _.departure_hour)
                  >> filter(_.departure_hour >= 12)
                  >> group_by(_.hqta_segment_id)
                  >> summarize(pm_max_trips = _.n.max())
                 )
segment_pm_max.head(2)

Unnamed: 0,hqta_segment_id,pm_max_trips
0,853322176,8
1,1172019542,14


In [55]:
df = (segment_am_max >> inner_join(_, segment_pm_max, on = 'hqta_segment_id'))
df['hq_transit_corr'] = df.apply(lambda x: x.am_max_trips > 4 and x.pm_max_trips > 4, axis = 1)
df.head(3)

Unnamed: 0,hqta_segment_id,am_max_trips,pm_max_trips,hq_transit_corr
0,853322176,9,8,True
1,1172019542,14,14,True
2,2882926714,9,8,True


In [35]:
debug_dict['25351_segment_peak'].head(3)

Unnamed: 0_level_0,am_peak,pm_peak,hq_transit_corr
hqta_segment_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
853322176,7.7,30.7,True
1172019542,12.0,55.3,True
2882926714,7.7,29.7,True


### Mapping

In [12]:
def map_hqta(gdf, mouseover=None):
    if 'calitp_extracted_at' in gdf.columns:
        gdf = gdf.drop(columns='calitp_extracted_at')
        gdf.geometry = gdf.geometry.buffer(50)
    
    x = gdf.to_crs('EPSG:4326').geometry.iloc[0].centroid.x
    y = gdf.to_crs('EPSG:4326').geometry.iloc[0].centroid.y
    
    m = Map(basemap=basemaps.CartoDB.Positron, center=[y, x], zoom=11)

    if mouseover:
        html = HTML(f'hover to see {mouseover}')
        html.layout.margin = '0px 20px 20px 20px'
        control = WidgetControl(widget=html, position='topright')
        m.add_control(control)

        def update_html(feature,  **kwargs):
            html.value = '''
                <h3><b>{}</b></h3>
            '''.format(feature['properties'][mouseover])
    if 'hq_transit_corr' in gdf.columns:
        geo_data_hq = GeoData(geo_dataframe = gdf[gdf['hq_transit_corr']].to_crs('EPSG:4326'),
                               style={'color': 'black', 'fillColor': '#08589e',
                                            'opacity':0.4, 'weight':.5, 'dashArray':'2', 'fillOpacity':0.3},
                               hover_style={'fillColor': 'red' , 'fillOpacity': 0.2},
                               name = 'HQTA')
        #a8ddb5
        geo_data_not_hq = GeoData(geo_dataframe = gdf[~gdf['hq_transit_corr']].to_crs('EPSG:4326'),
                               style={'color': 'black', 'fillColor': '#fec44f',
                                            'opacity':0.2, 'weight':.5, 'dashArray':'2', 'fillOpacity':0.3},
                               hover_style={'fillColor': 'red' , 'fillOpacity': 0.2},
                               name = 'non-HQTA')

        m.add_layer(geo_data_hq)
        m.add_layer(geo_data_not_hq)
    
    else:
    
        geo_data_hq = GeoData(geo_dataframe = gdf.to_crs('EPSG:4326'),
                               style={'color': 'black', 'fillColor': '#08589e',
                                            'opacity':0.4, 'weight':.5, 'dashArray':'2', 'fillOpacity':0.3},
                               hover_style={'fillColor': 'red' , 'fillOpacity': 0.2},
                               name = 'gdf')
        m.add_layer(geo_data_hq)
    
    if mouseover:
        geo_data_hq.on_hover(update_html)

    m.add_control(LayersControl())

    return m

## Multiple Operators

In [13]:
itp_ids = tbl.gtfs_schedule.agency() >> distinct(_.calitp_itp_id) >> collect()
itp_ids = itp_ids.calitp_itp_id

In [14]:
def multiple_operator_hqta(existing=pd.DataFrame()):
    
    hqta = pd.DataFrame()    
    for _, itp_id in itp_ids.items():
        if not existing.empty:
            if str(itp_id) in existing['calitp_itp_id'].to_list():
                print(f'{itp_id} existing')
                # break
                continue
        if int(itp_id) == 200:
            continue ## skip MTC feed to use individual operator feeds
        try:
            print(f'attempting for operator {itp_id}')
            hqta = hqta.append(single_operator_hqta(get_operator_views(itp_id)))
        except:
            print(f'failed for operator {itp_id}')
        
    return hqta

In [17]:
# all_operators = multiple_operator_hqta()
# all_operators.to_parquet('./all_operators.parquet')

In [50]:
all_operators = gpd.read_parquet('./data/bus/all_operators.parquet')

In [51]:
itp_ids = itp_ids.astype(str)

In [52]:
df = tbl.gtfs_schedule.agency() >> collect() >> distinct(_.calitp_itp_id, _keep_all = True)

In [53]:
not_ran = df[~df['calitp_itp_id'].isin(all_operators['calitp_itp_id'].astype('int64'))]

In [54]:
# with pd.option_context('display.max_rows', 100):
#     display(not_ran)

## Spatially Aggregate non-HQTAs

In [55]:
non_hqta = all_operators >> filter(-_.hq_transit_corr)

In [56]:
non_hqta_overlaid = non_hqta.overlay(non_hqta, how='intersection')

In [57]:
non_hqta_deduplicated = non_hqta_overlaid[non_hqta_overlaid['hqta_segment_id_1'] != non_hqta_overlaid['hqta_segment_id_2']]

In [58]:
def combine_segment_ids(row):
    both_segments = [row.hqta_segment_id_1, row.hqta_segment_id_2]
    both_segments.sort()
    return str(both_segments)

In [59]:
non_hqta_deduplicated['overlap_id'] = non_hqta_deduplicated.apply(combine_segment_ids, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)


In [60]:
more_deduplicated = non_hqta_deduplicated.drop_duplicates(subset=['overlap_id'])

In [61]:
more_deduplicated.to_parquet('./data/bus/overlaps.parquet')


This metadata specification does not yet make stability promises.  We do not yet recommend using this in a production setting unless you are able to rewrite your Parquet/Feather files.

  more_deduplicated.to_parquet('./data/bus/overlaps.parquet')


In [32]:
##TODO rewrite to new definition... (or just drop?)

def sum_overlaps(row):
    row['am_peak'] = row.am_peak_1 + row.am_peak_2
    row['pm_peak'] = row.pm_peak_1 + row.pm_peak_2
    row['n_trips'] = row.n_trips_1 + row.n_trips_2
    if row.am_peak >= 4 and row.pm_peak >= 4:
        row['hq_transit_corr'] = True
    else:
        row['hq_transit_corr'] = False
    return row

In [63]:
overlaps_summed = more_deduplicated.apply(sum_overlaps, axis = 1) 

In [64]:
new_hq = overlaps_summed[overlaps_summed['hq_transit_corr']]

In [65]:
new_hq = new_hq[new_hq['am_peak'] != new_hq['am_peak_1']*2] ## drop likely duplicates; can rework above to do this better

In [66]:
new_hq = new_hq.drop(columns=['calitp_itp_id_2',
               'hqta_segment_id_2', 'index_2', 'n_trips_2', 'segment_sequence_2',
               'shape_id_2', 'stop_id_2', 'am_peak_2', 'pm_peak_2',
               'hq_transit_corr_2', 'index_1', 'index_2', 'n_trips_2', 'n_trips_1',
                'am_peak_1', 'pm_peak_1', 'hq_transit_corr_1'])

In [67]:
new_hq = new_hq.rename(columns = {'calitp_itp_id_1':'calitp_itp_id', 'hqta_segment_id_1':'hqta_segment_id',
       'segment_sequence_1':'segment_sequence', 'shape_id_1':'shape_id', 'stop_id_1':'stop_id'})

In [68]:
all_operators = all_operators.append(new_hq)

In [69]:
short_dropped = all_operators[all_operators['geometry'].area > 50*400] ##50m width * 400m segment min

In [70]:
short_dropped = short_dropped.reset_index().drop(columns=['index'])

In [71]:
short_dropped.to_parquet('./data/bus/all_operators_cleaned_appended.parquet')


This metadata specification does not yet make stability promises.  We do not yet recommend using this in a production setting unless you are able to rewrite your Parquet/Feather files.

  short_dropped.to_parquet('./data/bus/all_operators_cleaned_appended.parquet')


In [72]:
cleaned_all = gpd.read_parquet('./data/bus/all_operators_cleaned_appended.parquet')

In [73]:
cleaned_all

Unnamed: 0,level_0,calitp_itp_id,geometry,hqta_segment_id,n_trips,segment_sequence,shape_id,stop_id,am_peak,pm_peak,hq_transit_corr,overlap_id
0,0,208,"POLYGON ((-160759.302 -149987.381, -160752.166...",2965703173,183.0,0,26030,1133,8.7,11.0,False,
1,1,208,"POLYGON ((-159824.204 -149976.259, -159820.991...",3351394963,65.0,1,26030,1229,3.0,4.0,False,
2,2,208,"POLYGON ((-158026.195 -149577.875, -158026.289...",701258687,123.0,3,26030,1635,3.3,5.0,False,
3,0,208,"POLYGON ((-158181.307 -148896.366, -158163.502...",2849913668,72.0,0,26031,1629,3.3,5.0,False,
4,1,208,"POLYGON ((-157787.115 -149177.055, -157782.416...",3738766290,72.0,1,26031,1630,3.0,4.0,False,
...,...,...,...,...,...,...,...,...,...,...,...,...
21292,90486,273,"POLYGON ((-130771.403 63232.410, -130657.005 6...",356699929,64.0,4,39715,456,7.4,9.0,True,"[356699929, 1808774315]"
21293,90972,308,"POLYGON ((117998.860 -413255.153, 117994.454 -...",2315245041,146.0,10,p_898189,2387409,8.0,8.6,True,"[1949746130, 2315245041]"
21294,91009,308,"POLYGON ((111669.065 -415372.364, 111669.342 -...",1129572458,100.0,8,p_898188,2387299,5.3,5.0,True,"[1129572458, 3421010252]"
21295,91409,257,"POLYGON ((-216483.814 -21828.804, -216478.140 ...",1645218673,464.0,1,13738,1681822,4.7,5.3,True,"[353844199, 1645218673]"


In [81]:
dissolved = cleaned_all.dissolve(by=['hq_transit_corr', 'shape_id']).reset_index()

In [84]:
dissolved = dissolved[dissolved['geometry'].area > 50*3000] ##50m width * 3000m shape min

In [86]:
dissolved.to_parquet('./data/bus/shape_hqta_dissolve.parquet')


This metadata specification does not yet make stability promises.  We do not yet recommend using this in a production setting unless you are able to rewrite your Parquet/Feather files.

  dissolved.to_parquet('./data/bus/shape_hqta_dissolve.parquet')


In [85]:
map_hqta(dissolved)

Map(center=[33.596015707487744, -117.87571863402819], controls=(ZoomControl(options=['position', 'zoom_in_text…

### Bay Area Map Image

![map](img/bay.png)

### Los Angeles Map Image

![map](img/la.png)

## Result

* Ran and aggregated for nearly all bus operators statewide
    * Segments not containing stops will not appear as HQTA-- may need to interpolate (e.g., freeway segments)
    * Some questionable short segments
* Algorithm may be overestimating for SFMTA

### Data Issues

* 61 County Connection has a lot of na departure times?
    * strange results after dropna
* 48 B-Line similarly choppy
* 116 Fresno Area Express has whitespace in departure times
    * ValueError: time data ' 7:04:00' does not match format '%H:%M:%S'

