In [1]:
import os
os.environ["CALITP_BQ_MAX_BYTES"] = str(900_000_000_000) ## 900 GB (9e11 bytes), not 800 GB; presumably the calitp BigQuery scan limit -- TODO confirm

In [2]:
import calitp
from calitp.tables import tbl
from siuba import *

import pandas as pd
import numpy as np
import geopandas as gpd
import fiona

import shapely
from shapely.geometry import LineString, MultiPoint
from shapely.ops import split, substring

import zlib
import datetime as dt

from utilities import *
import shared_utils



In [3]:
import sys

In [4]:
sys.path.append('../rt_delay/')  ## add the sibling rt_delay folder to the import path; the following `import utils as rt_utils` presumably resolves to a utils.py there

In [5]:
import utils as rt_utils

### High Quality Transit Areas Relevant Statutes

[PRC 21155](https://leginfo.legislature.ca.gov/faces/codes_displaySection.xhtml?sectionNum=21155.&lawCode=PRC)
* Major transit stop definition: _A major transit stop is as defined in Section 21064.3, except that, for purposes of this section, it also includes major transit stops that are included in the applicable regional transportation plan_
* High-quality transit corridor definition: _For purposes of this section, a high-quality transit corridor means a corridor with fixed route bus service with service intervals no longer than 15 minutes during peak commute hours._
    * Unable to locate definition of "peak commute hours"

[PRC 21064.3](https://leginfo.legislature.ca.gov/faces/codes_displaySection.xhtml?sectionNum=21064.3.&lawCode=PRC)
* _Major transit stop means a site containing any of the following:
(a) An existing rail or bus rapid transit station.
(b) A ferry terminal served by either a bus or rail transit service.
(c) The intersection of two or more major bus routes with a frequency of service interval of 15 minutes or less during the morning and afternoon peak commute periods._
    * "Intersection" may not be sufficiently well-defined for this analysis

[PRC 21060.2](https://leginfo.legislature.ca.gov/faces/codes_displaySection.xhtml?lawCode=PRC&sectionNum=21060.2.&highlight=true&keyword=bus%20rapid%20transit)
* _(a) “Bus rapid transit” means a public mass transit service provided by a public agency or by a public-private partnership that includes all of the following features:
(1) Full-time dedicated bus lanes or operation in a separate right-of-way dedicated for public transportation with a frequency of service interval of 15 minutes or less during the morning and afternoon peak commute periods.
(2) Transit signal priority.
(3) All-door boarding.
(4) Fare collection system that promotes efficiency.
(5) Defined stations._
    * It is unlikely that GTFS alone can determine whether a service qualifies as BRT under this definition

In [6]:
def single_shape_hqta(
    routelines, trips, stop_times, stops, route_count_by_stop, shape_id, hqta
):
    """Split a single GTFS shape into segments and flag each segment as HQTA or not.

    Portions of the shape that overlap segments already present in ``hqta`` are
    dropped first, since their peak frequency and HQTA status were already
    calculated for a previous shape.

    Reads the module-level ``debug_ids`` and, for shapes listed there, stores
    intermediate frames in the module-level ``debug_dict``.

    Parameters
    ----------
    routelines : GeoDataFrame
        Shape geometries; filtered here on its ``shape_id`` column.
    trips
        Not used by this function; kept so existing callers keep working.
    stop_times : DataFrame
        Must provide ``stop_id``, ``trip_id`` and ``departure_time`` columns.
    stops
        Passed through to ``find_stop_with_high_trip_count``.
    route_count_by_stop
        Only recorded in ``debug_dict`` for debug shapes.
    shape_id
        The shape to process.
    hqta : GeoDataFrame
        Segments calculated so far for this operator (may be empty).

    Returns
    -------
    DataFrame-like or None
        One row per segment, including ``hqta_segment_id`` and
        ``hq_transit_corr``. ``None`` when the shape has no geometry, is
        fully covered by ``hqta``, has no ``calitp_itp_id`` column, has no
        stops, or has no usable departure times.
    """

    single_line = routelines >> filter(_.shape_id == shape_id)
    if single_line.size == 0 or single_line.geometry.isna().all():
        print(f"no geometry for shape {shape_id}")
        return

    is_debug_shape = shape_id in debug_ids
    if is_debug_shape:
        print(f"***debug shape*** {shape_id}")
        debug_dict[f"{shape_id}_single_line"] = single_line
        debug_dict[f"{shape_id}_hqta"] = hqta

    calculated_stops = []
    ## TODO any way to make more efficient? a pre-calculated shape overlap? something else?
    if hqta.size != 0:
        already_calculated = hqta.dissolve(
            by="calitp_itp_id"
        )  ## get single polygon of HQTA calculation complete area
        single_line = single_line.overlay(
            already_calculated, how="difference"
        )  ## drop calculation complete area from current shape
        if single_line.size == 0:
            print(f"already calculated corridor for shape {shape_id}")
            return

    ## one copy of the shape's attribute row(s) per segment geometry, concatenated once
    ## (the leading empty GeoDataFrame keeps the result defined when there are no segments)
    shape_attributes = single_line.drop(columns=["geometry"])
    segment_frames = [gpd.GeoDataFrame()]
    for segment in create_segments(single_line.geometry):
        segment_frame = shape_attributes.copy()
        segment_frame["geometry"] = segment
        segment_frames.append(segment_frame)
    segmented = pd.concat(segment_frames)

    ## keep the old index as an `index` column; it is dropped again at the end
    segmented = segmented.reset_index()
    if is_debug_shape:
        debug_dict[f"{shape_id}_a_segmented"] = segmented

    segmented["segment_sequence"] = segmented.index.astype(str)
    ## casting calitp_itp_id to string on the frame is bad for exports...
    ## instead, cast when generating segment id below...
    if "calitp_itp_id" not in segmented.columns:
        print(f"segmented shape has no itp_id {shape_id}")
        return

    ## compute (hopefully unique) hash of segment id that can be used across routes/operators
    segmented["hqta_segment_id"] = segmented.apply(
        lambda x: zlib.crc32(
            (str(x.calitp_itp_id) + x.shape_id + x.segment_sequence).encode("utf-8")
        ),
        axis=1,
    )

    segmented.geometry = segmented.buffer(
        50
    )  ##generous buffer for street/sidewalk width? Required to spatially find stops within each segment

    if is_debug_shape:
        debug_dict[f"{shape_id}_segmented"] = segmented
        debug_dict[f"{shape_id}_stops"] = stops
        debug_dict[f"{shape_id}_route_ct_by_stop"] = route_count_by_stop

    segments_with_max_stop = segmented.apply(
        find_stop_with_high_trip_count,
        axis=1,
        args=(stops, stop_times, 1, calculated_stops),
    )

    if "stop_id" not in segments_with_max_stop.columns:
        print(f"no stops for shape {shape_id}")
        return  ## no stops within segment

    max_stop_times = (
        stop_times
        >> select(_.stop_id, _.trip_id, _.departure_time)
        >> inner_join(_, segments_with_max_stop, on="stop_id")
    )  ## filter stop_times to the key stops in each segment
    max_stop_times = max_stop_times.dropna(subset=["departure_time"])
    max_stop_times = max_stop_times.drop_duplicates(
        subset=["trip_id", "hqta_segment_id"]
    )  ## filter duplicates for top2 approach

    if is_debug_shape:
        debug_dict[f"{shape_id}_max_stop0"] = max_stop_times

    ## nothing left to parse or count: bail out before the time handling
    if max_stop_times.size == 0:
        print(f"no commute hour trips for shape {shape_id}")
        return

    max_stop_times["departure_time"] = max_stop_times["departure_time"].apply(
        fix_arrival_time
    )  ## reformat GTFS time to a format datetime can ingest
    max_stop_times["departure_dt"] = max_stop_times["departure_time"].apply(
        lambda x: dt.datetime.strptime(x, "%H:%M:%S")
    )
    max_stop_times["departure_hour"] = max_stop_times["departure_dt"].apply(
        lambda x: x.hour
    )

    if is_debug_shape:
        debug_dict[f"{shape_id}_max_stop"] = max_stop_times

    ## new flexible peak: busiest clock hour before noon and busiest clock hour from noon on
    hourly_trips = max_stop_times >> count(_.hqta_segment_id, _.departure_hour)

    segment_am_max = (
        hourly_trips
        >> filter(_.departure_hour < 12)
        >> group_by(_.hqta_segment_id)
        >> summarize(am_max_trips=_.n.max())
    )

    segment_pm_max = (
        hourly_trips
        >> filter(_.departure_hour >= 12)
        >> group_by(_.hqta_segment_id)
        >> summarize(pm_max_trips=_.n.max())
    )
    try:
        segment_peak_service = segment_am_max >> inner_join(
            _, segment_pm_max, on="hqta_segment_id"
        )
        ## NOTE(review): `> 4` requires 5+ departures in the busiest hour, while a
        ## 15-minute service interval is 4 departures per hour -- confirm whether
        ## `>= 4` is the intended threshold.
        segment_peak_service["hq_transit_corr"] = segment_peak_service.apply(
            lambda x: x.am_max_trips > 4 and x.pm_max_trips > 4, axis=1
        )
    except Exception:
        ## append when all segments only have am or pm trips, not an hqta by definition
        ## (Exception rather than a bare except so Ctrl-C still interrupts the run)
        segment_peak_service = pd.concat((segment_am_max, segment_pm_max))
        segment_peak_service["hq_transit_corr"] = False

    segment_peak_service = segment_peak_service.reset_index(drop=True)

    single_hqta = segments_with_max_stop >> inner_join(
        _, segment_peak_service, on="hqta_segment_id"
    )
    single_hqta = single_hqta >> select(-_.calitp_extracted_at, -_.index, -_.n_trips)

    if is_debug_shape:
        debug_dict[f"{shape_id}_single_hqta"] = single_hqta

    return single_hqta.reset_index(drop=True)

In [7]:
def single_operator_hqta(itp_id, analysis_date):
    """Calculate HQTA segments for every route-type-3 (bus) shape of one operator.

    Shapes are processed from most to fewest trips. The segments accumulated
    so far are handed to ``single_shape_hqta`` for each new shape so that
    already-calculated areas are not calculated twice.

    Side effect: resets the module-level ``debug_dict`` to an empty dict.

    Parameters
    ----------
    itp_id
        Operator id passed to the ``rt_utils`` getters.
    analysis_date
        Service date passed to the ``rt_utils`` getters.

    Returns
    -------
    GeoDataFrame
        Concatenation of the per-shape results; empty when no shape produced
        a result.
    """

    global debug_dict
    debug_dict = {}

    # shapes, trips, stop_times, stops = views
    routelines = rt_utils.get_routelines(itp_id, analysis_date)
    ## force clear to ensure route type data present
    trips = rt_utils.get_trips(itp_id, analysis_date, force_clear=True, route_types = ['3'])
    stop_times = rt_utils.get_stop_times(itp_id, analysis_date)
    stops = rt_utils.get_stops(itp_id, analysis_date)

    distinct_routes = (
        trips
        >> distinct(_.route_id, _.shape_id, _.direction_id, _keep_all=True)
        >> select(_.calitp_itp_id, _.route_id, _.shape_id, _.direction_id, _.trip_id)
    )

    route_count_by_stop = (
        stop_times
        >> select(_.stop_id, _.trip_id)
        >> inner_join(_, distinct_routes, on="trip_id")
        >> count(_.stop_id)
        >> rename(n_routes=_.n)
        >> arrange(-_.n_routes)
    )

    hqta = gpd.GeoDataFrame()
    ## start with shapes including the highest number of trips
    trips_shape_sorted = (
        trips.groupby("shape_id")
        .count()
        .sort_values(by="trip_id", ascending=False)
        .index
    )
    trips_shape_sorted = pd.Series(trips_shape_sorted)
    total_shapes = len(trips_shape_sorted)
    print(f"there are {total_shapes} shapes total")
    for ix, shape_id in trips_shape_sorted.items():
        print(f"calculating for shape_id {shape_id}")
        if ix % 25 == 0:
            print(
                f"progress: {ix}/{total_shapes} shapes ({round(((ix/total_shapes)*100), 2)}%)"
            )
        ## deliberately not wrapped in try/except: a failing shape should surface its error
        result = single_shape_hqta(
            routelines, trips, stop_times, stops, route_count_by_stop, shape_id, hqta
        )
        if result is None:
            continue  ## shape was skipped; hqta is unchanged
        hqta = pd.concat((hqta, result))
        try:
            hqta = hqta.set_crs(shared_utils.geography_utils.CA_NAD83Albers)
        except Exception:
            ## best effort: presumably fails while hqta has no geometry column yet.
            ## Exception (not a bare except) so Ctrl-C still stops the loop.
            pass
    return hqta
    # return hqta.drop(columns=['n', 'departure_hour']).reset_index(drop=True)

In [8]:
debug_ids = ['30']  ## shape_ids for which single_shape_hqta prints a debug line and stores intermediate frames in debug_dict

In [9]:
##TODO replace map with shared utils folium, useful mouseover

In [10]:
# map_hqta(hqta)

## Multiple Operators

In [11]:
## clear cached HQTA runs
# fs.rm(f'{GCS_FILE_PATH}bus_corridors/*')

In [12]:
import gcsfs
fs = gcsfs.GCSFileSystem()  ## Google Cloud Storage filesystem handle, used to list the cached bus_corridors exports

In [13]:
# fs = get_fs()
fs_list = fs.ls(f'{GCS_FILE_PATH}bus_corridors/')  ## paths of everything currently stored under bus_corridors/ (GCS_FILE_PATH is not defined in this notebook; presumably from `utilities`)

In [14]:
## operator ids that already have a cached export: the numeric prefix of each
## file name under bus_corridors/ (e.g. `101_bus.parquet` -> 101)
ran_operators = []
for cached_path in fs_list:
    cached_name = cached_path.split('bus_corridors/')[1]
    id_prefix = cached_name.split('_')[0]
    if cached_name and id_prefix.isnumeric():
        ran_operators.append(int(id_prefix))

In [15]:
# ran_operators += [194] ##skip marin, inf loop??

In [16]:
## distinct operator ids from the schedule agency table, as a plain Python list
itp_ids = (
    tbl.gtfs_schedule.agency()
    >> distinct(_.calitp_itp_id)
    >> collect()
).calitp_itp_id.to_list()

In [30]:
## view failed feeds besides MTC (200), which we don't run on purpose
if 200 not in ran_operators:
    ran_operators.append(200)  ## guarded so re-running this cell does not append 200 again

## agencies with no cached export, excluding Amtrak feeds.
## `~` is the boolean negation operator; na=False treats a missing agency_url as "not amtrak".
tbl.gtfs_schedule.agency() >> collect() >> filter(
    ~_.calitp_itp_id.isin(ran_operators),
    ~_.agency_url.str.contains('amtrak', na=False),
)

Unnamed: 0,calitp_itp_id,calitp_url_number,agency_id,agency_name,agency_url,agency_timezone,agency_lang,agency_phone,agency_fare_url,agency_email,calitp_extracted_at,calitp_hash,agency_key
64,338,1,TD,Tideline Water Taxi,http://bit.ly/tideline,America/Los_Angeles,en,415-339-0196,http://bit.ly/tideline,,2021-12-29,z/O7gkQGUH3q34eyE2jyVg==,6075967454087118308
65,338,0,513,Tideline Water Taxi,http://bit.ly/tideline,America/Los_Angeles,en,415-339-0196,http://bit.ly/tideline,,2021-04-15,Wlklb++d/AX+vr4xfToZCA==,8812373581699044399
73,327,0,Sunline,Sunline Transit Agency,http://www.sunline.org,America/Los_Angeles,en,(760) 343-3456,,,2021-04-16,TsVqaNmfHBGpAVDGMLsd+g==,-3492305784899410305
74,10,1,CE,Altamont Corridor Express,http://www.acerail.com/,America/Los_Angeles,en,8004117245,https://acerail.com/schedules/,customerservice@acerail.com,2021-12-29,4g1avZrlO74vWQ2uhjcQvA==,-5812548697820985556
75,10,0,CE,Altamont Corridor Express,http://www.acerail.com/,America/Los_Angeles,en,8004117245,https://acerail.com/schedules/,customerservice@acerail.com,2021-04-15,PWvJd8wS43GiMldVBY2wew==,6810323365661741356
77,246,1,CT,Caltrain,http://www.caltrain.com,America/Los_Angeles,en,800-660-4287,,,2021-12-29,4ajWXnbBcR2t3afgG9rZ7g==,-7989191801549264275
78,246,0,1000,Caltrain,http://www.caltrain.com,America/Los_Angeles,en,800-660-4287,,,2021-04-15,yUW4NE2sajmDw6Y8dGFDcg==,5770210708396956390
81,206,0,582,Montebello Bus Lines,http://www.ridembl.com/,America/Los_Angeles,en,323-558-1625,http://www.ridembl.com/Fares,,2021-05-13,s6DvHvDdJrn2bJcItvkkWw==,-5971964472456342283
85,81,0,1661,South County Transit Link,http://www.sctlink.com/,America/Los_Angeles,en,(209) 745-3052,,,2021-04-15,HjaybX8lRZUcWUWDfvEJ0w==,-7937234628455881187
126,33,0,1708,Bear Transit,http://pt.berkeley.edu/home,America/Los_Angeles,en,(510) 643-5708,,,2021-04-15,WUuP/NTX5n7WT7KiFbSXfg==,6063612100406188695


In [17]:
analysis_date = dt.date(2022, 5, 4) ## Wed, May 4, 2022 -- service date passed to the HQTA functions

In [18]:
def multiple_operator_hqta(itp_ids, analysis_date):
    """Run the single-operator HQTA calculation for several operators.

    Operators listed in the module-level ``ran_operators`` and the MTC
    regional feed (200) are skipped. Each non-empty result is exported to
    ``{GCS_FILE_PATH}bus_corridors/`` as ``{itp_id}_bus`` before being added
    to the combined output. A failure for one operator is reported and does
    not stop the remaining operators.

    Parameters
    ----------
    itp_ids : iterable
        Operator ids to attempt.
    analysis_date
        Service date passed to ``single_operator_hqta``.

    Returns
    -------
    DataFrame
        Concatenation of every successfully exported operator result; an
        empty ``pd.DataFrame`` when there is none.
    """

    operator_results = []

    for itp_id in itp_ids:
        if itp_id in ran_operators:
            print(f'already ran {itp_id}')
            continue
        if int(itp_id) == 200:
            continue ## skip MTC feed to use individual operator feeds
        try:
            print(f'attempting for operator {itp_id}')
            operator = single_operator_hqta(itp_id, analysis_date)
            if not operator.empty:
                ##TODO add date to exports...
                shared_utils.utils.geoparquet_gcs_export(operator, f'{GCS_FILE_PATH}bus_corridors/', f'{itp_id}_bus')
                operator_results.append(operator)
            else:
                print(f'no hqta for operator {itp_id}')
        except Exception as err:
            ## Exception (not a bare except) so Ctrl-C can still stop a long run;
            ## report the reason so failed feeds can be diagnosed afterwards
            print(f'failed for operator {itp_id}')
            print(f'    {type(err).__name__}: {err}')

    if not operator_results:
        return pd.DataFrame()
    ## concatenate once instead of growing a frame inside the loop
    return pd.concat(operator_results)

In [31]:
montebello = multiple_operator_hqta([206], analysis_date)  ## re-run a single operator (206), presumably to investigate why it has no cached export

attempting for operator 206



This metadata specification does not yet make stability promises.  We do not yet recommend using this in a production setting unless you are able to rewrite your Parquet/Feather files.



getting trips...
filtering to GTFS route types ['3']
found parquet
cached parquet empty, will try a fresh query
found parquet
cached parquet empty, will try a fresh query
there are 0 shapes total
no hqta for operator 206



This metadata specification does not yet make stability promises.  We do not yet recommend using this in a production setting unless you are able to rewrite your Parquet/Feather files.



In [34]:
rt_utils.get_trips(206, analysis_date, force_clear=True)  ## same trips query as single_operator_hqta but without the route_types=['3'] filter

getting trips...


Unnamed: 0,calitp_itp_id,calitp_url_number,service_date,trip_key,trip_id,direction_id,shape_id,calitp_extracted_at,calitp_deleted_at,route_id,route_short_name,route_long_name,route_desc,route_type


In [20]:
# hqta = multiple_operator_hqta(itp_ids, analysis_date)

In [21]:
## grab+concat all per-operator files...
fs = gcsfs.GCSFileSystem()
operator_frames = []
for operator_file in fs_list:
    file_name = operator_file.split('/')[-1]
    ## only read `{itp_id}_...parquet` exports: an aggregate such as all_bus.parquet
    ## written into the same folder would otherwise be read back in and double-count
    ## every operator
    if file_name.endswith('.parquet') and file_name.split('_')[0].isnumeric():
        print(operator_file)
        operator_frames.append(gpd.read_parquet('gs://' + operator_file))

## concatenate once instead of growing a frame inside the loop
all_operators = pd.concat(operator_frames) if operator_frames else gpd.GeoDataFrame()

calitp-analytics-data/data-analyses/high_quality_transit_areas/bus_corridors/101_bus.parquet
calitp-analytics-data/data-analyses/high_quality_transit_areas/bus_corridors/102_bus.parquet
calitp-analytics-data/data-analyses/high_quality_transit_areas/bus_corridors/103_bus.parquet
calitp-analytics-data/data-analyses/high_quality_transit_areas/bus_corridors/106_bus.parquet
calitp-analytics-data/data-analyses/high_quality_transit_areas/bus_corridors/108_bus.parquet
calitp-analytics-data/data-analyses/high_quality_transit_areas/bus_corridors/110_bus.parquet
calitp-analytics-data/data-analyses/high_quality_transit_areas/bus_corridors/112_bus.parquet
calitp-analytics-data/data-analyses/high_quality_transit_areas/bus_corridors/116_bus.parquet
calitp-analytics-data/data-analyses/high_quality_transit_areas/bus_corridors/118_bus.parquet
calitp-analytics-data/data-analyses/high_quality_transit_areas/bus_corridors/11_bus.parquet
calitp-analytics-data/data-analyses/high_quality_transit_areas/bus_corr

In [23]:
# ##TODO also rm before write...
# shared_utils.utils.geoparquet_gcs_export(all_operators, f'{GCS_FILE_PATH}bus_corridors/', f'all_bus')


This metadata specification does not yet make stability promises.  We do not yet recommend using this in a production setting unless you are able to rewrite your Parquet/Feather files.



## Data Cleaning

In [59]:
## keep only segment polygons with area above 50*400 = 20,000 (CRS units squared; presumably m^2)
## NOTE(review): single_shape_hqta buffers each segment by 50 on both sides, so polygons are
## roughly 100 wide -- this cutoff then corresponds to about 200 of length, not 400; confirm intent
short_dropped = all_operators[all_operators['geometry'].area > 50*400] ##50m width * 400m segment min

In [60]:
short_dropped = short_dropped.reset_index().drop(columns=['index'])  ## renumber rows after the filter; the old index comes back as an 'index' column (assuming an unnamed index) and is dropped

In [61]:
dissolved = short_dropped.dissolve(by=['hq_transit_corr', 'shape_id']).reset_index()  ## merge segment polygons into one geometry per (hq_transit_corr, shape_id) pair; reset_index turns the group keys back into columns

In [62]:
## drop dissolved shapes with area of 50*3000 = 150,000 or less (CRS units squared; presumably m^2)
## NOTE(review): segments were buffered by 50 on both sides (~100 wide), so this cutoff
## corresponds to roughly 1500 of length rather than 3000 -- confirm intent
dissolved = dissolved[dissolved['geometry'].area > 50*3000] ##50m width * 3000m shape min
## TODO smarter dropping, parse multilinestring (ex. Samtrans coastside)
## a pre-screening step? frequency by stop, then exclude shapes...

In [63]:
# dissolved = gpd.read_parquet(f'{GCS_FILE_PATH}shape_hqta_dissolve.parquet') ##old

In [64]:
# map_hqta(dissolved)

In [65]:
# shared_utils.utils.geoparquet_gcs_export(dissolved, f'{GCS_FILE_PATH}intermediate/', f'shape_dissolve')


This metadata specification does not yet make stability promises.  We do not yet recommend using this in a production setting unless you are able to rewrite your Parquet/Feather files.



In [2]:
dissolved = gpd.read_parquet(f'{GCS_FILE_PATH}intermediate/shape_dissolve.parquet')  ## reload the dissolved corridors from GCS; presumably the file written by the (now commented-out) shape_dissolve export

### Bay Area Map Image

![map](img/bay.png)

### Los Angeles Map Image

![map](img/la.png)

## Result

* Ran and aggregated for nearly all bus operators statewide
    * Segments that contain no stops will not appear as HQTA; these may need to be interpolated (e.g., freeway segments)
    * Some questionable short segments
* Algorithm may be overestimating for SFMTA

### Data Issues

* 61 County Connection appears to have many null (NA) departure times
    * results look strange after dropping them with `dropna`
* 48 B-Line similarly choppy
* 116 Fresno Area Express has whitespace in departure times
    * ValueError: time data ' 7:04:00' does not match format '%H:%M:%S'



In [10]:
tbl.views.gtfs_schedule_fact_daily_service() >> head(3)  ## preview the first 3 rows of the daily service fact view

Unnamed: 0,feed_key,calitp_itp_id,calitp_url_number,service_date,service_id,ttl_service_hours,n_trips,n_routes,first_departure_ts,last_arrival_ts,service_window
0,-1246851374109274803,13,0,2022-02-08,2814231,0.0,1,1,44100,44100,0.0
1,-1246851374109274803,13,0,2022-01-13,2814231,0.0,1,1,44100,44100,0.0
2,-1246851374109274803,13,0,2022-02-16,2814231,0.0,1,1,44100,44100,0.0
