In [1]:
import calitp
from calitp.tables import tbl
from siuba import *

import pandas as pd
import numpy as np
import geopandas as gpd
import fiona

import shapely
from shapely.geometry import LineString, MultiPoint
from shapely.ops import split, substring

import zlib
import datetime as dt

from utilities import *
import shared_utils

E0503 21:14:44.928526193    1086 fork_posix.cc:70]           Fork support is only compatible with the epoll1 and poll polling strategies
E0503 21:14:47.161801255    1086 fork_posix.cc:70]           Fork support is only compatible with the epoll1 and poll polling strategies


### High Quality Transit Areas Relevant Statutes

[PRC 21155](https://leginfo.legislature.ca.gov/faces/codes_displaySection.xhtml?sectionNum=21155.&lawCode=PRC)
* Major transit stop definition: _A major transit stop is as defined in Section 21064.3, except that, for purposes of this section, it also includes major transit stops that are included in the applicable regional transportation plan_
* High-quality transit corridor definition: _For purposes of this section, a high-quality transit corridor means a corridor with fixed route bus service with service intervals no longer than 15 minutes during peak commute hours._
    * Unable to locate definition of "peak commute hours"

[PRC 21064.3](https://leginfo.legislature.ca.gov/faces/codes_displaySection.xhtml?sectionNum=21064.3.&lawCode=PRC)
* _Major transit stop means a site containing any of the following:
(a) An existing rail or bus rapid transit station.
(b) A ferry terminal served by either a bus or rail transit service.
(c) The intersection of two or more major bus routes with a frequency of service interval of 15 minutes or less during the morning and afternoon peak commute periods._
    * "Intersection" may not be sufficiently well-defined for this analysis

[PRC 21060.2](https://leginfo.legislature.ca.gov/faces/codes_displaySection.xhtml?lawCode=PRC&sectionNum=21060.2.&highlight=true&keyword=bus%20rapid%20transit)
* _(a) “Bus rapid transit” means a public mass transit service provided by a public agency or by a public-private partnership that includes all of the following features:
(1) Full-time dedicated bus lanes or operation in a separate right-of-way dedicated for public transportation with a frequency of service interval of 15 minutes or less during the morning and afternoon peak commute periods.
(2) Transit signal priority.
(3) All-door boarding.
(4) Fare collection system that promotes efficiency.
(5) Defined stations._
    * Unlikely to determine if a service qualifies as BRT under this definition using GTFS alone

In [2]:
def single_shape_hqta(
    shapes, trips, stop_times, stops, route_count_by_stop, shape_id, hqta
):
    """Starting with a single GTFS shape, split that shape into segments and determine if each segment qualifies
    as an HQTA. Existing segments within a shape are dropped for that shape, since their peak frequency and
    HQTA status would have already been calculated for a previous shape.
    """

    ## TODO use shared utils func
    single_shape = (
        shapes
        >> filter(_.shape_id == shape_id)
        >> mutate(shape_pt_sequence=_.shape_pt_sequence.astype("int64"))
        >> arrange(
            _.shape_pt_sequence
        )  ##arrange, then convert to line to preserve order...
    )

    route_line = LineString(
        list(single_shape["geometry"])
    )  ##TODO replace with shared util? (upstream in shapes...)
    single_line = single_shape[
        ["calitp_itp_id", "shape_id", "calitp_extracted_at"]
    ].iloc[
        [0]
    ]  ##preserve info cols
    single_line["geometry"] = route_line
    single_line = gpd.GeoDataFrame(
        single_line, crs=shared_utils.geography_utils.CA_NAD83Albers
    )

    if shape_id in debug_ids:
        print(f"***debug shape*** {shape_id}")
        debug_dict[f"{shape_id}_single_line"] = single_line
        debug_dict[f"{shape_id}_hqta"] = hqta

    calculated_stops = []
    ## TODO any way to make more efficient? a pre-calculated shape overlap? something else?
    if hqta.size != 0:
        already_calculated = hqta.dissolve(
            by="calitp_itp_id"
        )  ## get single polygon of HQTA calculation complete area
        single_line = single_line.overlay(
            already_calculated, how="difference"
        )  ## drop calculation complete area from current shape
        if single_line.size == 0:
            segments_with_max_stop = None
            print(f"no line for shape {shape_id}")
            return

    segmented = gpd.GeoDataFrame() ##changed to gdf?
    for segment in create_segments(single_line.geometry):
        to_append = single_line.drop(columns=["geometry"])
        to_append["geometry"] = segment
        segmented = pd.concat((segmented, to_append))

    segmented = segmented.reset_index()
    try:
        segmented["segment_sequence"] = segmented.index.astype(str)
        segmented = segmented.astype({"calitp_itp_id": str})
    except:
        print(f"segmented shape has no itp_id {shape_id}")
        return

    ## compute (hopefully unique) hash of segment id that can be used across routes/operators
    segmented["hqta_segment_id"] = segmented.apply(
        lambda x: zlib.crc32(
            (x.calitp_itp_id + x.shape_id + x.segment_sequence).encode("utf-8")
        ),
        axis=1,
    )

    segmented.geometry = segmented.buffer(
        50
    )  ##generous buffer for street/sidewalk width? Required to spatially find stops within each segment

    if shape_id in debug_ids:
        debug_dict[f"{shape_id}_segmented"] = segmented
        debug_dict[f"{shape_id}_stops"] = stops
        debug_dict[f"{shape_id}_route_ct_by_stop"] = route_count_by_stop

    segments_with_max_stop = segmented.apply(
        find_stop_with_high_trip_count,
        axis=1,
        args=(stops, stop_times, 1, calculated_stops),
    )

    if not "stop_id" in segments_with_max_stop.columns:
        segments_with_top2_stops = None
        print(f"no stops for shape {shape_id}")
        return  ## no stops within segment

    max_stop_times = (
        stop_times
        >> select(_.stop_id, _.trip_id, _.departure_time)
        >> inner_join(_, segments_with_max_stop, on="stop_id")
    )  ## filter stop_times to the key stops in each segment
    max_stop_times = max_stop_times.dropna(subset=["departure_time"])
    max_stop_times = max_stop_times.drop_duplicates(
        subset=["trip_id", "hqta_segment_id"]
    )  ## filter duplicates for top2 approach

    if shape_id in debug_ids:
        debug_dict[f"{shape_id}_max_stop0"] = max_stop_times

    max_stop_times["departure_time"] = max_stop_times["departure_time"].apply(
        fix_arrival_time
    )  ## reformat GTFS time to a format datetime can ingest
    max_stop_times["departure_dt"] = max_stop_times["departure_time"].apply(
        lambda x: dt.datetime.strptime(x, "%H:%M:%S")
    )
    max_stop_times["departure_hour"] = max_stop_times["departure_dt"].apply(
        lambda x: x.hour
    )

    if max_stop_times.size == 0:
        print(f"no commute hour trips for shape {shape_id}")
        return

    if shape_id in debug_ids:
        debug_dict[f"{shape_id}_max_stop"] = max_stop_times

    ## new flexible peak
    segment_am_max = (
        max_stop_times
        >> count(_.hqta_segment_id, _.departure_hour)
        >> filter(_.departure_hour < 12)
        >> group_by(_.hqta_segment_id)
        >> summarize(am_max_trips=_.n.max())
    )

    segment_pm_max = (
        max_stop_times
        >> count(_.hqta_segment_id, _.departure_hour)
        >> filter(_.departure_hour >= 12)
        >> group_by(_.hqta_segment_id)
        >> summarize(pm_max_trips=_.n.max())
    )
    try:
        segment_peak_service = segment_am_max >> inner_join(
            _, segment_pm_max, on="hqta_segment_id"
        )
        segment_peak_service["hq_transit_corr"] = segment_peak_service.apply(
            lambda x: x.am_max_trips > 4 and x.pm_max_trips > 4, axis=1
        )
    except:  ## append when all segments only have am or pm trips, not an hqta by definition
        segment_peak_service = pd.concat((segment_am_max, segment_pm_max))
        segment_peak_service["hq_transit_corr"] = False

    segment_peak_service = segment_peak_service.reset_index(drop=True)

    single_hqta = segments_with_max_stop >> inner_join(
        _, segment_peak_service, on="hqta_segment_id"
    )
    single_hqta = single_hqta >> select(-_.calitp_extracted_at, -_.index, -_.n_trips)

    if shape_id in debug_ids:
        debug_dict[f"{shape_id}_single_hqta"] = single_hqta

    return single_hqta.reset_index(drop=True)

In [3]:
def single_operator_hqta(views):

    global debug_dict
    debug_dict = {}

    shapes, trips, stop_times, stops = views

    distinct_routes = (
        trips
        >> distinct(_.route_id, _.shape_id, _.direction_id, _keep_all=True)
        >> select(_.calitp_itp_id, _.route_id, _.shape_id, _.direction_id, _.trip_id)
    )

    route_count_by_stop = (
        stop_times
        >> select(_.stop_id, _.trip_id)
        >> inner_join(_, distinct_routes, on="trip_id")
        >> count(_.stop_id)
        >> rename(n_routes=_.n)
        >> arrange(-_.n_routes)
    )

    hqta = gpd.GeoDataFrame()
    ## start with shapes including the highest number of trips
    shapes_sorted = (
        trips.groupby("shape_id")
        .count()
        .sort_values(by="trip_id", ascending=False)
        .index
    )
    shapes_sorted = pd.Series(shapes_sorted)
    total_shapes = len(shapes_sorted)
    print(f"there are {total_shapes} shapes total")
    for ix, shape_id in shapes_sorted.items():
        print(f"calculating for shape_id {shape_id}")
        if ix % 25 == 0:
            print(
                f"progress: {ix}/{total_shapes} shapes ({round(((ix/total_shapes)*100), 2)}%)"
            )
        # try:
        result = single_shape_hqta(
            shapes, trips, stop_times, stops, route_count_by_stop, shape_id, hqta
        )
        hqta = pd.concat((hqta, result))
        # except:
            # print(f"unable to calculate HQTA for shape_id {shape_id}")
        try:
            hqta = hqta.set_crs(shared_utils.geography_utils.CA_NAD83Albers)
        except:
            continue
    return hqta
    # return hqta.drop(columns=['n', 'departure_hour']).reset_index(drop=True)

In [4]:
debug_ids = ['940143', '1730123']

In [5]:
# views = get_operator_views()

In [6]:
# hqta = single_operator_hqta(views)

In [7]:
# shared_utils.utils.geoparquet_gcs_export(hqta, f'{GCS_FILE_PATH}bus_corridors/', f'{182}_bus')

In [8]:
##TODO replace map with shared utils folium, useful mouseover

In [9]:
# map_hqta(hqta)

## Multiple Operators

In [10]:
itp_ids = tbl.gtfs_schedule.agency() >> distinct(_.calitp_itp_id) >> collect()
itp_ids = itp_ids.calitp_itp_id

In [11]:
fs_list = fs.ls(f'{GCS_FILE_PATH}bus_corridors/')

In [17]:
ran_operators = [int(path.split('bus_corridors/')[1].split('_')[0])
                 for path in fs_list
                 if path.split('bus_corridors/')[1] and path.split('bus_corridors/')[1].isnumeric()]

In [18]:
ran_operators += [194] ##skip marin, inf loop??

In [19]:
def multiple_operator_hqta():
    
    hqta = pd.DataFrame()    
    for itp_id in itp_ids.to_list():
        if itp_id in ran_operators:
            print(f'already ran {itp_id}')
            continue
        if int(itp_id) == 200:
            continue ## skip MTC feed to use individual operator feeds
        try:
            print(f'attempting for operator {itp_id}')
            operator = single_operator_hqta(get_operator_views(itp_id))
            if not operator.empty:
                shared_utils.utils.geoparquet_gcs_export(operator, f'{GCS_FILE_PATH}bus_corridors/', f'{itp_id}_bus')
                hqta = pd.concat([hqta, operator])
            else:
                print(f'no hqta for operator {itp_id}')
        except:
            print(f'failed for operator {itp_id}')
        
    return hqta

In [40]:
fs_list[1].split('.')

['calitp-analytics-data/data-analyses/high_quality_transit_areas/bus_corridors/101_bus',
 'parquet']

In [41]:
GCS_FILE_PATH

'gs://calitp-analytics-data/data-analyses/high_quality_transit_areas/'

In [57]:
all_operators = gpd.GeoDataFrame()
for operator_file in fs_list:
    if len(operator_file.split('.')) > 1 and operator_file.split('.')[1] == 'parquet':
        print(operator_file)
        all_operators = pd.concat((all_operators,
                                  gpd.read_parquet('gs://' + operator_file)))

calitp-analytics-data/data-analyses/high_quality_transit_areas/bus_corridors/101_bus.parquet
calitp-analytics-data/data-analyses/high_quality_transit_areas/bus_corridors/102_bus.parquet
calitp-analytics-data/data-analyses/high_quality_transit_areas/bus_corridors/103_bus.parquet
calitp-analytics-data/data-analyses/high_quality_transit_areas/bus_corridors/106_bus.parquet
calitp-analytics-data/data-analyses/high_quality_transit_areas/bus_corridors/108_bus.parquet
calitp-analytics-data/data-analyses/high_quality_transit_areas/bus_corridors/110_bus.parquet
calitp-analytics-data/data-analyses/high_quality_transit_areas/bus_corridors/112_bus.parquet
calitp-analytics-data/data-analyses/high_quality_transit_areas/bus_corridors/116_bus.parquet
calitp-analytics-data/data-analyses/high_quality_transit_areas/bus_corridors/118_bus.parquet
calitp-analytics-data/data-analyses/high_quality_transit_areas/bus_corridors/11_bus.parquet
calitp-analytics-data/data-analyses/high_quality_transit_areas/bus_corr

FileNotFoundError: calitp-analytics-data/data-analyses/high_quality_transit_areas/bus_corridors/all_bus.parquet

In [35]:
# all_operators = multiple_operator_hqta() ## oh this doesn't actually include all if some already ran

In [36]:
##TODO grab+concat all files...

In [21]:
# shared_utils.utils.geoparquet_gcs_export(all_operators, f'{GCS_FILE_PATH}bus_corridors/', f'all_bus')

## Data Cleaning

In [59]:
short_dropped = all_operators[all_operators['geometry'].area > 50*400] ##50m width * 400m segment min

In [60]:
short_dropped = short_dropped.reset_index().drop(columns=['index'])

In [61]:
dissolved = short_dropped.dissolve(by=['hq_transit_corr', 'shape_id']).reset_index()

In [62]:
dissolved = dissolved[dissolved['geometry'].area > 50*3000] ##50m width * 3000m shape min
## TODO smarter dropping, parse multilinestring (ex. Samtrans coastside)
## a pre-screening step? frequency by stop, then exclude shapes...

In [63]:
# dissolved = gpd.read_parquet(f'{GCS_FILE_PATH}shape_hqta_dissolve.parquet') ##old

In [64]:
# map_hqta(dissolved)

In [65]:
# shared_utils.utils.geoparquet_gcs_export(dissolved, f'{GCS_FILE_PATH}intermediate/', f'shape_dissolve')


This metadata specification does not yet make stability promises.  We do not yet recommend using this in a production setting unless you are able to rewrite your Parquet/Feather files.



In [2]:
dissolved = gpd.read_parquet(f'{GCS_FILE_PATH}intermediate/shape_dissolve.parquet')

### Bay Area Map Image

![map](img/bay.png)

### Los Angeles Map Image

![map](img/la.png)

## Result

* Ran and aggregated for nearly all bus operators statewide
    * Segments not containing stops will not appear as HQTA-- may need to interpolate (e.g., freeway segments)
    * Some questionable short segments
* Algorithm may be overestimating for SFMTA

### Data Issues

* 61 County Connection has a lot of na departure times?
    * strange results after dropna
* 48 B-Line similarly choppy
* 116 Fresno Area Express has whitespace in departure times
    * ValueError: time data ' 7:04:00' does not match format '%H:%M:%S'

