In [1]:
import calitp
from calitp.tables import tbl
from siuba import *

import pandas as pd
import numpy as np
import geopandas as gpd
import fiona

from ipyleaflet import (
    Map,
    GeoJSON,
    projections,
    basemaps,
    GeoData,
    LayersControl,
    WidgetControl,
    GeoJSON,
)
from ipywidgets import Text, HTML

import shapely
from shapely.geometry import LineString, MultiPoint
from shapely.ops import split, substring

import zlib
import datetime as dt

from utilities import *
import shared_utils

E0318 17:44:15.644293409     934 fork_posix.cc:70]           Fork support is only compatible with the epoll1 and poll polling strategies
E0318 17:44:16.035101601     934 fork_posix.cc:70]           Fork support is only compatible with the epoll1 and poll polling strategies


### High Quality Transit Areas Relevant Statutes

[PRC 21155](https://leginfo.legislature.ca.gov/faces/codes_displaySection.xhtml?sectionNum=21155.&lawCode=PRC)
* Major transit stop definition: _A major transit stop is as defined in Section 21064.3, except that, for purposes of this section, it also includes major transit stops that are included in the applicable regional transportation plan_
* High-quality transit corridor definition: _For purposes of this section, a high-quality transit corridor means a corridor with fixed route bus service with service intervals no longer than 15 minutes during peak commute hours._
    * Unable to locate definition of "peak commute hours"

[PRC 21064.3](https://leginfo.legislature.ca.gov/faces/codes_displaySection.xhtml?sectionNum=21064.3.&lawCode=PRC)
* _Major transit stop means a site containing any of the following:
(a) An existing rail or bus rapid transit station.
(b) A ferry terminal served by either a bus or rail transit service.
(c) The intersection of two or more major bus routes with a frequency of service interval of 15 minutes or less during the morning and afternoon peak commute periods._
    * "Intersection" may not be sufficiently well-defined for this analysis

[PRC 21060.2](https://leginfo.legislature.ca.gov/faces/codes_displaySection.xhtml?lawCode=PRC&sectionNum=21060.2.&highlight=true&keyword=bus%20rapid%20transit)
* _(a) “Bus rapid transit” means a public mass transit service provided by a public agency or by a public-private partnership that includes all of the following features:
(1) Full-time dedicated bus lanes or operation in a separate right-of-way dedicated for public transportation with a frequency of service interval of 15 minutes or less during the morning and afternoon peak commute periods.
(2) Transit signal priority.
(3) All-door boarding.
(4) Fare collection system that promotes efficiency.
(5) Defined stations._
    * Unlikely to determine if a service qualifies as BRT under this definition using GTFS alone

In [2]:
# ##definition of "peak" pending
# am_peak = range(6, 9)
# pm_peak = range(16, 19)

In [3]:
# am_commute_hours = list(am_peak)
# pm_commute_hours = list(pm_peak)
# commute_hours = am_commute_hours + pm_commute_hours

In [4]:
def create_segments(geometry):
    """Splits a Shapely LineString into smaller LineStrings. If a MultiLineString passed,
    splits each LineString in that collection.
    """

    lines = []
    segment_distance_meters = 1250
    geometry = geometry.iloc[0]
    if hasattr(geometry, "geoms"):  ##check if MultiLineString
        linestrings = geometry.geoms
    else:
        linestrings = [geometry]
    for linestring in linestrings:
        for i in range(0, int(linestring.length), segment_distance_meters):
            lines.append(substring(linestring, i, i + segment_distance_meters))
    return lines

In [5]:
def find_stop_with_high_trip_count(segment, stops, stop_times, rank, calculated_stops):
    """Given a shape segment, finds the stop serving the most (or other rank) trips within that segment.
    Adds that stop's stop_id to segment data (a row).
    """

    stops_in_seg = gpd.clip(stops, segment.geometry)
    if stops_in_seg.size == 0:
        return segment

    stop_times_in_seg = stops_in_seg >> inner_join(_, stop_times, on="stop_id")
    trip_count_by_stop = (
        stop_times_in_seg >> count(_.stop_id) >> arrange(-_.n) >> rename(n_trips=_.n)
    )
    try:
        stop_id = trip_count_by_stop["stop_id"].iloc[rank - 1]

        if stop_id in list(calculated_stops):
            return segment
        segment["stop_id"] = stop_id
        segment["n_trips"] = trip_count_by_stop["n_trips"].iloc[rank - 1]
        return segment
    except IndexError:
        return segment

In [6]:
# itp_id = 170

# most_recent_wed = (tbl.views.dim_date()
#  >> filter(_.full_date < dt.datetime.now().date())
#  >> filter(_.day_name == 'Wednesday')
#  >> filter(_.full_date == _.full_date.max())
#  >> select(_.service_date == _.full_date)
# )
# wednesday = (tbl.views.gtfs_schedule_fact_daily_service() 
#          >> filter(_.calitp_itp_id == int(itp_id))
#          >> inner_join(_, most_recent_wed, on = 'service_date')
#          >> collect()
#             )
# wednesday = wednesday >> select(_.calitp_itp_id, _.calitp_url_number, _.service_id)

In [7]:
# bus_routes = (tbl.gtfs_schedule.routes()
#             >> filter(_.calitp_itp_id == int(itp_id))
#             >> filter((_.route_type == '3') | (_.route_type == '11'))
#             >> select(_.calitp_itp_id, _.calitp_url_number, _.route_id) 
#             >> collect()
#          )

In [8]:
# trips = (tbl.gtfs_schedule.trips()
#          >> filter(_.calitp_itp_id == int(itp_id))
#          >> collect()
#          >> inner_join(_, bus_routes, on = ['calitp_itp_id', 'calitp_url_number', 'route_id'])
#          >> inner_join(_, wednesday, on = ['calitp_itp_id', 'calitp_url_number', 'service_id'])
#          )

In [9]:
# stop_times = (tbl.gtfs_schedule.stop_times()
#               >> filter(_.calitp_itp_id == int(itp_id))
#               >> collect()
#              )

In [10]:
# stop_times =  (stop_times >> inner_join(_, trips, on = ['calitp_itp_id', 'calitp_url_number', 'trip_id'])
#               >> select(-_.stop_headsign, -_.pickup_type, -_.drop_off_type,
#                         -_.continuous_pickup, -_.continuous_drop_off, -_.shape_dist_travelled,
#                        -_.timepoint)
#               )

In [11]:
# stops = (tbl.gtfs_schedule.stops() 
#          >> filter(_.calitp_itp_id == itp_id)
#          >> select(_.stop_id, _.stop_lat, _.stop_lon)
#          >> collect())
# stops = gpd.GeoDataFrame(stops,
#                  geometry = gpd.points_from_xy(stops.stop_lon, stops.stop_lat),
#                  crs = 'EPSG:4326').to_crs(shared_utils.geography_utils.CA_NAD83Albers)

In [12]:
def get_operator_views(itp_id):
    """Returns relevant views from the data warehouse for a single transit operator."""
    shapes = (
        tbl.gtfs_schedule.shapes()
        >> filter(_.calitp_itp_id == int(itp_id))
        >> collect()
    )
    shapes = gpd.GeoDataFrame(
        shapes,
        geometry=gpd.points_from_xy(shapes.shape_pt_lon, shapes.shape_pt_lat),
        crs="EPSG:4326",
    ).to_crs(shared_utils.geography_utils.CA_NAD83Albers)
    most_recent_wed = (
        tbl.views.dim_date()
        >> filter(_.full_date < dt.datetime.now().date())
        >> filter(_.day_name == "Wednesday")
        >> filter(_.full_date == _.full_date.max())
        >> select(_.service_date == _.full_date)
    )
    wednesday = (
        tbl.views.gtfs_schedule_fact_daily_service()
        >> filter(_.calitp_itp_id == int(itp_id))
        >> inner_join(_, most_recent_wed, on="service_date")
        >> collect()
    )
    wednesday = wednesday >> select(_.calitp_itp_id, _.calitp_url_number, _.service_id)

    bus_routes = (
        tbl.gtfs_schedule.routes()
        >> filter(_.calitp_itp_id == int(itp_id))
        >> filter((_.route_type == "3") | (_.route_type == "11"))
        >> select(_.calitp_itp_id, _.calitp_url_number, _.route_id)
        >> collect()
    )
    print("loaded bus routes")

    trips = (
        tbl.gtfs_schedule.trips()
        >> filter(_.calitp_itp_id == int(itp_id))
        >> collect()
        >> inner_join(
            _, bus_routes, on=["calitp_itp_id", "calitp_url_number", "route_id"]
        )
        >> inner_join(
            _, wednesday, on=["calitp_itp_id", "calitp_url_number", "service_id"]
        )
    )
    print("loaded trips")
    stop_times = (
        tbl.gtfs_schedule.stop_times()
        >> filter(_.calitp_itp_id == int(itp_id))
        >> collect()
    )
    stop_times = (
        stop_times
        >> inner_join(_, trips, on=["calitp_itp_id", "calitp_url_number", "trip_id"])
        >> select(
            -_.stop_headsign,
            -_.pickup_type,
            -_.drop_off_type,
            -_.continuous_pickup,
            -_.continuous_drop_off,
            -_.shape_dist_travelled,
            -_.timepoint,
        )
    )
    print("loaded stop times")

    stops = (
        tbl.gtfs_schedule.stops()
        >> filter(_.calitp_itp_id == itp_id)
        >> select(_.stop_id, _.stop_lat, _.stop_lon)
        >> collect()
    )
    stops = gpd.GeoDataFrame(
        stops,
        geometry=gpd.points_from_xy(stops.stop_lon, stops.stop_lat),
        crs="EPSG:4326",
    ).to_crs(shared_utils.geography_utils.CA_NAD83Albers)
    print("loaded stops")

    return shapes, trips, stop_times, stops

In [13]:
def fix_arrival_time(gtfs_timestring):
    """Reformats a GTFS timestamp (which allows the hour to exceed 24 to mark service day continuity)
    to standard 24-hour time.
    """
    split = gtfs_timestring.split(":")
    hour = int(split[0])
    if hour >= 24:
        split[0] = str(hour - 24)
        corrected = (":").join(split)
        return corrected.strip()
    else:
        return gtfs_timestring.strip()

In [113]:
def single_shape_hqta(
    shapes, trips, stop_times, stops, route_count_by_stop, shape_id, hqta
):
    """Starting with a single GTFS shape, split that shape into segments and determine if each segment qualifies
    as an HQTA. Existing segments within a shape are dropped for that shape, since their peak frequency and
    HQTA status would have already been calculated for a previous shape.
    """

    single_shape = (
        shapes
        >> filter(_.shape_id == shape_id)
        >> mutate(shape_pt_sequence=_.shape_pt_sequence.astype("int64"))
        >> arrange(
            _.shape_pt_sequence
        )  ##arrange, then convert to line to preserve order...
    )

    route_line = LineString(
        list(single_shape["geometry"])
    )  ##TODO replace with shared util? (upstream in shapes...)
    single_line = single_shape[
        ["calitp_itp_id", "shape_id", "calitp_extracted_at"]
    ].iloc[
        [0]
    ]  ##preserve info cols
    single_line["geometry"] = route_line
    single_line = gpd.GeoDataFrame(
        single_line, crs=shared_utils.geography_utils.CA_NAD83Albers
    )

    if shape_id in debug_ids:
        print(f"***debug shape*** {shape_id}")
        debug_dict[f"{shape_id}_single_line"] = single_line
        debug_dict[f"{shape_id}_hqta"] = hqta

    calculated_stops = []
    if hqta.size != 0:
        already_calculated = hqta.dissolve(
            by="calitp_itp_id"
        )  ## get single polygon of HQTA calculation complete area
        single_line = single_line.overlay(
            already_calculated, how="difference"
        )  ## drop calculation complete area from current shape
        if single_line.size == 0:
            segments_with_max_stop = None
            print(f"no line for shape {shape_id}")
            return

    segmented = pd.DataFrame()
    for segment in create_segments(single_line.geometry):
        to_append = single_line.drop(columns=["geometry"])
        to_append["geometry"] = segment
        # segmented = segmented.append(to_append)
        segmented = pd.concat((segmented, to_append))

    segmented = segmented.reset_index()
    segmented["segment_sequence"] = segmented.index.astype(str)
    segmented = segmented.astype({"calitp_itp_id": str})

    ## compute (hopefully unique) hash of segment id that can be used across routes/operators
    segmented["hqta_segment_id"] = segmented.apply(
        lambda x: zlib.crc32(
            (x.calitp_itp_id + x.shape_id + x.segment_sequence).encode("utf-8")
        ),
        axis=1,
    )

    segmented.geometry = segmented.buffer(
        50
    )  ##generous buffer for street/sidewalk width? Required to spatially find stops within each segment

    if shape_id in debug_ids:
        debug_dict[f"{shape_id}_segmented"] = segmented
        debug_dict[f"{shape_id}_stops"] = stops
        debug_dict[f"{shape_id}_route_ct_by_stop"] = route_count_by_stop

    segments_with_max_stop = segmented.apply(
        find_stop_with_high_trip_count,
        axis=1,
        args=(stops, stop_times, 1, calculated_stops),
    )

    if not "stop_id" in segments_with_max_stop.columns:
        segments_with_top2_stops = None
        print(f"no stops for shape {shape_id}")
        return  ## no stops within segment

    max_stop_times = (
        stop_times
        >> select(_.stop_id, _.trip_id, _.departure_time)
        >> inner_join(_, segments_with_max_stop, on="stop_id")
    )  ## filter stop_times to the key stops in each segment
    max_stop_times = max_stop_times.dropna(subset=["departure_time"])
    max_stop_times = max_stop_times.drop_duplicates(
        subset=["trip_id", "hqta_segment_id"]
    )  ## filter duplicates for top2 approach

    if shape_id in debug_ids:
        debug_dict[f"{shape_id}_max_stop0"] = max_stop_times

    max_stop_times["departure_time"] = max_stop_times["departure_time"].apply(
        fix_arrival_time
    )  ## reformat GTFS time to a format datetime can ingest
    max_stop_times["departure_dt"] = max_stop_times["departure_time"].apply(
        lambda x: dt.datetime.strptime(x, "%H:%M:%S")
    )
    max_stop_times["departure_hour"] = max_stop_times["departure_dt"].apply(
        lambda x: x.hour
    )

    # max_stop_times = max_stop_times >> filter(_.departure_hour.isin(commute_hours))

    if max_stop_times.size == 0:
        print(f"no commute hour trips for shape {shape_id}")
        return

    if shape_id in debug_ids:
        debug_dict[f"{shape_id}_max_stop"] = max_stop_times

    ## new flexible peak
    segment_am_max = (
        max_stop_times
        >> count(_.hqta_segment_id, _.departure_hour)
        >> filter(_.departure_hour < 12)
        >> group_by(_.hqta_segment_id)
        >> summarize(am_max_trips=_.n.max())
    )

    segment_pm_max = (
        max_stop_times
        >> count(_.hqta_segment_id, _.departure_hour)
        >> filter(_.departure_hour >= 12)
        >> group_by(_.hqta_segment_id)
        >> summarize(pm_max_trips=_.n.max())
    )
    try:
        segment_peak_service = segment_am_max >> inner_join(
            _, segment_pm_max, on="hqta_segment_id"
        )
        segment_peak_service["hq_transit_corr"] = segment_peak_service.apply(
            lambda x: x.am_max_trips > 4 and x.pm_max_trips > 4, axis=1
        )
    except:  ## append when all segments only have am or pm trips, not an hqta by definition
        segment_peak_service = segment_am_max.append(segment_pm_max)
        segment_peak_service["hq_transit_corr"] = False
    ##

    segment_peak_service = segment_peak_service.reset_index(drop=True)
    segment_peak_service["last_seg_hqta"] = segment_peak_service[
        "hq_transit_corr"
    ].shift(1)
    segment_peak_service["next_seg_hqta"] = segment_peak_service[
        "hq_transit_corr"
    ].shift(-1)

    ## this actually seems to drop segments that should be HQTAs -- commented out unless a better approach found
    ## consider segment not HQTA if both the prior and next segements are not HQTAs
    # debug_dict[f"{shape_id}_sps"] = segments_with_max_stop >> inner_join(
    #     _, segment_peak_service, on="hqta_segment_id"
    # )
    ## this seems not so good
    # segment_peak_service["hq_transit_corr"] = segment_peak_service.apply(
    #     lambda x: False
    #     if x.hq_transit_corr == False
    #     else x.last_seg_hqta or x.next_seg_hqta,
    #     axis=1,
    # )
    # segment_peak_service["hq_transit_corr"] = segment_peak_service[
    #     "hq_transit_corr"
    # ].fillna(True)

    single_hqta = segments_with_max_stop >> inner_join(
        _, segment_peak_service, on="hqta_segment_id"
    )
    single_hqta.drop(
        columns=[
            "calitp_extracted_at",
            "next_seg_hqta",
            "last_seg_hqta",
            "index",
            "n_trips",
        ],
        inplace=True,
    )

    if shape_id in debug_ids:
        debug_dict[f"{shape_id}_single_hqta"] = single_hqta

    return single_hqta.reset_index(drop=True)

In [114]:
def single_operator_hqta(views):

    global debug_dict
    debug_dict = {}

    shapes, trips, stop_times, stops = views

    distinct_routes = (
        trips
        >> distinct(_.route_id, _.shape_id, _.direction_id, _keep_all=True)
        >> select(_.calitp_itp_id, _.route_id, _.shape_id, _.direction_id, _.trip_id)
    )

    route_count_by_stop = (
        stop_times
        >> select(_.stop_id, _.trip_id)
        >> inner_join(_, distinct_routes, on="trip_id")
        >> count(_.stop_id)
        >> rename(n_routes=_.n)
        >> arrange(-_.n_routes)
    )

    hqta = gpd.GeoDataFrame()
    ## start with shapes including the highest number of trips
    shapes_sorted = (
        trips.groupby("shape_id")
        .count()
        .sort_values(by="trip_id", ascending=False)
        .index
    )
    shapes_sorted = pd.Series(shapes_sorted)
    total_shapes = len(shapes_sorted)
    print(f"there are {total_shapes} shapes total")
    for ix, shape_id in shapes_sorted.items():
        print(f"calculating for shape_id {shape_id}")
        if ix % 25 == 0:
            print(
                f"progress: {ix}/{total_shapes} shapes ({round(((ix/total_shapes)*100), 2)}%)"
            )
        try:
            result = single_shape_hqta(
                shapes, trips, stop_times, stops, route_count_by_stop, shape_id, hqta
            )
            # hqta = hqta.append(result)
            hqta = pd.concat((hqta, result))
        except:
            print(f"unable to calculate HQTA for shape_id {shape_id}")
        try:
            hqta = hqta.set_crs(shared_utils.geography_utils.CA_NAD83Albers)
        except:
            continue
    return hqta
    # return hqta.drop(columns=['n', 'departure_hour']).reset_index(drop=True)

In [83]:
debug_ids = ['940143', '1730123']

In [84]:
lbt_views = get_operator_views(170)

loaded bus routes
loaded trips
loaded stop times
loaded stops


In [85]:
lbt_hqta = single_operator_hqta(lbt_views)

there are 97 shapes total
calculating for shape_id 510074
progress: 0/97 shapes (0.0%)
calculating for shape_id 610105
calculating for shape_id 510075
calculating for shape_id 610106
calculating for shape_id 1210143
calculating for shape_id 1210150
calculating for shape_id 460064
calculating for shape_id 460067
no line for shape 460067
calculating for shape_id 1920184
calculating for shape_id 1920183
calculating for shape_id 1910328
calculating for shape_id 1910326
calculating for shape_id 1730123
***debug shape*** 1730123
calculating for shape_id 1730122
calculating for shape_id 220076
calculating for shape_id 220080
calculating for shape_id 1710178
calculating for shape_id 1710177
calculating for shape_id 1720090
calculating for shape_id 1720091
no line for shape 1720091
calculating for shape_id 10074
calculating for shape_id 80004
calculating for shape_id 410006
calculating for shape_id 40008
calculating for shape_id 40007
no line for shape 40007
calculating for shape_id 20014
progr

In [63]:
##TODO replace map with shared utils folium, useful mouseover

In [74]:
# map_hqta(lbt_hqta)

#### Debugging with LBT shape

* fixed for now by not dropping segments between non-hqtas

In [116]:
# debug_dict['1730123_single_line']

In [117]:
# map_hqta(lbt_hqta, 'hqta_segment_id')

In [118]:
# map_hqta(lbt_hqta >> filter(_.shape_id == '1730123'), 'stop_id')

In [119]:
## investigate PCH not being an hqta...

In [120]:
# ## still shows issue...
# lbt_hqta >> filter(_.shape_id == '1730123', _.am_max_trips >= 4, _.hq_transit_corr == False)

In [121]:
# sps = debug_dict['1730123_sps']
# sps >> filter(_.shape_id == '1730123', _.am_max_trips >= 4, _.hqta_segment_id == 3092845517)
# ## correct here but last/next incorrect

## Multiple Operators

In [122]:
itp_ids = tbl.gtfs_schedule.agency() >> distinct(_.calitp_itp_id) >> collect()
itp_ids = itp_ids.calitp_itp_id

In [123]:
def multiple_operator_hqta(existing=pd.DataFrame()):
    
    hqta = pd.DataFrame()    
    for _, itp_id in itp_ids.items():
        if not existing.empty:
            if str(itp_id) in existing['calitp_itp_id'].to_list():
                print(f'{itp_id} existing')
                # break
                continue
        if int(itp_id) == 200:
            continue ## skip MTC feed to use individual operator feeds
        try:
            print(f'attempting for operator {itp_id}')
            hqta = hqta.append(single_operator_hqta(get_operator_views(itp_id)))
        except:
            print(f'failed for operator {itp_id}')
        
    return hqta

In [None]:
all_operators = multiple_operator_hqta()
geoparquet_gcs_export(all_operators, 'all_operators')

attempting for operator 238
loaded bus routes
loaded trips
loaded stop times
loaded stops
there are 14 shapes total
calculating for shape_id p_900502
progress: 0/14 shapes (0.0%)
calculating for shape_id p_1304040
calculating for shape_id p_1304041
calculating for shape_id p_900477
calculating for shape_id p_902151
no commute hour trips for shape p_902151
calculating for shape_id p_900476
no commute hour trips for shape p_900476
calculating for shape_id p_900479
calculating for shape_id p_900480




calculating for shape_id p_111004
calculating for shape_id p_111005




calculating for shape_id p_900478
no commute hour trips for shape p_900478
calculating for shape_id p_902156
no commute hour trips for shape p_902156
calculating for shape_id p_902157
no commute hour trips for shape p_902157
calculating for shape_id p_902158
no commute hour trips for shape p_902158
attempting for operator 282




loaded bus routes
loaded trips
loaded stop times
loaded stops
there are 181 shapes total
calculating for shape_id 195251
progress: 0/181 shapes (0.0%)
calculating for shape_id 195244
calculating for shape_id 195210
calculating for shape_id 195215
calculating for shape_id 195112
calculating for shape_id 195116
calculating for shape_id 195355
calculating for shape_id 195354
calculating for shape_id 195260
calculating for shape_id 195169
calculating for shape_id 195174
calculating for shape_id 195115
no stops for shape 195115
calculating for shape_id 195553
calculating for shape_id 195114
no line for shape 195114
calculating for shape_id 195284
calculating for shape_id 195291
calculating for shape_id 195227
calculating for shape_id 195552
calculating for shape_id 195228
calculating for shape_id 195397
calculating for shape_id 195266
calculating for shape_id 195399
calculating for shape_id 195377
calculating for shape_id 195223
calculating for shape_id 195221
no line for shape 195221
calcu



calculating for shape_id 195322
calculating for shape_id 195324
calculating for shape_id 195342
calculating for shape_id 195418
calculating for shape_id 195419
no stops for shape 195419
calculating for shape_id 195416
calculating for shape_id 195417
calculating for shape_id 195348
calculating for shape_id 195224
no stops for shape 195224
calculating for shape_id 195340
calculating for shape_id 195347
no line for shape 195347
calculating for shape_id 195437
calculating for shape_id 195208
calculating for shape_id 195317
calculating for shape_id 195441
calculating for shape_id 195440
progress: 75/181 shapes (41.44%)
calculating for shape_id 195314
calculating for shape_id 195442
calculating for shape_id 195205
calculating for shape_id 195409
calculating for shape_id 195401
calculating for shape_id 195256
calculating for shape_id 195402
calculating for shape_id 195405
calculating for shape_id 195259
calculating for shape_id 195337
calculating for shape_id 195335
calculating for shape_id 1



calculating for shape_id 195117
no line for shape 195117
calculating for shape_id 195456
no stops for shape 195456
calculating for shape_id 195341
no line for shape 195341
calculating for shape_id 195613




calculating for shape_id 195232
no line for shape 195232
calculating for shape_id 195245
no line for shape 195245
calculating for shape_id 195457
no line for shape 195457
calculating for shape_id 195412
no stops for shape 195412
calculating for shape_id 195265
calculating for shape_id 195253
no line for shape 195253
calculating for shape_id 195134
no line for shape 195134
calculating for shape_id 195387
no line for shape 195387
calculating for shape_id 195376
no stops for shape 195376
calculating for shape_id 195371
no line for shape 195371
calculating for shape_id 195393
no line for shape 195393
calculating for shape_id 195381
no line for shape 195381
calculating for shape_id 195414
no line for shape 195414
calculating for shape_id 195113
no line for shape 195113
calculating for shape_id 195325
no line for shape 195325
calculating for shape_id 195230
no line for shape 195230
calculating for shape_id 195263
no line for shape 195263
calculating for shape_id 195323
no line for shape 1953



calculating for shape_id 195307
no stops for shape 195307
calculating for shape_id 195532
no stops for shape 195532
calculating for shape_id 195332
no line for shape 195332
calculating for shape_id 195633
calculating for shape_id 195615
calculating for shape_id 195268
no line for shape 195268
calculating for shape_id 195330
no line for shape 195330
calculating for shape_id 195153
no line for shape 195153
calculating for shape_id 195262
no line for shape 195262
calculating for shape_id 195380
no line for shape 195380
calculating for shape_id 195463
calculating for shape_id 195296
no line for shape 195296
calculating for shape_id 195458
no line for shape 195458
calculating for shape_id 195369
no line for shape 195369
calculating for shape_id 195299
no line for shape 195299
calculating for shape_id 195261
progress: 175/181 shapes (96.69%)
no line for shape 195261
calculating for shape_id 195257
no line for shape 195257
calculating for shape_id 195218
no line for shape 195218
calculating f



attempting for operator 135
loaded bus routes
loaded trips
loaded stop times
loaded stops
there are 43 shapes total
calculating for shape_id p_529
progress: 0/43 shapes (0.0%)
calculating for shape_id p_178073
calculating for shape_id p_899996
calculating for shape_id p_531
calculating for shape_id p_530
calculating for shape_id p_178062
calculating for shape_id p_914849
calculating for shape_id p_914848
no stops for shape p_914848
calculating for shape_id p_548
calculating for shape_id p_547
calculating for shape_id p_530998
calculating for shape_id p_530997
calculating for shape_id p_745109
calculating for shape_id p_551
calculating for shape_id p_550
no commute hour trips for shape p_550
calculating for shape_id p_1277369
calculating for shape_id p_467
calculating for shape_id p_179043
calculating for shape_id p_745102
calculating for shape_id p_338
calculating for shape_id p_438




calculating for shape_id p_333
no commute hour trips for shape p_333
calculating for shape_id p_178050
no stops for shape p_178050
calculating for shape_id p_745110
no commute hour trips for shape p_745110
calculating for shape_id p_179044
calculating for shape_id p_745108
progress: 25/43 shapes (58.14%)
no commute hour trips for shape p_745108
calculating for shape_id p_745101
no stops for shape p_745101
calculating for shape_id p_745100
no stops for shape p_745100
calculating for shape_id p_745099
no stops for shape p_745099
calculating for shape_id p_179049
no commute hour trips for shape p_179049
calculating for shape_id p_418
no stops for shape p_418
calculating for shape_id p_335
no commute hour trips for shape p_335
calculating for shape_id p_340
no commute hour trips for shape p_340
calculating for shape_id p_356
no commute hour trips for shape p_356
calculating for shape_id p_363
no commute hour trips for shape p_363
calculating for shape_id p_414
no commute hour trips for sha



loaded bus routes
loaded trips
loaded stop times
loaded stops
there are 43 shapes total
calculating for shape_id p_529
progress: 0/43 shapes (0.0%)
calculating for shape_id p_178073
calculating for shape_id p_899996
calculating for shape_id p_531
calculating for shape_id p_530
calculating for shape_id p_178062
calculating for shape_id p_914849
calculating for shape_id p_914848
no stops for shape p_914848
calculating for shape_id p_548
calculating for shape_id p_547
calculating for shape_id p_530998
calculating for shape_id p_530997
calculating for shape_id p_745109
calculating for shape_id p_551
calculating for shape_id p_550
no commute hour trips for shape p_550
calculating for shape_id p_1277369
calculating for shape_id p_467
calculating for shape_id p_179043
calculating for shape_id p_745102
calculating for shape_id p_338
calculating for shape_id p_438




calculating for shape_id p_333
no commute hour trips for shape p_333
calculating for shape_id p_178050
no stops for shape p_178050
calculating for shape_id p_745110
no commute hour trips for shape p_745110
calculating for shape_id p_179044
calculating for shape_id p_745108
progress: 25/43 shapes (58.14%)
no commute hour trips for shape p_745108
calculating for shape_id p_745101
no stops for shape p_745101
calculating for shape_id p_745100
no stops for shape p_745100
calculating for shape_id p_745099
no stops for shape p_745099
calculating for shape_id p_179049
no commute hour trips for shape p_179049
calculating for shape_id p_418
no stops for shape p_418
calculating for shape_id p_335
no commute hour trips for shape p_335
calculating for shape_id p_340
no commute hour trips for shape p_340
calculating for shape_id p_356
no commute hour trips for shape p_356
calculating for shape_id p_363
no commute hour trips for shape p_363
calculating for shape_id p_414
no commute hour trips for sha



loaded bus routes
loaded trips
loaded stop times
loaded stops
there are 43 shapes total
calculating for shape_id p_529
progress: 0/43 shapes (0.0%)
calculating for shape_id p_178073
calculating for shape_id p_899996
calculating for shape_id p_531
calculating for shape_id p_530
calculating for shape_id p_178062
calculating for shape_id p_914849
calculating for shape_id p_914848
no stops for shape p_914848
calculating for shape_id p_548
calculating for shape_id p_547
calculating for shape_id p_530998
calculating for shape_id p_530997
calculating for shape_id p_745109
calculating for shape_id p_551
calculating for shape_id p_550
no commute hour trips for shape p_550
calculating for shape_id p_1277369
calculating for shape_id p_467
calculating for shape_id p_179043
calculating for shape_id p_745102
calculating for shape_id p_338
calculating for shape_id p_438




calculating for shape_id p_333
no commute hour trips for shape p_333
calculating for shape_id p_178050
no stops for shape p_178050
calculating for shape_id p_745110
no commute hour trips for shape p_745110
calculating for shape_id p_179044
calculating for shape_id p_745108
progress: 25/43 shapes (58.14%)
no commute hour trips for shape p_745108
calculating for shape_id p_745101
no stops for shape p_745101
calculating for shape_id p_745100
no stops for shape p_745100
calculating for shape_id p_745099
no stops for shape p_745099
calculating for shape_id p_179049
no commute hour trips for shape p_179049
calculating for shape_id p_418
no stops for shape p_418
calculating for shape_id p_335
no commute hour trips for shape p_335
calculating for shape_id p_340
no commute hour trips for shape p_340
calculating for shape_id p_356
no commute hour trips for shape p_356
calculating for shape_id p_363
no commute hour trips for shape p_363
calculating for shape_id p_414
no commute hour trips for sha



loaded bus routes
loaded trips
loaded stop times
loaded stops
there are 43 shapes total
calculating for shape_id p_529
progress: 0/43 shapes (0.0%)
calculating for shape_id p_178073
calculating for shape_id p_899996
calculating for shape_id p_531
calculating for shape_id p_530
calculating for shape_id p_178062
calculating for shape_id p_914849
calculating for shape_id p_914848
no stops for shape p_914848
calculating for shape_id p_548
calculating for shape_id p_547
calculating for shape_id p_530998
calculating for shape_id p_530997
calculating for shape_id p_745109
calculating for shape_id p_551
calculating for shape_id p_550
no commute hour trips for shape p_550
calculating for shape_id p_1277369
calculating for shape_id p_467
calculating for shape_id p_179043
calculating for shape_id p_745102
calculating for shape_id p_338
calculating for shape_id p_438




calculating for shape_id p_333
no commute hour trips for shape p_333
calculating for shape_id p_178050
no stops for shape p_178050
calculating for shape_id p_745110
no commute hour trips for shape p_745110
calculating for shape_id p_179044
calculating for shape_id p_745108
progress: 25/43 shapes (58.14%)
no commute hour trips for shape p_745108
calculating for shape_id p_745101
no stops for shape p_745101
calculating for shape_id p_745100
no stops for shape p_745100
calculating for shape_id p_745099
no stops for shape p_745099
calculating for shape_id p_179049
no commute hour trips for shape p_179049
calculating for shape_id p_418
no stops for shape p_418
calculating for shape_id p_335
no commute hour trips for shape p_335
calculating for shape_id p_340
no commute hour trips for shape p_340
calculating for shape_id p_356
no commute hour trips for shape p_356
calculating for shape_id p_363
no commute hour trips for shape p_363
calculating for shape_id p_414
no commute hour trips for sha



loaded bus routes
loaded trips
loaded stop times
loaded stops
there are 81 shapes total
calculating for shape_id 20089
progress: 0/81 shapes (0.0%)
calculating for shape_id 20088
calculating for shape_id 41077
calculating for shape_id 41092
calculating for shape_id B090
calculating for shape_id A094
calculating for shape_id A089
calculating for shape_id B082
no line for shape B082
calculating for shape_id 18086
calculating for shape_id 18087
calculating for shape_id 2086
calculating for shape_id 1104
calculating for shape_id 1105
calculating for shape_id 41086
no line for shape 41086
calculating for shape_id 24192
calculating for shape_id 2077
calculating for shape_id 23295
calculating for shape_id 25041
calculating for shape_id 25040
no stops for shape 25040
calculating for shape_id 49029
calculating for shape_id 24193
no stops for shape 24193
calculating for shape_id 47013
calculating for shape_id 47012
no stops for shape 47012
calculating for shape_id 41096
calculating for shape_id 



calculating for shape_id 84036
calculating for shape_id 75095
calculating for shape_id 84034
calculating for shape_id 49031
no stops for shape 49031
calculating for shape_id 75094
calculating for shape_id 75091
calculating for shape_id 92020
no stops for shape 92020
calculating for shape_id 92018
no stops for shape 92018
calculating for shape_id 94053
no stops for shape 94053
calculating for shape_id 23298
no stops for shape 23298
calculating for shape_id 92016
no line for shape 92016
calculating for shape_id 91026
no stops for shape 91026
calculating for shape_id 84035
no stops for shape 84035
calculating for shape_id 23297
no stops for shape 23297
calculating for shape_id 49035
no stops for shape 49035
calculating for shape_id 2088
no stops for shape 2088
calculating for shape_id 2076
progress: 75/81 shapes (92.59%)
no stops for shape 2076
calculating for shape_id 24198
no stops for shape 24198
calculating for shape_id 24200
no stops for shape 24200
calculating for shape_id 23293
no 



loaded bus routes
loaded trips
loaded stop times
loaded stops
there are 281 shapes total
calculating for shape_id 201_9_29
progress: 0/281 shapes (0.0%)
calculating for shape_id 7_0_270
calculating for shape_id 202_8_33
no line for shape 202_8_33
calculating for shape_id 7_1_269
calculating for shape_id 215_2_16
calculating for shape_id 215_3_15
calculating for shape_id 2_0_81
calculating for shape_id 906_0_33
calculating for shape_id 2_1_82
no stops for shape 2_1_82
calculating for shape_id 10_2_68
calculating for shape_id 955_0_88
calculating for shape_id 30_0_330
calculating for shape_id 3_0_90
calculating for shape_id 235_0_54
calculating for shape_id 13_0_188
calculating for shape_id 12_2_13
calculating for shape_id 907_1_37
calculating for shape_id 934_8_53
calculating for shape_id 933_9_61
no line for shape 933_9_61
calculating for shape_id 3_1_93
calculating for shape_id 41_0_100
calculating for shape_id 30_1_326
calculating for shape_id 955_1_89
no line for shape 955_1_89
calc

In [32]:
all_operators = gpd.read_parquet(f'{GCS_FILE_PATH}all_operators.parquet')

In [33]:
itp_ids = itp_ids.astype(str)

In [34]:
df = tbl.gtfs_schedule.agency() >> collect() >> distinct(_.calitp_itp_id, _keep_all = True)

In [35]:
not_ran = df[~df['calitp_itp_id'].isin(all_operators['calitp_itp_id'].astype('int64'))]

In [36]:
with pd.option_context('display.max_rows', 100):
    display(not_ran)

Unnamed: 0,calitp_itp_id,calitp_url_number,agency_id,agency_name,agency_url,agency_timezone,agency_lang,agency_phone,agency_fare_url,agency_email,calitp_extracted_at
1,200,0,SF,San Francisco Municipal Transportation Agency,https://SFMTA.com,America/Los_Angeles,en,,,,2022-03-17
7,208,0,Monterey-Salinas Transit,Monterey-Salinas Transit,http://www.mst.org,America/Los_Angeles,,1-888-MST-BUS1,,,2022-03-15
12,13,0,190,Groome Transportation,http://www.amtrak.com,America/New_York,en,,,,2022-03-04
16,279,1,BA,Bay Area Rapid Transit,https://www.bart.gov/,America/Los_Angeles,,510-464-6000,,,2022-02-14
19,76,0,936,Commute.org Shuttles,http://www.commute.org,America/Los_Angeles,en,(650) 588-1600,,,2022-03-11
...,...,...,...,...,...,...,...,...,...,...,...
178,337,0,526,Thousand Oaks Transit,https://www.toaks.org/departments/public-works...,America/Los_Angeles,en,,https://www.toaks.org/departments/public-works...,,2021-12-23
182,203,0,0,Modesto Area Express,https://www.modestoareaexpress.com,America/Los_Angeles,en,209-521-1274,http://www.modestoareaexpress.com/169/Bus-Fare...,support@availtec.com,2022-01-04
183,308,0,729,Simi Valley Transit,http://www.simivalley.org/departments/communit...,America/Los_Angeles,en,(805) 583-6456,http://www.simivalley.org/departments/communit...,,2021-12-27
184,287,0,SLOT,City of San Luis Obispo Transit,http://www.slotransit.org,America/Los_Angeles,en,(805) 541-2877,https://www.slocity.org/government/department-...,,2022-01-30


## Spatially Aggregate non-HQTAs

* currently unable with new flexible peak definition

In [37]:
# non_hqta = all_operators >> filter(-_.hq_transit_corr)

# non_hqta_overlaid = non_hqta.overlay(non_hqta, how='intersection')

# non_hqta_deduplicated = non_hqta_overlaid[non_hqta_overlaid['hqta_segment_id_1'] != non_hqta_overlaid['hqta_segment_id_2']]

In [38]:
# def combine_segment_ids(row):
#     both_segments = [row.hqta_segment_id_1, row.hqta_segment_id_2]
#     both_segments.sort()
#     return str(both_segments)

In [39]:
# non_hqta_deduplicated['overlap_id'] = non_hqta_deduplicated.apply(combine_segment_ids, axis=1)

In [40]:
# more_deduplicated = non_hqta_deduplicated.drop_duplicates(subset=['overlap_id'])

In [41]:
# more_deduplicated.to_parquet('./data/bus/overlaps.parquet')

In [42]:
# ##TODO rewrite to new definition... (or just drop?)

# def sum_overlaps(row):
#     row['am_peak'] = row.am_peak_1 + row.am_peak_2
#     row['pm_peak'] = row.pm_peak_1 + row.pm_peak_2
#     row['n_trips'] = row.n_trips_1 + row.n_trips_2
#     if row.am_peak >= 4 and row.pm_peak >= 4:
#         row['hq_transit_corr'] = True
#     else:
#         row['hq_transit_corr'] = False
#     return row

In [43]:
# overlaps_summed = more_deduplicated.apply(sum_overlaps, axis = 1) 

In [44]:
# new_hq = overlaps_summed[overlaps_summed['hq_transit_corr']]

In [45]:
# new_hq = new_hq[new_hq['am_peak'] != new_hq['am_peak_1']*2] ## drop likely duplicates; can rework above to do this better

In [46]:
# new_hq = new_hq.drop(columns=['calitp_itp_id_2',
#                'hqta_segment_id_2', 'index_2', 'n_trips_2', 'segment_sequence_2',
#                'shape_id_2', 'stop_id_2', 'am_peak_2', 'pm_peak_2',
#                'hq_transit_corr_2', 'index_1', 'index_2', 'n_trips_2', 'n_trips_1',
#                 'am_peak_1', 'pm_peak_1', 'hq_transit_corr_1'])

In [47]:
# new_hq = new_hq.rename(columns = {'calitp_itp_id_1':'calitp_itp_id', 'hqta_segment_id_1':'hqta_segment_id',
#        'segment_sequence_1':'segment_sequence', 'shape_id_1':'shape_id', 'stop_id_1':'stop_id'})

In [48]:
# all_operators = all_operators.append(new_hq)

In [49]:
short_dropped = all_operators[all_operators['geometry'].area > 50*400] ##50m width * 400m segment min

In [50]:
short_dropped = short_dropped.reset_index().drop(columns=['index'])

In [51]:
# short_dropped.to_parquet('./data/bus/all_operators_cleaned_appended.parquet')

In [52]:
dissolved = short_dropped.dissolve(by=['hq_transit_corr', 'shape_id']).reset_index()

In [53]:
dissolved = dissolved[dissolved['geometry'].area > 50*3000] ##50m width * 3000m shape min

In [54]:
dissolved = gpd.read_parquet(f'{GCS_FILE_PATH}shape_hqta_dissolve.parquet')

In [55]:
map_hqta(dissolved)

Map(center=[33.596015707487744, -117.87571863402819], controls=(ZoomControl(options=['position', 'zoom_in_text…

In [56]:
geoparquet_gcs_export(dissolved, 'shape_hqta_dissolve')


This metadata specification does not yet make stability promises.  We do not yet recommend using this in a production setting unless you are able to rewrite your Parquet/Feather files.



### Bay Area Map Image

![map](img/bay.png)

### Los Angeles Map Image

![map](img/la.png)

## Result

* Ran and aggregated for nearly all bus operators statewide
    * Segments not containing stops will not appear as HQTA-- may need to interpolate (e.g., freeway segments)
    * Some questionable short segments
* Algorithm may be overestimating for SFMTA

### Data Issues

* 61 County Connection has a lot of na departure times?
    * strange results after dropna
* 48 B-Line similarly choppy
* 116 Fresno Area Express has whitespace in departure times
    * ValueError: time data ' 7:04:00' does not match format '%H:%M:%S'

