In [30]:
import os
# Raise the BigQuery byte cap via the environment. It is set ahead of the other
# imports in this cell -- presumably so they pick it up at import time; confirm.
os.environ["CALITP_BQ_MAX_BYTES"] = str(900_000_000_000) ## 900 GB (9e11 bytes)

import datetime as dt
import zlib

import geopandas as gpd
import pandas as pd
from siuba import *

import utilities
import A1_rail_ferry_brt as rail_ferry_brt
from shared_utils import geography_utils, rt_utils

# Path prefix for this project's files, re-exported from utilities.
GCS_FILE_PATH = utilities.GCS_FILE_PATH

# Reuse the analysis date defined in A1_rail_ferry_brt; the bare name on the
# last line displays it as the cell output.
analysis_date = rail_ferry_brt.analysis_date
analysis_date

datetime.date(2022, 6, 15)

In [2]:
# calitp-analytics-data/data-analyses/high_quality_transit_areas/bus_corridors/182_bus.parquet
df = gpd.read_parquet(f"{GCS_FILE_PATH}bus_corridors/182_bus.parquet")

In [3]:
df.head()

Unnamed: 0,calitp_itp_id,calitp_url_number,shape_id,geometry,segment_sequence,hqta_segment_id,stop_id,am_max_trips,pm_max_trips,hq_transit_corr,departure_hour,n
0,182,0,160417_FEB22,"POLYGON ((149022.152 -434892.763, 149021.875 -...",0,1698844686,16532,30.0,30.0,True,,
1,182,0,160417_FEB22,"POLYGON ((149334.063 -434788.986, 149349.243 -...",1,306536600,10997,16.0,15.0,True,,
2,182,0,160417_FEB22,"POLYGON ((149181.592 -435478.694, 149181.809 -...",2,2337026338,14378,21.0,21.0,True,,
3,182,0,160417_FEB22,"POLYGON ((149579.273 -436261.193, 149580.419 -...",3,4232774068,13668,16.0,18.0,True,,
4,182,0,160417_FEB22,"POLYGON ((150476.929 -436765.871, 150544.530 -...",4,1647285271,7277,11.0,10.0,True,,


In [None]:
def single_shape_hqta(
    routelines, trips, stop_times, stops, route_count_by_stop, shape_id, hqta
):
    """Split one GTFS shape into segments and flag each segment's HQTA status.

    Parts of the shape that fall inside ``hqta`` (segments computed for earlier
    shapes) are removed first, since their peak frequency and HQTA status were
    already calculated for a previous shape.

    Parameters
    ----------
    routelines : GeoDataFrame
        Must have ``shape_id`` and ``geometry`` columns.
    trips : DataFrame
        Not used in the body; kept so existing callers keep working.
    stop_times : DataFrame
        Must have ``stop_id``, ``trip_id`` and ``departure_time`` columns.
    stops : GeoDataFrame
        Passed through to ``utilities.find_stop_with_high_trip_count``.
    route_count_by_stop : DataFrame
        Only stored in ``debug_dict`` for debug shapes.
    shape_id : str
        The shape to process.
    hqta : GeoDataFrame
        Segments computed so far; may be empty.

    Returns
    -------
    DataFrame or None
        One row per segment with ``am_max_trips``, ``pm_max_trips`` and
        ``hq_transit_corr``. ``None`` when the shape has no geometry, is already
        fully covered by ``hqta``, has no ``calitp_itp_id`` column, has no
        stops, or has no usable departure times.

    Notes
    -----
    Reads the notebook-level globals ``debug_ids`` and ``debug_dict``.
    """

    single_line = routelines >> filter(_.shape_id == shape_id)
    if single_line.size == 0 or single_line.geometry.isna().all():
        print(f"no geometry for shape {shape_id}")
        return

    is_debug_shape = shape_id in debug_ids
    if is_debug_shape:
        print(f"***debug shape*** {shape_id}")
        debug_dict[f"{shape_id}_single_line"] = single_line
        debug_dict[f"{shape_id}_hqta"] = hqta

    calculated_stops = []
    ## TODO any way to make more efficient? a pre-calculated shape overlap? something else?
    if hqta.size != 0:
        ## get single polygon of HQTA calculation complete area
        already_calculated = hqta.dissolve(by="calitp_itp_id")
        ## drop calculation complete area from current shape
        single_line = single_line.overlay(already_calculated, how="difference")
        if single_line.size == 0:
            print(f"already calculated corridor for shape {shape_id}")
            return

    ## one frame per segment, concatenated once instead of growing a frame in the loop
    segment_frames = []
    for segment in utilities.create_segments(single_line.geometry):
        piece = single_line.drop(columns=["geometry"])
        piece["geometry"] = segment
        segment_frames.append(piece)
    segmented = pd.concat([gpd.GeoDataFrame(), *segment_frames])

    ## the fresh 0..n-1 index becomes the segment sequence below; the old index
    ## is kept as an "index" column and dropped at the end
    segmented = segmented.reset_index()
    if is_debug_shape:
        debug_dict[f"{shape_id}_a_segmented"] = segmented

    segmented["segment_sequence"] = segmented.index.astype(str)
    if "calitp_itp_id" not in segmented.columns:
        print(f"segmented shape has no itp_id {shape_id}")
        return
    ## calitp_itp_id is cast to str only when building the id below;
    ## casting the column itself is bad for exports

    ## compute (hopefully unique) hash of segment id that can be used across routes/operators
    segmented["hqta_segment_id"] = segmented.apply(
        lambda x: zlib.crc32(
            (str(x.calitp_itp_id) + x.shape_id + x.segment_sequence).encode("utf-8")
        ),
        axis=1,
    )

    ## generous buffer for street/sidewalk width? Required to spatially find stops within each segment
    segmented.geometry = segmented.buffer(50)

    if is_debug_shape:
        debug_dict[f"{shape_id}_segmented"] = segmented
        debug_dict[f"{shape_id}_stops"] = stops
        debug_dict[f"{shape_id}_route_ct_by_stop"] = route_count_by_stop

    ## presumably picks the single busiest stop per segment (third arg = 1) -- confirm in utilities
    segments_with_max_stop = segmented.apply(
        utilities.find_stop_with_high_trip_count,
        axis=1,
        args=(stops, stop_times, 1, calculated_stops),
    )

    if "stop_id" not in segments_with_max_stop.columns:
        print(f"no stops for shape {shape_id}")
        return  ## no stops within segment

    ## filter stop_times to the key stops in each segment
    max_stop_times = (
        stop_times
        >> select(_.stop_id, _.trip_id, _.departure_time)
        >> inner_join(_, segments_with_max_stop, on="stop_id")
    )
    max_stop_times = max_stop_times.dropna(subset=["departure_time"])
    ## filter duplicates for top2 approach
    max_stop_times = max_stop_times.drop_duplicates(
        subset=["trip_id", "hqta_segment_id"]
    )

    if is_debug_shape:
        debug_dict[f"{shape_id}_max_stop0"] = max_stop_times

    ## nothing left to parse or count; bail out before the time handling
    if max_stop_times.size == 0:
        print(f"no commute hour trips for shape {shape_id}")
        return

    ## reformat GTFS time to a format datetime can ingest
    max_stop_times["departure_time"] = max_stop_times["departure_time"].apply(
        utilities.fix_arrival_time
    )
    max_stop_times["departure_dt"] = max_stop_times["departure_time"].apply(
        lambda x: dt.datetime.strptime(x, "%H:%M:%S")
    )
    max_stop_times["departure_hour"] = max_stop_times["departure_dt"].apply(
        lambda x: x.hour
    )

    if is_debug_shape:
        debug_dict[f"{shape_id}_max_stop"] = max_stop_times

    ## new flexible peak: busiest single hour before noon and from noon onward
    segment_am_max = (
        max_stop_times
        >> count(_.hqta_segment_id, _.departure_hour)
        >> filter(_.departure_hour < 12)
        >> group_by(_.hqta_segment_id)
        >> summarize(am_max_trips=_.n.max())
    )

    segment_pm_max = (
        max_stop_times
        >> count(_.hqta_segment_id, _.departure_hour)
        >> filter(_.departure_hour >= 12)
        >> group_by(_.hqta_segment_id)
        >> summarize(pm_max_trips=_.n.max())
    )

    ## NOTE(review): the except branch is part of the control flow -- it is the
    ## path taken when no segment has both AM and PM trips (the join or the
    ## row-wise apply presumably raises then). Keep the apply as-is unless that
    ## case gets an explicit check.
    try:
        segment_peak_service = segment_am_max >> inner_join(
            _, segment_pm_max, on="hqta_segment_id"
        )
        segment_peak_service["hq_transit_corr"] = segment_peak_service.apply(
            lambda x: x.am_max_trips > 4 and x.pm_max_trips > 4, axis=1
        )
    except Exception:  ## append when all segments only have am or pm trips, not an hqta by definition
        segment_peak_service = pd.concat((segment_am_max, segment_pm_max))
        segment_peak_service["hq_transit_corr"] = False

    segment_peak_service = segment_peak_service.reset_index(drop=True)

    single_hqta = segments_with_max_stop >> inner_join(
        _, segment_peak_service, on="hqta_segment_id"
    )
    single_hqta = single_hqta >> select(-_.calitp_extracted_at, -_.index, -_.n_trips)

    if is_debug_shape:
        debug_dict[f"{shape_id}_single_hqta"] = single_hqta

    return single_hqta.reset_index(drop=True)

In [None]:
def single_operator_hqta(itp_id, analysis_date):
    """Run ``single_shape_hqta`` over every shape of one operator.

    Shapes are processed in descending order of trip count, and each result is
    appended to a running frame that is passed back into the next call so
    already-computed areas are skipped.

    Parameters
    ----------
    itp_id : int or str
        Operator id; used to build the local parquet file names.
    analysis_date : datetime.date
        Not used in the body. NOTE(review): file names are built from the
        notebook-level global ``date_str``, which must already be defined.

    Returns
    -------
    GeoDataFrame
        Accumulated segments for all shapes (empty if no shape produced any).

    Notes
    -----
    Resets the notebook-level global ``debug_dict`` to an empty dict.
    """

    global debug_dict
    debug_dict = {}

    ## Local extracts, so testing does not touch GCS. The warehouse alternative
    ## is rt_utils.get_routelines / get_trips(force_clear=True, route_types=['3'])
    ## / get_stop_times / get_stops with (itp_id, analysis_date).
    routelines = gpd.read_parquet(f"./data/routelines_{itp_id}_{date_str}.parquet")
    trips = pd.read_parquet(f"./data/trips_{itp_id}_{date_str}.parquet")
    stop_times = pd.read_parquet(f"./data/st_{itp_id}_{date_str}.parquet")
    stops = gpd.read_parquet(f"./data/stops_{itp_id}_{date_str}.parquet")

    ## one representative trip per (route, shape, direction)
    distinct_routes = (
        trips
        >> distinct(_.route_id, _.shape_id, _.direction_id, _keep_all=True)
        >> select(_.calitp_itp_id, _.route_id, _.shape_id, _.direction_id, _.trip_id)
    )

    route_count_by_stop = (
        stop_times
        >> select(_.stop_id, _.trip_id)
        >> inner_join(_, distinct_routes, on="trip_id")
        >> count(_.stop_id)
        >> rename(n_routes=_.n)
        >> arrange(-_.n_routes)
    )

    hqta = gpd.GeoDataFrame()
    ## start with shapes including the highest number of trips
    trips_shape_sorted = (
        trips.groupby("shape_id")
        .count()
        .sort_values(by="trip_id", ascending=False)
        .index
    )
    total_shapes = len(trips_shape_sorted)
    print(f"there are {total_shapes} shapes total")
    for ix, shape_id in enumerate(trips_shape_sorted):
        print(f"calculating for shape_id {shape_id}")
        if ix % 25 == 0:
            print(
                f"progress: {ix}/{total_shapes} shapes ({round(((ix/total_shapes)*100), 2)}%)"
            )
        result = single_shape_hqta(
            routelines, trips, stop_times, stops, route_count_by_stop, shape_id, hqta
        )
        ## single_shape_hqta returns None when a shape yields nothing
        if result is not None:
            hqta = pd.concat((hqta, result))

        ## Best effort: the frame has no geometry to tag until the first shape
        ## produces segments. The CRS is set inside the loop because hqta is
        ## fed back into single_shape_hqta for the overlay.
        try:
            hqta = hqta.set_crs(geography_utils.CA_NAD83Albers)
        except (AttributeError, ValueError):
            pass
    ## NOTE(review): an earlier draft also dropped the 'n' and 'departure_hour'
    ## columns and reset the index before returning.
    return hqta

## single_operator_hqta

Tease `single_operator_hqta` apart into smaller steps.

Use LA Metro and save files locally; don't overwrite anything in GCS while testing.

In [None]:
def single_shape_hqta2(routelines, trips, stop_times, stops):
    """Draft refactor of ``single_shape_hqta`` built from smaller helpers.

    Segments one shape, finds the key stop per segment, and computes the
    busiest-hour trip counts before noon and from noon onward.

    Parameters
    ----------
    routelines : GeoDataFrame
        Must have ``shape_id`` and ``geometry`` columns.
    trips : DataFrame
        Not used in the body.
    stop_times : DataFrame
        Must have ``stop_id``, ``trip_id`` and ``departure_time`` columns.
    stops : GeoDataFrame
        Passed through to ``hqta_segment_to_stops``.

    Returns
    -------
    DataFrame
        One row per segment that has both AM and PM trips, with
        ``hqta_segment_id``, ``am_max_trips``, ``pm_max_trips`` and
        ``hq_transit_corr`` (True when both counts exceed 4).

    Notes
    -----
    NOTE(review): ``shape_id`` is read from the notebook's global scope rather
    than passed in; it should become a parameter once callers are updated.
    Relies on ``add_segment_id``, ``hqta_segment_to_stops``,
    ``find_segment_with_high_trip_count``, ``clean_stop_times`` and
    ``count_peak_trips`` being defined before this is called.
    """
    single_line = routelines >> filter(_.shape_id == shape_id)

    segment_frames = []
    for segment in utilities.create_segments(single_line.geometry):
        piece = single_line.drop(columns=["geometry"])
        piece["geometry"] = segment
        segment_frames.append(piece)

    ## ignore_index gives each segment its own 0..n-1 label. Without it every
    ## row keeps single_line's one index label, which is presumably why this
    ## path produced a single hqta_segment_id for a shape that has 15 in the
    ## saved output -- confirm against add_segment_id.
    segmented = pd.concat([gpd.GeoDataFrame(), *segment_frames], ignore_index=True)

    segmented = add_segment_id(segmented)

    trip_count_by_stop = hqta_segment_to_stops(segmented, stops, stop_times)
    segments_with_max_stop = find_segment_with_high_trip_count(trip_count_by_stop, 1)

    ## filter stop_times to the key stops in each segment
    max_stop_times = (
        stop_times
        >> select(_.stop_id, _.trip_id, _.departure_time)
        >> inner_join(_, segments_with_max_stop, on="stop_id")
    )

    max_stop_times2 = clean_stop_times(max_stop_times)

    segment_am_max = count_peak_trips(
        max_stop_times2[max_stop_times2.departure_hour < 12], "am_max_trips")

    segment_pm_max = count_peak_trips(
        max_stop_times2[max_stop_times2.departure_hour >= 12], "pm_max_trips")

    segment_peak_service = segment_am_max >> inner_join(
        _, segment_pm_max, on="hqta_segment_id"
    )

    ## vectorized so an empty join yields an empty boolean column instead of raising
    segment_peak_service["hq_transit_corr"] = (
        (segment_peak_service.am_max_trips > 4)
        & (segment_peak_service.pm_max_trips > 4)
    )

    return segment_peak_service

In [49]:
stop_times >> count(_.trip_id) >> head(2)

Unnamed: 0,trip_id,n
0,10002011190254-FEB22,102
1,10002011190354-FEB22,102


In [None]:
single_line = routelines >> filter(_.shape_id == shape_id)

In [5]:
# Load the locally saved extracts for one operator and date.
# NOTE(review): `itp_id` and `date_str` are not defined in any visible cell, so
# this cell depends on earlier kernel state -- define them in the config cell.
routelines = gpd.read_parquet(f"./data/routelines_{itp_id}_{date_str}.parquet")
trips = pd.read_parquet(f"./data/trips_{itp_id}_{date_str}.parquet")
stop_times = pd.read_parquet(f"./data/st_{itp_id}_{date_str}.parquet")
stops = gpd.read_parquet(f"./data/stops_{itp_id}_{date_str}.parquet")

In [6]:
# Keep one trip row per distinct (route_id, shape_id, direction_id) combination,
# then narrow to the id columns needed for the join in the next cell.
distinct_routes = (
    trips
    >> distinct(_.route_id, _.shape_id, _.direction_id, _keep_all=True)
    >> select(_.calitp_itp_id, _.route_id, _.shape_id, _.direction_id, _.trip_id)
)

In [7]:
# Join stop_times to the representative trips in distinct_routes, count the
# matching rows per stop, and sort stops from most to fewest (n_routes).
route_count_by_stop = (
        stop_times
        >> select(_.stop_id, _.trip_id)
        >> inner_join(_, distinct_routes, on="trip_id")
        >> count(_.stop_id)
        >> rename(n_routes=_.n)
        >> arrange(-_.n_routes)
    )

In [8]:
# shape_ids ordered by number of trips, busiest shape first.
trips_shape_sorted = (
    trips.groupby("shape_id")
    .count()
    .sort_values(by="trip_id", ascending=False)
    .index
)


# Wrap the Index in a Series so positions 0..n-1 map to shape_ids.
trips_shape_sorted = pd.Series(trips_shape_sorted)

In [10]:
# Start with no previously computed segments.
hqta = gpd.GeoDataFrame()
# Empty, so no shape triggers the debug branches in single_shape_hqta.
debug_ids = {}

# First entry of the sorted Series, i.e. the shape with the most trips.
shape_id = trips_shape_sorted[0]

In [None]:
result = single_shape_hqta(
    routelines, trips, stop_times, stops, route_count_by_stop, shape_id, hqta
)

In [11]:
single_line = routelines >> filter(_.shape_id == shape_id)

In [12]:
single_line

Unnamed: 0,calitp_itp_id,calitp_url_number,shape_id,geometry
171,182,0,160417_FEB22,"LINESTRING (149346.720 -435472.243, 149151.690..."


In [13]:
# Build one frame per segment and concatenate once, instead of re-copying a
# growing frame on every iteration.
segment_frames = []
for segment in utilities.create_segments(single_line.geometry):
    to_append = single_line.drop(columns=["geometry"])
    to_append["geometry"] = segment
    segment_frames.append(to_append)

# NOTE(review): the index is not reset here, so every row keeps single_line's
# one index label. single_shape_hqta resets the index before deriving
# segment_sequence from it -- possibly why only one hqta_segment_id shows up
# for this shape further down. Confirm against add_segment_id.
segmented = pd.concat([gpd.GeoDataFrame(), *segment_frames])

In [None]:
len(segmented)

# At this point, one shape_id was split into 15 segments
# each around 1,250 m
# Create a segment id that can be used across routes/operators

In [20]:
df[df.shape_id==shape_id].hqta_segment_id.unique()

array([1698844686,  306536600, 2337026338, 4232774068, 1647285271,
        354985089, 2351035707, 4213622189, 1805244476,  480176298,
        169705783, 2098901409, 3826507803, 2467614861,  225461550])

In [19]:
segmented.hqta_segment_id.unique()

array([3839288110])

In [22]:
trip_count_by_stop = hqta_segment_to_stops(segmented, stops, stop_times)
segments_with_max_stop = find_segment_with_high_trip_count(trip_count_by_stop, 1)

In [23]:
segments_with_max_stop

Unnamed: 0,calitp_itp_id,hqta_segment_id,stop_id,n_trips
0,182,3839288110,5138,669


In [31]:
# Attach segment info to every stop_times row whose stop_id appears in
# segments_with_max_stop; rows for other stops are dropped by the inner join.
max_stop_times = (
        stop_times
        >> select(_.stop_id, _.trip_id, _.departure_time)
        >> inner_join(_, segments_with_max_stop, on="stop_id")
    )  ## filter stop_times to the key stops in each segment


def clean_stop_times(df):
    """Drop unusable stop_times rows and derive each departure's hour.

    Rows with a missing ``departure_time`` are removed, then one row is kept
    per (``trip_id``, ``hqta_segment_id``) pair. ``departure_time`` is rewritten
    through ``utilities.fix_arrival_time``, and two columns are added:
    ``departure_dt`` (parsed with ``rt_utils.HOUR_MIN_SEC_FMT``) and
    ``departure_hour``.

    Returns a new frame; the input frame is not modified.
    """

    def _parse_clock(time_str):
        return dt.datetime.strptime(time_str, rt_utils.HOUR_MIN_SEC_FMT)

    ## filter duplicates for top2 approach
    deduped = df.dropna(subset=["departure_time"]).drop_duplicates(
        subset=["trip_id", "hqta_segment_id"]
    )

    ## assign evaluates in order, so each step sees the column written before it
    return deduped.assign(
        ## reformat GTFS time to a format datetime can ingest
        departure_time=lambda d: d["departure_time"].apply(
            utilities.fix_arrival_time
        ),
        departure_dt=lambda d: d["departure_time"].apply(_parse_clock),
        departure_hour=lambda d: d["departure_dt"].apply(lambda stamp: stamp.hour),
    )

max_stop_times2 = clean_stop_times(max_stop_times)

In [35]:
def count_peak_trips(df, new_col):
    """Return each segment's trip count in its busiest hour.

    Rows of ``df`` are counted per (``hqta_segment_id``, ``departure_hour``),
    and the largest hourly count per segment is returned in a column named
    ``new_col``.
    """
    trips_per_hour = df >> count(_.hqta_segment_id, _.departure_hour)

    busiest_hour = (
        trips_per_hour
        >> group_by(_.hqta_segment_id)
        >> summarize(new_col=_.n.max())
    )

    ## summarize needs a literal keyword, so the real column name is applied afterwards
    return busiest_hour.rename(columns={"new_col": new_col})

In [41]:
# Busiest-hour trip counts per segment, before noon and from noon onward.
segment_am_max = count_peak_trips(
    max_stop_times2[max_stop_times2.departure_hour < 12], "am_max_trips")

segment_pm_max = count_peak_trips(
    max_stop_times2[max_stop_times2.departure_hour >= 12], "pm_max_trips")

# Only segments with both AM and PM trips survive the inner join.
segment_peak_service = segment_am_max >> inner_join(
    _, segment_pm_max, on="hqta_segment_id"
)

# A segment qualifies when its busiest AM hour and busiest PM hour each have
# more than 4 trips. Vectorized so an empty join does not raise.
segment_peak_service["hq_transit_corr"] = (
    (segment_peak_service.am_max_trips > 4)
    & (segment_peak_service.pm_max_trips > 4)
)

In [42]:
segment_peak_service

Unnamed: 0,hqta_segment_id,am_max_trips,pm_max_trips,hq_transit_corr
0,3839288110,45,45,True


In [27]:
df[(df.shape_id==shape_id) & (df.stop_id=="5138")]

Unnamed: 0,calitp_itp_id,calitp_url_number,shape_id,geometry,segment_sequence,hqta_segment_id,stop_id,am_max_trips,pm_max_trips,hq_transit_corr,departure_hour,n
13,182,0,160417_FEB22,"POLYGON ((160648.100 -438885.876, 160643.219 -...",13,2467614861,5138,45.0,45.0,True,,
