In [1]:
import sys

In [131]:
import intake
import pandas as pd
import geopandas as gpd
import numpy as np
from calitp_data_analysis import geography_utils
from siuba import *
import gtfs_segments
import shapely

In [3]:
from shared_utils import catalog_utils, rt_dates, rt_utils
GTFS_DATA_DICT = catalog_utils.get_catalog("gtfs_analytics_data")

In [4]:
from segment_speed_utils import helpers

In [5]:
from tqdm import tqdm
tqdm.pandas(desc="Progress")

In [6]:
analysis_date = rt_dates.DATES['feb2025']

# Aggregations

Combine trip-level info with border zone info and stops in tract/border info, then aggregate.

## Methodology

* analysis segment in tract & shape has 1+ stops in tract -> allocate vrm, vrh to that tract
* analysis segment in border zone & shape has 1+ stops in zone -> allocate vrm, vrh to that zone
    * sub-allocate border zone vrm, vrh 50/50 to bordering tracts
* analysis segment in tract or border zone but shape has 0 stops in tract/zone
    * allocate 50/50 to adjacent tracts or zones, repeat above 

In [134]:
trip_tsi_alameda = pd.concat([pd.read_parquet('./trips_set1_tsi_segs_alameda_2025-02-12.parquet'),
                             pd.read_parquet('./trips_set2_tsi_segs_alameda_2025-02-12.parquet')])

In [135]:
trip_tsi_alameda.head(3)

Unnamed: 0,shape_array_key,tsi_segment_id,start_meters,tsi_segment_meters,trip_instance_key,arrival_sec,arrival_sec_next,segment_seconds
71580,4dff2f7bd084547a24529a02806234d0,f4d8a196-1a7a-4a34-8ab3-1310c39bb429,10873.972623,266.319586,7936e1ebd5e663bc3c0e621579b40329,87052.024038,87073.502298,21.47826
71596,4dff2f7bd084547a24529a02806234d0,f4d8a196-1a7a-4a34-8ab3-1310c39bb429,10873.972623,266.319586,3b584e25a7ce90b5c7814e8ace9598ea,26030.851102,26053.502298,22.651195
71604,4dff2f7bd084547a24529a02806234d0,f4d8a196-1a7a-4a34-8ab3-1310c39bb429,10873.972623,266.319586,8b67bae021d2b2a32d8d99ab369f0762,34488.270644,34518.003063,29.73242


In [252]:
def read_shapes_stopping_in_seg(analysis_date):
    cols = ['shape_array_key', 'tsi_segment_id']
    sstb = pd.read_parquet(f'./shape_stops_tracts_borders_{analysis_date}.parquet')[cols]
    sstb['has_stop'] = True
    return sstb

In [253]:
sstb = read_shapes_stopping_in_seg(analysis_date)

In [254]:
sstb.head(3)

Unnamed: 0,shape_array_key,tsi_segment_id,has_stop
0,3c4985abe54a0185f7b7e9dc726d5e11,06001400100,True
2294,3c4985abe54a0185f7b7e9dc726d5e11,fa118075-aeb1-4986-81bc-b312385b09a9,True
2295,3c4985abe54a0185f7b7e9dc726d5e11,84aea543-aa00-4882-bbdc-08b596ba1456,True


In [255]:
def attach_stopping_info(trip_segment_df, shape_stopping_df):
    '''
    '''
    df = trip_segment_df.merge(shape_stopping_df, how='left', on=['shape_array_key', 'tsi_segment_id'])
    df.has_stop = df.has_stop.fillna(False)
    return df

In [256]:
joined = attach_stopping_info(trip_tsi_alameda, sstb)

In [257]:
joined.query('has_stop').tsi_segment_id.value_counts()

9fd233b8-6dee-4d0a-871e-8e11a6563b58    25876
1e00c86d-91aa-462a-8e89-6b96e247944f    16428
ca3c9577-87ca-4db4-8b36-5f7f1072170c    12668
d6f6b887-19d8-4b8b-978e-fa3819245d3b    10764
617fe0ff-8999-4e5e-8d06-3a2d65eeb616    10517
                                        ...  
9cd5899f-e0ca-493e-962b-1d8953091fb0        3
06001407000                                 2
06001450605                                 2
cdec1d4d-6b14-49f8-883c-4bc64166149a        2
67e03416-0f5e-4cd0-be9b-e5d3e0a5a496        1
Name: tsi_segment_id, Length: 674, dtype: int64

In [258]:
bart_shape_array = 'db1920458bee7ea9de34b68eb9f4d8a5'

## test aggregation

In [259]:
bart = joined.query('shape_array_key == @bart_shape_array')

In [260]:
bart.sort_values(by=['trip_instance_key', 'start_meters'])

Unnamed: 0,shape_array_key,tsi_segment_id,start_meters,tsi_segment_meters,trip_instance_key,arrival_sec,arrival_sec_next,segment_seconds,has_stop
591415,db1920458bee7ea9de34b68eb9f4d8a5,d5943833-7265-4344-8c41-d3885e324d22,0.0,220.775044,8c29a9682d3d980f26790edfc02fdb4e,17040.0,17047.732648,7.732648,False
591416,db1920458bee7ea9de34b68eb9f4d8a5,efe1744a-37d5-439c-ba0c-a94fe7cde241,152.876295,2008.070855,8c29a9682d3d980f26790edfc02fdb4e,17047.732648,17145.834656,98.102008,False
591417,db1920458bee7ea9de34b68eb9f4d8a5,e6993038-7716-4244-ad54-49e12404126d,2091.626481,2116.26204,8c29a9682d3d980f26790edfc02fdb4e,17145.834656,17249.412026,103.57737,False
591418,db1920458bee7ea9de34b68eb9f4d8a5,daa07c56-0aca-4403-906b-0c26efc779ce,4138.584024,1452.569693,8c29a9682d3d980f26790edfc02fdb4e,17249.412026,17320.491657,71.079631,False
591419,db1920458bee7ea9de34b68eb9f4d8a5,f3b15ae2-5d14-454d-9283-da9baf9636d9,5520.781123,196.459744,8c29a9682d3d980f26790edfc02fdb4e,17320.491657,17326.829759,6.338102,False
591420,db1920458bee7ea9de34b68eb9f4d8a5,046e7852-fc9e-4187-a65e-bbcdd058adc4,5642.513311,540.772714,8c29a9682d3d980f26790edfc02fdb4e,17326.829759,17353.765047,26.935288,False
591421,db1920458bee7ea9de34b68eb9f4d8a5,06001435601,6159.843501,218.788462,8c29a9682d3d980f26790edfc02fdb4e,17353.765047,17365.132449,11.367402,False
591422,db1920458bee7ea9de34b68eb9f4d8a5,06001435602,6378.170477,880.931663,8c29a9682d3d980f26790edfc02fdb4e,17365.132449,17410.998993,45.866545,False
591423,db1920458bee7ea9de34b68eb9f4d8a5,06001433700,7259.10214,708.587876,8c29a9682d3d980f26790edfc02fdb4e,17410.998993,17447.892295,36.893301,False
591424,db1920458bee7ea9de34b68eb9f4d8a5,06001434000,7967.690015,505.381763,8c29a9682d3d980f26790edfc02fdb4e,17447.892295,17474.857958,26.965663,False


In [261]:
act_6_sa = '3caab5c44277cbdc8fbc755bc0ea7633'

In [262]:
joined.query('tsi_segment_id == "66da07ad-5ef4-42bb-89b6-ca5cd53d076d" & has_stop')

Unnamed: 0,shape_array_key,tsi_segment_id,start_meters,tsi_segment_meters,trip_instance_key,arrival_sec,arrival_sec_next,segment_seconds,has_stop


In [263]:
sstb.query('tsi_segment_id == "4462356b-89b4-4d5c-8ee1-3926722dd2e5"')

Unnamed: 0,shape_array_key,tsi_segment_id,has_stop
2776,550f52ed44a5f60196bb37cfc227ba91,4462356b-89b4-4d5c-8ee1-3926722dd2e5,True
3803,550f52ed44a5f60196bb37cfc227ba91,4462356b-89b4-4d5c-8ee1-3926722dd2e5,True
2776,3caab5c44277cbdc8fbc755bc0ea7633,4462356b-89b4-4d5c-8ee1-3926722dd2e5,True
3803,3caab5c44277cbdc8fbc755bc0ea7633,4462356b-89b4-4d5c-8ee1-3926722dd2e5,True
2776,550f52ed44a5f60196bb37cfc227ba91,4462356b-89b4-4d5c-8ee1-3926722dd2e5,True
3803,550f52ed44a5f60196bb37cfc227ba91,4462356b-89b4-4d5c-8ee1-3926722dd2e5,True
2776,3caab5c44277cbdc8fbc755bc0ea7633,4462356b-89b4-4d5c-8ee1-3926722dd2e5,True
3803,3caab5c44277cbdc8fbc755bc0ea7633,4462356b-89b4-4d5c-8ee1-3926722dd2e5,True
4818,a14dbc109d7f73a04bc5ee3946d40008,4462356b-89b4-4d5c-8ee1-3926722dd2e5,True
6619,a14dbc109d7f73a04bc5ee3946d40008,4462356b-89b4-4d5c-8ee1-3926722dd2e5,True


In [264]:
sstb.query('shape_array_key == @act_6_sa & has_stop')

Unnamed: 0,shape_array_key,tsi_segment_id,has_stop
2,3caab5c44277cbdc8fbc755bc0ea7633,06001400300,True
2,3caab5c44277cbdc8fbc755bc0ea7633,06001400300,True
3,3caab5c44277cbdc8fbc755bc0ea7633,06001400400,True
2776,3caab5c44277cbdc8fbc755bc0ea7633,4462356b-89b4-4d5c-8ee1-3926722dd2e5,True
3803,3caab5c44277cbdc8fbc755bc0ea7633,4462356b-89b4-4d5c-8ee1-3926722dd2e5,True
...,...,...,...
142,3caab5c44277cbdc8fbc755bc0ea7633,06001423602,True
142,3caab5c44277cbdc8fbc755bc0ea7633,06001423602,True
146,3caab5c44277cbdc8fbc755bc0ea7633,06001423902,True
146,3caab5c44277cbdc8fbc755bc0ea7633,06001423902,True


In [265]:
sstb_geo = gpd.read_parquet(f'./shape_stops_tracts_borders_{analysis_date}.parquet')
# sstb_geo.query('shape_array_key == @act_6_sa').explore()

### handle snap to tracts/borders with stops

In [266]:
def locate_stopping_segments(row, df):
    if row.has_stop:
        return row
    else:
        id_before = None
        id_after = None
        # print(row.name)
        stop_before = df.loc[:(row.name - 1)].query('has_stop')
        if not stop_before.empty:
            id_before = stop_before.query('start_meters == start_meters.max()').tsi_segment_id.iloc[0]
        stop_after = df.loc[(row.name + 1):].query('has_stop')
        if not stop_after.empty:
            id_after = stop_after.query('start_meters == start_meters.min()').tsi_segment_id.iloc[0]
        row['stopping_segments'] = (id_before, id_after)
        # return (id_before, id_after)
        return row

In [267]:
def assign_stopping_sequences(joined_df):
    '''
    with a joined trip tsi segment df and shape
    stopping df, create a new df by shape showing 
    which tsi segments (tracts or border zones) vrh & vrm
    should be allocated to when there are no stops for that
    shape in that segment
    '''
    cols = ['shape_array_key', 'start_meters', 'tsi_segment_id', 'has_stop']
    simple_sequence_df = (joined_df[cols]
                          .drop_duplicates()
                          .sort_values(['shape_array_key', 'start_meters'])
                          .reset_index(drop=True)
                         )
    fn = lambda df: df.apply(locate_stopping_segments, df=df, axis=1)
    #  tuples will be (None, id) where there are no previous stops, or (id, None) where no subsequent stops
    stopping_sequences_df = simple_sequence_df.groupby('shape_array_key', group_keys=False).progress_apply(fn)
    #  scrub nones from tuples for accurate count:
    stopping_sequences_df.stopping_segments = stopping_sequences_df.stopping_segments.map(
            lambda y: y if type(y) != tuple else tuple(x for x in y if x))
    stopping_sequences_df['n_stopping_segments'] = stopping_sequences_df.stopping_segments.map(
            lambda y: y if type(y) != tuple else len(y)).fillna(1)
    unassigned = stopping_sequences_df.query('n_stopping_segments == 0')
    print(f'{unassigned.shape[0]} segments out of {stopping_sequences_df.shape[0]} can not be matched to a stop')
    stopping_sequences_df = stopping_sequences_df.query('n_stopping_segments >= 1')
    #  divide time and distance in tsi segments by number of segments post-explode
    joined_df = (joined_df.merge(stopping_sequences_df, on=['has_stop', 'shape_array_key', 'start_meters', 'tsi_segment_id'])
                 .explode('stopping_segments')
                )
    joined_df = joined_df.assign(tsi_segment_meters = joined_df.tsi_segment_meters / joined_df.n_stopping_segments,
                         segment_seconds = joined_df.segment_seconds / joined_df.n_stopping_segments
                        )
    #  replace tsi_segment_id with stopping_segment if present, df can now be aggregated normally on tsi_segment_id
    joined_df.stopping_segments = joined_df.stopping_segments.fillna(joined_df.tsi_segment_id)
    # joined_df.tsi_segment_id = new_id #  can't do this in one line, assignment issue...
    return joined_df

In [268]:
joined = attach_stopping_info(trip_tsi_alameda, sstb)
stopping_sequences_df = assign_stopping_sequences(joined)

Progress: 100%|██████████| 501/501 [00:30<00:00, 16.64it/s]


159 segments out of 9855 can not be matched to a stop


### handle snap to bordering tracts

In [269]:
borders = gpd.read_parquet('test_tracts_borders_2025-02-12.parquet').query('border')

In [270]:
border_cols = ['tsi_segment_id', 'border_tracts', 'border']

In [271]:
borders = borders.assign(border_tracts = tuple(zip(borders.tract_1, borders.tract_2)))[border_cols].drop_duplicates()

In [272]:
stopping_sequences_df.dtypes

shape_array_key         object
tsi_segment_id          object
start_meters           float64
tsi_segment_meters     float64
trip_instance_key       object
arrival_sec            float64
arrival_sec_next       float64
segment_seconds        float64
has_stop                  bool
stopping_segments       object
n_stopping_segments    float64
dtype: object

In [273]:
borders.dtypes

tsi_segment_id    object
border_tracts     object
border              bool
dtype: object

In [274]:
# stopping_sequences_df.tsi_segment_id.unique()

In [275]:
stopping_sequences_df.query('has_stop').merge(borders, on='tsi_segment_id')

Unnamed: 0,shape_array_key,tsi_segment_id,start_meters,tsi_segment_meters,trip_instance_key,arrival_sec,arrival_sec_next,segment_seconds,has_stop,stopping_segments,n_stopping_segments,border_tracts,border
0,4dff2f7bd084547a24529a02806234d0,f4d8a196-1a7a-4a34-8ab3-1310c39bb429,10873.972623,266.319586,7936e1ebd5e663bc3c0e621579b40329,87052.024038,87073.502298,21.478260,True,f4d8a196-1a7a-4a34-8ab3-1310c39bb429,1.0,"(06001406602, 06001404800)",True
1,4dff2f7bd084547a24529a02806234d0,f4d8a196-1a7a-4a34-8ab3-1310c39bb429,10873.972623,266.319586,7936e1ebd5e663bc3c0e621579b40329,87052.024038,87073.502298,21.478260,True,f4d8a196-1a7a-4a34-8ab3-1310c39bb429,1.0,"(06001406602, 06001404800)",True
2,4dff2f7bd084547a24529a02806234d0,f4d8a196-1a7a-4a34-8ab3-1310c39bb429,10873.972623,266.319586,7936e1ebd5e663bc3c0e621579b40329,87052.024038,87073.502298,21.478260,True,f4d8a196-1a7a-4a34-8ab3-1310c39bb429,1.0,"(06001406602, 06001404800)",True
3,4dff2f7bd084547a24529a02806234d0,f4d8a196-1a7a-4a34-8ab3-1310c39bb429,10873.972623,266.319586,7936e1ebd5e663bc3c0e621579b40329,87052.024038,87073.502298,21.478260,True,f4d8a196-1a7a-4a34-8ab3-1310c39bb429,1.0,"(06001406602, 06001404800)",True
4,4dff2f7bd084547a24529a02806234d0,f4d8a196-1a7a-4a34-8ab3-1310c39bb429,10873.972623,266.319586,7936e1ebd5e663bc3c0e621579b40329,87052.024038,87073.502298,21.478260,True,f4d8a196-1a7a-4a34-8ab3-1310c39bb429,1.0,"(06001406602, 06001404800)",True
...,...,...,...,...,...,...,...,...,...,...,...,...,...
454772,c7a8ce6637e75cc2fe2bfb3543d7f85f,4d503f96-05f6-45c3-ba97-6049bbc38912,49577.589101,1046.375279,636e064586351d66bc554c3c53737db5,56957.830941,57003.662849,45.831908,True,4d503f96-05f6-45c3-ba97-6049bbc38912,1.0,"(06001436700, 06001436300)",True
454773,c7a8ce6637e75cc2fe2bfb3543d7f85f,4d503f96-05f6-45c3-ba97-6049bbc38912,49577.589101,1046.375279,636e064586351d66bc554c3c53737db5,56957.830941,57003.662849,45.831908,True,4d503f96-05f6-45c3-ba97-6049bbc38912,1.0,"(06001436700, 06001436300)",True
454774,1502383925b227495e52e6ea3a1ed3c2,4721b2cf-d063-4673-be8e-7213be5ec793,7491.432756,855.067157,4929ffb6b7a1cb045b1f0bc6adde2069,27566.528284,27694.786467,128.258183,True,4721b2cf-d063-4673-be8e-7213be5ec793,1.0,"(06001443105, 06001443102)",True
454775,1502383925b227495e52e6ea3a1ed3c2,4721b2cf-d063-4673-be8e-7213be5ec793,7491.432756,855.067157,34593d1d77330dab04bf975a875441a4,27326.528284,27454.786467,128.258183,True,4721b2cf-d063-4673-be8e-7213be5ec793,1.0,"(06001443105, 06001443102)",True


In [276]:
joined.query('tsi_segment_id == "4462356b-89b4-4d5c-8ee1-3926722dd2e5"')

Unnamed: 0,shape_array_key,tsi_segment_id,start_meters,tsi_segment_meters,trip_instance_key,arrival_sec,arrival_sec_next,segment_seconds,has_stop
24694,6eabeb0cbb0899a0c80ffd855531e5ec,4462356b-89b4-4d5c-8ee1-3926722dd2e5,4276.760064,491.241339,41a1dd74468b81baa6bfd1bbae852827,67702.179654,67781.56353,79.383877,True
24695,6eabeb0cbb0899a0c80ffd855531e5ec,4462356b-89b4-4d5c-8ee1-3926722dd2e5,4276.760064,491.241339,41a1dd74468b81baa6bfd1bbae852827,67702.179654,67781.56353,79.383877,True
24696,6eabeb0cbb0899a0c80ffd855531e5ec,4462356b-89b4-4d5c-8ee1-3926722dd2e5,4276.760064,491.241339,41a1dd74468b81baa6bfd1bbae852827,67702.179654,67781.56353,79.383877,True
24697,6eabeb0cbb0899a0c80ffd855531e5ec,4462356b-89b4-4d5c-8ee1-3926722dd2e5,4276.760064,491.241339,41a1dd74468b81baa6bfd1bbae852827,67702.179654,67781.56353,79.383877,True
24698,6eabeb0cbb0899a0c80ffd855531e5ec,4462356b-89b4-4d5c-8ee1-3926722dd2e5,4276.760064,491.241339,41a1dd74468b81baa6bfd1bbae852827,67702.179654,67781.56353,79.383877,True
...,...,...,...,...,...,...,...,...,...
688845,550f52ed44a5f60196bb37cfc227ba91,4462356b-89b4-4d5c-8ee1-3926722dd2e5,23195.683784,476.662109,581a814d3b8c2b795e111b5fd55c64d1,20780.947702,20856.82365,75.875947,True
688846,550f52ed44a5f60196bb37cfc227ba91,4462356b-89b4-4d5c-8ee1-3926722dd2e5,23195.683784,476.662109,7ab35be1b6901cbf58dc534d001ae3a2,6380.947702,6456.82365,75.875947,True
688847,550f52ed44a5f60196bb37cfc227ba91,4462356b-89b4-4d5c-8ee1-3926722dd2e5,23195.683784,476.662109,7ab35be1b6901cbf58dc534d001ae3a2,6380.947702,6456.82365,75.875947,True
688848,550f52ed44a5f60196bb37cfc227ba91,4462356b-89b4-4d5c-8ee1-3926722dd2e5,23195.683784,476.662109,7ab35be1b6901cbf58dc534d001ae3a2,6380.947702,6456.82365,75.875947,True


In [226]:
stopping_sequences_df.query('shape_array_key == @act_6_sa').sort_values(['start_meters', 'trip_instance_key'])

Unnamed: 0,shape_array_key,tsi_segment_id,start_meters,tsi_segment_meters,trip_instance_key,arrival_sec,arrival_sec_next,segment_seconds,has_stop,stopping_segments,n_stopping_segments
108150,3caab5c44277cbdc8fbc755bc0ea7633,06001403100,0.000000,375.863659,0d9b991fd66489617e4b841440a3ad6d,63720.000000,63835.513869,115.513869,True,06001403100,1.0
108151,3caab5c44277cbdc8fbc755bc0ea7633,06001403100,0.000000,375.863659,0d9b991fd66489617e4b841440a3ad6d,63720.000000,63835.513869,115.513869,True,06001403100,1.0
108298,3caab5c44277cbdc8fbc755bc0ea7633,06001403100,0.000000,375.863659,0f9cdeeec21d606374438df8debc96bb,51480.000000,51595.513869,115.513869,True,06001403100,1.0
108299,3caab5c44277cbdc8fbc755bc0ea7633,06001403100,0.000000,375.863659,0f9cdeeec21d606374438df8debc96bb,51480.000000,51595.513869,115.513869,True,06001403100,1.0
108290,3caab5c44277cbdc8fbc755bc0ea7633,06001403100,0.000000,375.863659,11759effdf52b2ef7f57e3aa7d9ba60d,53640.000000,53755.513869,115.513869,True,06001403100,1.0
...,...,...,...,...,...,...,...,...,...,...,...
110520,3caab5c44277cbdc8fbc755bc0ea7633,49eca057-d61e-4090-a4e5-cfac2de15bdc,8660.491989,378.770805,f648803a1912e0e8aa506cb5fb442cd9,60940.960348,61080.000000,139.039652,False,06001422800,1.0
110478,3caab5c44277cbdc8fbc755bc0ea7633,49eca057-d61e-4090-a4e5-cfac2de15bdc,8660.491989,378.770805,f9ecb55a37e73d7d807fa0faecca1a7e,20444.962727,20520.000000,75.037273,False,06001422800,1.0
110531,3caab5c44277cbdc8fbc755bc0ea7633,49eca057-d61e-4090-a4e5-cfac2de15bdc,8660.491989,378.770805,facc2913553362bd637f0c7be8970221,83123.628601,83220.000000,96.371399,False,06001422800,1.0
110492,3caab5c44277cbdc8fbc755bc0ea7633,49eca057-d61e-4090-a4e5-cfac2de15bdc,8660.491989,378.770805,fe526b1ac72453b43d21d929bc0e4058,65042.294474,65160.000000,117.705526,False,06001422800,1.0


In [232]:
borders.query('tsi_segment_id == "49eca057-d61e-4090-a4e5-cfac2de15bdc"')

Unnamed: 0,tsi_segment_id,border_tracts,border
480,49eca057-d61e-4090-a4e5-cfac2de15bdc,"(06001422900, 06001422600)",True


In [245]:
gpd.read_parquet('./test_tracts_borders_2025-02-12.parquet').query('shape_array_key == @act_6_sa').explore(column='border')