In [1]:
import sys

In [131]:
import intake
import pandas as pd
import geopandas as gpd
import numpy as np
from calitp_data_analysis import geography_utils
from siuba import *
import gtfs_segments
import shapely

In [3]:
from shared_utils import catalog_utils, rt_dates, rt_utils
GTFS_DATA_DICT = catalog_utils.get_catalog("gtfs_analytics_data")

In [4]:
from segment_speed_utils import helpers

In [5]:
from tqdm import tqdm
#  register tqdm's pandas integration: this is what adds `.progress_apply` to
#  DataFrame / GroupBy objects, which assign_stopping_sequences calls further down
tqdm.pandas(desc="Progress")

In [6]:
analysis_date = rt_dates.DATES['feb2025']

# Aggregations

Combine trip-level info with border zone info and stops in tract/border info, then aggregate.

## Methodology

* analysis segment in tract & shape has 1+ stops in tract -> allocate vrm, vrh to that tract
* analysis segment in border zone & shape has 1+ stops in zone -> allocate vrm, vrh to that zone
    * sub-allocate border zone vrm, vrh 50/50 to bordering tracts
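* analysis segment in tract or border zone but shape has 0 stops in tract/zone
    * allocate vrm, vrh 50/50 to the nearest preceding and nearest following segments (tracts or border zones) along the shape where the shape does stop; if a stopping segment exists on only one side, allocate 100% to it; if the shape has no stopping segment at all, the segment is dropped
    * then repeat the rules above for the receiving tracts/zones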

In [134]:
#  trip-level tsi segment rows are stored in two parquet files ("set1" and "set2");
#  read both and stack them. Each file's original row index is kept as-is.
trip_tsi_alameda = pd.concat([
    pd.read_parquet(parquet_path)
    for parquet_path in (
        './trips_set1_tsi_segs_alameda_2025-02-12.parquet',
        './trips_set2_tsi_segs_alameda_2025-02-12.parquet',
    )
])

In [135]:
trip_tsi_alameda.head(3)

Unnamed: 0,shape_array_key,tsi_segment_id,start_meters,tsi_segment_meters,trip_instance_key,arrival_sec,arrival_sec_next,segment_seconds
71580,4dff2f7bd084547a24529a02806234d0,f4d8a196-1a7a-4a34-8ab3-1310c39bb429,10873.972623,266.319586,7936e1ebd5e663bc3c0e621579b40329,87052.024038,87073.502298,21.47826
71596,4dff2f7bd084547a24529a02806234d0,f4d8a196-1a7a-4a34-8ab3-1310c39bb429,10873.972623,266.319586,3b584e25a7ce90b5c7814e8ace9598ea,26030.851102,26053.502298,22.651195
71604,4dff2f7bd084547a24529a02806234d0,f4d8a196-1a7a-4a34-8ab3-1310c39bb429,10873.972623,266.319586,8b67bae021d2b2a32d8d99ab369f0762,34488.270644,34518.003063,29.73242


In [252]:
def read_shapes_stopping_in_seg(analysis_date):
    '''
    Read the (shape_array_key, tsi_segment_id) pairs stored in the
    shape_stops_tracts_borders parquet for analysis_date and flag
    every pair with has_stop = True.

    analysis_date: date string interpolated into the parquet filename

    returns a DataFrame with one row per unique
    shape_array_key / tsi_segment_id pair and a constant has_stop column
    '''
    cols = ['shape_array_key', 'tsi_segment_id']
    #  only load the two key columns; the remaining columns in the file
    #  (it is also read as a GeoDataFrame elsewhere in this notebook) aren't needed here
    sstb = pd.read_parquet(f'./shape_stops_tracts_borders_{analysis_date}.parquet', columns=cols)
    #  one row per pair, so a later merge on these keys can't multiply trip rows
    return sstb.drop_duplicates().assign(has_stop=True)

In [253]:
sstb = read_shapes_stopping_in_seg(analysis_date)

In [312]:
sstb.head(3)

Unnamed: 0,shape_array_key,tsi_segment_id,has_stop
0,3c4985abe54a0185f7b7e9dc726d5e11,06001400100,True
2294,3c4985abe54a0185f7b7e9dc726d5e11,fa118075-aeb1-4986-81bc-b312385b09a9,True
2295,3c4985abe54a0185f7b7e9dc726d5e11,84aea543-aa00-4882-bbdc-08b596ba1456,True


In [255]:
def attach_stopping_info(trip_segment_df, shape_stopping_df):
    '''
    Left-join shape-level stopping info onto trip-level tsi segment rows.

    trip_segment_df: one row per trip per tsi segment, must include
        shape_array_key and tsi_segment_id
    shape_stopping_df: (shape_array_key, tsi_segment_id) pairs where the
        shape has a stop, with a True/False has_stop flag

    returns a new DataFrame with the rows of trip_segment_df (same count,
    same order) plus a bool has_stop column; pairs missing from
    shape_stopping_df get has_stop = False. Neither input is modified.
    '''
    keys = ['shape_array_key', 'tsi_segment_id']
    #  keep a single row per key pair so the left merge can't multiply trip rows,
    #  which would inflate meters and seconds once they are aggregated
    shape_stopping_df = shape_stopping_df.drop_duplicates(subset=keys)
    df = trip_segment_df.merge(shape_stopping_df, how='left', on=keys)
    #  unmatched rows come back as NaN; comparing against True maps them to False
    #  and always yields a real bool column (no object dtype, no fillna downcasting)
    df['has_stop'] = df['has_stop'].eq(True)
    return df

In [256]:
joined = attach_stopping_info(trip_tsi_alameda, sstb)

In [258]:
bart_shape_array = 'db1920458bee7ea9de34b68eb9f4d8a5'

In [289]:
act_6_shape_array = '3caab5c44277cbdc8fbc755bc0ea7633'

## test aggregation

In [265]:
#  same parquet that read_shapes_stopping_in_seg reads, loaded with geopandas this time
#  (presumably to keep geometry for the map spot check below)
sstb_geo = gpd.read_parquet(f'./shape_stops_tracts_borders_{analysis_date}.parquet')
# sstb_geo.query('shape_array_key == @act_6_shape_array').explore()

### handle snap to tracts/borders with stops

In [266]:
def locate_stopping_segments(row, df):
    '''
    For one tsi segment of a shape (row), find the nearest tsi segments
    of that shape where the shape does stop.

    row: a row of df; row.name is used as a label into df.index
    df: segment sequence with has_stop, start_meters and tsi_segment_id
        columns. Neighbours are sliced with row.name - 1 / row.name + 1,
        so this assumes an integer index laid out in travel order

    Rows where has_stop is truthy are returned unchanged. Any other row
    gains a 'stopping_segments' entry holding (id_before, id_after): the
    tsi_segment_id of the stopping row with the largest start_meters
    before this row and of the one with the smallest start_meters after
    it. Either side is None when no such row exists.
    '''
    if row.has_stop:
        return row

    position = row.name
    earlier_stops = df.loc[:(position - 1)].query('has_stop')
    later_stops = df.loc[(position + 1):].query('has_stop')

    nearest_before = None
    if not earlier_stops.empty:
        #  closest stopping segment upstream = largest start_meters
        is_closest = earlier_stops.start_meters == earlier_stops.start_meters.max()
        nearest_before = earlier_stops.loc[is_closest].tsi_segment_id.iloc[0]

    nearest_after = None
    if not later_stops.empty:
        #  closest stopping segment downstream = smallest start_meters
        is_closest = later_stops.start_meters == later_stops.start_meters.min()
        nearest_after = later_stops.loc[is_closest].tsi_segment_id.iloc[0]

    row['stopping_segments'] = (nearest_before, nearest_after)
    return row

In [309]:
def assign_stopping_sequences(joined_df):
    '''
    with a joined trip tsi segment df and shape
    stopping df, reassign the meters and seconds of tsi segments
    (tracts or border zones) where the shape has no stops to the
    nearest tsi segments of the same shape where it does stop

    joined_df: trip-level rows with shape_array_key, start_meters,
        tsi_segment_id, has_stop, tsi_segment_meters and segment_seconds

    returns a new trip-level df (fresh 0..n-1 index) where
    * rows whose segment has a stop are unchanged
    * rows whose segment has no stop appear once per nearest stopping
      segment (before and/or after), with tsi_segment_id replaced by that
      segment and tsi_segment_meters / segment_seconds split evenly
    * rows for shapes with no stopping segment at all are dropped
    has_stop is removed; the helper columns stopping_segments and
    n_stopping_segments are kept. Prints how many distinct segments
    could not be matched.

    Relies on locate_stopping_segments and on tqdm.pandas() having been
    called so that progress_apply exists.
    '''
    cols = ['shape_array_key', 'start_meters', 'tsi_segment_id', 'has_stop']
    simple_sequence_df = (joined_df[cols]
                          .drop_duplicates()
                          .sort_values(['shape_array_key', 'start_meters'])
                          .reset_index(drop=True)
                         )
    fn = lambda df: df.apply(locate_stopping_segments, df=df, axis=1)
    #  tuples will be (None, id) where there are no previous stops, or (id, None) where no subsequent stops
    stopping_sequences_df = simple_sequence_df.groupby('shape_array_key', group_keys=False).progress_apply(fn)
    #  locate_stopping_segments only adds the column for rows without a stop; if every
    #  segment has one the column is missing entirely, so create it rather than crash
    if 'stopping_segments' not in stopping_sequences_df.columns:
        stopping_sequences_df['stopping_segments'] = float('nan')
    #  scrub nones from tuples for accurate count:
    stopping_sequences_df['stopping_segments'] = stopping_sequences_df['stopping_segments'].map(
            lambda y: tuple(x for x in y if x) if isinstance(y, tuple) else y)
    #  rows that already have a stop carry NaN here and count as a single segment
    stopping_sequences_df['n_stopping_segments'] = stopping_sequences_df['stopping_segments'].map(
            lambda y: len(y) if isinstance(y, tuple) else y).fillna(1)
    unassigned = stopping_sequences_df.query('n_stopping_segments == 0')
    print(f'{unassigned.shape[0]} segments out of {stopping_sequences_df.shape[0]} can not be matched to a stop')
    stopping_sequences_df = stopping_sequences_df.query('n_stopping_segments >= 1')
    #  inner merge drops the unassigned segments; explode yields one row per receiving segment.
    #  explode repeats index labels, so reset to keep the index-aligned steps below unambiguous
    joined_df = (joined_df.merge(stopping_sequences_df, on=['has_stop', 'shape_array_key', 'start_meters', 'tsi_segment_id'])
                 .explode('stopping_segments')
                 .reset_index(drop=True)
                )
    #  divide time and distance in tsi segments by number of segments post-explode
    joined_df = joined_df.assign(tsi_segment_meters = joined_df.tsi_segment_meters / joined_df.n_stopping_segments,
                         segment_seconds = joined_df.segment_seconds / joined_df.n_stopping_segments
                        )
    #  replace tsi_segment_id with stopping_segment if present, df can now be aggregated normally on tsi_segment_id
    joined_df['tsi_segment_id'] = joined_df['stopping_segments'].fillna(joined_df['tsi_segment_id'])
    joined_df = joined_df.drop(columns=['has_stop'])
    return joined_df

In [310]:
#  flag which trip tsi segments have a stop for their shape, then move time/distance
#  from segments without stops onto the nearest segments that do have them
joined = attach_stopping_info(trip_tsi_alameda, sstb)
#  NOTE: despite the name, this holds the trip-level frame returned by
#  assign_stopping_sequences, not the per-shape sequence table built inside it
stopping_sequences_df = assign_stopping_sequences(joined)

Progress: 100%|██████████| 501/501 [00:29<00:00, 17.11it/s]


159 segments out of 9855 can not be matched to a stop


### handle snap to bordering tracts

In [327]:
borders = gpd.read_parquet('test_tracts_borders_2025-02-12.parquet')

In [328]:
border_cols = ['tsi_segment_id', 'border_tracts', 'border']

In [329]:
#  pair each row's tract_1 / tract_2 into a tuple, keep only border_cols and drop repeated rows.
#  NOTE: this overwrites borders without tract_1 / tract_2, so re-running the cell
#  without re-reading the parquet first will fail
borders = (
    borders
    .assign(border_tracts=lambda gdf: tuple(zip(gdf.tract_1, gdf.tract_2)))
    [border_cols]
    .drop_duplicates()
)

In [330]:
borders

Unnamed: 0,tsi_segment_id,border_tracts,border
0,06001450604,"(None, None)",False
1,06001450743,"(None, None)",False
2,06001450744,"(None, None)",False
3,06001450745,"(None, None)",False
4,06001451300,"(None, None)",False
...,...,...,...
8633,d2f2ae5a-171d-49d1-9a4b-23079b2a32d4,"(06001440332, 06001440331)",True
9117,9f05c3b7-d069-442d-a9be-e143a42a0764,"(06001440301, 06001440200)",True
9385,06001404501,"(None, None)",False
9670,5bed63b1-0c12-48b6-b3eb-08256dd363f6,"(06001423400, 06001424001)",True


In [331]:
stopping_sequences_df.shape

(734827, 10)

In [333]:
stopping_sequences_df.merge(borders, how='left', on='tsi_segment_id').border.value_counts()

True     465804
False    269023
Name: border, dtype: int64

In [287]:
# stopping_sequences_df.tsi_segment_id.unique()