In [1]:
import sys

In [2]:
import intake
import pandas as pd
import geopandas as gpd
import numpy as np
from calitp_data_analysis import geography_utils
from siuba import *
import gtfs_segments
import shapely

In [3]:
from shared_utils import catalog_utils, rt_dates, rt_utils
GTFS_DATA_DICT = catalog_utils.get_catalog("gtfs_analytics_data")

In [4]:
from segment_speed_utils import helpers

In [5]:
from tqdm import tqdm
tqdm.pandas(desc="Progress")

In [6]:
analysis_date = rt_dates.DATES['feb2025']

# Exploratory Analysis

Start working out how to identify and count "border zones": places where transit runs along Census Tract boundaries, so that vehicle revenue hours and miles (VRH/VRM) should be split between the adjoining tracts. This avoids double-counting service, or arbitrarily allocating all of it to only one of the border tracts.

## trip - linestring interpolation

Could the interpolation helper in `rt_utils` be reused here? https://github.com/cal-itp/data-analyses/blob/530f2d5cf4419a2403d6485845d91ac4bc65e672/_shared_utils/shared_utils/rt_utils.py#L574-L579

https://github.com/cal-itp/data-analyses/blob/530f2d5cf4419a2403d6485845d91ac4bc65e672/rt_delay/rt_analysis/rt_parser.py#L90

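* `fillna` in both directions between arrival and departure time (fill a missing arrival from the departure, and a missing departure from the arrival)
* then `dropna` to remove rows where the time is still missing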

## Projected stop times via `stop_times_direction`

### functions

In [7]:
def attach_projected_stop_times(analysis_date: str):
    '''
    Join scheduled stop times to the projected position of each stop.

    Steps:
      1. read the `stop_times_direction` table for analysis_date, keeping
         only the trip/stop identifiers and `stop_meters`
         (presumably the stop's distance along its shape -- confirm);
      2. load scheduled stop times and scheduled trips for the same date;
      3. key stop times to trips on feed_key + trip_id, which brings in
         trip_instance_key and shape_array_key;
      4. attach `stop_meters` on trip_instance_key + stop_sequence + stop_id.

    Both merges use the pandas default join (inner), so stop times without
    a matching trip or without a projected position are dropped.

    Returns the merged stop times frame.
    '''
    projected_cols = ['trip_instance_key', 'stop_sequence', 'stop_meters', 'stop_id']
    trip_cols = ['trip_id', 'trip_instance_key', 'feed_key', 'shape_array_key']

    path = f'{GTFS_DATA_DICT.rt_vs_schedule_tables.dir}{GTFS_DATA_DICT.rt_vs_schedule_tables.stop_times_direction}_{analysis_date}.parquet'
    projected_stops = gpd.read_parquet(path)[projected_cols]

    stop_times = helpers.import_scheduled_stop_times(analysis_date, get_pandas=True)
    scheduled_trips = helpers.import_scheduled_trips(analysis_date, columns=trip_cols)

    return (
        stop_times
        .merge(scheduled_trips, on=['feed_key', 'trip_id'])
        .merge(projected_stops, on=['trip_instance_key', 'stop_sequence', 'stop_id'])
    )

In [8]:
st_proj = attach_projected_stop_times(analysis_date)

In [9]:
st_proj.arrival_sec.isna().value_counts()

False    4232118
True       90428
Name: arrival_sec, dtype: int64

In [10]:
st_proj.departure_sec.isna().value_counts()

False    4232118
True       90428
Name: departure_sec, dtype: int64

In [11]:
# st_proj.query('departure_sec.isna() & arrival_sec.isna()') # same subset, no point in fillna

In [12]:
# st_proj.to_parquet(f'st_proj_{analysis_date}.parquet')

In [13]:
st_proj = pd.read_parquet(f'st_proj_{analysis_date}.parquet')

## projecting TSI interpolation points

In [14]:
shapes = helpers.import_scheduled_shapes(analysis_date, crs=geography_utils.CA_NAD83Albers_m)

In [15]:
def read_tracts_borders(analysis_date, shapes):
    '''
    Load the test tract/border segments for analysis_date and locate where
    each one begins along its shape.

    The segment file's own `geometry` column is discarded; the shape
    linestring from `shapes` is attached instead (inner merge on
    shape_array_key) under the name `shape_geometry`. Each segment's `start`
    geometry is then projected onto that linestring to get `start_meters`,
    the distance along the shape in the units of the shapes' CRS.

    Returns the merged frame sorted by start_meters with a fresh 0..n-1
    index. Note the sort is over the whole frame, not per shape.
    '''
    borders = (
        gpd.read_parquet(f'test_tracts_borders_{analysis_date}.parquet')
        .drop(columns=['geometry'])
    )

    with_shapes = shapes.merge(borders, on='shape_array_key').rename(
        columns={'geometry': 'shape_geometry'}
    )

    # row-wise: distance along each shape to that row's segment start
    start_meters = with_shapes.shape_geometry.project(with_shapes.start)

    return (
        with_shapes
        .assign(start_meters=start_meters)
        .sort_values('start_meters')
        .reset_index(drop=True)
    )

In [16]:
test_tract_borders = read_tracts_borders(analysis_date, shapes)

  return lib.line_locate_point(line, other)


## interpolation with arrays

### try one trip

In [17]:
%%time

one_trip = st_proj.query('trip_instance_key == "98c8b779600cc0c399755929110a83c4"').sort_values('stop_sequence')

CPU times: user 111 ms, sys: 565 μs, total: 112 ms
Wall time: 111 ms


In [18]:
one_trip.head(3)

Unnamed: 0,feed_key,feed_timezone,base64_url,trip_id,stop_id,stop_sequence,timepoint,arrival_sec,departure_sec,arrival_hour,departure_hour,trip_instance_key,shape_array_key,stop_meters
2812526,e47b82448b027d26835d46f2f7cbdc1c,US/Pacific,aHR0cHM6Ly9hcGkuNTExLm9yZy90cmFuc2l0L2RhdGFmZW...,7071020,51636,1,1.0,47160.0,47160.0,13.0,13.0,98c8b779600cc0c399755929110a83c4,3caab5c44277cbdc8fbc755bc0ea7633,0.0
2812543,e47b82448b027d26835d46f2f7cbdc1c,US/Pacific,aHR0cHM6Ly9hcGkuNTExLm9yZy90cmFuc2l0L2RhdGFmZW...,7071020,59444,2,0.0,47230.0,47230.0,13.0,13.0,98c8b779600cc0c399755929110a83c4,3caab5c44277cbdc8fbc755bc0ea7633,215.765913
2812555,e47b82448b027d26835d46f2f7cbdc1c,US/Pacific,aHR0cHM6Ly9hcGkuNTExLm9yZy90cmFuc2l0L2RhdGFmZW...,7071020,53108,3,0.0,47329.0,47329.0,13.0,13.0,98c8b779600cc0c399755929110a83c4,3caab5c44277cbdc8fbc755bc0ea7633,537.650981


In [19]:
shape_array = one_trip.stop_meters.to_numpy()
dt_float_array = one_trip.arrival_sec.to_numpy()

In [20]:
# Interpolate the single trip's arrival time at the start of every segment,
# from the trip's stop positions (shape_array) and stop arrival times (dt_float_array).
# NOTE(review): test_tract_borders holds segments for every shape in the file, while
# shape_array/dt_float_array come from one trip -- fine for a smoke test, but rows
# belonging to other shapes are not meaningful here. Confirm before reusing the result.
test_tract_borders['arrival_sec'] = test_tract_borders.start_meters.apply(
    rt_utils.time_at_position_numba, shape_array=shape_array, dt_float_array = dt_float_array)
# arrival_sec_next = arrival time at the start of the following row (i.e. when this segment ends)
test_tract_borders = test_tract_borders.assign(arrival_sec_next = test_tract_borders.arrival_sec.shift(-1),
                                               trip_instance_key = one_trip.trip_instance_key.iloc[0])
# pin the two ends to the trip's first and last scheduled arrival
test_tract_borders.loc[0,'arrival_sec'] = one_trip.arrival_sec.min()
test_tract_borders.loc[test_tract_borders.index.max(),'arrival_sec_next'] = one_trip.arrival_sec.max()
# time spent in each segment
test_tract_borders = test_tract_borders.assign(segment_seconds = test_tract_borders.arrival_sec_next - test_tract_borders.arrival_sec)

### create function -- with `test_tract_borders`

In [21]:
test_tract_borders.columns

Index(['shape_array_key', 'shape_geometry', 'tract', 'tract_1', 'tract_2',
       'intersection_id', 'border', 'start', 'tsi_segment_id',
       'tsi_segment_meters', 'start_meters', 'arrival_sec', 'arrival_sec_next',
       'trip_instance_key', 'segment_seconds'],
      dtype='object')

In [22]:
tracts_borders_cols = ['shape_array_key', 'tsi_segment_id', 'start_meters', 'tsi_segment_meters']

In [23]:
tracts_borders = read_tracts_borders(analysis_date, shapes)[tracts_borders_cols]

  return lib.line_locate_point(line, other)


In [24]:
tracts_borders_trips = tracts_borders.merge(st_proj[['shape_array_key', 'trip_instance_key']].drop_duplicates(), on='shape_array_key')

In [25]:
tracts_borders_trips

Unnamed: 0,shape_array_key,tsi_segment_id,start_meters,tsi_segment_meters,trip_instance_key
0,48c37aba59d6976367d77d8bf5ccfb05,06001425103,0.000000e+00,595.587122,2799c68c8dc1bfca3e445f8b20eaa9ab
1,48c37aba59d6976367d77d8bf5ccfb05,06001425103,0.000000e+00,595.587122,8890c04258cd7b8e550bc63f3f08b986
2,48c37aba59d6976367d77d8bf5ccfb05,06001425103,0.000000e+00,595.587122,73b5546880e7ddf898b9a1f0b6dff637
3,48c37aba59d6976367d77d8bf5ccfb05,06001425103,0.000000e+00,595.587122,264149fb99e6736ff4708b6c86cfd584
4,48c37aba59d6976367d77d8bf5ccfb05,06001425103,0.000000e+00,595.587122,fb3898cce852c0ba6666f2d3fb615bca
...,...,...,...,...,...
163327,b5c44a21e6c36a10b038a66e70b7717b,9265173c-b36b-4239-8d8a-6c29f58406d1,3.185117e+06,847.647219,2e5efd4dfde2affbd02acbaf948b5893
163328,b5c44a21e6c36a10b038a66e70b7717b,06001425103,3.185149e+06,781.315317,2e5efd4dfde2affbd02acbaf948b5893
163329,b5c44a21e6c36a10b038a66e70b7717b,06001422000,3.185937e+06,3802.575398,2e5efd4dfde2affbd02acbaf948b5893
163330,b5c44a21e6c36a10b038a66e70b7717b,06001420400,3.189740e+06,585.207969,2e5efd4dfde2affbd02acbaf948b5893


In [26]:
tracts_borders_trips.drop_duplicates(subset=['trip_instance_key']).shape

(7684, 5)

In [27]:
def tract_border_time_by_trip(tracts_borders_trip_df: pd.DataFrame, st_proj_df: pd.DataFrame):
    '''
    Interpolate when one trip reaches the start of each of its segments and
    derive the seconds it spends in each segment.

    Meant to be applied once per trip, e.g.
    `df.groupby('trip_instance_key', group_keys=False).apply(...)`.

    Parameters
    ----------
    tracts_borders_trip_df : segment rows for a single trip. Needs
        `trip_instance_key` and `start_meters`. Rows must already be in
        travel order (ascending start_meters), because each segment's end
        time is taken from the next row. Only the first row's
        trip_instance_key is used to look up stop times. This frame is NOT
        modified (pandas does not support mutating the group passed to
        groupby.apply).
    st_proj_df : projected stop times for all trips. Needs
        `trip_instance_key`, `stop_sequence`, `stop_meters`, `arrival_sec`.

    Returns
    -------
    A new DataFrame: the input rows plus
        arrival_sec      - interpolated arrival at the segment start; the
                           first row is set to the trip's earliest arrival
        arrival_sec_next - arrival at the next segment's start; the last row
                           is set to the trip's latest arrival
        segment_seconds  - arrival_sec_next - arrival_sec

    Raises
    ------
    ValueError if st_proj_df has no rows for the trip.
    '''
    trip_key = tracts_borders_trip_df.trip_instance_key.iloc[0]

    # NOTE(review): this scans all of st_proj_df once per trip; pre-grouping
    # or indexing st_proj_df by trip_instance_key upstream would be much faster.
    one_trip = (st_proj_df[st_proj_df.trip_instance_key == trip_key]
                .sort_values('stop_sequence'))
    if one_trip.empty:
        raise ValueError(f'no stop times found for trip_instance_key {trip_key!r}')

    shape_array = one_trip.stop_meters.to_numpy()
    # NOTE(review): arrival_sec can contain NaN for some stops; those are
    # passed to the interpolation as-is -- confirm that is intended.
    dt_float_array = one_trip.arrival_sec.to_numpy()

    arrival_sec = tracts_borders_trip_df.start_meters.apply(
        rt_utils.time_at_position_numba, shape_array=shape_array, dt_float_array=dt_float_array)

    # .assign builds a new frame, leaving the caller's / groupby's frame untouched
    segments = tracts_borders_trip_df.assign(
        arrival_sec=arrival_sec,
        arrival_sec_next=arrival_sec.shift(-1),
        trip_instance_key=trip_key,
    )

    # Pin the two ends of the trip by position (first/last row), matching the
    # positional shift(-1) above, rather than by index label.
    segments.iloc[0, segments.columns.get_loc('arrival_sec')] = one_trip.arrival_sec.min()
    segments.iloc[-1, segments.columns.get_loc('arrival_sec_next')] = one_trip.arrival_sec.max()

    return segments.assign(segment_seconds=segments.arrival_sec_next - segments.arrival_sec)

In [28]:
tracts_borders_trips

Unnamed: 0,shape_array_key,tsi_segment_id,start_meters,tsi_segment_meters,trip_instance_key
0,48c37aba59d6976367d77d8bf5ccfb05,06001425103,0.000000e+00,595.587122,2799c68c8dc1bfca3e445f8b20eaa9ab
1,48c37aba59d6976367d77d8bf5ccfb05,06001425103,0.000000e+00,595.587122,8890c04258cd7b8e550bc63f3f08b986
2,48c37aba59d6976367d77d8bf5ccfb05,06001425103,0.000000e+00,595.587122,73b5546880e7ddf898b9a1f0b6dff637
3,48c37aba59d6976367d77d8bf5ccfb05,06001425103,0.000000e+00,595.587122,264149fb99e6736ff4708b6c86cfd584
4,48c37aba59d6976367d77d8bf5ccfb05,06001425103,0.000000e+00,595.587122,fb3898cce852c0ba6666f2d3fb615bca
...,...,...,...,...,...
163327,b5c44a21e6c36a10b038a66e70b7717b,9265173c-b36b-4239-8d8a-6c29f58406d1,3.185117e+06,847.647219,2e5efd4dfde2affbd02acbaf948b5893
163328,b5c44a21e6c36a10b038a66e70b7717b,06001425103,3.185149e+06,781.315317,2e5efd4dfde2affbd02acbaf948b5893
163329,b5c44a21e6c36a10b038a66e70b7717b,06001422000,3.185937e+06,3802.575398,2e5efd4dfde2affbd02acbaf948b5893
163330,b5c44a21e6c36a10b038a66e70b7717b,06001420400,3.189740e+06,585.207969,2e5efd4dfde2affbd02acbaf948b5893


In [29]:
# Smoke test on the first 1000 (segment, trip) rows only.
# NOTE: head(1000) can cut a trip's segment list short; the last kept row of each
# trip then gets the trip's final arrival as arrival_sec_next, so its
# segment_seconds is inflated. Do not read the tail rows as real segment times.
many_trip_test =(tracts_borders_trips.head(1000)
                .groupby('trip_instance_key', group_keys=False)
                .progress_apply(tract_border_time_by_trip, st_proj_df = st_proj))

Progress: 100%|██████████| 123/123 [00:24<00:00,  5.02it/s]


In [30]:
many_trip_test

Unnamed: 0,shape_array_key,tsi_segment_id,start_meters,tsi_segment_meters,trip_instance_key,arrival_sec,arrival_sec_next,segment_seconds
0,48c37aba59d6976367d77d8bf5ccfb05,06001425103,0.000000,595.587122,2799c68c8dc1bfca3e445f8b20eaa9ab,30300.000000,30389.956253,89.956253
1,48c37aba59d6976367d77d8bf5ccfb05,06001425103,0.000000,595.587122,8890c04258cd7b8e550bc63f3f08b986,67500.000000,67577.105360,77.105360
2,48c37aba59d6976367d77d8bf5ccfb05,06001425103,0.000000,595.587122,73b5546880e7ddf898b9a1f0b6dff637,24600.000000,24702.807146,102.807146
3,48c37aba59d6976367d77d8bf5ccfb05,06001425103,0.000000,595.587122,264149fb99e6736ff4708b6c86cfd584,70800.000000,70877.105360,77.105360
4,48c37aba59d6976367d77d8bf5ccfb05,06001425103,0.000000,595.587122,fb3898cce852c0ba6666f2d3fb615bca,21000.000000,21077.105360,77.105360
...,...,...,...,...,...,...,...,...
995,0d9bcf2fd73ef369e69b5db1289f0856,42517509-014f-4e0c-ae89-9b83d2f9fe86,2069.992586,147.298115,d7d8c2a94857973120b9cd6a9c0c701a,60516.630063,63840.000000,3323.369937
996,0d9bcf2fd73ef369e69b5db1289f0856,42517509-014f-4e0c-ae89-9b83d2f9fe86,2069.992586,147.298115,2c6335ddf08eda5e4a39aca61a181520,50976.630063,53820.000000,2843.369937
997,0d9bcf2fd73ef369e69b5db1289f0856,42517509-014f-4e0c-ae89-9b83d2f9fe86,2069.992586,147.298115,bfd2ba6a7c17fe49acb6e8b1fdd30396,83606.495695,85620.000000,2013.504305
998,0d9bcf2fd73ef369e69b5db1289f0856,42517509-014f-4e0c-ae89-9b83d2f9fe86,2069.992586,147.298115,4cbd98fa20dc81c23eefcf31995f7ec1,39456.630063,42300.000000,2843.369937


In [31]:
meta = many_trip_test[:0]

In [32]:
# many_trip_test.query('trip_instance_key == "2799c68c8dc1bfca3e445f8b20eaa9ab"')

# many_trip_test.loc[36, 'tsi_segment_id']

In [33]:
import dask.dataframe as dd
import dask_geopandas as dg

In [34]:
trips = tracts_borders_trips.trip_instance_key.unique()

In [35]:
set1 = trips[:4000]

In [36]:
set2 = trips[4000:]

In [37]:
# ddf = dd.from_pandas(tracts_borders_trips.query('trip_instance_key.isin(@set1)'), npartitions=10)

# ddf = (ddf.groupby('trip_instance_key', group_keys=False)
#        .apply(tract_border_time_by_trip, st_proj_df = st_proj, meta=meta))

# %%time 
# df = ddf.compute()

# df.to_parquet(f'trips_set1_tsi_segs_alameda_{analysis_date}.parquet')

In [38]:
ddf = dd.from_pandas(tracts_borders_trips.query('trip_instance_key.isin(@set2)'), npartitions=10)

In [39]:
ddf = (ddf.groupby('trip_instance_key', group_keys=False)
       .apply(tract_border_time_by_trip, st_proj_df = st_proj, meta=meta))

In [40]:
%%time 
df = ddf.compute()

CPU times: user 9min 26s, sys: 1min 20s, total: 10min 47s
Wall time: 8min 58s


In [41]:
df.to_parquet(f'trips_set2_tsi_segs_alameda_{analysis_date}.parquet')

In [42]:
# NOTE(review): despite the name, this covers only the first 5000 (segment, trip)
# rows, not every trip. As with the 1000-row test, trips truncated by head()
# get an inflated segment_seconds on their last kept row.
all_trips =(tracts_borders_trips.head(5000)
                .groupby('trip_instance_key', group_keys=False)
                .progress_apply(tract_border_time_by_trip, st_proj_df = st_proj))

Progress: 100%|██████████| 299/299 [00:59<00:00,  4.99it/s]


In [None]:
# NOTE(review): filename has no underscore before the date, unlike the
# 'trips_set1_/set2_tsi_segs_alameda_{date}' outputs -- confirm which name readers expect.
all_trips.to_parquet(f'trips_tsi_segs_alameda{analysis_date}.parquet')

In [None]:
analysis_date

In [None]:
def read_shapes_stopping_in_seg(analysis_date):
    '''
    Load the (shape_array_key, tsi_segment_id) pairs stored in the
    shape-stops parquet for analysis_date and flag every row with
    has_stop = True.

    Rows are returned exactly as stored, so a pair may appear more than once
    if the file has several rows for it.
    '''
    key_cols = ['shape_array_key', 'tsi_segment_id']
    shape_stops = pd.read_parquet(f'./shape_stops_tracts_borders_{analysis_date}.parquet')
    return shape_stops[key_cols].assign(has_stop=True)

In [None]:
sstb = read_shapes_stopping_in_seg(analysis_date)

In [None]:
sstb

In [None]:
def attach_stopping_info(trip_segment_df, shape_stopping_df):
    '''
    Flag each trip segment with whether its shape has a stop in that segment.

    Parameters
    ----------
    trip_segment_df : one row per trip segment; needs `shape_array_key`
        and `tsi_segment_id`.
    shape_stopping_df : (shape_array_key, tsi_segment_id) pairs with a
        `has_stop` column. May contain several rows per pair; only the first
        row of each pair is used.

    Returns
    -------
    A new DataFrame with exactly the rows of trip_segment_df (left merge) and
    a plain-bool `has_stop` column that is False where no pair matched.
    Neither input is modified.
    '''
    merge_keys = ['shape_array_key', 'tsi_segment_id']

    # One row per key: otherwise the left merge repeats a trip segment once
    # per matching stop row, and its segment time would be counted twice.
    stopping = shape_stopping_df.drop_duplicates(subset=merge_keys)

    df = trip_segment_df.merge(stopping, how='left', on=merge_keys)

    # Unmatched rows come back as missing. Go through the nullable boolean
    # dtype so the fill gives a real bool column without relying on
    # object-dtype downcasting in fillna.
    has_stop = df['has_stop'].astype('boolean').fillna(False).astype(bool)
    return df.assign(has_stop=has_stop)

In [None]:
attach_stopping_info(many_trip_test, sstb)

In [None]:
many_trip_test.merge(sstb, how='left', on=['shape_array_key', 'tsi_segment_id'])