In [None]:
import sys

In [None]:
import intake
import pandas as pd
import geopandas as gpd
import numpy as np
from calitp_data_analysis import geography_utils
from siuba import *
import gtfs_segments
import shapely

In [None]:
from shared_utils import catalog_utils, rt_dates, rt_utils
GTFS_DATA_DICT = catalog_utils.get_catalog("gtfs_analytics_data")

In [None]:
from segment_speed_utils import helpers

In [None]:
from tqdm import tqdm
tqdm.pandas(desc="Progress")

In [None]:
analysis_date = rt_dates.DATES['feb2025']

# Exploratory Analysis

Start working out how to identify and count "border zones", where transit runs along Census Tract boundaries. In those zones, vehicle revenue hours (VRH) and vehicle revenue miles (VRM) should be split between the bordering tracts, to avoid double-counting or arbitrarily allocating everything to only one of the border tracts.

## trip - linestring interpolation

could use this again? https://github.com/cal-itp/data-analyses/blob/530f2d5cf4419a2403d6485845d91ac4bc65e672/_shared_utils/shared_utils/rt_utils.py#L574-L579

https://github.com/cal-itp/data-analyses/blob/530f2d5cf4419a2403d6485845d91ac4bc65e672/rt_delay/rt_analysis/rt_parser.py#L90

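* fill missing values in both directions: `arrival_sec` from `departure_sec`, and `departure_sec` from `arrival_sec`
* then drop rows where the time is still missing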

## Projected stop times via `stop_times_direction`

### functions

In [None]:
def attach_projected_stop_times(analysis_date: str):
    '''
    Build scheduled stop times that also carry each stop's `stop_meters`.

    Steps:
    * read the `stop_times_direction` parquet for `analysis_date`, keeping only
      trip_instance_key, stop_sequence, stop_meters and stop_id
      (stop_meters is presumably the stop's distance along its shape -- confirm)
    * import scheduled stop times (as pandas) and scheduled trips
    * join stop times to trips on (feed_key, trip_id) to pick up
      trip_instance_key and shape_array_key
    * join the result to the stop_times_direction columns on
      (trip_instance_key, stop_sequence, stop_id)

    Both joins use the default inner merge, so rows without a match are dropped.
    '''
    rt_tables = GTFS_DATA_DICT.rt_vs_schedule_tables
    stop_times_direction_path = (
        f'{rt_tables.dir}{rt_tables.stop_times_direction}_{analysis_date}.parquet'
    )
    projected_cols = ['trip_instance_key', 'stop_sequence', 'stop_meters', 'stop_id']
    projected_stops = gpd.read_parquet(stop_times_direction_path)[projected_cols]

    trip_cols = ['trip_id', 'trip_instance_key', 'feed_key', 'shape_array_key']
    stop_times = helpers.import_scheduled_stop_times(analysis_date, get_pandas=True)
    scheduled_trips = helpers.import_scheduled_trips(analysis_date, columns=trip_cols)

    stop_times_with_keys = stop_times.merge(scheduled_trips, on=['feed_key', 'trip_id'])
    return stop_times_with_keys.merge(
        projected_stops,
        on=['trip_instance_key', 'stop_sequence', 'stop_id'],
    )

In [None]:
st_proj = attach_projected_stop_times(analysis_date)

In [None]:
st_proj.arrival_sec.isna().value_counts()

In [None]:
st_proj.departure_sec.isna().value_counts()

In [None]:
# st_proj.query('departure_sec.isna() & arrival_sec.isna()') # same subset, no point in fillna

In [None]:
# st_proj.to_parquet(f'st_proj_{analysis_date}.parquet')

In [None]:
# Reuse the cached projected stop times when the parquet exists. On a fresh
# checkout there is no cache yet (the write above is commented out), so instead
# of failing, persist the frame computed by attach_projected_stop_times.
st_proj_cache_path = f'st_proj_{analysis_date}.parquet'
try:
    st_proj = pd.read_parquet(st_proj_cache_path)
except FileNotFoundError:
    st_proj.to_parquet(st_proj_cache_path)

## projecting TSI interpolation points

In [None]:
shapes = helpers.import_scheduled_shapes(analysis_date, crs=geography_utils.CA_NAD83Albers_m)

In [None]:
def read_tracts_borders(analysis_date, shapes):
    '''
    Load the test tract-border segments and locate each one along its shape.

    Reads `test_tracts_borders_{analysis_date}.parquet`, drops its own geometry
    column and attaches the shape geometry from `shapes` (joined on
    shape_array_key, renamed to `shape_geometry`). `start_meters` is the result
    of projecting the `start` column onto the shape geometry
    (assumes `start` holds point geometries in the same CRS as `shapes` -- confirm).

    Returns the merged frame sorted by start_meters (across all shapes) with a
    fresh 0..n-1 index.
    '''
    border_attrs = (
        gpd.read_parquet(f'test_tracts_borders_{analysis_date}.parquet')
        .drop(columns=['geometry'])
    )

    borders_on_shapes = shapes.merge(border_attrs, on='shape_array_key').rename(
        columns={'geometry': 'shape_geometry'}
    )

    distance_along_shape = borders_on_shapes.shape_geometry.project(borders_on_shapes.start)
    return (
        borders_on_shapes
        .assign(start_meters=distance_along_shape)
        .sort_values('start_meters')
        .reset_index(drop=True)
    )

In [None]:
test_tract_borders = read_tracts_borders(analysis_date, shapes)

## interpolation with arrays

### try one trip

In [None]:
%%time

one_trip = st_proj.query('trip_instance_key == "98c8b779600cc0c399755929110a83c4"').sort_values('stop_sequence')

In [None]:
one_trip.head(3)

In [None]:
shape_array = one_trip.stop_meters.to_numpy()
dt_float_array = one_trip.arrival_sec.to_numpy()

In [None]:
# Interpolate this one trip's arrival_sec at every border segment's start_meters,
# using the trip's stop positions (shape_array) and stop arrivals (dt_float_array).
test_tract_borders['arrival_sec'] = test_tract_borders.start_meters.apply(
    rt_utils.time_at_position_numba, shape_array=shape_array, dt_float_array = dt_float_array)
# arrival_sec_next = interpolated arrival at the following row (rows were sorted by
# start_meters in read_tracts_borders). This must run before the first row is
# overwritten below, so the shift only sees interpolated values.
test_tract_borders = test_tract_borders.assign(arrival_sec_next = test_tract_borders.arrival_sec.shift(-1),
                                               trip_instance_key = one_trip.trip_instance_key.iloc[0])
# Pin the first row's start to the trip's earliest arrival and the last row's end to
# its latest arrival. Label 0 is the first row because read_tracts_borders resets the index.
test_tract_borders.loc[0,'arrival_sec'] = one_trip.arrival_sec.min()
test_tract_borders.loc[test_tract_borders.index.max(),'arrival_sec_next'] = one_trip.arrival_sec.max()
# Time attributed to each border segment for this trip.
test_tract_borders = test_tract_borders.assign(segment_seconds = test_tract_borders.arrival_sec_next - test_tract_borders.arrival_sec)

### create function -- with `test_tract_borders`

In [None]:
test_tract_borders.columns

In [None]:
tracts_borders_cols = ['shape_array_key', 'tsi_segment_id', 'start_meters', 'tsi_segment_meters']

In [None]:
tracts_borders = read_tracts_borders(analysis_date, shapes)[tracts_borders_cols]

In [None]:
# One row per (border segment, trip): every trip picks up the border segments
# of the shape it runs on.
tracts_borders_trips = tracts_borders.merge(
    st_proj[['shape_array_key', 'trip_instance_key']].drop_duplicates(),
    on='shape_array_key',
)

In [None]:
tracts_borders_trips

In [None]:
tracts_borders_trips.drop_duplicates(subset=['trip_instance_key']).shape

In [None]:
def tract_border_time_by_trip(tracts_borders_trip_df: pd.DataFrame, st_proj_df: pd.DataFrame) -> pd.DataFrame:
    '''
    Estimate how long one trip spends in each of its tract-border segments.

    Parameters
    ----------
    tracts_borders_trip_df
        Border segments for a single trip (one groupby group). Must contain
        `trip_instance_key` and `start_meters`; only the first row's
        trip_instance_key is used to look up the trip. Not modified.
    st_proj_df
        Stop times for (at least) that trip, with `trip_instance_key`,
        `stop_sequence`, `stop_meters` and `arrival_sec`.

    Returns
    -------
    A new frame, ordered by start_meters, with the input columns plus
    `arrival_sec` (arrival interpolated at the segment start),
    `arrival_sec_next` (arrival at the next segment start) and
    `segment_seconds` (their difference). The first segment starts at the
    trip's earliest arrival and the last segment ends at its latest arrival.

    NOTE(review): stop times with missing arrival_sec are passed to the
    interpolation as-is; confirm how rt_utils.time_at_position_numba treats NaN.
    '''
    trip_key = tracts_borders_trip_df.trip_instance_key.iloc[0]
    one_trip = (st_proj_df[st_proj_df.trip_instance_key == trip_key]
                .sort_values('stop_sequence'))
    shape_array = one_trip.stop_meters.to_numpy()
    dt_float_array = one_trip.arrival_sec.to_numpy()

    # shift(-1) below pairs each segment with the *next row*, so the rows must be in
    # along-shape order regardless of how the caller (pandas or a dask shuffle) hands
    # them over. mergesort is stable: rows already in order keep their order.
    # sort_values returns a new frame, so the caller's group is never mutated.
    borders = tracts_borders_trip_df.sort_values('start_meters', kind='mergesort')
    borders = borders.assign(
        arrival_sec = borders.start_meters.apply(
            rt_utils.time_at_position_numba, shape_array=shape_array, dt_float_array=dt_float_array)
    )
    # computed from the interpolated values, before the first row is overwritten
    borders = borders.assign(arrival_sec_next = borders.arrival_sec.shift(-1),
                             trip_instance_key = trip_key)

    # Pin the ends by position rather than by index label: the smallest/largest label
    # is not necessarily the first/last segment, and labels may repeat.
    borders.iloc[0, borders.columns.get_loc('arrival_sec')] = one_trip.arrival_sec.min()
    borders.iloc[-1, borders.columns.get_loc('arrival_sec_next')] = one_trip.arrival_sec.max()

    return borders.assign(segment_seconds = borders.arrival_sec_next - borders.arrival_sec)

In [None]:
tracts_borders_trips

In [None]:
# Smoke test on a sample of trips. Sample by *trip*, not by row: the per-trip function
# shifts arrivals within a trip and pins the last row to the trip's final arrival, so a
# row-wise head() that cuts a trip short produces wrong segment_seconds for that trip.
# The sampled trips are the ones appearing in the first 1000 rows, with all their rows kept.
many_trip_test_keys = tracts_borders_trips.head(1000).trip_instance_key.unique()
many_trip_test =(tracts_borders_trips[tracts_borders_trips.trip_instance_key.isin(many_trip_test_keys)]
                .groupby('trip_instance_key', group_keys=False)
                .progress_apply(tract_border_time_by_trip, st_proj_df = st_proj))

In [None]:
many_trip_test

In [None]:
meta = many_trip_test[:0]

In [None]:
# many_trip_test.query('trip_instance_key == "2799c68c8dc1bfca3e445f8b20eaa9ab"')

# many_trip_test.loc[36, 'tsi_segment_id']

In [None]:
import dask.dataframe as dd
import dask_geopandas as dg

In [None]:
trips = tracts_borders_trips.trip_instance_key.unique()

In [None]:
set1 = trips[:4000]

In [None]:
set2 = trips[4000:]

In [None]:
# ddf = dd.from_pandas(tracts_borders_trips.query('trip_instance_key.isin(@set1)'), npartitions=10)

# ddf = (ddf.groupby('trip_instance_key', group_keys=False)
#        .apply(tract_border_time_by_trip, st_proj_df = st_proj, meta=meta))

# %%time 
# df = ddf.compute()

# df.to_parquet(f'trips_set1_tsi_segs_alameda_{analysis_date}.parquet')

In [None]:
# Second batch of trips (set2), split into 10 dask partitions.
ddf = dd.from_pandas(
    tracts_borders_trips.query('trip_instance_key.isin(@set2)'),
    npartitions=10,
)

In [None]:
# Same per-trip interpolation as the pandas run above, evaluated lazily by dask;
# `meta` (the empty slice of the pandas result) supplies the output schema.
ddf = ddf.groupby('trip_instance_key', group_keys=False).apply(
    tract_border_time_by_trip,
    st_proj_df=st_proj,
    meta=meta,
)

In [None]:
%%time 
df = ddf.compute()

In [None]:
df.to_parquet(f'trips_set2_tsi_segs_alameda_{analysis_date}.parquet')

In [None]:
# NOTE: despite the name this is still a sample -- the trips that appear in the first
# 5000 rows. All rows of those trips are kept, because truncating by row would cut
# trips short and the per-trip function would then pin a mid-trip segment to the
# trip's final arrival, giving wrong segment_seconds.
all_trips_keys = tracts_borders_trips.head(5000).trip_instance_key.unique()
all_trips =(tracts_borders_trips[tracts_borders_trips.trip_instance_key.isin(all_trips_keys)]
                .groupby('trip_instance_key', group_keys=False)
                .progress_apply(tract_border_time_by_trip, st_proj_df = st_proj))

In [None]:
# Filename now separates the date with "_" like the other outputs
# (e.g. trips_set2_tsi_segs_alameda_{analysis_date}.parquet).
all_trips.to_parquet(f'trips_tsi_segs_alameda_{analysis_date}.parquet')