In [1]:
import sys

In [34]:
import intake
import numpy as np
import pandas as pd
import geopandas as gpd
from calitp_data_analysis import geography_utils
from siuba import *
import gtfs_segments
import shapely

In [3]:
from shared_utils import catalog_utils, rt_dates
# Catalog of GTFS analytics datasets; entries are read by attribute further down
# (e.g. GTFS_DATA_DICT.rt_vs_schedule_tables.dir).
GTFS_DATA_DICT = catalog_utils.get_catalog("gtfs_analytics_data")

In [4]:
from segment_speed_utils import helpers

In [5]:
# Single service date used throughout: passed to the helpers.import_* calls
# and embedded in the parquet file names below.
analysis_date = rt_dates.DATES['feb2025']

# Exploratory Analysis

Goal: work out how to identify and count "border zones" — places where transit runs along Census Tract boundaries — so that vehicle revenue hours/miles (VRH/VRM) can be split between the adjoining tracts, rather than being double-counted or arbitrarily allocated to only one border tract.

## trip - linestring interpolation

Could we reuse this existing code? https://github.com/cal-itp/data-analyses/blob/530f2d5cf4419a2403d6485845d91ac4bc65e672/_shared_utils/shared_utils/rt_utils.py#L574-L579

https://github.com/cal-itp/data-analyses/blob/530f2d5cf4419a2403d6485845d91ac4bc65e672/rt_delay/rt_analysis/rt_parser.py#L90

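* Fill missing values in both directions (`fillna`): arrival time from departure time, and departure time from arrival time
* Then drop the rows that are still missing a time (`dropna`)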

## Projected stop times via `stop_times_direction`

In [6]:
def attach_projected_stop_times(analysis_date: str):
    '''
    Attach the `stop_meters` column from the `stop_times_direction` table
    to the scheduled stop times for one service date.

    Steps:
    * read the `stop_times_direction` parquet for `analysis_date`, keeping only
      `trip_instance_key`, `stop_sequence` and `stop_meters`
    * import scheduled stop times and scheduled trips through `helpers`
    * join stop times to trips on `feed_key` + `trip_id` to pick up
      `trip_instance_key`
    * join that result to the projected values on
      `trip_instance_key` + `stop_sequence`

    Both joins are inner joins, so stop times without a matching trip or
    without a matching `stop_times_direction` row are dropped.

    Parameters
    ----------
    analysis_date : str
        Service date; used in the parquet file name and passed to the
        `helpers.import_*` functions.

    Returns
    -------
    The merged stop times table with `trip_instance_key` and `stop_meters`.
    '''
    stop_times_direction_path = f'{GTFS_DATA_DICT.rt_vs_schedule_tables.dir}{GTFS_DATA_DICT.rt_vs_schedule_tables.stop_times_direction}_{analysis_date}.parquet'
    projection_cols = ['trip_instance_key', 'stop_sequence', 'stop_meters']
    stop_projections = gpd.read_parquet(stop_times_direction_path)[projection_cols]

    scheduled_stop_times = helpers.import_scheduled_stop_times(analysis_date, get_pandas=True)
    trip_keys = helpers.import_scheduled_trips(
        analysis_date,
        columns=['trip_id', 'trip_instance_key', 'feed_key'],
    )

    return (
        scheduled_stop_times
        .merge(trip_keys, on=['feed_key', 'trip_id'], how='inner')
        .merge(stop_projections, on=['trip_instance_key', 'stop_sequence'], how='inner')
    )

In [None]:
# Scheduled stops for the analysis date, limited to the feed/stop keys and geometry.
stops = helpers.import_scheduled_stops(
    analysis_date,
    columns=[
        'feed_key',
        'stop_id',
        'geometry',
    ],
)

In [7]:
# Scheduled stop times joined with stop_meters from stop_times_direction.
st_proj = attach_projected_stop_times(analysis_date)

In [10]:
# How many stop times have no arrival_sec? (True = missing)
st_proj.arrival_sec.isna().value_counts()

False    4236225
True       90804
Name: arrival_sec, dtype: int64

In [11]:
# Same check for departure_sec; the counts match the arrival_sec counts above.
st_proj.departure_sec.isna().value_counts()

False    4236225
True       90804
Name: departure_sec, dtype: int64

In [12]:
# Inspect the stop times that have no departure_sec.
st_proj.query('departure_sec.isna()')

Unnamed: 0,feed_key,feed_timezone,base64_url,trip_id,stop_id,stop_sequence,timepoint,arrival_sec,departure_sec,arrival_hour,departure_hour,trip_instance_key,stop_meters
3103,c8c2eeefda895cf2e9971029b998af1a,US/Pacific,aHR0cHM6Ly9kYXRhLnRyaWxsaXVtdHJhbnNpdC5jb20vZ3...,t_5674198_b_33098_tn_0,7015,17,0.0,,,,,c6a594134f701ab6f78ef49fd413a124,6081.455776
3104,c8c2eeefda895cf2e9971029b998af1a,US/Pacific,aHR0cHM6Ly9kYXRhLnRyaWxsaXVtdHJhbnNpdC5jb20vZ3...,t_5674198_b_33098_tn_0,801,21,0.0,,,,,c6a594134f701ab6f78ef49fd413a124,5060.554260
3108,c8c2eeefda895cf2e9971029b998af1a,US/Pacific,aHR0cHM6Ly9kYXRhLnRyaWxsaXVtdHJhbnNpdC5jb20vZ3...,t_5674198_b_33098_tn_0,7002,2,0.0,,,,,c6a594134f701ab6f78ef49fd413a124,2517.475724
3109,c8c2eeefda895cf2e9971029b998af1a,US/Pacific,aHR0cHM6Ly9kYXRhLnRyaWxsaXVtdHJhbnNpdC5jb20vZ3...,t_5674198_b_33098_tn_0,803,23,0.0,,,,,c6a594134f701ab6f78ef49fd413a124,5060.554260
3110,c8c2eeefda895cf2e9971029b998af1a,US/Pacific,aHR0cHM6Ly9kYXRhLnRyaWxsaXVtdHJhbnNpdC5jb20vZ3...,t_5674198_b_33098_tn_0,805,25,0.0,,,,,c6a594134f701ab6f78ef49fd413a124,5341.559600
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4325324,0319b0617cc88876f918f8c63abc1ac0,America/Los_Angeles,aHR0cHM6Ly91bml0cmFucy51Y2RhdmlzLmVkdS9tZWRpYS...,VL_13_outbound_0755,22286,3,0.0,,,,,977e6569b6ca81091c34d146f897c585,0.000000
4325325,0319b0617cc88876f918f8c63abc1ac0,America/Los_Angeles,aHR0cHM6Ly91bml0cmFucy51Y2RhdmlzLmVkdS9tZWRpYS...,VL_13_outbound_0755,22306,4,0.0,,,,,977e6569b6ca81091c34d146f897c585,435.273048
4325329,0319b0617cc88876f918f8c63abc1ac0,America/Los_Angeles,aHR0cHM6Ly91bml0cmFucy51Y2RhdmlzLmVkdS9tZWRpYS...,D_13_outbound_0755,22225,3,0.0,,,,,3b1e14196ab4aab37d25b36bca550f95,1535.322402
4326230,32db8314a8fb7d99b61296c37545c460,America/Los_Angeles,aHR0cHM6Ly93d3cuY2l0eW9mZ2xlbmRvcmEuZ292L2ZpbG...,Gold-Line-Commuter-Shuttle-North_Eastbound-wkd...,2619576,3,0.0,,,,,48d7051b118a9cab8357908c201deae1,4518.785645


In [16]:
# One affected trip in stop order: in the output, the stops without times
# have timepoint == 0 and sit between stops that do have times.
st_proj.query('trip_id == "VL_13_outbound_0755"').sort_values('stop_sequence')

Unnamed: 0,feed_key,feed_timezone,base64_url,trip_id,stop_id,stop_sequence,timepoint,arrival_sec,departure_sec,arrival_hour,departure_hour,trip_instance_key,stop_meters
4325323,0319b0617cc88876f918f8c63abc1ac0,America/Los_Angeles,aHR0cHM6Ly91bml0cmFucy51Y2RhdmlzLmVkdS9tZWRpYS...,VL_13_outbound_0755,22256,1,1.0,28500.0,28500.0,7.0,7.0,977e6569b6ca81091c34d146f897c585,4654.515682
4325326,0319b0617cc88876f918f8c63abc1ac0,America/Los_Angeles,aHR0cHM6Ly91bml0cmFucy51Y2RhdmlzLmVkdS9tZWRpYS...,VL_13_outbound_0755,22361,2,1.0,28680.0,28680.0,7.0,7.0,977e6569b6ca81091c34d146f897c585,322.88297
4325324,0319b0617cc88876f918f8c63abc1ac0,America/Los_Angeles,aHR0cHM6Ly91bml0cmFucy51Y2RhdmlzLmVkdS9tZWRpYS...,VL_13_outbound_0755,22286,3,0.0,,,,,977e6569b6ca81091c34d146f897c585,0.0
4325325,0319b0617cc88876f918f8c63abc1ac0,America/Los_Angeles,aHR0cHM6Ly91bml0cmFucy51Y2RhdmlzLmVkdS9tZWRpYS...,VL_13_outbound_0755,22306,4,0.0,,,,,977e6569b6ca81091c34d146f897c585,435.273048
4325327,0319b0617cc88876f918f8c63abc1ac0,America/Los_Angeles,aHR0cHM6Ly91bml0cmFucy51Y2RhdmlzLmVkdS9tZWRpYS...,VL_13_outbound_0755,22304,5,1.0,28920.0,28920.0,8.0,8.0,977e6569b6ca81091c34d146f897c585,2899.976133


In [18]:
# Save a copy for reuse; the path is relative, so the file lands in the
# notebook's current working directory.
st_proj.to_parquet(f'st_proj_{analysis_date}.parquet')

## projecting TSI interpolation points

### let's use `gtfs_segments`

* project all stop x shape combos either natively or using `gtfs_segments`
    * https://github.com/UTEL-UIUC/gtfs_segments/blob/871447705f7058da3f05f86aa9da42b75996808c/gtfs_segments/geom_utils.py#L437
    * `nearest_points` should be usable...
* need to use either tract or intersection_hash as stop_id

In [169]:
# Scheduled shapes for the analysis date, requested in geography_utils.WGS84.
shapes = helpers.import_scheduled_shapes(analysis_date, crs=geography_utils.WGS84)

In [167]:
# Test set of tract / border records read from a local parquet file
# (the code that writes this file is not shown in this part of the notebook).
test_tract_borders = gpd.read_parquet('test_tracts_borders_2025-02-12.parquet')

In [168]:
# Give every row a single identifier: the tract where it is present,
# otherwise the intersection_hash. The geometry column is dropped here.
test_tract_borders = (
    test_tract_borders
    .drop(columns=['geometry'])
    .assign(
        tsi_segment_id=lambda df: df['tract'].combine_first(df['intersection_hash'])
    )
)

In [170]:
# Attach each shape's geometry to its tract / border rows
# (inner join on shape_array_key, the pandas default).
test_tract_borders = shapes.merge(test_tract_borders, on='shape_array_key')

In [171]:
# Quick look at the merged result.
test_tract_borders.head(3)

Unnamed: 0,shape_array_key,geometry,tract,tract_1,tract_2,intersection_hash,border,start,end,tsi_segment_id
0,3caab5c44277cbdc8fbc755bc0ea7633,"LINESTRING (-122.27446 37.80225, -122.27468 37...",6001400300,,,,False,POINT (-122.26217 37.83817),POINT (-122.26171 37.84107),6001400300
1,3caab5c44277cbdc8fbc755bc0ea7633,"LINESTRING (-122.27446 37.80225, -122.27468 37...",6001401100,,,,False,POINT (-122.26587 37.82385),POINT (-122.26233 37.83705),6001401100
2,3caab5c44277cbdc8fbc755bc0ea7633,"LINESTRING (-122.27446 37.80225, -122.27468 37...",6001401300,,,,False,POINT (-122.26898 37.81194),POINT (-122.26588 37.82382),6001401300


In [172]:
# Column names that gtfs_segments is fed below: the shape stands in for the
# trip, and the tract / border segment id stands in for the stop.
gtfs_segments_rename = dict(
    shape_array_key='trip_id',
    tsi_segment_id='stop_id',
)

In [175]:
# Apply the gtfs_segments column names and add an empty arrival_time column.
test_tract_borders = (
    test_tract_borders
    .rename(columns=gtfs_segments_rename)
    .assign(arrival_time=None)
)

In [176]:
import contextily as cx
import folium
import matplotlib.pyplot as plt
import utm
from matplotlib.figure import Figure
from pyproj import Geod
from scipy.spatial import cKDTree
from shapely.geometry import LineString, Point
from shapely.ops import split

# pyproj geodesic calculator on the WGS84 ellipsoid.
# NOTE(review): not referenced by the cells visible here — confirm it is still needed.
geod = Geod(ellps="WGS84")

In [192]:
def segment_bootstrap(stop_df, k_neighbors=3):
    '''
    Sort tract/border segments into approximate along-shape order.

    `gtfs_segments` wants the df to be _in order_ for the algorithm to work.

    We can't rely on sjoining stops to tracts/borders and using stop_sequence,
    since we still need to track tracts/borders without any stops.

    Apply the first part of gtfs_segments.geom_utils.nearest_points to
    get a first order estimate, use that to sort.

    For every `trip_id` group, a KD-tree is built from the vertices of the
    group's first `geometry`, and each row's `start` point is matched to its
    `k_neighbors` nearest vertices. The smallest vertex index among those
    neighbors is stored in `bootstrap_meters`. Despite the column name, this
    is a vertex position along the line, not a distance in meters.

    Parameters
    ----------
    stop_df : DataFrame
        Needs `trip_id`, `geometry` (object exposing `.coords`, one line per
        trip) and `start` (object exposing `.coords`, one point per row).
    k_neighbors : int, default 3
        Number of nearest shape vertices to consider per start point.

    Returns
    -------
    DataFrame
        A copy of `stop_df` with a `bootstrap_meters` column, sorted by that
        column. The input frame is left untouched.
    '''
    ordered = stop_df.copy()
    # Create the column up front so an empty frame can still be sorted.
    ordered["bootstrap_meters"] = np.nan

    for name, group in stop_df.groupby("trip_id"):
        geom_line = group["geometry"].iloc[0]
        tree = cKDTree(data=np.array(geom_line.coords))
        stops = [point.coords[0] for point in group["start"]]
        _, nearest_idx = tree.query(stops, workers=-1, k=k_neighbors)
        # query() drops the neighbor axis when k == 1; restore it so the
        # row-wise minimum works for any k.
        nearest_idx = np.asarray(nearest_idx).reshape(len(stops), -1)
        ordered.loc[ordered.trip_id == name, "bootstrap_meters"] = nearest_idx.min(axis=1)
    return ordered.sort_values('bootstrap_meters')

In [193]:
# Put the tract / border rows in approximate along-shape order first.
stop_df = segment_bootstrap(test_tract_borders)

In [194]:
# Run gtfs_segments' nearest_points on the ordered rows; the displayed
# result includes a snap_start_id column alongside bootstrap_meters.
gtfs_segments.geom_utils.nearest_points(stop_df)

Total trips processed:  16


Unnamed: 0,trip_id,geometry,tract,tract_1,tract_2,intersection_hash,border,start,end,stop_id,arrival_time,bootstrap_meters,snap_start_id
0,3caab5c44277cbdc8fbc755bc0ea7633,"LINESTRING (-122.27446 37.80225, -122.27468 37...",6001403100.0,,,,False,POINT (-122.27446 37.80225),POINT (-122.27280 37.80239),6001403100.0,,0.0,0
1,3caab5c44277cbdc8fbc755bc0ea7633,"LINESTRING (-122.27446 37.80225, -122.27468 37...",,6001403000.0,6001403100.0,-3.476483e+18,True,POINT (-122.27292 37.80244),POINT (-122.27101 37.80459),-3.4764827078068393e+18,,19.0,20
2,3caab5c44277cbdc8fbc755bc0ea7633,"LINESTRING (-122.27446 37.80225, -122.27468 37...",,6001402900.0,6001402800.0,8.444363e+18,True,POINT (-122.27137 37.80405),POINT (-122.26866 37.80924),8.444362949197193e+18,,35.0,36
3,3caab5c44277cbdc8fbc755bc0ea7633,"LINESTRING (-122.27446 37.80225, -122.27468 37...",6001402800.0,,,,False,POINT (-122.26853 37.80919),POINT (-122.26898 37.81194),6001402800.0,,70.0,71
4,3caab5c44277cbdc8fbc755bc0ea7633,"LINESTRING (-122.27446 37.80225, -122.27468 37...",6001401300.0,,,,False,POINT (-122.26898 37.81194),POINT (-122.26588 37.82382),6001401300.0,,92.0,93
5,3caab5c44277cbdc8fbc755bc0ea7633,"LINESTRING (-122.27446 37.80225, -122.27468 37...",6001401100.0,,,,False,POINT (-122.26587 37.82385),POINT (-122.26233 37.83705),6001401100.0,,173.0,173
6,3caab5c44277cbdc8fbc755bc0ea7633,"LINESTRING (-122.27446 37.80225, -122.27468 37...",,6001401100.0,6001400300.0,-8.684109e+18,True,POINT (-122.26234 37.83699),POINT (-122.26211 37.83831),-8.684109331624109e+18,,267.0,268
7,3caab5c44277cbdc8fbc755bc0ea7633,"LINESTRING (-122.27446 37.80225, -122.27468 37...",6001400300.0,,,,False,POINT (-122.26217 37.83817),POINT (-122.26171 37.84107),6001400300.0,,276.0,277
8,3caab5c44277cbdc8fbc755bc0ea7633,"LINESTRING (-122.27446 37.80225, -122.27468 37...",,6001400600.0,6001400400.0,1.701263e+18,True,POINT (-122.26171 37.84106),POINT (-122.26109 37.84529),1.7012630256245944e+18,,296.0,297
9,3caab5c44277cbdc8fbc755bc0ea7633,"LINESTRING (-122.27446 37.80225, -122.27468 37...",,6001400500.0,6001400400.0,9.114762e+18,True,POINT (-122.26127 37.84471),POINT (-122.26007 37.85303),9.114761875024976e+18,,319.0,320
