In [None]:
import sys

In [None]:
import intake
import pandas as pd
import geopandas as gpd
import numpy as np
from calitp_data_analysis import geography_utils
from siuba import *
import gtfs_segments
import shapely

In [None]:
from shared_utils import catalog_utils, rt_dates
GTFS_DATA_DICT = catalog_utils.get_catalog("gtfs_analytics_data")

In [None]:
from segment_speed_utils import helpers

In [None]:
analysis_date = rt_dates.DATES['feb2025']

# Exploratory Analysis

Start working out how to identify and count "border zones", where transit runs along Census tract boundaries and vehicle revenue hours/miles (VRH/VRM) should be split between the adjacent tracts (avoiding double-counting or arbitrary allocation to only one border tract).

## trip - linestring interpolation

Could we reuse this existing `rt_utils` code? https://github.com/cal-itp/data-analyses/blob/530f2d5cf4419a2403d6485845d91ac4bc65e672/_shared_utils/shared_utils/rt_utils.py#L574-L579

https://github.com/cal-itp/data-analyses/blob/530f2d5cf4419a2403d6485845d91ac4bc65e672/rt_delay/rt_analysis/rt_parser.py#L90

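* `fillna` in both directions: fill missing arrival times from departure times, and missing departure times from arrival times
* then `dropna` to remove any rows that are still missing a time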

## Projected stop times (st) via `stop_times_direction`

### is this accurate?

In [None]:
# Build the path to the stop_times_direction table for the analysis date from
# the catalog entries, then load it with every column (geometry included),
# since the cells below inspect the stop point geometry.
path = (
    f'{GTFS_DATA_DICT.rt_vs_schedule_tables.dir}'
    f'{GTFS_DATA_DICT.rt_vs_schedule_tables.stop_times_direction}'
    f'_{analysis_date}.parquet'
)
st_dir = gpd.read_parquet(path)

In [None]:
test = st_dir.query('trip_instance_key == "98c8b779600cc0c399755929110a83c4"').sort_values('stop_sequence')

In [None]:
test.crs

In [None]:
# `shapes` is otherwise first loaded much further down the notebook, so this
# cell raised a NameError on a fresh kernel (Restart & Run All). Load it here,
# using the same call as the later cell, before reprojecting to the stop CRS.
shapes = helpers.import_scheduled_shapes(analysis_date, crs=geography_utils.WGS84)
shapes_proj = shapes.to_crs(test.crs)

In [None]:
test_geom = shapes_proj.query('shape_array_key == "3caab5c44277cbdc8fbc755bc0ea7633"').geometry.iloc[0]

In [None]:
test_geom.project(test.geometry.iloc[0])

In [None]:
test

### functions

In [None]:
def attach_projected_stop_times(analysis_date: str) -> pd.DataFrame:
    '''
    Combine scheduled stop_times with trip keys and the `stop_meters` value
    from the stop_times_direction table.

    Steps:
    * scheduled stop_times are merged to scheduled trips on
      (feed_key, trip_id) to pick up trip_instance_key and shape_array_key
    * the result is merged to stop_times_direction on
      (trip_instance_key, stop_sequence, stop_id) to pick up stop_meters
      (presumably the stop's projected position along its shape, in meters --
      TODO confirm against the stop_times_direction table docs)

    Both merges use the pandas default (inner join), so stop_times without a
    matching trip or without a stop_times_direction row are dropped.

    analysis_date: date string selecting the tables to read,
        e.g. rt_dates.DATES['feb2025']

    Returns the merged stop_times dataframe.
    '''
    path = (
        f'{GTFS_DATA_DICT.rt_vs_schedule_tables.dir}'
        f'{GTFS_DATA_DICT.rt_vs_schedule_tables.stop_times_direction}'
        f'_{analysis_date}.parquet'
    )
    ST_DIR_COLS = ['trip_instance_key', 'stop_sequence', 'stop_meters', 'stop_id']
    # Geometry is not needed here, so read only the listed columns instead of
    # loading the whole table and discarding most of it. Plain pandas is used
    # because gpd.read_parquet raises when the column subset excludes geometry.
    st_dir = pd.read_parquet(path, columns=ST_DIR_COLS)

    st = helpers.import_scheduled_stop_times(analysis_date, get_pandas=True)
    trips = helpers.import_scheduled_trips(
        analysis_date,
        columns=['trip_id', 'trip_instance_key', 'feed_key', 'shape_array_key'],
    )
    st = st.merge(trips, on=['feed_key', 'trip_id'])
    return st.merge(st_dir, on=['trip_instance_key', 'stop_sequence', 'stop_id'])

In [None]:
stops = helpers.import_scheduled_stops(analysis_date, columns=['feed_key', 'stop_id', 'geometry'])

In [None]:
st_proj = attach_projected_stop_times(analysis_date)

In [None]:
st_proj.arrival_sec.isna().value_counts()

In [None]:
st_proj.departure_sec.isna().value_counts()

In [None]:
st_proj.query('departure_sec.isna()')

In [None]:
st_proj.query('trip_id == "VL_13_outbound_0755"').sort_values('stop_sequence')

In [None]:
st_proj.to_parquet(f'st_proj_{analysis_date}.parquet')

## projecting TSI interpolation points

### let's use `gtfs_segments`

* project all stop x shape combos either natively or using `gtfs_segments`
    * https://github.com/UTEL-UIUC/gtfs_segments/blob/871447705f7058da3f05f86aa9da42b75996808c/gtfs_segments/geom_utils.py#L437
    * `nearest_points` should be usable...
* need to use either tract or intersection_hash as stop_id

In [None]:
shapes = helpers.import_scheduled_shapes(analysis_date, crs=geography_utils.WGS84)

In [None]:
test_tract_borders = gpd.read_parquet('test_tracts_borders_2025-02-12.parquet')

In [None]:
# Give every row a single segment identifier: the tract id where present,
# otherwise the border's intersection_hash. The border geometry is dropped
# because the shape geometry is merged in by the next cell.
test_tract_borders = test_tract_borders.assign(
    tsi_segment_id=lambda df: df['tract'].combine_first(df['intersection_hash'])
).drop(columns=['geometry'])

In [None]:
test_tract_borders = shapes.merge(test_tract_borders, on='shape_array_key')

In [None]:
test_tract_borders.head(3)

In [None]:
gtfs_segments_rename = {'shape_array_key': 'trip_id',
                       'tsi_segment_id': 'stop_id'}

In [None]:
# Rename to the column names used by gtfs_segments and add an empty
# arrival_time column (presumably expected by gtfs_segments -- TODO confirm).
test_tract_borders = (
    test_tract_borders
    .rename(columns=gtfs_segments_rename)
    .assign(arrival_time=None)
)

In [None]:
import contextily as cx
import folium
import matplotlib.pyplot as plt
import utm
from matplotlib.figure import Figure
from pyproj import Geod
from scipy.spatial import cKDTree
from shapely.geometry import LineString, Point
from shapely.ops import split

geod = Geod(ellps="WGS84")

In [None]:
def segment_bootstrap(stop_df, k_neighbors=3):
    '''
    `gtfs_segments` wants the df to be _in order_ for the algorithm to work.

    We can't rely on sjoining stops to tracts/borders and using stop_sequence,
    since we still need to track tracts/borders without any stops.

    Apply the first part of gtfs_segments.geom_utils.nearest_points to
    get a first order estimate, use that to sort.

    For each trip_id group, a KD-tree is built from the vertices of the
    group's first `geometry` (the line), and each row's `start` point is
    matched to its `k_neighbors` nearest vertices. The smallest vertex index
    among those neighbors is stored in `bootstrap_meters`. Despite the column
    name, this is a vertex index along the line, not a distance in meters.

    stop_df: dataframe with `trip_id`, `geometry` (objects exposing `.coords`
        for the line) and `start` (objects exposing `.coords` for the point).
        The `bootstrap_meters` column is added to this frame in place.
    k_neighbors: number of nearest line vertices to consider per point.

    Returns stop_df sorted by `bootstrap_meters`. Rows whose trip_id is
    missing keep NaN, and an empty input returns an empty sorted frame.
    '''
    n_rows = len(stop_df)
    # Collect results by row position so the write-back does not depend on
    # the index and does not need a full-frame boolean mask per trip.
    bootstrap = np.full(n_rows, np.nan)
    row_positions = pd.Series(np.arange(n_rows))

    for _, positions in row_positions.groupby(stop_df["trip_id"].to_numpy()):
        pos = positions.to_numpy()
        group = stop_df.iloc[pos]
        geom_line = group["geometry"].iloc[0]
        tree = cKDTree(data=np.array(geom_line.coords))
        stop_coords = [x.coords[0] for x in group["start"]]
        _, np_inds = tree.query(stop_coords, workers=-1, k=k_neighbors)
        # With k=1 the query returns a 1-D array; force (n_points, k) so the
        # row-wise minimum works for every k.
        np_inds = np.asarray(np_inds).reshape(len(stop_coords), -1)
        bootstrap[pos] = np_inds.min(axis=1)

    stop_df["bootstrap_meters"] = bootstrap
    return stop_df.sort_values('bootstrap_meters')

In [None]:
stop_df = segment_bootstrap(test_tract_borders)

In [None]:
gtfs_segments_rename_inverse = {v: k for k, v in gtfs_segments_rename.items()}

In [None]:
segments_interpolated = gtfs_segments.geom_utils.nearest_points(stop_df).rename(columns=gtfs_segments_rename_inverse)

## interpolation with arrays

In [None]:
st_proj

In [None]:
segments_interpolated

### try one trip

In [None]:
one_trip = st_proj.query('trip_instance_key == "98c8b779600cc0c399755929110a83c4"').sort_values('stop_sequence')

In [None]:
one_trip

In [None]:
one_trip.stop_meters.to_numpy()

In [None]:
one_trip.arrival_sec.to_numpy()

In [None]:
segments_interpolated.explore()