In [1]:
import sys

In [2]:
import intake
import pandas as pd
import geopandas as gpd
import numpy as np
from calitp_data_analysis import geography_utils
from siuba import *
import gtfs_segments
import shapely

from update_vars import ANALYSIS_DATE, BORDER_BUFFER_METERS, GCS_PATH

In [3]:
from shared_utils import catalog_utils, rt_dates, rt_utils
GTFS_DATA_DICT = catalog_utils.get_catalog("gtfs_analytics_data")

In [4]:
from segment_speed_utils import helpers

In [5]:
from tqdm import tqdm
tqdm.pandas(desc="Progress")

# Prepare df for aggregation

## trip - linestring interpolation

Could we reuse this helper again? https://github.com/cal-itp/data-analyses/blob/530f2d5cf4419a2403d6485845d91ac4bc65e672/_shared_utils/shared_utils/rt_utils.py#L574-L579

https://github.com/cal-itp/data-analyses/blob/530f2d5cf4419a2403d6485845d91ac4bc65e672/rt_delay/rt_analysis/rt_parser.py#L90

* `fillna` in both directions: fill missing arrival times from departure times, and vice versa
* then `dropna` on whatever is still missing

## Projected stop times via `stop_times_direction`

### functions

* TODO: could I just use the `helpers` stop times import with `with_direction=True` here?

In [6]:
def attach_projected_stop_times(analysis_date: str):
    '''
    Combine scheduled stop times with trip keys and projected stop positions.

    Three tables are read through `helpers` for `analysis_date`:
    the stop times imported with `with_direction=True` (only the key
    columns plus `stop_meters`), the plain scheduled stop times, and the
    scheduled trips (`trip_id`, `trip_instance_key`, `feed_key`,
    `shape_array_key`).

    The plain stop times are joined to trips on `feed_key` + `trip_id`, and
    the result is joined to the with-direction table on
    `trip_instance_key` + `stop_sequence` + `stop_id`. Both joins use the
    default `merge` behaviour, so rows without a match on either side are
    dropped.

    `stop_meters` is presumably the stop's distance along its shape --
    TODO confirm against `helpers`.
    '''
    direction_columns = ['trip_instance_key', 'stop_sequence', 'stop_meters', 'stop_id']
    trip_columns = ['trip_id', 'trip_instance_key', 'feed_key', 'shape_array_key']

    stop_times_with_dir = helpers.import_scheduled_stop_times(
        analysis_date,
        columns=direction_columns,
        get_pandas=True,
        with_direction=True,
    )
    stop_times = helpers.import_scheduled_stop_times(analysis_date, get_pandas=True)
    trips = helpers.import_scheduled_trips(analysis_date, columns=trip_columns)

    # Attach trip_instance_key / shape_array_key before joining on them.
    stop_times_keyed = stop_times.merge(trips, on=['feed_key', 'trip_id'])
    return stop_times_keyed.merge(
        stop_times_with_dir,
        on=['trip_instance_key', 'stop_sequence', 'stop_id'],
    )

In [7]:
st_proj = attach_projected_stop_times(ANALYSIS_DATE)

In [8]:
st_proj.departure_sec.isna().value_counts()

False    4399313
True      102084
Name: departure_sec, dtype: int64

In [39]:
trips = helpers.import_scheduled_trips(ANALYSIS_DATE, columns=['trip_id', 'trip_instance_key', 'feed_key',
                                                              'shape_array_key'])

In [9]:
# st_proj.query('departure_sec.isna() & arrival_sec.isna()') # same subset, no point in fillna

## projecting TSI interpolation points

In [10]:
shapes = helpers.import_scheduled_shapes(ANALYSIS_DATE, crs=geography_utils.CA_NAD83Albers_m)

In [27]:
def read_tsi_segs(analysis_date, shapes):
    '''
    Load the saved TSI segments and locate each segment's start along its shape.

    Reads `tsi_segments_{analysis_date}.parquet` from the working directory,
    discards the segment `geometry` column, and joins the remainder to
    `shapes` on `shape_array_key`. The shape geometry is then used to
    project each row's `start` geometry, giving `start_meters`.

    Returns the rows sorted by `start_meters` (across all shapes) with a
    fresh integer index and only these columns: `shape_array_key`,
    `tsi_segment_id`, `start_meters`, `tsi_segment_meters`.
    '''
    segments = (
        gpd.read_parquet(f'tsi_segments_{analysis_date}.parquet')
        .drop(columns=['geometry'])
    )

    merged = (
        shapes
        .merge(segments, on='shape_array_key')
        .rename(columns={'geometry': 'shape_geometry'})
    )
    # Distance along the shape line at which the segment's `start` geometry falls.
    merged = merged.assign(
        start_meters=merged.shape_geometry.project(merged.start)
    )

    output_columns = ['shape_array_key', 'tsi_segment_id', 'start_meters', 'tsi_segment_meters']
    return merged.sort_values('start_meters').reset_index(drop=True)[output_columns]

## interpolation with arrays

### create function -- with `test_tract_borders`

In [28]:
shape_merged = read_tsi_segs(ANALYSIS_DATE, shapes)

  return lib.line_locate_point(line, other)


In [29]:
# One row per (segment, trip): repeat every shape's segments for each trip
# in st_proj that runs on that shape.
tsi_segments_trips = shape_merged.merge(
    st_proj[['shape_array_key', 'trip_instance_key']].drop_duplicates(),
    on='shape_array_key',
)

In [30]:
tsi_segments_trips

Unnamed: 0,shape_array_key,tsi_segment_id,start_meters,tsi_segment_meters,trip_instance_key
0,0f4a8a4dda3a6a1fa8da45aa0ff09060,06067000600,0.000000e+00,616.350887,6d53e9536be4eeea9e1648484a0a409f
1,0f4a8a4dda3a6a1fa8da45aa0ff09060,06067000600,0.000000e+00,616.350887,c5f8fa23a7660fe300aaadc535fb7e24
2,0f4a8a4dda3a6a1fa8da45aa0ff09060,06067000600,0.000000e+00,616.350887,dbcf145f2cb2ae3375a4b2d3d9c0cd43
3,0f4a8a4dda3a6a1fa8da45aa0ff09060,06067000600,0.000000e+00,616.350887,d073966d625a8adffa950a02169e05ed
4,0f4a8a4dda3a6a1fa8da45aa0ff09060,06067000600,0.000000e+00,616.350887,5ca59a4662b19ed0705c6ca4e7b7e3b6
...,...,...,...,...,...
2844392,ba7cb0c49ed0e16c54cf79c5e3f4f809,06057001206,3.506928e+06,4118.166756,7d4f8a2f1ba86b13f249515230df155c
2844393,ba7cb0c49ed0e16c54cf79c5e3f4f809,06057001205,3.511047e+06,8225.176570,7d4f8a2f1ba86b13f249515230df155c
2844394,ba7cb0c49ed0e16c54cf79c5e3f4f809,06057000900,3.519280e+06,4698.406032,7d4f8a2f1ba86b13f249515230df155c
2844395,ba7cb0c49ed0e16c54cf79c5e3f4f809,06057000900,3.524024e+06,15218.109616,7d4f8a2f1ba86b13f249515230df155c


In [32]:
tsi_segments_trips.drop_duplicates(subset=['trip_instance_key']).shape

(154141, 5)

In [41]:
def tract_border_time_by_trip(tracts_borders_trip_df: pd.DataFrame, st_proj_df: pd.DataFrame):
    '''
    Estimate when one trip reaches the start of each of its segments.

    Intended to be used as a `groupby('trip_instance_key').apply(...)` UDF.
    The input group is not modified; a new frame is returned. (pandas does
    not support UDFs that mutate the group they are handed.)

    Parameters
    ----------
    tracts_borders_trip_df : pd.DataFrame
        Segment rows for a single trip. Must have `trip_instance_key` and
        `start_meters`. Only the first row's `trip_instance_key` is used to
        pick the trip.
    st_proj_df : pd.DataFrame
        Stop times with `trip_instance_key`, `stop_sequence`, `stop_meters`
        and `arrival_sec`. It is filtered on every call.

    Returns
    -------
    pd.DataFrame
        The input rows plus:
        * `arrival_sec` -- result of `rt_utils.time_at_position_numba` at
          `start_meters` (presumably an interpolation of the trip's
          `arrival_sec` over `stop_meters` -- TODO confirm), except that the
          row with the smallest index label is set to the trip's earliest
          `arrival_sec`;
        * `arrival_sec_next` -- the following row's interpolated
          `arrival_sec`, except that the row with the largest index label is
          set to the trip's latest `arrival_sec`;
        * `segment_seconds` -- `arrival_sec_next - arrival_sec`.

    Raises
    ------
    IndexError
        If the input is empty or the trip has no rows in `st_proj_df`.
    '''
    trip_key = tracts_borders_trip_df.trip_instance_key.iloc[0]
    one_trip = (st_proj_df.loc[st_proj_df.trip_instance_key == trip_key]
                .sort_values('stop_sequence'))
    shape_array = one_trip.stop_meters.to_numpy()
    dt_float_array = one_trip.arrival_sec.to_numpy()

    arrival_sec = tracts_borders_trip_df.start_meters.apply(
        rt_utils.time_at_position_numba,
        shape_array=shape_array,
        dt_float_array=dt_float_array,
    )
    # Build the result with .assign so the caller's frame (the groupby
    # group) is left untouched. The shift uses the interpolated values,
    # i.e. before the first/last rows are pinned below.
    result = tracts_borders_trip_df.assign(
        arrival_sec=arrival_sec,
        arrival_sec_next=arrival_sec.shift(-1),
        trip_instance_key=one_trip.trip_instance_key.iloc[0],
    )
    # Pin the ends to the trip's scheduled start and end. This relies on
    # index labels increasing with start_meters within the trip.
    result.loc[result.index.min(), 'arrival_sec'] = one_trip.arrival_sec.min()
    result.loc[result.index.max(), 'arrival_sec_next'] = one_trip.arrival_sec.max()

    return result.assign(segment_seconds=result.arrival_sec_next - result.arrival_sec)

In [43]:
tsi_segments_trips

Unnamed: 0,shape_array_key,tsi_segment_id,start_meters,tsi_segment_meters,trip_instance_key
0,0f4a8a4dda3a6a1fa8da45aa0ff09060,06067000600,0.000000e+00,616.350887,6d53e9536be4eeea9e1648484a0a409f
1,0f4a8a4dda3a6a1fa8da45aa0ff09060,06067000600,0.000000e+00,616.350887,c5f8fa23a7660fe300aaadc535fb7e24
2,0f4a8a4dda3a6a1fa8da45aa0ff09060,06067000600,0.000000e+00,616.350887,dbcf145f2cb2ae3375a4b2d3d9c0cd43
3,0f4a8a4dda3a6a1fa8da45aa0ff09060,06067000600,0.000000e+00,616.350887,d073966d625a8adffa950a02169e05ed
4,0f4a8a4dda3a6a1fa8da45aa0ff09060,06067000600,0.000000e+00,616.350887,5ca59a4662b19ed0705c6ca4e7b7e3b6
...,...,...,...,...,...
2844392,ba7cb0c49ed0e16c54cf79c5e3f4f809,06057001206,3.506928e+06,4118.166756,7d4f8a2f1ba86b13f249515230df155c
2844393,ba7cb0c49ed0e16c54cf79c5e3f4f809,06057001205,3.511047e+06,8225.176570,7d4f8a2f1ba86b13f249515230df155c
2844394,ba7cb0c49ed0e16c54cf79c5e3f4f809,06057000900,3.519280e+06,4698.406032,7d4f8a2f1ba86b13f249515230df155c
2844395,ba7cb0c49ed0e16c54cf79c5e3f4f809,06057000900,3.524024e+06,15218.109616,7d4f8a2f1ba86b13f249515230df155c


In [45]:
# Smoke test on the first 1000 segment rows only; the full table is far
# larger and each group call filters all of st_proj.
many_trip_test = (
    tsi_segments_trips
    .head(1000)
    .groupby('trip_instance_key', group_keys=False)
    .progress_apply(tract_border_time_by_trip, st_proj_df=st_proj)
)

Progress: 100%|██████████| 109/109 [00:30<00:00,  3.55it/s]


In [None]:
many_trip_test

In [None]:
meta = many_trip_test[:0]

In [None]:
# many_trip_test.query('trip_instance_key == "2799c68c8dc1bfca3e445f8b20eaa9ab"')

# many_trip_test.loc[36, 'tsi_segment_id']

In [None]:
import dask.dataframe as dd
import dask_geopandas as dg

In [None]:
# `tracts_borders_trips` is never defined in this notebook (NameError on a
# fresh kernel); the per-trip segment table built above is `tsi_segments_trips`.
trips = tsi_segments_trips.trip_instance_key.unique()

In [None]:
set1 = trips[:4000]

In [None]:
set2 = trips[4000:]

In [None]:
# ddf = dd.from_pandas(tracts_borders_trips.query('trip_instance_key.isin(@set1)'), npartitions=10)

# ddf = (ddf.groupby('trip_instance_key', group_keys=False)
#        .apply(tract_border_time_by_trip, st_proj_df = st_proj, meta=meta))

# %%time 
# df = ddf.compute()

# df.to_parquet(f'trips_set1_tsi_segs_alameda_{analysis_date}.parquet')

In [None]:
# `tracts_borders_trips` is never defined in this notebook; use the per-trip
# segment table `tsi_segments_trips`, restricted to the second batch of trips.
ddf = dd.from_pandas(
    tsi_segments_trips.query('trip_instance_key.isin(@set2)'),
    npartitions=10,
)

In [None]:
# Lazily schedule the per-trip calculation; `meta` (an empty slice of the
# pandas test result) tells dask the output schema.
ddf = (
    ddf
    .groupby('trip_instance_key', group_keys=False)
    .apply(tract_border_time_by_trip, st_proj_df=st_proj, meta=meta)
)

In [None]:
%%time 
df = ddf.compute()

In [None]:
# `analysis_date` only exists as a function parameter; the notebook-level
# constant imported from update_vars is ANALYSIS_DATE.
df.to_parquet(f'trips_set2_tsi_segs_alameda_{ANALYSIS_DATE}.parquet')