In [None]:
import pandas as pd
import geopandas as gpd
from siuba import *
import shared_utils

from calitp_data_analysis import get_fs, geography_utils
from segment_speed_utils import helpers, time_series_utils, gtfs_schedule_wrangling
from segment_speed_utils.project_vars import SCHED_GCS, SEGMENT_GCS, GTFS_DATA_DICT, analysis_date

I'd start with 5, checking whether the columns you want are already in this file (it is saved out at the end of `stop_arrivals_to_speed` and is a speeds-by-trip table for that day).

In [None]:
# Load the 'gtfs_analytics_data' catalog; its `speedmap_segments` entry supplies
# the directory and file-name stems used to build the parquet paths below.
catalog = shared_utils.catalog_utils.get_catalog('gtfs_analytics_data')

In [None]:
catalog.speedmap_segments

In [None]:
analysis_date

In [None]:
# path = f'{catalog.speedmap_segments.dir}{catalog.speedmap_segments.stage4}_{analysis_date}.parquet'

In [None]:
# Path to the per-segment detail file for `analysis_date`; it is read with
# geopandas below, so it is expected to carry geometry.
path = f'{catalog.speedmap_segments.dir}{catalog.speedmap_segments.shape_stop_single_segment_detail}_{analysis_date}.parquet'

In [None]:
path

In [None]:
detail = gpd.read_parquet(path)

In [None]:
detail >> head(3)

## need trip-level (pre-aggregation) gdf to properly calculate metrics

In [None]:
# NOTE: rebinds `path` (previously the segment-detail file) to the stage4 file
# for `analysis_date`; the read in the next cell depends on this cell running first.
path = f'{catalog.speedmap_segments.dir}{catalog.speedmap_segments.stage4}_{analysis_date}.parquet'

In [None]:
st4 = pd.read_parquet(path)

In [None]:
# Rows of the trip-level table whose speed is missing.
no_speeds = st4.loc[st4['speed_mph'].isna()]

In [None]:
# Share of trip-segment rows without a speed, as a percentage rounded to 3 decimals.
no_speeds_pct = round((len(no_speeds) / len(st4)) * 100, 3)
print(f"{no_speeds_pct} percent of trip segments have no speed, dropping")

In [None]:
# Keep only the rows that have a speed value.
st4 = st4.dropna(subset=['speed_mph'])

In [None]:
st4.columns

In [None]:
st4 >> head(3)

## corridor specification

In [None]:
# Lookup table of unique (organization_source_record_id, organization_name) pairs.
orgs = (
    detail[['organization_source_record_id', 'organization_name']]
    .drop_duplicates()
)

In [None]:
# Find the organization id by partial name match. `na=False` treats missing
# names as non-matches; without it a NaN name makes the mask non-boolean and
# the indexing raises.
orgs[orgs.organization_name.str.contains('San Fr', na=False)]

In [None]:
def corridor_from_segments(
    speed_segments_gdf: gpd.GeoDataFrame,
    organization_source_record_id: str,
    shape_id: str,
    start_seg_id: str,
    end_seg_id: str,
    buffer_meters: float = 100,
) -> gpd.GeoDataFrame:
    '''
    Build a buffered corridor polygon from a run of connected segments on one shape.

    The segments for the given organization and shape are walked from
    `start_seg_id` to `end_seg_id` by matching each segment's end point to the
    start point of the next one. The collected segments are dissolved into a
    single geometry, projected to `geography_utils.CA_NAD83Albers`, measured,
    and buffered.

    Parameters:
        speed_segments_gdf: segment geometries; must contain the columns
            organization_source_record_id, shape_id, segment_id,
            schedule_gtfs_dataset_key, shape_array_key, name and geometry.
        organization_source_record_id: organization whose segments are used.
        shape_id: shape whose segments are used.
        start_seg_id: segment_id where the corridor begins.
        end_seg_id: segment_id where the corridor ends; must be reachable
            from `start_seg_id` by following segment end points.
        buffer_meters: buffer distance applied to the dissolved line, in the
            units of the projected CRS (default 100, the original fixed value).

    Returns:
        One-row GeoDataFrame with the identifying columns, `distance_meters`
        (length of the dissolved line before buffering) and the buffered
        corridor polygon as geometry.

    Raises:
        AssertionError: if either segment id is missing for the shape, if the
            chain breaks before reaching `end_seg_id`, or if the walk finishes
            without ever reaching `end_seg_id`.
    '''

    shape_filtered = speed_segments_gdf.query(
        "organization_source_record_id == @organization_source_record_id & shape_id == @shape_id"
    )
    segment_ids = shape_filtered.segment_id.values
    assert start_seg_id in segment_ids and end_seg_id in segment_ids, (
        f'{start_seg_id} and/or {end_seg_id} not found for shape {shape_id}'
    )

    # First and last boundary points of each segment, used to chain segments together.
    shape_filtered = shape_filtered.assign(
        start_point = shape_filtered.geometry.apply(lambda x: x.boundary.geoms[0]),
        end_point = shape_filtered.geometry.apply(lambda x: x.boundary.geoms[1]),
    )

    filter_ids = [start_seg_id, end_seg_id]
    current_seg_id = start_seg_id

    # Bounded by the row count so a shape that loops back on itself cannot spin forever.
    for _ in range(len(shape_filtered)):
        if current_seg_id == end_seg_id:
            break
        current_end = shape_filtered.loc[shape_filtered['segment_id'] == current_seg_id]['end_point'].iloc[0]
        next_segment = shape_filtered.loc[shape_filtered['start_point'] == current_end]
        assert not next_segment.empty, f'unable to locate segment after {current_seg_id}'
        # The walk continues from the first match, but every segment starting
        # at this point is kept in the corridor.
        current_seg_id = next_segment.segment_id.iloc[0]
        filter_ids += next_segment.segment_id.to_list()

    # Without this check an unreachable end segment would silently yield a partial corridor.
    assert current_seg_id == end_seg_id, (
        f'unable to reach {end_seg_id} from {start_seg_id}, stopped at {current_seg_id}'
    )

    # segment_id may repeat in the input; keep one row per segment before dissolving.
    relevant_segments = shape_filtered.query("segment_id in @filter_ids").drop_duplicates(subset='segment_id')
    corridor = relevant_segments.dissolve()[['schedule_gtfs_dataset_key', 'shape_array_key', 'shape_id',
                                            'name', 'organization_source_record_id', 'geometry']]

    # Printed for manual inspection of the corridor end points (still in the input CRS).
    corridor_start = corridor.geometry.iloc[0].boundary.geoms[0]
    corridor_end = corridor.geometry.iloc[0].boundary.geoms[1]
    print(corridor_start, corridor_end)

    # Measure the line length after projecting, then replace the line with its buffer.
    corridor = corridor.to_crs(geography_utils.CA_NAD83Albers).assign(distance_meters = lambda x: x.geometry.length)
    corridor.geometry = corridor.buffer(buffer_meters)

    return corridor

## move to shared_utils/geo_utils...

In [None]:
# corridor_start = corridor.geometry.iloc[0].boundary.geoms[0]
# corridor_end = corridor.geometry.iloc[0].boundary.geoms[1]

# import pyproj
# geodesic = pyproj.Geod(ellps="WGS84")

# long1, lat1 = (-122.40550254785889, 37.76900326502991)
# long2, lat2 = (-122.40217200000001, 37.724137999999975)

# fwd_azimuth,back_azimuth,distance = geodesic.inv(long1, lat1, long2, lat2)

## define corridor

In [None]:
# Alternate corridor kept for reference (inactive):
# shape_id = '4953'
# start_seg_id = '18088-18089-1'
# end_seg_id = '16800-16806-1'

# Active corridor. `sf` is passed as organization_source_record_id to
# corridor_from_segments below; presumably it is the id found by the
# 'San Fr' name lookup above -- confirm against that cell's output.
sf = 'rechaapWbeffO33OX'
shape_id = '800'
# First and last segment_id of the corridor along this shape.
start_seg_id = '14970-17900-2'
end_seg_id = '16357-16358-1'

In [None]:
# Build the corridor geometry for the organization, shape and segment range defined above.
gdf = corridor_from_segments(
    speed_segments_gdf=detail,
    organization_source_record_id=sf,
    shape_id=shape_id,
    start_seg_id=start_seg_id,
    end_seg_id=end_seg_id,
)

In [None]:
# gdf.explore()

In [None]:
gdf

In [None]:
st4.columns

In [None]:
detail.columns

## Corridor Measurements

Previous logic:

For each trip, measure from the last stop before entering the corridor to the first stop after exiting it. This was done using `stop_sequence`.

Now,

* First, spatially join with the aggregated data (which has geometry). Joining on segments is equivalent to the previous methodology, since it yields everything from the last stop before entering the corridor to the first stop after exiting it.
* Avoid computing the scheduled delay metric for now.

In [None]:
def find_corridor_data(
    speed_segments_gdf: gpd.GeoDataFrame,
    corridor_gdf: gpd.GeoDataFrame,
    trip_speeds_df: pd.DataFrame
) -> gpd.GeoDataFrame:
    '''
    Select the trip-level speed rows whose segments fall within a corridor.

    Segments are projected to `geography_utils.CA_NAD83Albers`, clipped to
    `corridor_gdf`, and the resulting unique (shape_array_key, segment_id,
    geometry) rows are merged onto `trip_speeds_df`. `corridor_gdf` is not
    reprojected here.

    Parameters:
        speed_segments_gdf: segment geometries with shape_array_key and segment_id.
        corridor_gdf: corridor geometry used as the clip mask.
        trip_speeds_df: trip-level rows with shape_array_key and segment_id.

    Returns:
        The matching rows of `trip_speeds_df` with the clipped segment geometry attached.
    '''
    merge_keys = ['shape_array_key', 'segment_id']
    projected_segments = speed_segments_gdf.to_crs(geography_utils.CA_NAD83Albers)
    segments_in_corridor = projected_segments.clip(corridor_gdf)
    segment_geoms = segments_in_corridor[merge_keys + ['geometry']].drop_duplicates()
    return segment_geoms.merge(trip_speeds_df, on=merge_keys)

In [None]:
# Trip-level speed rows for every segment that intersects the corridor.
corridor_trips = find_corridor_data(
    speed_segments_gdf=detail,
    corridor_gdf=gdf,
    trip_speeds_df=st4,
)

In [None]:
corridor_trips.value_counts(subset=['route_short_name', 'direction_id', 'trip_instance_key'])

In [None]:
# Restrict to the shape the corridor was built from. Uses `shape_id` from the
# corridor-definition cell rather than repeating the literal, so the check
# follows the corridor if that cell is changed.
test = corridor_trips[corridor_trips['shape_id'] == shape_id]

In [None]:
test[['geometry', 'arrival_time_sec', 'segment_id', 'speed_mph', 'stop_meters', 'subseq_stop_meters']].explore()

In [None]:
corridor_trips.columns

In [None]:
# Per trip, take the column-wise minimum of the segment-start fields and the
# column-wise maximum of the segment-end fields across the trip's corridor rows.
# NOTE(review): each column is reduced independently, so this presumably relies
# on stop_meters and arrival_time_sec increasing together along a trip -- confirm.
grouped = corridor_trips.groupby(['trip_instance_key'])
min_stops = grouped[['stop_meters', 'arrival_time_sec']].min().add_suffix('_min')
max_stops = grouped[['subseq_stop_meters', 'subseq_arrival_time_sec']].max().add_suffix('_max')

In [None]:
min_stops

In [None]:
max_stops

In [None]:
# One row per trip: corridor extent in meters and seconds, taken as the
# per-trip maximum of the segment-end fields minus the minimum of the
# segment-start fields.
df = min_stops.join(max_stops)
df = df.assign(
    corridor_meters = df['subseq_stop_meters_max'] - df['stop_meters_min'],
    corridor_seconds = df['subseq_arrival_time_sec_max'] - df['arrival_time_sec_min'],
)
# Mask non-positive durations so the division yields NaN rather than inf or a
# negative speed; those rows keep their corridor_seconds value for inspection.
df = df.assign(
    corridor_speed_mps = df['corridor_meters'] / df['corridor_seconds'].where(df['corridor_seconds'] > 0)
)

In [None]:
df[df['corridor_seconds'] == 0]

In [None]:
test = corridor_trips[corridor_trips['trip_instance_key'] == '2562dc12764584ea9af42252b6e9a6c6']

In [None]:
test[['geometry', 'arrival_time_sec', 'segment_id', 'speed_mph', 'stop_meters', 'subseq_stop_meters', 'route_short_name']].explore()

In [None]:
gdf.explore()