In [None]:
import pandas as pd
import geopandas as gpd

from shared_utils import rt_utils, catalog_utils

from calitp_data_analysis import get_fs, geography_utils
from segment_speed_utils import helpers, time_series_utils, gtfs_schedule_wrangling, corridor_analysis
from segment_speed_utils.project_vars import SCHED_GCS, SEGMENT_GCS, GTFS_DATA_DICT, analysis_date

# develop and test some basic tools for corridor analysis

In [None]:
catalog = catalog_utils.get_catalog('gtfs_analytics_data')

In [None]:
catalog.speedmap_segments

In [None]:
analysis_date

In [None]:
# path = f'{catalog.speedmap_segments.dir}{catalog.speedmap_segments.stage4}_{analysis_date}.parquet'

In [None]:
path = f'{catalog.speedmap_segments.dir}{catalog.speedmap_segments.shape_stop_single_segment_detail}_{analysis_date}.parquet'

In [None]:
path

In [None]:
detail = gpd.read_parquet(path).round(2)

In [None]:
detail.head(3)

## need trip-level (pre-aggregation) gdf to properly calculate metrics

In [None]:
st4 = corridor_analysis.import_trip_speeds(analysis_date)

## corridor specification

In [None]:
# rt_utils.show_full_df(pd.read_parquet('../ca_transit_speed_maps/_rt_progress_2024-12-11.parquet').sort_values(['caltrans_district', 'organization_name']))

## define corridor

In [None]:
# Alternative corridor definitions are kept commented out; only one set is active at a time.
# shape_id = '4953'
# start_seg_id = '18088-18089-1'
# end_seg_id = '16800-16806-1'

# Active definition: `sf` is passed as organization_source_record_id when the corridor is built,
# and the start/end segment ids bound the corridor along the given shape.
sf = 'rechaapWbeffO33OX'
shape_id = '800'
start_seg_id = '14970-17900-2'
end_seg_id = '16357-16358-1'

# hum = 'recynxkqEoo9dJEvw'
# shape_id = 'p_1435936'
# start_seg_id = '1252-4209812-1'
# end_seg_id = '1276-1277-1'

In [None]:
gdf = corridor_analysis.corridor_from_segments(speed_segments_gdf=detail, organization_source_record_id=sf, shape_id=shape_id,
                      start_seg_id=start_seg_id, end_seg_id=end_seg_id)

## Corridor Measurements

Previous logic:

For each trip, take the portion from the last stop before entering the corridor to the first stop after exiting the corridor. This was done using `stop_sequence`.

Now,

* First, sjoin with the aggregated data (which has geometry). Sjoining on segments is equivalent to the previous methodology, since it yields everything from the last stop before entry to the first stop after exit.
* Avoid computing the scheduled delay metric for now.

In [None]:
def find_corridor_data(
    speed_segments_gdf: gpd.GeoDataFrame,
    corridor_gdf: gpd.GeoDataFrame,
    trip_speeds_df: pd.DataFrame,
    apply_relevance_filter: bool = False,
) -> gpd.GeoDataFrame:
    '''
    With a buffered corridor defined, use the aggregated speed segments data to find relevant segments,
    then merge with trip-level speeds.

    Parameters
    ----------
    speed_segments_gdf : aggregated speed segments; must carry shape_array_key,
        segment_id, trips_hr_sch and geometry.
    corridor_gdf : corridor polygon(s); corridor_id and corridor_distance_meters
        are read from its first row.
    trip_speeds_df : trip-level speeds keyed by shape_array_key and segment_id.
    apply_relevance_filter : when False (default) return every trip row that touches
        the corridor, with a per-row `shape_length` taken from the clipped segment
        geometry. When True, replace `shape_length` with the summed clipped length per
        (shape_id, schedule_gtfs_dataset_key) and keep only shapes whose summed length
        reaches min(half the corridor distance, corridor_analysis.CORRIDOR_RELEVANCE).

    Returns
    -------
    GeoDataFrame of trip-level rows with clipped segment geometry and corridor_id.
    '''
    segment_keys = ['shape_array_key', 'segment_id']
    shape_keys = ['shape_id', 'schedule_gtfs_dataset_key']

    # Project before clipping so that geometry lengths are in the projected CRS units
    # (presumably meters, given the CRS constant's name -- TODO confirm).
    speed_segments_gdf = speed_segments_gdf.to_crs(geography_utils.CA_NAD83Albers_m)
    corridor_segments = speed_segments_gdf.clip(corridor_gdf)
    attach_geom = corridor_segments[segment_keys + ['trips_hr_sch', 'geometry']].drop_duplicates()
    trip_speeds_df = attach_geom.merge(trip_speeds_df, on=segment_keys).assign(
        corridor_id=corridor_gdf.corridor_id.iloc[0])

    # Length of the clipped piece of each segment (vectorized GeoSeries property).
    trip_speeds_df['shape_length'] = trip_speeds_df.geometry.length
    if not apply_relevance_filter:
        return trip_speeds_df

    # NOTE(review): de-duplicating on (feed, segment_id) before grouping by shape means a
    # segment_id shared by several shapes is only counted for one of them -- confirm that
    # segment_id is unique per shape, or add shape_id to the subset.
    shape_lengths = (
        trip_speeds_df
        .drop_duplicates(subset=['schedule_gtfs_dataset_key', 'segment_id'])
        .groupby(shape_keys)[['shape_length']]
        .sum()
        .reset_index()
    )
    trip_speeds_df = trip_speeds_df.drop(columns=['shape_length']).merge(shape_lengths, on=shape_keys)

    # A shape is relevant when it covers at least half the corridor, capped at the
    # module-wide CORRIDOR_RELEVANCE distance.
    half_corr = corridor_gdf.corridor_distance_meters.iloc[0] / 2
    corridor_relevance_threshold = min(half_corr, corridor_analysis.CORRIDOR_RELEVANCE)
    trip_speeds_df = trip_speeds_df.query('shape_length >= @corridor_relevance_threshold')

    return trip_speeds_df

In [None]:
corridor_trips = find_corridor_data(detail, gdf, st4)

In [None]:
# corridor_trips = corridor_analysis.find_corridor_data(detail, gdf, st4)
# corridor_results = corridor_analysis.analyze_corridor_trips(corridor_trips)

In [None]:
corridor_trips

In [None]:
# Per-row length of each segment geometry, then the summed length of de-duplicated
# segments per shape / feed / route.
# NOTE(review): `shape_lengths` is reassigned by the following cell, which de-duplicates
# without shape_id and groups without route_id, so this result is not used downstream.
corridor_trips['shape_length'] = corridor_trips.geometry.apply(lambda x: x.length)
shape_lengths = corridor_trips.drop_duplicates(subset=['schedule_gtfs_dataset_key', 'segment_id', 'shape_id']).groupby(['shape_id', 'schedule_gtfs_dataset_key', 'route_id'])[['shape_length']].sum().reset_index()

In [None]:
# Measure every segment geometry, total the de-duplicated segment lengths per shape and feed,
# then swap the per-row length for that per-shape total.
corridor_trips['shape_length'] = corridor_trips.geometry.apply(lambda geom: geom.length)
shape_lengths = (
    corridor_trips
    .drop_duplicates(subset=['schedule_gtfs_dataset_key', 'segment_id'])
    .groupby(['shape_id', 'schedule_gtfs_dataset_key'])[['shape_length']]
    .sum()
    .reset_index()
)

corridor_trips = (
    corridor_trips
    .drop(columns=['shape_length'])
    .merge(shape_lengths, on=['shape_id', 'schedule_gtfs_dataset_key'])
)

In [None]:
# `corr` is only assigned once the corridor spreadsheet is read further down, so displaying
# it here raises NameError on a fresh kernel (Restart & Run All).
# corr

In [None]:
m = gdf.explore(color='gray')

In [None]:
corridor_trips[['route_short_name', 'shape_length', 'geometry']].explore(m=m, column='shape_length')

In [None]:
corridor_trips.plot.scatter(x='route_short_name', y='shape_length')

In [None]:
# corridor_results.head(3)

In [None]:
# routes = ['8AX', '8BX', '8']
# rt8 = corridor_results.query('route_short_name in @routes')
# df = corridor_analysis.analyze_corridor_improvements(rt8, trip_seconds_saved=30)

# `corridor_results` is otherwise only assigned in commented-out code, which makes this cell
# fail on a fresh kernel; derive it here from the corridor trips built above.
corridor_results = corridor_analysis.analyze_corridor_trips(corridor_trips)
df = corridor_analysis.analyze_corridor_improvements(corridor_results, trip_seconds_saved=30)

In [None]:
df.head(3)

In [None]:
# Highest scheduled trips per hour seen for each route within each feed, busiest routes first.
frequencies = (
    detail[['route_id', 'schedule_gtfs_dataset_key', 'trips_hr_sch']]
    .drop_duplicates()
    .groupby(['route_id', 'schedule_gtfs_dataset_key'])
    .max()
    .reset_index()
    .sort_values('trips_hr_sch', ascending=False)
)

In [None]:
SUMMARY_GROUP_COLS = ['route_short_name', 'route_id', 'time_of_day',
                     'corridor_id']

def summarize_corridor_improvements(df: pd.DataFrame, group_cols = SUMMARY_GROUP_COLS,
                                    route_frequencies: pd.DataFrame = None):
    '''
    Aggregate trip-level corridor results into one row per group.

    Parameters
    ----------
    df : trip-level results; must carry corridor_seconds, improved_corridor_seconds,
        route_short_name, route_id, schedule_gtfs_dataset_key and every column in group_cols.
    group_cols : columns to group by. Columns that are also aggregated (route_short_name,
        route_id, schedule_gtfs_dataset_key) are left in the index only.
    route_frequencies : frame with route_id, schedule_gtfs_dataset_key and trips_hr_sch.
        Defaults to the notebook-level `frequencies` frame.

    Returns
    -------
    DataFrame indexed by group_cols, rounded to one decimal, with summed corridor and delay
    columns, arrays of the unique routes in the group, the minimum schedule_gtfs_dataset_key,
    the list of scheduled hourly frequencies for the group's routes (trips_hr_sch) and
    their sum (total_trips_hr).
    '''
    if route_frequencies is None:
        route_frequencies = frequencies
    group_cols = list(group_cols)
    freq_keys = ['route_id', 'schedule_gtfs_dataset_key']
    sum_cols = ['corridor_seconds', 'improved_corridor_seconds', 'delay_seconds',
                   'delay_minutes']
    array_cols = ['route_short_name', 'route_id']

    df = df.assign(delay_seconds = df.corridor_seconds - df.improved_corridor_seconds)
                  # corridor_miles = df.corridor_meters / rt_utils.METERS_PER_MILE)
    df = df.assign(delay_minutes = df.delay_seconds / 60)
    # Inner join: trips on routes without a known frequency are dropped from the summary.
    df = df.merge(route_frequencies, on=freq_keys)

    agg_spec = {**{x: 'sum' for x in sum_cols},
                **{x: 'unique' for x in array_cols},
                'schedule_gtfs_dataset_key': 'min'}
    # A column that is already a group key lives in the index; aggregating it again would
    # duplicate the label as both index level and column.
    agg_spec = {col: how for col, how in agg_spec.items() if col not in group_cols}
    summary = df.groupby(group_cols)[list(agg_spec)].agg(agg_spec)

    # Copies under private names so the lookup also works when route_id or the feed key
    # is itself one of the group keys.
    lookup = (df.assign(_route_id=df['route_id'], _feed_key=df['schedule_gtfs_dataset_key'])
                .groupby(group_cols)[['_route_id', '_feed_key']]
                .agg({'_route_id': 'unique', '_feed_key': 'min'}))

    # Frequencies are looked up per group, so multi-row summaries get their own values.
    # NOTE(review): routes are matched against the group's minimum feed key only, so routes
    # from other feeds sharing the group are not counted -- confirm this is intended.
    trips_hr, total_trips_hr = [], []
    for route_ids, feed_key in zip(lookup['_route_id'], lookup['_feed_key']):
        wanted = pd.DataFrame({'route_id': list(route_ids)})
        wanted['schedule_gtfs_dataset_key'] = feed_key
        matched = wanted.merge(route_frequencies, on=freq_keys)
        trips_hr.append(matched.trips_hr_sch.to_list())
        total_trips_hr.append(matched.trips_hr_sch.sum())

    # summary and lookup come from the same groupby keys, so their row order matches.
    summary = summary.assign(
        trips_hr_sch = pd.Series(trips_hr, index=summary.index, dtype=object),
        total_trips_hr = pd.Series(total_trips_hr, index=summary.index, dtype=float),
    )

    return summary.round(1)

In [None]:
test = summarize_corridor_improvements(df, group_cols=['corridor_id'])

In [None]:
test

In [None]:
corr = pd.read_excel('./_temp/corr_hs.xlsx', sheet_name='Corridors')

In [None]:
hs = pd.read_excel('./_temp/corr_hs.xlsx', sheet_name='Hotspots')

In [None]:
corr = corr.astype({'start_segment_id': str, 'end_segment_id': str, 'shape_id':str})
hs = hs.astype({'start_segment_id': str, 'end_segment_id': str, 'shape_id':str})

In [None]:
row = corr.iloc[0,:]

In [None]:
row

In [None]:
gdf = corridor_analysis.corridor_from_segments(speed_segments_gdf=detail, organization_source_record_id=row.organization_source_record_id, shape_id=row.shape_id,
                      start_seg_id=row.start_segment_id, end_seg_id=row.end_segment_id)

In [None]:
def corridor_from_row(df, intervention_dict):
    '''
    Build and analyze one corridor per row of `df`, returning all corridors combined.

    Each row must provide "SHS Segment", organization_source_record_id, shape_id,
    start_segment_id and end_segment_id. For every row the corridor is built from the
    notebook-level `detail` segments, trip data is taken from `st4`, the improvements
    described by `intervention_dict` (keyword arguments for
    corridor_analysis.analyze_corridor_improvements) are applied, and the per-corridor
    summary is attached together with corridor_miles and minutes_per_mile.

    Rows that fail are reported with print and skipped (best effort).

    Raises
    ------
    ValueError if no row could be analyzed, including when `df` is empty.
    '''
    all_corridors = []
    for _, row in df.iterrows():
        segment_name = row["SHS Segment"]
        try:
            print(segment_name)
            corridor = corridor_analysis.corridor_from_segments(
                speed_segments_gdf=detail,
                organization_source_record_id=row.organization_source_record_id,
                shape_id=row.shape_id,
                start_seg_id=row.start_segment_id,
                end_seg_id=row.end_segment_id,
                name=segment_name,
            )
            corridor_trips = corridor_analysis.find_corridor_data(detail, corridor, st4)
            corridor_results = corridor_analysis.analyze_corridor_trips(corridor_trips)
            # Distinct name: the original reassigned the `df` parameter inside its own loop.
            improved = corridor_analysis.analyze_corridor_improvements(corridor_results, **intervention_dict)
            summ = summarize_corridor_improvements(improved, group_cols=['corridor_id']).reset_index(drop=True)
            # NOTE(review): the column-wise concat aligns on index, so this assumes the
            # corridor frame is indexed from 0 like the reset summary -- confirm.
            corridor = pd.concat([corridor, summ], axis=1)
            corridor = corridor.assign(corridor_miles = corridor.corridor_distance_meters / rt_utils.METERS_PER_MILE) #  from corridor def, not trip distance
            corridor = corridor.assign(minutes_per_mile = corridor.delay_minutes / corridor.corridor_miles)
            all_corridors.append(corridor)
        except Exception as e:
            # Deliberate best effort: report the failing segment and move on to the next row.
            print(f'failed for {segment_name}')
            print(e)
    if not all_corridors:
        # pd.concat([]) would raise an unhelpful "No objects to concatenate".
        raise ValueError('No corridors could be analyzed: input was empty or every row failed')
    return pd.concat(all_corridors)

In [None]:
corr.loc[15]

In [None]:
corr_gdf = corridor_from_row(corr, intervention_dict={'trip_mph_target': 16})
# corr.iloc[:3,:].apply(corridor_from_row, axis=1)

In [None]:
corr_gdf

In [None]:
# corridor_from_row already returns the concatenated corridors; `all_corridors` is local to
# that function and undefined at notebook scope, so subset the returned frame instead.
corr_gdf = corr_gdf[['corridor_name', 'name', 'corridor_miles', 'delay_minutes',
        'minutes_per_mile', 'geometry']]

In [None]:
corr_gdf.explore(column='minutes_per_mile')

In [None]:
# corridor_from_row iterates over a whole DataFrame and returns its combined result, so call
# it on `hs` directly. Row-wise `hs.apply(..., axis=1)` hands it a Series (no .iterrows) and
# never fills this notebook-level list.
all_corridors = [corridor_from_row(hs, intervention_dict={'trip_mph_target': 16})]

In [None]:
hs_gdf = pd.concat(all_corridors)[['corridor_name', 'name', 'corridor_miles', 'delay_minutes',
        'minutes_per_mile', 'geometry']]

In [None]:
hs_gdf.explore(column='delay_minutes')

In [None]:
corr_gdf.sort_values('minutes_per_mile', ascending=False)

In [None]:
hs_gdf.sort_values('delay_minutes', ascending=False)

## Discussion

* Which other metrics?

after screening:

ridership/person-hours of delay
accessibility, equity

* Is our list complete?

Include Tempo, Van Ness, can we compare?

* Exclude routes where necessary (current corridor join is just spatial...)

* Add location-specific interventions, and [guidance](https://caltrans.sharepoint.com/:w:/s/DOTPMPHQ-DataandDigitalServices/EdG0YNQcQMBJmKncAuNva9wBjpxVq2sD8p3C5azumXFNRA?e=TO7CbB)

* How much should we focus on freeways?

Yes, include (Bay Bridge, others, SDMTS freeway service, launch service with new HOT...)

District transit plans, express/rapid on freeways...?