In [None]:
import pandas as pd
import geopandas as gpd

from shared_utils import rt_utils, catalog_utils

from calitp_data_analysis import get_fs, geography_utils
from segment_speed_utils import helpers, time_series_utils, gtfs_schedule_wrangling, corridor_analysis
from segment_speed_utils.project_vars import SCHED_GCS, SEGMENT_GCS, GTFS_DATA_DICT, analysis_date
import numpy as np

In [None]:
import importlib
importlib.reload(corridor_analysis)

# develop and test some basic tools for corridor analysis

In [None]:
catalog = catalog_utils.get_catalog('gtfs_analytics_data')

In [None]:
analysis_date

In [None]:
# Parquet location of the single-segment speedmap detail table for `analysis_date`,
# assembled from the `speedmap_segments` entries of the analytics catalog.
path = f'{catalog.speedmap_segments.dir}{catalog.speedmap_segments.shape_stop_single_segment_detail}_{analysis_date}.parquet'

In [None]:
path

In [None]:
detail = gpd.read_parquet(path)

## need trip-level (pre-aggregation) gdf to properly calculate metrics

In [None]:
st4 = corridor_analysis.import_trip_speeds(analysis_date)

## corridor specification

In [None]:
# rt_utils.show_full_df(pd.read_parquet('../ca_transit_speed_maps/_rt_progress_2024-12-11.parquet').sort_values(['caltrans_district', 'organization_name']))

## Corridor Measurements

Previous logic:

For each trip, take the span from the last stop before entering the corridor to the first stop after exiting it. This was determined using stop_sequence.

Now,

* First, sjoin (spatially join) with the aggregated data, which has geometry. Sjoining on segments is equivalent to the previous methodology, since it yields the span from the last stop before entry to the first stop after exit.
* Avoid computing the scheduled delay metric for now.

In [None]:
# Pick the hotspot definition with index label 7 as the example corridor.
# NOTE(review): in the visible notebook `hs` is first assigned by the Hotspots
# read_excel cell further down, so this cell fails on a fresh Restart & Run All
# unless that cell has already been executed.
row = hs.loc[7]

In [None]:
# Build the corridor from the segment detail table, bounded by the start and
# end segment ids given in the selected `row`.
gdf = corridor_analysis.corridor_from_segments(
    speed_segments_gdf=detail,
    organization_source_record_id=row.organization_source_record_id,
    shape_id=row.shape_id,
    start_seg_id=row.start_segment_id,
    end_seg_id=row.end_segment_id,
)

In [None]:
# Select the trip-level speed records (st4) that fall on the corridor `gdf`,
# then compute per-trip corridor results from them.
corridor_trips = corridor_analysis.find_corridor_data(detail, gdf, st4)
corridor_results = corridor_analysis.analyze_corridor_trips(corridor_trips)

In [None]:
# validate_corridor_routes lives in the corridor_analysis module; the bare name
# is never imported into the notebook namespace, so call it through the module.
corridor_analysis.validate_corridor_routes(gdf, corridor_trips)

In [None]:
# routes = ['8AX', '8BX', '8']
# rt8 = corridor_results.query('route_short_name in @routes')
# df = corridor_analysis.analyze_corridor_improvements(rt8, trip_seconds_saved=30)

# df = corridor_analysis.analyze_corridor_improvements(corridor_results, trip_seconds_saved=30)
# NOTE(review): trip_percent_speedup is passed as .30 here but as 15 in the
# corridor_from_row call later in this notebook -- confirm whether
# analyze_corridor_improvements expects a fraction or a percentage.
corridor_improvements = corridor_analysis.analyze_corridor_improvements(corridor_results, trip_percent_speedup=.30)

In [None]:
corridor_improvements.head(3)

In [None]:
def summarize_corridor_improvements(analysis_df: pd.DataFrame, extra_group_cols: list = None):
    '''
    Aggregate trip-level corridor improvement results to one row per
    corridor, operator feed and intervention assumption.

    Parameters
    ----------
    analysis_df : pd.DataFrame
        Trip-level results containing the grouping columns plus
        corridor_seconds, improved_corridor_seconds, corridor_speed_mph,
        route_short_name and route_id.
    extra_group_cols : list, optional
        Additional columns to group by, appended to the default
        corridor_id / schedule_gtfs_dataset_key / intervention_assumption.

    Returns
    -------
    pd.DataFrame
        One row per group with summed corridor, improved and delay times,
        arrays of the distinct routes in the group, the median corridor
        speed, the count of non-null trip speeds (n_trips_daily) and
        trips_hr_sch, a list holding the scheduled frequency of each route
        in the group that matched `frequencies`. Numeric columns are rounded
        to one decimal. Groups with no frequency match are dropped by the
        inner merge.

    Notes
    -----
    Reads the notebook-level `frequencies` DataFrame, which must provide
    route_id, schedule_gtfs_dataset_key and trips_hr_sch columns.
    '''
    extra_group_cols = list(extra_group_cols) if extra_group_cols else []
    group_cols = ['corridor_id', 'schedule_gtfs_dataset_key', 'intervention_assumption'] + extra_group_cols
    sum_cols = ['corridor_seconds', 'improved_corridor_seconds', 'delay_seconds',
                'delay_minutes']
    array_cols = ['route_short_name', 'route_id']

    delay_seconds = analysis_df.corridor_seconds - analysis_df.improved_corridor_seconds
    analysis_df = analysis_df.assign(delay_seconds=delay_seconds,
                                     delay_minutes=delay_seconds / 60)

    group = analysis_df.groupby(group_cols)
    # 'median' alias rather than np.median: same result, without the pandas
    # deprecation warning for numpy callables in groupby aggregation.
    summary = group.agg({**{col: 'sum' for col in sum_cols},
                         **{col: 'unique' for col in array_cols},
                         'corridor_speed_mph': 'median'})
    summary = summary.rename(columns={'corridor_speed_mph': 'median_corridor_mph'})
    summary['n_trips_daily'] = group['corridor_speed_mph'].count()
    summary = summary.reset_index()

    #  join in max route frequencies; only route_id is exploded because it is
    #  the merge key, and the route_short_name array may have a different
    #  length (which makes a two-column explode raise)
    freq_cols = ['route_id', 'schedule_gtfs_dataset_key', 'trips_hr_sch']
    freq = (summary[group_cols + ['route_id']]
            .explode('route_id')
            .merge(frequencies[freq_cols], on=['route_id', 'schedule_gtfs_dataset_key'])
           )
    #  add frequencies to output to match array cols; allow inspection before
    #  summing delay metrics and frequencies. Collected per output group, so a
    #  feed with several groups does not get the other groups' frequencies too.
    freq_lists = freq.groupby(group_cols)['trips_hr_sch'].agg(list).reset_index()
    summary = summary.merge(freq_lists, on=group_cols)
    return summary.round(1)

In [None]:
df = summarize_corridor_improvements(corridor_improvements)

In [None]:
df

In [None]:
corr = pd.read_excel('./_temp/corr_hs.xlsx', sheet_name='Corridors')

In [None]:
hs = pd.read_excel('./_temp/corr_hs.xlsx', sheet_name='Hotspots')

In [None]:
# Segment and shape identifiers are cast to strings in both sheets.
id_dtypes = {col: str for col in ('start_segment_id', 'end_segment_id', 'shape_id')}
corr = corr.astype(id_dtypes)
hs = hs.astype(id_dtypes)

In [None]:
row = corr.iloc[10,:]

In [None]:
row

In [None]:
# Build the corridor for the selected corridor-sheet `row`.
gdf = corridor_analysis.corridor_from_segments(
    speed_segments_gdf=detail,
    organization_source_record_id=row.organization_source_record_id,
    shape_id=row.shape_id,
    start_seg_id=row.start_segment_id,
    end_seg_id=row.end_segment_id,
)

In [None]:
def combine_corridor_operators(corridor_gdf):
    '''
    Aggregate all transit operators in each corridor.

    Rows are grouped by corridor_id, corridor_name, geometry and
    intervention_assumption. Within each group delay_minutes,
    minutes_per_mile, trips_per_hr_peak_directional and n_trips_daily are
    summed, corridor_miles takes the minimum, and median_corridor_mph is the
    median of the per-row values.

    Returns the combined table sorted by minutes_per_mile, highest first.
    '''
    group_cols = ['corridor_id', 'corridor_name', 'geometry',
                  'intervention_assumption']
    overall = corridor_gdf.groupby(group_cols).agg({
        'corridor_miles': 'min',
        'delay_minutes': 'sum',
        'minutes_per_mile': 'sum',
        # string alias instead of the np.median callable: same result, and it
        # avoids the pandas deprecation warning for numpy callables in agg
        'median_corridor_mph': 'median',
        'trips_per_hr_peak_directional': 'sum',
        'n_trips_daily': 'sum',
    }).reset_index()
    return overall.sort_values('minutes_per_mile', ascending=False)

In [None]:
def corridor_from_row(df, intervention_dict, fwy_xpwy_floor = None):
    '''
    Build, analyze and summarize one corridor per row of `df`, then combine
    operators within each corridor.

    Parameters
    ----------
    df : DataFrame of corridor definitions with "SHS Segment",
        organization_source_record_id, shape_id, start_segment_id and
        end_segment_id columns, and optionally a fwy_xpwy flag.
    intervention_dict : keyword arguments forwarded to
        corridor_analysis.analyze_corridor_improvements.
    fwy_xpwy_floor : value passed as trip_mph_floor (replacing any value in
        intervention_dict) for rows whose fwy_xpwy flag is set.

    Returns
    -------
    The output of combine_corridor_operators over all rows that were
    analyzed successfully.

    Raises
    ------
    ValueError
        If no row could be analyzed.

    Rows that raise during analysis are reported with print and skipped
    (best effort). Uses the notebook-level `detail` and `st4` tables.
    '''
    all_corridors = []
    for _, row in df.iterrows():
        segment_name = row["SHS Segment"]
        try:
            print(segment_name)
            corridor = corridor_analysis.corridor_from_segments(
                speed_segments_gdf=detail,
                organization_source_record_id=row.organization_source_record_id,
                shape_id=row.shape_id,
                start_seg_id=row.start_segment_id,
                end_seg_id=row.end_segment_id,
                name=segment_name)
            corridor_trips = corridor_analysis.find_corridor_data(detail, corridor, st4)
            display(corridor_analysis.validate_corridor_routes(corridor, corridor_trips))
            corridor_results = corridor_analysis.analyze_corridor_trips(corridor_trips)

            # A blank cell arrives as NaN, which is truthy; treat a missing
            # flag as "not a freeway/expressway" rather than applying the floor.
            fwy_flag = row.get('fwy_xpwy')
            interventions = dict(intervention_dict)
            if pd.notna(fwy_flag) and bool(fwy_flag):
                interventions['trip_mph_floor'] = fwy_xpwy_floor
            improvements = corridor_analysis.analyze_corridor_improvements(corridor_results, **interventions)

            summ = summarize_corridor_improvements(improvements).reset_index(drop=True)
            corridor = pd.merge(corridor, summ, on='corridor_id')
            corridor = corridor.assign(corridor_miles = corridor.corridor_distance_meters / rt_utils.METERS_PER_MILE) #  from corridor def, not trip distance
            corridor = corridor.assign(minutes_per_mile = corridor.delay_minutes / corridor.corridor_miles)
            all_corridors.append(corridor)
        except Exception as e:
            # best effort: report the failing corridor and continue with the rest
            print(f'failed for {segment_name}')
            print(e)
    if not all_corridors:
        raise ValueError('no corridor could be analyzed; see the failure messages printed above')
    all_corridors = pd.concat(all_corridors)
    all_corridors = all_corridors.assign(trips_per_hr_peak_directional = all_corridors.trips_hr_sch.map(sum))

    return combine_corridor_operators(all_corridors)

In [None]:
corr.loc[3]

In [None]:
# Analyze every corridor row with a 16 mph floor (35 mph for rows flagged fwy_xpwy)
# and a trip_percent_speedup of 15.
# NOTE(review): trip_percent_speedup is 15 here but .30 in the earlier single-corridor
# call in this notebook -- confirm which unit analyze_corridor_improvements expects.
corr_gdf = corridor_from_row(corr, intervention_dict={'trip_mph_floor': 16, 'trip_percent_speedup': 15}, fwy_xpwy_floor = 35)
# corr_gdf = corridor_from_row(corr.iloc[3:4,:], intervention_dict={'trip_mph_floor': 16, 'trip_percent_speedup': 15}, fwy_xpwy_floor = 35)

In [None]:
corr_gdf

In [None]:
# Wrap the combined corridor table as a GeoDataFrame with an explicit CRS and
# map it interactively, colored by minutes_per_mile.
gpd.GeoDataFrame(corr_gdf, crs=geography_utils.CA_NAD83Albers_m).explore(column='minutes_per_mile')

In [None]:
hs_gdf = corridor_from_row(hs, intervention_dict={'trip_seconds_saved': 30})

## Discussion

* Which other metrics?

after screening:

ridership/person-hours of delay
accessibility, equity

* Is our list complete?

Include Tempo, Van Ness, can we compare?

* Exclude routes where necessary (current corridor join is just spatial...)

* Add location-specific interventions, and [guidance](https://caltrans.sharepoint.com/:w:/s/DOTPMPHQ-DataandDigitalServices/EdG0YNQcQMBJmKncAuNva9wBjpxVq2sD8p3C5azumXFNRA?e=TO7CbB)

* How much should we focus on freeways?

Yes, include (Bay Bridge, others, SDMTS freeway service, launch service with new HOT...)

District transit plans, express/rapid on freeways...?

In [None]:
old_sfmta = gpd.read_parquet('gs://calitp-analytics-data/data-analyses/rt_delay/stop_delay_views/282_2022-02-08.parquet')

In [None]:
import rt_analysis

## bespoke van ness

* v2 warehouse doesn't include RT before Sep 2022, and Van Ness opened Apr 2022
* get old rt_analysis code working

In [None]:
rt_analysis.rt_filter_map_plot.from_gcs?

In [None]:
from rt_analysis import rt_filter_map_plot

In [None]:
importlib.reload(rt_filter_map_plot)

In [None]:
sfmta = rt_analysis.rt_filter_map_plot.from_gcs(282, '2022-02-08')

In [None]:
sfmta.set_filter(route_names=['49'])

In [None]:
sfmta.segment_speed_map?

In [None]:
# sfmta.segment_speed_map()

In [None]:
sfmta.autocorridor?

In [None]:
sfmta.autocorridor('194252', [27, 37])

In [None]:
old_van_ness = sfmta.corridor_metrics()

In [None]:
# Select the Van Ness corridor rows for a single feed key.
# NOTE(review): corr_gdf comes from corridor_from_row, whose final step
# (combine_corridor_operators as defined in this notebook) keeps only its group
# and aggregated columns, which do not include schedule_gtfs_dataset_key_y --
# confirm this filter still runs against the current corr_gdf.
new_van_ness = corr_gdf.query('corridor_name == "US101 Van Ness" & schedule_gtfs_dataset_key_y == "7cc0cb1871dfd558f11a2885c145d144"')

In [None]:
old_van_ness

In [None]:
old_segments = sfmta.stop_segment_speed_view[sfmta.stop_segment_speed_view.corridor]

In [None]:
new_van_ness

In [None]:
row = corr.iloc[3,:]

In [None]:
row

In [None]:
# Rebuild the corridor for the row selected above (corr.iloc[3]).
gdf = corridor_analysis.corridor_from_segments(
    speed_segments_gdf=detail,
    organization_source_record_id=row.organization_source_record_id,
    shape_id=row.shape_id,
    start_seg_id=row.start_segment_id,
    end_seg_id=row.end_segment_id,
)

In [None]:
corridor_trips = corridor_analysis.find_corridor_data(detail, gdf, st4)
# corridor_results = corridor_analysis.analyze_corridor_trips(corridor_trips)

In [None]:
corridor_trips = corridor_trips.query('speed_mph <= 30')

In [None]:
corridor_trips = corridor_trips.query('route_short_name == "49"')

In [None]:
corridor_trips.columns

In [None]:
old_segments = old_segments.query('speed_mph <= 30')

In [None]:
fig = old_segments.speed_mph.hist(bins=30)

In [None]:
corridor_trips.speed_mph.hist(bins=30)

In [None]:
import matplotlib.pyplot as plt

# Overlay the two segment-speed distributions on one set of axes so they can
# be compared directly; the explicit Axes interface keeps labels attached to
# this figure rather than to pyplot's global state.
fig, ax = plt.subplots()
hist_style = dict(alpha=0.5,  # the transparency parameter, so overlapping bars stay visible
                  bins=20)

ax.hist(old_segments['speed_mph'], label='feb 2022', **hist_style)
ax.hist(corridor_trips['speed_mph'], label='dec 2024', **hist_style)

ax.legend(loc='upper right')
ax.set_title('Van Ness BRT segment speeds')
ax.set_xlabel('speed (mph)')
ax.set_ylabel('count')
plt.show()

In [None]:
len(corridor_trips.trip_instance_key.unique())

In [None]:
len(old_segments.trip_key.unique())

In [None]:
old_segments.shape

In [None]:
corridor_trips.shape

In [None]:
corridor_trips.segment_id.unique().shape

In [None]:
old_segments.stop_sequence.unique().shape