In [1]:
import pandas as pd
import geopandas as gpd

from shared_utils import rt_utils, catalog_utils

from calitp_data_analysis import get_fs, geography_utils
from segment_speed_utils import helpers, time_series_utils, gtfs_schedule_wrangling, corridor_analysis
from segment_speed_utils.project_vars import SCHED_GCS, SEGMENT_GCS, GTFS_DATA_DICT, analysis_date

# develop and test some basic tools for corridor analysis

In [2]:
catalog = catalog_utils.get_catalog('gtfs_analytics_data')

In [3]:
catalog.speedmap_segments

{'dir': '${gcs_paths.SEGMENT_GCS}', 'stage1': '${speeds_tables.vp_dwell}', 'proxy_stop_times': 'stop_time_expansion/speedmap_stop_times', 'stage2': 'nearest/nearest_vp_speedmap_proxy', 'stage3': 'speedmap/stop_arrivals_proxy', 'stage3b': 'speedmap/stop_arrivals', 'stage4': 'speedmap/speeds', 'trip_stop_cols': ['trip_instance_key', 'stop_sequence', 'stop_sequence1'], 'shape_stop_cols': ['shape_array_key', 'shape_id'], 'stop_pair_cols': ['stop_pair', 'stop_pair_name', 'segment_id'], 'route_dir_cols': ['route_id', 'direction_id'], 'segments_file': 'segment_options/speedmap_segments', 'shape_stop_single_segment': 'rollup_singleday/speeds_shape_speedmap_segments', 'shape_stop_single_segment_detail': 'rollup_singleday/speeds_shape_speedmap_segments_detail', 'route_dir_single_segment': 'rollup_singleday/speeds_route_dir_speedmap_segments', 'route_dir_multi_segment': 'rollup_multiday/speeds_route_dir_speedmap_segments', 'min_trip_minutes': '${speed_vars.time_min_cutoff}', 'max_trip_minutes': 1

In [4]:
analysis_date

'2024-12-11'

In [5]:
# path = f'{catalog.speedmap_segments.dir}{catalog.speedmap_segments.stage4}_{analysis_date}.parquet'

In [6]:
path = f'{catalog.speedmap_segments.dir}{catalog.speedmap_segments.shape_stop_single_segment_detail}_{analysis_date}.parquet'

In [7]:
path

'gs://calitp-analytics-data/data-analyses/rt_segment_speeds/rollup_singleday/speeds_shape_speedmap_segments_detail_2024-12-11.parquet'

In [8]:
detail = gpd.read_parquet(path)

In [9]:
detail.head(3)

Unnamed: 0,schedule_gtfs_dataset_key,shape_array_key,shape_id,route_id,direction_id,stop_pair,stop_pair_name,segment_id,time_of_day,p50_mph,...,p80_mph,name,caltrans_district,organization_source_record_id,organization_name,base64_url,geometry,n_trips_sch,trips_hr_sch,route_short_name
0,015d67d5b75b5cf2b710bbadadfb75f5,43f0d67e5131502a51f9330e04bacc55,20,228,0.0,40512__40514,Sir Francis Drake Blvd & Olema Rd__Sir Francis...,40512-40514-1,AM Peak,11.83,...,11.83,Bay Area 511 Marin Schedule,04 - Oakland,recNOb7pqBRlQVG5e,Marin County Transit District,aHR0cHM6Ly9hcGkuNTExLm9yZy90cmFuc2l0L2RhdGFmZW...,"LINESTRING (-122.60413 38.00022, -122.60416 38...",3,1.0,228
1,015d67d5b75b5cf2b710bbadadfb75f5,43f0d67e5131502a51f9330e04bacc55,20,228,0.0,40514__40516,Sir Francis Drake Blvd & Alhambra Circle__Sir ...,40514-40516-1,AM Peak,15.52,...,19.79,Bay Area 511 Marin Schedule,04 - Oakland,recNOb7pqBRlQVG5e,Marin County Transit District,aHR0cHM6Ly9hcGkuNTExLm9yZy90cmFuc2l0L2RhdGFmZW...,"LINESTRING (-122.60045 37.99908, -122.60037 37...",3,1.0,228
2,015d67d5b75b5cf2b710bbadadfb75f5,43f0d67e5131502a51f9330e04bacc55,20,228,0.0,40516__40518,Sir Francis Drake Blvd At Drake Manor Apts__Si...,40516-40518-1,AM Peak,15.69,...,20.79,Bay Area 511 Marin Schedule,04 - Oakland,recNOb7pqBRlQVG5e,Marin County Transit District,aHR0cHM6Ly9hcGkuNTExLm9yZy90cmFuc2l0L2RhdGFmZW...,"LINESTRING (-122.59907 37.99704, -122.59894 37...",3,1.0,228


## need trip-level (pre-aggregation) gdf to properly calculate metrics

In [10]:
st4 = corridor_analysis.import_trip_speeds(analysis_date)

2.2 percent of segments have no speed


## corridor specification

In [11]:
orgs = detail.drop_duplicates(subset=['organization_source_record_id', 'organization_name'])[['organization_source_record_id', 'organization_name']]

In [12]:
orgs[orgs.organization_name.str.contains('San Fr')]

Unnamed: 0,organization_source_record_id,organization_name
365619,rechaapWbeffO33OX,City and County of San Francisco
403280,recmatCuQAUrNcs8j,San Francisco Bay Area Water Emergency Transit...


## define corridor

In [13]:
# shape_id = '4953'
# start_seg_id = '18088-18089-1'
# end_seg_id = '16800-16806-1'

sf = 'rechaapWbeffO33OX'
shape_id = '800'
start_seg_id = '14970-17900-2'
end_seg_id = '16357-16358-1'

# hum = 'recynxkqEoo9dJEvw'
# shape_id = 'p_1435936'
# start_seg_id = '1252-4209812-1'
# end_seg_id = '1276-1277-1'

In [14]:
gdf = corridor_analysis.corridor_from_segments(speed_segments_gdf=detail, organization_source_record_id=sf, shape_id=shape_id,
                      start_seg_id=start_seg_id, end_seg_id=end_seg_id)

POINT (-122.40550254785889 37.76900326502991) POINT (-122.40217200000001 37.724137999999975)


## Corridor Measurements

Previous logic:

For each trip, get from the last stop before entering corridor to the first stop after exiting corridor. This was done on stop_sequence

Now,

* first sjoin with aggregated data (has geom). Sjoining on segments is equivalent to previous methodology, since it will yield the last stop before entry to the first stop after exiting...
* avoid doing scheduled delay metric for now...
* 

In [31]:
corridor_trips = corridor_analysis.find_corridor_data(detail, gdf, st4)
corridor_results = corridor_analysis.analyze_corridor_trips(corridor_trips)

0.9 percent of zero seconds
0.0 percent of speeds > 80mph


In [32]:
# corridor_results.head(3)

In [33]:
routes = ['8AX', '8BX', '8']
rt8 = corridor_results.query('route_short_name in @routes')

df = corridor_analysis.analyze_corridor_improvements(rt8, trip_seconds_saved=30)

In [34]:
df.head(3)

Unnamed: 0,trip_instance_key,corridor_meters,corridor_seconds,corridor_speed_mps,corridor_speed_mph,route_short_name,route_id,shape_array_key,shape_id,schedule_gtfs_dataset_key,time_of_day,corridor_id,improved_corridor_seconds,improved_corridor_speed_mps,improved_corridor_speed_mph


In [35]:
# rt_utils.show_full_df(pd.read_parquet('../ca_transit_speed_maps/_rt_progress_2024-12-11.parquet').sort_values(['caltrans_district', 'organization_name']))

In [36]:
SUMMARY_GROUP_COLS = ['route_short_name', 'route_id', 'time_of_day',
                     'corridor_id']

def summarize_corridor_improvements(df: pd.DataFrame, group_cols = SUMMARY_GROUP_COLS):
    '''
    
    '''
    sum_cols = ['corridor_seconds', 'improved_corridor_seconds', 'delay_seconds',
                   'delay_minutes']
    distance_cols = ['corridor_meters']
    df = df.assign(delay_seconds = df.corridor_seconds - df.improved_corridor_seconds)
                  # corridor_miles = df.corridor_meters / rt_utils.METERS_PER_MILE)
    df = df.assign(delay_minutes = df.delay_seconds / 60)
    group = df.groupby(group_cols)[sum_cols + distance_cols]
    #  TODO revise, using min dist here is by trip...
    df = group.agg({**{x:'sum' for x in sum_cols}, **{x:'min' for x in distance_cols}})
    # df = df.assign(minutes_per_mile = df.delay_minutes / df.corridor_miles)
    
    return df

In [37]:
summarize_corridor_improvements(df, group_cols=['corridor_id'])

Unnamed: 0_level_0,corridor_seconds,improved_corridor_seconds,delay_seconds,delay_minutes,corridor_meters
corridor_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1


In [38]:
summarize_corridor_improvements(df).groupby('corridor_id')[['delay_minutes']].sum()

Unnamed: 0_level_0,delay_minutes
corridor_id,Unnamed: 1_level_1


In [39]:
gdf

Unnamed: 0,schedule_gtfs_dataset_key,shape_array_key,shape_id,name,organization_source_record_id,geometry,distance_meters,corridor_id,corridor_name
0,a253a8d7acd57657bb98050f37dd6b0f,a9518009af798f5e073e561b267305de,p_1435936,Humboldt Schedule,recynxkqEoo9dJEvw,"POLYGON ((-353635.350 314804.404, -353644.983 ...",4161.042525,586004430520181286,


In [109]:
corr = pd.read_excel('./_temp/corr_hs.xlsx', sheet_name='Corridors')

In [115]:
hs = pd.read_excel('./_temp/corr_hs.xlsx', sheet_name='Hotspots')

In [116]:
corr = corr.astype({'start_segment_id': str, 'end_segment_id': str, 'shape_id':str})
hs = hs.astype({'start_segment_id': str, 'end_segment_id': str, 'shape_id':str})

In [42]:
row = corr.iloc[0,:]

In [43]:
row

SHS Segment                                                  SR-1 Broadway/4th/5th
County                                                                    Humboldt
District                                                                         1
Corridor Length (mi)                                                           NaN
Transit Operators                              City of Arcata, City of Eureka, HTA
Treatments                                                                     NaN
organization_source_record_id                                    recynxkqEoo9dJEvw
start_segment_id                                                    1252-4209812-1
end_segment_id                                                         1276-1277-1
shape_id                                                                 p_1435936
Notes                            Not a ton of delay but the primary SHS delay i...
Name: 0, dtype: object

In [44]:
gdf = corridor_analysis.corridor_from_segments(speed_segments_gdf=detail, organization_source_record_id=row.organization_source_record_id, shape_id=row.shape_id,
                      start_seg_id=row.start_segment_id, end_seg_id=row.end_segment_id)

POINT (-124.188822 40.78044300000002) POINT (-124.16029600000002 40.80271199999996)


In [71]:
def corridor_from_row(row, intervention_dict):
    global all_corridors
    try:
        print(row["SHS Segment"])
        corr = corridor_analysis.corridor_from_segments(speed_segments_gdf=detail, organization_source_record_id=row.organization_source_record_id, shape_id=row.shape_id,
                      start_seg_id=row.start_segment_id, end_seg_id=row.end_segment_id, name=row['SHS Segment'])
        corridor_trips = corridor_analysis.find_corridor_data(detail, corr, st4)
        corridor_results = corridor_analysis.analyze_corridor_trips(corridor_trips)
        df = corridor_analysis.analyze_corridor_improvements(corridor_results, **intervention_dict)
        summ = summarize_corridor_improvements(df, group_cols=['corridor_id']).reset_index(drop=True)
        corr = pd.concat([corr, summ], axis=1)
        corr = corr.assign(corridor_miles = corr.distance_meters / rt_utils.METERS_PER_MILE) #  from corridor def, not trip distance
        corr = corr.assign(minutes_per_mile = corr.delay_minutes / corr.corridor_miles)
        all_corridors += [corr]
    except Exception as e:
        print(f'failed for{row["SHS Segment"]}')
        print(e)
        pass
    

In [103]:
corr.loc[15]

SHS Segment                      SR66 Foothill Blvd
County                                  Los Angeles
District                                          7
Corridor Length (mi)                            5.4
Transit Operators                  Foothill Transit
Treatments                                      NaN
organization_source_record_id     recSqgaa8QiQ8CRjl
start_segment_id                        1308-1316-1
end_segment_id                          1228-1201-1
shape_id                                  17996_shp
Notes                                           NaN
Name: 15, dtype: object

In [121]:
all_corridors = []

corr.apply(corridor_from_row, axis=1, intervention_dict={'trip_mph_target': 16})
# corr.iloc[:3,:].apply(corridor_from_row, axis=1)

SR-1 Broadway/4th/5th
POINT (-124.188822 40.78044300000002) POINT (-124.16029600000002 40.80271199999996)
0.9 percent of zero seconds
0.0 percent of speeds > 80mph
SR1 Junipero Serra/19th Ave
POINT (-122.47109500000002 37.784514999999985) POINT (-122.46901 37.70597799999998)
2.4 percent of zero seconds
10.7 percent of speeds > 80mph
US101 Lombard/Presidio Pkwy
POINT (-122.47414053795225 37.80735892390237) POINT (-122.42477900000002 37.802431)
1.4 percent of zero seconds
7.6 percent of speeds > 80mph
SR123 San Pablo Blvd
POINT (-122.316573 37.92455300000004) POINT (-122.27945095392592 37.8299470222603)
2.9 percent of zero seconds
1.1 percent of speeds > 80mph
SR13 Ashby Ave
POINT (-122.29215299999998 37.84904500000002) POINT (-122.25298800000002 37.85728699999996)
0.6 percent of zero seconds
0.6 percent of speeds > 80mph
SR185 E 16th St
POINT (-122.15636699999999 37.725209999999976) POINT (-122.12119699999998 37.69934199999998)
1.3 percent of zero seconds
1.5 percent of speeds > 80mph
S

0     None
1     None
2     None
3     None
4     None
5     None
6     None
7     None
8     None
9     None
10    None
11    None
12    None
13    None
14    None
15    None
16    None
17    None
18    None
dtype: object

In [122]:
corr_gdf = pd.concat(all_corridors)[['name', 'corridor_miles', 'delay_minutes',
        'minutes_per_mile', 'geometry']]

In [132]:
# corr_gdf.explore(column='minutes_per_mile')

In [124]:
all_corridors = []
hs.apply(corridor_from_row, axis=1, intervention_dict={'trip_mph_target': 16})

SR-281
POINT (-122.74313696750094 38.93045308891572) POINT (-122.73538076730095 38.944933404381025)
21.1 percent of zero seconds
5.3 percent of speeds > 80mph
US-101
POINT (-124.040182976 41.531234425) POINT (-124.03277840500002 41.52298986799994)
0.0 percent of zero seconds
0.0 percent of speeds > 80mph
SR-44
POINT (-122.388229 40.584889000000004) POINT (-122.39237 40.5829)
31.1 percent of zero seconds
4.9 percent of speeds > 80mph
I-80
failed forI-80
empty shape, check shape_id
SR-99
POINT (-119.79791000000002 36.732010000000024) POINT (-119.80086 36.72993000000005)
0.3 percent of zero seconds
0.5 percent of speeds > 80mph
SR-99
POINT (-119.03828599999999 35.33746200000005) POINT (-119.04338 35.339589999999994)
1.9 percent of zero seconds
0.7 percent of speeds > 80mph
I-5
POINT (-118.15988745500002 34.011077479000036) POINT (-118.15867108400002 34.00771021100003)
0.0 percent of zero seconds
0.7 percent of speeds > 80mph
SR-83
failed forSR-83
index out of range
SR-108
POINT (-120.3602

0     None
1     None
2     None
3     None
4     None
5     None
6     None
7     None
8     None
9     None
10    None
11    None
12    None
13    None
14    None
15    None
16    None
17    None
dtype: object

In [125]:
hs_gdf = pd.concat(all_corridors)[['name', 'corridor_miles', 'delay_minutes',
        'minutes_per_mile', 'geometry']]

In [131]:
# hs_gdf.explore(column='delay_minutes')

In [130]:
corr_gdf.sort_values('minutes_per_mile', ascending=False)

Unnamed: 0,name,corridor_miles,delay_minutes,minutes_per_mile,geometry
0,Bay Area 511 Muni Schedule,3.219778,6568.154911,2039.940435,"POLYGON ((-217609.307 -20376.350, -217609.271 ..."
0,Bay Area 511 Muni Schedule,6.174015,12532.004535,2029.79825,"POLYGON ((-218152.597 -24502.018, -218170.814 ..."
0,San Diego Schedule,1.274946,2099.611015,1646.822864,"POLYGON ((272724.417 -599039.241, 272551.922 -..."
0,LA Metro Bus Schedule,2.301013,3113.958552,1353.299051,"POLYGON ((153028.275 -434865.075, 152920.507 -..."
0,Bay Area 511 AC Transit Schedule,6.928382,9070.180492,1309.133988,"POLYGON ((-203442.685 -7734.573, -203443.442 -..."
0,San Diego Schedule,2.126868,2000.795379,940.723867,"POLYGON ((264750.856 -587810.341, 264750.944 -..."
0,Bay Area 511 Santa Clara Transit Schedule,16.403031,14123.672216,861.040416,"POLYGON ((-191421.407 -61494.912, -191423.478 ..."
0,LA Metro Bus Schedule,4.947284,4140.299188,836.883208,"POLYGON ((152660.296 -466553.426, 152659.189 -..."
0,Bay Area 511 AC Transit Schedule,2.674901,2199.936433,822.436431,"POLYGON ((-189808.077 -30309.901, -189813.839 ..."
0,Long Beach Schedule,5.18901,4196.387422,808.706739,"POLYGON ((164989.834 -467823.021, 164991.149 -..."


In [129]:
hs_gdf.sort_values('delay_minutes', ascending=False)

Unnamed: 0,name,corridor_miles,delay_minutes,minutes_per_mile,geometry
0,Bay Area 511 SamTrans Schedule,0.556265,1980.335068,3560.059726,"POLYGON ((-210624.594 -43758.081, -210625.249 ..."
0,LA Metro Bus Schedule,0.28932,1483.598818,5127.885094,"POLYGON ((169981.112 -443628.614, 169977.271 -..."
0,GET Schedule,0.453996,1165.172077,2566.482859,"POLYGON ((87177.454 -297142.036, 87166.684 -29..."
0,Bay Area 511 SamTrans Schedule,0.703772,785.676155,1116.378445,"POLYGON ((-203078.572 -50557.181, -203080.704 ..."
0,Bay Area 511 Santa Clara Transit Schedule,0.66195,752.436686,1136.697247,"POLYGON ((-186775.206 -66223.238, -186768.396 ..."
0,Redding Schedule,0.617853,666.14071,1078.153418,"POLYGON ((-202610.478 287825.394, -202650.621 ..."
0,Bay Area 511 Santa Clara Transit Schedule,1.566094,572.389438,365.488675,"POLYGON ((-184466.367 -67821.321, -184213.039 ..."
0,OCTA Schedule,0.49153,513.961198,1045.636204,"POLYGON ((186186.066 -470794.970, 186184.269 -..."
0,Fresno Schedule,0.217764,463.86378,2130.124994,"POLYGON ((17832.673 -142786.500, 17840.152 -14..."
0,OCTA Schedule,0.361223,182.797455,506.052229,"POLYGON ((223743.907 -509890.883, 223743.695 -..."


## TO-DO

* Which other metrics?

* Is our list complete?

* Exclude routes where necessary (current corridor join is just spatial...)

* Add location-specific interventions, and [guidance](https://caltrans.sharepoint.com/:w:/s/DOTPMPHQ-DataandDigitalServices/EdG0YNQcQMBJmKncAuNva9wBjpxVq2sD8p3C5azumXFNRA?e=TO7CbB)