In [92]:
import geopandas as gpd
import pandas as pd
from segment_speed_utils import gtfs_schedule_wrangling, helpers
from shared_utils import catalog_utils, rt_dates, rt_utils
from update_vars import GTFS_DATA_DICT, RT_SCHED_GCS, SCHED_GCS

# https://github.com/cal-itp/data-analyses/blob/main/_shared_utils/shared_utils/gtfs_analytics_data.yml
GTFS_DATA_DICT = catalog_utils.get_catalog("gtfs_analytics_data")

from segment_speed_utils.project_vars import (GTFS_DATA_DICT,
                                              SEGMENT_GCS, 
                                              COMPILED_CACHED_VIEWS,
                                              RT_SCHED_GCS, 
                                              SCHED_GCS,
                                              PROJECT_CRS)

In [31]:
pd.options.display.max_columns = 100
pd.options.display.float_format = "{:.2f}".format
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [60]:
analysis_date_list = rt_dates.y2024_dates + rt_dates.y2023_dates

In [33]:
july_date = '2024-07-17'

'2024-07-17'

### Schedule

In [34]:
schd_vp_url = f"{GTFS_DATA_DICT.digest_tables.dir}{GTFS_DATA_DICT.digest_tables.route_schedule_vp}.parquet"

In [35]:
og = pd.read_parquet(schd_vp_url)

In [36]:
og.head(2)

Unnamed: 0,schedule_gtfs_dataset_key,direction_id,time_period,avg_scheduled_service_minutes,avg_stop_miles,n_scheduled_trips,frequency,is_express,is_rapid,is_rail,is_coverage,is_downtown_local,is_local,service_date,typology,minutes_atleast1_vp,minutes_atleast2_vp,total_rt_service_minutes,total_scheduled_service_minutes,total_vp,vp_in_shape,is_early,is_ontime,is_late,n_vp_trips,vp_per_minute,pct_in_shape,pct_rt_journey_atleast1_vp,pct_rt_journey_atleast2_vp,pct_sched_journey_atleast1_vp,pct_sched_journey_atleast2_vp,rt_sched_journey_ratio,avg_rt_service_minutes,sched_rt_category,speed_mph,name,route_long_name,route_short_name,route_combined_name,route_id,schedule_source_record_id,base64_url,organization_source_record_id,organization_name,caltrans_district,ntd_id_2022,counties_served,hq_city,hq_county,is_public_entity,is_publicly_operating,funding_sources,on_demand_vehicles_at_max_service,vehicles_at_max_service,number_of_state_counties,uza_name,density,number_of_counties_with_service,state_admin_funds_expended,service_area_sq_miles,population,service_area_pop,subrecipient_type,primary_uza,reporter_type,organization_type,voms_pt,voms_do,year,route_primary_direction
0,014d0998350083249a9eb310635548c2,1.0,all_day,45.0,0.31,8,0.33,0.0,1.0,0.0,1.0,0.0,0.0,2023-10-11,rapid,0,0,,,0,0,0,0,0,0,,,,,,,,,schedule_only,,SLO Schedule,Broad/Airport/Johnson,1B,1B Broad/Airport/Johnson,10866826,recRIGtFV5uxrTx05,aHR0cHM6Ly9hcHAubWVjYXRyYW4uY29tL3VyYi93cy9mZWVkL2MybDBaVDF6Ykc5MGNtRnVjMmwwTzJOc2FXVnVkRDF6Wld4bU8yVjRjR2x5WlQwN2RIbHdaVDFuZEdaek8ydGxlVDB6WlRNd016TTFPVFJpTVRFMk56QTBOMkl4TmpRd05qQTBaalF3TUdNek16ZGlNMkUxTVRRMA==,reciakGBN1DP9dK9N,San Luis Obispo Regional Transit Authority,05 - San Luis Obispo,90206,San Luis Obispo,San Luis Obispo,San Luis Obispo,True,True,5307;5311;5339,41.0,41.0,,"San Luis Obispo, CA",4034.0,,,130.0,56904.0,206008.0,,,Full Reporter,Public Agency or Authority of Transit Service,,41.0,2022.0,Northbound
1,014d0998350083249a9eb310635548c2,1.0,all_day,45.0,0.31,8,0.33,0.0,1.0,0.0,1.0,0.0,0.0,2023-11-15,rapid,0,0,,,0,0,0,0,0,0,,,,,,,,,schedule_only,,SLO Schedule,Broad/Airport/Johnson,1B,1B Broad/Airport/Johnson,10866826,recRIGtFV5uxrTx05,aHR0cHM6Ly9hcHAubWVjYXRyYW4uY29tL3VyYi93cy9mZWVkL2MybDBaVDF6Ykc5MGNtRnVjMmwwTzJOc2FXVnVkRDF6Wld4bU8yVjRjR2x5WlQwN2RIbHdaVDFuZEdaek8ydGxlVDB6WlRNd016TTFPVFJpTVRFMk56QTBOMkl4TmpRd05qQTBaalF3TUdNek16ZGlNMkUxTVRRMA==,reciakGBN1DP9dK9N,San Luis Obispo Regional Transit Authority,05 - San Luis Obispo,90206,San Luis Obispo,San Luis Obispo,San Luis Obispo,True,True,5307;5311;5339,41.0,41.0,,"San Luis Obispo, CA",4034.0,,,130.0,56904.0,206008.0,,,Full Reporter,Public Agency or Authority of Transit Service,,41.0,2022.0,Northbound


### Find frequent routes
* Try to figure out where `frequency` is calculated
    * `gtfs_funnel/schedule_stats_by_route`

In [37]:
RT_SCHED_GCS

'gs://calitp-analytics-data/data-analyses/rt_vs_schedule/'

In [38]:
GTFS_DATA_DICT.rt_vs_schedule_tables.sched_trip_metrics

'schedule_trip/schedule_trip_metrics'

In [39]:
july_trip_metrics = pd.read_parquet(
    "gs://calitp-analytics-data/data-analyses/rt_vs_schedule/schedule_trip/schedule_trip_metrics_2024-07-17.parquet"
)

In [40]:
july_trip_metrics.head(2)

Unnamed: 0,schedule_gtfs_dataset_key,trip_instance_key,median_stop_meters,time_of_day,scheduled_service_minutes,route_id,direction_id
0,0139b1253130b33adcd4b3a4490530d2,0037af189a7c9b56c25838f7dde0c9a3,381.67,Midday,40.0,T2,
1,0139b1253130b33adcd4b3a4490530d2,034760aa2aaacfe37245acfbeb167c2d,585.05,PM Peak,22.0,D3,


In [41]:
route_group_merge_cols = ["schedule_gtfs_dataset_key", "route_id", "direction_id"]

In [42]:
service_freq_df = gtfs_schedule_wrangling.aggregate_time_of_day_to_peak_offpeak(
    july_trip_metrics, route_group_merge_cols, long_or_wide="long"
)

In [43]:
service_freq_df.head(2)

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id,n_trips,time_period,frequency
0,015d67d5b75b5cf2b710bbadadfb75f5,17,0.0,22,all_day,0.92
1,015d67d5b75b5cf2b710bbadadfb75f5,17,1.0,22,all_day,0.92


In [44]:
GTFS_DATA_DICT.rt_vs_schedule_tables.vp_route_direction_metrics

'vp_route_dir/route_direction_metrics'

In [52]:
ROUTE_DIR_EXPORT = GTFS_DATA_DICT.rt_vs_schedule_tables.sched_route_direction_metrics

In [53]:
ROUTE_DIR_EXPORT

'schedule_route_dir/schedule_route_direction_metrics'

In [55]:
july_dir_export = pd.read_parquet(
    "gs://calitp-analytics-data/data-analyses/rt_vs_schedule/schedule_route_dir/schedule_route_direction_metrics_2024-07-17.parquet"
)

In [57]:
july_dir_export.head(2).drop(columns=["geometry"])

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id,common_shape_id,route_name,avg_scheduled_service_minutes,avg_stop_miles,n_trips,time_period,frequency,is_coverage,is_downtown_local,is_local,is_rapid,is_express,is_rail,route_primary_direction
0,c00acf594cd3612865b836c9cef64e2e,1X,1.0,shp-1X-01,Line 1X,37.14,0.23,14,all_day,0.58,0.0,1.0,0.0,0.0,0.0,0.0,Eastbound
1,c00acf594cd3612865b836c9cef64e2e,1X,1.0,shp-1X-01,Line 1X,37.14,0.23,6,offpeak,0.38,0.0,1.0,0.0,0.0,0.0,0.0,Eastbound


#### Use `merge_data.concatenate_schedule_by_route_direction()`

In [58]:
import merge_data

In [61]:
route_dir = merge_data.concatenate_schedule_by_route_direction(analysis_date_list)

In [62]:
route_dir.head(2)

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id,time_period,route_primary_direction,avg_scheduled_service_minutes,avg_stop_miles,n_scheduled_trips,frequency,is_express,is_rapid,is_rail,is_coverage,is_downtown_local,is_local,service_date
0,014d0998350083249a9eb310635548c2,10866826,1.0,all_day,Northbound,45.0,0.31,8,0.33,0.0,1.0,0.0,1.0,0.0,0.0,2023-10-11
1,014d0998350083249a9eb310635548c2,10866826,1.0,all_day,Northbound,45.0,0.31,8,0.33,0.0,1.0,0.0,1.0,0.0,0.0,2023-11-15


In [63]:
route_dir["frequency_in_minutes"] = 60 / route_dir.frequency

In [67]:
subset = [
    "schedule_gtfs_dataset_key",
    "route_id",
    "direction_id",
    "route_primary_direction",
    "service_date",
    "frequency_in_minutes",
]

In [68]:
route_dir2 = route_dir[subset]

In [69]:
route_dir2.head(2)

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id,route_primary_direction,service_date,frequency_in_minutes
0,014d0998350083249a9eb310635548c2,10866826,1.0,Northbound,2023-10-11,181.82
1,014d0998350083249a9eb310635548c2,10866826,1.0,Northbound,2023-11-15,181.82


In [70]:
route_dir2.frequency_in_minutes.describe()

count   166802.00
mean       202.54
std        316.19
min          4.00
25%         38.46
50%         80.00
75%        193.55
max       1500.00
Name: frequency_in_minutes, dtype: float64

In [72]:
route_dir3 = route_dir2.loc[route_dir2.frequency_in_minutes <= 10]

In [109]:
route_dir3.frequency_in_minutes.describe()

count   1995.00
mean       8.01
std        1.61
min        4.00
25%        6.96
50%        8.13
75%        9.40
max       10.00
Name: frequency_in_minutes, dtype: float64

In [73]:
len(route_dir3), len(route_dir2)

(1995, 166802)

In [74]:
route_dir3.route_id.nunique()

123

In [75]:
route_dir3.schedule_gtfs_dataset_key.nunique()

22

In [76]:
route_dir3.head(2)

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id,route_primary_direction,service_date,frequency_in_minutes
1812,0666caf3ec1ecc96b74f4477ee4bc939,16-13172,0.0,Eastbound,2024-05-22,8.94
1813,0666caf3ec1ecc96b74f4477ee4bc939,16-13172,0.0,Eastbound,2024-06-12,8.94


In [88]:
operators_with_high_frequency_routes = list(route_dir3.schedule_gtfs_dataset_key.unique())

### Look at scheduled trips

In [93]:
TABLE = GTFS_DATA_DICT.schedule_downloads.trips

In [94]:
FILE = f"{COMPILED_CACHED_VIEWS}{TABLE}_{july_date}.parquet"

In [95]:
trips = pd.read_parquet(FILE)

In [96]:
trips.head(2)

Unnamed: 0,feed_key,gtfs_dataset_key,name,regional_feed_type,service_date,trip_start_date_pacific,trip_id,trip_instance_key,route_key,route_id,route_type,route_short_name,route_long_name,route_desc,direction_id,shape_array_key,shape_id,trip_first_departure_datetime_pacific,trip_last_arrival_datetime_pacific,service_hours,trip_start_date_local_tz,trip_first_departure_datetime_local_tz,trip_last_arrival_datetime_local_tz
0,1b40eed2d0435ee6a620f0d31688ce64,1770249a5a2e770ca90628434d4934b1,VCTC GMV Schedule,Combined Regional Feed,2024-07-17,2024-07-17,139-887,2d929fcdddcc3d5863c14250a9db3b87,58abd0536184591223fd0c2e3d2b3628,3408,3,Route 21,Route 21,PACIFIC VIEW MALL via VICTORIA AVE,0.0,2d525bcdcc47d88c6ff5b7aa5ed3a0bc,8270,2024-07-17 08:45:00,2024-07-17 09:32:00,0.78,2024-07-17,2024-07-17 08:45:00,2024-07-17 09:32:00
1,1b40eed2d0435ee6a620f0d31688ce64,1770249a5a2e770ca90628434d4934b1,VCTC GMV Schedule,Combined Regional Feed,2024-07-17,2024-07-17,139-372,ed8a7e444c378b5f6911f85d33df46f7,58abd0536184591223fd0c2e3d2b3628,3408,3,Route 21,Route 21,PACIFIC VIEW MALL via VICTORIA AVE,0.0,2d525bcdcc47d88c6ff5b7aa5ed3a0bc,8270,2024-07-17 14:45:00,2024-07-17 15:35:00,0.83,2024-07-17,2024-07-17 14:45:00,2024-07-17 15:35:00


In [101]:
frequent_routes = list(route_dir3.route_id.unique())

In [110]:
trips2 = trips.loc[trips.route_id.isin(frequent_routes )]

In [112]:
len(trips2)

21157

In [131]:
trip_instance_key = trips2[['gtfs_dataset_key','route_id','trip_instance_key','shape_array_key','feed_key']].drop_duplicates()

In [132]:
trip_instance_key = trip_instance_key.rename(columns = {'gtfs_dataset_key':'schedule_gtfs_dataset_key'})

### Look at scheduled stops
* This took forever to load.
* Use trip instance key to figure out which route this is? 
* Where is the column that tells me what time the bus should arrive at the stop?

In [84]:
july_scheduled_stops = helpers.import_scheduled_stop_times(july_date,
                                    get_pandas=True,
                                   with_direction = True)

In [86]:
len(july_scheduled_stops)

3400860

In [87]:
july_scheduled_stops.head(1)

Unnamed: 0,feed_key,stop_id,stop_sequence,schedule_gtfs_dataset_key,trip_instance_key,shape_array_key,stop_name,geometry,prior_stop_sequence,subseq_stop_sequence,stop_pair,stop_pair_name,stop_primary_direction,stop_meters
0,67e66865189900c114b39a6579eb51bf,360379,0,bff13f8993ff18e43577db1f5596e014,000084643c3504d0e0815be35c3c883d,c6b4d32d39151d9a513c5ad32acd1d6a,Transpo (small),POINT (-43138.178 -79349.544),,1,360379__360107,Transpo (small)__G St @ East Campus (northbound),Unknown,


In [89]:
july_scheduled_stops2 = july_scheduled_stops.loc[july_scheduled_stops.schedule_gtfs_dataset_key.isin(operators_with_high_frequency_routes)]

In [91]:
len(july_scheduled_stops)- len(july_scheduled_stops2)

1482682

In [116]:
july_scheduled_stops.head(2)

Unnamed: 0,feed_key,stop_id,stop_sequence,schedule_gtfs_dataset_key,trip_instance_key,shape_array_key,stop_name,geometry,prior_stop_sequence,subseq_stop_sequence,stop_pair,stop_pair_name,stop_primary_direction,stop_meters
0,67e66865189900c114b39a6579eb51bf,360379,0,bff13f8993ff18e43577db1f5596e014,000084643c3504d0e0815be35c3c883d,c6b4d32d39151d9a513c5ad32acd1d6a,Transpo (small),POINT (-43138.178 -79349.544),,1,360379__360107,Transpo (small)__G St @ East Campus (northbound),Unknown,
1,67e66865189900c114b39a6579eb51bf,360107,1,bff13f8993ff18e43577db1f5596e014,000084643c3504d0e0815be35c3c883d,c6b4d32d39151d9a513c5ad32acd1d6a,G St @ East Campus (northbound),POINT (-42015.205 -79466.737),0.0,2,360107__360133,G St @ East Campus (northbound)__G St @ Olive (northbound),Eastbound,1129.07


In [127]:
trip_instance_key.head(2)

Unnamed: 0,schedule_gtfs_key,route_id,trip_instance_key,shape_array_key,feed_key
585,bff13f8993ff18e43577db1f5596e014,24,358c4a12b94fccf348407c47b18e0f3b,98b7db927385c715225d4d07904c6acf,67e66865189900c114b39a6579eb51bf
586,bff13f8993ff18e43577db1f5596e014,24,e707252a1ae723b49081cd29b628de69,98b7db927385c715225d4d07904c6acf,67e66865189900c114b39a6579eb51bf


In [133]:
july_scheduled_stops2 = pd.merge(july_scheduled_stops, trip_instance_key,
        on = ["schedule_gtfs_dataset_key","trip_instance_key",'feed_key','shape_array_key','feed_key'],
        how = "inner")

In [134]:
july_scheduled_stops2.head(2)

Unnamed: 0,feed_key,stop_id,stop_sequence,schedule_gtfs_dataset_key,trip_instance_key,shape_array_key,stop_name,geometry,prior_stop_sequence,subseq_stop_sequence,stop_pair,stop_pair_name,stop_primary_direction,stop_meters,route_id
0,3de56f52621869e0f5d56d999fe7a500,17915,1,7cc0cb1871dfd558f11a2885c145d144,000239aa52624c26eb43ff57c0ed0e29,4cf439c717856c5d1420264448ee11b0,Transit Center Bay C,POINT (-210647.492 -22507.796),,2,17915__14725,Transit Center Bay C__Fremont St & Market St,Unknown,,38R
1,3de56f52621869e0f5d56d999fe7a500,14725,2,7cc0cb1871dfd558f11a2885c145d144,000239aa52624c26eb43ff57c0ed0e29,4cf439c717856c5d1420264448ee11b0,Fremont St & Market St,POINT (-210876.427 -22314.298),1.0,3,14725__15689,Fremont St & Market St__Market St & Sansome St,Westbound,299.75,38R


### Look at scheduled stoptimes 
* Where's stop_pair name? 
* Which one should I use?
* I'm having trouble finding a dataframe that will tell me what time the bus should arrive at a stop.

In [152]:
TABLE = GTFS_DATA_DICT.rt_vs_schedule_tables.stop_times_direction
FILE = f"{RT_SCHED_GCS}{TABLE}_{july_date}.parquet"

In [135]:
# stop_times = helpers.import_scheduled_stop_times(july_date, get_pandas = True)

In [153]:
stop_times = pd.read_parquet(FILE)

In [156]:
stop_times.shape

(3400860, 14)

In [157]:
stop_times.sample(2)

Unnamed: 0,feed_key,stop_id,stop_sequence,schedule_gtfs_dataset_key,trip_instance_key,shape_array_key,stop_name,geometry,prior_stop_sequence,subseq_stop_sequence,stop_pair,stop_pair_name,stop_primary_direction,stop_meters
1342580,9b19da84c9b35a4624b5afbddb56dc5d,63505,28,fb467982dcc77a7f9199bebe709bb700,64cff63866002fbd4dfdf73d188261d8,78d25a28d5c1749b9a9b08923a254efb,Jackson & Bambi,b'\x01\x01\x00\x00\x00\xfd\xaf\x0f!e\xd4\x03\xc1\x00\xf2\xc7\x9e\xcd\xaa\xf1\xc0',27.0,29,63505__63506,Jackson & Bambi__Jackson & Kammerer,Northbound,515.92
2426859,f4098e44e7274f77da6e5f4c7ea6adb4,89024,1,baeeb157e85a901e47b828ef9fe75091,b6eb29c2cab2400ee3377750cb1287be,60fb5f1dc42cf64a16cff2817e1ab59d,Otay Mesa Transit Center,"b'\x01\x01\x00\x00\x00\x04\xd5\x10\x92\x83\x90\x11A(\xb0\xd7i\x14^""\xc1'",,2,89024__60535,Otay Mesa Transit Center__Siempre Viva Rd & Otay Center Dr,Unknown,


In [159]:
stop_times.columns

Index(['feed_key', 'stop_id', 'stop_sequence', 'schedule_gtfs_dataset_key',
       'trip_instance_key', 'shape_array_key', 'stop_name', 'geometry',
       'prior_stop_sequence', 'subseq_stop_sequence', 'stop_pair',
       'stop_pair_name', 'stop_primary_direction', 'stop_meters'],
      dtype='object')

In [None]:
stop_times['always_matches2'] = stop_times.departure_sec - stop_times.arrival_sec

In [None]:
stop_times.always_matches.value_counts().head()

In [None]:
stop_times.loc[stop_times.always_matches2 == 60].sample(3)

In [None]:
stop_times.always_matches2.value_counts().head()

In [160]:
TABLE2 = GTFS_DATA_DICT.schedule_downloads.stop_times
FILE2 = f"{COMPILED_CACHED_VIEWS}{TABLE2}_{july_date}.parquet"

In [162]:
stop_times2 = pd.read_parquet(FILE2)

In [163]:
stop_times2.columns

Index(['feed_key', 'feed_timezone', 'base64_url', 'trip_id', 'stop_id',
       'stop_sequence', 'timepoint', 'arrival_sec', 'departure_sec',
       'arrival_hour', 'departure_hour'],
      dtype='object')

### `gtfs_funnel/stop_times_with_direction`

In [166]:
RT_SCHED_GCS

'gs://calitp-analytics-data/data-analyses/rt_vs_schedule/'

In [165]:
GTFS_DATA_DICT.rt_vs_schedule_tables.stop_times_direction

'stop_times_direction'

In [167]:
july_gtfs_funnel = pd.read_parquet("gs://calitp-analytics-data/data-analyses/rt_vs_schedule/stop_times_direction_2024-07-17.parquet")

In [168]:
july_gtfs_funnel.head(2)

Unnamed: 0,feed_key,stop_id,stop_sequence,schedule_gtfs_dataset_key,trip_instance_key,shape_array_key,stop_name,geometry,prior_stop_sequence,subseq_stop_sequence,stop_pair,stop_pair_name,stop_primary_direction,stop_meters
0,67e66865189900c114b39a6579eb51bf,360379,0,bff13f8993ff18e43577db1f5596e014,000084643c3504d0e0815be35c3c883d,c6b4d32d39151d9a513c5ad32acd1d6a,Transpo (small),b'\x01\x01\x00\x00\x00\x04\x18(\xafE\x10\xe5\xc0 ds\xb2X_\xf3\xc0',,1,360379__360107,Transpo (small)__G St @ East Campus (northbound),Unknown,
1,67e66865189900c114b39a6579eb51bf,360107,1,bff13f8993ff18e43577db1f5596e014,000084643c3504d0e0815be35c3c883d,c6b4d32d39151d9a513c5ad32acd1d6a,G St @ East Campus (northbound),b'\x01\x01\x00\x00\x00\xda4V\x8f\xe6\x83\xe4\xc0\x00]5\xcb\xabf\xf3\xc0',0.0,2,360107__360133,G St @ East Campus (northbound)__G St @ Olive (northbound),Eastbound,1129.07
