In [10]:
import geopandas as gpd
import pandas as pd
from segment_speed_utils import gtfs_schedule_wrangling, helpers
from shared_utils import catalog_utils, rt_dates, rt_utils
from update_vars import GTFS_DATA_DICT, RT_SCHED_GCS, SCHED_GCS

# https://github.com/cal-itp/data-analyses/blob/main/_shared_utils/shared_utils/gtfs_analytics_data.yml
GTFS_DATA_DICT = catalog_utils.get_catalog("gtfs_analytics_data")

from segment_speed_utils.project_vars import (GTFS_DATA_DICT,
                                              SEGMENT_GCS, 
                                              COMPILED_CACHED_VIEWS,
                                              RT_SCHED_GCS, 
                                              SCHED_GCS,
                                              PROJECT_CRS)

In [11]:
pd.options.display.max_columns = 100
pd.options.display.float_format = "{:.2f}".format
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [12]:
analysis_date_list = rt_dates.y2024_dates + rt_dates.y2023_dates

In [13]:
analysis_date_list

['2024-01-17',
 '2024-02-14',
 '2024-03-13',
 '2024-04-17',
 '2024-05-22',
 '2024-06-12',
 '2024-07-17',
 '2023-03-15',
 '2023-04-12',
 '2023-05-17',
 '2023-06-14',
 '2023-07-12',
 '2023-08-15',
 '2023-09-13',
 '2023-10-11',
 '2023-11-15',
 '2023-12-13']

In [14]:
may_date = '2024-05-22'

### Find frequent routes
* Try to figure out where `frequency` is calculated
    * `gtfs_funnel/schedule_stats_by_route`

In [15]:
RT_SCHED_GCS

'gs://calitp-analytics-data/data-analyses/rt_vs_schedule/'

In [16]:
GTFS_DATA_DICT.rt_vs_schedule_tables.sched_trip_metrics

'schedule_trip/schedule_trip_metrics'

In [17]:
trip_metrics = pd.read_parquet(
    "gs://calitp-analytics-data/data-analyses/rt_vs_schedule/schedule_trip/schedule_trip_metrics_2024-05-22.parquet"
)

In [18]:
trip_metrics.head(2)

Unnamed: 0,schedule_gtfs_dataset_key,trip_instance_key,median_stop_meters,time_of_day,scheduled_service_minutes,route_id,direction_id
0,0139b1253130b33adcd4b3a4490530d2,0077e9925280810fd23d6f594b4c2d67,719.38,Early AM,71.0,e430d571-76bd-45d4-8b01-76e3ef7c3ae1,
1,0139b1253130b33adcd4b3a4490530d2,00bdede5b26827d02eaf1e91a60cff87,315.95,PM Peak,30.0,T3,


In [19]:
route_group_merge_cols = ["schedule_gtfs_dataset_key", "route_id", "direction_id"]

In [20]:
service_freq_df = gtfs_schedule_wrangling.aggregate_time_of_day_to_peak_offpeak(
    trip_metrics, route_group_merge_cols, long_or_wide="long"
)

In [21]:
service_freq_df.head(2)

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id,n_trips,time_period,frequency
0,015d67d5b75b5cf2b710bbadadfb75f5,17,0.0,22,all_day,0.92
1,015d67d5b75b5cf2b710bbadadfb75f5,17,1.0,22,all_day,0.92


In [22]:
GTFS_DATA_DICT.rt_vs_schedule_tables.vp_route_direction_metrics

'vp_route_dir/route_direction_metrics'

In [23]:
ROUTE_DIR_EXPORT = GTFS_DATA_DICT.rt_vs_schedule_tables.sched_route_direction_metrics

In [24]:
ROUTE_DIR_EXPORT

'schedule_route_dir/schedule_route_direction_metrics'

In [25]:
may_dir_export = pd.read_parquet(
    "gs://calitp-analytics-data/data-analyses/rt_vs_schedule/schedule_route_dir/schedule_route_direction_metrics_2024-05-22.parquet"
)

In [26]:
may_dir_export.head(2).drop(columns=["geometry"])

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id,common_shape_id,route_name,avg_scheduled_service_minutes,avg_stop_miles,n_trips,time_period,frequency,is_coverage,is_downtown_local,is_local,is_rapid,is_express,is_rail,route_primary_direction
0,e359e3617344263ad00858db2149a288,6,1.0,p_178727,,24.0,0.21,25,all_day,1.04,1.0,0.0,0.0,1.0,0.0,0.0,Northbound
1,e359e3617344263ad00858db2149a288,6,1.0,p_178727,,24.0,0.21,11,offpeak,0.69,1.0,0.0,0.0,1.0,0.0,0.0,Northbound


#### Use `merge_data.concatenate_schedule_by_route_direction()`

In [27]:
import merge_data

In [28]:
route_dir = merge_data.concatenate_schedule_by_route_direction([may_date])

In [29]:
route_dir.head(2)

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id,time_period,route_primary_direction,avg_scheduled_service_minutes,avg_stop_miles,n_scheduled_trips,frequency,is_express,is_rapid,is_rail,is_coverage,is_downtown_local,is_local,service_date
0,015d67d5b75b5cf2b710bbadadfb75f5,17,0.0,all_day,Northbound,51.77,0.27,22,0.92,0.0,0.0,0.0,0.0,1.0,0.0,2024-05-22
1,015d67d5b75b5cf2b710bbadadfb75f5,17,0.0,offpeak,Northbound,51.77,0.27,10,0.62,0.0,0.0,0.0,0.0,1.0,0.0,2024-05-22


In [30]:
route_dir["frequency_in_minutes"] = 60 / route_dir.frequency

In [31]:
subset = [
    "schedule_gtfs_dataset_key",
    "route_id",
    "direction_id",
    "route_primary_direction",
    "service_date",
    "frequency_in_minutes",
]

In [32]:
route_dir2 = route_dir[subset]

In [33]:
route_dir2.head(2)

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id,route_primary_direction,service_date,frequency_in_minutes
0,015d67d5b75b5cf2b710bbadadfb75f5,17,0.0,Northbound,2024-05-22,65.22
1,015d67d5b75b5cf2b710bbadadfb75f5,17,0.0,Northbound,2024-05-22,96.77


In [34]:
route_dir2.frequency_in_minutes.describe()

count   9650.00
mean     203.94
std      317.84
min        4.00
25%       37.97
50%       80.00
75%      193.55
max     1500.00
Name: frequency_in_minutes, dtype: float64

In [35]:
route_dir3 = route_dir2.loc[route_dir2.frequency_in_minutes <= 10]

In [36]:
route_dir3.frequency_in_minutes.describe()

count   122.00
mean      8.05
std       1.60
min       4.00
25%       6.99
50%       8.28
75%       9.38
max      10.00
Name: frequency_in_minutes, dtype: float64

In [37]:
len(route_dir3), len(route_dir2)

(122, 9650)

In [38]:
route_dir3.route_id.nunique()

51

In [39]:
route_dir3.schedule_gtfs_dataset_key.nunique()

9

In [40]:
route_dir3.head(2)

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id,route_primary_direction,service_date,frequency_in_minutes
200,0666caf3ec1ecc96b74f4477ee4bc939,16-13172,0.0,Eastbound,2024-05-22,8.94
202,0666caf3ec1ecc96b74f4477ee4bc939,16-13172,0.0,Eastbound,2024-05-22,6.76


In [41]:
operators_with_high_frequency_routes = list(route_dir3.schedule_gtfs_dataset_key.unique())

### Look at scheduled trips

In [42]:
TABLE = GTFS_DATA_DICT.schedule_downloads.trips

In [43]:
FILE = f"{COMPILED_CACHED_VIEWS}{TABLE}_{may_date}.parquet"

In [44]:
trips = pd.read_parquet(FILE)

In [45]:
trips.head(2)

Unnamed: 0,feed_key,gtfs_dataset_key,name,regional_feed_type,service_date,trip_start_date_pacific,trip_id,trip_instance_key,route_key,route_id,route_type,route_short_name,route_long_name,route_desc,direction_id,shape_array_key,shape_id,trip_first_departure_datetime_pacific,trip_last_arrival_datetime_pacific,service_hours,trip_start_date_local_tz,trip_first_departure_datetime_local_tz,trip_last_arrival_datetime_local_tz
0,926867fdee73d5fbfe4f011871bcd830,1770249a5a2e770ca90628434d4934b1,VCTC GMV Schedule,Combined Regional Feed,2024-05-22,2024-05-22,139-555,c256553e28c4bba693e3136240b35419,9465bf51581b5592ffa90617975dbf7f,3408,3,Route 21,Route 21,PACIFIC VIEW MALL via VICTORIA AVE,0.0,8f644f847e987de68e0cb6fcd339cf41,8270,2024-05-22 07:10:00,2024-05-22 08:01:00,0.85,2024-05-22,2024-05-22 07:10:00,2024-05-22 08:01:00
1,926867fdee73d5fbfe4f011871bcd830,1770249a5a2e770ca90628434d4934b1,VCTC GMV Schedule,Combined Regional Feed,2024-05-22,2024-05-22,139-1167,488e9e227288606249d0508961c0fa15,9465bf51581b5592ffa90617975dbf7f,3408,3,Route 21,Route 21,PACIFIC VIEW MALL via VICTORIA AVE,0.0,8f644f847e987de68e0cb6fcd339cf41,8270,2024-05-22 07:45:00,2024-05-22 08:36:00,0.85,2024-05-22,2024-05-22 07:45:00,2024-05-22 08:36:00


In [46]:
frequent_routes = list(route_dir3.route_id.unique())

In [47]:
trips2 = trips.loc[trips.route_id.isin(frequent_routes )]

In [48]:
len(trips2)

16393

In [49]:
trip_instance_key = trips2[['gtfs_dataset_key','route_id','trip_instance_key','shape_array_key','feed_key']].drop_duplicates()

In [50]:
trip_instance_key = trip_instance_key.rename(columns = {'gtfs_dataset_key':'schedule_gtfs_dataset_key'})

### Look at scheduled stops
* This took forever to load.
* Use trip instance key to figure out which route this is? 
* Where is the column that tells me what time the bus should arrive at the stop?

In [51]:
may_scheduled_stops = helpers.import_scheduled_stop_times(may_date,
                                    get_pandas=True,
                                   with_direction = True)

In [52]:
len(may_scheduled_stops)

3413933

In [53]:
may_scheduled_stops.head(1)

Unnamed: 0,feed_key,stop_id,stop_sequence,schedule_gtfs_dataset_key,trip_instance_key,shape_array_key,stop_name,geometry,prior_stop_sequence,subseq_stop_sequence,stop_pair,stop_pair_name,stop_primary_direction,stop_meters
0,7f69c2fdaa134642f14064a0b64d1495,16063,1,7cc0cb1871dfd558f11a2885c145d144,000027078253a63212042c7ef413c8b6,813abf5a0645e066e49ff15b20aa94cc,Powell St & Market St,POINT (-211733.946 -23065.083),,2,16063__16068,Powell St & Market St__Powell St & O'Farrell St,Unknown,


In [54]:
may_scheduled_stops2 = may_scheduled_stops.loc[may_scheduled_stops.schedule_gtfs_dataset_key.isin(operators_with_high_frequency_routes)]

In [55]:
len(may_scheduled_stops)- len(may_scheduled_stops2)

1804104

In [57]:
trip_instance_key.head(2)

Unnamed: 0,schedule_gtfs_dataset_key,route_id,trip_instance_key,shape_array_key,feed_key
574,bff13f8993ff18e43577db1f5596e014,24,29eac2f21d4654489fac796538cd81f3,98b7db927385c715225d4d07904c6acf,67e66865189900c114b39a6579eb51bf
575,bff13f8993ff18e43577db1f5596e014,24,d093b27a506d3f5df17a3f9d902b33e6,98b7db927385c715225d4d07904c6acf,67e66865189900c114b39a6579eb51bf


In [59]:
may_scheduled_stops3 = pd.merge(may_scheduled_stops2, trip_instance_key,
        on = ["schedule_gtfs_dataset_key","trip_instance_key",'feed_key','shape_array_key','feed_key'],
        how = "inner")

In [60]:
may_scheduled_stops3.head(2)

Unnamed: 0,feed_key,stop_id,stop_sequence,schedule_gtfs_dataset_key,trip_instance_key,shape_array_key,stop_name,geometry,prior_stop_sequence,subseq_stop_sequence,stop_pair,stop_pair_name,stop_primary_direction,stop_meters,route_id
0,7f69c2fdaa134642f14064a0b64d1495,16063,1,7cc0cb1871dfd558f11a2885c145d144,000027078253a63212042c7ef413c8b6,813abf5a0645e066e49ff15b20aa94cc,Powell St & Market St,POINT (-211733.946 -23065.083),,2,16063__16068,Powell St & Market St__Powell St & O'Farrell St,Unknown,,PH
1,7f69c2fdaa134642f14064a0b64d1495,16068,2,7cc0cb1871dfd558f11a2885c145d144,000027078253a63212042c7ef413c8b6,813abf5a0645e066e49ff15b20aa94cc,Powell St & O'Farrell St,POINT (-211751.430 -22884.046),1.0,3,16068__16058,Powell St & O'Farrell St__Powell St & Geary St,Northbound,181.88,PH


### Look at scheduled stoptimes 
* Where's stop_pair name? 
* Which one should I use?
* I'm having trouble finding a dataframe that will tell me what time the bus should arrive at a stop.

In [61]:
TABLE = GTFS_DATA_DICT.rt_vs_schedule_tables.stop_times_direction
FILE = f"{RT_SCHED_GCS}{TABLE}_{may_date}.parquet"

In [65]:
stop_times = helpers.import_scheduled_stop_times(may_date, get_pandas = True)

In [66]:
stop_times.sample(2)

Unnamed: 0,feed_key,feed_timezone,base64_url,trip_id,stop_id,stop_sequence,timepoint,arrival_sec,departure_sec,arrival_hour,departure_hour
2204527,608992664173210532aa3e6cc573be2f,America/Los_Angeles,aHR0cHM6Ly9naXRsYWIuY29tL0xBQ01UQS9ndGZzX2J1cy9yYXcvbWFzdGVyL2d0ZnNfYnVzLnppcA==,10002011241301-DEC23,16320,38,0.0,49740.0,49740.0,13.0,13.0
3450749,608992664173210532aa3e6cc573be2f,America/Los_Angeles,aHR0cHM6Ly9naXRsYWIuY29tL0xBQ01UQS9ndGZzX2J1cy9yYXcvbWFzdGVyL2d0ZnNfYnVzLnppcA==,10092003031522-DEC23,10788,32,0.0,57420.0,57420.0,15.0,15.0


### `rt_scheduled_v_ran/scripts/rt_stop_times.py`
* Tiffany already combined realtime and scheduled arrivals

In [68]:
RT_SCHED_GCS

'gs://calitp-analytics-data/data-analyses/rt_vs_schedule/'

In [67]:
GTFS_DATA_DICT.rt_vs_schedule_tables.schedule_rt_stop_times

'schedule_rt_stop_times'

In [81]:
rt_stop_times = pd.read_parquet("gs://calitp-analytics-data/data-analyses/rt_vs_schedule/schedule_rt_stop_times_2024-05-22.parquet")

In [94]:
len(rt_stop_times)

2804661

In [95]:
rt_stop_times.columns

Index(['trip_id', 'stop_id', 'stop_sequence', 'scheduled_arrival_sec',
       'schedule_gtfs_dataset_key', 'trip_instance_key', 'rt_arrival_sec',
       'sched_arrival_min', 'rt_arrival_min', 'sched_arrival_hr',
       'rt_arrival_hr', 'sched_arrival_time'],
      dtype='object')

In [96]:
trip_instance_key.head(1)

Unnamed: 0,schedule_gtfs_dataset_key,route_id,trip_instance_key,shape_array_key,feed_key
574,bff13f8993ff18e43577db1f5596e014,24,29eac2f21d4654489fac796538cd81f3,98b7db927385c715225d4d07904c6acf,67e66865189900c114b39a6579eb51bf


In [97]:
# Get only relevant trips that are of "frequent routes"
pd.merge(rt_stop_times,
        trip_instance_key,
        on = ['schedule_gtfs_dataset_key', 'trip_instance_key', ],
        how="outer",
        indicator = True)[['_merge']].value_counts()

_merge    
left_only     2262953
both           541708
right_only       2861
dtype: int64

In [99]:
rt_stop_times2 = pd.merge(rt_stop_times,
        trip_instance_key,
        on = ['schedule_gtfs_dataset_key', 'trip_instance_key', ],
        how="inner")

In [71]:
# How to use scheduled_arrival and rt_arrival

In [100]:
rt_stop_times2 = rt_stop_times2.assign(
            sched_arrival_min=pd.to_datetime(rt_stop_times2.scheduled_arrival_sec, unit="s").dt.minute,
            rt_arrival_min=pd.to_datetime(rt_stop_times2.rt_arrival_sec, unit="s").dt.minute,
            sched_arrival_hr=pd.to_datetime(rt_stop_times2.scheduled_arrival_sec, unit="s").dt.hour,
            rt_arrival_hr=pd.to_datetime(rt_stop_times2.rt_arrival_sec, unit="s").dt.hour,
        )


In [102]:
rt_stop_times2.sched_arrival_hr  = rt_stop_times2.sched_arrival_hr.fillna(0).astype(int)

In [103]:
rt_stop_times2.sched_arrival_min  = rt_stop_times2.sched_arrival_min.fillna(0).astype(int)

In [121]:
rt_stop_times2 = rt_stop_times2.assign(
    sched_arrival_time=pd.to_datetime(rt_stop_times2.sched_arrival_hr.astype(str) + ':' + rt_stop_times2.sched_arrival_min.astype(str), format='%H:%M').dt.time
)

In [107]:
rt_stop_times2.rt_arrival_hr  = rt_stop_times2.rt_arrival_hr.fillna(0).astype(int)

In [108]:
rt_stop_times2.rt_arrival_min  = rt_stop_times2.rt_arrival_min.fillna(0).astype(int)

In [120]:
rt_stop_times2 = rt_stop_times2.assign(
    rt_arrival_time=pd.to_datetime(rt_stop_times2.rt_arrival_hr.astype(str) + ':' + rt_stop_times2.rt_arrival_min.astype(str), format='%H:%M').dt.time
)

In [122]:
rt_stop_times2.sample(5)

Unnamed: 0,trip_id,stop_id,stop_sequence,scheduled_arrival_sec,schedule_gtfs_dataset_key,trip_instance_key,rt_arrival_sec,sched_arrival_min,rt_arrival_min,sched_arrival_hr,rt_arrival_hr,sched_arrival_time,route_id,shape_array_key,feed_key,rt_arrival_time
310075,10051003551941-DEC23,258,43,73320.0,0666caf3ec1ecc96b74f4477ee4bc939,ca8efff5e50ea9cca579b292dd30b48d,73345,22,22,20,20,20:22:00,51-13172,0314d8368e4f695949837e289b644d3e,608992664173210532aa3e6cc573be2f,20:22:00
299830,10060001601759-DEC23,3004,13,65520.0,0666caf3ec1ecc96b74f4477ee4bc939,34d083e16c303ef55653349fe319d5d9,65486,12,11,18,18,18:12:00,60-13172,98fbbbc43763f5016cdb30497a051410,608992664173210532aa3e6cc573be2f,18:11:00
354569,10016004392416-DEC23,4735,49,90120.0,0666caf3ec1ecc96b74f4477ee4bc939,662caf0e84528cd957537d5172105a75,3828,2,3,1,1,01:02:00,16-13172,4db83b86ad3aa1e6dc0924ffc439091b,608992664173210532aa3e6cc573be2f,01:03:00
462929,11486534_M31,15621,32,101820.0,7cc0cb1871dfd558f11a2885c145d144,1fe870eec5aea7845e01dfc0a7e8f113,15895,17,24,4,4,04:17:00,14,9c51f5391578cf13adc10ed131063ddf,7f69c2fdaa134642f14064a0b64d1495,04:24:00
429466,11490156_M31,17768,6,41763.0,7cc0cb1871dfd558f11a2885c145d144,4ac16b19bf3b14aa986c39019982072a,41686,36,34,11,11,11:36:00,22,3c9fd7a0a26e2a9dd9c0a22ffe55dec4,7f69c2fdaa134642f14064a0b64d1495,11:34:00


In [123]:
len(rt_stop_times2)

541708

In [130]:
one_route = rt_stop_times2.loc[rt_stop_times2.route_id == "14"]

In [131]:
one_route = one_route.loc[one_route.schedule_gtfs_dataset_key == "7cc0cb1871dfd558f11a2885c145d144"]

In [136]:
one_route.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11982 entries, 402713 to 541302
Data columns (total 16 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   trip_id                    11982 non-null  object 
 1   stop_id                    11982 non-null  object 
 2   stop_sequence              11982 non-null  int64  
 3   scheduled_arrival_sec      11982 non-null  float64
 4   schedule_gtfs_dataset_key  11982 non-null  object 
 5   trip_instance_key          11982 non-null  object 
 6   rt_arrival_sec             11982 non-null  int64  
 7   sched_arrival_min          11982 non-null  int64  
 8   rt_arrival_min             11982 non-null  int64  
 9   sched_arrival_hr           11982 non-null  int64  
 10  rt_arrival_hr              11982 non-null  int64  
 11  sched_arrival_time         11982 non-null  object 
 12  route_id                   11982 non-null  object 
 13  shape_array_key            11982 non-nul

In [137]:
one_stop = one_route.loc[one_route.stop_sequence == 2]

In [138]:
len(one_stop)

305

In [140]:
one_stop.head(1)

Unnamed: 0,trip_id,stop_id,stop_sequence,scheduled_arrival_sec,schedule_gtfs_dataset_key,trip_instance_key,rt_arrival_sec,sched_arrival_min,rt_arrival_min,sched_arrival_hr,rt_arrival_hr,sched_arrival_time,route_id,shape_array_key,feed_key,rt_arrival_time
402713,11486631_M31,15585,2,40252.0,7cc0cb1871dfd558f11a2885c145d144,443136ba7321bc60b9ceccdfb6c04805,40299,10,11,11,11,11:10:00,14,b45d36e9e45937987ecfdb501fe6ef3b,7f69c2fdaa134642f14064a0b64d1495,11:11:00


In [143]:
one_stop.sort_values(by = ['stop_sequence','sched_arrival_hr','rt_arrival_hr'])[['trip_id','stop_sequence','scheduled_arrival_sec','rt_arrival_sec',]]

Unnamed: 0,trip_id,stop_sequence,scheduled_arrival_sec,rt_arrival_sec
414336,11486564_M31,2,88240.0,1849
426743,11486724_M31,2,86452.0,176
431341,11486611_M31,2,87352.0,1013
433587,11486563_M31,2,89140.0,2908
436647,11486616_M31,2,88252.0,1819
472331,11486594_M31,2,89152.0,2772
475145,11486567_M31,2,86440.0,385
517454,11486565_M31,2,87340.0,770
416951,11486562_M31,2,90040.0,3649
437129,11486593_M31,2,90052.0,4208
