## I tried turning `stop_times` to actual dates but it seems like seconds is easier to manipulate.
* 10_transit_bunching.ipynb contains timestamps attempts
* cd data-analyses/rt_segment_speeds && pip install -r requirements.txt && cd ../_shared_utils && make setup_env && cd ../gtfs_digest
* [Issue](https://github.com/cal-itp/data-analyses/issues/1099)

In [1]:
import geopandas as gpd
import pandas as pd
from segment_speed_utils import gtfs_schedule_wrangling, helpers
from shared_utils import catalog_utils, rt_dates, rt_utils
from update_vars import GTFS_DATA_DICT, RT_SCHED_GCS, SCHED_GCS

# https://github.com/cal-itp/data-analyses/blob/main/_shared_utils/shared_utils/gtfs_analytics_data.yml
GTFS_DATA_DICT = catalog_utils.get_catalog("gtfs_analytics_data")

from segment_speed_utils.project_vars import (
    COMPILED_CACHED_VIEWS,
    GTFS_DATA_DICT,
    PROJECT_CRS,
    RT_SCHED_GCS,
    SCHED_GCS,
    SEGMENT_GCS,
)

In [2]:
pd.options.display.max_columns = 100
pd.options.display.float_format = "{:.2f}".format
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [3]:
may_date = "2024-05-22"

In [4]:
import merge_data

### Get high frequency routes

In [5]:
subset = [
    "schedule_gtfs_dataset_key",
    "route_id",
    "direction_id",
    "route_primary_direction",
    "service_date",
    "frequency",
]

In [6]:
route_dir = merge_data.concatenate_schedule_by_route_direction([may_date])[subset]

In [7]:
route_dir["frequency_in_minutes"] = 60 / route_dir.frequency

In [8]:
route_dir2 = route_dir.loc[route_dir.frequency_in_minutes <= 10]

In [9]:
operators_with_high_frequency_routes = list(
    route_dir2.schedule_gtfs_dataset_key.unique()
)

In [10]:
frequent_routes = list(route_dir2.route_id.unique())

### Get trips of high frequency routes

In [11]:
TABLE = GTFS_DATA_DICT.schedule_downloads.trips

In [12]:
FILE = f"{COMPILED_CACHED_VIEWS}{TABLE}_{may_date}.parquet"

In [13]:
trips_subset = [
    "gtfs_dataset_key",
    "route_id",
    "trip_instance_key",
    "shape_array_key",
    "feed_key",
]

In [14]:
trips = pd.read_parquet(FILE)[trips_subset].rename(
    columns={"gtfs_dataset_key": "schedule_gtfs_dataset_key"}
)

In [15]:
trips_freq_routes = trips.loc[
    trips.route_id.isin(frequent_routes)
    & (trips.schedule_gtfs_dataset_key.isin(operators_with_high_frequency_routes))
]

### Get Stop Times

In [16]:
rt_stop_times = pd.read_parquet(
    "gs://calitp-analytics-data/data-analyses/rt_vs_schedule/schedule_rt_stop_times_2024-05-22.parquet"
)

In [17]:
rt_stop_times2 = pd.merge(
    rt_stop_times,
    trips_freq_routes,
    on=[
        "schedule_gtfs_dataset_key",
        "trip_instance_key",
    ],
    how="inner",
)

###  Some scheduled arrival seconds span longer than a day: filter them out

In [18]:
len(rt_stop_times2.loc[rt_stop_times2.scheduled_arrival_sec > 86400])

15695

In [19]:
rt_stop_times3 = rt_stop_times2.loc[
    rt_stop_times2.scheduled_arrival_sec <= 86400
].reset_index(drop=True)

In [20]:
# Rearrange
rt_stop_times4 = rt_stop_times3.sort_values(
    by=[
        "schedule_gtfs_dataset_key",
        "feed_key",
        "shape_array_key",
        "route_id",
        "stop_sequence",
        "scheduled_arrival_sec",
    ]
).reset_index(drop=True)

In [21]:
def check_delay(df):
    df = df.assign(delay=df.rt_arrival_sec - df.scheduled_arrival_sec)

    print(df.delay.describe(percentiles=[0.05, 0.1, 0.9, 0.95]))

    max_delay_min = df.delay.max() / 60
    p95_delay_min = df.delay.quantile(q=0.95) / 60

    min_delay_min = df.delay.min() / 60
    p5_delay_min = df.delay.quantile(q=0.05) / 60

    print(f"min / max delay (minutes): {min_delay_min}, {max_delay_min}")
    print(f"5th / 95th delay (minutes): {p5_delay_min}, {p95_delay_min}")

    return df

In [22]:
rt_stop_times4 = check_delay(rt_stop_times4)

count   378730.00
mean        48.74
std       3085.71
min     -86400.00
5%        -171.00
10%       -109.00
50%         86.00
90%        513.00
95%        730.00
max      18588.00
Name: delay, dtype: float64
min / max delay (minutes): -1440.0, 309.8
5th / 95th delay (minutes): -2.85, 12.166666666666666


In [23]:
rt_stop_times4.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 378730 entries, 0 to 378729
Data columns (total 11 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   trip_id                    378730 non-null  object 
 1   stop_id                    378730 non-null  object 
 2   stop_sequence              378730 non-null  int64  
 3   scheduled_arrival_sec      378730 non-null  float64
 4   schedule_gtfs_dataset_key  378730 non-null  object 
 5   trip_instance_key          378730 non-null  object 
 6   rt_arrival_sec             378730 non-null  int64  
 7   route_id                   378730 non-null  object 
 8   shape_array_key            378730 non-null  object 
 9   feed_key                   378730 non-null  object 
 10  delay                      378730 non-null  float64
dtypes: float64(2), int64(2), object(7)
memory usage: 31.8+ MB


In [24]:
# Delete out delays that are more than an hour
60*60

3600

In [25]:
# Filter to only delays that are an hour or less
rt_stop_times5 = rt_stop_times4[rt_stop_times4["delay"]  <= 60*60].reset_index(drop = True)

In [26]:
# Filter to only delays that are no less than
rt_stop_times5 = rt_stop_times5[rt_stop_times5["delay"]  >= -3600].reset_index(drop = True)

In [27]:
len(rt_stop_times5), len(rt_stop_times4)

(377944, 378730)

In [28]:
rt_stop_times5.delay.describe()

count   377944.00
mean       157.04
std        315.71
min      -3559.00
25%        -25.00
50%         87.00
75%        263.00
max       3592.00
Name: delay, dtype: float64

In [29]:
rt_stop_times5.columns

Index(['trip_id', 'stop_id', 'stop_sequence', 'scheduled_arrival_sec',
       'schedule_gtfs_dataset_key', 'trip_instance_key', 'rt_arrival_sec',
       'route_id', 'shape_array_key', 'feed_key', 'delay'],
      dtype='object')

In [30]:
rt_stop_times5["actual_headway"] = rt_stop_times5.groupby([
        "schedule_gtfs_dataset_key",
        "feed_key",
        "shape_array_key",
        "route_id",
        "stop_sequence",
        "stop_id"])["rt_arrival_sec"].diff()

In [31]:
rt_stop_times5["schd_headway"] = rt_stop_times5.groupby([
        "schedule_gtfs_dataset_key",
        "feed_key",
        "shape_array_key",
        "route_id",
        "stop_sequence",
        "stop_id"])["scheduled_arrival_sec"].diff()

In [32]:
rt_stop_times5.head(10)

Unnamed: 0,trip_id,stop_id,stop_sequence,scheduled_arrival_sec,schedule_gtfs_dataset_key,trip_instance_key,rt_arrival_sec,route_id,shape_array_key,feed_key,delay,actual_headway,schd_headway
0,10051003551612-DEC23,7383,2,58440.0,0666caf3ec1ecc96b74f4477ee4bc939,05acba0950087c1b3db1ea91b46d63cb,59254,51-13172,0314d8368e4f695949837e289b644d3e,608992664173210532aa3e6cc573be2f,814.0,,
1,10051003551653-DEC23,7383,2,60900.0,0666caf3ec1ecc96b74f4477ee4bc939,c30e53e3d705e2ef2ed047eb814558fb,61455,51-13172,0314d8368e4f695949837e289b644d3e,608992664173210532aa3e6cc573be2f,555.0,2201.0,2460.0
2,10051003551713-DEC23,7383,2,62100.0,0666caf3ec1ecc96b74f4477ee4bc939,9f7e18664145905aefdbe88f4ee6e8a8,62272,51-13172,0314d8368e4f695949837e289b644d3e,608992664173210532aa3e6cc573be2f,172.0,817.0,1200.0
3,10051003551748-DEC23,7383,2,64200.0,0666caf3ec1ecc96b74f4477ee4bc939,6f42b513017077148b6933f735c59350,64334,51-13172,0314d8368e4f695949837e289b644d3e,608992664173210532aa3e6cc573be2f,134.0,2062.0,2100.0
4,10051003551802-DEC23,7383,2,65040.0,0666caf3ec1ecc96b74f4477ee4bc939,ac320c0cac03f52506a093e90db88013,65276,51-13172,0314d8368e4f695949837e289b644d3e,608992664173210532aa3e6cc573be2f,236.0,942.0,840.0
5,10051003551846-DEC23,7383,2,67680.0,0666caf3ec1ecc96b74f4477ee4bc939,9745f9700061ac6485687946af5b334d,68515,51-13172,0314d8368e4f695949837e289b644d3e,608992664173210532aa3e6cc573be2f,835.0,3239.0,2640.0
6,10051003551909-DEC23,7383,2,69060.0,0666caf3ec1ecc96b74f4477ee4bc939,1d8dbfd8568345a08259b3b2033b95da,69076,51-13172,0314d8368e4f695949837e289b644d3e,608992664173210532aa3e6cc573be2f,16.0,561.0,1380.0
7,10051003551941-DEC23,7383,2,70980.0,0666caf3ec1ecc96b74f4477ee4bc939,ca8efff5e50ea9cca579b292dd30b48d,70932,51-13172,0314d8368e4f695949837e289b644d3e,608992664173210532aa3e6cc573be2f,-48.0,1856.0,1920.0
8,10051003551612-DEC23,7408,3,58560.0,0666caf3ec1ecc96b74f4477ee4bc939,05acba0950087c1b3db1ea91b46d63cb,59386,51-13172,0314d8368e4f695949837e289b644d3e,608992664173210532aa3e6cc573be2f,826.0,,
9,10051003551653-DEC23,7408,3,61020.0,0666caf3ec1ecc96b74f4477ee4bc939,c30e53e3d705e2ef2ed047eb814558fb,61491,51-13172,0314d8368e4f695949837e289b644d3e,608992664173210532aa3e6cc573be2f,471.0,2105.0,2460.0


In [33]:
agg1 = (
    rt_stop_times5.groupby(
        [
            "schedule_gtfs_dataset_key",
            "feed_key",
            "shape_array_key",
            "route_id",
            "stop_sequence",
            "stop_id",
        ]
    )
    .agg({"schd_headway": "mean"})
    .reset_index()
    .rename(columns={"schd_headway": "avg_schd_headway_sec"})
)

In [34]:
m1 = pd.merge(rt_stop_times5, agg1, on = [
            "schedule_gtfs_dataset_key",
            "feed_key",
            "shape_array_key",
            "route_id",
            "stop_sequence",
            "stop_id",
        ])

In [35]:
std_dev = m1.groupby(["schedule_gtfs_dataset_key",
            "feed_key",
            "shape_array_key",
            "route_id",
            "stop_sequence",
            "stop_id",])\
            .apply(lambda x: (x["actual_headway"] - x["avg_schd_headway_sec"]).std())\
            .reset_index(name="std_dev_headway")

In [36]:
std_dev.head()

Unnamed: 0,schedule_gtfs_dataset_key,feed_key,shape_array_key,route_id,stop_sequence,stop_id,std_dev_headway
0,0666caf3ec1ecc96b74f4477ee4bc939,608992664173210532aa3e6cc573be2f,0314d8368e4f695949837e289b644d3e,51-13172,2,7383,950.39
1,0666caf3ec1ecc96b74f4477ee4bc939,608992664173210532aa3e6cc573be2f,0314d8368e4f695949837e289b644d3e,51-13172,3,7408,926.9
2,0666caf3ec1ecc96b74f4477ee4bc939,608992664173210532aa3e6cc573be2f,0314d8368e4f695949837e289b644d3e,51-13172,4,7412,901.77
3,0666caf3ec1ecc96b74f4477ee4bc939,608992664173210532aa3e6cc573be2f,0314d8368e4f695949837e289b644d3e,51-13172,5,7413,927.53
4,0666caf3ec1ecc96b74f4477ee4bc939,608992664173210532aa3e6cc573be2f,0314d8368e4f695949837e289b644d3e,51-13172,6,7374,926.81


In [37]:
m2 = pd.merge(m1, std_dev, on = [
            "schedule_gtfs_dataset_key",
            "feed_key",
            "shape_array_key",
            "route_id",
            "stop_sequence",
            "stop_id",
        ])

In [38]:
drop_for_preview = ['schedule_gtfs_dataset_key','trip_instance_key','shape_array_key','feed_key', 'trip_id']

In [40]:
m2['bunching_coefficient'] = m2.std_dev_headway/m2.avg_schd_headway_sec

In [41]:
m2.drop(columns = drop_for_preview).head(40)

Unnamed: 0,stop_id,stop_sequence,scheduled_arrival_sec,rt_arrival_sec,route_id,delay,actual_headway,schd_headway,avg_schd_headway_sec,std_dev_headway,bunching_coefficient
0,7383,2,58440.0,59254,51-13172,814.0,,,1791.43,950.39,0.53
1,7383,2,60900.0,61455,51-13172,555.0,2201.0,2460.0,1791.43,950.39,0.53
2,7383,2,62100.0,62272,51-13172,172.0,817.0,1200.0,1791.43,950.39,0.53
3,7383,2,64200.0,64334,51-13172,134.0,2062.0,2100.0,1791.43,950.39,0.53
4,7383,2,65040.0,65276,51-13172,236.0,942.0,840.0,1791.43,950.39,0.53
5,7383,2,67680.0,68515,51-13172,835.0,3239.0,2640.0,1791.43,950.39,0.53
6,7383,2,69060.0,69076,51-13172,16.0,561.0,1380.0,1791.43,950.39,0.53
7,7383,2,70980.0,70932,51-13172,-48.0,1856.0,1920.0,1791.43,950.39,0.53
8,7408,3,58560.0,59386,51-13172,826.0,,,1782.86,926.9,0.52
9,7408,3,61020.0,61491,51-13172,471.0,2105.0,2460.0,1782.86,926.9,0.52


In [44]:
m2.drop_duplicates(subset = ['route_id','schedule_gtfs_dataset_key','shape_array_key','feed_key']).head(20).drop(columns = drop_for_preview)

Unnamed: 0,stop_id,stop_sequence,scheduled_arrival_sec,rt_arrival_sec,route_id,delay,actual_headway,schd_headway,avg_schd_headway_sec,std_dev_headway,bunching_coefficient
0,7383,2,58440.0,59254,51-13172,814.0,,,1791.43,950.39,0.53
442,8847,2,85500.0,85765,51-13172,265.0,,,,,
460,1213,2,18420.0,16894,16-13172,-1526.0,,,621.87,524.39,0.84
6350,4819,2,14820.0,15292,4-13172,472.0,,,,,
6436,3105,2,22020.0,21992,33-13172,-28.0,,,933.0,240.06,0.26
9083,3033,2,18300.0,18388,720-13172,88.0,,,747.33,459.26,0.61
11530,11234,2,80040.0,79993,2-13172,-47.0,,,1950.0,384.67,0.2
11765,1213,2,26820.0,26965,720-13172,145.0,,,4009.09,9754.09,2.43
11992,7345,2,14940.0,15195,18-13172,255.0,,,,,
12035,7383,2,16380.0,16358,51-13172,-22.0,,,1916.25,725.51,0.38
