## Transit Bunching 
* cd data-analyses/rt_segment_speeds && pip install -r requirements.txt && cd ../_shared_utils && make setup_env && cd ../gtfs_digest
* [Issue](https://github.com/cal-itp/data-analyses/issues/1099)


In [1]:
import geopandas as gpd
import pandas as pd
from segment_speed_utils import gtfs_schedule_wrangling, helpers
from shared_utils import catalog_utils, rt_dates, rt_utils
from update_vars import GTFS_DATA_DICT, RT_SCHED_GCS, SCHED_GCS

# https://github.com/cal-itp/data-analyses/blob/main/_shared_utils/shared_utils/gtfs_analytics_data.yml
GTFS_DATA_DICT = catalog_utils.get_catalog("gtfs_analytics_data")

from segment_speed_utils.project_vars import (
    COMPILED_CACHED_VIEWS,
    GTFS_DATA_DICT,
    PROJECT_CRS,
    RT_SCHED_GCS,
    SCHED_GCS,
    SEGMENT_GCS,
)

In [2]:
pd.options.display.max_columns = 100
pd.options.display.float_format = "{:.2f}".format
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [3]:
analysis_date_list = rt_dates.y2024_dates + rt_dates.y2023_dates

In [4]:
may_date = "2024-05-22"

#### Use `merge_data.concatenate_schedule_by_route_direction()`

In [5]:
import merge_data

In [6]:
subset = [
    "schedule_gtfs_dataset_key",
    "route_id",
    "direction_id",
    "route_primary_direction",
    "service_date",
    "frequency",
]

In [7]:
route_dir = merge_data.concatenate_schedule_by_route_direction([may_date])[subset]

In [8]:
route_dir.head(2)

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id,route_primary_direction,service_date,frequency
0,015d67d5b75b5cf2b710bbadadfb75f5,17,0.0,Northbound,2024-05-22,0.92
1,015d67d5b75b5cf2b710bbadadfb75f5,17,0.0,Northbound,2024-05-22,0.62


In [9]:
route_dir["frequency_in_minutes"] = 60 / route_dir.frequency

In [10]:
route_dir2 = route_dir.loc[route_dir.frequency_in_minutes <= 10]

In [11]:
route_dir2.frequency_in_minutes.describe()

count   122.00
mean      8.05
std       1.60
min       4.00
25%       6.99
50%       8.28
75%       9.38
max      10.00
Name: frequency_in_minutes, dtype: float64

In [12]:
route_dir.frequency_in_minutes.describe()

count   9650.00
mean     203.94
std      317.84
min        4.00
25%       37.97
50%       80.00
75%      193.55
max     1500.00
Name: frequency_in_minutes, dtype: float64

#### Crosswalk stuff
* Operators who run these high freq routes (by gtfs key)
* Route ID of high frequency routes

In [13]:
operators_with_high_frequency_routes = list(
    route_dir2.schedule_gtfs_dataset_key.unique()
)

In [14]:
frequent_routes = list(route_dir2.route_id.unique())

In [15]:
route_dir.head(2)

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id,route_primary_direction,service_date,frequency,frequency_in_minutes
0,015d67d5b75b5cf2b710bbadadfb75f5,17,0.0,Northbound,2024-05-22,0.92,65.22
1,015d67d5b75b5cf2b710bbadadfb75f5,17,0.0,Northbound,2024-05-22,0.62,96.77


### Look at scheduled trips

In [16]:
TABLE = GTFS_DATA_DICT.schedule_downloads.trips

In [17]:
FILE = f"{COMPILED_CACHED_VIEWS}{TABLE}_{may_date}.parquet"

In [18]:
trips_subset = [
    "gtfs_dataset_key",
    "route_id",
    "trip_instance_key",
    "shape_array_key",
    "feed_key",
]

In [19]:
trips = pd.read_parquet(FILE)[trips_subset].rename(
    columns={"gtfs_dataset_key": "schedule_gtfs_dataset_key"}
)

In [20]:
trips_freq_routes = trips.loc[
    trips.route_id.isin(frequent_routes)
    & (trips.schedule_gtfs_dataset_key.isin(operators_with_high_frequency_routes))
]

In [21]:
len(trips_freq_routes), len(trips)

(12965, 96398)

In [22]:
trips_freq_routes.head(2)

Unnamed: 0,schedule_gtfs_dataset_key,route_id,trip_instance_key,shape_array_key,feed_key
710,cc53a0dbf5df90e3009b9cb5d89d80ba,29,1a2a3d2225f7213ce3010552f21f7fd6,106e1a8460c3c6def89b79a424a54508,2cfdf0e33e9229d6b0ad124d956f5856
711,cc53a0dbf5df90e3009b9cb5d89d80ba,29,7cadcfd9b4009cac135417450983d220,d4617acc9813913cfdb4956544cb701b,2cfdf0e33e9229d6b0ad124d956f5856


### `rt_scheduled_v_ran/scripts/rt_stop_times.py`
* Tiffany already combined realtime and scheduled arrivals

In [23]:
RT_SCHED_GCS

'gs://calitp-analytics-data/data-analyses/rt_vs_schedule/'

In [24]:
GTFS_DATA_DICT.rt_vs_schedule_tables.schedule_rt_stop_times

'schedule_rt_stop_times'

In [25]:
rt_stop_times = pd.read_parquet(
    "gs://calitp-analytics-data/data-analyses/rt_vs_schedule/schedule_rt_stop_times_2024-05-22_ah_test.parquet"
)

In [26]:
# Get only relevant trips that are of "frequent routes"
pd.merge(
    rt_stop_times,
    trips_freq_routes,
    on=[
        "schedule_gtfs_dataset_key",
        "trip_instance_key",
    ],
    how="outer",
    indicator=True,
)[["_merge"]].value_counts()

_merge    
left_only     2206837
both           394425
right_only       2349
dtype: int64

In [27]:
rt_stop_times2 = pd.merge(
    rt_stop_times,
    trips_freq_routes,
    on=[
        "schedule_gtfs_dataset_key",
        "trip_instance_key",
    ],
    how="inner",
)

In [28]:
anchor_date = pd.to_datetime("5-22-2024")
rt_stop_times2["sched_arrival"] = anchor_date + pd.to_timedelta(
    rt_stop_times2["scheduled_arrival_sec"], unit="s"
)

In [29]:
rt_stop_times2.rt_arrival.describe()

  rt_stop_times2.rt_arrival.describe()


count                  394425
unique                  79898
top       2024-05-22 16:03:53
freq                       26
first     2024-05-21 23:59:00
last      2024-05-23 05:47:18
Name: rt_arrival, dtype: object

In [30]:
rt_stop_times2.sched_arrival.describe()

  rt_stop_times2.sched_arrival.describe()


count                  394425
unique                  53382
top       2024-05-22 07:25:00
freq                      375
first     2024-05-22 02:56:00
last      2024-05-23 06:07:11
Name: sched_arrival, dtype: object

###  Some scheduled arrival seconds span longer than a day: filter them out

In [31]:
len(rt_stop_times2.loc[rt_stop_times2.scheduled_arrival_sec > 86400])

15695

In [32]:
rt_stop_times3 = rt_stop_times2.loc[
    rt_stop_times2.scheduled_arrival_sec <= 86400
].reset_index(drop=True)

In [33]:
len(rt_stop_times3)

378730

In [34]:
preview = [
    "schedule_gtfs_dataset_key",
    "trip_instance_key",
    "shape_array_key",
    "feed_key",
]

In [35]:
rt_stop_times3.sample(10).drop(columns=preview)

Unnamed: 0,trip_id,stop_id,stop_sequence,scheduled_arrival_sec,rt_arrival,route_id,sched_arrival
87722,10033006641045-DEC23,6954,36,41340.0,2024-05-22 11:29:14,33-13172,2024-05-22 11:29:00
233507,10018003450624-DEC23,8284,41,25800.0,2024-05-22 07:08:13,18-13172,2024-05-22 07:10:00
376930,11496673_M31,14288,11,32886.0,2024-05-22 09:11:35,38R,2024-05-22 09:08:06
20948,183-o4t8tfcqr,6715958,3,70680.0,2024-05-22 19:39:06,4446,2024-05-22 19:38:00
296699,11508533_M31,15206,8,83847.0,2024-05-22 23:19:57,N,2024-05-22 23:17:27
230778,10901000551908-DEC23,15432,5,69600.0,2024-05-22 19:22:32,901-13172,2024-05-22 19:20:00
72212,10207003040914-DEC23,3990,54,37320.0,2024-05-22 10:21:43,207-13172,2024-05-22 10:22:00
303827,11491421_M31,15096,9,54580.0,2024-05-22 15:19:03,24,2024-05-22 15:09:40
138598,60200866,80407,7,68580.0,2024-05-22 19:04:03,804,2024-05-22 19:03:00
62654,10033006640813-DEC23,1131,64,33900.0,2024-05-22 09:24:56,33-13172,2024-05-22 09:25:00


In [36]:
# Rearrange
rt_stop_times4 = rt_stop_times3.sort_values(
    by=[
        "schedule_gtfs_dataset_key",
        "feed_key",
        "shape_array_key",
        "route_id",
        "stop_sequence",
        "stop_id",
        "sched_arrival",
    ]
).reset_index(drop=True)

### Throw out more outliers
* [Tiffany's NB](https://github.com/cal-itp/data-analyses/blob/actual-scheduled-arrival-diff/rt_scheduled_v_ran/10_delay_distribution.ipynb)

In [37]:
def check_delay(df):
    df = df.assign(delay=df.rt_arrival - df.sched_arrival)

    print(df.delay.describe(percentiles=[0.05, 0.1, 0.9, 0.95]))

    max_delay_min = df.delay.max() / 60
    p95_delay_min = df.delay.quantile(q=0.95) / 60

    min_delay_min = df.delay.min() / 60
    p5_delay_min = df.delay.quantile(q=0.05) / 60

    print(f"min / max delay (minutes): {min_delay_min}, {max_delay_min}")
    print(f"5th / 95th delay (minutes): {p5_delay_min}, {p95_delay_min}")

    return df

In [38]:
rt_stop_times4 = check_delay(rt_stop_times4)

count                       378730
mean     0 days 00:02:37.105927705
std      0 days 00:09:14.532949024
min              -1 days +00:00:00
5%               -1 days +23:57:11
10%              -1 days +23:58:12
50%                0 days 00:01:27
90%                0 days 00:08:34
95%                0 days 00:12:13
max                0 days 22:26:01
Name: delay, dtype: object
min / max delay (minutes): -1 days +23:36:00, 0 days 00:22:26.016666666
5th / 95th delay (minutes): -1 days +23:59:57.183333334, 0 days 00:00:12.216666666


In [39]:
# Filter to only delays that are an hour or less
rt_stop_times5 = rt_stop_times4[rt_stop_times4["delay"].dt.total_seconds() / 3600 <= 1]

In [40]:
len(rt_stop_times5) - len(rt_stop_times4)

-228

### Not sure how to ahndle stuff that's a day behind??
* Filter out for now.

In [41]:
rt_stop_times5.delay.describe()

count                       378502
mean     0 days 00:02:33.675438967
std      0 days 00:08:39.148702562
min              -1 days +00:00:00
25%              -1 days +23:59:35
50%                0 days 00:01:27
75%                0 days 00:04:24
max                0 days 00:59:52
Name: delay, dtype: object

In [42]:
rt_stop_times6 = rt_stop_times5[rt_stop_times5["delay"].dt.total_seconds() >= 0]

In [43]:
len(rt_stop_times6) - len(rt_stop_times5)

-116617

In [44]:
len(rt_stop_times6)

261885

### Calculate the actual and scheduled headways
* Maybe it's better to work with seconds since I have to convert everything back and forth all the time.

In [45]:
rt_stop_times6["actual_headway"] = rt_stop_times6["rt_arrival"].diff()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rt_stop_times6["actual_headway"] = rt_stop_times6["rt_arrival"].diff()


In [46]:
rt_stop_times6["schd_headway"] = rt_stop_times6["sched_arrival"].diff()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rt_stop_times6["schd_headway"] = rt_stop_times6["sched_arrival"].diff()


In [47]:
# df['actual_headway_minus_schd_seconds'] = (df['actual_headway'] - df['schd_headway'])
rt_stop_times6["actual_headway_minus_schd_minutes"] = (
    rt_stop_times6["actual_headway"].dt.total_seconds()
    - rt_stop_times6["schd_headway"].dt.total_seconds()
) / 60

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rt_stop_times6['actual_headway_minus_schd_minutes'] = (rt_stop_times6['actual_headway'].dt.total_seconds() - rt_stop_times6['schd_headway'].dt.total_seconds()) / 60


In [48]:
rt_stop_times6["actual_headway_minutes"] = (
    rt_stop_times6["actual_headway"].dt.total_seconds() / 60
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rt_stop_times6['actual_headway_minutes'] = rt_stop_times6['actual_headway'].dt.total_seconds() / 60


In [49]:
rt_stop_times6["schd_headway_minutes"] = (
    rt_stop_times6["schd_headway"].dt.total_seconds() / 60
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rt_stop_times6['schd_headway_minutes'] = rt_stop_times6['schd_headway'].dt.total_seconds() / 60


In [52]:
rt_stop_times6.schd_headway_minutes.describe()

count   261884.00
mean        -0.00
std        155.82
min      -1215.00
25%          9.00
50%         15.00
75%         24.00
max       1080.00
Name: schd_headway_minutes, dtype: float64

In [55]:
# Delete extremes
rt_stop_times7 = rt_stop_times6.loc[
    (rt_stop_times6.schd_headway_minutes < 1080.00)
    & (rt_stop_times6.schd_headway_minutes > 0)
]

In [56]:
len(rt_stop_times7), len(rt_stop_times6)

(252028, 261885)

In [59]:
rt_stop_times6["actual_headway_minutes"] = (
    rt_stop_times6["actual_headway"].dt.total_seconds() / 60
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rt_stop_times6['actual_headway_minutes'] = rt_stop_times6['actual_headway'].dt.total_seconds() / 60


In [60]:
rt_stop_times7[
    [
        "schd_headway",
        "schd_headway_minutes",
        "actual_headway_minus_schd_minutes",
        "actual_headway",
        "actual_headway_minutes",
    ]
].head(10)

Unnamed: 0,schd_headway,schd_headway_minutes,actual_headway_minus_schd_minutes,actual_headway,actual_headway_minutes
1,0 days 00:41:00,41.0,-4.32,0 days 00:36:41,36.68
2,0 days 00:20:00,20.0,-6.38,0 days 00:13:37,13.62
3,0 days 00:35:00,35.0,-0.63,0 days 00:34:22,34.37
4,0 days 00:14:00,14.0,1.7,0 days 00:15:42,15.7
5,0 days 00:44:00,44.0,9.98,0 days 00:53:59,53.98
6,0 days 00:23:00,23.0,-13.65,0 days 00:09:21,9.35
9,0 days 00:41:00,41.0,-5.92,0 days 00:35:05,35.08
10,0 days 00:20:00,20.0,-5.67,0 days 00:14:20,14.33
11,0 days 00:35:00,35.0,-0.9,0 days 00:34:06,34.1
12,0 days 00:13:00,13.0,2.52,0 days 00:15:31,15.52


In [62]:
rt_stop_times7.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 252028 entries, 1 to 378726
Data columns (total 17 columns):
 #   Column                             Non-Null Count   Dtype          
---  ------                             --------------   -----          
 0   trip_id                            252028 non-null  object         
 1   stop_id                            252028 non-null  object         
 2   stop_sequence                      252028 non-null  int64          
 3   scheduled_arrival_sec              252028 non-null  float64        
 4   schedule_gtfs_dataset_key          252028 non-null  object         
 5   trip_instance_key                  252028 non-null  object         
 6   rt_arrival                         252028 non-null  datetime64[ns] 
 7   route_id                           252028 non-null  object         
 8   shape_array_key                    252028 non-null  object         
 9   feed_key                           252028 non-null  object         
 10  sched_ar

In [61]:
rt_stop_times7.head(2)

Unnamed: 0,trip_id,stop_id,stop_sequence,scheduled_arrival_sec,schedule_gtfs_dataset_key,trip_instance_key,rt_arrival,route_id,shape_array_key,feed_key,sched_arrival,delay,actual_headway,schd_headway,actual_headway_minus_schd_minutes,actual_headway_minutes,schd_headway_minutes
1,10051003551653-DEC23,7383,2,60900.0,0666caf3ec1ecc96b74f4477ee4bc939,c30e53e3d705e2ef2ed047eb814558fb,2024-05-22 17:04:15,51-13172,0314d8368e4f695949837e289b644d3e,608992664173210532aa3e6cc573be2f,2024-05-22 16:55:00,0 days 00:09:15,0 days 00:36:41,0 days 00:41:00,-4.32,36.68,41.0
2,10051003551713-DEC23,7383,2,62100.0,0666caf3ec1ecc96b74f4477ee4bc939,9f7e18664145905aefdbe88f4ee6e8a8,2024-05-22 17:17:52,51-13172,0314d8368e4f695949837e289b644d3e,608992664173210532aa3e6cc573be2f,2024-05-22 17:15:00,0 days 00:02:52,0 days 00:13:37,0 days 00:20:00,-6.38,13.62,20.0


In [69]:
agg1 = (
    rt_stop_times7.groupby(
        [
            "schedule_gtfs_dataset_key",
            "feed_key",
            "shape_array_key",
            "route_id",
            "stop_sequence",
            "stop_id",
        ]
    )
    .agg({"schd_headway_minutes": "mean"})
    .reset_index()
    .rename(columns={"schd_headway_minutes": "avg_sched_headway_mins"})
)

In [70]:
m1 = pd.merge(rt_stop_times7, agg1, on = [
            "schedule_gtfs_dataset_key",
            "feed_key",
            "shape_array_key",
            "route_id",
            "stop_sequence",
            "stop_id",
        ])

In [74]:
m1["std_dev"] = (
    (m1["actual_headway_minutes"] - m1["avg_sched_headway_mins"]) ** 2
).mean() ** 0.5

In [76]:
m1['bunching_coefficient'] = m1.std_dev/m1.avg_sched_headway_mins

In [77]:
m1.bunching_coefficient.describe()

count   252028.00
mean         2.57
std          2.45
min          0.04
25%          1.64
50%          2.33
75%          3.25
max        255.12
Name: bunching_coefficient, dtype: float64

In [86]:
len(m1.loc[m1.bunching_coefficient <= 1])

18820

In [87]:
len(m1.loc[m1.bunching_coefficient > 1])

233208

In [79]:
m1.head(5).drop(columns = preview)

Unnamed: 0,trip_id,stop_id,stop_sequence,scheduled_arrival_sec,rt_arrival,route_id,sched_arrival,delay,actual_headway,schd_headway,actual_headway_minus_schd_minutes,actual_headway_minutes,schd_headway_minutes,avg_sched_headway_mins,std_dev,bunching_coefficient
0,10051003551653-DEC23,7383,2,60900.0,2024-05-22 17:04:15,51-13172,2024-05-22 16:55:00,0 days 00:09:15,0 days 00:36:41,0 days 00:41:00,-4.32,36.68,41.0,29.5,42.52,1.44
1,10051003551713-DEC23,7383,2,62100.0,2024-05-22 17:17:52,51-13172,2024-05-22 17:15:00,0 days 00:02:52,0 days 00:13:37,0 days 00:20:00,-6.38,13.62,20.0,29.5,42.52,1.44
2,10051003551748-DEC23,7383,2,64200.0,2024-05-22 17:52:14,51-13172,2024-05-22 17:50:00,0 days 00:02:14,0 days 00:34:22,0 days 00:35:00,-0.63,34.37,35.0,29.5,42.52,1.44
3,10051003551802-DEC23,7383,2,65040.0,2024-05-22 18:07:56,51-13172,2024-05-22 18:04:00,0 days 00:03:56,0 days 00:15:42,0 days 00:14:00,1.7,15.7,14.0,29.5,42.52,1.44
4,10051003551846-DEC23,7383,2,67680.0,2024-05-22 19:01:55,51-13172,2024-05-22 18:48:00,0 days 00:13:55,0 days 00:53:59,0 days 00:44:00,9.98,53.98,44.0,29.5,42.52,1.44


In [80]:
import numpy as np

actual = [786,906,700,302,616,198,304,918,538,120,308,876,168,134]
scheduled = [600,600,600,600,660,600,420,540,540,420,420,420,360, 300]

In [83]:
mean = sum(scheduled) / len(scheduled)

In [84]:
# Calculate standard deviation
standard_deviation = np.std([value - mean for value in actual])

print(standard_deviation)

294.08040688608565


In [85]:
standard_deviation / mean

0.5815149288707908