## Transit Bunching V2
* Incorporating Katrina and Eric's comments.
* cd data-analyses/rt_segment_speeds && pip install -r requirements.txt && cd ../_shared_utils && make setup_env && cd ../gtfs_digest
* [Issue](https://github.com/cal-itp/data-analyses/issues/1099)

In [1]:
import datetime as dt

import geopandas as gpd
import merge_data
import numpy as np
import pandas as pd
from segment_speed_utils import gtfs_schedule_wrangling, helpers, time_series_utils
from shared_utils import catalog_utils, rt_dates, rt_utils
from update_vars import GTFS_DATA_DICT, RT_SCHED_GCS, SCHED_GCS

# https://github.com/cal-itp/data-analyses/blob/main/_shared_utils/shared_utils/gtfs_analytics_data.yml
GTFS_DATA_DICT = catalog_utils.get_catalog("gtfs_analytics_data")

from segment_speed_utils.project_vars import (
    COMPILED_CACHED_VIEWS,
    GTFS_DATA_DICT,
    PROJECT_CRS,
    RT_SCHED_GCS,
    SCHED_GCS,
    SEGMENT_GCS,
)

In [2]:
pd.options.display.max_columns = 100
pd.options.display.float_format = "{:.2f}".format
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [3]:
may_date = "2024-05-22"

In [4]:
drop_for_preview = [
    "schedule_gtfs_dataset_key",
    "trip_instance_key",
    "shape_array_key",
    "feed_key",
    "trip_id",
]

### Get routes with short headways.

In [5]:
subset = [
    "schedule_gtfs_dataset_key",
    "route_id",
    "direction_id",
    "route_primary_direction",
    "service_date",
    "frequency",
]

In [6]:
GTFS_DATA_DICT.rt_vs_schedule_tables.sched_route_direction_metrics

'schedule_route_dir/schedule_route_direction_metrics'

In [7]:
route_dir = merge_data.concatenate_schedule_by_route_direction([may_date])

In [8]:
route_dir.head()

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id,time_period,route_primary_direction,avg_scheduled_service_minutes,avg_stop_miles,n_scheduled_trips,frequency,is_express,is_rapid,is_rail,is_coverage,is_downtown_local,is_local,service_date
0,015d67d5b75b5cf2b710bbadadfb75f5,17,0.0,all_day,Northbound,51.77,0.27,22,0.92,0.0,0.0,0.0,0.0,1.0,0.0,2024-05-22
1,015d67d5b75b5cf2b710bbadadfb75f5,17,0.0,offpeak,Northbound,51.77,0.27,10,0.62,0.0,0.0,0.0,0.0,1.0,0.0,2024-05-22
2,015d67d5b75b5cf2b710bbadadfb75f5,17,0.0,peak,Northbound,51.77,0.27,12,1.5,0.0,0.0,0.0,0.0,1.0,0.0,2024-05-22
3,015d67d5b75b5cf2b710bbadadfb75f5,17,1.0,all_day,Southbound,46.73,0.28,22,0.92,0.0,1.0,0.0,0.0,1.0,0.0,2024-05-22
4,015d67d5b75b5cf2b710bbadadfb75f5,17,1.0,offpeak,Southbound,46.73,0.28,11,0.69,0.0,1.0,0.0,0.0,1.0,0.0,2024-05-22


In [9]:
route_dir2 = route_dir.loc[route_dir.time_period == "peak"].reset_index(drop=True)

In [10]:
route_dir2["headway_minutes"] = 60 / route_dir.frequency

#### QUESTION: Should I use mean or median for finding routes that are high frequency?
* Find Median.

In [11]:
route_freq_groupby = [
    "schedule_gtfs_dataset_key",
    "route_id",
    "direction_id",
    "route_primary_direction",
]

In [12]:
high_frequency_routes_median = (
    route_dir2.groupby(route_freq_groupby)
    .agg({"headway_minutes": "median"})
    .reset_index()
    .rename(columns={"headway_minutes": "med_headway_minutes"})
)

In [13]:
# Grab Crosswalk
CROSSWALK = GTFS_DATA_DICT.schedule_tables.gtfs_key_crosswalk

In [14]:
crosswalk_cols = [
    "schedule_gtfs_dataset_key",
    "organization_name",
    "name",
    "caltrans_district",
]

In [15]:
crosswalk_df = (
    time_series_utils.concatenate_datasets_across_dates(
        SCHED_GCS, CROSSWALK, [may_date], data_type="df", columns=crosswalk_cols
    )
    .sort_values(["service_date"])
    .reset_index(drop=True)
)

In [16]:
crosswalk_df.shape

(168, 5)

#### Grab routes in the 5th percentile of frequency for now.
* Eric: <i>Taking the 5%ile (17.65min headway) is reasonable, but I suspect the worst bunching issues might be on routes with headways at/below the 10min mark? Maybe try 15 and 10 as well?</i>

In [17]:
high_frequency_routes_median["med_headway_minutes"].describe(
    percentiles=[0.05, 0.1, 0.9, 0.95]
)

count   3238.00
mean     236.95
std      350.81
min        4.00
5%        13.83
10%       18.81
50%       89.55
90%      750.00
95%     1000.00
max     1500.00
Name: med_headway_minutes, dtype: float64

In [18]:
high_frequency_routes2 = high_frequency_routes_median.loc[
    high_frequency_routes_median.med_headway_minutes <= 15
]

#### Attach operators and districts

In [19]:
high_frequency_routes2 = pd.merge(
    high_frequency_routes2, crosswalk_df, on="schedule_gtfs_dataset_key", how="left"
)

In [20]:
high_frequency_routes2.route_id.nunique()

158

### Get trips of high frequency routes

In [21]:
TABLE = GTFS_DATA_DICT.schedule_downloads.trips

In [22]:
FILE = f"{COMPILED_CACHED_VIEWS}{TABLE}_{may_date}.parquet"

In [23]:
trips_subset = [
    "gtfs_dataset_key",
    "route_id",
    "trip_instance_key",
    "shape_array_key",
    "feed_key",
    "route_long_name",
    "direction_id",
    "route_type",
]

In [24]:
trips = pd.read_parquet(FILE)[trips_subset].rename(
    columns={"gtfs_dataset_key": "schedule_gtfs_dataset_key"}
)

In [25]:
# Find only trips that belong to high frequency routes
trips_freq_routes = pd.merge(
    trips,
    high_frequency_routes2,
    on=["schedule_gtfs_dataset_key", "route_id", "direction_id"],
    how="inner",
)

In [26]:
trips_freq_routes.shape

(4140, 14)

In [27]:
# https://gtfs.org/documentation/schedule/reference/#
route_type_crosswalk = {
    "route_type": ["0", "1", "2", "3", "4", "5", "6", "7", "11", "12"],
    "route_type_str": [
        "Tram, Streetcar, Light rail",
        "Subway, Metro",
        "Rail",
        "Bus",
        "Ferry.",
        "Cable tram.",
        "Aerial lift, suspended cable car (e.g., gondola lift, aerial tramway).",
        "Funicular.",
        "Trolleybus.",
        "Monorail.",
    ],
}

In [28]:
route_type_crosswalk_df = pd.DataFrame(route_type_crosswalk)

In [29]:
# Merge for route_type
trips_freq_routes = pd.merge(
    trips_freq_routes, route_type_crosswalk_df, on=["route_type"], how="left"
)

In [30]:
trips_freq_routes.route_type_str.unique()

array(['Bus', 'Rail', 'Subway, Metro', 'Tram, Streetcar, Light rail'],
      dtype=object)

In [31]:
trips_freq_routes = trips_freq_routes.drop(columns=["route_type"])

### `rt_stop_times2`: Get Stop Times of High Frequency Routes/Trips

In [32]:
rt_stop_times = pd.read_parquet(
    "gs://calitp-analytics-data/data-analyses/rt_vs_schedule/schedule_rt_stop_times_2024-05-22.parquet"
)

In [33]:
# How is it possible to have right_only trips?
pd.merge(
    rt_stop_times,
    trips_freq_routes,
    on=[
        "schedule_gtfs_dataset_key",
        "trip_instance_key",
    ],
    how="outer",
    indicator=True,
)[["_merge"]].value_counts()

_merge    
left_only     2483048
both           118214
right_only       1081
dtype: int64

In [34]:
# Find only stop times of trips that belong to high frequency trips
rt_stop_times2 = pd.merge(
    rt_stop_times,
    trips_freq_routes,
    on=[
        "schedule_gtfs_dataset_key",
        "trip_instance_key",
    ],
    how="inner",
)

In [35]:
rt_stop_times2.shape

(118214, 19)

In [36]:
rt_stop_times2.trip_id.nunique(), rt_stop_times2.trip_instance_key.nunique()

(3059, 3059)

#### What to do with `scheduled_arrival_sec` that are `nan`?

In [37]:
rt_stop_times2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 118214 entries, 0 to 118213
Data columns (total 19 columns):
 #   Column                     Non-Null Count   Dtype         
---  ------                     --------------   -----         
 0   trip_id                    118214 non-null  object        
 1   stop_id                    118214 non-null  object        
 2   stop_sequence              118214 non-null  int64         
 3   scheduled_arrival_sec      116687 non-null  float64       
 4   schedule_gtfs_dataset_key  118214 non-null  object        
 5   trip_instance_key          118214 non-null  object        
 6   rt_arrival_sec             118214 non-null  int64         
 7   route_id                   118214 non-null  object        
 8   shape_array_key            118214 non-null  object        
 9   feed_key                   118214 non-null  object        
 10  route_long_name            118214 non-null  object        
 11  direction_id               118214 non-null  float64 

In [38]:
rt_stop_times2["scheduled_arrival_sec_copy"] = rt_stop_times2.scheduled_arrival_sec

In [39]:
rt_stop_times3 = rt_stop_times2.loc[rt_stop_times2.scheduled_arrival_sec.notna()].reset_index(drop = True)

In [40]:
rt_stop_times3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 116687 entries, 0 to 116686
Data columns (total 20 columns):
 #   Column                      Non-Null Count   Dtype         
---  ------                      --------------   -----         
 0   trip_id                     116687 non-null  object        
 1   stop_id                     116687 non-null  object        
 2   stop_sequence               116687 non-null  int64         
 3   scheduled_arrival_sec       116687 non-null  float64       
 4   schedule_gtfs_dataset_key   116687 non-null  object        
 5   trip_instance_key           116687 non-null  object        
 6   rt_arrival_sec              116687 non-null  int64         
 7   route_id                    116687 non-null  object        
 8   shape_array_key             116687 non-null  object        
 9   feed_key                    116687 non-null  object        
 10  route_long_name             116687 non-null  object        
 11  direction_id                116687 non-

###  `rt_stop_times3`: Deal with time

* If 82800  < `scheduled_arrival_time` < 86_400 but `rt_arrival_sec` is lower say 14_000 (4 am in the morning): then perhaps the bus was scheduled to arrive on May 21 (day before the service date) but it arrived a little later on the service date. 

* If  86_400 < `scheduled_arrival_time` and `rt_arrival_sec` is around 86_000 then this is the same service date. 

In [41]:
86_400 - (60*60)

82800

In [42]:
#rt_stop_times3["scheduled_arrival_sec_2"] = (
#    rt_stop_times3["scheduled_arrival_sec"] % 86_400
#).fillna(0)

In [43]:
rt_stop_times3["scheduled_arrival_sec"].describe()

count   116687.00
mean     49179.24
std      18084.54
min      12660.00
25%      33960.00
50%      48540.00
75%      62460.00
max     106680.00
Name: scheduled_arrival_sec, dtype: float64

In [44]:
rt_stop_times3.loc[rt_stop_times3["scheduled_arrival_sec"] == 86_400].shape

(16, 20)

In [45]:
86_400-3_600

82800

In [46]:
def adjust_days_and_time(seconds, date, rt_arrival_sec):
    """
    Adjusts days and time based on seconds and rt_arrival_sec.

    Parameters:
    seconds (int): Number of seconds.
    date (datetime): Initial date.
    rt_arrival_sec (int): Arrival time in seconds.

    Returns:
    datetime: Adjusted date and time.
    """
    if rt_arrival_sec < 16_000 and ((86_400) < seconds < 106_000):
        # Subtract a day
        return pd.Timestamp(date + pd.Timedelta(days=-1)) + pd.Timedelta(seconds=seconds % 86400)
    elif (86_000 < rt_arrival_sec < 86_400) and (86_400 < seconds):
        # Add days and remaining seconds
        return pd.Timestamp(date) + pd.Timedelta(seconds=seconds % 86400)
    #elif rt_arrival_sec < (86_400/2) and (86_400 < seconds):
        ## Don't add extra day
       # return pd.Timestamp(date) + pd.Timedelta(seconds=seconds % 86400)
    elif seconds == 86_400:
        # Add one day and reset time
        date2 = pd.Timestamp(date + pd.Timedelta(days=1))
        date_timestamp = date2.replace(hour=0, minute=0, second=0)
        return date_timestamp
    else:
        # No change
        return pd.Timestamp(date) + pd.Timedelta(seconds=seconds)

In [47]:
rt_stop_times3["converted_schd_arrival"]= rt_stop_times3.apply(lambda row: adjust_days_and_time(row['scheduled_arrival_sec'], row['service_date'], row['rt_arrival_sec']), axis=1)

In [48]:
def convert_to_midnight(df, date_column, comparison_date):
    """
    Converts timestamps in a DataFrame to midnight if the date exceeds the comparison date.

    Args:
    - df (pd.DataFrame): Input DataFrame.
    - date_column (str): Name of the column containing timestamps.
    - comparison_date (str or pd.Timestamp): Date for comparison.

    Returns:
    - pd.DataFrame: Modified DataFrame with timestamps converted to midnight.
    """
    comparison_date = pd.to_datetime(comparison_date)

    # Ensure date_column is datetime type
    df[date_column] = pd.to_datetime(df[date_column])

    # Mask dates exceeding comparison_date and replace with midnight
    mask = df[date_column].dt.date > comparison_date.date()
    df.loc[mask, date_column] = df.loc[mask, date_column].dt.normalize()

    return df

In [49]:
# Convert to midnight anything that goes past the service date
rt_stop_times3 = convert_to_midnight(
    rt_stop_times3, "scheduled_arrival_sec", may_date
)

In [50]:
rt_stop_times3["converted_rt_arrival"] = pd.to_datetime(
    rt_stop_times3["service_date"]
) + pd.to_timedelta(rt_stop_times3["rt_arrival_sec"] % 86400, unit="s")

In [51]:
timestamp_subset = [
    "converted_schd_arrival",
    "converted_rt_arrival",
    "scheduled_arrival_sec",
    "rt_arrival_sec",
    "service_date",
]

In [52]:
# Rearrange: I want the stop sequence to be 1,2,3,4.
# stop ids can differ between trips of the same route and the same stop sequence is the same
rt_stop_times4 = rt_stop_times3.sort_values(
    by=[
        "schedule_gtfs_dataset_key",
        "route_id",
        "shape_array_key",
        "direction_id",
        "stop_sequence",
        "rt_arrival_sec",
    ]
).reset_index(drop=True)

### Calculate the difference btwn actual vs scheduled arrival.

In [53]:
rt_stop_times4['delay_min'] = (rt_stop_times4['converted_rt_arrival'] - rt_stop_times4['converted_schd_arrival']).dt.total_seconds() / 60


In [54]:
print(rt_stop_times4.delay_min.describe(percentiles=[0.01,0.02, 0.05, 0.1, 0.9, 0.95, 0.98,0.99]))

count   116687.00
mean        30.33
std        216.39
min      -1441.97
1%          -5.52
2%          -3.48
5%          -2.13
10%         -1.25
50%          1.87
90%          9.30
95%         14.36
98%       1437.88
99%       1441.57
max       1701.60
Name: delay_min, dtype: float64


In [55]:
rt_stop_times4.loc[rt_stop_times4.delay_min < -1433][
    timestamp_subset
    + [
        "scheduled_arrival_sec_copy",
        "delay_min",
        
    ]
].head(10)

Unnamed: 0,converted_schd_arrival,converted_rt_arrival,scheduled_arrival_sec,rt_arrival_sec,service_date,scheduled_arrival_sec_copy,delay_min
1788,2024-05-23 04:23:00,2024-05-22 04:27:47,1970-01-01 00:00:00.000102180,16067,2024-05-22,102180.0,-1435.22
1894,2024-05-23 04:25:00,2024-05-22 04:28:32,1970-01-01 00:00:00.000102300,16112,2024-05-22,102300.0,-1436.47
2000,2024-05-23 04:25:00,2024-05-22 04:30:20,1970-01-01 00:00:00.000102300,16220,2024-05-22,102300.0,-1434.67
2106,2024-05-23 04:26:00,2024-05-22 04:31:13,1970-01-01 00:00:00.000102360,16273,2024-05-22,102360.0,-1434.78
2204,2024-05-22 23:55:00,2024-05-22 00:00:44,1970-01-01 00:00:00.000086100,44,2024-05-22,86100.0,-1434.27
2213,2024-05-23 04:27:00,2024-05-22 04:32:04,1970-01-01 00:00:00.000102420,16324,2024-05-22,102420.0,-1434.93
2310,2024-05-22 23:56:00,2024-05-22 00:01:17,1970-01-01 00:00:00.000086160,77,2024-05-22,86160.0,-1434.72
2319,2024-05-23 04:28:00,2024-05-22 04:33:22,1970-01-01 00:00:00.000102480,16402,2024-05-22,102480.0,-1434.63
2416,2024-05-22 23:56:00,2024-05-22 00:01:35,1970-01-01 00:00:00.000086160,95,2024-05-22,86160.0,-1434.42
2425,2024-05-23 04:28:00,2024-05-22 04:33:53,1970-01-01 00:00:00.000102480,16433,2024-05-22,102480.0,-1434.12


In [56]:
rt_stop_times4.loc[rt_stop_times4.delay_min >1438][
    timestamp_subset
    + [
        "scheduled_arrival_sec_copy",
        "delay_min",
        
    ]
].head(10)

Unnamed: 0,converted_schd_arrival,converted_rt_arrival,scheduled_arrival_sec,rt_arrival_sec,service_date,scheduled_arrival_sec_copy,delay_min
0,2024-05-21 00:08:00,2024-05-22 00:06:52,1970-01-01 00:00:00.000086880,412,2024-05-22,86880.0,1438.87
1,2024-05-21 00:41:00,2024-05-22 00:44:22,1970-01-01 00:00:00.000088860,2662,2024-05-22,88860.0,1443.37
2,2024-05-21 01:11:00,2024-05-22 01:12:29,1970-01-01 00:00:00.000090660,4349,2024-05-22,90660.0,1441.48
3,2024-05-21 01:41:00,2024-05-22 01:39:47,1970-01-01 00:00:00.000092460,5987,2024-05-22,92460.0,1438.78
4,2024-05-21 02:11:00,2024-05-22 02:11:40,1970-01-01 00:00:00.000094260,7900,2024-05-22,94260.0,1440.67
5,2024-05-21 02:41:00,2024-05-22 02:39:14,1970-01-01 00:00:00.000096060,9554,2024-05-22,96060.0,1438.23
6,2024-05-21 03:11:00,2024-05-22 03:12:43,1970-01-01 00:00:00.000097860,11563,2024-05-22,97860.0,1441.72
7,2024-05-21 03:42:00,2024-05-22 03:41:05,1970-01-01 00:00:00.000099720,13265,2024-05-22,99720.0,1439.08
8,2024-05-21 04:12:00,2024-05-22 04:11:44,1970-01-01 00:00:00.000101520,15104,2024-05-22,101520.0,1439.73
104,2024-05-21 00:09:00,2024-05-22 00:08:33,1970-01-01 00:00:00.000086940,513,2024-05-22,86940.0,1439.55


#### Observation
* There are some geniunely weird rows.

In [57]:
rt_stop_times4.loc[rt_stop_times4.delay_min >2800][
    timestamp_subset
    + [
        "scheduled_arrival_sec_copy",
        "delay_min",
        
    ]
]

Unnamed: 0,converted_schd_arrival,converted_rt_arrival,scheduled_arrival_sec,rt_arrival_sec,service_date,scheduled_arrival_sec_copy,delay_min


### `rt_stop_times5`: Filter out values in `delay` that are very extreme.

In [58]:
stop

NameError: name 'stop' is not defined

In [None]:
# Filter to only delays that are an hour or less
rt_stop_times5 = rt_stop_times4[rt_stop_times4["delay"] <= 3600].reset_index(drop=True)

In [None]:
# Filter to only delays that are no less than
rt_stop_times5 = rt_stop_times5[rt_stop_times5["delay"] >= -3600].reset_index(drop=True)

In [None]:
rt_stop_times5.shape

In [None]:
rt_stop_times4.shape

In [None]:
len(rt_stop_times4) - len(rt_stop_times5)

### Calculate the actual headway the `operator-route-direction_id-stop_sequence-stop_id-` grain
#### QUESTION: Do I need to include feed key and shape array key? What is `feed_key` and how does it differ from `schedule_gtfs_dataset_key`? Still need help</b>

### Calculate scheduled headway
* Using the same grain.

In [None]:
groupby_cols = [
    "schedule_gtfs_dataset_key",
    "route_id",
    "shape_array_key",
    "direction_id",
    "route_primary_direction",
    "stop_sequence",
    "stop_id",
]

In [None]:
# Subtract rt_arrival_sec from the previous row to the target row
# using groupby columns
rt_stop_times5["actual_headway"] = rt_stop_times5.groupby(groupby_cols)[
    "rt_arrival_sec"
].diff()

In [None]:
rt_stop_times5["schd_headway"] = rt_stop_times5.groupby(groupby_cols)[
    "scheduled_arrival_sec"
].diff()

In [None]:
rt_stop_times5.head(10)[
    [
        "scheduled_arrival_sec",
        "rt_arrival_sec",
        "delay",
        "actual_headway",
        "schd_headway",
    ]
]

### Fill in `nans` with 0 
* I am not sure if `nans` impact calculations of the mean scheduled headway and whatnot?
* These `nans` are because the first `operator-route-stop_id-stop_sequence` combo won't have anything to compare it to.
* Katrina: <i>I would fill in the actual/schedule headway columns with 0 rather than dropping the first row  in each grouping. I wonder if it makes sense to use a more descriptive column name than headway, such as "minutes since last vehicle"</i>

In [None]:
rt_stop_times5 = rt_stop_times5.fillna(0)

### Transit Matters Method
* To Do: add back in route  & operator information

In [None]:
transit_matters_df1 = rt_stop_times5.copy()

In [None]:
transit_matters_df1["pct_actual_schd_headway"] = (
    transit_matters_df1.actual_headway / transit_matters_df1.schd_headway
)

In [None]:
import numpy as np

transit_matters_df1["bunched_y_n"] = np.where(
    transit_matters_df1["pct_actual_schd_headway"] < 0.25, "bunched", "not bunched"
)

#### There are some very extreme values: how to deal with this?


In [None]:
transit_matters_df1.pct_actual_schd_headway.describe()

In [None]:
len(transit_matters_df1.loc[transit_matters_df1.pct_actual_schd_headway < 0])

In [None]:
transit_matters_df1.bunched_y_n.value_counts() / len(transit_matters_df1)

In [None]:
sf_38r_test = transit_matters_df1.loc[
    (transit_matters_df1.stop_id == "14295")
    & (
        transit_matters_df1.schedule_gtfs_dataset_key
        == "7cc0cb1871dfd558f11a2885c145d144"
    )
    & (transit_matters_df1.stop_sequence == 11)
    & (transit_matters_df1.route_id == "38R")
]

In [None]:
# Row 444797: scheduled to arrive at 69480, but actually arrives 69890

#### QUESTION: Wonder if I should convert time stamps to hours so it's at least in military time instead of seconds? Although I'm not really sure if this is sound.

In [None]:
sf_38r_test["sched_arrival_min"] = sf_38r_test.scheduled_arrival_sec / 60

In [None]:
sf_38r_test["rt_arrival_min"] = sf_38r_test.rt_arrival_sec / 60

In [None]:
sf_38r_test["actual_headway_min"] = sf_38r_test.actual_headway / 60

In [None]:
sf_38r_test["schd_headway_min"] = sf_38r_test.schd_headway / 60

In [None]:
sf_38r_test[
    [
        "sched_arrival_min",
        "rt_arrival_min",
        "actual_headway_min",
        "schd_headway_min",
        "pct_actual_schd_headway",
    ]
].tail(5)

* Row 466475 was scheduled to arrive after row 466476

#### Groupby grain and see how many trips for that grain are considered "bunched" or not.

In [None]:
transit_matters_df2 = (
    transit_matters_df1.groupby(
        [
            "schedule_gtfs_dataset_key",
            "route_long_name",
            "shape_array_key",
            "route_id",
            "stop_id",
            "direction_id",
            "route_primary_direction",
            "bunched_y_n",
        ]
    )
    .agg({"trip_instance_key": "nunique"})
    .reset_index()
)

In [None]:
# Filter out only rows that are bunched.
bunched_only = transit_matters_df2.loc[
    transit_matters_df2.bunched_y_n == "bunched"
].reset_index(drop=True)

In [None]:
bunched_only = bunched_only.rename(columns={"trip_instance_key": "bunched_trips"})

In [None]:
transit_matters_agg = [
    "schedule_gtfs_dataset_key",
    "route_long_name",
    "shape_array_key",
    "route_id",
    "stop_id",
    "direction_id",
    "route_primary_direction",
]

In [None]:
# Aggregate all trips on the grain
transit_matters_all_trips = (
    transit_matters_df1.groupby(transit_matters_agg)
    .agg({"trip_instance_key": "nunique"})
    .reset_index()
    .rename(columns={"trip_instance_key": "all_trips"})
)

In [None]:
# Merge back, using left merge to keep bunching
bunched_only = pd.merge(
    bunched_only, transit_matters_all_trips, on=transit_matters_agg, how="left"
)

In [None]:
bunched_only["pct_trips_bunched"] = (
    bunched_only.bunched_trips / bunched_only.all_trips * 100
)

In [None]:
bunched_only = bunched_only.drop(columns=["all_trips"])

In [None]:
# Merge back all rows that don't have bunching trips.
transit_matters_m1 = pd.merge(
    transit_matters_all_trips,
    bunched_only,
    on=transit_matters_agg,
    how="left",
)

In [None]:
transit_matters_m1 = transit_matters_m1.drop(
    columns=[
        "bunched_y_n",
    ]
)

In [None]:
transit_matters_m1.pct_trips_bunched = transit_matters_m1.pct_trips_bunched.fillna(0)

In [None]:
transit_matters_m1.pct_trips_bunched.describe()

In [None]:
transit_matters_m1.loc[transit_matters_m1.pct_trips_bunched >= 10].shape

In [None]:
transit_matters_m1.loc[
    (transit_matters_m1.schedule_gtfs_dataset_key == "7cc0cb1871dfd558f11a2885c145d144")
    & (transit_matters_m1.shape_array_key == "955e2fc8f9f8a4be2c67c7212be874f6")
    & (transit_matters_m1.route_id == "1")
    & (transit_matters_m1.direction_id == 1)
    & (transit_matters_m1.stop_id == "13853")
]

In [None]:
bunched_only.loc[
    (bunched_only.schedule_gtfs_dataset_key == "7cc0cb1871dfd558f11a2885c145d144")
    & (bunched_only.shape_array_key == "955e2fc8f9f8a4be2c67c7212be874f6")
    & (bunched_only.route_id == "1")
    & (bunched_only.direction_id == 1)
    & (bunched_only.stop_id == "13853")
]

### Use 2 minute benchmark
* [Source](https://static1.squarespace.com/static/533b9a24e4b01d79d0ae4376/t/645e82de1f570b31497c44dc/1683915486889/TransitMatters-Headwaymanagement.pdf)
* Justifying the use of
headway maintenance. For example, in April
2022 the 66 bus significantly bunched around
several stops. When bunching is defined as
buses that run within two minutes or less of
each other, inbound buses towards Nubian
Square bunched 10% of the time at Brigham
Circle, 9% at Brookline Village and Roxbury
Crossing, and 8% of the time at Coolidge
Corner. Bunching is even more dramatic
outbound towards Harvard Square where
buses bunched over 35% of the time at Winship
St, 13% at Coolidge Corner and Harvard Ave at
Commonwealth Ave, and 12% at North Harvard
St at Western Ave. View more data about bus
bunching through the TransitMatters Data
Dashboard here.

* To Do: add back in route  & operator information

In [None]:
two_minutess_df = rt_stop_times5.copy()

In [None]:
two_minutess_df.columns

In [None]:
two_minutess_df["actual_headway_min"] = two_minutess_df.rt_arrival_sec / 60

In [None]:
two_minutess_df["bunched_y_n"] = np.where(
    two_minutess_df["actual_headway_min"] <= 2, "bunched", "not bunched"
)

In [None]:
two_minutess_df.info()

In [None]:
two_minutess_df.bunched_y_n.value_counts()

#### Same code as Transit Matters Approach

In [None]:
two_minutes_agg1 = (
    two_minutess_df.groupby(
        [
            "schedule_gtfs_dataset_key",
            "route_long_name",
            "shape_array_key",
            "route_id",
            "stop_id",
            "direction_id",
            "route_primary_direction",
            "bunched_y_n",
        ]
    )
    .agg({"trip_instance_key": "nunique"})
    .reset_index()
)

In [None]:
bunched_only_two_min = (
    two_minutes_agg1.loc[two_minutes_agg1.bunched_y_n == "bunched"]
    .reset_index(drop=True)
    .rename(columns={"trip_instance_key": "bunched_trips"})
)

In [None]:
# I want to do a left merge because I'm only interested in trips that bunched.
bunched_only_two_min = pd.merge(
    bunched_only_two_min,
    transit_matters_all_trips,
    on=[
        "schedule_gtfs_dataset_key",
        "route_long_name",
        "shape_array_key",
        "route_id",
        "stop_id",
        "direction_id",
        "route_primary_direction",
    ],
    how="left",
)

In [None]:
bunched_only_two_min["pct_trips_bunched"] = (
    bunched_only_two_min.bunched_trips / bunched_only_two_min.all_trips * 100
)

In [None]:
bunched_only_two_min = bunched_only_two_min.drop(columns=["all_trips"])

In [None]:
bunched_only_two_min.head(2)

In [None]:
# Need to do a left merge on all trips for the stops that don't have bunching.
final_two_minute = pd.merge(
    transit_matters_all_trips,
    bunched_only_two_min,
    on=[
        "schedule_gtfs_dataset_key",
        "route_long_name",
        "shape_array_key",
        "route_id",
        "stop_id",
        "direction_id",
        "route_primary_direction",
    ],
    how="left",
)

In [None]:
final_two_minute.shape

In [None]:
final_two_minute = final_two_minute.drop(columns=["bunched_y_n"])

In [None]:
final_two_minute = final_two_minute.fillna(0)

In [None]:
final_two_minute.head()

In [None]:
bunched = final_two_minute.loc[final_two_minute.pct_trips_bunched != 0]

In [None]:
bunched.all_trips.describe()

In [None]:
bunched.loc[
    (bunched.schedule_gtfs_dataset_key == "7cc0cb1871dfd558f11a2885c145d144")
    & (bunched.shape_array_key == "955e2fc8f9f8a4be2c67c7212be874f6")
    & (bunched.route_id == "1")
    & (bunched.direction_id == 1)
    & (bunched.stop_id == "13853")
]

In [None]:
rt_stop_times5.loc[
    (rt_stop_times5.schedule_gtfs_dataset_key == "7cc0cb1871dfd558f11a2885c145d144")
    & (rt_stop_times5.shape_array_key == "955e2fc8f9f8a4be2c67c7212be874f6")
    & (rt_stop_times5.route_id == "1")
    & (rt_stop_times5.direction_id == 1)
    & (rt_stop_times5.stop_id == "13853")
][["scheduled_arrival_sec2", "rt_arrival_sec", "actual_headway", "schd_headway"]]

In [None]:
bunched.sort_values(by=["pct_trips_bunched"], ascending=False)