## Transit Bunching V2
* Incorporating Katrina and Eric's comments.
* cd data-analyses/rt_segment_speeds && pip install -r requirements.txt && cd ../_shared_utils && make setup_env && cd ../gtfs_digest
* [Issue](https://github.com/cal-itp/data-analyses/issues/1099)

In [1]:
import datetime as dt

import geopandas as gpd
import merge_data
import numpy as np
import pandas as pd
from segment_speed_utils import gtfs_schedule_wrangling, helpers, time_series_utils
from shared_utils import catalog_utils, rt_dates, rt_utils
from update_vars import GTFS_DATA_DICT, RT_SCHED_GCS, SCHED_GCS

# https://github.com/cal-itp/data-analyses/blob/main/_shared_utils/shared_utils/gtfs_analytics_data.yml
GTFS_DATA_DICT = catalog_utils.get_catalog("gtfs_analytics_data")

from segment_speed_utils.project_vars import (
    COMPILED_CACHED_VIEWS,
    GTFS_DATA_DICT,
    PROJECT_CRS,
    RT_SCHED_GCS,
    SCHED_GCS,
    SEGMENT_GCS,
)

In [2]:
pd.options.display.max_columns = 100
pd.options.display.float_format = "{:.2f}".format
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [3]:
may_date = "2024-05-22"

In [4]:
drop_for_preview = [
    "schedule_gtfs_dataset_key",
    "trip_instance_key",
    "shape_array_key",
    "feed_key",
    "trip_id",
]

### Get routes with short headways.

In [5]:
subset = [
    "schedule_gtfs_dataset_key",
    "route_id",
    "direction_id",
    "route_primary_direction",
    "service_date",
    "frequency",
]

In [6]:
GTFS_DATA_DICT.rt_vs_schedule_tables.sched_route_direction_metrics

'schedule_route_dir/schedule_route_direction_metrics'

In [7]:
route_dir = merge_data.concatenate_schedule_by_route_direction([may_date])

In [8]:
route_dir.head()

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id,time_period,route_primary_direction,avg_scheduled_service_minutes,avg_stop_miles,n_scheduled_trips,frequency,is_express,is_rapid,is_rail,is_coverage,is_downtown_local,is_local,service_date
0,015d67d5b75b5cf2b710bbadadfb75f5,17,0.0,all_day,Northbound,51.77,0.27,22,0.92,0.0,0.0,0.0,0.0,1.0,0.0,2024-05-22
1,015d67d5b75b5cf2b710bbadadfb75f5,17,0.0,offpeak,Northbound,51.77,0.27,10,0.62,0.0,0.0,0.0,0.0,1.0,0.0,2024-05-22
2,015d67d5b75b5cf2b710bbadadfb75f5,17,0.0,peak,Northbound,51.77,0.27,12,1.5,0.0,0.0,0.0,0.0,1.0,0.0,2024-05-22
3,015d67d5b75b5cf2b710bbadadfb75f5,17,1.0,all_day,Southbound,46.73,0.28,22,0.92,0.0,1.0,0.0,0.0,1.0,0.0,2024-05-22
4,015d67d5b75b5cf2b710bbadadfb75f5,17,1.0,offpeak,Southbound,46.73,0.28,11,0.69,0.0,1.0,0.0,0.0,1.0,0.0,2024-05-22


In [9]:
route_dir2 = route_dir.loc[route_dir.time_period == "peak"].reset_index(drop=True)

In [10]:
route_dir2["headway_minutes"] = 60 / route_dir.frequency

In [11]:
route_freq_groupby = [
    "schedule_gtfs_dataset_key",
    "route_id",
    "direction_id",
    "route_primary_direction",
]

In [12]:
high_frequency_routes_median = (
    route_dir2.groupby(route_freq_groupby)
    .agg({"headway_minutes": "median"})
    .reset_index()
    .rename(columns={"headway_minutes": "med_headway_minutes"})
)

In [13]:
# Grab Crosswalk
CROSSWALK = GTFS_DATA_DICT.schedule_tables.gtfs_key_crosswalk

In [14]:
crosswalk_cols = [
    "schedule_gtfs_dataset_key",
    "organization_name",
    "name",
    "caltrans_district",
]

In [15]:
crosswalk_df = (
    time_series_utils.concatenate_datasets_across_dates(
        SCHED_GCS, CROSSWALK, [may_date], data_type="df", columns=crosswalk_cols
    )
    .sort_values(["service_date"])
    .reset_index(drop=True)
)

In [16]:
crosswalk_df.shape

(168, 5)

#### Grab routes in the 5th percentile of frequency for now.
* Eric: <i>Taking the 5%ile (17.65min headway) is reasonable, but I suspect the worst bunching issues might be on routes with headways at/below the 10min mark? Maybe try 15 and 10 as well?</i>

In [17]:
high_frequency_routes_median["med_headway_minutes"].describe(
    percentiles=[0.05, 0.1, 0.9, 0.95]
)

count   3238.00
mean     236.95
std      350.81
min        4.00
5%        13.83
10%       18.81
50%       89.55
90%      750.00
95%     1000.00
max     1500.00
Name: med_headway_minutes, dtype: float64

In [18]:
high_frequency_routes2 = high_frequency_routes_median.loc[
    high_frequency_routes_median.med_headway_minutes <= 15
]

#### Attach operators and districts

In [19]:
high_frequency_routes2 = pd.merge(
    high_frequency_routes2, crosswalk_df, on="schedule_gtfs_dataset_key", how="left"
)

In [20]:
high_frequency_routes2.route_id.nunique()

158

### Get trips of high frequency routes

In [21]:
TABLE = GTFS_DATA_DICT.schedule_downloads.trips

In [22]:
FILE = f"{COMPILED_CACHED_VIEWS}{TABLE}_{may_date}.parquet"

In [23]:
trips_subset = [
    "gtfs_dataset_key",
    "route_id",
    "trip_instance_key",
    "shape_array_key",
    "feed_key",
    "route_long_name",
    "direction_id",
    "route_type",
]

In [24]:
trips = pd.read_parquet(FILE)[trips_subset].rename(
    columns={"gtfs_dataset_key": "schedule_gtfs_dataset_key"}
)

In [25]:
# Find only trips that belong to high frequency routes
trips_freq_routes = pd.merge(
    trips,
    high_frequency_routes2,
    on=["schedule_gtfs_dataset_key", "route_id", "direction_id"],
    how="inner",
)

In [26]:
trips_freq_routes.shape

(4140, 14)

#### Attach route type
* Figure out the proper way to do this.

In [27]:
# https://gtfs.org/documentation/schedule/reference/#
route_type_crosswalk = {
    "route_type": ["0", "1", "2", "3", "4", "5", "6", "7", "11", "12"],
    "route_type_str": [
        "Tram, Streetcar, Light rail",
        "Subway, Metro",
        "Rail",
        "Bus",
        "Ferry.",
        "Cable tram.",
        "Aerial lift, suspended cable car (e.g., gondola lift, aerial tramway).",
        "Funicular.",
        "Trolleybus.",
        "Monorail.",
    ],
}

In [28]:
route_type_crosswalk_df = pd.DataFrame(route_type_crosswalk)

In [29]:
# Merge for route_type
trips_freq_routes = pd.merge(
    trips_freq_routes, route_type_crosswalk_df, on=["route_type"], how="left"
)

In [30]:
high_frequency_routes2 = trips_freq_routes.drop(columns=["route_type"])

In [31]:
high_frequency_routes2.route_type_str.value_counts()

Bus                            3837
Tram, Streetcar, Light rail     115
Rail                             98
Subway, Metro                    90
Name: route_type_str, dtype: int64

### `rt_stop_times2`: Get Stop Times of High Frequency Routes/Trips

In [32]:
rt_stop_times = pd.read_parquet(
    "gs://calitp-analytics-data/data-analyses/rt_vs_schedule/schedule_rt_stop_times_2024-05-22.parquet"
)

In [33]:
# How is it possible to have right_only trips?
pd.merge(
    rt_stop_times,
    trips_freq_routes,
    on=[
        "schedule_gtfs_dataset_key",
        "trip_instance_key",
    ],
    how="outer",
    indicator=True,
)[["_merge"]].value_counts()

_merge    
left_only     2483048
both           118214
right_only       1081
dtype: int64

In [34]:
# Find only stop times of trips that belong to high frequency trips
rt_stop_times2 = pd.merge(
    rt_stop_times,
    trips_freq_routes,
    on=[
        "schedule_gtfs_dataset_key",
        "trip_instance_key",
    ],
    how="inner",
)

In [35]:
rt_stop_times2.shape

(118214, 20)

In [36]:
rt_stop_times2.trip_id.nunique(), rt_stop_times2.trip_instance_key.nunique()

(3059, 3059)

#### What to do with `scheduled_arrival_sec` that are `nan`?

In [37]:
(len(rt_stop_times2) - rt_stop_times2.scheduled_arrival_sec.isna().sum()) / len(
    rt_stop_times2
)

0.9870827482362495

In [38]:
(rt_stop_times2.scheduled_arrival_sec.isna().sum()) / len(rt_stop_times2)

0.012917251763750486

In [39]:
# Add a copy of scheduled arrival sec
rt_stop_times2["scheduled_arrival_sec_copy"] = rt_stop_times2.scheduled_arrival_sec

In [40]:
rt_stop_times3 = rt_stop_times2.loc[
    rt_stop_times2.scheduled_arrival_sec.notna()
].reset_index(drop=True)

In [41]:
len(rt_stop_times3)

116687

In [42]:
len(rt_stop_times2)

118214

### Sort

In [43]:
# Rearrange: I want the stop sequence to be 1,2,3,4.
# stop ids can differ between trips of the same route and the same stop sequence is the same
rt_stop_times3 = rt_stop_times3.sort_values(
    by=[
        "schedule_gtfs_dataset_key",
        "route_id",
        "shape_array_key",
        "direction_id",
        "stop_sequence",
        "rt_arrival_sec",
    ]
).reset_index(drop=True)

###  `rt_stop_times3`: Deal with time

* If 82800  < `scheduled_arrival_time` < 86_400 but `rt_arrival_sec` is lower say 14_000 (4 am in the morning): then perhaps the bus was scheduled to arrive on May 21 (day before the service date) but it arrived a little later on the service date. 

* If  86_400 < `scheduled_arrival_time` and `rt_arrival_sec` is around 86_000 then this is the same service date. 

In [44]:
rt_stop_times3["scheduled_arrival_sec"].describe()

count   116687.00
mean     49179.24
std      18084.54
min      12660.00
25%      33960.00
50%      48540.00
75%      62460.00
max     106680.00
Name: scheduled_arrival_sec, dtype: float64

In [45]:
rt_stop_times3.loc[rt_stop_times3["scheduled_arrival_sec"] == 86_400].shape

(16, 21)

In [46]:
rt_stop_times3.loc[rt_stop_times3["scheduled_arrival_sec"] > 86_400].shape

(2583, 21)

#### Test  `scheduled_arrival_sec` rows ` rows that exceed 86,400 seconds

In [47]:
# Convert to midnight anything that goes past the service date
# rt_stop_times3 = convert_to_midnight(
#    rt_stop_times3, "scheduled_arrival_sec", may_date
# )

In [48]:
timestamp_subset = [
    "converted_schd_arrival",
    "converted_rt_arrival",
    "scheduled_arrival_sec",
    "rt_arrival_sec",
    "service_date",
]

In [49]:
86_400 - 3600

82800

In [50]:
def adjust_days_and_time(seconds, date, rt_arrival_sec):
    """
    Adjusts days and time based on seconds and rt_arrival_sec.

    Parameters:
    seconds (int): Number of seconds.
    date (datetime): Initial date.
    rt_arrival_sec (int): Arrival time in seconds.

    Returns:
    datetime: Adjusted date and time.
    """
    if rt_arrival_sec < (60 * 60) and (85_000 < seconds < 87_000):
        # Subtract a day
        return pd.Timestamp(date + pd.Timedelta(days=-1)) + pd.Timedelta(
            seconds=seconds % 86400
        )
    elif rt_arrival_sec < (86_400 / 2) and (86_400 < seconds):
        return pd.Timestamp(date) + pd.Timedelta(seconds=seconds % 86400)
    else:
        # No change
        return pd.Timestamp(date) + pd.Timedelta(seconds=seconds)

In [51]:
more_than_86400 = rt_stop_times3.loc[
    rt_stop_times3["scheduled_arrival_sec"] > 86_400
].reset_index(drop=True)

In [52]:
more_than_86400["converted_schd_arrival"] = more_than_86400.apply(
    lambda row: adjust_days_and_time(
        row["scheduled_arrival_sec"], row["service_date"], row["rt_arrival_sec"]
    ),
    axis=1,
)

In [53]:
more_than_86400["converted_rt_arrival"] = pd.to_datetime(
    more_than_86400["service_date"]
) + pd.to_timedelta(more_than_86400["rt_arrival_sec"] % 86400, unit="s")

In [54]:
more_than_86400["delay_min"] = (
    more_than_86400["converted_rt_arrival"] - more_than_86400["converted_schd_arrival"]
).dt.total_seconds() / 60

In [55]:
print(
    more_than_86400.delay_min.describe(
        percentiles=[0.01, 0.02, 0.05, 0.1, 0.9, 0.95, 0.98, 0.99]
    )
)

count   2583.00
mean     101.87
std      365.42
min      -18.18
1%        -5.47
2%        -3.25
5%        -1.92
10%       -1.20
50%        1.75
90%        9.60
95%     1440.26
98%     1445.47
99%     1448.25
max     1466.70
Name: delay_min, dtype: float64


In [56]:
more_than_86400.rt_arrival_sec.describe()

count    2583.00
mean     7041.60
std      5789.22
min         9.00
25%      2556.50
50%      5874.00
75%     10739.00
max     86389.00
Name: rt_arrival_sec, dtype: float64

In [57]:
more_than_86400.shape

(2583, 24)

#### Apply to all rows

In [58]:
rt_stop_times3["converted_schd_arrival"] = rt_stop_times3.apply(
    lambda row: adjust_days_and_time(
        row["scheduled_arrival_sec"], row["service_date"], row["rt_arrival_sec"]
    ),
    axis=1,
)

In [59]:
rt_stop_times3["converted_rt_arrival"] = pd.to_datetime(
    rt_stop_times3["service_date"]
) + pd.to_timedelta(rt_stop_times3["rt_arrival_sec"] % 86400, unit="s")

In [60]:
rt_stop_times3["delay_min"] = (
    rt_stop_times3["converted_rt_arrival"] - rt_stop_times3["converted_schd_arrival"]
).dt.total_seconds() / 60

In [61]:
def add_day_to_23_hours(df):
    df["converted_schd_arrival"] = df.apply(
        lambda row: row["converted_schd_arrival"] + pd.Timedelta(days=1)
        if row["delay_min"] > (23 * 60)
        else row["converted_schd_arrival"],
        axis=1,
    )

    df["delay_min"] = (
        df["converted_rt_arrival"] - rt_stop_times3["converted_schd_arrival"]
    ).dt.total_seconds() / 60
    return df

In [62]:
percentiles = [0.01, 0.02, 0.05, 0.1, 0.9, 0.95, 0.98, 0.99]

In [63]:
rt_stop_times3 = add_day_to_23_hours(rt_stop_times3)

In [64]:
print(rt_stop_times3.delay_min.describe(percentiles))

count   116687.00
mean         3.10
std         20.22
min      -1345.67
1%          -4.85
2%          -3.38
5%          -2.12
10%         -1.25
50%          1.78
90%          8.33
95%         11.50
98%         17.12
99%         23.32
max        727.87
Name: delay_min, dtype: float64


In [65]:
percentile_99 = rt_stop_times3["delay_min"].quantile(0.99)

In [66]:
percentile_01 = rt_stop_times3["delay_min"].quantile(0.01)

In [67]:
percentile_01_df = rt_stop_times3.loc[rt_stop_times3.delay_min < percentile_01]

In [68]:
len(percentile_01_df)

1165

In [69]:
delay_above_99 = rt_stop_times3.loc[rt_stop_times3.delay_min > percentile_99]

In [70]:
delay_above_99.delay_min.describe(percentiles)

count   1167.00
mean      83.06
std      133.17
min       23.33
1%        23.44
2%        23.59
5%        23.92
10%       24.68
50%       30.17
90%      274.95
95%      450.83
98%      460.65
99%      616.57
max      727.87
Name: delay_min, dtype: float64

In [71]:
len(delay_above_99)

1167

In [72]:
percentile_01_df.delay_min.describe(percentiles)

count    1165.00
mean      -41.63
std       114.86
min     -1345.67
1%       -593.61
2%       -393.00
5%       -183.12
10%       -52.63
50%       -10.10
90%        -5.22
95%        -5.03
98%        -4.92
99%        -4.90
max        -4.87
Name: delay_min, dtype: float64

In [73]:
percentile_10 = percentile_01_df["delay_min"].quantile(0.10)

In [74]:
percentile_10

-52.626666666666665

In [75]:
percentile_90 = delay_above_99["delay_min"].quantile(0.90)

In [76]:
percentile_90

274.9466666666675

* If scheduled_arrival_sec is in the 86000 ballpark and rt_arrival_sec is less than 60*60*3, then subtract a day 

### Filter out values in `delay` that are very extreme.

In [77]:
# Filter to only delays that are an hour or less
rt_stop_times4 = rt_stop_times3[
    (rt_stop_times3["delay_min"] >= percentile_10)
    & (rt_stop_times3["delay_min"] <= percentile_90)
].reset_index(drop=True)

In [78]:
len(rt_stop_times4) - len(rt_stop_times3)

-234

In [79]:
rt_stop_times4.delay_min.describe(percentiles)

count   116453.00
mean         2.94
std          6.38
min        -52.42
1%          -4.59
2%          -3.30
5%          -2.10
10%         -1.25
50%          1.78
90%          8.30
95%         11.40
98%         16.77
99%         21.97
max        271.13
Name: delay_min, dtype: float64

### Calculate the actual & scheduled headway the `operator-route-direction_id-stop_sequence-stop_id-` grain

In [80]:
groupby_cols = [
    "schedule_gtfs_dataset_key",
    "route_id",
    "shape_array_key",
    "direction_id",
    "route_primary_direction",
    "stop_sequence",
    "stop_id",
]

In [81]:
rt_stop_times4.columns

Index(['trip_id', 'stop_id', 'stop_sequence', 'scheduled_arrival_sec',
       'schedule_gtfs_dataset_key', 'trip_instance_key', 'rt_arrival_sec',
       'route_id', 'shape_array_key', 'feed_key', 'route_long_name',
       'direction_id', 'route_type', 'route_primary_direction',
       'med_headway_minutes', 'organization_name', 'name', 'caltrans_district',
       'service_date', 'route_type_str', 'scheduled_arrival_sec_copy',
       'converted_schd_arrival', 'converted_rt_arrival', 'delay_min'],
      dtype='object')

In [82]:
# Subtract rt_arrival_sec from the previous row to the target row
# using groupby columns
rt_stop_times4["actual_arrival_lag"] = rt_stop_times4.groupby(groupby_cols)[
    "converted_rt_arrival"
].diff()

In [83]:
rt_stop_times4["scheduled_arrival_lag"] = rt_stop_times4.groupby(groupby_cols)[
    "converted_schd_arrival"
].diff()

In [84]:
rt_stop_times4.head(10)[
    [
        "converted_rt_arrival",
        "actual_arrival_lag",
        "converted_schd_arrival",
        "scheduled_arrival_lag",
    ]
]

Unnamed: 0,converted_rt_arrival,actual_arrival_lag,converted_schd_arrival,scheduled_arrival_lag
0,2024-05-22 00:06:52,NaT,2024-05-22 00:08:00,NaT
1,2024-05-22 00:44:22,0 days 00:37:30,2024-05-22 00:41:00,0 days 00:33:00
2,2024-05-22 01:12:29,0 days 00:28:07,2024-05-22 01:11:00,0 days 00:30:00
3,2024-05-22 01:39:47,0 days 00:27:18,2024-05-22 01:41:00,0 days 00:30:00
4,2024-05-22 02:11:40,0 days 00:31:53,2024-05-22 02:11:00,0 days 00:30:00
5,2024-05-22 02:39:14,0 days 00:27:34,2024-05-22 02:41:00,0 days 00:30:00
6,2024-05-22 03:12:43,0 days 00:33:29,2024-05-22 03:11:00,0 days 00:30:00
7,2024-05-22 03:41:05,0 days 00:28:22,2024-05-22 03:42:00,0 days 00:31:00
8,2024-05-22 04:11:44,0 days 00:30:39,2024-05-22 04:12:00,0 days 00:30:00
9,2024-05-22 04:45:07,0 days 00:33:23,2024-05-22 04:49:00,0 days 00:37:00


### Fill in `nans` with 0 
* I am not sure if `nans` impact calculations of the mean scheduled headway and whatnot?
* These `nans` are because the first `operator-route-stop_id-stop_sequence` combo won't have anything to compare it to.
* Katrina: <i>I would fill in the actual/schedule headway columns with 0 rather than dropping the first row  in each grouping. I wonder if it makes sense to use a more descriptive column name than headway, such as "minutes since last vehicle"</i>

In [85]:
rt_stop_times4.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 116453 entries, 0 to 116452
Data columns (total 26 columns):
 #   Column                      Non-Null Count   Dtype          
---  ------                      --------------   -----          
 0   trip_id                     116453 non-null  object         
 1   stop_id                     116453 non-null  object         
 2   stop_sequence               116453 non-null  int64          
 3   scheduled_arrival_sec       116453 non-null  float64        
 4   schedule_gtfs_dataset_key   116453 non-null  object         
 5   trip_instance_key           116453 non-null  object         
 6   rt_arrival_sec              116453 non-null  int64          
 7   route_id                    116453 non-null  object         
 8   shape_array_key             116453 non-null  object         
 9   feed_key                    116453 non-null  object         
 10  route_long_name             116453 non-null  object         
 11  direction_id              

### Transit Matters Method

In [86]:
transit_matters_df1 = rt_stop_times4.copy()

In [87]:
transit_matters_df1["pct_actual_schd_headway"] = (
    transit_matters_df1.actual_arrival_lag / transit_matters_df1.scheduled_arrival_lag
)

In [88]:
import numpy as np

transit_matters_df1["bunched_y_n"] = np.where(
    transit_matters_df1["pct_actual_schd_headway"] < 0.25, "bunched", "not bunched"
)

#### There are some very extreme values: how to deal with this?


In [89]:
transit_matters_df1.pct_actual_schd_headway.describe()

count   110343.00
mean         0.99
std          0.28
min         -1.45
25%          0.90
50%          1.00
75%          1.10
max          3.24
Name: pct_actual_schd_headway, dtype: float64

In [90]:
transit_matters_df1.bunched_y_n.value_counts() / len(transit_matters_df1)

not bunched   0.98
bunched       0.02
Name: bunched_y_n, dtype: float64

In [91]:
example1 = transit_matters_df1.loc[
    (transit_matters_df1.stop_id == "5637")
    & (
        transit_matters_df1.schedule_gtfs_dataset_key
        == "0666caf3ec1ecc96b74f4477ee4bc939"
    )
    & (transit_matters_df1.stop_sequence == 32)
    & (transit_matters_df1.route_id == "204-13172")
]

In [92]:
example1["sched_arrival_min"] = example1.scheduled_arrival_sec / 60

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  example1["sched_arrival_min"] = example1.scheduled_arrival_sec / 60


In [93]:
example1["rt_arrival_min"] = example1.rt_arrival_sec / 60

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  example1["rt_arrival_min"] = example1.rt_arrival_sec / 60


In [94]:
example1.shape

(106, 30)

In [95]:
example1[
    [
        "converted_rt_arrival",
        "actual_arrival_lag",
        "converted_schd_arrival",
        "scheduled_arrival_lag",
        "pct_actual_schd_headway",
        "bunched_y_n",
    ]
]

Unnamed: 0,converted_rt_arrival,actual_arrival_lag,converted_schd_arrival,scheduled_arrival_lag,pct_actual_schd_headway,bunched_y_n
3158,2024-05-22 00:07:06,NaT,2024-05-22 00:02:00,NaT,,not bunched
3159,2024-05-22 00:34:01,0 days 00:26:55,2024-05-22 00:32:00,0 days 00:30:00,0.9,not bunched
3160,2024-05-22 01:14:46,0 days 00:40:45,2024-05-22 01:04:00,0 days 00:32:00,1.27,not bunched
3161,2024-05-22 01:35:45,0 days 00:20:59,2024-05-22 01:34:00,0 days 00:30:00,0.7,not bunched
3162,2024-05-22 02:04:52,0 days 00:29:07,2024-05-22 02:04:00,0 days 00:30:00,0.97,not bunched
3163,2024-05-22 02:34:57,0 days 00:30:05,2024-05-22 02:34:00,0 days 00:30:00,1.0,not bunched
3164,2024-05-22 03:03:09,0 days 00:28:12,2024-05-22 03:04:00,0 days 00:30:00,0.94,not bunched
3165,2024-05-22 03:35:53,0 days 00:32:44,2024-05-22 03:34:00,0 days 00:30:00,1.09,not bunched
3166,2024-05-22 04:04:36,0 days 00:28:43,2024-05-22 04:04:00,0 days 00:30:00,0.96,not bunched
3167,2024-05-22 04:39:50,0 days 00:35:14,2024-05-22 04:34:00,0 days 00:30:00,1.17,not bunched


#### Groupby grain and see how many trips for that grain are considered "bunched" or not.
* Take out `stop_sequence`?

In [96]:
transit_matters_agg = [
    "caltrans_district",
    "organization_name",
    "route_long_name",
    "route_type_str",
    "shape_array_key",
    "route_id",
    "stop_id",
    "route_primary_direction",
    "bunched_y_n",
]

In [97]:
transit_matters_df2 = (
    transit_matters_df1.groupby(transit_matters_agg)
    .agg({"trip_instance_key": "nunique"})
    .reset_index()
).rename(columns = {"trip_instance_key":"all_trips"} )

In [98]:
# Filter out only rows that are bunched.
bunched_only = transit_matters_df2.loc[
    transit_matters_df2.bunched_y_n == "bunched"
].reset_index(drop=True).drop(columns = ["bunched_y_n"])

In [99]:
len(bunched_only)

730

In [100]:
bunched_only = bunched_only.rename(columns={"all_trips": "bunched_trips"})

In [101]:
bunched_only.head(2)

Unnamed: 0,caltrans_district,organization_name,route_long_name,route_type_str,shape_array_key,route_id,stop_id,route_primary_direction,bunched_trips
0,03 - Marysville,Yolo County Transportation District,ROUTE 215 WB,Bus,e939d633652e2af6d3aa82d28a042dbf,07959480-2a40-4a51-92ac-8ca2029d5f4f,a1d203a6-cfdd-40a1-af2f-0fa502ea65b4,Westbound,1
1,03 - Marysville,Yolo County Transportation District,ROUTE 215 WB,Bus,e939d633652e2af6d3aa82d28a042dbf,07959480-2a40-4a51-92ac-8ca2029d5f4f,c8a9bfc8-7e84-483b-95bc-02a1494c3ae3,Westbound,1


In [102]:
transit_matters_df2 = transit_matters_df2.drop(columns = ["bunched_y_n"])

In [103]:
transit_matters_df2.head(2)

Unnamed: 0,caltrans_district,organization_name,route_long_name,route_type_str,shape_array_key,route_id,stop_id,route_primary_direction,all_trips
0,01 - Eureka,City of Eureka,AMRTS Gold Route,Bus,c47c15ffc43da6e556ff913272778e4d,14,1262,Northbound,11
1,01 - Eureka,City of Eureka,AMRTS Gold Route,Bus,c47c15ffc43da6e556ff913272778e4d,14,1264,Northbound,11


In [104]:
transit_matters_agg.remove("bunched_y_n")

In [105]:
transit_matters_agg

['caltrans_district',
 'organization_name',
 'route_long_name',
 'route_type_str',
 'shape_array_key',
 'route_id',
 'stop_id',
 'route_primary_direction']

In [106]:
# Merge back, using left merge to keep bunching to find % of bunched trips
transit_matters_m1 = pd.merge(
    transit_matters_df2, bunched_only, on=transit_matters_agg, how="outer"
)

In [107]:
transit_matters_m1.loc[
    (transit_matters_m1.stop_id == "5637")
    & (
        transit_matters_m1.organization_name
        == "Los Angeles County Metropolitan Transportation Authority"
    )
    & (transit_matters_m1.route_id == "204-13172")
]

Unnamed: 0,caltrans_district,organization_name,route_long_name,route_type_str,shape_array_key,route_id,stop_id,route_primary_direction,all_trips,bunched_trips
4131,07 - Los Angeles,Los Angeles County Metropolitan Transportation Authority,Metro Local Line,Bus,6a10ede3fa469c8b4d9bf761946ed20a,204-13172,5637,Northbound,8,8.0
4132,07 - Los Angeles,Los Angeles County Metropolitan Transportation Authority,Metro Local Line,Bus,6a10ede3fa469c8b4d9bf761946ed20a,204-13172,5637,Northbound,98,8.0


In [108]:
transit_matters_m2 = transit_matters_m1.sort_values(by = ["all_trips"], ascending = False).drop_duplicates(subset = transit_matters_agg).reset_index(drop = True)

In [109]:
transit_matters_m2.loc[
    (transit_matters_m2.stop_id == "5637")
    & (
        transit_matters_m2.organization_name
        == "Los Angeles County Metropolitan Transportation Authority"
    )
    & (transit_matters_m2.route_id == "204-13172")
]

Unnamed: 0,caltrans_district,organization_name,route_long_name,route_type_str,shape_array_key,route_id,stop_id,route_primary_direction,all_trips,bunched_trips
71,07 - Los Angeles,Los Angeles County Metropolitan Transportation Authority,Metro Local Line,Bus,6a10ede3fa469c8b4d9bf761946ed20a,204-13172,5637,Northbound,98,8.0


In [110]:
transit_matters_m2["pct_trips_bunched"] = (
    transit_matters_m2.bunched_trips / transit_matters_m2.all_trips * 100
)

In [111]:
transit_matters_m2.pct_trips_bunched = transit_matters_m2.pct_trips_bunched.fillna(0)

In [112]:
transit_matters_m2.pct_trips_bunched.describe(percentiles)

count   5871.00
mean       0.64
std        2.91
min        0.00
1%         0.00
2%         0.00
5%         0.00
10%        0.00
50%        0.00
90%        1.79
95%        4.04
98%        8.47
99%       11.54
max      100.00
Name: pct_trips_bunched, dtype: float64

In [121]:
transit_matters_m2.sort_values(by = ["pct_trips_bunched"], ascending = False).head(50)

Unnamed: 0,caltrans_district,organization_name,route_long_name,route_type_str,shape_array_key,route_id,stop_id,route_primary_direction,all_trips,bunched_trips,pct_trips_bunched
5599,04 - Oakland,Peninsula Corridor Joint Powers Board,Local,Rail,8c4de04e7398d418c12cc1541651e951,L1,70022,Eastbound,1,1.0,100.0
5582,04 - Oakland,Peninsula Corridor Joint Powers Board,Local,Rail,8c4de04e7398d418c12cc1541651e951,L1,70032,Eastbound,1,1.0,100.0
728,04 - Oakland,Emeryville Transportation Management Agency,Hollis,Bus,0628e405f93c2d0b5e3e68a7115857d4,Hollis,855335,Northbound,44,12.0,27.27
4362,07 - Los Angeles,Los Angeles County Metropolitan Transportation Authority,Metro Local Line,Bus,0688a14c97a2ebfe90f5674c1262d741,217-13172,104720,Southbound,4,1.0,25.0
4356,07 - Los Angeles,Los Angeles County Metropolitan Transportation Authority,Metro Local Line,Bus,0688a14c97a2ebfe90f5674c1262d741,217-13172,11013,Southbound,4,1.0,25.0
4337,07 - Los Angeles,Los Angeles County Metropolitan Transportation Authority,Metro Local Line,Bus,0688a14c97a2ebfe90f5674c1262d741,217-13172,11031,Southbound,4,1.0,25.0
2911,06 - Fresno,City of Visalia,Route 9,Bus,60da59c7000ea5dcb5f845d8fa227f14,2042,2307469,Westbound,13,3.0,23.08
2588,06 - Fresno,City of Visalia,Route 9,Bus,60da59c7000ea5dcb5f845d8fa227f14,2042,2307692,Westbound,15,3.0,20.0
2631,06 - Fresno,City of Visalia,Route 9,Bus,60da59c7000ea5dcb5f845d8fa227f14,2042,2307698,Westbound,15,3.0,20.0
2625,06 - Fresno,City of Visalia,Route 9,Bus,60da59c7000ea5dcb5f845d8fa227f14,2042,2307716,Westbound,15,3.0,20.0


In [125]:
example2 = transit_matters_df1.loc[
    (transit_matters_df1.stop_id == "2307719")
    & (
        transit_matters_df1.organization_name
        == "City of Visalia"
    )
    & (transit_matters_df1.route_id == "2042")
    & (transit_matters_df1.shape_array_key == "60da59c7000ea5dcb5f845d8fa227f14")
]

In [126]:
example2[
    [
        "converted_rt_arrival",
        "actual_arrival_lag",
        "converted_schd_arrival",
        "scheduled_arrival_lag",
        "pct_actual_schd_headway",
        "bunched_y_n",
    ]
]

Unnamed: 0,converted_rt_arrival,actual_arrival_lag,converted_schd_arrival,scheduled_arrival_lag,pct_actual_schd_headway,bunched_y_n
91366,2024-05-22 06:32:47,NaT,2024-05-22 06:42:00,NaT,,not bunched
91367,2024-05-22 07:26:41,0 days 00:53:54,2024-05-22 07:27:00,0 days 00:45:00,1.2,not bunched
91368,2024-05-22 08:06:11,0 days 00:39:30,2024-05-22 08:12:00,0 days 00:45:00,0.88,not bunched
91369,2024-05-22 08:56:57,0 days 00:50:46,2024-05-22 08:57:00,0 days 00:45:00,1.13,not bunched
91370,2024-05-22 09:37:59,0 days 00:41:02,2024-05-22 09:42:00,0 days 00:45:00,0.91,not bunched
91371,2024-05-22 10:27:26,0 days 00:49:27,2024-05-22 10:27:00,0 days 00:45:00,1.1,not bunched
91372,2024-05-22 11:10:05,0 days 00:42:39,2024-05-22 11:12:00,0 days 00:45:00,0.95,not bunched
91373,2024-05-22 12:01:01,0 days 00:50:56,2024-05-22 11:57:00,0 days 00:45:00,1.13,not bunched
91374,2024-05-22 12:38:08,0 days 00:37:07,2024-05-22 12:42:00,0 days 00:45:00,0.82,not bunched
91375,2024-05-22 13:27:10,0 days 00:49:02,2024-05-22 13:27:00,0 days 00:45:00,1.09,not bunched


### Use 2 minute benchmark
* [Source](https://static1.squarespace.com/static/533b9a24e4b01d79d0ae4376/t/645e82de1f570b31497c44dc/1683915486889/TransitMatters-Headwaymanagement.pdf)
* Justifying the use of
headway maintenance. For example, in April
2022 the 66 bus significantly bunched around
several stops. When bunching is defined as
buses that run within two minutes or less of
each other, inbound buses towards Nubian
Square bunched 10% of the time at Brigham
Circle, 9% at Brookline Village and Roxbury
Crossing, and 8% of the time at Coolidge
Corner. Bunching is even more dramatic
outbound towards Harvard Square where
buses bunched over 35% of the time at Winship
St, 13% at Coolidge Corner and Harvard Ave at
Commonwealth Ave, and 12% at North Harvard
St at Western Ave. View more data about bus
bunching through the TransitMatters Data
Dashboard here.

* To Do: add back in route  & operator information

In [114]:
two_minutess_df = rt_stop_times5.copy()

NameError: name 'rt_stop_times5' is not defined

In [None]:
two_minutess_df.columns

In [None]:
two_minutess_df["actual_headway_min"] = two_minutess_df.rt_arrival_sec / 60

In [None]:
two_minutess_df["bunched_y_n"] = np.where(
    two_minutess_df["actual_headway_min"] <= 2, "bunched", "not bunched"
)

In [None]:
two_minutess_df.info()

In [None]:
two_minutess_df.bunched_y_n.value_counts()

#### Same code as Transit Matters Approach

In [None]:
two_minutes_agg1 = (
    two_minutess_df.groupby(
        [
            "schedule_gtfs_dataset_key",
            "route_long_name",
            "shape_array_key",
            "route_id",
            "stop_id",
            "direction_id",
            "route_primary_direction",
            "bunched_y_n",
        ]
    )
    .agg({"trip_instance_key": "nunique"})
    .reset_index()
)

In [None]:
bunched_only_two_min = (
    two_minutes_agg1.loc[two_minutes_agg1.bunched_y_n == "bunched"]
    .reset_index(drop=True)
    .rename(columns={"trip_instance_key": "bunched_trips"})
)

In [None]:
# I want to do a left merge because I'm only interested in trips that bunched.
bunched_only_two_min = pd.merge(
    bunched_only_two_min,
    transit_matters_all_trips,
    on=[
        "schedule_gtfs_dataset_key",
        "route_long_name",
        "shape_array_key",
        "route_id",
        "stop_id",
        "direction_id",
        "route_primary_direction",
    ],
    how="left",
)

In [None]:
bunched_only_two_min["pct_trips_bunched"] = (
    bunched_only_two_min.bunched_trips / bunched_only_two_min.all_trips * 100
)

In [None]:
bunched_only_two_min = bunched_only_two_min.drop(columns=["all_trips"])

In [None]:
bunched_only_two_min.head(2)

In [None]:
# Need to do a left merge on all trips for the stops that don't have bunching.
final_two_minute = pd.merge(
    transit_matters_all_trips,
    bunched_only_two_min,
    on=[
        "schedule_gtfs_dataset_key",
        "route_long_name",
        "shape_array_key",
        "route_id",
        "stop_id",
        "direction_id",
        "route_primary_direction",
    ],
    how="left",
)

In [None]:
final_two_minute.shape

In [None]:
final_two_minute = final_two_minute.drop(columns=["bunched_y_n"])

In [None]:
final_two_minute = final_two_minute.fillna(0)

In [None]:
final_two_minute.head()

In [None]:
bunched = final_two_minute.loc[final_two_minute.pct_trips_bunched != 0]

In [None]:
bunched.all_trips.describe()

In [None]:
bunched.loc[
    (bunched.schedule_gtfs_dataset_key == "7cc0cb1871dfd558f11a2885c145d144")
    & (bunched.shape_array_key == "955e2fc8f9f8a4be2c67c7212be874f6")
    & (bunched.route_id == "1")
    & (bunched.direction_id == 1)
    & (bunched.stop_id == "13853")
]

In [None]:
rt_stop_times5.loc[
    (rt_stop_times5.schedule_gtfs_dataset_key == "7cc0cb1871dfd558f11a2885c145d144")
    & (rt_stop_times5.shape_array_key == "955e2fc8f9f8a4be2c67c7212be874f6")
    & (rt_stop_times5.route_id == "1")
    & (rt_stop_times5.direction_id == 1)
    & (rt_stop_times5.stop_id == "13853")
][["scheduled_arrival_sec2", "rt_arrival_sec", "actual_headway", "schd_headway"]]

In [None]:
bunched.sort_values(by=["pct_trips_bunched"], ascending=False)