## Transit Bunching V2
* Incorporating Katrina and Eric's comments.
* cd data-analyses/rt_segment_speeds && pip install -r requirements.txt && cd ../_shared_utils && make setup_env && cd ../gtfs_digest
* [Issue](https://github.com/cal-itp/data-analyses/issues/1099)

In [1]:
import datetime as dt

import geopandas as gpd
import merge_data
import numpy as np
import pandas as pd
from segment_speed_utils import gtfs_schedule_wrangling, helpers, time_series_utils
from shared_utils import catalog_utils, rt_dates, rt_utils
from update_vars import GTFS_DATA_DICT, RT_SCHED_GCS, SCHED_GCS

# https://github.com/cal-itp/data-analyses/blob/main/_shared_utils/shared_utils/gtfs_analytics_data.yml
GTFS_DATA_DICT = catalog_utils.get_catalog("gtfs_analytics_data")

from segment_speed_utils.project_vars import (
    COMPILED_CACHED_VIEWS,
    GTFS_DATA_DICT,
    PROJECT_CRS,
    RT_SCHED_GCS,
    SCHED_GCS,
    SEGMENT_GCS,
)

In [2]:
pd.options.display.max_columns = 100
pd.options.display.float_format = "{:.2f}".format
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [3]:
may_date = "2024-05-22"

In [4]:
drop_for_preview = [
    "schedule_gtfs_dataset_key",
    "trip_instance_key",
    "shape_array_key",
    "feed_key",
    "trip_id",
]

### Get routes with short headways.

In [5]:
subset = [
    "schedule_gtfs_dataset_key",
    "route_id",
    "direction_id",
    "route_primary_direction",
    "service_date",
    "frequency",
]

In [6]:
GTFS_DATA_DICT.rt_vs_schedule_tables.sched_route_direction_metrics

'schedule_route_dir/schedule_route_direction_metrics'

In [7]:
route_dir = merge_data.concatenate_schedule_by_route_direction([may_date])

In [8]:
route_dir.head()

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id,time_period,route_primary_direction,avg_scheduled_service_minutes,avg_stop_miles,n_scheduled_trips,frequency,is_express,is_rapid,is_rail,is_coverage,is_downtown_local,is_local,service_date
0,015d67d5b75b5cf2b710bbadadfb75f5,17,0.0,all_day,Northbound,51.77,0.27,22,0.92,0.0,0.0,0.0,0.0,1.0,0.0,2024-05-22
1,015d67d5b75b5cf2b710bbadadfb75f5,17,0.0,offpeak,Northbound,51.77,0.27,10,0.62,0.0,0.0,0.0,0.0,1.0,0.0,2024-05-22
2,015d67d5b75b5cf2b710bbadadfb75f5,17,0.0,peak,Northbound,51.77,0.27,12,1.5,0.0,0.0,0.0,0.0,1.0,0.0,2024-05-22
3,015d67d5b75b5cf2b710bbadadfb75f5,17,1.0,all_day,Southbound,46.73,0.28,22,0.92,0.0,1.0,0.0,0.0,1.0,0.0,2024-05-22
4,015d67d5b75b5cf2b710bbadadfb75f5,17,1.0,offpeak,Southbound,46.73,0.28,11,0.69,0.0,1.0,0.0,0.0,1.0,0.0,2024-05-22


#### For now, only calculating stats based on "peak" time frequency, but later on, I'll calculate this for all routes regardless of peak/offpeak/all_day

In [9]:
route_dir2 = route_dir.loc[route_dir.time_period == "peak"].reset_index(drop=True)

In [10]:
route_dir2["headway_minutes"] = 60 / route_dir.frequency

In [11]:
route_freq_groupby = [
    "schedule_gtfs_dataset_key",
    "route_id",
    "direction_id",
    "route_primary_direction",
]

In [12]:
high_frequency_routes_median = (
    route_dir2.groupby(route_freq_groupby)
    .agg({"headway_minutes": "median"})
    .reset_index()
    .rename(columns={"headway_minutes": "med_headway_minutes"})
)

In [13]:
# Grab Crosswalk
CROSSWALK = GTFS_DATA_DICT.schedule_tables.gtfs_key_crosswalk

In [14]:
crosswalk_cols = [
    "schedule_gtfs_dataset_key",
    "organization_name",
    "name",
    "caltrans_district",
]

In [15]:
crosswalk_df = (
    time_series_utils.concatenate_datasets_across_dates(
        SCHED_GCS, CROSSWALK, [may_date], data_type="df", columns=crosswalk_cols
    )
    .sort_values(["service_date"])
    .reset_index(drop=True)
)

In [16]:
crosswalk_df.shape

(189, 5)

#### Grab routes in the 5th percentile of frequency for now.
* Eric: <i>Taking the 5%ile (17.65min headway) is reasonable, but I suspect the worst bunching issues might be on routes with headways at/below the 10min mark? Maybe try 15 and 10 as well?</i>

In [17]:
high_frequency_routes_median["med_headway_minutes"].describe(
    percentiles=[0.05, 0.1, 0.9, 0.95]
)

count   3238.00
mean     236.95
std      350.81
min        4.00
5%        13.83
10%       18.81
50%       89.55
90%      750.00
95%     1000.00
max     1500.00
Name: med_headway_minutes, dtype: float64

In [18]:
high_frequency_routes2 = high_frequency_routes_median.loc[
    high_frequency_routes_median.med_headway_minutes <= 15
]

#### Attach operators and districts

In [19]:
high_frequency_routes2 = pd.merge(
    high_frequency_routes2, crosswalk_df, on="schedule_gtfs_dataset_key", how="left"
)

In [20]:
high_frequency_routes2.route_id.nunique()

158

### Get trips of high frequency routes

In [21]:
TABLE = GTFS_DATA_DICT.schedule_downloads.trips

In [22]:
FILE = f"{COMPILED_CACHED_VIEWS}{TABLE}_{may_date}.parquet"

In [23]:
trips_subset = [
    "gtfs_dataset_key",
    "route_id",
    "trip_instance_key",
    "shape_array_key",
    "feed_key",
    "route_long_name",
    "direction_id",
    "route_type",
]

In [24]:
trips = pd.read_parquet(FILE)[trips_subset].rename(
    columns={"gtfs_dataset_key": "schedule_gtfs_dataset_key"}
)

In [25]:
# Find only trips that belong to high frequency routes
trips_freq_routes = pd.merge(
    trips,
    high_frequency_routes2,
    on=["schedule_gtfs_dataset_key", "route_id", "direction_id"],
    how="inner",
)

In [26]:
trips_freq_routes.shape

(6446, 14)

In [27]:
trips_freq_routes.route_id.nunique()

158

### HELP: Attach route type
* Figure out the proper way to do this.
* DO we want all route types? 

In [28]:
# https://gtfs.org/documentation/schedule/reference/#
route_type_crosswalk = {
    "route_type": ["0", "1", "2", "3", "4", "5", "6", "7", "11", "12"],
    "route_type_str": [
        "Tram, Streetcar, Light rail",
        "Subway, Metro",
        "Rail",
        "Bus",
        "Ferry.",
        "Cable tram.",
        "Aerial lift, suspended cable car (e.g., gondola lift, aerial tramway).",
        "Funicular.",
        "Trolleybus.",
        "Monorail.",
    ],
}

In [29]:
route_type_crosswalk_df = pd.DataFrame(route_type_crosswalk)

In [30]:
# Merge for route_type
trips_freq_routes = pd.merge(
    trips_freq_routes, route_type_crosswalk_df, on=["route_type"], how="left"
)

In [31]:
high_frequency_routes2 = trips_freq_routes.drop(columns=["route_type"])

In [32]:
high_frequency_routes2.route_type_str = high_frequency_routes2.route_type_str.fillna(
    "NA"
)

In [33]:
high_frequency_routes2.route_type_str.value_counts()

Bus                            6143
Tram, Streetcar, Light rail     115
Rail                             98
Subway, Metro                    90
Name: route_type_str, dtype: int64

In [34]:
high_frequency_routes2 = high_frequency_routes2.rename(
    columns={"route_type_str": "route_type"}
)

In [35]:
high_frequency_routes2.route_id.nunique()

158

### `rt_stop_times2`: Get Stop Times of High Frequency Routes/Trips
#### NOTE: There aren't that many rows that are found in "both" dataframes? I am using data from the same analysis_date.

In [36]:
may_date

'2024-05-22'

In [37]:
rt_stop_times = pd.read_parquet(
    "gs://calitp-analytics-data/data-analyses/rt_vs_schedule/schedule_rt_stop_times_2024-05-22.parquet"
)

In [38]:
rt_stop_times.head(2)

Unnamed: 0,trip_id,stop_id,stop_sequence,scheduled_arrival_sec,schedule_gtfs_dataset_key,trip_instance_key,rt_arrival_sec
0,1d105244-776c-4b3f-af78-9c7ad78c2103,0b2443b6-b50f-452b-a749-464588ca93b8,8,60991.0,1fd2f07342d966919b15d5d37fda8cc8,45ae17540ca9fb5030c84dbb12e48e9a,61434
1,1d105244-776c-4b3f-af78-9c7ad78c2103,cd5650b0-9a18-4e78-aedc-385f3094fa0f,9,61179.0,1fd2f07342d966919b15d5d37fda8cc8,45ae17540ca9fb5030c84dbb12e48e9a,61616


In [39]:
trips_freq_routes.head(2)

Unnamed: 0,schedule_gtfs_dataset_key,route_id,trip_instance_key,shape_array_key,feed_key,route_long_name,direction_id,route_type,route_primary_direction,med_headway_minutes,organization_name,name,caltrans_district,service_date,route_type_str
0,1770249a5a2e770ca90628434d4934b1,3404,e2360ffa7956dc185476b183e61e08c0,006c2eb48c8a3deeba175207dd110219,926867fdee73d5fbfe4f011871bcd830,Route 16,0.0,3,Northbound,14.56,Ventura County Transportation Commission,VCTC GMV Schedule,07 - Los Angeles,2024-05-22,Bus
1,1770249a5a2e770ca90628434d4934b1,3404,e2360ffa7956dc185476b183e61e08c0,006c2eb48c8a3deeba175207dd110219,926867fdee73d5fbfe4f011871bcd830,Route 16,0.0,3,Northbound,14.56,City of Simi Valley,VCTC GMV Schedule,07 - Los Angeles,2024-05-22,Bus


In [40]:
# How is it possible to have right_only trips?
pd.merge(
    rt_stop_times,
    trips_freq_routes,
    on=[
        "schedule_gtfs_dataset_key",
        "trip_instance_key",
    ],
    how="outer",
    indicator=True,
)[["_merge"]].value_counts()

_merge    
left_only     2483048
both           165073
right_only       1203
dtype: int64

In [41]:
# Find only stop times of trips that belong to high frequency trips
rt_stop_times2 = pd.merge(
    rt_stop_times,
    trips_freq_routes,
    on=[
        "schedule_gtfs_dataset_key",
        "trip_instance_key",
    ],
    how="inner",
)

rt_stop_times2.shape

In [42]:
rt_stop_times2.trip_id.nunique(), rt_stop_times2.trip_instance_key.nunique()

(3059, 3059)

In [43]:
rt_stop_times2.route_id.nunique()

116

#### HELP What to do with `scheduled_arrival_sec` that are `nan`?

In [44]:
(rt_stop_times2.scheduled_arrival_sec.isna().sum()) / len(rt_stop_times2)

0.009577580827876152

In [45]:
rt_stop_times2.scheduled_arrival_sec.isna().sum()

1581

In [46]:
df_na = rt_stop_times2.loc[rt_stop_times2.scheduled_arrival_sec.isna()]

##### Many more direction 0 rows are empty

In [47]:
df_na.direction_id.value_counts()

0.00    1242
1.00     339
Name: direction_id, dtype: int64

In [48]:
rt_stop_times2.loc[rt_stop_times2.scheduled_arrival_sec.isna()].sample(3)

Unnamed: 0,trip_id,stop_id,stop_sequence,scheduled_arrival_sec,schedule_gtfs_dataset_key,trip_instance_key,rt_arrival_sec,route_id,shape_array_key,feed_key,route_long_name,direction_id,route_type,route_primary_direction,med_headway_minutes,organization_name,name,caltrans_district,service_date,route_type_str
11502,t_5751816_b_81644_tn_0,75298,2,,0f5e1b251db53223200c5bfc365d33f2,3803b787800a24b3fb9a8f4e4bb4e271,56457,8T,69a5deee4b19d757a58a40e5dcb5632f,8fb9cf3289b649549adfc229f2fc7cbc,Cordelia School Tripper,0.0,3,Northbound,10.0,City of Fairfield,Bay Area 511 Fairfield and Suisun Transit Schedule,04 - Oakland,2024-05-22,Bus
10601,t_5745231_b_81578_tn_3,2710,13,,090b30e4249a7ec2b4c6a0923ed2f953,642a4c709c6ecd58dfb949596070940a,53026,115,f3d094808a2a265ba403dd00b8d77467,87c96d5026263d6986f2cabe6892390c,Route 1-Blue Wal-Mart / DNHS College,0.0,3,Southbound,7.39,Redwood Coast Transit Authority,Redwood Coast Schedulel,01 - Eureka,2024-05-22,Bus
10336,t_5745236_b_81578_tn_2,2778,15,,090b30e4249a7ec2b4c6a0923ed2f953,5cbac7c577dfc9b3db86040f98f63c55,31532,118,8c85592e068e141e7b3d83216c4ec1c6,87c96d5026263d6986f2cabe6892390c,Route 4 - Orange Bertsch / Howland Hill Rd,0.0,3,Westbound,7.74,Redwood Coast Transit Authority,Redwood Coast Schedulel,01 - Eureka,2024-05-22,Bus


In [49]:
len(rt_stop_times2)

165073

#### Why would they be missing? Look at one route/operator/stop/direction_id combo #1

##### Looks like none of the rows in this combination have populated `scheduled_arrival_sec`

In [50]:
rt_stop_times2.loc[
    (rt_stop_times2.stop_id == "2712")
    & (rt_stop_times2.stop_sequence == 15)
    & (rt_stop_times2.schedule_gtfs_dataset_key == "090b30e4249a7ec2b4c6a0923ed2f953")
    & (rt_stop_times2.route_long_name == "Route 1-Blue Wal-Mart / DNHS College")
    & (rt_stop_times2.direction_id == 0)
][["scheduled_arrival_sec"]]

Unnamed: 0,scheduled_arrival_sec
10388,
10410,
10421,
10434,
10504,
10556,
10562,
10577,
10597,
11022,


##### Lots of missing data for this route

In [51]:
rt_stop_times2.loc[
    (rt_stop_times2.schedule_gtfs_dataset_key == "090b30e4249a7ec2b4c6a0923ed2f953")
    & (rt_stop_times2.route_long_name == "Route 1-Blue Wal-Mart / DNHS College")
    & (rt_stop_times2.direction_id == 0)
][["stop_sequence", "stop_id", "scheduled_arrival_sec"]].sort_values(
    by=["stop_sequence"]
)

Unnamed: 0,stop_sequence,stop_id,scheduled_arrival_sec
10613,2,772534,52260.0
11032,2,772534,48660.0
10554,2,772534,37860.0
10372,2,772534,30660.0
11084,2,772534,34260.0
11348,2,772534,59460.0
10426,2,772534,41460.0
10580,2,772534,27060.0
10440,2,772534,55860.0
10507,2,772534,63060.0


####  Look at one route/operator/stop/direction_id combo #2

In [52]:
rt_stop_times2.loc[
    (rt_stop_times2.stop_id == "783967")
    & (rt_stop_times2.stop_sequence == 7)
    & (rt_stop_times2.schedule_gtfs_dataset_key == "0a3c0b21c85fb09f8db91599e14dd7f7")
    & (rt_stop_times2.route_long_name == "Highway 29, Clearlake to Deer Park")
    & (rt_stop_times2.direction_id == 0)
][["scheduled_arrival_sec"]]

Unnamed: 0,scheduled_arrival_sec
73583,
73902,
73930,


In [53]:
rt_stop_times2.loc[
    (rt_stop_times2.schedule_gtfs_dataset_key == "0a3c0b21c85fb09f8db91599e14dd7f7")
    & (rt_stop_times2.route_long_name == "Highway 29, Clearlake to Deer Park")
    & (rt_stop_times2.direction_id == 0)
][["stop_sequence", "stop_id", "scheduled_arrival_sec"]].sort_values(
    by=["stop_sequence"]
)

Unnamed: 0,stop_sequence,stop_id,scheduled_arrival_sec
73901,2,784009,
73926,2,784009,
73587,2,784009,
73897,3,783970,
73927,4,784008,60840.0
73896,4,784008,28740.0
73925,5,783969,
73903,5,783969,
73582,5,783969,
73899,6,783968,29640.0


In [54]:
# Add a copy of scheduled arrival sec
rt_stop_times2["scheduled_arrival_sec_copy"] = rt_stop_times2.scheduled_arrival_sec

#### Dropping for them now.

In [55]:
rt_stop_times3 = rt_stop_times2.loc[
    rt_stop_times2.scheduled_arrival_sec.notna()
].reset_index(drop=True)

In [56]:
len(rt_stop_times3)

163492

In [57]:
len(rt_stop_times2)

165073

### Sort

In [58]:
# Rearrange: I want the stop sequence to be 1,2,3,4.
# Do I need to include stop_id?
rt_stop_times3 = rt_stop_times3.sort_values(
    by=[
        "schedule_gtfs_dataset_key",
        "route_id",
        "shape_array_key",
        "direction_id",
        "stop_sequence",
        "rt_arrival_sec",
    ]
).reset_index(drop=True)

In [59]:
rt_stop_times3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 163492 entries, 0 to 163491
Data columns (total 21 columns):
 #   Column                      Non-Null Count   Dtype         
---  ------                      --------------   -----         
 0   trip_id                     163492 non-null  object        
 1   stop_id                     163492 non-null  object        
 2   stop_sequence               163492 non-null  int64         
 3   scheduled_arrival_sec       163492 non-null  float64       
 4   schedule_gtfs_dataset_key   163492 non-null  object        
 5   trip_instance_key           163492 non-null  object        
 6   rt_arrival_sec              163492 non-null  int64         
 7   route_id                    163492 non-null  object        
 8   shape_array_key             163492 non-null  object        
 9   feed_key                    163492 non-null  object        
 10  route_long_name             163492 non-null  object        
 11  direction_id                163492 non-

###  `rt_stop_times3`: Deal with `scheduled_arrival_time`

* If 82800  < `scheduled_arrival_time` < 86_400 but `rt_arrival_sec` is lower say 14_000 (4 am in the morning): then perhaps the bus was scheduled to arrive on May 21 (day before the service date) but it arrived a little later on the service date. 

* If  86_400 < `scheduled_arrival_time` and `rt_arrival_sec` is around 86_000 then this is the same service date. 

In [60]:
rt_stop_times3["scheduled_arrival_sec"].describe()

count   163492.00
mean     48550.93
std      17164.75
min      12660.00
25%      34260.00
50%      48000.00
75%      61440.00
max     106680.00
Name: scheduled_arrival_sec, dtype: float64

In [61]:
rt_stop_times3.loc[rt_stop_times3["scheduled_arrival_sec"] == 86_400].shape

(16, 21)

In [62]:
rt_stop_times3.loc[rt_stop_times3["scheduled_arrival_sec"] > 86_400].shape

(2583, 21)

In [63]:
timestamp_subset = [
    "converted_schd_arrival",
    "converted_rt_arrival",
    "scheduled_arrival_sec",
    "rt_arrival_sec",
    "service_date",
]

#### There are 86_400 seconds in a day.

In [64]:
86_400 - 3600

82800

In [65]:
86_400 / 2 / 3_600

12.0

In [66]:
def adjust_days_and_time(sched_arrival_seconds, date, rt_arrival_sec):
    """
    Adjusts days and time based on sched_arrival_seconds and rt_arrival_sec.

    Parameters:
    sched_arrival_seconds (int): Number of seconds.
    date (datetime): Initial date.
    rt_arrival_sec (int): Arrival time in seconds.

    Returns:
    datetime: Adjusted date and time.
    """
    # If the rt arrival second is between 12-1AM and the scheduled arrival time is between
    # is between 11pm and 1am
    if rt_arrival_sec < (60 * 60) and (82_800 < sched_arrival_seconds < 90_000):
        # Subtract a day
        return pd.Timestamp(date + pd.Timedelta(days=-1)) + pd.Timedelta(
            seconds=sched_arrival_seconds % 86400
        )
    # If rt_arrival team is before 12 PM and scheduled_arrival_seconds is
    # more than 86_400 seconds, return the date plus the seconds htat remain
    elif rt_arrival_sec < (86_400 / 2) and (86_400 < sched_arrival_seconds):
        return pd.Timestamp(date) + pd.Timedelta(seconds=sched_arrival_seconds % 86400)
    else:
        # No change
        return pd.Timestamp(date) + pd.Timedelta(seconds=sched_arrival_seconds)

#### Apply to all rows
* This takes forever, find another way to do this. 

In [67]:
rt_stop_times3["converted_schd_arrival"] = rt_stop_times3.apply(
    lambda row: adjust_days_and_time(
        row["scheduled_arrival_sec"], row["service_date"], row["rt_arrival_sec"]
    ),
    axis=1,
)

### Convert `converted_rt_arrival`
* Thankfully, none of the values in `converted_rt_arrival` go over 86_400.

In [68]:
rt_stop_times3["converted_rt_arrival"] = pd.to_datetime(
    rt_stop_times3["service_date"]
) + pd.to_timedelta(rt_stop_times3["rt_arrival_sec"] % 86400, unit="s")

In [69]:
rt_stop_times3["delay_min"] = (
    rt_stop_times3["converted_rt_arrival"] - rt_stop_times3["converted_schd_arrival"]
).dt.total_seconds() / 60

#### Don't understand why I chose 11PM instead of just 12am??

In [70]:
def add_day_to_23_hours(df):
    """
    This function takes a pandas DataFrame `df` as input, modifies it, and returns the modified DataFrame.
    The function performs two operations:
    1. It adds one day to the `converted_schd_arrival` column values if
    the corresponding `delay_min` value exceeds 23 hours (1380 minutes).
    This is done using the `apply` method with a lambda function.
    2. It recalculates the `delay_min` column by subtracting the `converted_schd_arrival`.

    Parameters:
    df (pandas.DataFrame): Input DataFrame containing the columns `converted_schd_arrival` and `delay_min`.

    Returns:
    pandas.DataFrame: Modified DataFrame with updated `converted_schd_arrival` and `delay_min` columns.

    Note: This function assumes that the input DataFrame has the required columns and that the `rt_stop_times3` DataFrame is defined in the global scope.
    """
    df["converted_schd_arrival"] = df.apply(
        lambda row: row["converted_schd_arrival"] + pd.Timedelta(days=1)
        if row["delay_min"] > (23 * 60)
        else row["converted_schd_arrival"],
        axis=1,
    )

    df["delay_min"] = (
        df["converted_rt_arrival"] - rt_stop_times3["converted_schd_arrival"]
    ).dt.total_seconds() / 60
    return df

In [71]:
rt_stop_times3 = add_day_to_23_hours(rt_stop_times3)

### HELP: Filter out values in `delay` that are very extreme.
* How to determine what is considered "extreme"

In [72]:
percentiles = [0.01, 0.02, 0.05, 0.1, 0.9, 0.95, 0.98, 0.99]

In [73]:
700 / 60

11.666666666666666

In [74]:
-1345 / 60

-22.416666666666668

In [75]:
print(rt_stop_times3.delay_min.describe(percentiles))

count   163492.00
mean         2.88
std         17.85
min      -1345.67
1%          -5.98
2%          -3.75
5%          -2.17
10%         -1.28
50%          1.63
90%          7.88
95%         11.10
98%         17.12
99%         21.73
max        727.87
Name: delay_min, dtype: float64


In [76]:
percentile_99 = rt_stop_times3["delay_min"].quantile(0.99)

In [77]:
percentile_99

21.733333333333334

In [78]:
percentile_01 = rt_stop_times3["delay_min"].quantile(0.01)

In [79]:
percentile_01

-5.983333333333333

In [80]:
percentile_01_df = rt_stop_times3.loc[rt_stop_times3.delay_min < percentile_01]

In [81]:
len(percentile_01_df)

1629

In [82]:
delay_above_99 = rt_stop_times3.loc[rt_stop_times3.delay_min > percentile_99]

In [83]:
len(delay_above_99)

1632

In [84]:
delay_above_99.delay_min.describe(percentiles)

count   1632.00
mean      75.79
std      118.05
min       21.75
1%        21.97
2%        22.09
5%        22.68
10%       23.33
50%       29.68
90%      146.72
95%      443.50
98%      454.96
99%      501.58
max      727.87
Name: delay_min, dtype: float64

In [85]:
percentile_01_df.delay_min.describe(percentiles)

count    1629.00
mean      -39.27
std        97.01
min     -1345.67
1%       -456.31
2%       -366.02
5%       -166.58
10%       -35.50
50%       -24.00
90%        -6.81
95%        -6.27
98%        -6.08
99%        -6.04
max        -6.00
Name: delay_min, dtype: float64

In [86]:
percentile_5 = percentile_01_df["delay_min"].quantile(0.05)

In [87]:
percentile_5

-166.57666666666665

In [88]:
percentile_95 = delay_above_99["delay_min"].quantile(0.95)

In [89]:
percentile_95

443.5033333333331

* If scheduled_arrival_sec is in the 86000 ballpark and rt_arrival_sec is less than 60*60*3, then subtract a day 

In [90]:
# Filter out
rt_stop_times4 = rt_stop_times3[
    (rt_stop_times3["delay_min"] >= percentile_5)
    & (rt_stop_times3["delay_min"] <= percentile_95)
].reset_index(drop=True)

In [91]:
len(rt_stop_times4) - len(rt_stop_times3)

-164

In [92]:
rt_stop_times4.delay_min.describe(percentiles)

count   163328.00
mean         2.82
std          9.61
min       -165.27
1%          -5.73
2%          -3.70
5%          -2.17
10%         -1.27
50%          1.63
90%          7.87
95%         11.05
98%         16.93
99%         21.22
max        442.93
Name: delay_min, dtype: float64

### Calculate the actual & scheduled headway the `operator-route-direction_id-stop_sequence-stop_id-` grain

I am calculating the difference of a column between a row and the one above it using this code in Python: rt_stop_times4["scheduled_arrival_lag"] = rt_stop_times4.groupby( [
    "schedule_gtfs_dataset_key",
    "route_id",
    "shape_array_key",
    "direction_id",
    "route_primary_direction",
    "stop_sequence",
    "stop_id",
])[
"converted_schd_arrival"].diff(). The column is of the datetime type. For example, the row above holds the value of 2024-05-22 16:27:00. The row below holds the value of 2024-05-22 15:42:00. The difference between the two rows should be -44 minutes. However, with my code I am getting -1 days +23:15:00. How can I fix my code to get the right results?

In [93]:
groupby_cols = [
    "schedule_gtfs_dataset_key",
    "route_id",
    "shape_array_key",
    "direction_id",
    "route_primary_direction",
    "stop_sequence",
    "stop_id",
]

In [95]:
rt_stop_times4["actual_arrival_lag"] = (rt_stop_times4.groupby(groupby_cols)["converted_rt_arrival"].diff())

In [97]:
rt_stop_times4["scheduled_arrival_lag"] = (rt_stop_times4.groupby(groupby_cols)["converted_schd_arrival"].diff())

#### Checking out that this is calculated correctly.

In [98]:
rt_stop_times4.head(10)[
    [
        "converted_rt_arrival",
        "actual_arrival_lag",
        "converted_schd_arrival",
        "scheduled_arrival_lag",
    ]
]

Unnamed: 0,converted_rt_arrival,actual_arrival_lag,converted_schd_arrival,scheduled_arrival_lag
0,2024-05-22 00:06:52,NaT,2024-05-22 00:08:00,NaT
1,2024-05-22 00:44:22,0 days 00:37:30,2024-05-22 00:41:00,0 days 00:33:00
2,2024-05-22 01:12:29,0 days 00:28:07,2024-05-22 01:11:00,0 days 00:30:00
3,2024-05-22 01:39:47,0 days 00:27:18,2024-05-22 01:41:00,0 days 00:30:00
4,2024-05-22 02:11:40,0 days 00:31:53,2024-05-22 02:11:00,0 days 00:30:00
5,2024-05-22 02:39:14,0 days 00:27:34,2024-05-22 02:41:00,0 days 00:30:00
6,2024-05-22 03:12:43,0 days 00:33:29,2024-05-22 03:11:00,0 days 00:30:00
7,2024-05-22 03:41:05,0 days 00:28:22,2024-05-22 03:42:00,0 days 00:31:00
8,2024-05-22 04:11:44,0 days 00:30:39,2024-05-22 04:12:00,0 days 00:30:00
9,2024-05-22 04:45:07,0 days 00:33:23,2024-05-22 04:49:00,0 days 00:37:00


### HELP What to do with lag values that are nan?

In [99]:
rt_stop_times4.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 163328 entries, 0 to 163327
Data columns (total 26 columns):
 #   Column                      Non-Null Count   Dtype          
---  ------                      --------------   -----          
 0   trip_id                     163328 non-null  object         
 1   stop_id                     163328 non-null  object         
 2   stop_sequence               163328 non-null  int64          
 3   scheduled_arrival_sec       163328 non-null  float64        
 4   schedule_gtfs_dataset_key   163328 non-null  object         
 5   trip_instance_key           163328 non-null  object         
 6   rt_arrival_sec              163328 non-null  int64          
 7   route_id                    163328 non-null  object         
 8   shape_array_key             163328 non-null  object         
 9   feed_key                    163328 non-null  object         
 10  route_long_name             163328 non-null  object         
 11  direction_id              

In [100]:
# rt_stop_times4 = rt_stop_times4.fillna(0)

### Transit Matters Method

In [101]:
transit_matters_df1 = rt_stop_times4.copy()

In [102]:
transit_matters_df1["pct_actual_schd_headway"] = (
    transit_matters_df1.actual_arrival_lag / transit_matters_df1.scheduled_arrival_lag
)

In [103]:
transit_matters_df1["bunched_y_n"] = np.where(
    transit_matters_df1["pct_actual_schd_headway"] < 0.25, "bunched", "not bunched"
)

In [104]:
transit_matters_df1.pct_actual_schd_headway.describe()

count   110371.00
mean         0.99
std          0.28
min         -1.82
25%          0.90
50%          1.00
75%          1.10
max          3.24
Name: pct_actual_schd_headway, dtype: float64

In [105]:
transit_matters_df1.bunched_y_n.value_counts() / len(transit_matters_df1)

not bunched   0.99
bunched       0.01
Name: bunched_y_n, dtype: float64

#### Looking at examples
* Can't recall why I chose this...

In [106]:
example1 = transit_matters_df1.loc[
    (transit_matters_df1.stop_id == "5637")
    & (
        transit_matters_df1.schedule_gtfs_dataset_key
        == "0666caf3ec1ecc96b74f4477ee4bc939"
    )
    & (transit_matters_df1.stop_sequence == 32)
    & (transit_matters_df1.route_id == "204-13172")
]

In [107]:
example1["sched_arrival_min"] = example1.scheduled_arrival_sec / 60

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  example1["sched_arrival_min"] = example1.scheduled_arrival_sec / 60


In [108]:
example1["rt_arrival_min"] = example1.rt_arrival_sec / 60

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  example1["rt_arrival_min"] = example1.rt_arrival_sec / 60


In [109]:
example1.shape

(106, 30)

In [110]:
example1[
    [
        "converted_rt_arrival",
        "actual_arrival_lag",
        "converted_schd_arrival",
        "scheduled_arrival_lag",
        "pct_actual_schd_headway",
        "bunched_y_n",
    ]
]

Unnamed: 0,converted_rt_arrival,actual_arrival_lag,converted_schd_arrival,scheduled_arrival_lag,pct_actual_schd_headway,bunched_y_n
3158,2024-05-22 00:07:06,NaT,2024-05-22 00:02:00,NaT,,not bunched
3159,2024-05-22 00:34:01,0 days 00:26:55,2024-05-22 00:32:00,0 days 00:30:00,0.9,not bunched
3160,2024-05-22 01:14:46,0 days 00:40:45,2024-05-22 01:04:00,0 days 00:32:00,1.27,not bunched
3161,2024-05-22 01:35:45,0 days 00:20:59,2024-05-22 01:34:00,0 days 00:30:00,0.7,not bunched
3162,2024-05-22 02:04:52,0 days 00:29:07,2024-05-22 02:04:00,0 days 00:30:00,0.97,not bunched
3163,2024-05-22 02:34:57,0 days 00:30:05,2024-05-22 02:34:00,0 days 00:30:00,1.0,not bunched
3164,2024-05-22 03:03:09,0 days 00:28:12,2024-05-22 03:04:00,0 days 00:30:00,0.94,not bunched
3165,2024-05-22 03:35:53,0 days 00:32:44,2024-05-22 03:34:00,0 days 00:30:00,1.09,not bunched
3166,2024-05-22 04:04:36,0 days 00:28:43,2024-05-22 04:04:00,0 days 00:30:00,0.96,not bunched
3167,2024-05-22 04:39:50,0 days 00:35:14,2024-05-22 04:34:00,0 days 00:30:00,1.17,not bunched


#### Aggregate. HELP: Take out `stop_sequence` and `shape_array_key`?
* At this point, it doesn't matter the sequence, we just care about how bunched the traffic is around one partiuclar stop. 
* See how many trips for that grain are considered "bunched" or not.

In [111]:
transit_matters_agg = [
    "caltrans_district",
    "organization_name",
    "route_long_name",
    "route_type",
    "shape_array_key",
    "route_id",
    "stop_id",
    "route_primary_direction",
    "bunched_y_n",
]

In [112]:
transit_matters_df2 = (
    transit_matters_df1.groupby(transit_matters_agg)
    .agg({"trip_instance_key": "nunique"})
    .reset_index()
).rename(columns={"trip_instance_key": "all_trips"})

In [113]:
# Filter out any rows with only one trip
transit_matters_df2 = transit_matters_df2.loc[transit_matters_df2.all_trips > 1].reset_index(drop = True)

In [114]:
# Filter out only rows that are bunched.
bunched_only = (
    transit_matters_df2.loc[transit_matters_df2.bunched_y_n == "bunched"]
    .reset_index(drop=True)
    .drop(columns=["bunched_y_n"])
    .rename(columns={"all_trips": "bunched_trips"})
)

In [115]:
len(bunched_only)

355

In [116]:
bunched_only.head(2)

Unnamed: 0,caltrans_district,organization_name,route_long_name,route_type,shape_array_key,route_id,stop_id,route_primary_direction,bunched_trips
0,04 - Oakland,Alameda-Contra Costa Transit District,Piedmont - Harrison - Park Blvd.,3,48274af748bb908c0b6f679cd21cd213,33,55250,Eastbound,2
1,04 - Oakland,Alameda-Contra Costa Transit District,Piedmont - Harrison - Park Blvd.,3,48274af748bb908c0b6f679cd21cd213,33,55350,Eastbound,2


In [117]:
transit_matters_df2 = transit_matters_df2.drop(columns=["bunched_y_n"])

In [118]:
transit_matters_agg.remove("bunched_y_n")

In [119]:
transit_matters_agg

['caltrans_district',
 'organization_name',
 'route_long_name',
 'route_type',
 'shape_array_key',
 'route_id',
 'stop_id',
 'route_primary_direction']

In [120]:
# Merge back, using left merge to keep bunching to find % of bunched trips
transit_matters_m1 = pd.merge(
    transit_matters_df2, bunched_only, on=transit_matters_agg, how="left"
)

#### Fix merge, why are the rows duplicate? 

In [121]:
transit_matters_m1.loc[
    (transit_matters_m1.stop_id == "5637")
    & (
        transit_matters_m1.organization_name
        == "Los Angeles County Metropolitan Transportation Authority"
    )
    & (transit_matters_m1.route_id == "204-13172")
]

Unnamed: 0,caltrans_district,organization_name,route_long_name,route_type,shape_array_key,route_id,stop_id,route_primary_direction,all_trips,bunched_trips
6235,07 - Los Angeles,Los Angeles County Metropolitan Transportation Authority,Metro Local Line,3,6a10ede3fa469c8b4d9bf761946ed20a,204-13172,5637,Northbound,8,8.0
6236,07 - Los Angeles,Los Angeles County Metropolitan Transportation Authority,Metro Local Line,3,6a10ede3fa469c8b4d9bf761946ed20a,204-13172,5637,Northbound,98,8.0


In [122]:
transit_matters_m2 = (
    transit_matters_m1.sort_values(by=["all_trips"], ascending=False)
    .drop_duplicates(subset=transit_matters_agg)
    .reset_index(drop=True)
)

In [123]:
transit_matters_m2["pct_trips_bunched"] = (
    transit_matters_m2.bunched_trips / transit_matters_m2.all_trips * 100
)

In [124]:
transit_matters_m2.pct_trips_bunched = transit_matters_m2.pct_trips_bunched.fillna(0)

In [125]:
transit_matters_m2.loc[
    (transit_matters_m2.stop_id == "5637")
    & (
        transit_matters_m2.organization_name
        == "Los Angeles County Metropolitan Transportation Authority"
    )
    & (transit_matters_m2.route_id == "204-13172")
]

Unnamed: 0,caltrans_district,organization_name,route_long_name,route_type,shape_array_key,route_id,stop_id,route_primary_direction,all_trips,bunched_trips,pct_trips_bunched
72,07 - Los Angeles,Los Angeles County Metropolitan Transportation Authority,Metro Local Line,3,6a10ede3fa469c8b4d9bf761946ed20a,204-13172,5637,Northbound,98,8.0,8.16


In [126]:
transit_matters_m2.pct_trips_bunched.describe(percentiles)

count   7969.00
mean       0.32
std        1.81
min        0.00
1%         0.00
2%         0.00
5%         0.00
10%        0.00
50%        0.00
90%        0.00
95%        0.00
98%        6.10
99%        9.43
max       27.27
Name: pct_trips_bunched, dtype: float64

In [127]:
len(transit_matters_m2)

7969

In [128]:
transit_matters_m2.route_id.nunique()

100

In [129]:
rt_stop_times4.route_id.nunique()

113

#### City of Visalia 	 has a lot of bunched trips.

In [130]:
transit_matters_m2.sort_values(by=["pct_trips_bunched"], ascending=False).head(30)

Unnamed: 0,caltrans_district,organization_name,route_long_name,route_type,shape_array_key,route_id,stop_id,route_primary_direction,all_trips,bunched_trips,pct_trips_bunched
726,04 - Oakland,Emeryville Transportation Management Agency,Hollis,3,0628e405f93c2d0b5e3e68a7115857d4,Hollis,855335,Northbound,44,12.0,27.27
4657,06 - Fresno,City of Visalia,Route 9,3,60da59c7000ea5dcb5f845d8fa227f14,2042,2307469,Westbound,13,3.0,23.08
3929,06 - Fresno,City of Visalia,Route 9,3,60da59c7000ea5dcb5f845d8fa227f14,2042,2307691,Westbound,15,3.0,20.0
3527,06 - Fresno,City of Visalia,Route 9,3,60da59c7000ea5dcb5f845d8fa227f14,2042,2307719,Westbound,15,3.0,20.0
3552,06 - Fresno,City of Visalia,Route 9,3,60da59c7000ea5dcb5f845d8fa227f14,2042,2307716,Westbound,15,3.0,20.0
3551,06 - Fresno,City of Visalia,Route 9,3,60da59c7000ea5dcb5f845d8fa227f14,2042,2307700,Westbound,15,3.0,20.0
3550,06 - Fresno,City of Visalia,Route 9,3,60da59c7000ea5dcb5f845d8fa227f14,2042,2307699,Westbound,15,3.0,20.0
3549,06 - Fresno,City of Visalia,Route 9,3,60da59c7000ea5dcb5f845d8fa227f14,2042,2307698,Westbound,15,3.0,20.0
3872,06 - Fresno,City of Visalia,Route 9,3,60da59c7000ea5dcb5f845d8fa227f14,2042,2307466,Westbound,15,3.0,20.0
3473,06 - Fresno,City of Visalia,Route 9,3,60da59c7000ea5dcb5f845d8fa227f14,2042,3869710,Westbound,15,3.0,20.0


#### HELP What to do if order is switched? 
* Row 91379:  the -1 day + 23:15 is confusing to me

In [131]:
preview_cols = [
    "converted_rt_arrival",
    "actual_arrival_lag",
    "converted_schd_arrival",
    "scheduled_arrival_lag",
    "pct_actual_schd_headway",
    "bunched_y_n",
]

In [132]:
example2 = transit_matters_df1.loc[
    (transit_matters_df1.stop_id == "2307719")
    & (transit_matters_df1.organization_name == "City of Visalia")
    & (transit_matters_df1.route_id == "2042")
    & (transit_matters_df1.shape_array_key == "60da59c7000ea5dcb5f845d8fa227f14")
]

In [133]:
example2[preview_cols]

Unnamed: 0,converted_rt_arrival,actual_arrival_lag,converted_schd_arrival,scheduled_arrival_lag,pct_actual_schd_headway,bunched_y_n
131018,2024-05-22 06:32:47,NaT,2024-05-22 06:42:00,NaT,,not bunched
131019,2024-05-22 07:26:41,0 days 00:53:54,2024-05-22 07:27:00,0 days 00:45:00,1.2,not bunched
131020,2024-05-22 08:06:11,0 days 00:39:30,2024-05-22 08:12:00,0 days 00:45:00,0.88,not bunched
131021,2024-05-22 08:56:57,0 days 00:50:46,2024-05-22 08:57:00,0 days 00:45:00,1.13,not bunched
131022,2024-05-22 09:37:59,0 days 00:41:02,2024-05-22 09:42:00,0 days 00:45:00,0.91,not bunched
131023,2024-05-22 10:27:26,0 days 00:49:27,2024-05-22 10:27:00,0 days 00:45:00,1.1,not bunched
131024,2024-05-22 11:10:05,0 days 00:42:39,2024-05-22 11:12:00,0 days 00:45:00,0.95,not bunched
131025,2024-05-22 12:01:01,0 days 00:50:56,2024-05-22 11:57:00,0 days 00:45:00,1.13,not bunched
131026,2024-05-22 12:38:08,0 days 00:37:07,2024-05-22 12:42:00,0 days 00:45:00,0.82,not bunched
131027,2024-05-22 13:27:10,0 days 00:49:02,2024-05-22 13:27:00,0 days 00:45:00,1.09,not bunched


In [134]:
example3 = transit_matters_df1.loc[
    (transit_matters_df1.stop_id == "2307698")
    & (transit_matters_df1.organization_name == "City of Visalia")
    & (transit_matters_df1.route_id == "2042")
    & (transit_matters_df1.shape_array_key == "60da59c7000ea5dcb5f845d8fa227f14")
]

In [135]:
example3[preview_cols]

Unnamed: 0,converted_rt_arrival,actual_arrival_lag,converted_schd_arrival,scheduled_arrival_lag,pct_actual_schd_headway,bunched_y_n
131288,2024-05-22 06:59:51,NaT,2024-05-22 06:59:00,NaT,,not bunched
131289,2024-05-22 07:57:19,0 days 00:57:28,2024-05-22 07:44:00,0 days 00:45:00,1.28,not bunched
131290,2024-05-22 08:36:03,0 days 00:38:44,2024-05-22 08:29:00,0 days 00:45:00,0.86,not bunched
131291,2024-05-22 09:20:37,0 days 00:44:34,2024-05-22 09:14:00,0 days 00:45:00,0.99,not bunched
131292,2024-05-22 10:01:36,0 days 00:40:59,2024-05-22 09:59:00,0 days 00:45:00,0.91,not bunched
131293,2024-05-22 10:50:30,0 days 00:48:54,2024-05-22 10:44:00,0 days 00:45:00,1.09,not bunched
131294,2024-05-22 11:30:12,0 days 00:39:42,2024-05-22 11:29:00,0 days 00:45:00,0.88,not bunched
131295,2024-05-22 12:22:42,0 days 00:52:30,2024-05-22 12:14:00,0 days 00:45:00,1.17,not bunched
131296,2024-05-22 12:59:07,0 days 00:36:25,2024-05-22 12:59:00,0 days 00:45:00,0.81,not bunched
131297,2024-05-22 13:46:21,0 days 00:47:14,2024-05-22 13:44:00,0 days 00:45:00,1.05,not bunched


### Use 2 minute benchmark
* [Source](https://static1.squarespace.com/static/533b9a24e4b01d79d0ae4376/t/645e82de1f570b31497c44dc/1683915486889/TransitMatters-Headwaymanagement.pdf)
* Justifying the use of
headway maintenance. For example, in April
2022 the 66 bus significantly bunched around
several stops. When bunching is defined as
buses that run within two minutes or less of
each other, inbound buses towards Nubian
Square bunched 10% of the time at Brigham
Circle, 9% at Brookline Village and Roxbury
Crossing, and 8% of the time at Coolidge
Corner. Bunching is even more dramatic
outbound towards Harvard Square where
buses bunched over 35% of the time at Winship
St, 13% at Coolidge Corner and Harvard Ave at
Commonwealth Ave, and 12% at North Harvard
St at Western Ave. View more data about bus
bunching through the TransitMatters Data
Dashboard here.

* To Do: add back in route  & operator information

In [136]:
two_minutes_df = rt_stop_times4.copy()

In [137]:
two_minutes_df.head(3)

Unnamed: 0,trip_id,stop_id,stop_sequence,scheduled_arrival_sec,schedule_gtfs_dataset_key,trip_instance_key,rt_arrival_sec,route_id,shape_array_key,feed_key,route_long_name,direction_id,route_type,route_primary_direction,med_headway_minutes,organization_name,name,caltrans_district,service_date,route_type_str,scheduled_arrival_sec_copy,converted_schd_arrival,converted_rt_arrival,delay_min,actual_arrival_lag,scheduled_arrival_lag
0,10204001252406-DEC23,7093,2,86880.0,0666caf3ec1ecc96b74f4477ee4bc939,2d6ea456f6d155e566e41f01c1b46370,412,204-13172,6a10ede3fa469c8b4d9bf761946ed20a,608992664173210532aa3e6cc573be2f,Metro Local Line,0.0,3,Northbound,14.85,Los Angeles County Metropolitan Transportation Authority,LA Metro Bus Schedule,07 - Los Angeles,2024-05-22,Bus,86880.0,2024-05-22 00:08:00,2024-05-22 00:06:52,-1.13,NaT,NaT
1,10204001252439-DEC23,7093,2,88860.0,0666caf3ec1ecc96b74f4477ee4bc939,aed84185568efda59a9ce7342d919202,2662,204-13172,6a10ede3fa469c8b4d9bf761946ed20a,608992664173210532aa3e6cc573be2f,Metro Local Line,0.0,3,Northbound,14.85,Los Angeles County Metropolitan Transportation Authority,LA Metro Bus Schedule,07 - Los Angeles,2024-05-22,Bus,88860.0,2024-05-22 00:41:00,2024-05-22 00:44:22,3.37,0 days 00:37:30,0 days 00:33:00
2,10204001252509-DEC23,7093,2,90660.0,0666caf3ec1ecc96b74f4477ee4bc939,79aa575337434ff1eeb332de268e44c2,4349,204-13172,6a10ede3fa469c8b4d9bf761946ed20a,608992664173210532aa3e6cc573be2f,Metro Local Line,0.0,3,Northbound,14.85,Los Angeles County Metropolitan Transportation Authority,LA Metro Bus Schedule,07 - Los Angeles,2024-05-22,Bus,90660.0,2024-05-22 01:11:00,2024-05-22 01:12:29,1.48,0 days 00:28:07,0 days 00:30:00


In [139]:
two_minutes_df["rt_mins"] = (
    two_minutes_df["actual_arrival_lag"].dt.total_seconds()
) / 60

In [140]:
two_minutes_df["bunched_y_n"] = np.where(
    two_minutes_df["rt_mins"] <= 2, "bunched", "not bunched"
)

In [141]:
two_minutes_df.bunched_y_n.value_counts()

not bunched    115653
bunched         47675
Name: bunched_y_n, dtype: int64

#### Same code as Transit Matters Approach

In [142]:
transit_matters_agg.append("bunched_y_n")

In [143]:
transit_matters_agg

['caltrans_district',
 'organization_name',
 'route_long_name',
 'route_type',
 'shape_array_key',
 'route_id',
 'stop_id',
 'route_primary_direction',
 'bunched_y_n']

In [144]:
two_minutes_agg1 = (
    two_minutes_df.groupby(transit_matters_agg)
    .agg({"trip_instance_key": "nunique"})
    .reset_index()
).rename(columns={"trip_instance_key": "all_trips"})

In [145]:
# Filter out for any rows with only one trip
two_minutes_agg1 = two_minutes_agg1.loc[two_minutes_agg1.all_trips > 1].reset_index(drop = True)

In [146]:
bunched_only_two_min = (
    two_minutes_agg1.loc[two_minutes_agg1.bunched_y_n == "bunched"]
    .reset_index(drop=True)
    .rename(columns={"all_trips": "bunched_trips"})
    .drop(columns=["bunched_y_n"])
)

In [147]:
bunched_only_two_min.head(2)

Unnamed: 0,caltrans_district,organization_name,route_long_name,route_type,shape_array_key,route_id,stop_id,route_primary_direction,bunched_trips
0,01 - Eureka,City of Eureka,AMRTS Gold Route,3,c47c15ffc43da6e556ff913272778e4d,14,1262,Northbound,11
1,01 - Eureka,City of Eureka,AMRTS Gold Route,3,c47c15ffc43da6e556ff913272778e4d,14,1264,Northbound,11


In [148]:
len(bunched_only_two_min)

3283

In [149]:
transit_matters_agg.remove("bunched_y_n")

In [150]:
two_minutes_agg1 = two_minutes_agg1.drop(columns=["bunched_y_n"])

In [151]:
len(two_minutes_agg1)

8124

In [152]:
# Need to do a left merge on all trips for the stops that don't have bunching.
final_two_minute = pd.merge(
    
    bunched_only_two_min,
    two_minutes_agg1,
    on=transit_matters_agg,
    how="left",
)

In [153]:
final_two_minute = final_two_minute.fillna(0)

In [154]:
final_two_minute["pct_trips_bunched"] = (
    final_two_minute.bunched_trips / final_two_minute.all_trips * 100
)

In [155]:
final_two_minute.route_id.nunique()

36

In [156]:
final_two_minute2 = (
    final_two_minute.sort_values(by=["all_trips"], ascending=False)
    .drop_duplicates(subset=transit_matters_agg)
    .reset_index(drop=True)
)

### Comparing both outcomes
* There are so many more bunched trips for the 2 minute approach.

In [157]:
final_two_minute2.pct_trips_bunched.describe(percentiles)

count   3283.00
mean      95.52
std       20.14
min        1.79
1%         3.57
2%         4.17
5%       100.00
10%      100.00
50%      100.00
90%      100.00
95%      100.00
98%      100.00
99%      100.00
max      100.00
Name: pct_trips_bunched, dtype: float64

In [158]:
transit_matters_m2.pct_trips_bunched.describe(percentiles)

count   7969.00
mean       0.32
std        1.81
min        0.00
1%         0.00
2%         0.00
5%         0.00
10%        0.00
50%        0.00
90%        0.00
95%        0.00
98%        6.10
99%        9.43
max       27.27
Name: pct_trips_bunched, dtype: float64

In [159]:
def compare_approaches(stop_id:str, organization_name:str, route_id:str):
    transit_matter = transit_matters_m2.loc[
    (transit_matters_m2.stop_id ==stop_id)
    & (
        transit_matters_m2.organization_name
        == organization_name
    )
    & (transit_matters_m2.route_id == route_id)]
    display(transit_matter)
    
    two_min = final_two_minute2.loc[
    (final_two_minute2.stop_id == stop_id)
    & (
        final_two_minute2.organization_name
        == organization_name
    )
    & (final_two_minute2.route_id == route_id)]
    
    display(two_min)

In [160]:
compare_approaches(stop_id = "5637",
                  organization_name = "Los Angeles County Metropolitan Transportation Authority",
                  route_id = "204-13172")

Unnamed: 0,caltrans_district,organization_name,route_long_name,route_type,shape_array_key,route_id,stop_id,route_primary_direction,all_trips,bunched_trips,pct_trips_bunched
72,07 - Los Angeles,Los Angeles County Metropolitan Transportation Authority,Metro Local Line,3,6a10ede3fa469c8b4d9bf761946ed20a,204-13172,5637,Northbound,98,8.0,8.16


Unnamed: 0,caltrans_district,organization_name,route_long_name,route_type,shape_array_key,route_id,stop_id,route_primary_direction,bunched_trips,all_trips,pct_trips_bunched
10,07 - Los Angeles,Los Angeles County Metropolitan Transportation Authority,Metro Local Line,3,6a10ede3fa469c8b4d9bf761946ed20a,204-13172,5637,Northbound,3,103,2.91


In [161]:
final_two_minute.sample(2)

Unnamed: 0,caltrans_district,organization_name,route_long_name,route_type,shape_array_key,route_id,stop_id,route_primary_direction,bunched_trips,all_trips,pct_trips_bunched
3162,07 - Los Angeles,Los Angeles County Metropolitan Transportation Authority,Metro Local Line,3,6a10ede3fa469c8b4d9bf761946ed20a,204-13172,5637,Northbound,3,3,100.0
1365,07 - Los Angeles,City of Ojai,Route 10,3,1f7f53cf0a44e810faf167f7a78466e7,4778,3737134,Westbound,7,7,100.0


#### How is this SO different???

In [162]:
compare_approaches(stop_id = "3288011",
                  organization_name = "City of Thousand Oaks",
                  route_id = "3402")

Unnamed: 0,caltrans_district,organization_name,route_long_name,route_type,shape_array_key,route_id,stop_id,route_primary_direction,all_trips,bunched_trips,pct_trips_bunched
1762,07 - Los Angeles,City of Thousand Oaks,Route 11,3,e15098a36704dc44bb9c92ece10436a4,3402,3288011,Eastbound,26,,0.0


Unnamed: 0,caltrans_district,organization_name,route_long_name,route_type,shape_array_key,route_id,stop_id,route_primary_direction,bunched_trips,all_trips,pct_trips_bunched
296,07 - Los Angeles,City of Thousand Oaks,Route 11,3,e15098a36704dc44bb9c92ece10436a4,3402,3288011,Eastbound,26,26,100.0


In [163]:
transit_matters_df1.loc[
    (transit_matters_df1.stop_id == "3288011")
    & (transit_matters_df1.organization_name == "City of Thousand Oaks")
    & (transit_matters_df1.route_id == "3402")
][preview_cols]

Unnamed: 0,converted_rt_arrival,actual_arrival_lag,converted_schd_arrival,scheduled_arrival_lag,pct_actual_schd_headway,bunched_y_n
88718,2024-05-22 07:41:59,0 days,2024-05-22 07:38:00,0 days,,not bunched
88725,2024-05-22 09:02:09,0 days,2024-05-22 08:59:00,0 days,,not bunched
88732,2024-05-22 09:39:57,0 days,2024-05-22 09:40:00,0 days,,not bunched
88739,2024-05-22 10:24:56,0 days,2024-05-22 10:20:00,0 days,,not bunched
88746,2024-05-22 10:50:33,0 days,2024-05-22 10:50:00,0 days,,not bunched
88753,2024-05-22 11:16:03,0 days,2024-05-22 11:16:00,0 days,,not bunched
88760,2024-05-22 11:49:57,0 days,2024-05-22 11:46:00,0 days,,not bunched
88767,2024-05-22 12:14:46,0 days,2024-05-22 12:17:00,0 days,,not bunched
88774,2024-05-22 12:56:54,0 days,2024-05-22 12:52:00,0 days,,not bunched
88781,2024-05-22 13:19:58,0 days,2024-05-22 13:17:00,0 days,,not bunched


In [165]:
two_minutes_df.loc[
    (two_minutes_df.stop_id == "3288011")
    & (two_minutes_df.organization_name == "City of Thousand Oaks")
    & (two_minutes_df.route_id == "3402")
]

Unnamed: 0,trip_id,stop_id,stop_sequence,scheduled_arrival_sec,schedule_gtfs_dataset_key,trip_instance_key,rt_arrival_sec,route_id,shape_array_key,feed_key,route_long_name,direction_id,route_type,route_primary_direction,med_headway_minutes,organization_name,name,caltrans_district,service_date,route_type_str,scheduled_arrival_sec_copy,converted_schd_arrival,converted_rt_arrival,delay_min,actual_arrival_lag,scheduled_arrival_lag,rt_mins,bunched_y_n
88718,139-648,3288011,13,27480.0,1770249a5a2e770ca90628434d4934b1,99187af2c19cae6b8c3e04d9e9f11970,27719,3402,e15098a36704dc44bb9c92ece10436a4,926867fdee73d5fbfe4f011871bcd830,Route 11,0.0,3,Eastbound,10.91,City of Thousand Oaks,VCTC GMV Schedule,07 - Los Angeles,2024-05-22,Bus,27480.0,2024-05-22 07:38:00,2024-05-22 07:41:59,3.98,0 days,0 days,0.0,bunched
88725,139-91,3288011,13,32340.0,1770249a5a2e770ca90628434d4934b1,6b1b409f790260f87a418e395ff10c12,32529,3402,e15098a36704dc44bb9c92ece10436a4,926867fdee73d5fbfe4f011871bcd830,Route 11,0.0,3,Eastbound,10.91,City of Thousand Oaks,VCTC GMV Schedule,07 - Los Angeles,2024-05-22,Bus,32340.0,2024-05-22 08:59:00,2024-05-22 09:02:09,3.15,0 days,0 days,0.0,bunched
88732,139-266,3288011,13,34800.0,1770249a5a2e770ca90628434d4934b1,e18814c62c1822f7a85dbd0f06d2e030,34797,3402,e15098a36704dc44bb9c92ece10436a4,926867fdee73d5fbfe4f011871bcd830,Route 11,0.0,3,Eastbound,10.91,City of Thousand Oaks,VCTC GMV Schedule,07 - Los Angeles,2024-05-22,Bus,34800.0,2024-05-22 09:40:00,2024-05-22 09:39:57,-0.05,0 days,0 days,0.0,bunched
88739,139-1274,3288011,13,37200.0,1770249a5a2e770ca90628434d4934b1,71074c1eb999b0c6ce92acaff6dd7cba,37496,3402,e15098a36704dc44bb9c92ece10436a4,926867fdee73d5fbfe4f011871bcd830,Route 11,0.0,3,Eastbound,10.91,City of Thousand Oaks,VCTC GMV Schedule,07 - Los Angeles,2024-05-22,Bus,37200.0,2024-05-22 10:20:00,2024-05-22 10:24:56,4.93,0 days,0 days,0.0,bunched
88746,139-331,3288011,13,39000.0,1770249a5a2e770ca90628434d4934b1,262d176132a0a7a88e3c3ccbf78e44f2,39033,3402,e15098a36704dc44bb9c92ece10436a4,926867fdee73d5fbfe4f011871bcd830,Route 11,0.0,3,Eastbound,10.91,City of Thousand Oaks,VCTC GMV Schedule,07 - Los Angeles,2024-05-22,Bus,39000.0,2024-05-22 10:50:00,2024-05-22 10:50:33,0.55,0 days,0 days,0.0,bunched
88753,139-793,3288011,13,40560.0,1770249a5a2e770ca90628434d4934b1,1eeb6073ddc8d233b75efc2335eae489,40563,3402,e15098a36704dc44bb9c92ece10436a4,926867fdee73d5fbfe4f011871bcd830,Route 11,0.0,3,Eastbound,10.91,City of Thousand Oaks,VCTC GMV Schedule,07 - Los Angeles,2024-05-22,Bus,40560.0,2024-05-22 11:16:00,2024-05-22 11:16:03,0.05,0 days,0 days,0.0,bunched
88760,139-940,3288011,13,42360.0,1770249a5a2e770ca90628434d4934b1,eef8d16110dbd54ed272d1e702923a0c,42597,3402,e15098a36704dc44bb9c92ece10436a4,926867fdee73d5fbfe4f011871bcd830,Route 11,0.0,3,Eastbound,10.91,City of Thousand Oaks,VCTC GMV Schedule,07 - Los Angeles,2024-05-22,Bus,42360.0,2024-05-22 11:46:00,2024-05-22 11:49:57,3.95,0 days,0 days,0.0,bunched
88767,139-120,3288011,13,44220.0,1770249a5a2e770ca90628434d4934b1,17ec8a730578a011706eb4340b1714b2,44086,3402,e15098a36704dc44bb9c92ece10436a4,926867fdee73d5fbfe4f011871bcd830,Route 11,0.0,3,Eastbound,10.91,City of Thousand Oaks,VCTC GMV Schedule,07 - Los Angeles,2024-05-22,Bus,44220.0,2024-05-22 12:17:00,2024-05-22 12:14:46,-2.23,0 days,0 days,0.0,bunched
88774,139-182,3288011,13,46320.0,1770249a5a2e770ca90628434d4934b1,526cd3e83a2ce6fd0cd0ba00ca114a99,46614,3402,e15098a36704dc44bb9c92ece10436a4,926867fdee73d5fbfe4f011871bcd830,Route 11,0.0,3,Eastbound,10.91,City of Thousand Oaks,VCTC GMV Schedule,07 - Los Angeles,2024-05-22,Bus,46320.0,2024-05-22 12:52:00,2024-05-22 12:56:54,4.9,0 days,0 days,0.0,bunched
88781,139-1192,3288011,13,47820.0,1770249a5a2e770ca90628434d4934b1,4221cf0c15c464da48048e9b50d2cca8,47998,3402,e15098a36704dc44bb9c92ece10436a4,926867fdee73d5fbfe4f011871bcd830,Route 11,0.0,3,Eastbound,10.91,City of Thousand Oaks,VCTC GMV Schedule,07 - Los Angeles,2024-05-22,Bus,47820.0,2024-05-22 13:17:00,2024-05-22 13:19:58,2.97,0 days,0 days,0.0,bunched
