## Transit Bunching 
* `cd data-analyses/rt_segment_speeds && pip install -r requirements.txt && cd ../_shared_utils && make setup_env && cd ../gtfs_digest`
* [Issue](https://github.com/cal-itp/data-analyses/issues/1099)
### 11/8
* Figure out how to address City of Visalia: one of the buses that is scheduled to arrive earlier arrives later than another bus. 
* This leads to a negative time stamp and makes it appear like there is a lot of bunching per the Transit Matters approach.



In [1]:
import datetime as dt

import altair as alt
import geopandas as gpd
import merge_data
import numpy as np
import pandas as pd
from segment_speed_utils import gtfs_schedule_wrangling, helpers, time_series_utils
from shared_utils import catalog_utils, rt_dates, rt_utils
from update_vars import GTFS_DATA_DICT, RT_SCHED_GCS, SCHED_GCS

# https://github.com/cal-itp/data-analyses/blob/main/_shared_utils/shared_utils/gtfs_analytics_data.yml
GTFS_DATA_DICT = catalog_utils.get_catalog("gtfs_analytics_data")

from segment_speed_utils.project_vars import (
    COMPILED_CACHED_VIEWS,
    GTFS_DATA_DICT,
    PROJECT_CRS,
    RT_SCHED_GCS,
    SCHED_GCS,
    SEGMENT_GCS,
)

In [2]:
pd.options.display.max_columns = 100
pd.options.display.float_format = "{:.2f}".format
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [3]:
may_date = "2024-05-22"

In [4]:
drop_for_preview = [
    "schedule_gtfs_dataset_key",
    "trip_instance_key",
    "shape_array_key",
    "feed_key",
    "trip_id",
]

### Grab Routes

In [5]:
subset = [
    "schedule_gtfs_dataset_key",
    "route_id",
    "direction_id",
    "route_primary_direction",
    "service_date",
    "frequency",
]

In [6]:
route_dir_columns = [
    "schedule_gtfs_dataset_key",
    "route_id",
    "direction_id",
    "time_period",
    "route_primary_direction",
    "frequency",
    "service_date",
]

In [7]:
route_dir = merge_data.concatenate_schedule_by_route_direction([may_date])[
    route_dir_columns
]

In [8]:
route_dir.head()

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id,time_period,route_primary_direction,frequency,service_date
0,015d67d5b75b5cf2b710bbadadfb75f5,17,0.0,all_day,Northbound,0.92,2024-05-22
1,015d67d5b75b5cf2b710bbadadfb75f5,17,0.0,offpeak,Northbound,0.62,2024-05-22
2,015d67d5b75b5cf2b710bbadadfb75f5,17,0.0,peak,Northbound,1.5,2024-05-22
3,015d67d5b75b5cf2b710bbadadfb75f5,17,1.0,all_day,Southbound,0.92,2024-05-22
4,015d67d5b75b5cf2b710bbadadfb75f5,17,1.0,offpeak,Southbound,0.69,2024-05-22


In [9]:
# Filter for only all_day
route_dir = route_dir.loc[route_dir.time_period == "peak"].reset_index(drop=True)

In [10]:
len(route_dir)

3238

#### Attach operators and districts

In [11]:
# Grab Crosswalk
CROSSWALK = GTFS_DATA_DICT.schedule_tables.gtfs_key_crosswalk

In [12]:
crosswalk_cols = [
    "schedule_gtfs_dataset_key",
    "organization_name",
    "name",
    "caltrans_district",
]

In [13]:
crosswalk_df = (
    time_series_utils.concatenate_datasets_across_dates(
        SCHED_GCS, CROSSWALK, [may_date], data_type="df", columns=crosswalk_cols
    )
    .sort_values(["service_date"])
    .reset_index(drop=True)
)

In [14]:
crosswalk_df.head(2)

Unnamed: 0,schedule_gtfs_dataset_key,organization_name,name,caltrans_district,service_date
0,1770249a5a2e770ca90628434d4934b1,Ventura County Transportation Commission,VCTC GMV Schedule,07 - Los Angeles,2024-05-22
1,f8102a9c0693206bf36d302540bf1bcf,City of Corona,Corona Schedule,08 - San Bernardino,2024-05-22


In [15]:
crosswalk_df.shape

(189, 5)

In [16]:
routes = pd.merge(
    route_dir,
    crosswalk_df,
    on=["schedule_gtfs_dataset_key", "service_date"],
    how="left",
)

In [17]:
# routes = pd.concat([thousand_oaks, visalia, metro, metro_33])

In [18]:
len(routes)

4695

### Add Trips

In [19]:
TABLE = GTFS_DATA_DICT.schedule_downloads.trips

In [20]:
FILE = f"{COMPILED_CACHED_VIEWS}{TABLE}_{may_date}.parquet"

In [21]:
trips_subset = [
    "gtfs_dataset_key",
    "route_id",
    "trip_instance_key",
    "shape_array_key",
    "feed_key",
    "route_long_name",
    "direction_id",
    "route_type",
]

In [22]:
trips = pd.read_parquet(FILE)[trips_subset].rename(
    columns={"gtfs_dataset_key": "schedule_gtfs_dataset_key"}
)

In [23]:
trips_routes = pd.merge(
    trips,
    routes,
    on=["schedule_gtfs_dataset_key", "route_id", "direction_id"],
    how="inner",
)

In [24]:
trips_routes.route_id.nunique()

1303

#### Help - I know we can get this from the warehouse but it seems cumbersome. Correct me if I'm wrong.

In [25]:
# https://gtfs.org/documentation/schedule/reference/#
route_type_crosswalk = {
    "route_type": ["0", "1", "2", "3", "4", "5", "6", "7", "11", "12"],
    "route_type_str": [
        "Tram, Streetcar, Light rail",
        "Subway, Metro",
        "Rail",
        "Bus",
        "Ferry.",
        "Cable tram.",
        "Aerial lift, suspended cable car (e.g., gondola lift, aerial tramway).",
        "Funicular.",
        "Trolleybus.",
        "Monorail.",
    ],
}

In [26]:
route_type_crosswalk_df = pd.DataFrame(route_type_crosswalk)

In [27]:
# Merge for route_type
trips_routes = pd.merge(
    trips_routes, route_type_crosswalk_df, on=["route_type"], how="left"
)

In [28]:
trips_routes = trips_routes.drop(columns=["route_type"])

In [29]:
trips_routes = trips_routes.rename(columns={"route_type_str": "route_type"})

In [30]:
trips_routes.head(1)

Unnamed: 0,schedule_gtfs_dataset_key,route_id,trip_instance_key,shape_array_key,feed_key,route_long_name,direction_id,time_period,route_primary_direction,frequency,service_date,organization_name,name,caltrans_district,route_type
0,1770249a5a2e770ca90628434d4934b1,3408,c256553e28c4bba693e3136240b35419,8f644f847e987de68e0cb6fcd339cf41,926867fdee73d5fbfe4f011871bcd830,Route 21,0.0,peak,Westbound,1.88,2024-05-22,Ventura County Transportation Commission,VCTC GMV Schedule,07 - Los Angeles,Bus


### Get Stop Times 

In [31]:
rt_stop_times = pd.read_parquet(
    "gs://calitp-analytics-data/data-analyses/rt_vs_schedule/schedule_rt_stop_times_2024-05-22.parquet"
)

In [32]:
rt_stop_times.head(2)

Unnamed: 0,trip_id,stop_id,stop_sequence,scheduled_arrival_sec,schedule_gtfs_dataset_key,trip_instance_key,rt_arrival_sec
0,1d105244-776c-4b3f-af78-9c7ad78c2103,0b2443b6-b50f-452b-a749-464588ca93b8,8,60991.0,1fd2f07342d966919b15d5d37fda8cc8,45ae17540ca9fb5030c84dbb12e48e9a,61434
1,1d105244-776c-4b3f-af78-9c7ad78c2103,cd5650b0-9a18-4e78-aedc-385f3094fa0f,9,61179.0,1fd2f07342d966919b15d5d37fda8cc8,45ae17540ca9fb5030c84dbb12e48e9a,61616


In [33]:
rt_stop_times.shape

(2601262, 7)

In [34]:
len(rt_stop_times.drop_duplicates())

2601262

In [35]:
trips_routes_times = pd.merge(
    rt_stop_times,
    trips_routes,
    on=[
        "schedule_gtfs_dataset_key",
        "trip_instance_key",
    ],
    how="inner",
)

In [36]:
(trips_routes_times.scheduled_arrival_sec.isna().sum())

15029

#### Lots of duplicates??

In [37]:
len(trips_routes_times)

3672925

In [38]:
len(trips_routes_times.drop_duplicates())

3061772

In [39]:
trips_routes_times2 = trips_routes_times.drop_duplicates().reset_index(drop=True)

### Question: How Granular?
* San Diego example.

In [42]:
subset = [
    "service_date",
    "caltrans_district",
    "schedule_gtfs_dataset_key",
    "feed_key",
    "organization_name",
    "route_long_name",
    "route_type",
    "route_id",
    "direction_id",
    "stop_id",
    "stop_sequence",
    "trip_instance_key",
    "rt_arrival_sec",
    "scheduled_arrival_sec",
    
]

In [43]:
trips_routes_times3 = trips_routes_times2[subset]

In [44]:
trips_routes_times4 = trips_routes_times3.sort_values(
    by=[
        "schedule_gtfs_dataset_key",
        "route_id",
        "direction_id",
        "stop_id",
        "stop_sequence",
        "rt_arrival_sec",
    ],
    ascending=[True, True, True, True, True, True],
).reset_index(drop=True)

#### STC: Switches `shape_array_key` often.

In [46]:
stc_test = trips_routes_times4.loc[
    (
        trips_routes_times4.schedule_gtfs_dataset_key
        == "fb467982dcc77a7f9199bebe709bb700"
    )
    & (trips_routes_times4.route_id == "BlueN")
    & (trips_routes_times4.direction_id == 0)
]

In [51]:
# stc_test

In [48]:
stc_test2 = trips_routes_times4.loc[
    (
        trips_routes_times4.schedule_gtfs_dataset_key
        == "fb467982dcc77a7f9199bebe709bb700"
    )
    & (trips_routes_times4.route_id == "BlueN")
]

In [50]:
# stc_test2

####  San Diego Route
* Why is the `stop_id` and `stop_sequence` flip flopping?

In [52]:
sd_test3 = trips_routes_times4.loc[
    (trips_routes_times4.organization_name == "San Diego Metropolitan Transit System")
    & (trips_routes_times4.route_id == "834")
    & (trips_routes_times4.direction_id == 0)
]

In [54]:
sd_test3.direction_id.nunique()

1

In [55]:
sd_test3.stop_id.nunique()

20

In [56]:
sd_test3.shape

(62, 14)

In [58]:
# sd_test3

#### Other Tests

In [60]:
la_test = trips_routes_times4.loc[
    (
        trips_routes_times4.schedule_gtfs_dataset_key
        == "0666caf3ec1ecc96b74f4477ee4bc939"
    )
    & (trips_routes_times4.route_id == "204-13172")
    & (trips_routes_times4.direction_id == 1)
]

In [61]:
sf_test = trips_routes_times4.loc[
    (
        trips_routes_times4.schedule_gtfs_dataset_key
        == "7cc0cb1871dfd558f11a2885c145d144"
    )
    & (trips_routes_times4.route_id == "30")
    & (trips_routes_times4.direction_id == 1)
]

In [63]:
# la_test

In [None]:
sf_test.stop_sequence.nunique()

In [None]:
sf_test.loc[sf_test.stop_id=="18027"]

### Convert scheduled and RT arrival times.

In [64]:
trips_routes_times3["rt_arrival_sec"].describe()

count   3061772.00
mean      48136.63
std       17699.97
min           0.00
25%       33491.00
50%       48279.00
75%       62288.00
max       86399.00
Name: rt_arrival_sec, dtype: float64

In [65]:
trips_routes_times3["scheduled_arrival_sec"].describe()

count   3046743.00
mean      49237.71
std       17716.63
min         720.00
25%       34149.00
50%       49000.00
75%       62878.00
max      108431.00
Name: scheduled_arrival_sec, dtype: float64

In [66]:
trips_routes_times3.loc[trips_routes_times3["scheduled_arrival_sec"] >= 86_400].shape

(43167, 14)

In [67]:
trips_routes_times4["converted_rt_arrival"] = pd.to_datetime(
    trips_routes_times4["service_date"]
) + pd.to_timedelta(trips_routes_times4["rt_arrival_sec"] % 86400, unit="s")

In [68]:
trips_routes_times4["converted_schd_arrival"] = pd.to_datetime(
    trips_routes_times4["service_date"]
) + pd.to_timedelta(trips_routes_times4["scheduled_arrival_sec"] % 86400, unit="s")

### Through delays (subtracting )`converted_rt_arrival` from `converted_schd_arrival` to fix  time stamps.

In [69]:
percentiles = [0.01, 0.02, 0.05, 0.1, 0.9, 0.95, 0.98, 0.99]

In [70]:
trips_routes_times4["delay_min"] = (
    trips_routes_times4["converted_rt_arrival"]
    - trips_routes_times4["converted_schd_arrival"]
).dt.total_seconds() / 60

In [71]:
print(trips_routes_times4.delay_min.describe(percentiles))

count   3046743.00
mean          2.03
std          32.83
min       -1439.78
1%           -5.27
2%           -3.87
5%           -2.47
10%          -1.52
50%           1.45
90%           7.67
95%          10.87
98%          15.82
99%          20.23
max        1439.98
Name: delay_min, dtype: float64


In [72]:
trips_routes_times4["converted_rt_arrival"] = np.where(
    trips_routes_times4["delay_min"] >= 600,
    trips_routes_times4["converted_rt_arrival"] - pd.Timedelta(days=1),
    trips_routes_times4["converted_rt_arrival"],
)

In [73]:
trips_routes_times4["converted_schd_arrival"] = np.where(
    trips_routes_times4["delay_min"] <= -600,
    trips_routes_times4["converted_schd_arrival"] - pd.Timedelta(days=1),
    trips_routes_times4["converted_schd_arrival"],
)

In [74]:
trips_routes_times4["converted_schd_arrival"] = np.where(
    trips_routes_times4["delay_min"] >= 600,
    trips_routes_times4["converted_schd_arrival"] + pd.Timedelta(days=1),
    trips_routes_times4["converted_schd_arrival"],
)

In [75]:
trips_routes_times4["converted_rt_arrival"] = np.where(
    trips_routes_times4["delay_min"] <= -600,
    trips_routes_times4["converted_rt_arrival"] + pd.Timedelta(days=1),
    trips_routes_times4["converted_rt_arrival"],
)

In [76]:
# Recalc delay_min
trips_routes_times4["delay_min"] = (
    trips_routes_times4["converted_rt_arrival"]
    - trips_routes_times4["converted_schd_arrival"]
).dt.total_seconds() / 60

In [77]:
# Recalc delay_min
trips_routes_times4["delay_min"] = (
    trips_routes_times4["converted_rt_arrival"]
    - trips_routes_times4["converted_schd_arrival"]
).dt.total_seconds() / 60

#### Question: How to know when to just throw away rows?

In [78]:
print(trips_routes_times4.delay_min.describe(percentiles))

count   3046743.00
mean          3.06
std          36.82
min       -2279.98
1%           -5.18
2%           -3.83
5%           -2.45
10%          -1.52
50%           1.45
90%           7.68
95%          10.88
98%          15.92
99%          20.50
max        2277.53
Name: delay_min, dtype: float64


In [79]:
trips_routes_times4.loc[trips_routes_times4.delay_min > 2277]

Unnamed: 0,service_date,caltrans_district,schedule_gtfs_dataset_key,feed_key,organization_name,route_long_name,route_type,route_id,direction_id,stop_id,stop_sequence,trip_instance_key,rt_arrival_sec,scheduled_arrival_sec,converted_rt_arrival,converted_schd_arrival,delay_min
1807404,2024-05-22,11 - San Diego,baeeb157e85a901e47b828ef9fe75091,db8c6e0cf5ece2a8cdb5bdc71d049bd1,Flagship Cruises and Events Inc.,Santee - 12th & Imperial,"Tram, Streetcar, Light rail",530,0.0,75044,11,24a5c851c69af5a1bd292c27d44ab6a3,49892,86040.0,2024-05-23 13:51:32,2024-05-21 23:54:00,2277.53
1807405,2024-05-22,11 - San Diego,baeeb157e85a901e47b828ef9fe75091,db8c6e0cf5ece2a8cdb5bdc71d049bd1,San Diego International Airport,Santee - 12th & Imperial,"Tram, Streetcar, Light rail",530,0.0,75044,11,24a5c851c69af5a1bd292c27d44ab6a3,49892,86040.0,2024-05-23 13:51:32,2024-05-21 23:54:00,2277.53
1807406,2024-05-22,11 - San Diego,baeeb157e85a901e47b828ef9fe75091,db8c6e0cf5ece2a8cdb5bdc71d049bd1,San Diego Metropolitan Transit System,Santee - 12th & Imperial,"Tram, Streetcar, Light rail",530,0.0,75044,11,24a5c851c69af5a1bd292c27d44ab6a3,49892,86040.0,2024-05-23 13:51:32,2024-05-21 23:54:00,2277.53


In [80]:
trips_routes_times4.loc[trips_routes_times4.delay_min <= -2279]

Unnamed: 0,service_date,caltrans_district,schedule_gtfs_dataset_key,feed_key,organization_name,route_long_name,route_type,route_id,direction_id,stop_id,stop_sequence,trip_instance_key,rt_arrival_sec,scheduled_arrival_sec,converted_rt_arrival,converted_schd_arrival,delay_min
1526522,2024-05-22,04 - Oakland,b82a23bef8a501e980c086ef269ffec7,ed7a212f2a38fd8734244030b40e4d07,City of Union City,Tamarack,Bus,4,1.0,3537271,7,e1f161b232c2de4d42e3de5b8658ba5f,58141,22140.0,2024-05-21 16:09:01,2024-05-23 06:09:00,-2279.98


In [81]:
trips_routes_times5 = trips_routes_times4.drop(
    columns=[
        "service_date",
        "delay_min",
    ]
)

### Calculate the actual & scheduled headway the `operator-route-direction_id-stop_sequence-stop_id-` grain

In [82]:
groupby_cols = [
    "caltrans_district",
    "schedule_gtfs_dataset_key",
    "feed_key",
    "organization_name",
    "route_id",
    "route_long_name",
    "route_type",
    "direction_id",
    "stop_id",
    "stop_sequence",
]

In [83]:
trips_routes_times5["actual_arrival_lag_min"] = (
    trips_routes_times5.groupby(groupby_cols)["converted_rt_arrival"]
    .diff()
    .dt.total_seconds()
    / 60
)

In [84]:
trips_routes_times5["scheduled_arrival_lag_min"] = (
    trips_routes_times5.groupby(groupby_cols)["converted_schd_arrival"]
    .diff()
    .dt.total_seconds()
    / 60
)

#### Check San Diego

In [85]:
sd_test = trips_routes_times5.loc[
    (trips_routes_times5.organization_name == "San Diego Metropolitan Transit System")
    & (trips_routes_times5.route_id == "834")
]

In [86]:
sd_test.shape

(62, 17)

In [87]:
trips_routes_times5["scheduled_arrival_lag_min"].describe(percentiles)

count   2857803.00
mean         32.75
std          52.38
min       -1200.00
1%           -2.00
2%            6.00
5%            9.00
10%          10.00
50%          23.00
90%          60.00
95%          63.00
98%         120.00
99%         210.00
max        1620.00
Name: scheduled_arrival_lag_min, dtype: float64

### Question: Last time, I received the suggestion to throw away things more than 2 hrs because that is not bunching. But wouldn't we want all rows to be included for this metric? 
* Filter out for just "extreme" values.

### Checking out some extreme values

In [102]:
preview_time_col = [
    "schedule_gtfs_dataset_key",
    "route_id",
    "stop_id",
    "stop_sequence",
    "converted_rt_arrival",
    "actual_arrival_lag_min",
    "converted_schd_arrival",
    "scheduled_arrival_lag_min",
]

### Many lags are actually empty b/c it's the first of that groupby-sequence.

### Transit Matters Method

In [103]:
transit_matters_df1 = trips_routes_times5.copy()

In [104]:
transit_matters_df1["pct_actual_schd_headway"] = (
    transit_matters_df1.actual_arrival_lag_min
    / transit_matters_df1.scheduled_arrival_lag_min
)

In [105]:
transit_matters_df1["bunched_y_n"] = np.where(
    transit_matters_df1["pct_actual_schd_headway"] < 0.25, "bunched", "not bunched"
)

In [106]:
transit_matters_df1.bunched_y_n.value_counts() / len(transit_matters_df1)

not bunched   0.98
bunched       0.02
Name: bunched_y_n, dtype: float64

#### Aggregate.
* At this point, it doesn't matter the sequence, we just care about how bunched the traffic is around one partiuclar stop. 
* See how many trips for that grain are considered "bunched" or not.

In [107]:
def bunched_not_bunched(
    df: pd.DataFrame, bunched_y_n: str, groupby_cols: list
) -> pd.DataFrame:
    df2 = df.loc[df.bunched_y_n == bunched_y_n].reset_index(drop=True)

    bunched_y_n = bunched_y_n.replace(" ", "_")
    agg1 = (
        df2.groupby(groupby_cols).agg({"trip_instance_key": "nunique"}).reset_index()
    ).rename(columns={"trip_instance_key": f"{bunched_y_n}_trips"})
    return agg1

In [108]:
def agg_final_df(df: pd.DataFrame) -> pd.DataFrame:
    groupby_cols = [
        "caltrans_district",
        "schedule_gtfs_dataset_key",
        "feed_key",
        "organization_name",
        "route_long_name",
        "route_type",
        "route_id",
        "direction_id",
        "stop_id",
        "stop_sequence",
    ]

    # Find total trips that are bunched
    bunched = bunched_not_bunched(df, "bunched", groupby_cols)

    # Find total trips that are NOT bunched
    not_bunched = bunched_not_bunched(df, "not bunched", groupby_cols)

    # Merge
    m1 = pd.merge(not_bunched, bunched, on=groupby_cols, how="outer")

    # Find the % of bunched trips
    m1 = m1.fillna(0)
    m1["all_trips"] = m1.not_bunched_trips + m1.bunched_trips
    m1["per_trip_bunched_per_stop"] = (m1.bunched_trips / m1.all_trips)*100

    # Filter out any rows with only one trip of that groupby combo
    # for that service date
    m1 = m1.loc[m1.all_trips > 1].reset_index(drop=True)
    m1 = m1.drop(columns=["not_bunched_trips", "bunched_trips"])

    return m1

In [109]:
transit_matters_m1 = agg_final_df(transit_matters_df1)

In [110]:
# transit_matters_m1 = (
#   transit_matters_m1.sort_values(by=["all_trips"], ascending=False)
#   .drop_duplicates(subset=transit_matters_agg)
#   .reset_index(drop=True)
# )

### Help: Swapped order of a bus is messing with the transit matters metric.
* How to solve for this?? 

In [111]:
preview_cols = [
    "converted_rt_arrival",
    "actual_arrival_lag_min",
    "converted_schd_arrival",
    "scheduled_arrival_lag_min",
    "pct_actual_schd_headway",
    "bunched_y_n",
]

In [113]:
example2 = transit_matters_df1.loc[
    (transit_matters_df1.stop_id == "2307719")
    & (transit_matters_df1.organization_name == "City of Visalia")
    & (transit_matters_df1.route_id == "2042")
]

#### Starting row 33484: the RT Arrival time is swapped. A bus that was scheduled to arrive at 4:27 arrived boefre the bus arrived at 3:42.
* This repeats again row 33486.

In [114]:
example2[preview_cols]

Unnamed: 0,converted_rt_arrival,actual_arrival_lag_min,converted_schd_arrival,scheduled_arrival_lag_min,pct_actual_schd_headway,bunched_y_n
1022772,2024-05-22 06:32:47,,2024-05-22 06:42:00,,,not bunched
1022773,2024-05-22 07:26:41,53.9,2024-05-22 07:27:00,45.0,1.2,not bunched
1022774,2024-05-22 08:06:11,39.5,2024-05-22 08:12:00,45.0,0.88,not bunched
1022775,2024-05-22 08:56:57,50.77,2024-05-22 08:57:00,45.0,1.13,not bunched
1022776,2024-05-22 09:37:59,41.03,2024-05-22 09:42:00,45.0,0.91,not bunched
1022777,2024-05-22 10:27:26,49.45,2024-05-22 10:27:00,45.0,1.1,not bunched
1022778,2024-05-22 11:10:05,42.65,2024-05-22 11:12:00,45.0,0.95,not bunched
1022779,2024-05-22 12:01:01,50.93,2024-05-22 11:57:00,45.0,1.13,not bunched
1022780,2024-05-22 12:38:08,37.12,2024-05-22 12:42:00,45.0,0.82,not bunched
1022781,2024-05-22 13:27:10,49.03,2024-05-22 13:27:00,45.0,1.09,not bunched


### Use 2 minute benchmark
* [Source](https://static1.squarespace.com/static/533b9a24e4b01d79d0ae4376/t/645e82de1f570b31497c44dc/1683915486889/TransitMatters-Headwaymanagement.pdf)
* Justifying the use of
headway maintenance. For example, in April
2022 the 66 bus significantly bunched around
several stops. When bunching is defined as
buses that run within two minutes or less of
each other, inbound buses towards Nubian
Square bunched 10% of the time at Brigham
Circle, 9% at Brookline Village and Roxbury
Crossing, and 8% of the time at Coolidge
Corner. Bunching is even more dramatic
outbound towards Harvard Square where
buses bunched over 35% of the time at Winship
St, 13% at Coolidge Corner and Harvard Ave at
Commonwealth Ave, and 12% at North Harvard
St at Western Ave. View more data about bus
bunching through the TransitMatters Data
Dashboard here.

* To Do: add back in route  & operator information

In [136]:
two_minutes_df = trips_routes_times5.copy()

In [137]:
two_minutes_df["bunched_y_n"] = np.where(
    (two_minutes_df["actual_arrival_lag_min"] > 0) & 
    (two_minutes_df["actual_arrival_lag_min"] <= 2), 
    "bunched", 
    "not bunched"
)

In [138]:
two_minutes_df.bunched_y_n.value_counts() / len(two_minutes_df)

not bunched   0.99
bunched       0.01
Name: bunched_y_n, dtype: float64

In [139]:
final_two_minute = agg_final_df(two_minutes_df)

In [140]:
final_two_minute.loc[
    (final_two_minute.stop_id == "2307695")
    & (final_two_minute.organization_name == "City of Visalia")
    & (final_two_minute.route_id == "2042")
]

Unnamed: 0,caltrans_district,schedule_gtfs_dataset_key,feed_key,organization_name,route_long_name,route_type,route_id,direction_id,stop_id,stop_sequence,all_trips,per_trip_bunched_per_stop
30193,06 - Fresno,3bda4652977200408690059ef2ec4b4d,0e89d1fd3bd2a09bbbd0d4f79ea5663b,City of Visalia,Route 9,Bus,2042,1.0,2307695,16,18.0,0.0


### Comparing both outcomes
* There are so many more bunched trips for the 2 minute approach.
* Add back in schedule_gtfs_key and then grab stop level data from the warehouse.

In [141]:
final_two_minute.per_trip_bunched_per_stop.describe(percentiles)

count   140760.00
mean         0.70
std          3.90
min          0.00
1%           0.00
2%           0.00
5%           0.00
10%          0.00
50%          0.00
90%          0.00
95%          3.66
98%          8.33
99%         14.29
max         75.00
Name: per_trip_bunched_per_stop, dtype: float64

In [142]:
final_two_minute.sort_values(by=["per_trip_bunched_per_stop"], ascending=False).head()

Unnamed: 0,caltrans_district,schedule_gtfs_dataset_key,feed_key,organization_name,route_long_name,route_type,route_id,direction_id,stop_id,stop_sequence,all_trips,per_trip_bunched_per_stop
39950,07 - Los Angeles,0666caf3ec1ecc96b74f4477ee4bc939,608992664173210532aa3e6cc573be2f,Los Angeles County Metropolitan Transportation Authority,Metro Local Line,Bus,182-13172,1.0,142124,8,4.0,75.0
39955,07 - Los Angeles,0666caf3ec1ecc96b74f4477ee4bc939,608992664173210532aa3e6cc573be2f,Los Angeles County Metropolitan Transportation Authority,Metro Local Line,Bus,182-13172,1.0,14817,7,4.0,75.0
39961,07 - Los Angeles,0666caf3ec1ecc96b74f4477ee4bc939,608992664173210532aa3e6cc573be2f,Los Angeles County Metropolitan Transportation Authority,Metro Local Line,Bus,182-13172,1.0,14820,5,4.0,75.0
8904,04 - Oakland,587e730fac4db21d54037e0f12b0dd5d,2aa353e11dbf4ca87f156324bdcd11ab,Central Contra Costa Transit Authority,Miramonte High/OIS/Orinda BART,Bus,606,0.0,831397,6,6.0,66.67
8903,04 - Oakland,587e730fac4db21d54037e0f12b0dd5d,2aa353e11dbf4ca87f156324bdcd11ab,Central Contra Costa Transit Authority,Miramonte High/OIS/Orinda BART,Bus,606,0.0,831396,5,6.0,66.67


In [143]:
transit_matters_m1.per_trip_bunched_per_stop.describe(percentiles)

count   140760.00
mean         1.14
std          4.35
min          0.00
1%           0.00
2%           0.00
5%           0.00
10%          0.00
50%          0.00
90%          3.17
95%          7.69
98%         14.29
99%         20.00
max         86.67
Name: per_trip_bunched_per_stop, dtype: float64

In [144]:
transit_matters_m1.sort_values(by=["per_trip_bunched_per_stop"], ascending=False).head(
    10
)

Unnamed: 0,caltrans_district,schedule_gtfs_dataset_key,feed_key,organization_name,route_long_name,route_type,route_id,direction_id,stop_id,stop_sequence,all_trips,per_trip_bunched_per_stop
26887,05 - San Luis Obispo,239f3baf3dd3b9e9464f66a777f9897d,a3c82f955ca3b93746786deb7fe4fb0d,Santa Barbara Metropolitan Transit District,West Santa Barbara,Bus,1,0.0,83,2,15.0,86.67
26807,05 - San Luis Obispo,239f3baf3dd3b9e9464f66a777f9897d,a3c82f955ca3b93746786deb7fe4fb0d,Santa Barbara Metropolitan Transit District,Goleta,Bus,6,0.0,84,47,4.0,75.0
26865,05 - San Luis Obispo,239f3baf3dd3b9e9464f66a777f9897d,a3c82f955ca3b93746786deb7fe4fb0d,Santa Barbara Metropolitan Transit District,UCSB Shuttle,Bus,28,0.0,161,3,4.0,75.0
26867,05 - San Luis Obispo,239f3baf3dd3b9e9464f66a777f9897d,a3c82f955ca3b93746786deb7fe4fb0d,Santa Barbara Metropolitan Transit District,UCSB Shuttle,Bus,28,0.0,22,8,4.0,75.0
26874,05 - San Luis Obispo,239f3baf3dd3b9e9464f66a777f9897d,a3c82f955ca3b93746786deb7fe4fb0d,Santa Barbara Metropolitan Transit District,UCSB Shuttle,Bus,28,0.0,804,6,4.0,75.0
26873,05 - San Luis Obispo,239f3baf3dd3b9e9464f66a777f9897d,a3c82f955ca3b93746786deb7fe4fb0d,Santa Barbara Metropolitan Transit District,UCSB Shuttle,Bus,28,0.0,598,5,4.0,75.0
26872,05 - San Luis Obispo,239f3baf3dd3b9e9464f66a777f9897d,a3c82f955ca3b93746786deb7fe4fb0d,Santa Barbara Metropolitan Transit District,UCSB Shuttle,Bus,28,0.0,597,4,4.0,75.0
26800,05 - San Luis Obispo,239f3baf3dd3b9e9464f66a777f9897d,a3c82f955ca3b93746786deb7fe4fb0d,Santa Barbara Metropolitan Transit District,Goleta,Bus,6,0.0,808,2,4.0,75.0
26790,05 - San Luis Obispo,239f3baf3dd3b9e9464f66a777f9897d,a3c82f955ca3b93746786deb7fe4fb0d,Santa Barbara Metropolitan Transit District,Goleta,Bus,6,0.0,780,46,4.0,75.0
26870,05 - San Luis Obispo,239f3baf3dd3b9e9464f66a777f9897d,a3c82f955ca3b93746786deb7fe4fb0d,Santa Barbara Metropolitan Transit District,UCSB Shuttle,Bus,28,0.0,369,7,4.0,75.0


### Make Visuals

In [145]:
freq_range = [
    "#ccbb44",
    "#e9d868",
    "#fcb40e",
    "#ff9c42",
    "#fc5c04",
    "#dd217d",
    "#dd217d",
    "#dd217d",
]

In [146]:
trips_routes_times5["hour"] = trips_routes_times5["converted_rt_arrival"].dt.hour
trips_routes_times5["min"] = trips_routes_times5["converted_rt_arrival"].dt.minute

In [147]:
def compare_approaches(
    stop_id: str, organization_name: str, route_id: str, stop_sequence: int
):
    transit_matter = transit_matters_m1.loc[
        (transit_matters_m1.stop_id == stop_id)
        & (transit_matters_m1.organization_name == organization_name)
        & (transit_matters_m1.route_id == route_id)
        & (transit_matters_m1.stop_sequence == stop_sequence)
    ]
    print("Transit Matters")
    display(transit_matter)

    two_min = final_two_minute.loc[
        (final_two_minute.stop_id == stop_id)
        & (final_two_minute.organization_name == organization_name)
        & (final_two_minute.route_id == route_id)
        & (final_two_minute.stop_sequence == stop_sequence)
    ]
    print("Two Minutes")
    display(two_min)
    total_trips = trips_routes_times5.loc[
        (trips_routes_times5.stop_id == stop_id)
        & (trips_routes_times5.organization_name == organization_name)
        & (trips_routes_times5.route_id == route_id)
        & (trips_routes_times5.stop_sequence == stop_sequence)
    ]

    og = trips_routes_times.loc[
        (trips_routes_times.stop_id == stop_id)
        & (trips_routes_times.organization_name == organization_name)
        & (trips_routes_times.route_id == route_id)
        & (trips_routes_times.stop_sequence == stop_sequence)
    ]

    display(total_trips.trip_instance_key.nunique())

    chart = (
        alt.Chart(total_trips)
        .mark_circle(size=500)
        .encode(
            x="hour",
            y="min",
            color=alt.Color(
                "hour",
                scale=alt.Scale(range=freq_range),
            ),
            tooltip=["hour", "min", "actual_arrival_lag_min"],
        )
        .properties(width=800, height=400)
    )
    display(chart)
    return og, total_trips

In [148]:
df_stc1, df_stc2 = compare_approaches(
    stop_id="62078",
    organization_name="Santa Clara Valley Transportation Authority",
    route_id="51",
    stop_sequence=13,
)

Transit Matters


Unnamed: 0,caltrans_district,schedule_gtfs_dataset_key,feed_key,organization_name,route_long_name,route_type,route_id,direction_id,stop_id,stop_sequence,all_trips,per_trip_bunched_per_stop
23883,04 - Oakland,fb467982dcc77a7f9199bebe709bb700,3fccf089909fbdb3d725a5c15fb062cb,Santa Clara Valley Transportation Authority,Moffett Field - West Valley Coll,Bus,51,0.0,62078,13,3.0,66.67


Two Minutes


Unnamed: 0,caltrans_district,schedule_gtfs_dataset_key,feed_key,organization_name,route_long_name,route_type,route_id,direction_id,stop_id,stop_sequence,all_trips,per_trip_bunched_per_stop
23883,04 - Oakland,fb467982dcc77a7f9199bebe709bb700,3fccf089909fbdb3d725a5c15fb062cb,Santa Clara Valley Transportation Authority,Moffett Field - West Valley Coll,Bus,51,0.0,62078,13,3.0,33.33


3

In [149]:
df_duarte1, df_duarte2 = compare_approaches(
    stop_id="2665",
    organization_name="City of Duarte",
    route_id="707",
    stop_sequence=3696,
)

Transit Matters


Unnamed: 0,caltrans_district,schedule_gtfs_dataset_key,feed_key,organization_name,route_long_name,route_type,route_id,direction_id,stop_id,stop_sequence,all_trips,per_trip_bunched_per_stop
84689,07 - Los Angeles,f74424acf8c41e4c1e9fd42838c4875c,96358f776e5fcd8d2b6066507aed6645,City of Duarte,Montclair-Pomona- El Monte- L.A.,Bus,707,0.0,2665,3696,2.0,50.0


Two Minutes


Unnamed: 0,caltrans_district,schedule_gtfs_dataset_key,feed_key,organization_name,route_long_name,route_type,route_id,direction_id,stop_id,stop_sequence,all_trips,per_trip_bunched_per_stop
84689,07 - Los Angeles,f74424acf8c41e4c1e9fd42838c4875c,96358f776e5fcd8d2b6066507aed6645,City of Duarte,Montclair-Pomona- El Monte- L.A.,Bus,707,0.0,2665,3696,2.0,50.0


2

In [150]:
df_vis1, df_vis2 = compare_approaches(
    stop_id="2307469",
    organization_name="City of Visalia",
    route_id="2042",
    stop_sequence=27,
)

Transit Matters


Unnamed: 0,caltrans_district,schedule_gtfs_dataset_key,feed_key,organization_name,route_long_name,route_type,route_id,direction_id,stop_id,stop_sequence,all_trips,per_trip_bunched_per_stop
30183,06 - Fresno,3bda4652977200408690059ef2ec4b4d,0e89d1fd3bd2a09bbbd0d4f79ea5663b,City of Visalia,Route 9,Bus,2042,1.0,2307469,27,16.0,18.75


Two Minutes


Unnamed: 0,caltrans_district,schedule_gtfs_dataset_key,feed_key,organization_name,route_long_name,route_type,route_id,direction_id,stop_id,stop_sequence,all_trips,per_trip_bunched_per_stop
30183,06 - Fresno,3bda4652977200408690059ef2ec4b4d,0e89d1fd3bd2a09bbbd0d4f79ea5663b,City of Visalia,Route 9,Bus,2042,1.0,2307469,27,16.0,0.0


16

In [151]:
df_sd1, df_sd2 = compare_approaches(
    stop_id="88949",
    organization_name="San Diego Metropolitan Transit System",
    route_id="834",
    stop_sequence=19,
)

Transit Matters


Unnamed: 0,caltrans_district,schedule_gtfs_dataset_key,feed_key,organization_name,route_long_name,route_type,route_id,direction_id,stop_id,stop_sequence,all_trips,per_trip_bunched_per_stop
132422,11 - San Diego,baeeb157e85a901e47b828ef9fe75091,db8c6e0cf5ece2a8cdb5bdc71d049bd1,San Diego Metropolitan Transit System,West Santee Loop,Bus,834,0.0,88949,19,2.0,0.0


Two Minutes


Unnamed: 0,caltrans_district,schedule_gtfs_dataset_key,feed_key,organization_name,route_long_name,route_type,route_id,direction_id,stop_id,stop_sequence,all_trips,per_trip_bunched_per_stop
132422,11 - San Diego,baeeb157e85a901e47b828ef9fe75091,db8c6e0cf5ece2a8cdb5bdc71d049bd1,San Diego Metropolitan Transit System,West Santee Loop,Bus,834,0.0,88949,19,2.0,0.0


2

#### Another example of a strange flip-flop of time stamps due to a

In [152]:
df_ucsb1, df_ucsb2 = compare_approaches(
    stop_id="22",
    organization_name="Santa Barbara Metropolitan Transit District",
    route_id="28",
    stop_sequence=8,
)

Transit Matters


Unnamed: 0,caltrans_district,schedule_gtfs_dataset_key,feed_key,organization_name,route_long_name,route_type,route_id,direction_id,stop_id,stop_sequence,all_trips,per_trip_bunched_per_stop
26867,05 - San Luis Obispo,239f3baf3dd3b9e9464f66a777f9897d,a3c82f955ca3b93746786deb7fe4fb0d,Santa Barbara Metropolitan Transit District,UCSB Shuttle,Bus,28,0.0,22,8,4.0,75.0


Two Minutes


Unnamed: 0,caltrans_district,schedule_gtfs_dataset_key,feed_key,organization_name,route_long_name,route_type,route_id,direction_id,stop_id,stop_sequence,all_trips,per_trip_bunched_per_stop
26867,05 - San Luis Obispo,239f3baf3dd3b9e9464f66a777f9897d,a3c82f955ca3b93746786deb7fe4fb0d,Santa Barbara Metropolitan Transit District,UCSB Shuttle,Bus,28,0.0,22,8,4.0,0.0


4

In [153]:
df_ucsb2

Unnamed: 0,caltrans_district,schedule_gtfs_dataset_key,feed_key,organization_name,route_long_name,route_type,route_id,direction_id,stop_id,stop_sequence,trip_instance_key,rt_arrival_sec,scheduled_arrival_sec,converted_rt_arrival,converted_schd_arrival,actual_arrival_lag_min,scheduled_arrival_lag_min,hour,min
989108,05 - San Luis Obispo,239f3baf3dd3b9e9464f66a777f9897d,a3c82f955ca3b93746786deb7fe4fb0d,Santa Barbara Metropolitan Transit District,UCSB Shuttle,Bus,28,0.0,22,8,991d2e0dc3ef0a55d97f318f135f7fd2,33828,28813.0,2024-05-22 09:23:48,2024-05-22 08:00:13,,,9,23
989109,05 - San Luis Obispo,239f3baf3dd3b9e9464f66a777f9897d,a3c82f955ca3b93746786deb7fe4fb0d,Santa Barbara Metropolitan Transit District,UCSB Shuttle,Bus,28,0.0,22,8,b591464a2c0a45bfc69e2687b190f2ff,35927,57973.0,2024-05-22 09:58:47,2024-05-22 16:06:13,34.98,486.0,9,58
989110,05 - San Luis Obispo,239f3baf3dd3b9e9464f66a777f9897d,a3c82f955ca3b93746786deb7fe4fb0d,Santa Barbara Metropolitan Transit District,UCSB Shuttle,Bus,28,0.0,22,8,adb98752ffd22d6e5bbb3790f8568ad9,39478,47173.0,2024-05-22 10:57:58,2024-05-22 13:06:13,59.18,-180.0,10,57
989111,05 - San Luis Obispo,239f3baf3dd3b9e9464f66a777f9897d,a3c82f955ca3b93746786deb7fe4fb0d,Santa Barbara Metropolitan Transit District,UCSB Shuttle,Bus,28,0.0,22,8,f5317a6f30d550d0b5911e555e07736a,73018,32053.0,2024-05-21 20:16:58,2024-05-23 08:54:13,-881.0,1188.0,20,16


In [154]:
df_ucsb1[['rt_arrival_sec','scheduled_arrival_sec']]

Unnamed: 0,rt_arrival_sec,scheduled_arrival_sec
218675,73018,32053.0
218735,35927,57973.0
218788,33828,28813.0
3653131,39478,47173.0
