## Transit Bunching 
* `cd data-analyses/rt_segment_speeds && pip install -r requirements.txt && cd ../_shared_utils && make setup_env && cd ../gtfs_digest`
* [Issue](https://github.com/cal-itp/data-analyses/issues/1099)


In [1]:
import datetime as dt

import altair as alt
import geopandas as gpd
import merge_data
import numpy as np
import pandas as pd
from segment_speed_utils import gtfs_schedule_wrangling, helpers, time_series_utils
from shared_utils import catalog_utils, rt_dates, rt_utils
from update_vars import GTFS_DATA_DICT, RT_SCHED_GCS, SCHED_GCS

# https://github.com/cal-itp/data-analyses/blob/main/_shared_utils/shared_utils/gtfs_analytics_data.yml
GTFS_DATA_DICT = catalog_utils.get_catalog("gtfs_analytics_data")

from segment_speed_utils.project_vars import (
    COMPILED_CACHED_VIEWS,
    GTFS_DATA_DICT,
    PROJECT_CRS,
    RT_SCHED_GCS,
    SCHED_GCS,
    SEGMENT_GCS,
)

In [2]:
import yaml

with open("readable.yml") as f:
    readable_dict = yaml.safe_load(f)
with open("color_palettes.yml") as f:
    color_dict = yaml.safe_load(f)

In [3]:
pd.options.display.max_columns = 100
pd.options.display.float_format = "{:.2f}".format
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [4]:
may_date = "2024-05-22"

In [5]:
drop_for_preview = [
    "schedule_gtfs_dataset_key",
    "trip_instance_key",
    "shape_array_key",
    "feed_key",
    "trip_id",
]

### Grab Routes

In [6]:
subset = [
    "schedule_gtfs_dataset_key",
    "route_id",
    "direction_id",
    "route_primary_direction",
    "service_date",
    "frequency",
]

In [7]:
route_dir_columns = [
    "schedule_gtfs_dataset_key",
    "route_id",
    "direction_id",
    "time_period",
    "route_primary_direction",
    "frequency",
    "service_date",
]

In [8]:
route_dir = merge_data.concatenate_schedule_by_route_direction([may_date])[
    route_dir_columns
]

In [9]:
route_dir["headway_minutes"] = 60 / route_dir.frequency

In [10]:
# Filter for only peak
route_dir = route_dir.loc[route_dir.time_period == "peak"].reset_index(drop=True)

In [11]:
len(route_dir)

3238

#### Attach operators and districts

In [12]:
# Grab Crosswalk
CROSSWALK = GTFS_DATA_DICT.schedule_tables.gtfs_key_crosswalk

In [13]:
crosswalk_cols = [
    "schedule_gtfs_dataset_key",
    "organization_name",
    "name",
    "caltrans_district",
]

In [14]:
crosswalk_df = (
    time_series_utils.concatenate_datasets_across_dates(
        SCHED_GCS, CROSSWALK, [may_date], data_type="df", columns=crosswalk_cols
    )
    .sort_values(["service_date"])
    .reset_index(drop=True)
)

In [15]:
crosswalk_df.head(2)

Unnamed: 0,schedule_gtfs_dataset_key,organization_name,name,caltrans_district,service_date
0,1770249a5a2e770ca90628434d4934b1,Ventura County Transportation Commission,VCTC GMV Schedule,07 - Los Angeles,2024-05-22
1,f8102a9c0693206bf36d302540bf1bcf,City of Corona,Corona Schedule,08 - San Bernardino,2024-05-22


In [16]:
routes = pd.merge(
    route_dir,
    crosswalk_df,
    on=["schedule_gtfs_dataset_key", "service_date"],
    how="left",
)

In [17]:
# routes = pd.concat([thousand_oaks, visalia, metro, metro_33])

In [18]:
len(routes)

4695

### Some headway minutes seem off? How come route_id 30 for City of LA has a headway of 60 minutes for direction 1, but  157 minute headway for direction 0? Shouldn't it be the same?

In [19]:
routes.loc[
    (routes.schedule_gtfs_dataset_key == "cc53a0dbf5df90e3009b9cb5d89d80ba")
    & (routes.route_id == "30")
    & (routes.direction_id == 0)
]

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id,time_period,route_primary_direction,frequency,service_date,headway_minutes,organization_name,name,caltrans_district
3426,cc53a0dbf5df90e3009b9cb5d89d80ba,30,0.0,peak,Westbound,0.38,2024-05-22,157.89,City of Los Angeles,LA DOT Schedule,07 - Los Angeles


In [20]:
routes.loc[
    (routes.schedule_gtfs_dataset_key == "cc53a0dbf5df90e3009b9cb5d89d80ba")
    & (routes.route_id == "30")
    & (routes.direction_id == 1)
]

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id,time_period,route_primary_direction,frequency,service_date,headway_minutes,organization_name,name,caltrans_district
3427,cc53a0dbf5df90e3009b9cb5d89d80ba,30,1.0,peak,Eastbound,1.0,2024-05-22,60.0,City of Los Angeles,LA DOT Schedule,07 - Los Angeles


### Add Trips

In [21]:
TABLE = GTFS_DATA_DICT.schedule_downloads.trips

In [22]:
FILE = f"{COMPILED_CACHED_VIEWS}{TABLE}_{may_date}.parquet"

In [23]:
trips_subset = [
    "gtfs_dataset_key",
    "route_id",
    "trip_instance_key",
    "shape_array_key",
    "feed_key",
    "route_long_name",
    "direction_id",
    "route_type",
]

In [24]:
trips = pd.read_parquet(FILE)[trips_subset].rename(
    columns={"gtfs_dataset_key": "schedule_gtfs_dataset_key"}
)

In [25]:
trips_routes = pd.merge(
    trips,
    routes,
    on=["schedule_gtfs_dataset_key", "route_id", "direction_id"],
    how="inner",
)

In [26]:
trips_routes.route_id.nunique()

1303

#### Help - I know we can get this from the warehouse but it seems cumbersome. Correct me if I'm wrong.

In [27]:
# https://gtfs.org/documentation/schedule/reference/#
route_type_crosswalk = {
    "route_type": ["0", "1", "2", "3", "4", "5", "6", "7", "11", "12"],
    "route_type_str": [
        "Tram, Streetcar, Light rail",
        "Subway, Metro",
        "Rail",
        "Bus",
        "Ferry.",
        "Cable tram.",
        "Aerial lift, suspended cable car (e.g., gondola lift, aerial tramway).",
        "Funicular.",
        "Trolleybus.",
        "Monorail.",
    ],
}

In [28]:
route_type_crosswalk_df = pd.DataFrame(route_type_crosswalk)

In [29]:
# Merge for route_type
trips_routes = pd.merge(
    trips_routes, route_type_crosswalk_df, on=["route_type"], how="left"
)

In [30]:
trips_routes = trips_routes.drop(columns=["route_type"]).rename(
    columns={"route_type_str": "route_type"}
)

### Get Stop Times 

In [31]:
rt_stop_times = pd.read_parquet(
    "gs://calitp-analytics-data/data-analyses/rt_vs_schedule/schedule_rt_stop_times_2024-05-22.parquet"
)

In [32]:
trips_routes.head(1)

Unnamed: 0,schedule_gtfs_dataset_key,route_id,trip_instance_key,shape_array_key,feed_key,route_long_name,direction_id,time_period,route_primary_direction,frequency,service_date,headway_minutes,organization_name,name,caltrans_district,route_type
0,1770249a5a2e770ca90628434d4934b1,3408,c256553e28c4bba693e3136240b35419,8f644f847e987de68e0cb6fcd339cf41,926867fdee73d5fbfe4f011871bcd830,Route 21,0.0,peak,Westbound,1.88,2024-05-22,31.91,Ventura County Transportation Commission,VCTC GMV Schedule,07 - Los Angeles,Bus


In [33]:
rt_stop_times.shape

(2601262, 7)

In [34]:
trips_routes_times = pd.merge(
    rt_stop_times,
    trips_routes,
    on=[
        "schedule_gtfs_dataset_key",
        "trip_instance_key",
    ],
    how="inner",
)

In [35]:
(trips_routes_times.scheduled_arrival_sec.isna().sum())

15029

#### Lots of duplicates??

In [36]:
len(trips_routes_times)

3672925

In [37]:
trips_routes_times2 = trips_routes_times.drop_duplicates().reset_index(drop=True)

In [38]:
len(trips_routes_times2)

3061772

### Sorting & Subsetting

In [39]:
subset = [
    "service_date",
    "caltrans_district",
    "schedule_gtfs_dataset_key",
    "feed_key",
    "organization_name",
    "route_long_name",
    "route_type",
    "route_id",
    "direction_id",
    "stop_id",
    "stop_sequence",
    "trip_instance_key",
    "rt_arrival_sec",
    "scheduled_arrival_sec",
    "headway_minutes",
]

In [40]:
trips_routes_times3 = trips_routes_times2[subset]

In [41]:
trips_routes_times4 = trips_routes_times3.sort_values(
    by=[
        "schedule_gtfs_dataset_key",
        "route_id",
        "direction_id",
        "stop_id",
        "stop_sequence",
        "rt_arrival_sec",
    ],
    ascending=[True, True, True, True, True, True],
).reset_index(drop=True)

### Convert scheduled and RT arrival times.

In [42]:
trips_routes_times4["converted_rt_arrival"] = pd.to_datetime(
    trips_routes_times4["service_date"]
) + pd.to_timedelta(trips_routes_times4["rt_arrival_sec"] % 86400, unit="s")

In [43]:
trips_routes_times4["converted_schd_arrival"] = pd.to_datetime(
    trips_routes_times4["service_date"]
) + pd.to_timedelta(trips_routes_times4["scheduled_arrival_sec"] % 86400, unit="s")

### Subtracting `converted_rt_arrival` from `converted_schd_arrival` to fix  time stamps.

In [44]:
percentiles = [0.01, 0.02, 0.05, 0.1, 0.9, 0.95, 0.98, 0.99]

In [45]:
trips_routes_times4["delay_min"] = (
    trips_routes_times4["converted_rt_arrival"]
    - trips_routes_times4["converted_schd_arrival"]
).dt.total_seconds() / 60

In [46]:
print(trips_routes_times4.delay_min.describe(percentiles))

count   3046743.00
mean          2.03
std          32.83
min       -1439.78
1%           -5.27
2%           -3.87
5%           -2.47
10%          -1.52
50%           1.45
90%           7.67
95%          10.87
98%          15.82
99%          20.23
max        1439.98
Name: delay_min, dtype: float64


In [47]:
trips_routes_times4["converted_rt_arrival"] = np.where(
    trips_routes_times4["delay_min"] >= 600,
    trips_routes_times4["converted_rt_arrival"] - pd.Timedelta(days=1),
    trips_routes_times4["converted_rt_arrival"],
)

In [48]:
trips_routes_times4["converted_rt_arrival"] = np.where(
    trips_routes_times4["delay_min"] <= -600,
    trips_routes_times4["converted_rt_arrival"] + pd.Timedelta(days=1),
    trips_routes_times4["converted_rt_arrival"],
)

In [49]:
trips_routes_times4["converted_schd_arrival"] = np.where(
    trips_routes_times4["delay_min"] <= -600,
    trips_routes_times4["converted_schd_arrival"] - pd.Timedelta(days=1),
    trips_routes_times4["converted_schd_arrival"],
)

In [50]:
trips_routes_times4["converted_schd_arrival"] = np.where(
    trips_routes_times4["delay_min"] >= 600,
    trips_routes_times4["converted_schd_arrival"] + pd.Timedelta(days=1),
    trips_routes_times4["converted_schd_arrival"],
)

In [51]:
# Recalc delay_min
trips_routes_times4["delay_min"] = (
    trips_routes_times4["converted_rt_arrival"]
    - trips_routes_times4["converted_schd_arrival"]
).dt.total_seconds() / 60

In [52]:
print(trips_routes_times4.delay_min.describe(percentiles))

count   3046743.00
mean          3.06
std          36.82
min       -2279.98
1%           -5.18
2%           -3.83
5%           -2.45
10%          -1.52
50%           1.45
90%           7.68
95%          10.88
98%          15.92
99%          20.50
max        2277.53
Name: delay_min, dtype: float64


In [53]:
extreme_values = trips_routes_times4.loc[
    (trips_routes_times4.delay_min < -5.18) | (trips_routes_times4.delay_min > 20.50)
]

In [54]:
print(extreme_values.delay_min.describe(percentiles))

count   60949.00
mean       38.11
std       256.50
min     -2279.98
1%       -126.55
2%        -55.63
5%        -25.72
10%       -13.95
50%        -5.18
90%        43.75
95%        74.00
98%      1443.00
99%      1450.90
max      2277.53
Name: delay_min, dtype: float64


In [55]:
extreme_values.loc[extreme_values.delay_min > 71].sample(3)

Unnamed: 0,service_date,caltrans_district,schedule_gtfs_dataset_key,feed_key,organization_name,route_long_name,route_type,route_id,direction_id,stop_id,stop_sequence,trip_instance_key,rt_arrival_sec,scheduled_arrival_sec,headway_minutes,converted_rt_arrival,converted_schd_arrival,delay_min
989050,2024-05-22,05 - San Luis Obispo,239f3baf3dd3b9e9464f66a777f9897d,a3c82f955ca3b93746786deb7fe4fb0d,Santa Barbara Metropolitan Transit District,Ellwood,Bus,25,1.0,30,2,c4a350efc3387d2cc5c1d556bf76d74b,56953,28932.0,43.48,2024-05-22 15:49:13,2024-05-22 08:02:12,467.02
1712446,2024-05-22,11 - San Diego,baeeb157e85a901e47b828ef9fe75091,db8c6e0cf5ece2a8cdb5bdc71d049bd1,San Diego International Airport,Old Town - UTC via Pacific Beach,Bus,30,0.0,12270,10,0ef61dffae42c56b1873ea6bf8dac916,9616,86400.0,15.46,2024-05-22 02:40:16,2024-05-22 00:00:00,160.27
945593,2024-05-22,07 - Los Angeles,1770249a5a2e770ca90628434d4934b1,926867fdee73d5fbfe4f011871bcd830,City of Thousand Oaks,Route 10,Bus,4778,0.0,3737147,18,0602dd78ba40b3981325457f8063a82e,57236,48720.0,60.0,2024-05-22 15:53:56,2024-05-22 13:32:00,141.93


### Add some columns

In [56]:
trips_routes_times4["rt_hour"] = trips_routes_times4["converted_rt_arrival"].dt.hour
trips_routes_times4["rt_min"] = trips_routes_times4["converted_rt_arrival"].dt.minute
trips_routes_times4["schd_hour"] = trips_routes_times4["converted_schd_arrival"].dt.hour
trips_routes_times4["schd_min"] = trips_routes_times4[
    "converted_schd_arrival"
].dt.minute
trips_routes_times4["converted_schd_arrival_tooltip"] = trips_routes_times4[
    "converted_schd_arrival"
].dt.strftime("%Y-%m-%d %H:%M:%S")
trips_routes_times4["converted_rt_arrival_tooltip"] = trips_routes_times4[
    "converted_rt_arrival"
].dt.strftime("%Y-%m-%d %H:%M:%S")

### Calculate the actual headway the `operator-route-direction_id-stop_sequence-stop_id-` grain

In [57]:
groupby_cols = [
    "caltrans_district",
    "schedule_gtfs_dataset_key",
    "feed_key",
    "organization_name",
    "route_id",
    "route_long_name",
    "route_type",
    "direction_id",
    "stop_id",
    "stop_sequence",
]

In [58]:
trips_routes_times4["actual_arrival_lag_min"] = (
    trips_routes_times4.groupby(groupby_cols)["converted_rt_arrival"]
    .diff()
    .dt.total_seconds()
    / 60
)

In [59]:
trips_routes_times4.loc[
    (
        trips_routes_times4.schedule_gtfs_dataset_key
        == "cc53a0dbf5df90e3009b9cb5d89d80ba"
    )
    & (trips_routes_times4.route_id == "30")
    & (trips_routes_times4.direction_id == 1)
].headway_minutes.describe()

count   476.00
mean     60.00
std       0.00
min      60.00
25%      60.00
50%      60.00
75%      60.00
max      60.00
Name: headway_minutes, dtype: float64

#### Check San Diego Ex: which has been messing up

In [60]:
sd_test = trips_routes_times4.loc[
    (trips_routes_times4.organization_name == "San Diego Metropolitan Transit System")
    & (trips_routes_times4.route_id == "834")
]

In [61]:
# sd_test

### Headway minutes are funky?

In [62]:
# # agg_operator_route_dir2.sort_values(by=["pct_bunched"], ascending=False).head(30)

In [63]:
trips_routes_times4.loc[
    (trips_routes_times4.organization_name == "Eastern Contra Costa Transit Authority")
    & (trips_routes_times4.route_id == "390")
    & (trips_routes_times4.direction_id == 1)
][["schedule_gtfs_dataset_key", "headway_minutes"]].drop_duplicates()

Unnamed: 0,schedule_gtfs_dataset_key,headway_minutes
1132081,55a01ef72af21906934ae8ffb4786e86,500.0


In [64]:
routes.loc[
    (routes.schedule_gtfs_dataset_key == "55a01ef72af21906934ae8ffb4786e86")
    & (routes.route_id == "390")
    & (routes.direction_id == 1)
]

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id,time_period,route_primary_direction,frequency,service_date,headway_minutes,organization_name,name,caltrans_district
1380,55a01ef72af21906934ae8ffb4786e86,390,1.0,peak,Westbound,0.12,2024-05-22,500.0,Eastern Contra Costa Transit Authority,Bay Area 511 Tri Delta Schedule,04 - Oakland


In [65]:
routes.loc[
    (routes.schedule_gtfs_dataset_key == "55a01ef72af21906934ae8ffb4786e86")
    & (routes.route_id == "390")
    & (routes.direction_id == 0)
]

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id,time_period,route_primary_direction,frequency,service_date,headway_minutes,organization_name,name,caltrans_district
1379,55a01ef72af21906934ae8ffb4786e86,390,0.0,peak,Eastbound,1.0,2024-05-22,60.0,Eastern Contra Costa Transit Authority,Bay Area 511 Tri Delta Schedule,04 - Oakland


In [66]:
trips_routes_times4.loc[
    (trips_routes_times4.organization_name == "Eastern Contra Costa Transit Authority")
    & (trips_routes_times4.route_id == "390")
    & (trips_routes_times4.direction_id == 0)
][["headway_minutes"]].drop_duplicates()

Unnamed: 0,headway_minutes
1132018,60.0


In [67]:
trips_routes_times4.loc[
    (trips_routes_times4.organization_name == "City of Los Angeles")
    & (trips_routes_times4.route_id == "30")
    & (trips_routes_times4.direction_id == 0)
][["headway_minutes"]].drop_duplicates()

Unnamed: 0,headway_minutes
2279235,157.89


In [68]:
trips_routes_times4.loc[
    (trips_routes_times4.organization_name == "City of Los Angeles")
    & (trips_routes_times4.route_id == "30")
    & (trips_routes_times4.direction_id == 1)
][
    [
        "stop_id",
        "stop_sequence",
        "rt_arrival_sec",
        "scheduled_arrival_sec",
        "headway_minutes",
        "converted_rt_arrival",
        "actual_arrival_lag_min",
    ]
].head()

Unnamed: 0,stop_id,stop_sequence,rt_arrival_sec,scheduled_arrival_sec,headway_minutes,converted_rt_arrival,actual_arrival_lag_min
2279697,1457172,43,68171,65400.0,60.0,2024-05-22 18:56:11,
2279698,1457172,43,70650,70020.0,60.0,2024-05-22 19:37:30,41.32
2279699,1457172,43,71702,71460.0,60.0,2024-05-22 19:55:02,17.53
2279700,414012,2,50387,50400.0,60.0,2024-05-22 13:59:47,
2279701,414012,2,52181,51600.0,60.0,2024-05-22 14:29:41,29.9


In [69]:
trips_routes_times4.loc[
    (trips_routes_times4.organization_name == "City of Los Angeles")
    & (trips_routes_times4.route_id == "30")
    & (trips_routes_times4.direction_id == 0)
][
    [
        "stop_id",
        "stop_sequence",
        "rt_arrival_sec",
        "scheduled_arrival_sec",
        "headway_minutes",
        "converted_rt_arrival",
        "actual_arrival_lag_min",
    ]
].head()

Unnamed: 0,stop_id,stop_sequence,rt_arrival_sec,scheduled_arrival_sec,headway_minutes,converted_rt_arrival,actual_arrival_lag_min
2279235,1457171,1,18186,17760.0,157.89,2024-05-22 05:03:06,
2279236,1457171,1,18559,18660.0,157.89,2024-05-22 05:09:19,6.22
2279237,1457171,1,20007,19860.0,157.89,2024-05-22 05:33:27,24.13
2279238,1457171,1,20640,20460.0,157.89,2024-05-22 05:44:00,10.55
2279239,1457171,1,21053,21060.0,157.89,2024-05-22 05:50:53,6.88


### MBTA - Massachusetts Bay Transportation Authority: 25% of scheduled headway 
* [Source](https://transitmatters.org/blog/reveal-mbtas-slowest-most-bunched-bus)
* [2024 Report](https://drive.google.com/file/d/1QFTVg0N3-uQeVoMqlOE6QLPqcoCtifzp/view?pli=1)
    * Taking a data-backed approach by relying on archival bus arrival and departure times from the MBTAʼs Open Data Portal and augmenting the data
      with route information from the MBTAʼs GTFS Feed,we adapted the methodology to reflect Bostonʼs
         unique transit characteristics as well as the post- COVID ridership dynamic to find bus speeds and bus bunching rates.
    * We limited this analysis to routes that had 500 or more daily riders, and only examined trips between 7am and 7pm on weekdays.
    * Adapted from NYC's analysis [here](https://www.nypirg.org/pubs/202311/Top_Ten_Best_Worst_in_NYC_Transit_2010-2019_FINAL.pdf)
    * To calculate the most bunched buses, we first
        defined a "bunch" as a bus that arrives within 25%
        of the scheduled headway of the bus in front of it.
        For example, if a bus is scheduled to arrive every
        10 minutes, a bus that arrives less than 2.5
        minutes a�er the bus in front of it is considered
        "bunched". We then looked at all time point events
        between 7am and 7pm on weekdays for each
        route. We matched each one to that dayʼs GTFS
        schedule to calculate the appropriate scheduled
        headway for that time of day and then calculated
        the total percent of departure events that met our
        bunching criteria. [here](https://static1.squarespace.com/static/533b9a24e4b01d79d0ae4376/t/6617ec40675223398aac12bf/1712843871514/TransitMatters-Bus-Bunching-Reports-Oct-2023)
    * They calculate it on the route level.
    * If a route has a bunching rate of 10% that means that every 1 out of 10 buses are
bunched. For a rider who does a round trip every day of the month, say 60 individual
trips, that means that the rider will experience bunching 6 times. (AH: how did they consider a trip to be bunched??)
    * Bunching typically worsens throughout a trip and
is most severe at the end of its route. However,
poor scheduling, dispatching, and operational
policy result in buses departing in a bunch, which
sets trips up for failure.
* [2023 Report](https://static1.squarespace.com/static/533b9a24e4b01d79d0ae4376/t/6617ec40675223398aac12bf/1712843871514/TransitMatters-Bus-Bunching-Reports-Oct-2023)
    * Analyzing bunching on a stop level: how many trips for a stop is bunched? 
    * Here, bunching is defined as headways < 25% of the scheduled_headway.
* 

In [70]:
transit_matters_df1 = trips_routes_times4.copy()

#### Use the scheduled headway min instead of calculating it

In [71]:
transit_matters_df1["pct_actual_schd_headway"] = (
    transit_matters_df1.actual_arrival_lag_min / transit_matters_df1.headway_minutes
)

In [72]:
transit_matters_df1["pct_actual_schd_headway"].describe(percentiles)

count   2871307.00
mean          1.46
std           4.40
min        -249.09
1%            0.11
2%            0.22
5%            0.47
10%           0.65
50%           1.04
90%           2.24
95%           3.12
98%           4.96
99%           8.24
max         249.95
Name: pct_actual_schd_headway, dtype: float64

In [73]:
transit_matters_df1.loc[transit_matters_df1.pct_actual_schd_headway > 249].head()

Unnamed: 0,service_date,caltrans_district,schedule_gtfs_dataset_key,feed_key,organization_name,route_long_name,route_type,route_id,direction_id,stop_id,stop_sequence,trip_instance_key,rt_arrival_sec,scheduled_arrival_sec,headway_minutes,converted_rt_arrival,converted_schd_arrival,delay_min,rt_hour,rt_min,schd_hour,schd_min,converted_schd_arrival_tooltip,converted_rt_arrival_tooltip,actual_arrival_lag_min,pct_actual_schd_headway
1188578,2024-05-22,04 - Oakland,7cc0cb1871dfd558f11a2885c145d144,7f69c2fdaa134642f14064a0b64d1495,City and County of San Francisco,CALIFORNIA,Bus,1,0.0,16300,13,77dcdcc4aa4e2be2467711f6b41baf7c,298,86355.0,5.78,2024-05-23 00:04:58,2024-05-21 23:59:15,1445.72,0,4,23.0,59.0,2024-05-21 23:59:15,2024-05-23 00:04:58,1443.65,249.75
1188739,2024-05-22,04 - Oakland,7cc0cb1871dfd558f11a2885c145d144,7f69c2fdaa134642f14064a0b64d1495,City and County of San Francisco,CALIFORNIA,Bus,1,0.0,16301,11,77dcdcc4aa4e2be2467711f6b41baf7c,296,86277.0,5.78,2024-05-23 00:04:56,2024-05-21 23:57:57,1446.98,0,4,23.0,57.0,2024-05-21 23:57:57,2024-05-23 00:04:56,1444.8,249.95
1189217,2024-05-22,04 - Oakland,7cc0cb1871dfd558f11a2885c145d144,7f69c2fdaa134642f14064a0b64d1495,City and County of San Francisco,CALIFORNIA,Bus,1,0.0,16304,12,77dcdcc4aa4e2be2467711f6b41baf7c,297,86316.0,5.78,2024-05-23 00:04:57,2024-05-21 23:58:36,1446.35,0,4,23.0,58.0,2024-05-21 23:58:36,2024-05-23 00:04:57,1444.17,249.84


In [74]:
# transit_matters_df1.loc[transit_matters_df1.pct_actual_schd_headway < 0].head()

In [75]:
transit_matters_df1["bunched_y_n"] = np.where(
    transit_matters_df1["pct_actual_schd_headway"] < 0.25, "bunched", "not bunched"
)

In [76]:
transit_matters_df1.bunched_y_n.value_counts() / len(transit_matters_df1)

not bunched   0.98
bunched       0.02
Name: bunched_y_n, dtype: float64

In [77]:
transit_matters_df1.loc[
    (
        transit_matters_df1.schedule_gtfs_dataset_key
        == "cc53a0dbf5df90e3009b9cb5d89d80ba"
    )
    & (transit_matters_df1.route_id == "30")
    & (transit_matters_df1.direction_id == 0)
].headway_minutes.describe()

count   462.00
mean    157.89
std       0.00
min     157.89
25%     157.89
50%     157.89
75%     157.89
max     157.89
Name: headway_minutes, dtype: float64

### Transit Matters: 2 minute benchmark
* [Source](https://static1.squarespace.com/static/533b9a24e4b01d79d0ae4376/t/645e82de1f570b31497c44dc/1683915486889/TransitMatters-Headwaymanagement.pdf)
* Justifying the use of
headway maintenance. For example, in April
2022 the 66 bus significantly bunched around
several stops. <b>When bunching is defined as
buses that run within two minutes or less of
each other</b>, inbound buses towards Nubian
Square bunched 10% of the time at Brigham
Circle, 9% at Brookline Village and Roxbury
Crossing, and 8% of the time at Coolidge
Corner. Bunching is even more dramatic
outbound towards Harvard Square where
buses bunched over 35% of the time at Winship
St, 13% at Coolidge Corner and Harvard Ave at
Commonwealth Ave, and 12% at North Harvard
St at Western Ave. View more data about bus
bunching through the TransitMatters Data
Dashboard here.


In [78]:
two_minutes_df = trips_routes_times4.copy()

#### Added my own condition here to not tag any rows below 0...

In [79]:
two_minutes_df["bunched_y_n"] = np.where(
    (two_minutes_df["actual_arrival_lag_min"] > 0)
    & (two_minutes_df["actual_arrival_lag_min"] <= 2),
    "bunched",
    "not bunched",
)

In [80]:
two_minutes_df.bunched_y_n.value_counts() / len(two_minutes_df)

not bunched   0.99
bunched       0.01
Name: bunched_y_n, dtype: float64

In [81]:
two_minutes_df.loc[two_minutes_df.actual_arrival_lag_min < 0].shape

(1598, 26)

In [82]:
negative_lags = two_minutes_df.loc[two_minutes_df.actual_arrival_lag_min < 0]

In [83]:
print(negative_lags.actual_arrival_lag_min.describe(percentiles))

count    1598.00
mean    -1343.79
std       168.71
min     -1440.00
1%      -1439.53
2%      -1438.97
5%      -1436.46
10%     -1433.69
50%     -1410.76
90%     -1097.13
95%     -1071.67
98%      -961.42
99%      -807.49
max       -68.13
Name: actual_arrival_lag_min, dtype: float64


In [84]:
negative_lags.loc[(negative_lags.actual_arrival_lag_min > -807)].head(5)

Unnamed: 0,service_date,caltrans_district,schedule_gtfs_dataset_key,feed_key,organization_name,route_long_name,route_type,route_id,direction_id,stop_id,stop_sequence,trip_instance_key,rt_arrival_sec,scheduled_arrival_sec,headway_minutes,converted_rt_arrival,converted_schd_arrival,delay_min,rt_hour,rt_min,schd_hour,schd_min,converted_schd_arrival_tooltip,converted_rt_arrival_tooltip,actual_arrival_lag_min,bunched_y_n
122816,2024-05-22,07 - Los Angeles,0666caf3ec1ecc96b74f4477ee4bc939,608992664173210532aa3e6cc573be2f,Los Angeles County Metropolitan Transportation Authority,Metro Local Line,Bus,14-13172,1.0,13406,77,7c0fff930dc74bb927d64aa10dacc395,62427,62520.0,12.63,2024-05-22 17:20:27,2024-05-22 17:22:00,-1.55,17,20,17.0,22.0,2024-05-22 17:22:00,2024-05-22 17:20:27,-402.1,not bunched
131788,2024-05-22,07 - Los Angeles,0666caf3ec1ecc96b74f4477ee4bc939,608992664173210532aa3e6cc573be2f,Los Angeles County Metropolitan Transportation Authority,Metro Local Line,Bus,150-13172,1.0,16804,10,948ca978e46cd771aa0429d9120a96a8,86376,86400.0,20.83,2024-05-21 23:59:36,2024-05-23 00:00:00,-1440.4,23,59,0.0,0.0,2024-05-23 00:00:00,2024-05-21 23:59:36,-290.47,not bunched
387325,2024-05-22,07 - Los Angeles,0666caf3ec1ecc96b74f4477ee4bc939,608992664173210532aa3e6cc573be2f,Los Angeles County Metropolitan Transportation Authority,Metro Local Line,Bus,233-13172,0.0,2231,62,583bf317904be0fc4f3727bd7698b4f4,86224,86460.0,10.43,2024-05-21 23:57:04,2024-05-23 00:01:00,-1443.93,23,57,0.0,1.0,2024-05-23 00:01:00,2024-05-21 23:57:04,-121.28,not bunched
387369,2024-05-22,07 - Los Angeles,0666caf3ec1ecc96b74f4477ee4bc939,608992664173210532aa3e6cc573be2f,Los Angeles County Metropolitan Transportation Authority,Metro Local Line,Bus,233-13172,0.0,2258,61,583bf317904be0fc4f3727bd7698b4f4,86200,86400.0,10.43,2024-05-21 23:56:40,2024-05-23 00:00:00,-1443.33,23,56,0.0,0.0,2024-05-23 00:00:00,2024-05-21 23:56:40,-121.13,not bunched
387764,2024-05-22,07 - Los Angeles,0666caf3ec1ecc96b74f4477ee4bc939,608992664173210532aa3e6cc573be2f,Los Angeles County Metropolitan Transportation Authority,Metro Local Line,Bus,233-13172,0.0,4348,63,583bf317904be0fc4f3727bd7698b4f4,86251,86460.0,10.43,2024-05-21 23:57:31,2024-05-23 00:01:00,-1443.48,23,57,0.0,1.0,2024-05-23 00:01:00,2024-05-21 23:57:31,-121.43,not bunched


### Aggregate


In [93]:
def bunched_not_bunched(
    df: pd.DataFrame, bunched_y_n: str, groupby_cols: list, agg_col: str, agg_type: str
) -> pd.DataFrame:
    df2 = df.loc[df.bunched_y_n == bunched_y_n].reset_index(drop=True)

    bunched_y_n = bunched_y_n.replace(" ", "_")
    agg1 = (df2.groupby(groupby_cols).agg({agg_col: agg_type}).reset_index()).rename(
        columns={agg_col: bunched_y_n}
    )
    return agg1

In [94]:
def aggregation(
    df: pd.DataFrame, groupby_cols: list, merge_cols: list, agg_col: str, agg_type: str
) -> pd.DataFrame:
    # Find total trips that are bunched
    bunched = bunched_not_bunched(df, "bunched", groupby_cols, agg_col, agg_type)
    # Find total trips that are NOT bunched
    not_bunched = bunched_not_bunched(
        df, "not bunched", groupby_cols, agg_col, agg_type
    )
    # Merge
    m1 = pd.merge(
        bunched,
        not_bunched,
        on=merge_cols,
        how="outer",
    )

    m1 = m1.fillna(0)
    m1.bunched = m1.bunched.apply(np.int64)
    m1.not_bunched = m1.not_bunched.apply(np.int64)
    # display(m1.info())

    # Find all of the nunique columns
    m1[f"all_{agg_col}"] = (m1.not_bunched + m1.bunched).astype(int)

    # Filter out any rows with only one trip of that groupby combo
    # for that service date
    m1 = m1.loc[m1[f"all_{agg_col}"] > 1].reset_index(drop=True)

    # Find % of bunched and not bunched
    m1["pct_bunched"] = (m1.bunched / m1[f"all_{agg_col}"]) * 100
    m1["pct_not_bunched"] = (m1.not_bunched / m1[f"all_{agg_col}"]) * 100

    return m1

#### Transit Matters Method: Stop Grain
* [2023 report](https://static1.squarespace.com/static/533b9a24e4b01d79d0ae4376/t/6617ec40675223398aac12bf/1712843871514/TransitMatters-Bus-Bunching-Reports-Oct-2023)

In [96]:
transit_groupby_cols = [
    "caltrans_district",
    "schedule_gtfs_dataset_key",
    "feed_key",
    "organization_name",
    "route_long_name",
    "route_type",
    "route_id",
    "direction_id",
    "stop_id",
]

In [97]:
transit_matter_ag = aggregation(
    transit_matters_df1,
    transit_groupby_cols,
    transit_groupby_cols,
    "trip_instance_key",
    "nunique",
)

In [98]:
transit_matter_ag.head()

Unnamed: 0,caltrans_district,schedule_gtfs_dataset_key,feed_key,organization_name,route_long_name,route_type,route_id,direction_id,stop_id,bunched,not_bunched,all_trip_instance_key,pct_bunched,pct_not_bunched
0,03 - Marysville,3c62ad6ee589d56eca915ce291a5df0a,9097af5e9a0e3909ca754a46ca037919,Yolo County Transportation District,ROUTE 215 EB,Bus,eb077710-0df4-4c7a-828e-914c6769117d,0.0,12789766-807c-4e66-9f92-f18123353300,1,9,10,10.0,90.0
1,03 - Marysville,3c62ad6ee589d56eca915ce291a5df0a,9097af5e9a0e3909ca754a46ca037919,Yolo County Transportation District,ROUTE 215 EB,Bus,eb077710-0df4-4c7a-828e-914c6769117d,0.0,1842225b-1d46-4127-a9ba-2aab24a1f30b,1,7,8,12.5,87.5
2,03 - Marysville,3c62ad6ee589d56eca915ce291a5df0a,9097af5e9a0e3909ca754a46ca037919,Yolo County Transportation District,ROUTE 215 EB,Bus,eb077710-0df4-4c7a-828e-914c6769117d,0.0,339cf3c9-e4e1-4144-b60f-ed32f130c9bc,1,9,10,10.0,90.0
3,03 - Marysville,3c62ad6ee589d56eca915ce291a5df0a,9097af5e9a0e3909ca754a46ca037919,Yolo County Transportation District,ROUTE 215 EB,Bus,eb077710-0df4-4c7a-828e-914c6769117d,0.0,355b416e-8231-4c4d-b1d8-6c150bb8d0f2,1,7,8,12.5,87.5
4,03 - Marysville,3c62ad6ee589d56eca915ce291a5df0a,9097af5e9a0e3909ca754a46ca037919,Yolo County Transportation District,ROUTE 215 EB,Bus,eb077710-0df4-4c7a-828e-914c6769117d,0.0,39d50615-06d2-4a3d-a7aa-cbc50f9ee924,1,8,9,11.11,88.89


In [100]:
transit_matter_ag.shape

(99015, 14)

In [101]:
transit_matter_ag.pct_bunched.describe(percentiles)

count   99015.00
mean        2.80
std        10.08
min         0.00
1%          0.00
2%          0.00
5%          0.00
10%         0.00
50%         0.00
90%         5.26
95%        11.88
98%        50.00
99%        50.00
max        91.67
Name: pct_bunched, dtype: float64

#####  City of LA Route 30 is very bunched b/c of the scheduled headway which is much higher than waht is happening in the RT data.

In [103]:
transit_matter_ag.loc[transit_matter_ag.pct_bunched > 90].head()

Unnamed: 0,caltrans_district,schedule_gtfs_dataset_key,feed_key,organization_name,route_long_name,route_type,route_id,direction_id,stop_id,bunched,not_bunched,all_trip_instance_key,pct_bunched,pct_not_bunched
14599,07 - Los Angeles,cc53a0dbf5df90e3009b9cb5d89d80ba,2cfdf0e33e9229d6b0ad124d956f5856,City of Los Angeles,CE422,Bus,30,0.0,422300,11,1,12,91.67,8.33
14601,07 - Los Angeles,cc53a0dbf5df90e3009b9cb5d89d80ba,2cfdf0e33e9229d6b0ad124d956f5856,City of Los Angeles,CE422,Bus,30,0.0,422305,11,1,12,91.67,8.33
14602,07 - Los Angeles,cc53a0dbf5df90e3009b9cb5d89d80ba,2cfdf0e33e9229d6b0ad124d956f5856,City of Los Angeles,CE422,Bus,30,0.0,422306,11,1,12,91.67,8.33
14603,07 - Los Angeles,cc53a0dbf5df90e3009b9cb5d89d80ba,2cfdf0e33e9229d6b0ad124d956f5856,City of Los Angeles,CE422,Bus,30,0.0,422309,11,1,12,91.67,8.33
14604,07 - Los Angeles,cc53a0dbf5df90e3009b9cb5d89d80ba,2cfdf0e33e9229d6b0ad124d956f5856,City of Los Angeles,CE422,Bus,30,0.0,422310,11,1,12,91.67,8.33


In [126]:
la_30_og = trips_routes_times4.loc[
        (trips_routes_times4.stop_id == "422300")
        & (trips_routes_times4.organization_name == "City of Los Angeles")
        & (trips_routes_times4.route_id == "30")
    ]


#### Aggregate for `operator-route-direction`: How many stops are bunched?

In [118]:
op_route_dir_groupby_cols = [
    "caltrans_district",
    "schedule_gtfs_dataset_key",
    "feed_key",
    "organization_name",
    "route_long_name",
    "route_type",
    "route_id",
    "direction_id",
]

In [120]:
agg_operator_route_dir1 = aggregation(
    transit_matters_df1,
    op_route_dir_groupby_cols,
    op_route_dir_groupby_cols,
    "trip_instance_key",
    "nunique",
)

In [123]:
la_30_test = agg_operator_route_dir1.loc[
    (agg_operator_route_dir1.organization_name == "City of Los Angeles")
    & (agg_operator_route_dir1.route_id == "30")
]

In [124]:
la_30_test_m = pd.melt(
    la_30_test,
    id_vars=["organization_name", "route_id", "direction_id"],
    value_vars=["pct_bunched", "pct_not_bunched"],
).rename(columns={"value": "percentage"})

alt.Chart(la_30_test_m).mark_bar().encode(
    x=alt.X(
        "variable:O",
        axis=alt.Axis(labelAngle=-45),
    ),
    y=alt.Y("percentage:Q", scale=alt.Scale(domain=[0, 100])),
    color=alt.Color(
        "variable:N",
        title="Bunched or Not",
        scale=alt.Scale(range=color_dict["tri_color"]),
    ),
    column="direction_id:N",
    tooltip=list(la_30_test_m.columns),
).properties(
    title={
        "text": "Total Occurences of Bunching for Operator-Route-Direction",
    },
    width=200,
    height=250,
)

In [125]:
sf_30_test = agg_operator_route_dir1.loc[
    (agg_operator_route_dir1.organization_name == "City and County of San Francisco")
    & (agg_operator_route_dir1.route_id == "30")
]

sf_30_test_m = pd.melt(
    sf_30_test,
    id_vars=["organization_name", "route_id", "direction_id"],
    value_vars=["pct_bunched", "pct_not_bunched"],
).rename(columns={"value": "percentage"})

alt.Chart(sf_30_test_m).mark_bar().encode(
    x=alt.X(
        "variable:O",
        axis=alt.Axis(labelAngle=-45),
    ),
    y=alt.Y("percentage:Q", scale=alt.Scale(domain=[0, 100])),
    color=alt.Color(
        "variable:N",
        title="Bunched or Not",
        scale=alt.Scale(range=color_dict["tri_color"]),
    ),
    column="direction_id:N",
    tooltip=list(sf_30_test_m.columns),
).properties(
    title={
        "text": "Total Occurences of Bunching for Operator-Route-Direction",
    },
    width=200,
    height=250,
)

In [161]:
sf_49_test = agg_operator_route_dir1.loc[
    (agg_operator_route_dir1.organization_name == "City and County of San Francisco")
    & (agg_operator_route_dir1.route_id == "49")
]

sf_49_test_m = pd.melt(
    sf_49_test,
    id_vars=["organization_name", "route_id", "direction_id"],
    value_vars=["pct_bunched", "pct_not_bunched"],
).rename(columns={"value": "percentage"})

alt.Chart(sf_49_test_m).mark_bar().encode(
    x=alt.X(
        "variable:O",
        axis=alt.Axis(labelAngle=-45),
    ),
    y=alt.Y("percentage:Q", scale=alt.Scale(domain=[0, 100])),
    color=alt.Color(
        "variable:N",
        title="Bunched or Not",
        scale=alt.Scale(range=color_dict["tri_color"]),
    ),
    column="direction_id:N",
    tooltip=list(sf_49_test_m.columns),
).properties(
    title={
        "text": "Total Occurences of Bunching for Operator-Route-Direction",
    },
    width=200,
    height=250,
)

#### Aggregate for `operator-route-direction-rt_arrival_hour`
* Testing a couple of options to see the difference.
* Do we care how many stops experience  bunching or how many trips experience bunching?
* If one stop for a trip is bunched, the whole trip is considered bunched...Is that too "draconian"?
    * Per my conversation with Katie, generally if one stop is bunched, the whole trip is considered bunched.

In [129]:
op_route_dir_hour_groupby_cols = [
    "caltrans_district",
    "schedule_gtfs_dataset_key",
    "feed_key",
    "organization_name",
    "route_long_name",
    "route_type",
    "route_id",
    "direction_id",
    "rt_hour",
]

In [131]:
agg_operator_route_dir_hour = aggregation(
    transit_matters_df1,
    op_route_dir_hour_groupby_cols,
    op_route_dir_hour_groupby_cols,
    "stop_id",
    "nunique",
)

In [141]:
agg_operator_trip_dir_hour2 = aggregation(
    transit_matters_df1,
    op_route_dir_hour_groupby_cols,
    op_route_dir_hour_groupby_cols,
    "trip_instance_key",
    "nunique",
)

In [137]:
one_route49_stop_bunched = agg_operator_route_dir_hour.loc[
    (
        agg_operator_route_dir_hour.organization_name
        == "City and County of San Francisco"
    )
    & (agg_operator_route_dir_hour.route_id == "49")
    & (agg_operator_route_dir_hour.direction_id == 1)
]

In [144]:
one_route49_test2_trip_bunched = agg_operator_trip_dir_hour2.loc[
    (
        agg_operator_trip_dir_hour2.organization_name
        == "City and County of San Francisco"
    )
    & (agg_operator_trip_dir_hour2.route_id == "49")
    & (agg_operator_trip_dir_hour2.direction_id == 1)
]

In [139]:
(
    alt.Chart(one_route49_stop_bunched)
    .mark_bar(size=10, color="#dd217d")
    .encode(
        x=alt.X("rt_hour", scale=alt.Scale(domain=[0, 24])),
        y=alt.Y("pct_bunched", scale=alt.Scale(domain=[0, 100])),
        tooltip=["pct_bunched", "rt_hour"],
    )
    .properties(width=800, height=400)
)

In [145]:
(
    alt.Chart(one_route49_test2_trip_bunched)
    .mark_bar(size=10, color="#dd217d")
    .encode(
        x=alt.X("rt_hour", scale=alt.Scale(domain=[0, 24])),
        y=alt.Y("pct_bunched", scale=alt.Scale(domain=[0, 100])),
        tooltip=["pct_bunched", "rt_hour"],
    )
    .properties(width=800, height=400)
)

#### Aggregate for `operator-route-direction-stop` grain.

In [146]:
op_route_dir_stop_groupby_cols = [
    "caltrans_district",
    "schedule_gtfs_dataset_key",
    "feed_key",
    "organization_name",
    "route_long_name",
    "route_type",
    "route_id",
    "direction_id",
    "stop_id",
    "stop_sequence",
]

In [147]:
final_two_minute = aggregation(
    two_minutes_df,
    op_route_dir_stop_groupby_cols,
    op_route_dir_stop_groupby_cols,
    "trip_instance_key",
    "nunique",
)

In [148]:
transit_matters_m1 = aggregation(
    transit_matters_df1,
    op_route_dir_stop_groupby_cols,
    op_route_dir_stop_groupby_cols,
    "trip_instance_key",
    "nunique",
)

In [149]:
def compare_approaches(
    stop_id: str, organization_name: str, route_id: str, stop_sequence: int
):
    transit_matter = transit_matters_m1.loc[
        (transit_matters_m1.stop_id == stop_id)
        & (transit_matters_m1.organization_name == organization_name)
        & (transit_matters_m1.route_id == route_id)
        & (transit_matters_m1.stop_sequence == stop_sequence)
    ]
    print("Transit Matters")
    display(transit_matter)

    two_min = final_two_minute.loc[
        (final_two_minute.stop_id == stop_id)
        & (final_two_minute.organization_name == organization_name)
        & (final_two_minute.route_id == route_id)
        & (final_two_minute.stop_sequence == stop_sequence)
    ]
    print("Two Minutes")
    display(two_min)
    total_trips = trips_routes_times4.loc[
        (trips_routes_times4.stop_id == stop_id)
        & (trips_routes_times4.organization_name == organization_name)
        & (trips_routes_times4.route_id == route_id)
        & (trips_routes_times4.stop_sequence == stop_sequence)
    ]

    og = trips_routes_times.loc[
        (trips_routes_times.stop_id == stop_id)
        & (trips_routes_times.organization_name == organization_name)
        & (trips_routes_times.route_id == route_id)
        & (trips_routes_times.stop_sequence == stop_sequence)
    ]

    display(total_trips.trip_instance_key.nunique())

    rt_chart = (
        alt.Chart(total_trips)
        .mark_circle(size=450, color="#dd217d")
        .encode(
            x="rt_hour",
            y="rt_min",
            tooltip=["converted_schd_arrival_tooltip", "converted_rt_arrival_tooltip"],
        )
        .properties(width=800, height=400)
    )

    schd_chart = (
        alt.Chart(total_trips)
        .mark_circle(size=450, color="#fcb40e")
        .encode(
            x="schd_hour",
            y="schd_min",
            tooltip=["converted_schd_arrival_tooltip", "converted_rt_arrival_tooltip"],
        )
        .properties(width=800, height=400)
    )

    title = f"Bunching for {organization_name}: Route {route_id}/Stop {stop_id}"
    chart = (rt_chart + schd_chart).properties(
        title={
            "text": title,
            "subtitle": "Orange dots represent scheduled arrivals, magenta dots represent actual arrivals ",
        }
    )
    display(chart)
    return og, total_trips

In [150]:
df_la1, df_la2 = compare_approaches(
    stop_id="15659",
    organization_name="Los Angeles County Metropolitan Transportation Authority",
    route_id="16-13172",
    stop_sequence=35,
)

Transit Matters


Unnamed: 0,caltrans_district,schedule_gtfs_dataset_key,feed_key,organization_name,route_long_name,route_type,route_id,direction_id,stop_id,stop_sequence,bunched,not_bunched,all_trip_instance_key,pct_bunched,pct_not_bunched
8760,07 - Los Angeles,0666caf3ec1ecc96b74f4477ee4bc939,608992664173210532aa3e6cc573be2f,Los Angeles County Metropolitan Transportation Authority,Metro Local Line,Bus,16-13172,0.0,15659,35,15,144,159,9.43,90.57


Two Minutes


Unnamed: 0,caltrans_district,schedule_gtfs_dataset_key,feed_key,organization_name,route_long_name,route_type,route_id,direction_id,stop_id,stop_sequence,bunched,not_bunched,all_trip_instance_key,pct_bunched,pct_not_bunched
5488,07 - Los Angeles,0666caf3ec1ecc96b74f4477ee4bc939,608992664173210532aa3e6cc573be2f,Los Angeles County Metropolitan Transportation Authority,Metro Local Line,Bus,16-13172,0.0,15659,35,14,145,159,8.81,91.19


159

In [151]:
df_sf1, df_sf2 = compare_approaches(
    stop_id="16299",
    organization_name="City and County of San Francisco",
    route_id="1",
    stop_sequence=7,
)

Transit Matters


Unnamed: 0,caltrans_district,schedule_gtfs_dataset_key,feed_key,organization_name,route_long_name,route_type,route_id,direction_id,stop_id,stop_sequence,bunched,not_bunched,all_trip_instance_key,pct_bunched,pct_not_bunched
1828,04 - Oakland,7cc0cb1871dfd558f11a2885c145d144,7f69c2fdaa134642f14064a0b64d1495,City and County of San Francisco,CALIFORNIA,Bus,1,0.0,16299,7,14,149,163,8.59,91.41


Two Minutes


Unnamed: 0,caltrans_district,schedule_gtfs_dataset_key,feed_key,organization_name,route_long_name,route_type,route_id,direction_id,stop_id,stop_sequence,bunched,not_bunched,all_trip_instance_key,pct_bunched,pct_not_bunched
844,04 - Oakland,7cc0cb1871dfd558f11a2885c145d144,7f69c2fdaa134642f14064a0b64d1495,City and County of San Francisco,CALIFORNIA,Bus,1,0.0,16299,7,27,136,163,16.56,83.44


163

In [152]:
df_sf3, df_sf4 = compare_approaches(
    stop_id="14115",
    organization_name="City and County of San Francisco",
    route_id="30",
    stop_sequence=14,
)

Transit Matters


Unnamed: 0,caltrans_district,schedule_gtfs_dataset_key,feed_key,organization_name,route_long_name,route_type,route_id,direction_id,stop_id,stop_sequence,bunched,not_bunched,all_trip_instance_key,pct_bunched,pct_not_bunched
3914,04 - Oakland,7cc0cb1871dfd558f11a2885c145d144,7f69c2fdaa134642f14064a0b64d1495,City and County of San Francisco,STOCKTON,Bus,30,0.0,14115,14,17,104,121,14.05,85.95


Two Minutes


Unnamed: 0,caltrans_district,schedule_gtfs_dataset_key,feed_key,organization_name,route_long_name,route_type,route_id,direction_id,stop_id,stop_sequence,bunched,not_bunched,all_trip_instance_key,pct_bunched,pct_not_bunched
2615,04 - Oakland,7cc0cb1871dfd558f11a2885c145d144,7f69c2fdaa134642f14064a0b64d1495,City and County of San Francisco,STOCKTON,Bus,30,0.0,14115,14,16,105,121,13.22,86.78


121

In [153]:
df_stc1, df_stc2 = compare_approaches(
    stop_id="62078",
    organization_name="Santa Clara Valley Transportation Authority",
    route_id="51",
    stop_sequence=13,
)

Transit Matters


Unnamed: 0,caltrans_district,schedule_gtfs_dataset_key,feed_key,organization_name,route_long_name,route_type,route_id,direction_id,stop_id,stop_sequence,bunched,not_bunched,all_trip_instance_key,pct_bunched,pct_not_bunched
6639,04 - Oakland,fb467982dcc77a7f9199bebe709bb700,3fccf089909fbdb3d725a5c15fb062cb,Santa Clara Valley Transportation Authority,Moffett Field - West Valley Coll,Bus,51,0.0,62078,13,1,2,3,33.33,66.67


Two Minutes


Unnamed: 0,caltrans_district,schedule_gtfs_dataset_key,feed_key,organization_name,route_long_name,route_type,route_id,direction_id,stop_id,stop_sequence,bunched,not_bunched,all_trip_instance_key,pct_bunched,pct_not_bunched
4206,04 - Oakland,fb467982dcc77a7f9199bebe709bb700,3fccf089909fbdb3d725a5c15fb062cb,Santa Clara Valley Transportation Authority,Moffett Field - West Valley Coll,Bus,51,0.0,62078,13,1,2,3,33.33,66.67


3

In [154]:
df_duarte1, df_duarte2 = compare_approaches(
    stop_id="2665",
    organization_name="City of Duarte",
    route_id="707",
    stop_sequence=3696,
)

Transit Matters


Unnamed: 0,caltrans_district,schedule_gtfs_dataset_key,feed_key,organization_name,route_long_name,route_type,route_id,direction_id,stop_id,stop_sequence,bunched,not_bunched,all_trip_instance_key,pct_bunched,pct_not_bunched
17865,07 - Los Angeles,f74424acf8c41e4c1e9fd42838c4875c,96358f776e5fcd8d2b6066507aed6645,City of Duarte,Montclair-Pomona- El Monte- L.A.,Bus,707,0.0,2665,3696,1,1,2,50.0,50.0


Two Minutes


Unnamed: 0,caltrans_district,schedule_gtfs_dataset_key,feed_key,organization_name,route_long_name,route_type,route_id,direction_id,stop_id,stop_sequence,bunched,not_bunched,all_trip_instance_key,pct_bunched,pct_not_bunched
10696,07 - Los Angeles,f74424acf8c41e4c1e9fd42838c4875c,96358f776e5fcd8d2b6066507aed6645,City of Duarte,Montclair-Pomona- El Monte- L.A.,Bus,707,0.0,2665,3696,1,1,2,50.0,50.0


2

In [155]:
df_duarte2

Unnamed: 0,service_date,caltrans_district,schedule_gtfs_dataset_key,feed_key,organization_name,route_long_name,route_type,route_id,direction_id,stop_id,stop_sequence,trip_instance_key,rt_arrival_sec,scheduled_arrival_sec,headway_minutes,converted_rt_arrival,converted_schd_arrival,delay_min,rt_hour,rt_min,schd_hour,schd_min,converted_schd_arrival_tooltip,converted_rt_arrival_tooltip,actual_arrival_lag_min
2915580,2024-05-22,07 - Los Angeles,f74424acf8c41e4c1e9fd42838c4875c,96358f776e5fcd8d2b6066507aed6645,City of Duarte,Montclair-Pomona- El Monte- L.A.,Bus,707,0.0,2665,3696,2715d5782ca784477f6b0f908e187407,63267,61356.0,16.57,2024-05-22 17:34:27,2024-05-22 17:02:36,31.85,17,34,17.0,2.0,2024-05-22 17:02:36,2024-05-22 17:34:27,
2915582,2024-05-22,07 - Los Angeles,f74424acf8c41e4c1e9fd42838c4875c,96358f776e5fcd8d2b6066507aed6645,City of Duarte,Montclair-Pomona- El Monte- L.A.,Bus,707,0.0,2665,3696,ec6da0c31365ce24abfdce8d57137f6a,63278,62256.0,16.57,2024-05-22 17:34:38,2024-05-22 17:17:36,17.03,17,34,17.0,17.0,2024-05-22 17:17:36,2024-05-22 17:34:38,0.18


In [156]:
df_vis1, df_vis2 = compare_approaches(
    stop_id="2307469",
    organization_name="City of Visalia",
    route_id="2042",
    stop_sequence=27,
)

Transit Matters


Unnamed: 0,caltrans_district,schedule_gtfs_dataset_key,feed_key,organization_name,route_long_name,route_type,route_id,direction_id,stop_id,stop_sequence,bunched,not_bunched,all_trip_instance_key,pct_bunched,pct_not_bunched
46605,06 - Fresno,3bda4652977200408690059ef2ec4b4d,0e89d1fd3bd2a09bbbd0d4f79ea5663b,City of Visalia,Route 9,Bus,2042,1.0,2307469,27,0,16,16,0.0,100.0


Two Minutes


Unnamed: 0,caltrans_district,schedule_gtfs_dataset_key,feed_key,organization_name,route_long_name,route_type,route_id,direction_id,stop_id,stop_sequence,bunched,not_bunched,all_trip_instance_key,pct_bunched,pct_not_bunched
39073,06 - Fresno,3bda4652977200408690059ef2ec4b4d,0e89d1fd3bd2a09bbbd0d4f79ea5663b,City of Visalia,Route 9,Bus,2042,1.0,2307469,27,0,16,16,0.0,100.0


16

In [157]:
df_sd1, df_sd2 = compare_approaches(
    stop_id="88949",
    organization_name="San Diego Metropolitan Transit System",
    route_id="834",
    stop_sequence=19,
)

Transit Matters


Unnamed: 0,caltrans_district,schedule_gtfs_dataset_key,feed_key,organization_name,route_long_name,route_type,route_id,direction_id,stop_id,stop_sequence,bunched,not_bunched,all_trip_instance_key,pct_bunched,pct_not_bunched
133910,11 - San Diego,baeeb157e85a901e47b828ef9fe75091,db8c6e0cf5ece2a8cdb5bdc71d049bd1,San Diego Metropolitan Transit System,West Santee Loop,Bus,834,0.0,88949,19,0,2,2,0.0,100.0


Two Minutes


Unnamed: 0,caltrans_district,schedule_gtfs_dataset_key,feed_key,organization_name,route_long_name,route_type,route_id,direction_id,stop_id,stop_sequence,bunched,not_bunched,all_trip_instance_key,pct_bunched,pct_not_bunched
132919,11 - San Diego,baeeb157e85a901e47b828ef9fe75091,db8c6e0cf5ece2a8cdb5bdc71d049bd1,San Diego Metropolitan Transit System,West Santee Loop,Bus,834,0.0,88949,19,0,2,2,0.0,100.0


2

In [158]:
df_ucsb1, df_ucsb2 = compare_approaches(
    stop_id="22",
    organization_name="Santa Barbara Metropolitan Transit District",
    route_id="28",
    stop_sequence=8,
)

Transit Matters


Unnamed: 0,caltrans_district,schedule_gtfs_dataset_key,feed_key,organization_name,route_long_name,route_type,route_id,direction_id,stop_id,stop_sequence,bunched,not_bunched,all_trip_instance_key,pct_bunched,pct_not_bunched
7236,05 - San Luis Obispo,239f3baf3dd3b9e9464f66a777f9897d,a3c82f955ca3b93746786deb7fe4fb0d,Santa Barbara Metropolitan Transit District,UCSB Shuttle,Bus,28,0.0,22,8,1,3,4,25.0,75.0


Two Minutes


Unnamed: 0,caltrans_district,schedule_gtfs_dataset_key,feed_key,organization_name,route_long_name,route_type,route_id,direction_id,stop_id,stop_sequence,bunched,not_bunched,all_trip_instance_key,pct_bunched,pct_not_bunched
35779,05 - San Luis Obispo,239f3baf3dd3b9e9464f66a777f9897d,a3c82f955ca3b93746786deb7fe4fb0d,Santa Barbara Metropolitan Transit District,UCSB Shuttle,Bus,28,0.0,22,8,0,4,4,0.0,100.0


4

In [159]:
df_ucsb2

Unnamed: 0,service_date,caltrans_district,schedule_gtfs_dataset_key,feed_key,organization_name,route_long_name,route_type,route_id,direction_id,stop_id,stop_sequence,trip_instance_key,rt_arrival_sec,scheduled_arrival_sec,headway_minutes,converted_rt_arrival,converted_schd_arrival,delay_min,rt_hour,rt_min,schd_hour,schd_min,converted_schd_arrival_tooltip,converted_rt_arrival_tooltip,actual_arrival_lag_min
989108,2024-05-22,05 - San Luis Obispo,239f3baf3dd3b9e9464f66a777f9897d,a3c82f955ca3b93746786deb7fe4fb0d,Santa Barbara Metropolitan Transit District,UCSB Shuttle,Bus,28,0.0,22,8,991d2e0dc3ef0a55d97f318f135f7fd2,33828,28813.0,28.3,2024-05-22 09:23:48,2024-05-22 08:00:13,83.58,9,23,8.0,0.0,2024-05-22 08:00:13,2024-05-22 09:23:48,
989109,2024-05-22,05 - San Luis Obispo,239f3baf3dd3b9e9464f66a777f9897d,a3c82f955ca3b93746786deb7fe4fb0d,Santa Barbara Metropolitan Transit District,UCSB Shuttle,Bus,28,0.0,22,8,b591464a2c0a45bfc69e2687b190f2ff,35927,57973.0,28.3,2024-05-22 09:58:47,2024-05-22 16:06:13,-367.43,9,58,16.0,6.0,2024-05-22 16:06:13,2024-05-22 09:58:47,34.98
989110,2024-05-22,05 - San Luis Obispo,239f3baf3dd3b9e9464f66a777f9897d,a3c82f955ca3b93746786deb7fe4fb0d,Santa Barbara Metropolitan Transit District,UCSB Shuttle,Bus,28,0.0,22,8,adb98752ffd22d6e5bbb3790f8568ad9,39478,47173.0,28.3,2024-05-22 10:57:58,2024-05-22 13:06:13,-128.25,10,57,13.0,6.0,2024-05-22 13:06:13,2024-05-22 10:57:58,59.18
989111,2024-05-22,05 - San Luis Obispo,239f3baf3dd3b9e9464f66a777f9897d,a3c82f955ca3b93746786deb7fe4fb0d,Santa Barbara Metropolitan Transit District,UCSB Shuttle,Bus,28,0.0,22,8,f5317a6f30d550d0b5911e555e07736a,73018,32053.0,28.3,2024-05-21 20:16:58,2024-05-23 08:54:13,-2197.25,20,16,8.0,54.0,2024-05-23 08:54:13,2024-05-21 20:16:58,-881.0


In [160]:
df_ucsb1[["rt_arrival_sec", "scheduled_arrival_sec"]]

Unnamed: 0,rt_arrival_sec,scheduled_arrival_sec
218675,73018,32053.0
218735,35927,57973.0
218788,33828,28813.0
3653131,39478,47173.0
