## Transit Bunching 
* `cd data-analyses/rt_segment_speeds && pip install -r requirements.txt && cd ../_shared_utils && make setup_env && cd ../gtfs_digest`
* [Issue](https://github.com/cal-itp/data-analyses/issues/1099)
### 10/25 To-Do
* Figure out how to address City of Visalia: one of the buses that is scheduled to arrive earlier arrives later than another bus. This leads to a negative scheduled arrival lag and makes it appear like there is a lot of bunching per the Transit Matters approach.
* Figure out if taking out stop sequence matters by the time we are applying the metric/calculating the arrival and scheduled lags. We don't need that fine of a grain. 



In [140]:
import datetime as dt

import geopandas as gpd
import merge_data
import numpy as np
import pandas as pd
from segment_speed_utils import gtfs_schedule_wrangling, helpers, time_series_utils
from shared_utils import catalog_utils, rt_dates, rt_utils
from update_vars import GTFS_DATA_DICT, RT_SCHED_GCS, SCHED_GCS

# https://github.com/cal-itp/data-analyses/blob/main/_shared_utils/shared_utils/gtfs_analytics_data.yml
GTFS_DATA_DICT = catalog_utils.get_catalog("gtfs_analytics_data")

from segment_speed_utils.project_vars import (
    COMPILED_CACHED_VIEWS,
    GTFS_DATA_DICT,
    PROJECT_CRS,
    RT_SCHED_GCS,
    SCHED_GCS,
    SEGMENT_GCS,
)

In [141]:
pd.options.display.max_columns = 100
pd.options.display.float_format = "{:.2f}".format
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [142]:
may_date = "2024-05-22"

In [143]:
drop_for_preview = [
    "schedule_gtfs_dataset_key",
    "trip_instance_key",
    "shape_array_key",
    "feed_key",
    "trip_id",
]

### Grab Routes

In [144]:
subset = [
    "schedule_gtfs_dataset_key",
    "route_id",
    "direction_id",
    "route_primary_direction",
    "service_date",
    "frequency",
]

In [145]:
GTFS_DATA_DICT.rt_vs_schedule_tables.sched_route_direction_metrics

'schedule_route_dir/schedule_route_direction_metrics'

In [146]:
route_dir = merge_data.concatenate_schedule_by_route_direction([may_date])

In [147]:
route_dir.head()

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id,time_period,route_primary_direction,avg_scheduled_service_minutes,avg_stop_miles,n_scheduled_trips,frequency,is_express,is_rapid,is_rail,is_coverage,is_downtown_local,is_local,service_date
0,015d67d5b75b5cf2b710bbadadfb75f5,17,0.0,all_day,Northbound,51.77,0.27,22,0.92,0.0,0.0,0.0,0.0,1.0,0.0,2024-05-22
1,015d67d5b75b5cf2b710bbadadfb75f5,17,0.0,offpeak,Northbound,51.77,0.27,10,0.62,0.0,0.0,0.0,0.0,1.0,0.0,2024-05-22
2,015d67d5b75b5cf2b710bbadadfb75f5,17,0.0,peak,Northbound,51.77,0.27,12,1.5,0.0,0.0,0.0,0.0,1.0,0.0,2024-05-22
3,015d67d5b75b5cf2b710bbadadfb75f5,17,1.0,all_day,Southbound,46.73,0.28,22,0.92,0.0,1.0,0.0,0.0,1.0,0.0,2024-05-22
4,015d67d5b75b5cf2b710bbadadfb75f5,17,1.0,offpeak,Southbound,46.73,0.28,11,0.69,0.0,1.0,0.0,0.0,1.0,0.0,2024-05-22


#### Attach operators and districts

In [148]:
# Grab Crosswalk
CROSSWALK = GTFS_DATA_DICT.schedule_tables.gtfs_key_crosswalk

In [149]:
crosswalk_cols = [
    "schedule_gtfs_dataset_key",
    "organization_name",
    "name",
    "caltrans_district",
]

In [150]:
crosswalk_df = (
    time_series_utils.concatenate_datasets_across_dates(
        SCHED_GCS, CROSSWALK, [may_date], data_type="df", columns=crosswalk_cols
    )
    .sort_values(["service_date"])
    .reset_index(drop=True)
)

In [151]:
crosswalk_df.shape

(189, 5)

In [152]:
routes_dir2 = pd.merge(
    route_dir, crosswalk_df, on="schedule_gtfs_dataset_key", how="left"
)

In [153]:
routes_dir2.head(2)

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id,time_period,route_primary_direction,avg_scheduled_service_minutes,avg_stop_miles,n_scheduled_trips,frequency,is_express,is_rapid,is_rail,is_coverage,is_downtown_local,is_local,service_date_x,organization_name,name,caltrans_district,service_date_y
0,015d67d5b75b5cf2b710bbadadfb75f5,17,0.0,all_day,Northbound,51.77,0.27,22,0.92,0.0,0.0,0.0,0.0,1.0,0.0,2024-05-22,Marin County Transit District,Bay Area 511 Marin Schedule,04 - Oakland,2024-05-22
1,015d67d5b75b5cf2b710bbadadfb75f5,17,0.0,offpeak,Northbound,51.77,0.27,10,0.62,0.0,0.0,0.0,0.0,1.0,0.0,2024-05-22,Marin County Transit District,Bay Area 511 Marin Schedule,04 - Oakland,2024-05-22


In [154]:
thousand_oaks = routes_dir2.loc[
    (routes_dir2.organization_name == "City of Thousand Oaks")
    & (routes_dir2.route_id == "3402")
    & (routes_dir2.time_period == "all_day")
].reset_index(drop=True)

In [155]:
visalia = routes_dir2.loc[
    (routes_dir2.organization_name == "City of Visalia")
    & (routes_dir2.route_id == "2042")
    & (routes_dir2.time_period == "all_day")
].reset_index(drop=True)

In [156]:
metro = routes_dir2.loc[
    (
        routes_dir2.organization_name
        == "Los Angeles County Metropolitan Transportation Authority"
    )
    & (routes_dir2.route_id == "204-13172")
    & (routes_dir2.time_period == "all_day")
].reset_index(drop=True)

In [157]:
# '33-13172'

metro_33 = routes_dir2.loc[
    (
        routes_dir2.organization_name
        == "Los Angeles County Metropolitan Transportation Authority"
    )
    & (routes_dir2.route_id == "33-13172")
    & (routes_dir2.time_period == "all_day")
].reset_index(drop=True)

In [158]:
routes = pd.concat([thousand_oaks, visalia, metro, metro_33])

In [159]:
routes.route_id.unique()

array(['3402', '2042', '204-13172', '33-13172'], dtype=object)

### Add Trips

In [160]:
TABLE = GTFS_DATA_DICT.schedule_downloads.trips

In [161]:
FILE = f"{COMPILED_CACHED_VIEWS}{TABLE}_{may_date}.parquet"

In [162]:
trips_subset = [
    "gtfs_dataset_key",
    "route_id",
    "trip_instance_key",
    "shape_array_key",
    "feed_key",
    "route_long_name",
    "direction_id",
    "route_type",
]

In [163]:
trips = pd.read_parquet(FILE)[trips_subset].rename(
    columns={"gtfs_dataset_key": "schedule_gtfs_dataset_key"}
)

In [164]:
trips_routes = pd.merge(
    trips,
    routes,
    on=["schedule_gtfs_dataset_key", "route_id", "direction_id"],
    how="inner",
)

In [165]:
trips_routes.shape

(570, 25)

In [166]:
trips_routes.route_id.nunique()

4

In [167]:
# https://gtfs.org/documentation/schedule/reference/#
route_type_crosswalk = {
    "route_type": ["0", "1", "2", "3", "4", "5", "6", "7", "11", "12"],
    "route_type_str": [
        "Tram, Streetcar, Light rail",
        "Subway, Metro",
        "Rail",
        "Bus",
        "Ferry.",
        "Cable tram.",
        "Aerial lift, suspended cable car (e.g., gondola lift, aerial tramway).",
        "Funicular.",
        "Trolleybus.",
        "Monorail.",
    ],
}

In [168]:
route_type_crosswalk_df = pd.DataFrame(route_type_crosswalk)

In [169]:
# Merge for route_type
trips_routes = pd.merge(
    trips_routes, route_type_crosswalk_df, on=["route_type"], how="left"
)

In [170]:
trips_routes = trips_routes.drop(columns=["route_type"])

In [171]:
trips_routes = trips_routes.rename(columns={"route_type_str": "route_type"})

### Get Stop Times 

In [172]:
rt_stop_times = pd.read_parquet(
    "gs://calitp-analytics-data/data-analyses/rt_vs_schedule/schedule_rt_stop_times_2024-05-22.parquet"
)

In [173]:
rt_stop_times.head(2)

Unnamed: 0,trip_id,stop_id,stop_sequence,scheduled_arrival_sec,schedule_gtfs_dataset_key,trip_instance_key,rt_arrival_sec
0,1d105244-776c-4b3f-af78-9c7ad78c2103,0b2443b6-b50f-452b-a749-464588ca93b8,8,60991.0,1fd2f07342d966919b15d5d37fda8cc8,45ae17540ca9fb5030c84dbb12e48e9a,61434
1,1d105244-776c-4b3f-af78-9c7ad78c2103,cd5650b0-9a18-4e78-aedc-385f3094fa0f,9,61179.0,1fd2f07342d966919b15d5d37fda8cc8,45ae17540ca9fb5030c84dbb12e48e9a,61616


In [174]:
# Find only stop times of trips that belong to high frequency trips
trips_routes_times = pd.merge(
    rt_stop_times,
    trips_routes,
    on=[
        "schedule_gtfs_dataset_key",
        "trip_instance_key",
    ],
    how="inner",
)

In [175]:
(trips_routes_times.scheduled_arrival_sec.isna().sum())

0

### Taking out shape_array_key for now.

In [176]:
# Rearrange: I want the stop sequence to be 1,2,3,4.
# Do I need to include stop_id?
trips_routes_times2 = trips_routes_times.sort_values(
    by=[
        "schedule_gtfs_dataset_key",
        "route_long_name",
        "direction_id",
        "stop_sequence",
        "stop_id",
        "rt_arrival_sec",
    ]
).reset_index(drop=True)

In [177]:
preview_sort_col = [
    "schedule_gtfs_dataset_key",
    "route_id",
    "stop_sequence",
    "rt_arrival_sec",
    "stop_id",
    "scheduled_arrival_sec",
]

In [178]:
trips_routes_times2 = trips_routes_times2.drop(columns=["service_date_x"]).rename(
    columns={"service_date_y": "service_date"}
)

In [179]:
trips_routes_times2.loc[
    (
        trips_routes_times2.schedule_gtfs_dataset_key
        == "0666caf3ec1ecc96b74f4477ee4bc939"
    )
    & (trips_routes_times2.route_id == "204-13172")
    & (trips_routes_times2.stop_sequence == 66)
][preview_sort_col]

Unnamed: 0,schedule_gtfs_dataset_key,route_id,stop_sequence,rt_arrival_sec,stop_id,scheduled_arrival_sec
14994,0666caf3ec1ecc96b74f4477ee4bc939,204-13172,66,23073,3961,23160.0
14995,0666caf3ec1ecc96b74f4477ee4bc939,204-13172,66,61233,3961,61260.0
30667,0666caf3ec1ecc96b74f4477ee4bc939,204-13172,66,1594,14027,87840.0
30668,0666caf3ec1ecc96b74f4477ee4bc939,204-13172,66,3117,14027,89460.0
30669,0666caf3ec1ecc96b74f4477ee4bc939,204-13172,66,4938,14027,91200.0
30670,0666caf3ec1ecc96b74f4477ee4bc939,204-13172,66,6568,14027,92880.0
30671,0666caf3ec1ecc96b74f4477ee4bc939,204-13172,66,8194,14027,94680.0
30672,0666caf3ec1ecc96b74f4477ee4bc939,204-13172,66,10328,14027,96480.0
30673,0666caf3ec1ecc96b74f4477ee4bc939,204-13172,66,12016,14027,98280.0
30674,0666caf3ec1ecc96b74f4477ee4bc939,204-13172,66,13693,14027,100080.0


### Convert scheduled and RT arrival times.

* If 82_800 < `scheduled_arrival_sec` < 86_400 but `rt_arrival_sec` is much lower, say 14_000 (around 4 AM), then perhaps the bus was scheduled to arrive on May 21 (the day before the service date) but arrived a little later, on the service date.

* If `scheduled_arrival_sec` > 86_400 and `rt_arrival_sec` is around 86_000, then both arrivals fall on the same service date.

In [180]:
trips_routes_times2["rt_arrival_sec"].describe()

count   34317.00
mean    46219.31
std     19698.95
min        24.00
25%     31428.00
50%     46452.00
75%     61794.00
max     86399.00
Name: rt_arrival_sec, dtype: float64

In [181]:
trips_routes_times2.loc[trips_routes_times2["scheduled_arrival_sec"] == 86_400].shape

(11, 29)

In [182]:
more_than_86400 = trips_routes_times2.loc[
    trips_routes_times2["scheduled_arrival_sec"] > 86_400
]

In [183]:
more_than_86400["scheduled_arrival_sec"].describe()

count     2326.00
mean     94761.44
std       5297.65
min      86460.00
25%      90180.00
50%      94320.00
75%      99000.00
max     106680.00
Name: scheduled_arrival_sec, dtype: float64

In [184]:
timestamp_subset = [
    "converted_schd_arrival",
    "converted_rt_arrival",
    "scheduled_arrival_sec",
    "rt_arrival_sec",
    "service_date",
]

In [185]:
def adjust_days_and_time(sched_arrival_seconds, date, rt_arrival_sec):
    """
    Turn a scheduled arrival, given in seconds, into a timestamp, using the
    real-time arrival to decide which calendar day it belongs to.

    Scheduled seconds in this dataset can run past 24 hours (86_400), while
    the real-time seconds stay below 86_400, so the two clocks can disagree
    about the day around midnight.

    Parameters:
    sched_arrival_seconds (int or float): Scheduled arrival in seconds.
    date (datetime-like or str): Service date; anything `pd.Timestamp`
        accepts.
    rt_arrival_sec (int): Real-time arrival in seconds.

    Returns:
    pd.Timestamp: Date and time for sched_arrival_seconds. Values that are
    still roughly a day away from the real-time arrival are expected to be
    corrected afterwards (see `deal_with_23_hours`).
    """
    # Coerce once so both branches accept the same inputs (Timestamp,
    # datetime, date, or an ISO string). Previously only the fallback branch
    # coerced, so a string date raised in the day-before branch.
    service_day = pd.Timestamp(date)

    # The real-time arrival is between 12-1 AM while the schedule sits
    # between 11 PM and 1 AM: treat the scheduled arrival as belonging to the
    # previous calendar day and keep only its time-of-day component.
    # NOTE(review): for schedules above 86_400 this yields a time roughly 24
    # hours before the real-time arrival; confirm that relying on the later
    # 23-hour correction for these rows is intended.
    rt_just_after_midnight = rt_arrival_sec < (60 * 60)
    sched_near_midnight = 82_800 < sched_arrival_seconds < 90_000
    if rt_just_after_midnight and sched_near_midnight:
        return (service_day - pd.Timedelta(days=1)) + pd.Timedelta(
            seconds=sched_arrival_seconds % 86400
        )

    # A rule wrapping schedules above 86_400 back onto the service date when
    # the real-time arrival is before noon was tried and then disabled; those
    # rows currently fall through to the default below.

    # No change: offset the service date by the scheduled seconds as-is.
    return service_day + pd.Timedelta(seconds=sched_arrival_seconds)

#### Subset to make the df smaller


In [186]:
subset = [
    "stop_id",
    "stop_sequence",
    "scheduled_arrival_sec",
    "schedule_gtfs_dataset_key",
    "trip_instance_key",
    "rt_arrival_sec",
    "route_id",
    "shape_array_key",
    "route_long_name",
    "direction_id",
    "organization_name",
    "caltrans_district",
    "service_date",
    "route_type",
    "feed_key",
]

In [187]:
# Take an explicit copy: new columns are assigned to this subset in the
# following cells, and assigning to a bare column selection can trigger
# pandas' SettingWithCopyWarning.
trips_routes_times2 = trips_routes_times2[subset].copy()

In [188]:
trips_routes_times2["converted_schd_arrival"] = trips_routes_times2.apply(
    lambda row: adjust_days_and_time(
        row["scheduled_arrival_sec"], row["service_date"], row["rt_arrival_sec"]
    ),
    axis=1,
)

In [189]:
trips_routes_times2["converted_rt_arrival"] = pd.to_datetime(
    trips_routes_times2["service_date"]
) + pd.to_timedelta(trips_routes_times2["rt_arrival_sec"] % 86400, unit="s")

### Deal with delays
* Some very extreme values.

In [190]:
def deal_with_23_hours(df: pd.DataFrame) -> pd.DataFrame:
    """
    Shift scheduled arrivals that are about a full day away from the
    real-time arrival back onto the same day, then recompute the delay.

    Steps:
    1. Compute `delay_min` = `converted_rt_arrival` - `converted_schd_arrival`
       in minutes and display its summary.
    2. Where the delay is 23 hours (1380 minutes) or more, the schedule sits a
       day too early: add one day to `converted_schd_arrival`.
       Where the delay is -23 hours or less, the schedule sits a day too
       late: subtract one day from `converted_schd_arrival`.
       Rows with a missing delay are left untouched.
    3. Recompute `delay_min` from the shifted schedule and display its
       summary again.

    Parameters:
    df (pandas.DataFrame): Must contain the datetime columns
        `converted_rt_arrival` and `converted_schd_arrival`.

    Returns:
    pandas.DataFrame: The same object that was passed in; `df` is modified in
    place (`converted_schd_arrival` updated, `delay_min` added/overwritten).

    Note: `display` is not defined in this function; it is expected to be
    provided by the notebook session.
    """
    threshold_min = 23 * 60
    one_day = pd.Timedelta(days=1)

    df["delay_min"] = (
        df["converted_rt_arrival"] - df["converted_schd_arrival"]
    ).dt.total_seconds() / 60

    display(df["delay_min"].describe())

    # Both masks are taken from the delay computed above, before any shift is
    # applied, so a row can be moved by at most one day. Comparisons with NaN
    # are False, so rows without a delay are never shifted.
    schedule_day_too_early = df["delay_min"] >= threshold_min
    schedule_day_too_late = df["delay_min"] <= -threshold_min

    # Real-time arrival is ~a day after the schedule: push the schedule forward.
    df.loc[schedule_day_too_early, "converted_schd_arrival"] += one_day
    # Real-time arrival is ~a day before the schedule: pull the schedule back.
    df.loc[schedule_day_too_late, "converted_schd_arrival"] -= one_day

    df["delay_min"] = (
        df["converted_rt_arrival"] - df["converted_schd_arrival"]
    ).dt.total_seconds() / 60

    display(df["delay_min"].describe())
    return df

In [191]:
more_than_86400 = deal_with_23_hours(trips_routes_times2)

count   34317.00
mean      -49.39
std       371.89
min     -1462.75
25%        -0.78
50%         0.93
75%         3.92
max      1452.15
Name: delay_min, dtype: float64

count   34317.00
mean        2.47
std         7.04
min       -22.75
25%        -0.52
50%         1.10
75%         3.87
max        99.27
Name: delay_min, dtype: float64

In [192]:
trips_routes_times2.iloc[15717]

stop_id                                                                          3784
stop_sequence                                                                      75
scheduled_arrival_sec                                                        86580.00
schedule_gtfs_dataset_key                            0666caf3ec1ecc96b74f4477ee4bc939
trip_instance_key                                    bb4b0ef88b4c640626cde94b313a4020
rt_arrival_sec                                                                    872
route_id                                                                     33-13172
shape_array_key                                      f4b24240020ddbe8f7898617283c086c
route_long_name                                                      Metro Local Line
direction_id                                                                     0.00
organization_name            Los Angeles County Metropolitan Transportation Authority
caltrans_district                                     

In [193]:
trips_routes_times2.iloc[31237]

stop_id                                                                         14012
stop_sequence                                                                      69
scheduled_arrival_sec                                                       100920.00
schedule_gtfs_dataset_key                            0666caf3ec1ecc96b74f4477ee4bc939
trip_instance_key                                    7c14fd47eb441b0cdd896089d3996443
rt_arrival_sec                                                                  14770
route_id                                                                     33-13172
shape_array_key                                      c6d9a42fec97c3496482134203a8d9c6
route_long_name                                                      Metro Local Line
direction_id                                                                     1.00
organization_name            Los Angeles County Metropolitan Transportation Authority
caltrans_district                                     

In [194]:
percentiles = [0.01, 0.02, 0.05, 0.1, 0.9, 0.95, 0.98, 0.99]

### Help, can't fix everything... how to address these edge cases?

In [195]:
trips_routes_times2.converted_schd_arrival.describe()

  trips_routes_times2.converted_schd_arrival.describe()


count                   34317
unique                   1452
top       2024-05-22 16:36:00
freq                       46
first     2024-05-21 23:51:00
last      2024-05-23 00:02:00
Name: converted_schd_arrival, dtype: object

In [196]:
trips_routes_times2[trips_routes_times2['converted_schd_arrival'].dt.strftime('%Y-%m-%d') == '2024-05-21'].head(2)

Unnamed: 0,stop_id,stop_sequence,scheduled_arrival_sec,schedule_gtfs_dataset_key,trip_instance_key,rt_arrival_sec,route_id,shape_array_key,route_long_name,direction_id,organization_name,caltrans_district,service_date,route_type,feed_key,converted_schd_arrival,converted_rt_arrival,delay_min
3122,6915,15,86280.0,0666caf3ec1ecc96b74f4477ee4bc939,96aeba64c5c13683c1f11e01659cd5be,26,33-13172,f4b24240020ddbe8f7898617283c086c,Metro Local Line,0.0,Los Angeles County Metropolitan Transportation Authority,07 - Los Angeles,2024-05-22,Bus,608992664173210532aa3e6cc573be2f,2024-05-21 23:58:00,2024-05-22 00:00:26,2.43
3412,6969,16,86340.0,0666caf3ec1ecc96b74f4477ee4bc939,96aeba64c5c13683c1f11e01659cd5be,72,33-13172,f4b24240020ddbe8f7898617283c086c,Metro Local Line,0.0,Los Angeles County Metropolitan Transportation Authority,07 - Los Angeles,2024-05-22,Bus,608992664173210532aa3e6cc573be2f,2024-05-21 23:59:00,2024-05-22 00:01:12,2.2


In [197]:
trips_routes_times2[trips_routes_times2['converted_schd_arrival'].dt.strftime('%Y-%m-%d') == '2024-05-23'].head(2)

Unnamed: 0,stop_id,stop_sequence,scheduled_arrival_sec,schedule_gtfs_dataset_key,trip_instance_key,rt_arrival_sec,route_id,shape_array_key,route_long_name,direction_id,organization_name,caltrans_district,service_date,route_type,feed_key,converted_schd_arrival,converted_rt_arrival,delay_min
16721,475,5,86520.0,0666caf3ec1ecc96b74f4477ee4bc939,3eabc6856371e291067b81b9a9b74f85,86152,33-13172,33604cd768d576d5a8c87112fb4ca942,Metro Local Line,1.0,Los Angeles County Metropolitan Transportation Authority,07 - Los Angeles,2024-05-22,Bus,608992664173210532aa3e6cc573be2f,2024-05-23 00:02:00,2024-05-22 23:55:52,-6.13
18085,140960,11,86400.0,0666caf3ec1ecc96b74f4477ee4bc939,252dc54fdd5952df75e1e0902ab298f4,86383,204-13172,013c88496835cd19be42988f8925c274,Metro Local Line,1.0,Los Angeles County Metropolitan Transportation Authority,07 - Los Angeles,2024-05-22,Bus,608992664173210532aa3e6cc573be2f,2024-05-23 00:00:00,2024-05-22 23:59:43,-0.28


In [198]:
trips_routes_times2.converted_rt_arrival.describe()

  trips_routes_times2.converted_rt_arrival.describe()


count                   34317
unique                  27103
top       2024-05-22 14:10:53
freq                        7
first     2024-05-22 00:00:24
last      2024-05-22 23:59:59
Name: converted_rt_arrival, dtype: object

#### Throw away things more than 2 hrs because that is not bunching.

In [199]:
print(trips_routes_times2.delay_min.describe(percentiles))

count   34317.00
mean        2.47
std         7.04
min       -22.75
1%         -4.83
2%         -3.67
5%         -2.45
10%        -1.68
50%         1.10
90%         7.72
95%        10.60
98%        14.95
99%        18.03
max        99.27
Name: delay_min, dtype: float64


In [200]:
trips_routes_times2 = trips_routes_times2.loc[
    trips_routes_times2.delay_min < 120
].reset_index(drop=True)

In [201]:
trips_routes_times2 = trips_routes_times2.loc[
    trips_routes_times2.delay_min > -120
].reset_index(drop=True)

### Calculate the actual & scheduled headway at the `operator-route-direction_id-stop_sequence-stop_id` grain


In [202]:
groupby_cols = [
    "schedule_gtfs_dataset_key",
    "route_long_name",
    "direction_id",
    "stop_id",
    "stop_sequence",
]

In [203]:
trips_routes_times2["actual_arrival_lag_min"] = (
    trips_routes_times2.groupby(groupby_cols)["converted_rt_arrival"]
    .diff()
    .dt.total_seconds()
    / 60
)

In [204]:
trips_routes_times2["scheduled_arrival_lag_min"] = (
    trips_routes_times2.groupby(groupby_cols)["converted_schd_arrival"]
    .diff()
    .dt.total_seconds()
    / 60
)

### Also throw away any actual arrival lags that are above 2 hours b/c  that's not bunching?

In [205]:
trips_routes_times2["scheduled_arrival_lag_min"].describe(percentiles)

count   33641.00
mean       18.31
std        20.69
min       -45.00
1%        -10.00
2%          9.00
5%         10.00
10%        10.00
50%        15.00
90%        30.00
95%        40.00
98%        60.00
99%        81.00
max       635.00
Name: scheduled_arrival_lag_min, dtype: float64

In [206]:
trips_routes_times2["actual_arrival_lag_min"].describe(percentiles)

count   33641.00
mean       18.46
std        19.61
min         0.00
1%          1.23
2%          2.08
5%          4.67
10%         7.47
50%        14.87
90%        31.18
95%        42.37
98%        54.63
99%        70.77
max       636.00
Name: actual_arrival_lag_min, dtype: float64

In [207]:
preview_time_col = [
    "schedule_gtfs_dataset_key",
    "route_id",
    "stop_id",
    "stop_sequence",
    "converted_rt_arrival",
    "actual_arrival_lag_min",
    "converted_schd_arrival",
    "scheduled_arrival_lag_min",
]

In [208]:
trips_routes_times2.loc[trips_routes_times2["actual_arrival_lag_min"] > 600][
    preview_time_col
]

Unnamed: 0,schedule_gtfs_dataset_key,route_id,stop_id,stop_sequence,converted_rt_arrival,actual_arrival_lag_min,converted_schd_arrival,scheduled_arrival_lag_min
14995,0666caf3ec1ecc96b74f4477ee4bc939,204-13172,3961,66,2024-05-22 17:00:33,636.0,2024-05-22 17:01:00,635.0


In [209]:
trips_routes_times2.loc[
    (
        trips_routes_times2.schedule_gtfs_dataset_key
        == "0666caf3ec1ecc96b74f4477ee4bc939"
    )
    & (trips_routes_times2.route_id == "204-13172")
    & (trips_routes_times2.stop_id == "3961")
][preview_time_col]

Unnamed: 0,schedule_gtfs_dataset_key,route_id,stop_id,stop_sequence,converted_rt_arrival,actual_arrival_lag_min,converted_schd_arrival,scheduled_arrival_lag_min
14994,0666caf3ec1ecc96b74f4477ee4bc939,204-13172,3961,66,2024-05-22 06:24:33,,2024-05-22 06:26:00,
14995,0666caf3ec1ecc96b74f4477ee4bc939,204-13172,3961,66,2024-05-22 17:00:33,636.0,2024-05-22 17:01:00,635.0


### Many lags are actually empty b/c it's the first of that groupby-sequence.

In [210]:
trips_routes_times2[trips_routes_times2["scheduled_arrival_lag_min"].isna()][
    preview_time_col
].sample()

Unnamed: 0,schedule_gtfs_dataset_key,route_id,stop_id,stop_sequence,converted_rt_arrival,actual_arrival_lag_min,converted_schd_arrival,scheduled_arrival_lag_min
29695,0666caf3ec1ecc96b74f4477ee4bc939,204-13172,14025,62,2024-05-22 00:20:39,,2024-05-22 00:21:00,


In [211]:
trips_routes_times2.loc[
    (
        trips_routes_times2.schedule_gtfs_dataset_key
        == "0666caf3ec1ecc96b74f4477ee4bc939"
    )
    & (trips_routes_times2.route_id == "204-13172")
    & (trips_routes_times2.stop_sequence == 2)
][preview_time_col]

Unnamed: 0,schedule_gtfs_dataset_key,route_id,stop_id,stop_sequence,converted_rt_arrival,actual_arrival_lag_min,converted_schd_arrival,scheduled_arrival_lag_min
128,0666caf3ec1ecc96b74f4477ee4bc939,204-13172,7093,2,2024-05-22 00:06:52,,2024-05-22 00:08:00,
129,0666caf3ec1ecc96b74f4477ee4bc939,204-13172,7093,2,2024-05-22 00:44:22,37.5,2024-05-22 00:41:00,33.0
130,0666caf3ec1ecc96b74f4477ee4bc939,204-13172,7093,2,2024-05-22 01:12:29,28.12,2024-05-22 01:11:00,30.0
131,0666caf3ec1ecc96b74f4477ee4bc939,204-13172,7093,2,2024-05-22 01:39:47,27.3,2024-05-22 01:41:00,30.0
132,0666caf3ec1ecc96b74f4477ee4bc939,204-13172,7093,2,2024-05-22 02:11:40,31.88,2024-05-22 02:11:00,30.0
133,0666caf3ec1ecc96b74f4477ee4bc939,204-13172,7093,2,2024-05-22 02:39:14,27.57,2024-05-22 02:41:00,30.0
134,0666caf3ec1ecc96b74f4477ee4bc939,204-13172,7093,2,2024-05-22 03:12:43,33.48,2024-05-22 03:11:00,30.0
135,0666caf3ec1ecc96b74f4477ee4bc939,204-13172,7093,2,2024-05-22 03:41:05,28.37,2024-05-22 03:42:00,31.0
136,0666caf3ec1ecc96b74f4477ee4bc939,204-13172,7093,2,2024-05-22 04:11:44,30.65,2024-05-22 04:12:00,30.0
137,0666caf3ec1ecc96b74f4477ee4bc939,204-13172,7093,2,2024-05-22 04:45:07,33.38,2024-05-22 04:49:00,37.0


In [212]:
trips_routes_times2.loc[
    (
        trips_routes_times2.schedule_gtfs_dataset_key
        == "0666caf3ec1ecc96b74f4477ee4bc939"
    )
    & (trips_routes_times2.route_id == "204-13172")
    & (trips_routes_times2.stop_sequence == 59)
][preview_time_col]

Unnamed: 0,schedule_gtfs_dataset_key,route_id,stop_id,stop_sequence,converted_rt_arrival,actual_arrival_lag_min,converted_schd_arrival,scheduled_arrival_lag_min
13607,0666caf3ec1ecc96b74f4477ee4bc939,204-13172,7091,59,2024-05-22 00:31:52,,2024-05-22 00:27:00,
13608,0666caf3ec1ecc96b74f4477ee4bc939,204-13172,7091,59,2024-05-22 00:56:42,24.83,2024-05-22 00:57:00,30.0
13609,0666caf3ec1ecc96b74f4477ee4bc939,204-13172,7091,59,2024-05-22 01:35:40,38.97,2024-05-22 01:27:00,30.0
13610,0666caf3ec1ecc96b74f4477ee4bc939,204-13172,7091,59,2024-05-22 01:58:05,22.42,2024-05-22 01:57:00,30.0
13611,0666caf3ec1ecc96b74f4477ee4bc939,204-13172,7091,59,2024-05-22 02:27:17,29.2,2024-05-22 02:27:00,30.0
13612,0666caf3ec1ecc96b74f4477ee4bc939,204-13172,7091,59,2024-05-22 02:57:36,30.32,2024-05-22 02:57:00,30.0
13613,0666caf3ec1ecc96b74f4477ee4bc939,204-13172,7091,59,2024-05-22 03:26:47,29.18,2024-05-22 03:27:00,30.0
13614,0666caf3ec1ecc96b74f4477ee4bc939,204-13172,7091,59,2024-05-22 04:00:15,33.47,2024-05-22 03:57:00,30.0
13615,0666caf3ec1ecc96b74f4477ee4bc939,204-13172,7091,59,2024-05-22 04:25:41,25.43,2024-05-22 04:27:00,30.0
13616,0666caf3ec1ecc96b74f4477ee4bc939,204-13172,7091,59,2024-05-22 05:03:26,37.75,2024-05-22 04:57:00,30.0


In [213]:
trips_routes_times2.loc[
    (
        trips_routes_times2.schedule_gtfs_dataset_key
        == "0666caf3ec1ecc96b74f4477ee4bc939"
    )
    & (trips_routes_times2.route_id == "204-13172")
    & (trips_routes_times2.stop_sequence == 46)& (trips_routes_times2.stop_id == "5685")
][preview_time_col]

Unnamed: 0,schedule_gtfs_dataset_key,route_id,stop_id,stop_sequence,converted_rt_arrival,actual_arrival_lag_min,converted_schd_arrival,scheduled_arrival_lag_min
10395,0666caf3ec1ecc96b74f4477ee4bc939,204-13172,5685,46,2024-05-22 00:22:02,,2024-05-22 00:16:00,
10396,0666caf3ec1ecc96b74f4477ee4bc939,204-13172,5685,46,2024-05-22 00:46:39,24.62,2024-05-22 00:46:00,30.0
10397,0666caf3ec1ecc96b74f4477ee4bc939,204-13172,5685,46,2024-05-22 01:24:30,37.85,2024-05-22 01:16:00,30.0
10398,0666caf3ec1ecc96b74f4477ee4bc939,204-13172,5685,46,2024-05-22 01:48:25,23.92,2024-05-22 01:46:00,30.0
10399,0666caf3ec1ecc96b74f4477ee4bc939,204-13172,5685,46,2024-05-22 02:16:15,27.83,2024-05-22 02:16:00,30.0
10400,0666caf3ec1ecc96b74f4477ee4bc939,204-13172,5685,46,2024-05-22 02:44:30,28.25,2024-05-22 02:46:00,30.0
10401,0666caf3ec1ecc96b74f4477ee4bc939,204-13172,5685,46,2024-05-22 03:13:04,28.57,2024-05-22 03:16:00,30.0
10402,0666caf3ec1ecc96b74f4477ee4bc939,204-13172,5685,46,2024-05-22 03:48:36,35.53,2024-05-22 03:46:00,30.0
10403,0666caf3ec1ecc96b74f4477ee4bc939,204-13172,5685,46,2024-05-22 04:16:06,27.5,2024-05-22 04:16:00,30.0
10404,0666caf3ec1ecc96b74f4477ee4bc939,204-13172,5685,46,2024-05-22 04:52:55,36.82,2024-05-22 04:46:00,30.0


In [214]:
# rt_stop_times4 = rt_stop_times4.fillna(0)

### Transit Matters Method

In [215]:
transit_matters_df1 = trips_routes_times2.copy()

In [216]:
transit_matters_df1["pct_actual_schd_headway"] = (
    transit_matters_df1.actual_arrival_lag_min
    / transit_matters_df1.scheduled_arrival_lag_min
)

In [217]:
transit_matters_df1["bunched_y_n"] = np.where(
    transit_matters_df1["pct_actual_schd_headway"] < 0.25, "bunched", "not bunched"
)

In [218]:
transit_matters_df1.bunched_y_n.value_counts() / len(transit_matters_df1)

not bunched   0.97
bunched       0.03
Name: bunched_y_n, dtype: float64

#### Aggregate.
* At this point, the sequence doesn't matter; we just care about how bunched the traffic is around one particular stop.
* See how many trips for that grain are considered "bunched" or not.

In [219]:
def agg_final_df(df: pd.DataFrame) -> pd.DataFrame:
    """
    Summarize the share of bunched trips for each operator/route/direction/stop.

    Args:
        df: one row per trip arrival, carrying the grouping columns listed
            below plus `trip_instance_key` and a `bunched_y_n` column whose
            values are "bunched" or "not bunched".

    Returns:
        One row per grouping combination served by more than one trip, with
        `all_trips` (bunched + not bunched unique trips) and
        `per_trip_bunched_per_stop` (bunched trips / all trips).

    Note:
        pandas `groupby` drops rows with a null in any grouping column by
        default, so such rows do not contribute to the counts.
    """
    keys = [
        "caltrans_district",
        "schedule_gtfs_dataset_key",
        "feed_key",
        "organization_name",
        "route_long_name",
        "route_type",
        "route_id",
        "direction_id",
        "stop_id",
        "stop_sequence",
    ]

    def _unique_trips(label: str, count_col: str) -> pd.DataFrame:
        # Count unique trips per grouping combination for one bunching label.
        labelled = df.loc[df["bunched_y_n"] == label].reset_index(drop=True)
        counts = (
            labelled.groupby(keys).agg({"trip_instance_key": "nunique"}).reset_index()
        )
        return counts.rename(columns={"trip_instance_key": count_col})

    not_bunched_counts = _unique_trips("not bunched", "not_bunched_trips")
    bunched_counts = _unique_trips("bunched", "bunched_trips")

    # Outer merge keeps combinations that only appear under one label;
    # the missing count on the other side becomes 0.
    merged = not_bunched_counts.merge(bunched_counts, on=keys, how="outer").fillna(0)

    merged["all_trips"] = merged["not_bunched_trips"] + merged["bunched_trips"]
    merged["per_trip_bunched_per_stop"] = merged["bunched_trips"] / merged["all_trips"]

    # A combination served by a single trip cannot show bunching, so drop it.
    multi_trip = merged["all_trips"] > 1
    return (
        merged.loc[multi_trip]
        .reset_index(drop=True)
        .drop(columns=["not_bunched_trips", "bunched_trips"])
    )

In [220]:
transit_matters_m1 = agg_final_df(transit_matters_df1)

In [221]:
# transit_matters_m1 = (
#   transit_matters_m1.sort_values(by=["all_trips"], ascending=False)
#   .drop_duplicates(subset=transit_matters_agg)
#   .reset_index(drop=True)
# )

### Swapped order of a bus is messing with the transit matters metric.

In [222]:
# Columns for inspecting the headway calculation: the actual and scheduled
# arrivals with their lags, followed by the derived ratio and bunching label.
_arrival_cols = [
    "converted_rt_arrival",
    "actual_arrival_lag_min",
    "converted_schd_arrival",
    "scheduled_arrival_lag_min",
]
_metric_cols = ["pct_actual_schd_headway", "bunched_y_n"]
preview_cols = _arrival_cols + _metric_cols

In [223]:
# Rows for one City of Visalia stop / route / shape, to inspect arrival by arrival.
visalia_mask = (
    transit_matters_df1["stop_id"].eq("2307719")
    & transit_matters_df1["organization_name"].eq("City of Visalia")
    & transit_matters_df1["route_id"].eq("2042")
    & transit_matters_df1["shape_array_key"].eq("60da59c7000ea5dcb5f845d8fa227f14")
)
example2 = transit_matters_df1.loc[visalia_mask]

In [224]:
example2[preview_cols]

Unnamed: 0,converted_rt_arrival,actual_arrival_lag_min,converted_schd_arrival,scheduled_arrival_lag_min,pct_actual_schd_headway,bunched_y_n
33884,2024-05-22 06:32:47,,2024-05-22 06:42:00,,,not bunched
33885,2024-05-22 07:26:41,53.9,2024-05-22 07:27:00,45.0,1.2,not bunched
33886,2024-05-22 08:06:11,39.5,2024-05-22 08:12:00,45.0,0.88,not bunched
33887,2024-05-22 08:56:57,50.77,2024-05-22 08:57:00,45.0,1.13,not bunched
33888,2024-05-22 09:37:59,41.03,2024-05-22 09:42:00,45.0,0.91,not bunched
33889,2024-05-22 10:27:26,49.45,2024-05-22 10:27:00,45.0,1.1,not bunched
33890,2024-05-22 11:10:05,42.65,2024-05-22 11:12:00,45.0,0.95,not bunched
33891,2024-05-22 12:01:01,50.93,2024-05-22 11:57:00,45.0,1.13,not bunched
33892,2024-05-22 12:38:08,37.12,2024-05-22 12:42:00,45.0,0.82,not bunched
33893,2024-05-22 13:27:10,49.03,2024-05-22 13:27:00,45.0,1.09,not bunched


### Use 2 minute benchmark
* [Source](https://static1.squarespace.com/static/533b9a24e4b01d79d0ae4376/t/645e82de1f570b31497c44dc/1683915486889/TransitMatters-Headwaymanagement.pdf)
* Excerpt: "Justifying the use of headway maintenance. For example, in April 2022 the 66 bus significantly bunched around several stops. When bunching is defined as buses that run within two minutes or less of each other, inbound buses towards Nubian Square bunched 10% of the time at Brigham Circle, 9% at Brookline Village and Roxbury Crossing, and 8% of the time at Coolidge Corner. Bunching is even more dramatic outbound towards Harvard Square where buses bunched over 35% of the time at Winship St, 13% at Coolidge Corner and Harvard Ave at Commonwealth Ave, and 12% at North Harvard St at Western Ave. View more data about bus bunching through the TransitMatters Data Dashboard here."

* To Do: add back in route  & operator information

In [225]:
two_minutes_df = trips_routes_times2.copy()

In [226]:
# Flag a row as bunched when the actual arrival lag is 2 minutes or less.
# Rows with a missing lag compare as False and are labelled "not bunched".
within_two_min = two_minutes_df["actual_arrival_lag_min"].le(2)
two_minutes_df["bunched_y_n"] = np.where(within_two_min, "bunched", "not bunched")

In [227]:
two_minutes_df.bunched_y_n.value_counts()

not bunched    33678
bunched          639
Name: bunched_y_n, dtype: int64

In [228]:
final_two_minute = agg_final_df(two_minutes_df)

In [229]:
final_two_minute.loc[
    (final_two_minute.stop_id == "2307695")
    & (final_two_minute.organization_name == "City of Visalia")
    & (final_two_minute.route_id == "2042")
]

Unnamed: 0,caltrans_district,schedule_gtfs_dataset_key,feed_key,organization_name,route_long_name,route_type,route_id,direction_id,stop_id,stop_sequence,all_trips,per_trip_bunched_per_stop
38,06 - Fresno,3bda4652977200408690059ef2ec4b4d,0e89d1fd3bd2a09bbbd0d4f79ea5663b,City of Visalia,Route 9,Bus,2042,1.0,2307695,16,18.0,0.0


### Comparing both outcomes
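* Per the summaries in this notebook, the Transit Matters approach flags more bunched trips than the 2 minute approach (mean share of bunched trips per stop of 0.03 vs. 0.01; max of 0.19 vs. 0.11).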
* Add back in schedule_gtfs_key and then grab stop level data from the warehouse.

In [230]:
final_two_minute.per_trip_bunched_per_stop.describe(percentiles)

count   675.00
mean      0.01
std       0.02
min       0.00
1%        0.00
2%        0.00
5%        0.00
10%       0.00
50%       0.00
90%       0.05
95%       0.06
98%       0.08
99%       0.09
max       0.11
Name: per_trip_bunched_per_stop, dtype: float64

In [231]:
transit_matters_m1.per_trip_bunched_per_stop.describe(percentiles)

count   675.00
mean      0.03
std       0.05
min       0.00
1%        0.00
2%        0.00
5%        0.00
10%       0.00
50%       0.00
90%       0.10
95%       0.17
98%       0.17
99%       0.17
max       0.19
Name: per_trip_bunched_per_stop, dtype: float64

### Make Visuals

In [232]:
# Color range passed to the chart's color scale; the final color is repeated
# three times.
freq_range = ["#ccbb44", "#e9d868", "#fcb40e", "#ff9c42", "#fc5c04"] + ["#dd217d"] * 3

In [233]:
import altair as alt

In [234]:
# Split the actual arrival timestamp into hour and minute columns (used as chart axes).
rt_arrival_ts = trips_routes_times2["converted_rt_arrival"]
trips_routes_times2["hour"] = rt_arrival_ts.dt.hour
trips_routes_times2["min"] = rt_arrival_ts.dt.minute

In [236]:
trips_routes_times2[["hour","min","converted_rt_arrival"]].sample(10)

Unnamed: 0,hour,min,converted_rt_arrival
25006,21,43,2024-05-22 21:43:50
2460,16,11,2024-05-22 16:11:10
9612,4,19,2024-05-22 04:19:08
4836,16,34,2024-05-22 16:34:19
14768,19,8,2024-05-22 19:08:29
26603,18,27,2024-05-22 18:27:41
9015,10,48,2024-05-22 10:48:54
13945,22,3,2024-05-22 22:03:44
26143,4,43,2024-05-22 04:43:22
27925,18,45,2024-05-22 18:45:06


In [244]:
def compare_approaches(
    stop_id: str, organization_name: str, route_id: str, stop_sequence: int = None
) -> pd.DataFrame:
    """
    Show the Transit Matters and 2-minute results side by side for one stop.

    Displays, in order: the matching row(s) of `transit_matters_m1`, the
    matching row(s) of `final_two_minute`, the number of unique
    `trip_instance_key` values among the matching rows of
    `trips_routes_times2`, and a chart of those rows (hour vs. minute of
    arrival) colored by `actual_arrival_lag_min`.

    Relies on notebook-level names: `transit_matters_m1`, `final_two_minute`,
    `trips_routes_times2`, `freq_range`, `alt`, and `display`.

    Args:
        stop_id: stop to look at.
        organization_name: operator the stop belongs to.
        route_id: route serving the stop.
        stop_sequence: optional; when given, rows are also restricted to this
            stop sequence. When None, every stop sequence for the stop is kept.

    Returns:
        The rows of `trips_routes_times2` that match the filters.
    """

    def _rows_for_stop(df: pd.DataFrame) -> pd.DataFrame:
        # Same filter for all three tables so the displays stay comparable.
        keep = (
            (df.stop_id == stop_id)
            & (df.organization_name == organization_name)
            & (df.route_id == route_id)
        )
        if stop_sequence is not None:
            keep = keep & (df.stop_sequence == stop_sequence)
        return df.loc[keep]

    transit_matter = _rows_for_stop(transit_matters_m1)
    display(transit_matter)

    two_min = _rows_for_stop(final_two_minute)
    display(two_min)

    total_trips = _rows_for_stop(trips_routes_times2)
    display(total_trips.trip_instance_key.nunique())

    chart = (
        alt.Chart(total_trips)
        .mark_circle(size=500)
        .encode(
            x="hour",
            y="min",
            color=alt.Color(
                "actual_arrival_lag_min",
                scale=alt.Scale(range=freq_range),
            ),
            tooltip=["hour", "min", "actual_arrival_lag_min"],
        )
        .properties(width=800, height=400)
    )
    display(chart)
    return total_trips

In [245]:
test1 = compare_approaches(
    stop_id="5685",
    organization_name="Los Angeles County Metropolitan Transportation Authority",
    route_id="204-13172",
    stop_sequence = 46
)

Unnamed: 0,caltrans_district,schedule_gtfs_dataset_key,feed_key,organization_name,route_long_name,route_type,route_id,direction_id,stop_id,stop_sequence,all_trips,per_trip_bunched_per_stop
99,07 - Los Angeles,0666caf3ec1ecc96b74f4477ee4bc939,608992664173210532aa3e6cc573be2f,Los Angeles County Metropolitan Transportation Authority,Metro Local Line,Bus,204-13172,0.0,5685,46,105.0,0.1


Unnamed: 0,caltrans_district,schedule_gtfs_dataset_key,feed_key,organization_name,route_long_name,route_type,route_id,direction_id,stop_id,stop_sequence,all_trips,per_trip_bunched_per_stop
99,07 - Los Angeles,0666caf3ec1ecc96b74f4477ee4bc939,608992664173210532aa3e6cc573be2f,Los Angeles County Metropolitan Transportation Authority,Metro Local Line,Bus,204-13172,0.0,5685,46,105.0,0.09


105

In [254]:
test2= compare_approaches(
    stop_id="2307469",
    organization_name="City of Visalia",
    route_id="2042",
    stop_sequence = 27
)

Unnamed: 0,caltrans_district,schedule_gtfs_dataset_key,feed_key,organization_name,route_long_name,route_type,route_id,direction_id,stop_id,stop_sequence,all_trips,per_trip_bunched_per_stop
28,06 - Fresno,3bda4652977200408690059ef2ec4b4d,0e89d1fd3bd2a09bbbd0d4f79ea5663b,City of Visalia,Route 9,Bus,2042,1.0,2307469,27,16.0,0.19


Unnamed: 0,caltrans_district,schedule_gtfs_dataset_key,feed_key,organization_name,route_long_name,route_type,route_id,direction_id,stop_id,stop_sequence,all_trips,per_trip_bunched_per_stop
28,06 - Fresno,3bda4652977200408690059ef2ec4b4d,0e89d1fd3bd2a09bbbd0d4f79ea5663b,City of Visalia,Route 9,Bus,2042,1.0,2307469,27,16.0,0.0


16

In [255]:
test3= compare_approaches(
    stop_id="3104",
    organization_name="Los Angeles County Metropolitan Transportation Authority",
    route_id="33-13172",
    stop_sequence = 80
)

Unnamed: 0,caltrans_district,schedule_gtfs_dataset_key,feed_key,organization_name,route_long_name,route_type,route_id,direction_id,stop_id,stop_sequence,all_trips,per_trip_bunched_per_stop
587,07 - Los Angeles,0666caf3ec1ecc96b74f4477ee4bc939,608992664173210532aa3e6cc573be2f,Los Angeles County Metropolitan Transportation Authority,Metro Local Line,Bus,33-13172,1.0,3104,80,27.0,0.11


Unnamed: 0,caltrans_district,schedule_gtfs_dataset_key,feed_key,organization_name,route_long_name,route_type,route_id,direction_id,stop_id,stop_sequence,all_trips,per_trip_bunched_per_stop
587,07 - Los Angeles,0666caf3ec1ecc96b74f4477ee4bc939,608992664173210532aa3e6cc573be2f,Los Angeles County Metropolitan Transportation Authority,Metro Local Line,Bus,33-13172,1.0,3104,80,27.0,0.0


27

In [257]:
test4= compare_approaches(
    stop_id="15320",
    organization_name="Los Angeles County Metropolitan Transportation Authority",
    route_id="33-13172",
    stop_sequence = 64
)

Unnamed: 0,caltrans_district,schedule_gtfs_dataset_key,feed_key,organization_name,route_long_name,route_type,route_id,direction_id,stop_id,stop_sequence,all_trips,per_trip_bunched_per_stop
566,07 - Los Angeles,0666caf3ec1ecc96b74f4477ee4bc939,608992664173210532aa3e6cc573be2f,Los Angeles County Metropolitan Transportation Authority,Metro Local Line,Bus,33-13172,1.0,15320,64,41.0,0.1


Unnamed: 0,caltrans_district,schedule_gtfs_dataset_key,feed_key,organization_name,route_long_name,route_type,route_id,direction_id,stop_id,stop_sequence,all_trips,per_trip_bunched_per_stop
566,07 - Los Angeles,0666caf3ec1ecc96b74f4477ee4bc939,608992664173210532aa3e6cc573be2f,Los Angeles County Metropolitan Transportation Authority,Metro Local Line,Bus,33-13172,1.0,15320,64,41.0,0.02


41

In [259]:
test5= compare_approaches(
    stop_id="3288014",
    organization_name="City of Thousand Oaks",
    route_id="3402",
    stop_sequence = 16
)

Unnamed: 0,caltrans_district,schedule_gtfs_dataset_key,feed_key,organization_name,route_long_name,route_type,route_id,direction_id,stop_id,stop_sequence,all_trips,per_trip_bunched_per_stop
640,07 - Los Angeles,1770249a5a2e770ca90628434d4934b1,926867fdee73d5fbfe4f011871bcd830,City of Thousand Oaks,Route 11,Bus,3402,0.0,3288014,16,26.0,0.0


Unnamed: 0,caltrans_district,schedule_gtfs_dataset_key,feed_key,organization_name,route_long_name,route_type,route_id,direction_id,stop_id,stop_sequence,all_trips,per_trip_bunched_per_stop
640,07 - Los Angeles,1770249a5a2e770ca90628434d4934b1,926867fdee73d5fbfe4f011871bcd830,City of Thousand Oaks,Route 11,Bus,3402,0.0,3288014,16,26.0,0.0


26

In [250]:
# Ten rows with the highest share of bunched trips under the Transit Matters approach.
# Limited with .head() because display.max_rows is set to None in this notebook.
transit_matters_m1.sort_values(by=["per_trip_bunched_per_stop"], ascending=False).head(10)

Unnamed: 0,caltrans_district,schedule_gtfs_dataset_key,feed_key,organization_name,route_long_name,route_type,route_id,direction_id,stop_id,stop_sequence,all_trips,per_trip_bunched_per_stop
28,06 - Fresno,3bda4652977200408690059ef2ec4b4d,0e89d1fd3bd2a09bbbd0d4f79ea5663b,City of Visalia,Route 9,Bus,2042,1.0,2307469,27,16.0,0.19
22,06 - Fresno,3bda4652977200408690059ef2ec4b4d,0e89d1fd3bd2a09bbbd0d4f79ea5663b,City of Visalia,Route 9,Bus,2042,0.0,2307713,23,17.0,0.18
0,06 - Fresno,3bda4652977200408690059ef2ec4b4d,0e89d1fd3bd2a09bbbd0d4f79ea5663b,City of Visalia,Route 9,Bus,2042,0.0,2307414,1,18.0,0.17
26,06 - Fresno,3bda4652977200408690059ef2ec4b4d,0e89d1fd3bd2a09bbbd0d4f79ea5663b,City of Visalia,Route 9,Bus,2042,1.0,2307467,25,18.0,0.17
30,06 - Fresno,3bda4652977200408690059ef2ec4b4d,0e89d1fd3bd2a09bbbd0d4f79ea5663b,City of Visalia,Route 9,Bus,2042,1.0,2307686,6,18.0,0.17
31,06 - Fresno,3bda4652977200408690059ef2ec4b4d,0e89d1fd3bd2a09bbbd0d4f79ea5663b,City of Visalia,Route 9,Bus,2042,1.0,2307687,7,18.0,0.17
32,06 - Fresno,3bda4652977200408690059ef2ec4b4d,0e89d1fd3bd2a09bbbd0d4f79ea5663b,City of Visalia,Route 9,Bus,2042,1.0,2307689,10,18.0,0.17
33,06 - Fresno,3bda4652977200408690059ef2ec4b4d,0e89d1fd3bd2a09bbbd0d4f79ea5663b,City of Visalia,Route 9,Bus,2042,1.0,2307690,11,18.0,0.17
34,06 - Fresno,3bda4652977200408690059ef2ec4b4d,0e89d1fd3bd2a09bbbd0d4f79ea5663b,City of Visalia,Route 9,Bus,2042,1.0,2307691,12,18.0,0.17
35,06 - Fresno,3bda4652977200408690059ef2ec4b4d,0e89d1fd3bd2a09bbbd0d4f79ea5663b,City of Visalia,Route 9,Bus,2042,1.0,2307692,13,18.0,0.17


### Make Maps 
* I think I actually need the vehicle positions: the stops data only carries each stop's own geometry, so every arrival would plot on the same spot.

* https://github.com/cal-itp/data-analyses/blob/db19b70329f1e817236bda13707dd903c24abb4c/_shared_utils/shared_utils/gtfs_utils_v2.py#L371
* https://github.com/cal-itp/data-analyses/blob/main/gtfs_funnel/download_stops.py

In [None]:
# NOTE(review): `stop` is not defined, so this cell raises NameError and halts a
# Run All at this point; presumably an intentional barrier before the map cells — confirm.
stop

NameError: name 'stop' is not defined

In [None]:
# What is this file?
vps_gdf = gpd.read_parquet(
    "gs://calitp-analytics-data/data-analyses/rt_segment_speeds/vp_2024-05-22.parquet"
)

In [None]:
# Keep only the feed key, trip key, timestamp, and geometry columns.
vp_geo_cols = [
    "schedule_gtfs_dataset_key",
    "trip_instance_key",
    "location_timestamp_local",
    "geometry",
]
vps_gdf = vps_gdf[vp_geo_cols]

In [None]:
# Same columns without the geometry, for a plain tabular merge.
vp_attr_cols = [
    "schedule_gtfs_dataset_key",
    "trip_instance_key",
    "location_timestamp_local",
]
vps_df = vps_gdf[vp_attr_cols]

In [None]:
# Attach the stop-level arrival rows to each vehicle position of the same trip;
# the inner join keeps only trips present in both tables.
vps_m1 = vps_df.merge(
    trips_routes_times2,
    how="inner",
    on=["schedule_gtfs_dataset_key", "trip_instance_key"],
)

In [None]:
def one_stop(
    df: pd.DataFrame,
    stop_id: str,
    org_name: str,
    route_id: str,
    stop_sequence: int = None,
) -> pd.DataFrame:
    """
    Subset `df` to one operator/route/stop and show the bunching comparison for it.

    Args:
        df: table with `organization_name`, `route_id`, and `stop_id` columns
            (and `stop_sequence` when that filter is used).
        stop_id: stop to look at.
        org_name: operator the stop belongs to.
        route_id: route serving the stop.
        stop_sequence: optional; when given, `df` is also restricted to this
            stop sequence. Always forwarded to `compare_approaches` so that
            its `stop_sequence` argument is supplied.

    Returns:
        The rows of `df` that match the filters.
    """
    # Look at one route & stop
    keep = (
        (df.organization_name == org_name)
        & (df.route_id == route_id)
        & (df.stop_id == stop_id)
    )
    if stop_sequence is not None:
        keep = keep & (df.stop_sequence == stop_sequence)
    test_route1 = df.loc[keep]

    compare_approaches(
        stop_id=stop_id,
        organization_name=org_name,
        route_id=route_id,
        stop_sequence=stop_sequence,
    )

    # display(test_route1.explore("time_int", marker_kwds = {'radius':20}))
    return test_route1

In [None]:
metro_test1 = one_stop(
    vps_m1,
    stop_id="5700",
    org_name="Los Angeles County Metropolitan Transportation Authority",
    route_id="204-13172",
)

In [None]:
metro_test1.shape

In [None]:
metro_test1.sample(1)

### Other
* https://www.sciencedirect.com/science/article/pii/S1366554523003666
* https://www.sciencedirect.com/science/article/pii/S0968090X22002492?ref=pdf_download&fr=RR-2&rr=8d7d6fb73d8015be