## Kernel dies when I'm trying to do groupby

In [1]:
import datetime as dt

import altair as alt
import geopandas as gpd
import merge_data
import numpy as np
import pandas as pd
from segment_speed_utils import gtfs_schedule_wrangling, helpers, time_series_utils
from shared_utils import catalog_utils, rt_dates, rt_utils
from update_vars import GTFS_DATA_DICT, RT_SCHED_GCS, SCHED_GCS

# https://github.com/cal-itp/data-analyses/blob/main/_shared_utils/shared_utils/gtfs_analytics_data.yml
GTFS_DATA_DICT = catalog_utils.get_catalog("gtfs_analytics_data")

from segment_speed_utils.project_vars import (
    COMPILED_CACHED_VIEWS,
    GTFS_DATA_DICT,
    PROJECT_CRS,
    RT_SCHED_GCS,
    SCHED_GCS,
    SEGMENT_GCS,
)

In [2]:
pd.options.display.max_columns = 100
pd.options.display.float_format = "{:.2f}".format
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [3]:
df = pd.read_parquet("./transit_bunching.parquet")

In [4]:
def bunched_not_bunched(
    df: pd.DataFrame, bunched_y_n: str, groupby_cols: list
) -> pd.DataFrame:
    df2 = df.loc[df.bunched_y_n == bunched_y_n].reset_index(drop=True)

    bunched_y_n = bunched_y_n.replace(" ", "_")
    agg1 = (
        df2.groupby(groupby_cols).agg({"trip_instance_key": "nunique"}).reset_index()
    ).rename(columns={"trip_instance_key": f"{bunched_y_n}_trips"})
    return agg1

In [5]:
groupby_cols = [
    "caltrans_district",
    "schedule_gtfs_dataset_key",
    "feed_key",
    "organization_name",
    "route_long_name",
    "route_type",
    "route_id",
    "direction_id",
    "stop_id",
]

In [6]:
def agg_final_df(groupby_cols: list, df: pd.DataFrame) -> pd.DataFrame:

    # Find total trips that are bunched
    bunched = bunched_not_bunched(df, "bunched", groupby_cols)
    print("done 1")
    # Find total trips that are NOT bunched
    not_bunched = bunched_not_bunched(df, "not bunched", groupby_cols)
    print("done 2")
    # Concat
    m1 = pd.merge(not_bunched, bunched, on=groupby_cols, how="outer")
    print("done 3")
    # Find the % of bunched trips
    m1 = m1.fillna(0)
    m1["all_trips"] = m1.not_bunched_trips + m1.bunched_trips
    m1["per_trip_bunched_per_stop"] = m1.bunched_trips / m1.all_trips

    # Filter out any rows with only one trip of that groupby combo
    # for that service date
    m1 = m1.loc[m1.all_trips > 1].reset_index(drop=True)
    m1 = m1.drop(columns=["not_bunched_trips", "bunched_trips"])

    return m1

### Transit Matters

In [7]:
transit_matters_df1 = df.copy()

In [8]:
transit_matters_df1["pct_actual_schd_headway"] = (
    transit_matters_df1.actual_arrival_lag_min
    / transit_matters_df1.scheduled_arrival_lag_min
)

In [9]:
transit_matters_df1["bunched_y_n"] = np.where(
    transit_matters_df1["pct_actual_schd_headway"] < 0.25, "bunched", "not bunched"
)

In [10]:
groupby_cols = [
    "caltrans_district",
    "schedule_gtfs_dataset_key",
    "feed_key",
    "organization_name",
    "route_long_name",
    "route_type",
    "route_id",
    "direction_id",
    "stop_id",
    "stop_sequence"
]

In [11]:
transit_matters_m1 = agg_final_df(groupby_cols, transit_matters_df1)

done 1
done 2
done 3


In [12]:
len(transit_matters_m1)

141364

### Help: Swapped order of a bus is messing with the transit matters metric.
* How to solve for this?? 

In [13]:
preview_cols = [
    "converted_rt_arrival",
    "actual_arrival_lag_min",
    "converted_schd_arrival",
    "scheduled_arrival_lag_min",
    "pct_actual_schd_headway",
    "bunched_y_n",
]

In [14]:
example2 = transit_matters_df1.loc[
    (transit_matters_df1.stop_id == "2307719")
    & (transit_matters_df1.organization_name == "City of Visalia")
    & (transit_matters_df1.route_id == "2042")
    & (transit_matters_df1.shape_array_key == "60da59c7000ea5dcb5f845d8fa227f14")
]

In [15]:
example2[preview_cols]

Unnamed: 0,converted_rt_arrival,actual_arrival_lag_min,converted_schd_arrival,scheduled_arrival_lag_min,pct_actual_schd_headway,bunched_y_n
3057874,2024-05-22 06:32:47,,2024-05-22 06:42:00,,,not bunched
3057875,2024-05-22 06:32:47,0.0,2024-05-22 06:42:00,0.0,,not bunched
3057876,2024-05-22 06:32:47,0.0,2024-05-22 06:42:00,0.0,,not bunched
3057877,2024-05-22 07:26:41,53.9,2024-05-22 07:27:00,45.0,1.2,not bunched
3057878,2024-05-22 07:26:41,0.0,2024-05-22 07:27:00,0.0,,not bunched
3057879,2024-05-22 07:26:41,0.0,2024-05-22 07:27:00,0.0,,not bunched
3057880,2024-05-22 08:06:11,39.5,2024-05-22 08:12:00,45.0,0.88,not bunched
3057881,2024-05-22 08:06:11,0.0,2024-05-22 08:12:00,0.0,,not bunched
3057882,2024-05-22 08:06:11,0.0,2024-05-22 08:12:00,0.0,,not bunched
3057883,2024-05-22 08:56:57,50.77,2024-05-22 08:57:00,45.0,1.13,not bunched


### Use 2 minute benchmark
* [Source](https://static1.squarespace.com/static/533b9a24e4b01d79d0ae4376/t/645e82de1f570b31497c44dc/1683915486889/TransitMatters-Headwaymanagement.pdf)
* Justifying the use of
headway maintenance. For example, in April
2022 the 66 bus significantly bunched around
several stops. When bunching is defined as
buses that run within two minutes or less of
each other, inbound buses towards Nubian
Square bunched 10% of the time at Brigham
Circle, 9% at Brookline Village and Roxbury
Crossing, and 8% of the time at Coolidge
Corner. Bunching is even more dramatic
outbound towards Harvard Square where
buses bunched over 35% of the time at Winship
St, 13% at Coolidge Corner and Harvard Ave at
Commonwealth Ave, and 12% at North Harvard
St at Western Ave. View more data about bus
bunching through the TransitMatters Data
Dashboard here.

In [16]:
two_minutes_df = df.copy()

In [17]:
two_minutes_df["bunched_y_n"] = np.where(
    two_minutes_df["actual_arrival_lag_min"] <= 2, "bunched", "not bunched"
)

In [18]:
final_two_minute = agg_final_df(groupby_cols, two_minutes_df)

done 1
done 2
done 3


### Why the difference?

In [19]:
len(final_two_minute)

165668

In [20]:
len(transit_matters_m1)

141364

In [21]:
final_two_minute.head()

Unnamed: 0,caltrans_district,schedule_gtfs_dataset_key,feed_key,organization_name,route_long_name,route_type,route_id,direction_id,stop_id,stop_sequence,all_trips,per_trip_bunched_per_stop
0,01 - Eureka,090b30e4249a7ec2b4c6a0923ed2f953,87c96d5026263d6986f2cabe6892390c,Redwood Coast Transit Authority,3 Route 3-Green 101 Northcrest,Bus,117,0.0,2698,13,20.0,0.5
1,01 - Eureka,090b30e4249a7ec2b4c6a0923ed2f953,87c96d5026263d6986f2cabe6892390c,Redwood Coast Transit Authority,3 Route 3-Green 101 Northcrest,Bus,117,0.0,2721,24,2.0,0.5
2,01 - Eureka,090b30e4249a7ec2b4c6a0923ed2f953,87c96d5026263d6986f2cabe6892390c,Redwood Coast Transit Authority,3 Route 3-Green 101 Northcrest,Bus,117,0.0,2750,10,20.0,0.5
3,01 - Eureka,090b30e4249a7ec2b4c6a0923ed2f953,87c96d5026263d6986f2cabe6892390c,Redwood Coast Transit Authority,3 Route 3-Green 101 Northcrest,Bus,117,0.0,775592,22,14.0,0.5
4,01 - Eureka,090b30e4249a7ec2b4c6a0923ed2f953,87c96d5026263d6986f2cabe6892390c,Redwood Coast Transit Authority,3 Route 3-Green 101 Northcrest,Bus,117,0.0,775595,5,20.0,0.5


### Transit Matters: trips missing...

In [22]:
transit_matters_m1.head()

Unnamed: 0,caltrans_district,schedule_gtfs_dataset_key,feed_key,organization_name,route_long_name,route_type,route_id,direction_id,stop_id,stop_sequence,all_trips,per_trip_bunched_per_stop
0,01 - Eureka,090b30e4249a7ec2b4c6a0923ed2f953,87c96d5026263d6986f2cabe6892390c,Redwood Coast Transit Authority,3 Route 3-Green 101 Northcrest,Bus,117,0.0,2698,13,10.0,0.0
1,01 - Eureka,090b30e4249a7ec2b4c6a0923ed2f953,87c96d5026263d6986f2cabe6892390c,Redwood Coast Transit Authority,3 Route 3-Green 101 Northcrest,Bus,117,0.0,2750,10,10.0,0.0
2,01 - Eureka,090b30e4249a7ec2b4c6a0923ed2f953,87c96d5026263d6986f2cabe6892390c,Redwood Coast Transit Authority,3 Route 3-Green 101 Northcrest,Bus,117,0.0,775592,22,7.0,0.0
3,01 - Eureka,090b30e4249a7ec2b4c6a0923ed2f953,87c96d5026263d6986f2cabe6892390c,Redwood Coast Transit Authority,3 Route 3-Green 101 Northcrest,Bus,117,0.0,775595,5,10.0,0.0
4,01 - Eureka,090b30e4249a7ec2b4c6a0923ed2f953,87c96d5026263d6986f2cabe6892390c,Redwood Coast Transit Authority,3 Route 3-Green 101 Northcrest,Bus,117,0.0,775596,11,2.0,0.0


In [23]:
transit_matters_m1.per_trip_bunched_per_stop.describe()

count   141364.00
mean         0.01
std          0.03
min          0.00
25%          0.00
50%          0.00
75%          0.00
max          0.50
Name: per_trip_bunched_per_stop, dtype: float64

In [24]:
missing_routes = pd.merge(
    transit_matters_m1.drop(columns=["all_trips", "per_trip_bunched_per_stop"]),
    final_two_minute.drop(columns=["all_trips", "per_trip_bunched_per_stop"]),
    how="outer",
    indicator = True
)

In [25]:
missing_routes._merge.value_counts()

both          141359
right_only     24309
left_only          5
Name: _merge, dtype: int64

In [26]:
missing_routes.loc[missing_routes._merge == "right_only"].head()

Unnamed: 0,caltrans_district,schedule_gtfs_dataset_key,feed_key,organization_name,route_long_name,route_type,route_id,direction_id,stop_id,stop_sequence,_merge
141364,01 - Eureka,090b30e4249a7ec2b4c6a0923ed2f953,87c96d5026263d6986f2cabe6892390c,Redwood Coast Transit Authority,3 Route 3-Green 101 Northcrest,Bus,117,0.0,2721,24,right_only
141365,01 - Eureka,090b30e4249a7ec2b4c6a0923ed2f953,87c96d5026263d6986f2cabe6892390c,Redwood Coast Transit Authority,300 PM DNHS Tripper,Bus,14242,0.0,2706,24,right_only
141366,01 - Eureka,090b30e4249a7ec2b4c6a0923ed2f953,87c96d5026263d6986f2cabe6892390c,Redwood Coast Transit Authority,300 PM DNHS Tripper,Bus,14242,0.0,2711,10,right_only
141367,01 - Eureka,090b30e4249a7ec2b4c6a0923ed2f953,87c96d5026263d6986f2cabe6892390c,Redwood Coast Transit Authority,300 PM DNHS Tripper,Bus,14242,0.0,2782,12,right_only
141368,01 - Eureka,090b30e4249a7ec2b4c6a0923ed2f953,87c96d5026263d6986f2cabe6892390c,Redwood Coast Transit Authority,Route 199 Hiouchi / Gasquet,Bus,257,1.0,2711,17,right_only


### Compare

In [27]:
freq_range = [
    "#ccbb44",
    "#e9d868",
    "#fcb40e",
    "#ff9c42",
    "#fc5c04",
    "#dd217d",
    "#dd217d",
    "#dd217d",
]

In [28]:
df["hour"] = df["converted_rt_arrival"].dt.hour
df["min"] = df["converted_rt_arrival"].dt.minute

In [31]:
def compare_approaches(
    stop_id: str, organization_name: str, route_id: str, stop_sequence: int
):
    transit_matter = transit_matters_m1.loc[
        (transit_matters_m1.stop_id == stop_id)
        & (transit_matters_m1.organization_name == organization_name)
        & (transit_matters_m1.route_id == route_id)
        & (transit_matters_m1.stop_sequence == stop_sequence)
    ]
    display(transit_matter)

    two_min = final_two_minute.loc[
        (final_two_minute.stop_id == stop_id)
        & (final_two_minute.organization_name == organization_name)
        & (final_two_minute.route_id == route_id)
        & (final_two_minute.stop_sequence == stop_sequence)
    ]

    display(two_min)
    total_trips = df.loc[
        (df.stop_id == stop_id)
        & (df.organization_name == organization_name)
        & (df.route_id == route_id)
        & (df.stop_sequence == stop_sequence)
    ]

    display(total_trips.trip_instance_key.nunique())

    chart = (
        alt.Chart(total_trips)
        .mark_circle(size=500)
        .encode(
            x="hour",
            y="min",
            color=alt.Color(
                "hour",
                scale=alt.Scale(range=freq_range),
            ),
            tooltip=["hour", "min", "actual_arrival_lag_min"],
        )
        .properties(width=800, height=400)
    )
    display(chart)
    return total_trips

In [32]:
test1 = compare_approaches(
    stop_id="5685",
    organization_name="Los Angeles County Metropolitan Transportation Authority",
    route_id="204-13172",
    stop_sequence=46,
)

Unnamed: 0,caltrans_district,schedule_gtfs_dataset_key,feed_key,organization_name,route_long_name,route_type,route_id,direction_id,stop_id,stop_sequence,all_trips,per_trip_bunched_per_stop
41501,07 - Los Angeles,0666caf3ec1ecc96b74f4477ee4bc939,608992664173210532aa3e6cc573be2f,Los Angeles County Metropolitan Transportation Authority,Metro Local Line,Bus,204-13172,0.0,5685,46,116.0,0.09


Unnamed: 0,caltrans_district,schedule_gtfs_dataset_key,feed_key,organization_name,route_long_name,route_type,route_id,direction_id,stop_id,stop_sequence,all_trips,per_trip_bunched_per_stop
49521,07 - Los Angeles,0666caf3ec1ecc96b74f4477ee4bc939,608992664173210532aa3e6cc573be2f,Los Angeles County Metropolitan Transportation Authority,Metro Local Line,Bus,204-13172,0.0,5685,46,201.0,0.52


105

### Something going wrong for `all_trips` once I aggregate for all operators.

In [33]:
test3 = compare_approaches(
    stop_id="3104",
    organization_name="Los Angeles County Metropolitan Transportation Authority",
    route_id="33-13172",
    stop_sequence=80,
)

Unnamed: 0,caltrans_district,schedule_gtfs_dataset_key,feed_key,organization_name,route_long_name,route_type,route_id,direction_id,stop_id,stop_sequence,all_trips,per_trip_bunched_per_stop
47535,07 - Los Angeles,0666caf3ec1ecc96b74f4477ee4bc939,608992664173210532aa3e6cc573be2f,Los Angeles County Metropolitan Transportation Authority,Metro Local Line,Bus,33-13172,1.0,3104,80,30.0,0.1


Unnamed: 0,caltrans_district,schedule_gtfs_dataset_key,feed_key,organization_name,route_long_name,route_type,route_id,direction_id,stop_id,stop_sequence,all_trips,per_trip_bunched_per_stop
55965,07 - Los Angeles,0666caf3ec1ecc96b74f4477ee4bc939,608992664173210532aa3e6cc573be2f,Los Angeles County Metropolitan Transportation Authority,Metro Local Line,Bus,33-13172,1.0,3104,80,53.0,0.51


27