## Transit Bunching 
* `cd data-analyses/rt_segment_speeds && pip install -r requirements.txt && cd ../_shared_utils && make setup_env && cd ../gtfs_digest`
* [Issue](https://github.com/cal-itp/data-analyses/issues/1099)
### 11/8
* Figure out how to address City of Visalia: one of the buses that is scheduled to arrive earlier arrives later than another bus. 
* This leads to a negative time stamp and makes it appear like there is a lot of bunching per the Transit Matters approach.



In [1]:
import datetime as dt

import altair as alt
import geopandas as gpd
import merge_data
import numpy as np
import pandas as pd
from segment_speed_utils import gtfs_schedule_wrangling, helpers, time_series_utils
from shared_utils import catalog_utils, rt_dates, rt_utils
from update_vars import GTFS_DATA_DICT, RT_SCHED_GCS, SCHED_GCS

# https://github.com/cal-itp/data-analyses/blob/main/_shared_utils/shared_utils/gtfs_analytics_data.yml
GTFS_DATA_DICT = catalog_utils.get_catalog("gtfs_analytics_data")

from segment_speed_utils.project_vars import (
    COMPILED_CACHED_VIEWS,
    GTFS_DATA_DICT,
    PROJECT_CRS,
    RT_SCHED_GCS,
    SCHED_GCS,
    SEGMENT_GCS,
)

In [2]:
pd.options.display.max_columns = 100
pd.options.display.float_format = "{:.2f}".format
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [3]:
may_date = "2024-05-22"

In [4]:
drop_for_preview = [
    "schedule_gtfs_dataset_key",
    "trip_instance_key",
    "shape_array_key",
    "feed_key",
    "trip_id",
]

### Grab Sample Routes

In [5]:
subset = [
    "schedule_gtfs_dataset_key",
    "route_id",
    "direction_id",
    "route_primary_direction",
    "service_date",
    "frequency",
]

In [6]:
GTFS_DATA_DICT.rt_vs_schedule_tables.sched_route_direction_metrics

'schedule_route_dir/schedule_route_direction_metrics'

In [7]:
route_dir_columns = [
    "schedule_gtfs_dataset_key",
    "route_id",
    "direction_id",
    "time_period",
    "route_primary_direction",
    "frequency",
    "service_date",
]

In [8]:
route_dir = merge_data.concatenate_schedule_by_route_direction([may_date])[
    route_dir_columns
]

In [9]:
route_dir.head()

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id,time_period,route_primary_direction,frequency,service_date
0,015d67d5b75b5cf2b710bbadadfb75f5,17,0.0,all_day,Northbound,0.92,2024-05-22
1,015d67d5b75b5cf2b710bbadadfb75f5,17,0.0,offpeak,Northbound,0.62,2024-05-22
2,015d67d5b75b5cf2b710bbadadfb75f5,17,0.0,peak,Northbound,1.5,2024-05-22
3,015d67d5b75b5cf2b710bbadadfb75f5,17,1.0,all_day,Southbound,0.92,2024-05-22
4,015d67d5b75b5cf2b710bbadadfb75f5,17,1.0,offpeak,Southbound,0.69,2024-05-22


#### Attach operators and districts

In [10]:
# Grab Crosswalk
CROSSWALK = GTFS_DATA_DICT.schedule_tables.gtfs_key_crosswalk

In [11]:
crosswalk_cols = [
    "schedule_gtfs_dataset_key",
    "organization_name",
    "name",
    "caltrans_district",
]

In [12]:
crosswalk_df = (
    time_series_utils.concatenate_datasets_across_dates(
        SCHED_GCS, CROSSWALK, [may_date], data_type="df", columns=crosswalk_cols
    )
    .sort_values(["service_date"])
    .reset_index(drop=True)
)

In [13]:
crosswalk_df.head(2)

Unnamed: 0,schedule_gtfs_dataset_key,organization_name,name,caltrans_district,service_date
0,1770249a5a2e770ca90628434d4934b1,Ventura County Transportation Commission,VCTC GMV Schedule,07 - Los Angeles,2024-05-22
1,f8102a9c0693206bf36d302540bf1bcf,City of Corona,Corona Schedule,08 - San Bernardino,2024-05-22


In [14]:
crosswalk_df.shape

(189, 5)

In [15]:
routes_dir2 = pd.merge(
    route_dir,
    crosswalk_df,
    on=["schedule_gtfs_dataset_key", "service_date"],
    how="left",
)

In [16]:
thousand_oaks = routes_dir2.loc[
    (routes_dir2.organization_name == "City of Thousand Oaks")
    & (routes_dir2.route_id == "3402")
    & (routes_dir2.time_period == "all_day")
].reset_index(drop=True)

In [17]:
visalia = routes_dir2.loc[
    (routes_dir2.organization_name == "City of Visalia")
    & (routes_dir2.route_id == "2042")
    & (routes_dir2.time_period == "all_day")
].reset_index(drop=True)

In [18]:
metro = routes_dir2.loc[
    (
        routes_dir2.organization_name
        == "Los Angeles County Metropolitan Transportation Authority"
    )
    & (routes_dir2.route_id == "204-13172")
    & (routes_dir2.time_period == "all_day")
].reset_index(drop=True)

In [19]:
# '33-13172'
metro_33 = routes_dir2.loc[
    (
        routes_dir2.organization_name
        == "Los Angeles County Metropolitan Transportation Authority"
    )
    & (routes_dir2.route_id == "33-13172")
    & (routes_dir2.time_period == "all_day")
].reset_index(drop=True)

In [20]:
routes = pd.concat([thousand_oaks, visalia, metro, metro_33])

In [21]:
routes.route_id.unique()

array(['3402', '2042', '204-13172', '33-13172'], dtype=object)

In [22]:
routes.head(3)

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id,time_period,route_primary_direction,frequency,service_date,organization_name,name,caltrans_district
0,1770249a5a2e770ca90628434d4934b1,3402,0.0,all_day,Eastbound,1.17,2024-05-22,City of Thousand Oaks,VCTC GMV Schedule,07 - Los Angeles
1,1770249a5a2e770ca90628434d4934b1,3402,1.0,all_day,Westbound,1.17,2024-05-22,City of Thousand Oaks,VCTC GMV Schedule,07 - Los Angeles
0,3bda4652977200408690059ef2ec4b4d,2042,0.0,all_day,Eastbound,0.79,2024-05-22,City of Visalia,Visalia Schedule,06 - Fresno


### Add Trips

In [23]:
TABLE = GTFS_DATA_DICT.schedule_downloads.trips

In [24]:
FILE = f"{COMPILED_CACHED_VIEWS}{TABLE}_{may_date}.parquet"

In [25]:
trips_subset = [
    "gtfs_dataset_key",
    "route_id",
    "trip_instance_key",
    "shape_array_key",
    "feed_key",
    "route_long_name",
    "direction_id",
    "route_type",
]

In [26]:
trips = pd.read_parquet(FILE)[trips_subset].rename(
    columns={"gtfs_dataset_key": "schedule_gtfs_dataset_key"}
)

In [27]:
trips_routes = pd.merge(
    trips,
    routes_dir2,
    on=["schedule_gtfs_dataset_key", "route_id", "direction_id"],
    how="inner",
)

In [28]:
trips_routes.shape

(392497, 15)

In [29]:
trips_routes.route_id.nunique()

1338

#### I know we can get this from the warehouse but it seems cumbersome. Correct me if I'm wrong.

In [30]:
# https://gtfs.org/documentation/schedule/reference/#
route_type_crosswalk = {
    "route_type": ["0", "1", "2", "3", "4", "5", "6", "7", "11", "12"],
    "route_type_str": [
        "Tram, Streetcar, Light rail",
        "Subway, Metro",
        "Rail",
        "Bus",
        "Ferry.",
        "Cable tram.",
        "Aerial lift, suspended cable car (e.g., gondola lift, aerial tramway).",
        "Funicular.",
        "Trolleybus.",
        "Monorail.",
    ],
}

In [31]:
route_type_crosswalk_df = pd.DataFrame(route_type_crosswalk)

In [32]:
# Merge for route_type
trips_routes = pd.merge(
    trips_routes, route_type_crosswalk_df, on=["route_type"], how="left"
)

In [33]:
trips_routes = trips_routes.drop(columns=["route_type"])

In [34]:
trips_routes = trips_routes.rename(columns={"route_type_str": "route_type"})

### Get Stop Times 

In [35]:
rt_stop_times = pd.read_parquet(
    "gs://calitp-analytics-data/data-analyses/rt_vs_schedule/schedule_rt_stop_times_2024-05-22.parquet"
)

In [36]:
rt_stop_times.head(2)

Unnamed: 0,trip_id,stop_id,stop_sequence,scheduled_arrival_sec,schedule_gtfs_dataset_key,trip_instance_key,rt_arrival_sec
0,1d105244-776c-4b3f-af78-9c7ad78c2103,0b2443b6-b50f-452b-a749-464588ca93b8,8,60991.0,1fd2f07342d966919b15d5d37fda8cc8,45ae17540ca9fb5030c84dbb12e48e9a,61434
1,1d105244-776c-4b3f-af78-9c7ad78c2103,cd5650b0-9a18-4e78-aedc-385f3094fa0f,9,61179.0,1fd2f07342d966919b15d5d37fda8cc8,45ae17540ca9fb5030c84dbb12e48e9a,61616


In [37]:
trips_routes_times = pd.merge(
    rt_stop_times,
    trips_routes,
    on=[
        "schedule_gtfs_dataset_key",
        "trip_instance_key",
    ],
    how="inner",
)

In [38]:
(trips_routes_times.scheduled_arrival_sec.isna().sum())

44988

In [39]:
trips_routes_times.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11031910 entries, 0 to 11031909
Data columns (total 20 columns):
 #   Column                     Dtype         
---  ------                     -----         
 0   trip_id                    object        
 1   stop_id                    object        
 2   stop_sequence              int64         
 3   scheduled_arrival_sec      float64       
 4   schedule_gtfs_dataset_key  object        
 5   trip_instance_key          object        
 6   rt_arrival_sec             int64         
 7   route_id                   object        
 8   shape_array_key            object        
 9   feed_key                   object        
 10  route_long_name            object        
 11  direction_id               float64       
 12  time_period                object        
 13  route_primary_direction    object        
 14  frequency                  float64       
 15  service_date               datetime64[ns]
 16  organization_name          object 

### Sorting 

In [40]:
trips_routes_times2 = trips_routes_times.sort_values(
    by=[
        "schedule_gtfs_dataset_key",
        "route_long_name",
        "shape_array_key",
        "direction_id",
        "stop_sequence",
        "stop_id",
        "rt_arrival_sec",
    ]
).reset_index(drop=True)

In [41]:
preview_sort_col = [
    "schedule_gtfs_dataset_key",
    "route_id",
    "direction_id",
    "stop_sequence",
    "rt_arrival_sec",
    "stop_id",
    "scheduled_arrival_sec",
]

In [42]:
trips_routes_times2.loc[
    (
        trips_routes_times2.schedule_gtfs_dataset_key
        == "0666caf3ec1ecc96b74f4477ee4bc939"
    )
    & (trips_routes_times2.route_id == "204-13172")
    & (trips_routes_times2.direction_id == 1)
][preview_sort_col].shape

(21009, 7)

### Convert scheduled and RT arrival times.
* If 82800  < `scheduled_arrival_time` < 86_400 but `rt_arrival_sec` is lower say 14_000 (4 am in the morning): then the bus was scheduled to arrive on May 21 (day before the service date) but it arrived a little later on the actual service date we query the data for. 
* If  86_400 < `scheduled_arrival_time` and `rt_arrival_sec` is around 86_000 then this is the same service date. 

In [43]:
trips_routes_times2["rt_arrival_sec"].describe()

count   11031910.00
mean       48176.39
std        17806.20
min            0.00
25%        33526.00
50%        48304.00
75%        62400.00
max        86399.00
Name: rt_arrival_sec, dtype: float64

In [44]:
trips_routes_times2.loc[trips_routes_times2["scheduled_arrival_sec"] == 86_400].shape

(1024, 20)

####  Filter out for trips with `scheduled_arrival_sec` that's over 24 hours.

In [45]:
more_than_86400 = trips_routes_times2.loc[
    trips_routes_times2["scheduled_arrival_sec"] > 86_400
]

In [46]:
more_than_86400["scheduled_arrival_sec"].describe()

count   170306.00
mean     92818.97
std       5599.54
min      86401.00
25%      88080.00
50%      90960.00
75%      96720.00
max     111374.00
Name: scheduled_arrival_sec, dtype: float64

In [47]:
# more_than_86400[["service_date", "rt_arrival_sec", "scheduled_arrival_sec]].head()

In [48]:
timestamp_subset = [
    "converted_schd_arrival",
    "converted_rt_arrival",
    "scheduled_arrival_sec",
    "rt_arrival_sec",
    "service_date",
]

#### Subset to make the df smaller


In [49]:
subset = [
    "stop_id",
    "stop_sequence",
    "scheduled_arrival_sec",
    "schedule_gtfs_dataset_key",
    "trip_instance_key",
    "rt_arrival_sec",
    "route_id",
    "shape_array_key",
    "route_long_name",
    "direction_id",
    "organization_name",
    "caltrans_district",
    "service_date",
    "route_type",
    "feed_key",
]

In [50]:
trips_routes_times2 = trips_routes_times2[subset]

In [51]:
trips_routes_times2.scheduled_arrival_sec = (
    trips_routes_times2.scheduled_arrival_sec.fillna(0)
)

In [52]:
trips_routes_times2["converted_rt_arrival"] = pd.to_datetime(
    trips_routes_times2["service_date"]
) + pd.to_timedelta(trips_routes_times2["rt_arrival_sec"] % 86400, unit="s")

In [53]:
trips_routes_times2["converted_schd_arrival"] = pd.to_datetime(
    trips_routes_times2["service_date"]
) + pd.to_timedelta(trips_routes_times2["scheduled_arrival_sec"] % 86400, unit="s")

In [54]:
trips_routes_times2.converted_schd_arrival.describe()

  trips_routes_times2.converted_schd_arrival.describe()


count                11031910
unique                  78579
top       2024-05-22 00:00:00
freq                    46012
first     2024-05-22 00:00:00
last      2024-05-22 23:59:59
Name: converted_schd_arrival, dtype: object

#### Checkout results from the code
* Make sure it makes sense.


In [55]:
trips_routes_times2.loc[trips_routes_times2["scheduled_arrival_sec"] > 86_400][
    [
        "scheduled_arrival_sec",
        "converted_schd_arrival",
        "rt_arrival_sec",
        "converted_rt_arrival",
    ]
].sample(20)

Unnamed: 0,scheduled_arrival_sec,converted_schd_arrival,rt_arrival_sec,converted_rt_arrival
9256346,90061.0,2024-05-22 01:01:01,3487,2024-05-22 00:58:07
1447567,93300.0,2024-05-22 01:55:00,6864,2024-05-22 01:54:24
10409546,86460.0,2024-05-22 00:01:00,77,2024-05-22 00:01:17
3973777,92291.0,2024-05-22 01:38:11,6380,2024-05-22 01:46:20
5599826,90960.0,2024-05-22 01:16:00,5091,2024-05-22 01:24:51
68203,93360.0,2024-05-22 01:56:00,7156,2024-05-22 01:59:16
1982639,93480.0,2024-05-22 01:58:00,7248,2024-05-22 02:00:48
369084,101280.0,2024-05-22 04:08:00,15020,2024-05-22 04:10:20
573748,96900.0,2024-05-22 02:55:00,10973,2024-05-22 03:02:53
2022748,100020.0,2024-05-22 03:47:00,13518,2024-05-22 03:45:18


### Deal with delays
* Some very extreme values.

In [56]:
percentiles = [
    0.001,
    0.005,
    0.01,
    0.02,
    0.05,
    0.1,
    0.2,
    0.3,
    0.4,
    0.5,
    0.6,
    0.7,
    0.8,
    0.9,
    0.95,
    0.98,
    0.99,
]

In [57]:
trips_routes_times2["delay_min"] = (
    trips_routes_times2["converted_rt_arrival"]
    - trips_routes_times2["converted_schd_arrival"]
).dt.total_seconds() / 60

In [58]:
delay_0 = trips_routes_times2.loc[trips_routes_times2.delay_min < 0]

In [59]:
display(delay_0["delay_min"].describe(percentiles))

count   3142148.00
mean         -4.42
std          60.66
min       -1439.78
0.1%      -1428.75
0.5%        -25.87
1%          -12.95
2%           -7.35
5%           -4.75
10%          -3.43
20%          -2.37
30%          -1.78
40%          -1.40
50%          -1.08
60%          -0.82
70%          -0.58
80%          -0.38
90%          -0.18
95%          -0.10
98%          -0.05
99%          -0.03
max          -0.02
Name: delay_min, dtype: float64

In [60]:
delay_24_min = trips_routes_times2.loc[trips_routes_times2.delay_min > 24]

In [61]:
display(delay_24_min["delay_min"].describe(percentiles))

count   112109.00
mean       363.86
std        421.20
min         24.02
0.1%        24.02
0.5%        24.07
1%          24.15
2%          24.27
5%          24.70
10%         25.53
20%         27.57
30%         30.63
40%         37.37
50%         51.30
60%        411.62
70%        607.02
80%        825.53
90%       1034.57
95%       1150.87
98%       1291.33
99%       1364.87
max       1439.98
Name: delay_min, dtype: float64

In [62]:
display(trips_routes_times2["delay_min"].describe(percentiles))

count   11031910.00
mean           5.18
std           64.78
min        -1439.78
0.1%         -37.03
0.5%          -8.03
1%            -5.53
2%            -4.07
5%            -2.55
10%           -1.57
20%           -0.58
30%            0.10
40%            0.75
50%            1.47
60%            2.32
70%            3.40
80%            4.97
90%            7.93
95%           11.35
98%           17.30
99%           24.22
max         1439.98
Name: delay_min, dtype: float64

In [63]:
411 / 60

6.85

In [64]:
600 / 60

10.0

#### Subtract one day off of `converted_rt_arrival` for rows in which there is a delay of +10 hours

In [65]:
trips_routes_times3 = trips_routes_times2.copy()

In [66]:
trips_routes_times3["converted_rt_arrival"] = np.where(
    trips_routes_times3["delay_min"] >= 600,
    trips_routes_times3["converted_rt_arrival"] - pd.Timedelta(days=1),
    trips_routes_times3["converted_rt_arrival"],
)

#### Subtract one day off `converted_schd_arrival` for rows with a delay of -10 hours

In [67]:
-1428 / 60

-23.8

In [68]:
trips_routes_times3["converted_schd_arrival"] = np.where(
    trips_routes_times3["delay_min"] <= -600,
    trips_routes_times3["converted_schd_arrival"] - pd.Timedelta(days=1),
    trips_routes_times3["converted_schd_arrival"],
)

### Recalculate delay minutes after fix to ensure everything looks ok.

In [69]:
trips_routes_times3["delay_min"] = (
    trips_routes_times3["converted_rt_arrival"]
    - trips_routes_times3["converted_schd_arrival"]
).dt.total_seconds() / 60

In [70]:
display(trips_routes_times3["delay_min"].describe(percentiles))

count   11031910.00
mean           1.54
std           35.44
min         -840.00
0.1%        -623.30
0.5%         -14.22
1%            -6.32
2%            -4.32
5%            -2.63
10%           -1.60
20%           -0.60
30%            0.08
40%            0.73
50%            1.45
60%            2.28
70%            3.37
80%            4.92
90%            7.82
95%           11.07
98%           16.27
99%           21.15
max          837.53
Name: delay_min, dtype: float64

In [71]:
def compare_delays(stop_id: str, stop_sequence: int, scheduled_arrival_sec: int):
    before_corrections = trips_routes_times2.loc[
        (trips_routes_times2.stop_id == stop_id)
        & (trips_routes_times2.stop_sequence == stop_sequence)
        & (trips_routes_times2.scheduled_arrival_sec == scheduled_arrival_sec)
    ]
    display(before_corrections)

    after_corrections = trips_routes_times3.loc[
        (trips_routes_times3.stop_id == stop_id)
        & (trips_routes_times3.stop_sequence == stop_sequence)
        & (trips_routes_times3.scheduled_arrival_sec == scheduled_arrival_sec)
    ]
    display(after_corrections)

#### Some rows just can't be fixed b/c the inherent data is a little off. 

In [72]:
extreme1 = trips_routes_times3.loc[trips_routes_times3.delay_min < -398.38]

In [73]:
extreme1.scheduled_arrival_sec.describe()

count   23366.00
mean     1487.55
std     10491.40
min         0.00
25%         0.00
50%         0.00
75%         0.00
max     86494.00
Name: scheduled_arrival_sec, dtype: float64

In [74]:
extreme1.rt_arrival_sec.describe()

count   23366.00
mean    49625.69
std      8083.68
min       492.00
25%     42813.00
50%     49778.00
75%     56438.00
max     85333.00
Name: rt_arrival_sec, dtype: float64

### I received the suggestion to throw away things more than 2 hrs because that is not bunching. 

In [75]:
trips_routes_times3 = trips_routes_times3.loc[
    trips_routes_times3.delay_min < 120
].reset_index(drop=True)

In [76]:
trips_routes_times3 = trips_routes_times3.loc[
    trips_routes_times3.delay_min > -120
].reset_index(drop=True)

### Calculate the actual & scheduled headway the `operator-route-direction_id-stop_sequence-stop_id-` grain


In [77]:
groupby_cols = [
    "schedule_gtfs_dataset_key",
    "route_long_name",
    "direction_id",
    "stop_id",
    "stop_sequence",
]

In [78]:
trips_routes_times3["actual_arrival_lag_min"] = (
    trips_routes_times3.groupby(groupby_cols)["converted_rt_arrival"]
    .diff()
    .dt.total_seconds()
    / 60
)

In [79]:
trips_routes_times3["scheduled_arrival_lag_min"] = (
    trips_routes_times3.groupby(groupby_cols)["converted_schd_arrival"]
    .diff()
    .dt.total_seconds()
    / 60
)

In [80]:
trips_routes_times3["scheduled_arrival_lag_min"].describe(percentiles)

count   10802913.00
mean           6.28
std           53.40
min        -1439.00
0.1%        -840.00
0.5%           0.00
1%             0.00
2%             0.00
5%             0.00
10%            0.00
20%            0.00
30%            0.00
40%            0.00
50%            0.00
60%            0.00
70%            0.00
80%           10.00
90%           28.00
95%           40.00
98%           60.00
99%           69.00
max         1437.00
Name: scheduled_arrival_lag_min, dtype: float64

In [81]:
trips_routes_times3["actual_arrival_lag_min"].describe(percentiles)

count   10802913.00
mean           6.29
std           53.47
min        -1439.82
0.1%        -841.50
0.5%           0.00
1%             0.00
2%             0.00
5%             0.00
10%            0.00
20%            0.00
30%            0.00
40%            0.00
50%            0.00
60%            0.00
70%            0.00
80%            6.68
90%           26.40
95%           39.93
98%           60.92
99%           71.45
max         1442.10
Name: actual_arrival_lag_min, dtype: float64

In [82]:
preview_time_col = [
    "schedule_gtfs_dataset_key",
    "route_id",
    "stop_id",
    "stop_sequence",
    "converted_rt_arrival",
    "actual_arrival_lag_min",
    "converted_schd_arrival",
    "scheduled_arrival_lag_min",
]

### Many lags are actually empty b/c it's the first of that groupby-sequence.

In [83]:
trips_routes_times3[trips_routes_times3["scheduled_arrival_lag_min"].isna()][
    preview_time_col
].sample()

Unnamed: 0,schedule_gtfs_dataset_key,route_id,stop_id,stop_sequence,converted_rt_arrival,actual_arrival_lag_min,converted_schd_arrival,scheduled_arrival_lag_min
3348295,5456c80d420043e15c8eb7368a8a4d89,397,340058,47,2024-05-22 03:28:03,,2024-05-22 03:27:08,


In [84]:
# rt_stop_times4 = rt_stop_times4.fillna(0)

In [85]:
len(trips_routes_times3)

10982933

In [86]:
trips_routes_times3.to_parquet("./transit_bunching.parquet")

### Transit Matters Method

In [87]:
transit_matters_df1 = trips_routes_times3.copy()

In [88]:
transit_matters_df1["pct_actual_schd_headway"] = (
    transit_matters_df1.actual_arrival_lag_min
    / transit_matters_df1.scheduled_arrival_lag_min
)

In [89]:
transit_matters_df1["bunched_y_n"] = np.where(
    transit_matters_df1["pct_actual_schd_headway"] < 0.25, "bunched", "not bunched"
)

In [90]:
transit_matters_df1.bunched_y_n.value_counts() / len(transit_matters_df1)

not bunched   0.99
bunched       0.01
Name: bunched_y_n, dtype: float64

In [91]:
len(transit_matters_df1)

10982933

#### Aggregate.
* At this point, it doesn't matter the sequence, we just care about how bunched the traffic is around one partiuclar stop. 
* See how many trips for that grain are considered "bunched" or not.

In [92]:
def bunched_not_bunched(
    df: pd.DataFrame, bunched_y_n: str, groupby_cols: list
) -> pd.DataFrame:
    df2 = df.loc[df.bunched_y_n == bunched_y_n].reset_index(drop=True)

    bunched_y_n = bunched_y_n.replace(" ", "_")
    agg1 = (
        df2.groupby(groupby_cols).agg({"trip_instance_key": "nunique"}).reset_index()
    ).rename(columns={"trip_instance_key": f"{bunched_y_n}_trips"})
    return agg1

In [93]:
def agg_final_df(df: pd.DataFrame) -> pd.DataFrame:
    groupby_cols = [
        "caltrans_district",
        "schedule_gtfs_dataset_key",
        "feed_key",
        "organization_name",
        "route_long_name",
        "route_type",
        "route_id",
        "direction_id",
        "stop_id",
        "stop_sequence",
    ]

    # Find total trips that are bunched
    bunched = bunched_not_bunched(df, "bunched", groupby_cols)

    # Find total trips that are NOT bunched
    not_bunched = bunched_not_bunched(df, "not bunched", groupby_cols)

    # Merge
    m1 = pd.merge(not_bunched, bunched, on=groupby_cols, how="outer")

    # Find the % of bunched trips
    m1 = m1.fillna(0)
    m1["all_trips"] = m1.not_bunched_trips + m1.bunched_trips
    m1["per_trip_bunched_per_stop"] = m1.bunched_trips / m1.all_trips

    # Filter out any rows with only one trip of that groupby combo
    # for that service date
    m1 = m1.loc[m1.all_trips > 1].reset_index(drop=True)
    m1 = m1.drop(columns=["not_bunched_trips", "bunched_trips"])

    return m1

In [None]:
transit_matters_m1 = agg_final_df(transit_matters_df1)

In [None]:
# transit_matters_m1 = (
#   transit_matters_m1.sort_values(by=["all_trips"], ascending=False)
#   .drop_duplicates(subset=transit_matters_agg)
#   .reset_index(drop=True)
# )

### Help: Swapped order of a bus is messing with the transit matters metric.
* How to solve for this?? 

In [None]:
preview_cols = [
    "converted_rt_arrival",
    "actual_arrival_lag_min",
    "converted_schd_arrival",
    "scheduled_arrival_lag_min",
    "pct_actual_schd_headway",
    "bunched_y_n",
]

In [None]:
example2 = transit_matters_df1.loc[
    (transit_matters_df1.stop_id == "2307719")
    & (transit_matters_df1.organization_name == "City of Visalia")
    & (transit_matters_df1.route_id == "2042")
    & (transit_matters_df1.shape_array_key == "60da59c7000ea5dcb5f845d8fa227f14")
]

#### Starting row 33484: the RT Arrival time is swapped. A bus that was scheduled to arrive at 4:27 arrived boefre the bus arrived at 3:42.
* This repeats again row 33486.

In [None]:
example2[preview_cols]

### Use 2 minute benchmark
* [Source](https://static1.squarespace.com/static/533b9a24e4b01d79d0ae4376/t/645e82de1f570b31497c44dc/1683915486889/TransitMatters-Headwaymanagement.pdf)
* Justifying the use of
headway maintenance. For example, in April
2022 the 66 bus significantly bunched around
several stops. When bunching is defined as
buses that run within two minutes or less of
each other, inbound buses towards Nubian
Square bunched 10% of the time at Brigham
Circle, 9% at Brookline Village and Roxbury
Crossing, and 8% of the time at Coolidge
Corner. Bunching is even more dramatic
outbound towards Harvard Square where
buses bunched over 35% of the time at Winship
St, 13% at Coolidge Corner and Harvard Ave at
Commonwealth Ave, and 12% at North Harvard
St at Western Ave. View more data about bus
bunching through the TransitMatters Data
Dashboard here.

* To Do: add back in route  & operator information

In [None]:
two_minutes_df = trips_routes_times2.copy()

In [None]:
two_minutes_df["bunched_y_n"] = np.where(
    two_minutes_df["actual_arrival_lag_min"] <= 2, "bunched", "not bunched"
)

In [None]:
two_minutes_df.bunched_y_n.value_counts()

In [None]:
final_two_minute = agg_final_df(two_minutes_df)

In [None]:
final_two_minute.loc[
    (final_two_minute.stop_id == "2307695")
    & (final_two_minute.organization_name == "City of Visalia")
    & (final_two_minute.route_id == "2042")
]

### Comparing both outcomes
* There are so many more bunched trips for the 2 minute approach.
* Add back in schedule_gtfs_key and then grab stop level data from the warehouse.

In [None]:
final_two_minute.per_trip_bunched_per_stop.describe(percentiles)

In [None]:
transit_matters_m1.per_trip_bunched_per_stop.describe(percentiles)

### Make Visuals

In [None]:
freq_range = [
    "#ccbb44",
    "#e9d868",
    "#fcb40e",
    "#ff9c42",
    "#fc5c04",
    "#dd217d",
    "#dd217d",
    "#dd217d",
]

In [None]:
trips_routes_times2["hour"] = trips_routes_times2["converted_rt_arrival"].dt.hour
trips_routes_times2["min"] = trips_routes_times2["converted_rt_arrival"].dt.minute

In [None]:
trips_routes_times2.head(2)

In [None]:
def compare_approaches(
    stop_id: str, organization_name: str, route_id: str, stop_sequence: int
):
    transit_matter = transit_matters_m1.loc[
        (transit_matters_m1.stop_id == stop_id)
        & (transit_matters_m1.organization_name == organization_name)
        & (transit_matters_m1.route_id == route_id)
        & (transit_matters_m1.stop_sequence == stop_sequence)
    ]
    display(transit_matter)

    two_min = final_two_minute.loc[
        (final_two_minute.stop_id == stop_id)
        & (final_two_minute.organization_name == organization_name)
        & (final_two_minute.route_id == route_id)
        & (final_two_minute.stop_sequence == stop_sequence)
    ]

    display(two_min)
    total_trips = trips_routes_times2.loc[
        (trips_routes_times2.stop_id == stop_id)
        & (trips_routes_times2.organization_name == organization_name)
        & (trips_routes_times2.route_id == route_id)
        & (trips_routes_times2.stop_sequence == stop_sequence)
    ]

    display(total_trips.trip_instance_key.nunique())

    chart = (
        alt.Chart(total_trips)
        .mark_circle(size=500)
        .encode(
            x="hour",
            y="min",
            color=alt.Color(
                "hour",
                scale=alt.Scale(range=freq_range),
            ),
            tooltip=["hour", "min", "actual_arrival_lag_min"],
        )
        .properties(width=800, height=400)
    )
    display(chart)
    return total_trips

In [None]:
test1 = compare_approaches(
    stop_id="5685",
    organization_name="Los Angeles County Metropolitan Transportation Authority",
    route_id="204-13172",
    stop_sequence=46,
)

In [None]:
test2 = compare_approaches(
    stop_id="2307469",
    organization_name="City of Visalia",
    route_id="2042",
    stop_sequence=27,
)

In [None]:
test3 = compare_approaches(
    stop_id="3104",
    organization_name="Los Angeles County Metropolitan Transportation Authority",
    route_id="33-13172",
    stop_sequence=80,
)

In [None]:
test4 = compare_approaches(
    stop_id="15320",
    organization_name="Los Angeles County Metropolitan Transportation Authority",
    route_id="33-13172",
    stop_sequence=64,
)

In [None]:
test5 = compare_approaches(
    stop_id="3288014",
    organization_name="City of Thousand Oaks",
    route_id="3402",
    stop_sequence=16,
)

In [None]:
transit_matters_m1.sort_values(by=["per_trip_bunched_per_stop"], ascending=False)

### Make Maps 
* Think I actually need the vehicle positions since stops are literally the stop's geometry, so it'll always be plotting on the same spot.

* https://github.com/cal-itp/data-analyses/blob/db19b70329f1e817236bda13707dd903c24abb4c/_shared_utils/shared_utils/gtfs_utils_v2.py#L371
* https://github.com/cal-itp/data-analyses/blob/main/gtfs_funnel/download_stops.py

In [None]:
stop

In [None]:
# What is this file?
vps_gdf = gpd.read_parquet(
    "gs://calitp-analytics-data/data-analyses/rt_segment_speeds/vp_2024-05-22.parquet"
)

In [None]:
vps_gdf = vps_gdf[
    [
        "schedule_gtfs_dataset_key",
        "trip_instance_key",
        "location_timestamp_local",
        "geometry",
    ]
]

In [None]:
vps_df = vps_gdf[
    [
        "schedule_gtfs_dataset_key",
        "trip_instance_key",
        "location_timestamp_local",
    ]
]

In [None]:
vps_m1 = pd.merge(
    vps_df,
    trips_routes_times2,
    on=["schedule_gtfs_dataset_key", "trip_instance_key"],
    how="inner",
)

In [None]:
def one_stop(df: pd.DataFrame, stop_id: str, org_name: str, route_id: str):
    # Look at one route & stop
    test_route1 = df.loc[
        (df.organization_name == org_name)
        & (df.route_id == route_id)
        & (df.stop_id == stop_id)
    ]

    compare_approaches(stop_id=stop_id, organization_name=org_name, route_id=route_id)

    # display(test_route1.explore("time_int", marker_kwds = {'radius':20}))
    return test_route1

In [None]:
metro_test1 = one_stop(
    vps_m1,
    stop_id="5700",
    org_name="Los Angeles County Metropolitan Transportation Authority",
    route_id="204-13172",
)

In [None]:
metro_test1.shape

In [None]:
metro_test1.sample(1)

### Other
* https://www.sciencedirect.com/science/article/pii/S1366554523003666
* https://www.sciencedirect.com/science/article/pii/S0968090X22002492?ref=pdf_download&fr=RR-2&rr=8d7d6fb73d8015be