## Transit Bunching V2
* Incorporating Katrina and Eric's comments.
* cd data-analyses/rt_segment_speeds && pip install -r requirements.txt && cd ../_shared_utils && make setup_env && cd ../gtfs_digest
* [Issue](https://github.com/cal-itp/data-analyses/issues/1099)

In [1]:
import geopandas as gpd
import merge_data
import pandas as pd
from segment_speed_utils import gtfs_schedule_wrangling, helpers, time_series_utils
from shared_utils import catalog_utils, rt_dates, rt_utils
from update_vars import GTFS_DATA_DICT, RT_SCHED_GCS, SCHED_GCS

# https://github.com/cal-itp/data-analyses/blob/main/_shared_utils/shared_utils/gtfs_analytics_data.yml
GTFS_DATA_DICT = catalog_utils.get_catalog("gtfs_analytics_data")

from segment_speed_utils.project_vars import (
    COMPILED_CACHED_VIEWS,
    GTFS_DATA_DICT,
    PROJECT_CRS,
    RT_SCHED_GCS,
    SCHED_GCS,
    SEGMENT_GCS,
)

In [2]:
pd.options.display.max_columns = 100
pd.options.display.float_format = "{:.2f}".format
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [3]:
may_date = "2024-05-22"

In [4]:
drop_for_preview = [
    "schedule_gtfs_dataset_key",
    "trip_instance_key",
    "shape_array_key",
    "feed_key",
    "trip_id",
]

### Get routes with short headways
* Katrina: <i>but want to understand how the original column is calculated (over what time period). I would also count the agencies/organizations represented in that subset to see if it fits our preconceptions about which agencies run frequent routes. Also check mix of buses/trains.</i>
* Eric: <i>Once you do the 60 / frequency calculation, it’s not really a frequency any more but rather a headway. headway_minutes might be a better way to label it than frequency_in_minutes.</i>
* <b> Amanda: figure out how Tiffany calculated frequncy. I also forgot which dataframe identifies the route type.</b>

In [5]:
subset = [
    "schedule_gtfs_dataset_key",
    "route_id",
    "direction_id",
    "route_primary_direction",
    "service_date",
    "frequency",
]

In [6]:
GTFS_DATA_DICT.rt_vs_schedule_tables.sched_route_direction_metrics

'schedule_route_dir/schedule_route_direction_metrics'

In [130]:
route_dir = merge_data.concatenate_schedule_by_route_direction([may_date])

In [131]:
    route_dir.head()

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id,time_period,route_primary_direction,avg_scheduled_service_minutes,avg_stop_miles,n_scheduled_trips,frequency,is_express,is_rapid,is_rail,is_coverage,is_downtown_local,is_local,service_date
0,015d67d5b75b5cf2b710bbadadfb75f5,17,0.0,all_day,Northbound,51.77,0.27,22,0.92,0.0,0.0,0.0,0.0,1.0,0.0,2024-05-22
1,015d67d5b75b5cf2b710bbadadfb75f5,17,0.0,offpeak,Northbound,51.77,0.27,10,0.62,0.0,0.0,0.0,0.0,1.0,0.0,2024-05-22
2,015d67d5b75b5cf2b710bbadadfb75f5,17,0.0,peak,Northbound,51.77,0.27,12,1.5,0.0,0.0,0.0,0.0,1.0,0.0,2024-05-22
3,015d67d5b75b5cf2b710bbadadfb75f5,17,1.0,all_day,Southbound,46.73,0.28,22,0.92,0.0,1.0,0.0,0.0,1.0,0.0,2024-05-22
4,015d67d5b75b5cf2b710bbadadfb75f5,17,1.0,offpeak,Southbound,46.73,0.28,11,0.69,0.0,1.0,0.0,0.0,1.0,0.0,2024-05-22


In [8]:
route_dir["headway_minutes"] = 60 / route_dir.frequency

#### QUESTION: Should I use mean or median for finding routes that are high frequency?
* Find Median.

In [9]:
route_freq_groupby = [
    "schedule_gtfs_dataset_key",
    "route_id",
    "direction_id",
    "route_primary_direction",
]

In [10]:
high_frequency_routes = (
    route_dir.groupby(route_freq_groupby)
    .agg({"headway_minutes": "mean"})
    .reset_index()
    .rename(columns={"headway_minutes": "mean_headway_minutes"})
)

In [11]:
high_frequency_routes_median = (
    route_dir.groupby(route_freq_groupby)
    .agg({"headway_minutes": "median"})
    .reset_index()
    .rename(columns={"headway_minutes": "med_headway_minutes"})
)

In [12]:
mean_v_median = pd.merge(
    high_frequency_routes,
    high_frequency_routes_median,
    on=[
        "schedule_gtfs_dataset_key",
        "route_id",
        "direction_id",
        "route_primary_direction",
    ],
    suffixes=("_mean", "_median"),
)

In [13]:
mean_v_median["diff_btwn_mean_med"] = (
    mean_v_median.mean_headway_minutes - mean_v_median.med_headway_minutes
)

In [14]:
mean_v_median.head(2)

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id,route_primary_direction,mean_headway_minutes,med_headway_minutes,diff_btwn_mean_med
0,015d67d5b75b5cf2b710bbadadfb75f5,17,0.0,Northbound,67.33,65.22,2.11
1,015d67d5b75b5cf2b710bbadadfb75f5,17,1.0,Southbound,65.22,65.22,0.0


In [15]:
mean_v_median["diff_btwn_mean_med"].describe()

count   3417.00
mean       9.52
std       34.44
min       -5.30
25%       -0.23
50%        0.00
75%        1.63
max      291.59
Name: diff_btwn_mean_med, dtype: float64

In [16]:
mean_v_median["med_headway_minutes"].describe()

count   3417.00
mean     225.11
std      310.03
min        4.00
25%       46.51
50%       96.77
75%      240.00
max     1250.00
Name: med_headway_minutes, dtype: float64

In [17]:
mean_v_median["mean_headway_minutes"].describe()

count   3417.00
mean     234.64
std      312.42
min        4.00
25%       47.43
50%       97.71
75%      272.73
max     1250.00
Name: mean_headway_minutes, dtype: float64

In [18]:
mean_v_median.loc[mean_v_median.diff_btwn_mean_med > 291]

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id,route_primary_direction,mean_headway_minutes,med_headway_minutes,diff_btwn_mean_med
2141,c2a40ce92e76ec5beb88c40df3cd3a67,HCC Shuttle,0.0,Eastbound,367.54,75.95,291.59


In [19]:
mean_v_median.loc[mean_v_median.diff_btwn_mean_med > 1.63].shape

(860, 7)

In [20]:
# Grab Crosswalk
CROSSWALK = GTFS_DATA_DICT.schedule_tables.gtfs_key_crosswalk

In [21]:
crosswalk_cols = [
    "schedule_gtfs_dataset_key",
    "organization_name",
    "name",
    "caltrans_district",
]

In [22]:
crosswalk_df = (
    time_series_utils.concatenate_datasets_across_dates(
        SCHED_GCS, CROSSWALK, [may_date], data_type="df", columns=crosswalk_cols
    )
    .sort_values(["service_date"])
    .reset_index(drop=True)
)

In [23]:
crosswalk_df.shape

(168, 5)

#### Grab routes in the 5th percentile of frequency for now.
* Eric: <i>Taking the 5%ile (17.65min headway) is reasonable, but I suspect the worst bunching issues might be on routes with headways at/below the 10min mark? Maybe try 15 and 10 as well?</i>

In [24]:
high_frequency_routes_median["med_headway_minutes"].describe(
    percentiles=[0.05, 0.1, 0.9, 0.95]
)

count   3417.00
mean     225.11
std      310.03
min        4.00
5%        17.71
10%       23.62
50%       96.77
90%      750.00
95%     1000.00
max     1250.00
Name: med_headway_minutes, dtype: float64

In [25]:
high_frequency_routes2 = high_frequency_routes_median.loc[
    high_frequency_routes_median.med_headway_minutes <= 15
]

In [26]:
high_frequency_routes2.route_id.nunique()

69

In [27]:
high_frequency_routes2.head(1)

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id,route_primary_direction,med_headway_minutes
34,0666caf3ec1ecc96b74f4477ee4bc939,105-13172,0.0,Westbound,14.85


#### Attach operators and districts

In [28]:
high_frequency_routes2 = pd.merge(
    high_frequency_routes2, crosswalk_df, on="schedule_gtfs_dataset_key", how="left"
)

In [29]:
high_frequency_routes2.organization_name.unique()

array(['Los Angeles County Metropolitan Transportation Authority',
       'San Francisco International Airport',
       'City and County of San Francisco',
       'San Francisco Bay Area Rapid Transit District',
       'Flagship Cruises and Events Inc.',
       'Alameda-Contra Costa Transit District', 'City of Los Angeles',
       'Santa Clara Valley Transportation Authority'], dtype=object)

In [30]:
high_frequency_routes2.name.unique()

array(['LA Metro Bus Schedule', 'LA Metro Rail Schedule',
       'Bay Area 511 SFO AirTrain Schedule', 'Bay Area 511 Muni Schedule',
       'Bay Area 511 BART Schedule', 'San Diego Schedule',
       'Bay Area 511 AC Transit Schedule', 'LA DOT Schedule',
       'Bay Area 511 Santa Clara Transit Schedule'], dtype=object)

In [31]:
high_frequency_routes2.caltrans_district.unique()

array(['07 - Los Angeles', '04 - Oakland', '11 - San Diego'], dtype=object)

### Get trips of high frequency routes

In [32]:
TABLE = GTFS_DATA_DICT.schedule_downloads.trips

In [33]:
FILE = f"{COMPILED_CACHED_VIEWS}{TABLE}_{may_date}.parquet"

In [34]:
trips_subset = [
    "gtfs_dataset_key",
    "route_id",
    "trip_instance_key",
    "shape_array_key",
    "feed_key",
    "route_long_name",
    "direction_id",
]

In [35]:
trips = pd.read_parquet(FILE)[trips_subset].rename(
    columns={"gtfs_dataset_key": "schedule_gtfs_dataset_key"}
)

In [36]:
# Find only trips that belong to high frequency routes
trips_freq_routes = pd.merge(
    trips,
    high_frequency_routes2,
    on=["schedule_gtfs_dataset_key", "route_id", "direction_id"],
    how="inner",
)

In [37]:
trips_freq_routes.shape

(16017, 13)

### `rt_stop_times2`: Get Stop Times of High Frequency Routes/Trips
* What's the difference btwn `trip_id` and `trip_instance_key`?
* Eric: <i>trip_instance_key is created by our warehouse (see Columns section), and is a composite including trip_id , service date, and feed URL in order to uniquely identify a specific trip while allowing for joins across schedule+RT. It’s probably the one to use here, but personally I sometimes like keeping trip_id around for context.</i>
* Amanda: Drop `trip_id`.

In [38]:
rt_stop_times = pd.read_parquet(
    "gs://calitp-analytics-data/data-analyses/rt_vs_schedule/schedule_rt_stop_times_2024-05-22.parquet"
)

In [133]:
rt_stop_times.scheduled_arrival_sec.describe()

count   2583428.00
mean      49444.17
std       18080.08
min         720.00
25%       34080.00
50%       49200.00
75%       63180.00
max      111374.00
Name: scheduled_arrival_sec, dtype: float64

In [134]:
rt_stop_times.rt_arrival_sec.describe()

count   2601262.00
mean      48000.76
std       17998.55
min           0.00
25%       33209.00
50%       48232.00
75%       62385.00
max       86399.00
Name: rt_arrival_sec, dtype: float64

In [39]:
# How is it possible to have right_only trips?
pd.merge(
    rt_stop_times,
    trips_freq_routes,
    on=[
        "schedule_gtfs_dataset_key",
        "trip_instance_key",
    ],
    how="outer",
    indicator=True,
)[["_merge"]].value_counts()

_merge    
left_only     2045948
both           555314
right_only       2265
dtype: int64

In [40]:
# Find only stop times of trips that belong to high frequency trips
rt_stop_times2 = pd.merge(
    rt_stop_times,
    trips_freq_routes,
    on=[
        "schedule_gtfs_dataset_key",
        "trip_instance_key",
    ],
    how="inner",
)

In [41]:
rt_stop_times2.shape

(555314, 18)

In [42]:
rt_stop_times2.trip_id.nunique(), rt_stop_times2.trip_instance_key.nunique()

(13752, 13752)

###  `rt_stop_times3`: Some scheduled arrival seconds span longer than a day: filter them out
* Katrina: <i>I assume the scheduled arrival sec > 86400 are after midnight, don't need to throw these out. Does rt arrival sec behave the same way, or do you need to create a datetime?</i>
* Eric: <i>agree w/ Katrina’s comments on handling seconds around midnight, I don’t know the actual answer but if rt_arrival_sec does in fact always go to 0 at midnight instead of sometimes going >86400 when schedule does you could use the % operator on the scheduled value like scheduled_arrival_sec % 86400</i>
    * <b> Amanda: ask Eric again </b>.

In [43]:
rt_stop_times2["scheduled_arrival_sec2"] = rt_stop_times2[
    "scheduled_arrival_sec"
].apply(lambda x: x - 86400 if x > 86400 else x)

In [44]:
rt_stop_times2["diff_btw_sched"] = (
    rt_stop_times2["scheduled_arrival_sec2"] - rt_stop_times2["scheduled_arrival_sec"]
)

In [45]:
rt_stop_times2["diff_btw_sched"].describe()

count   555314.00
mean     -3751.53
std      17608.48
min     -86400.00
25%          0.00
50%          0.00
75%          0.00
max          0.00
Name: diff_btw_sched, dtype: float64

In [46]:
len(rt_stop_times2.loc[rt_stop_times2["diff_btw_sched"] < 0])

24112

In [47]:
(rt_stop_times2.loc[rt_stop_times2["diff_btw_sched"] < 0])[
    ["scheduled_arrival_sec", "rt_arrival_sec", "scheduled_arrival_sec2",]
].sample(5)

Unnamed: 0,scheduled_arrival_sec,rt_arrival_sec,scheduled_arrival_sec2
519550,100380.0,13968,13980.0
525600,87178.0,716,778.0
352155,89760.0,3336,3360.0
421267,103203.0,16899,16803.0
361780,92640.0,6450,6240.0


In [48]:
len(rt_stop_times2.loc[rt_stop_times2["diff_btw_sched"] == 0])

531202

In [49]:
rt_stop_times2.scheduled_arrival_sec.describe()

count   555314.00
mean     50556.86
std      19522.11
min       9420.00
25%      34158.00
50%      49740.00
75%      64346.00
max     108431.00
Name: scheduled_arrival_sec, dtype: float64

In [50]:
rt_stop_times2.scheduled_arrival_sec2.describe()

count   555314.00
mean     46805.33
std      19164.82
min          1.00
25%      31560.00
50%      47040.00
75%      61800.00
max      86400.00
Name: scheduled_arrival_sec2, dtype: float64

In [51]:
rt_stop_times2.rt_arrival_sec.describe()

count   555314.00
mean     46868.72
std      19213.66
min          0.00
25%      31698.00
50%      47077.00
75%      61982.00
max      86399.00
Name: rt_arrival_sec, dtype: float64

In [52]:
# rt_stop_times3 = rt_stop_times2.loc[
#    rt_stop_times2.scheduled_arrival_sec < 86400
# ].reset_index(drop=True)

In [53]:
# rt_stop_times3.scheduled_arrival_sec.describe()

### `rt_stop_times4`: Sort so `stop sequence` for the `operator-stop_id-route-id_direction_id` will be in order.
* Comparing bunching by STOP, so we have to look at the `stop sequence-stop_id.`
* Katrina: <i>Maybe you want to sort  by rt arrival seconds instead of scheduled?</i>

#### QUESTION: Go over sorting conundrum

In [54]:
# Rearrange: I want the stop sequence to be 1,2,3,4.
# stop ids can differ between trips of the same route and the same stop sequence is the same
rt_stop_times4 = rt_stop_times2.sort_values(
    by=[
        "schedule_gtfs_dataset_key",
        "route_id",
        "shape_array_key",
        "direction_id",
        "stop_sequence",
        "rt_arrival_sec",
    ]
).reset_index(drop=True)

In [55]:
# Make sure sorting is right
fillmore_stop_seq_13 = rt_stop_times4.loc[
    (rt_stop_times4.shape_array_key == "1b678a66d0009c55bc573cfc37aa1029")
    & (rt_stop_times4.stop_id == "13086")
    & (rt_stop_times4.direction_id == 0)
    & (rt_stop_times4.stop_sequence == 13)
]

#### QUESTION: Keeping rows that are scheduled to arrive past midnight...becomes confusing? Is this leaving at 11 the night before the servie date and arriving at 12 am on the service date? 
* RT Arrival Sec: If I add 84,000 to this, then it becomes 86,000 which means it is around on time.


In [56]:
fillmore_stop_seq_13[["route_id", "scheduled_arrival_sec", "rt_arrival_sec"]]

Unnamed: 0,route_id,scheduled_arrival_sec,rt_arrival_sec
403920,22,88699.0,2669
403921,22,89299.0,3151
403922,22,67199.0,67051
403923,22,69106.0,69048
403924,22,69466.0,69723
403925,22,70006.0,70421
403926,22,72992.0,73064
403927,22,74432.0,75434
403928,22,75872.0,75943


### Calculate the difference btwn actual vs scheduled arrival.

In [57]:
def check_delay(df):
    df = df.assign(delay=df.rt_arrival_sec - df.scheduled_arrival_sec2)

    print(df.delay.describe(percentiles=[0.05, 0.1, 0.9, 0.95]))

    max_delay_min = df.delay.max() / 60
    p95_delay_min = df.delay.quantile(q=0.95) / 60

    min_delay_min = df.delay.min() / 60
    p5_delay_min = df.delay.quantile(q=0.05) / 60

    print(f"min / max delay (minutes):{min_delay_min:.2f},{min_delay_min:.2f}")
    print(f"5th / 95th delay (minutes):{p5_delay_min:.2f}, {p95_delay_min:.2f}")

    return df

In [58]:
# 709 is about 12 hours
rt_stop_times4 = check_delay(rt_stop_times4)

count   555314.00
mean        63.39
std       3088.41
min     -86400.00
5%        -164.00
10%       -106.00
50%         86.00
90%        507.00
95%        709.00
max      86338.00
Name: delay, dtype: float64
min / max delay (minutes):-1440.00,-1440.00
5th / 95th delay (minutes):-2.73, 11.82


In [59]:
709 / 60

11.816666666666666

In [60]:
len(rt_stop_times4.loc[rt_stop_times4.delay > 709])

27765

In [61]:
(rt_stop_times4.loc[rt_stop_times4.delay > 709])['delay'].describe()

count   27765.00
mean     1251.34
std      3846.55
min       710.00
25%       797.00
50%       925.00
75%      1162.00
max     86338.00
Name: delay, dtype: float64

In [62]:
(rt_stop_times4.loc[rt_stop_times4.delay > 709])['diff_btw_sched'].describe()

count    27765.00
mean     -3491.47
std      17014.21
min     -86400.00
25%          0.00
50%          0.00
75%          0.00
max          0.00
Name: diff_btw_sched, dtype: float64

In [63]:
len(rt_stop_times4.loc[rt_stop_times4.delay <0])

172326

In [64]:
len(rt_stop_times4)

555314

In [65]:
# The bus actually arrived 1 minute early
rt_stop_times4.loc[rt_stop_times4.delay == 86338.00][
    ["scheduled_arrival_sec", "rt_arrival_sec", "scheduled_arrival_sec2", "delay", "diff_btw_sched"]
]

Unnamed: 0,scheduled_arrival_sec,rt_arrival_sec,scheduled_arrival_sec2,delay,diff_btw_sched
494269,86402.0,86340,2.0,86338.0,-86400.0


In [66]:
rt_stop_times4.loc[(rt_stop_times4.delay > 709) & (rt_stop_times4.diff_btw_sched < 0)].sample(10)[
    ["scheduled_arrival_sec", "rt_arrival_sec", "scheduled_arrival_sec2", "delay", "diff_btw_sched"]
]

Unnamed: 0,scheduled_arrival_sec,rt_arrival_sec,scheduled_arrival_sec2,delay,diff_btw_sched
463993,103080.0,17469,16680.0,789.0,-86400.0
414112,92367.0,6698,5967.0,731.0,-86400.0
392026,88450.0,2863,2050.0,813.0,-86400.0
396693,92686.0,7001,6286.0,715.0,-86400.0
223636,91500.0,5914,5100.0,814.0,-86400.0
307138,101820.0,16237,15420.0,817.0,-86400.0
392933,97091.0,11455,10691.0,764.0,-86400.0
12792,91080.0,5391,4680.0,711.0,-86400.0
445130,89791.0,4280,3391.0,889.0,-86400.0
392255,89502.0,3813,3102.0,711.0,-86400.0


In [67]:
(rt_stop_times4.loc[rt_stop_times4.delay <0]).sample(10)[
    ["scheduled_arrival_sec", "rt_arrival_sec", "scheduled_arrival_sec2", "delay", "diff_btw_sched"]
]

Unnamed: 0,scheduled_arrival_sec,rt_arrival_sec,scheduled_arrival_sec2,delay,diff_btw_sched
11026,55740.0,55432,55740.0,-308.0,0.0
195900,41220.0,41180,41220.0,-40.0,0.0
445983,56388.0,56138,56388.0,-250.0,0.0
227989,24540.0,24171,24540.0,-369.0,0.0
342131,22860.0,22854,22860.0,-6.0,0.0
219943,59220.0,59168,59220.0,-52.0,0.0
306683,101880.0,15310,15480.0,-170.0,-86400.0
50692,46140.0,46052,46140.0,-88.0,0.0
454575,79310.0,79297,79310.0,-13.0,0.0
274452,51180.0,51148,51180.0,-32.0,0.0


### `rt_stop_times5`: Filter out values in `delay` that are in the 1 hour zone
* Actual times should not exceed more than an hour or less than hour.
* Katrina: <i>I am not sure if you need to throw out ">1 hour delay" trips, the customer experience we're interested in is actual wait times between stop arrivals</i>
    

#### QUESTION: forgot why Tiffany does this but she generally throws out delays that are ~one hour. I will clarify w/ her again but should we follow this convention if she does indeed throw away stuff?</b>.

In [68]:
# Filter to only delays that are an hour or less
rt_stop_times5 = rt_stop_times4[rt_stop_times4["delay"] <= 3600].reset_index(drop=True)

In [69]:
# Filter to only delays that are no less than
rt_stop_times5 = rt_stop_times5[rt_stop_times5["delay"] >= -3600].reset_index(drop=True)

In [70]:
rt_stop_times5.shape

(554338, 21)

In [71]:
rt_stop_times4.shape

(555314, 21)

In [72]:
len(rt_stop_times4) - len(rt_stop_times5)

976

### Calculate the actual headway the `operator-route-direction_id-stop_sequence-stop_id-` grain
#### QUESTION: Do I need to include feed key and shape array key? What is `feed_key` and how does it differ from `schedule_gtfs_dataset_key`? Still need help</b>

### Calculate scheduled headway
* Using the same grain.

In [73]:
groupby_cols = [
    "schedule_gtfs_dataset_key",
    "route_id",
    "shape_array_key",
    "direction_id",
    "route_primary_direction",
    "stop_sequence",
    "stop_id",
]

In [74]:
# Subtract rt_arrival_sec from the previous row to the target row
# using groupby columns
rt_stop_times5["actual_headway"] = rt_stop_times5.groupby(groupby_cols)[
    "rt_arrival_sec"
].diff()

In [76]:
rt_stop_times5["schd_headway"] = rt_stop_times5.groupby(groupby_cols)[
    "scheduled_arrival_sec"
].diff()

In [77]:
rt_stop_times5.head(10)[['scheduled_arrival_sec','rt_arrival_sec','delay','actual_headway','schd_headway']]

Unnamed: 0,scheduled_arrival_sec,rt_arrival_sec,delay,actual_headway,schd_headway
0,14400.0,14421,21.0,,
1,16140.0,16165,25.0,1744.0,1740.0
2,17280.0,17429,149.0,1264.0,1140.0
3,18660.0,18682,22.0,1253.0,1380.0
4,19800.0,19812,12.0,1130.0,1140.0
5,23040.0,23293,253.0,3481.0,3240.0
6,24660.0,24688,28.0,1395.0,1620.0
7,25740.0,26041,301.0,1353.0,1080.0
8,50400.0,50361,-39.0,24320.0,24660.0
9,73200.0,73604,404.0,23243.0,22800.0


### Fill in `nans` with 0 
* I am not sure if `nans` impact calculations of the mean scheduled headway and whatnot?
* These `nans` are because the first `operator-route-stop_id-stop_sequence` combo won't have anything to compare it to.
* Katrina: <i>I would fill in the actual/schedule headway columns with 0 rather than dropping the first row  in each grouping. I wonder if it makes sense to use a more descriptive column name than headway, such as "minutes since last vehicle"</i>

In [78]:
rt_stop_times5 = rt_stop_times5.fillna(0)

### Transit Matters Method
* To Do: add back in route  & operator information

In [79]:
transit_matters_df1 = rt_stop_times5.copy()

In [80]:
transit_matters_df1["pct_actual_schd_headway"] = (
    transit_matters_df1.actual_headway / transit_matters_df1.schd_headway
)

In [81]:
import numpy as np

transit_matters_df1["bunched_y_n"] = np.where(
    transit_matters_df1["pct_actual_schd_headway"] < 0.25, "bunched", "not bunched"
)

#### There are some very extreme values: how to deal with this?


In [82]:
transit_matters_df1.pct_actual_schd_headway.describe()

count   536162.00
mean         0.92
std          0.62
min        -24.71
25%          0.77
50%          0.98
75%          1.15
max         10.62
Name: pct_actual_schd_headway, dtype: float64

In [83]:
len(transit_matters_df1.loc[transit_matters_df1.pct_actual_schd_headway < 0])

18560

In [84]:
transit_matters_df1.bunched_y_n.value_counts() / len(transit_matters_df1)

not bunched   0.93
bunched       0.07
Name: bunched_y_n, dtype: float64

In [85]:
sf_38r_test = transit_matters_df1.loc[
    (transit_matters_df1.stop_id == "14295")
    & (
        transit_matters_df1.schedule_gtfs_dataset_key
        == "7cc0cb1871dfd558f11a2885c145d144"
    )
    & (transit_matters_df1.stop_sequence == 11)
    & (transit_matters_df1.route_id == "38R")
]

In [86]:
# Row 444797: scheduled to arrive at 69480, but actually arrives 69890

#### QUESTION: Wonder if I should convert time stamps to hours so it's at least in military time instead of seconds? Although I'm not really sure if this is sound.

In [87]:
sf_38r_test["sched_arrival_min"] = sf_38r_test.scheduled_arrival_sec / 60

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sf_38r_test["sched_arrival_min"] = sf_38r_test.scheduled_arrival_sec / 60


In [88]:
sf_38r_test["rt_arrival_min"] = sf_38r_test.rt_arrival_sec / 60

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sf_38r_test["rt_arrival_min"] = sf_38r_test.rt_arrival_sec / 60


In [89]:
sf_38r_test["actual_headway_min"] = sf_38r_test.actual_headway / 60

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sf_38r_test["actual_headway_min"] = sf_38r_test.actual_headway / 60


In [90]:
sf_38r_test["schd_headway_min"] = sf_38r_test.schd_headway / 60

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sf_38r_test["schd_headway_min"] = sf_38r_test.schd_headway / 60


In [91]:
sf_38r_test[
    [
        "sched_arrival_min",
        "rt_arrival_min",
        "actual_headway_min",
        "schd_headway_min",
        "pct_actual_schd_headway",
    ]
].tail(5)

Unnamed: 0,sched_arrival_min,rt_arrival_min,actual_headway_min,schd_headway_min,pct_actual_schd_headway
453525,1290.0,1293.37,15.28,10.0,1.53
453526,1314.0,1311.37,18.0,24.0,0.75
453527,1302.0,1315.93,4.57,-12.0,-0.38
453528,1326.0,1324.37,8.43,24.0,0.35
453529,1338.0,1351.82,27.45,12.0,2.29


* Row 466475 was scheduled to arrive after row 466476

#### Groupby grain and see how many trips for that grain are considered "bunched" or not.

In [92]:
transit_matters_df2 = (
    transit_matters_df1.groupby(
        [
            "schedule_gtfs_dataset_key",
            "route_long_name",
            "shape_array_key",
            "route_id",
            "stop_id",
            "direction_id",
            "route_primary_direction",
            "bunched_y_n",
        ]
    )
    .agg({"trip_instance_key": "nunique"})
    .reset_index()
)

In [93]:
# Filter out only rows that are bunched.
bunched_only = transit_matters_df2.loc[
    transit_matters_df2.bunched_y_n == "bunched"
].reset_index(drop=True)

In [94]:
bunched_only = bunched_only.rename(columns={"trip_instance_key": "bunched_trips"})

In [95]:
transit_matters_agg = [
    "schedule_gtfs_dataset_key",
    "route_long_name",
    "shape_array_key",
    "route_id",
    "stop_id",
    "direction_id",
    "route_primary_direction",
]

In [96]:
# Aggregate all trips on the grain
transit_matters_all_trips = (
    transit_matters_df1.groupby(transit_matters_agg)
    .agg({"trip_instance_key": "nunique"})
    .reset_index()
    .rename(columns={"trip_instance_key": "all_trips"})
)

In [97]:
# Merge back, using left merge to keep bunching
bunched_only = pd.merge(
    bunched_only, transit_matters_all_trips, on=transit_matters_agg, how="left"
)

In [98]:
bunched_only["pct_trips_bunched"] = (
    bunched_only.bunched_trips / bunched_only.all_trips * 100
)

In [99]:
bunched_only = bunched_only.drop(columns=["all_trips"])

In [100]:
# Merge back all rows that don't have bunching trips.
transit_matters_m1 = pd.merge(
    transit_matters_all_trips,
    bunched_only,
    on=transit_matters_agg,
    how="left",
)

In [101]:
transit_matters_m1 = transit_matters_m1.drop(
    columns=[
        "bunched_y_n",
    ]
)

In [102]:
transit_matters_m1.pct_trips_bunched = transit_matters_m1.pct_trips_bunched.fillna(0)

In [103]:
transit_matters_m1.pct_trips_bunched.describe()

count   17919.00
mean        5.95
std         9.08
min         0.00
25%         0.00
50%         1.92
75%         9.09
max        66.67
Name: pct_trips_bunched, dtype: float64

In [104]:
transit_matters_m1.loc[transit_matters_m1.pct_trips_bunched >= 10].shape

(4085, 10)

In [105]:
transit_matters_m1.loc[
    (transit_matters_m1.schedule_gtfs_dataset_key == "7cc0cb1871dfd558f11a2885c145d144")
    & (transit_matters_m1.shape_array_key == "955e2fc8f9f8a4be2c67c7212be874f6")
    & (transit_matters_m1.route_id == "1")
    & (transit_matters_m1.direction_id == 1)
    & (transit_matters_m1.stop_id == "13853")
]

Unnamed: 0,schedule_gtfs_dataset_key,route_long_name,shape_array_key,route_id,stop_id,direction_id,route_primary_direction,all_trips,bunched_trips,pct_trips_bunched
13989,7cc0cb1871dfd558f11a2885c145d144,CALIFORNIA,955e2fc8f9f8a4be2c67c7212be874f6,1,13853,1.0,Eastbound,7,2.0,28.57


In [106]:
bunched_only.loc[
    (bunched_only.schedule_gtfs_dataset_key == "7cc0cb1871dfd558f11a2885c145d144")
    & (bunched_only.shape_array_key == "955e2fc8f9f8a4be2c67c7212be874f6")
    & (bunched_only.route_id == "1")
    & (bunched_only.direction_id == 1)
    & (bunched_only.stop_id == "13853")
]

Unnamed: 0,schedule_gtfs_dataset_key,route_long_name,shape_array_key,route_id,stop_id,direction_id,route_primary_direction,bunched_y_n,bunched_trips,pct_trips_bunched
7048,7cc0cb1871dfd558f11a2885c145d144,CALIFORNIA,955e2fc8f9f8a4be2c67c7212be874f6,1,13853,1.0,Eastbound,bunched,2,28.57


### Use 2 minute benchmark
* [Source](https://static1.squarespace.com/static/533b9a24e4b01d79d0ae4376/t/645e82de1f570b31497c44dc/1683915486889/TransitMatters-Headwaymanagement.pdf)
* Justifying the use of
headway maintenance. For example, in April
2022 the 66 bus significantly bunched around
several stops. When bunching is defined as
buses that run within two minutes or less of
each other, inbound buses towards Nubian
Square bunched 10% of the time at Brigham
Circle, 9% at Brookline Village and Roxbury
Crossing, and 8% of the time at Coolidge
Corner. Bunching is even more dramatic
outbound towards Harvard Square where
buses bunched over 35% of the time at Winship
St, 13% at Coolidge Corner and Harvard Ave at
Commonwealth Ave, and 12% at North Harvard
St at Western Ave. View more data about bus
bunching through the TransitMatters Data
Dashboard here.

* To Do: add back in route  & operator information

In [107]:
two_minutess_df = rt_stop_times5.copy()

In [108]:
two_minutess_df.columns

Index(['trip_id', 'stop_id', 'stop_sequence', 'scheduled_arrival_sec',
       'schedule_gtfs_dataset_key', 'trip_instance_key', 'rt_arrival_sec',
       'route_id', 'shape_array_key', 'feed_key', 'route_long_name',
       'direction_id', 'route_primary_direction', 'med_headway_minutes',
       'organization_name', 'name', 'caltrans_district', 'service_date',
       'scheduled_arrival_sec2', 'diff_btw_sched', 'delay', 'actual_headway',
       'schd_headway'],
      dtype='object')

In [109]:
two_minutess_df["actual_headway_min"] = two_minutess_df.rt_arrival_sec / 60

In [110]:
two_minutess_df["bunched_y_n"] = np.where(
    two_minutess_df["actual_headway_min"] <= 2, "bunched", "not bunched"
)

In [111]:
two_minutess_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 554338 entries, 0 to 554337
Data columns (total 25 columns):
 #   Column                     Non-Null Count   Dtype         
---  ------                     --------------   -----         
 0   trip_id                    554338 non-null  object        
 1   stop_id                    554338 non-null  object        
 2   stop_sequence              554338 non-null  int64         
 3   scheduled_arrival_sec      554338 non-null  float64       
 4   schedule_gtfs_dataset_key  554338 non-null  object        
 5   trip_instance_key          554338 non-null  object        
 6   rt_arrival_sec             554338 non-null  int64         
 7   route_id                   554338 non-null  object        
 8   shape_array_key            554338 non-null  object        
 9   feed_key                   554338 non-null  object        
 10  route_long_name            554338 non-null  object        
 11  direction_id               554338 non-null  float64 

In [112]:
two_minutess_df.bunched_y_n.value_counts()

not bunched    554197
bunched           141
Name: bunched_y_n, dtype: int64

#### Same code as Transit Matters Approach

In [113]:
two_minutes_agg1 = (
    two_minutess_df.groupby(
        [
            "schedule_gtfs_dataset_key",
            "route_long_name",
            "shape_array_key",
            "route_id",
            "stop_id",
            "direction_id",
            "route_primary_direction",
            "bunched_y_n",
        ]
    )
    .agg({"trip_instance_key": "nunique"})
    .reset_index()
)

In [114]:
bunched_only_two_min = (
    two_minutes_agg1.loc[two_minutes_agg1.bunched_y_n == "bunched"]
    .reset_index(drop=True)
    .rename(columns={"trip_instance_key": "bunched_trips"})
)

In [115]:
# I want to do a left merge because I'm only interested in trips that bunched.
bunched_only_two_min = pd.merge(
    bunched_only_two_min,
    transit_matters_all_trips,
    on=[
        "schedule_gtfs_dataset_key",
        "route_long_name",
        "shape_array_key",
        "route_id",
        "stop_id",
        "direction_id",
        "route_primary_direction",
    ],
    how="left",
)

In [116]:
bunched_only_two_min["pct_trips_bunched"] = (
    bunched_only_two_min.bunched_trips / bunched_only_two_min.all_trips * 100
)

In [117]:
bunched_only_two_min = bunched_only_two_min.drop(columns=["all_trips"])

In [118]:
bunched_only_two_min.head(2)

Unnamed: 0,schedule_gtfs_dataset_key,route_long_name,shape_array_key,route_id,stop_id,direction_id,route_primary_direction,bunched_y_n,bunched_trips,pct_trips_bunched
0,0666caf3ec1ecc96b74f4477ee4bc939,Metro G Line 901,bb7dcb0d51bb9674c1d17ba7c9347ec1,901-13172,15611,1.0,Westbound,bunched,1,1.09
1,0666caf3ec1ecc96b74f4477ee4bc939,Metro J Line 910/950,7b368939dbc2096ce3c978982b035a4b,910-13172,13460,0.0,Northbound,bunched,1,1.03


In [119]:
# Need to do a left merge on all trips for the stops that don't have bunching.
final_two_minute = pd.merge(
    transit_matters_all_trips,
    bunched_only_two_min,
    on=[
        "schedule_gtfs_dataset_key",
        "route_long_name",
        "shape_array_key",
        "route_id",
        "stop_id",
        "direction_id",
        "route_primary_direction",
    ],
    how="left",
)

In [120]:
final_two_minute.shape

(17919, 11)

In [121]:
final_two_minute = final_two_minute.drop(columns=["bunched_y_n"])

In [122]:
final_two_minute = final_two_minute.fillna(0)

In [123]:
final_two_minute.head()

Unnamed: 0,schedule_gtfs_dataset_key,route_long_name,shape_array_key,route_id,stop_id,direction_id,route_primary_direction,all_trips,bunched_trips,pct_trips_bunched
0,0666caf3ec1ecc96b74f4477ee4bc939,Metro G Line 901,12530c16e07a519c8a8543d487f26ade,901-13172,15313,0.0,Eastbound,92,0.0,0.0
1,0666caf3ec1ecc96b74f4477ee4bc939,Metro G Line 901,12530c16e07a519c8a8543d487f26ade,901-13172,15416,0.0,Eastbound,87,0.0,0.0
2,0666caf3ec1ecc96b74f4477ee4bc939,Metro G Line 901,12530c16e07a519c8a8543d487f26ade,901-13172,15432,0.0,Eastbound,90,0.0,0.0
3,0666caf3ec1ecc96b74f4477ee4bc939,Metro G Line 901,12530c16e07a519c8a8543d487f26ade,901-13172,15436,0.0,Eastbound,92,0.0,0.0
4,0666caf3ec1ecc96b74f4477ee4bc939,Metro G Line 901,12530c16e07a519c8a8543d487f26ade,901-13172,15453,0.0,Eastbound,90,0.0,0.0


In [124]:
bunched = final_two_minute.loc[final_two_minute.pct_trips_bunched != 0]

In [125]:
bunched.all_trips.describe()

count   139.00
mean     46.29
std      43.01
min       1.00
25%      10.50
50%      29.00
75%      82.50
max     144.00
Name: all_trips, dtype: float64

In [126]:
bunched.loc[
    (bunched.schedule_gtfs_dataset_key == "7cc0cb1871dfd558f11a2885c145d144")
    & (bunched.shape_array_key == "955e2fc8f9f8a4be2c67c7212be874f6")
    & (bunched.route_id == "1")
    & (bunched.direction_id == 1)
    & (bunched.stop_id == "13853")
]

Unnamed: 0,schedule_gtfs_dataset_key,route_long_name,shape_array_key,route_id,stop_id,direction_id,route_primary_direction,all_trips,bunched_trips,pct_trips_bunched
13989,7cc0cb1871dfd558f11a2885c145d144,CALIFORNIA,955e2fc8f9f8a4be2c67c7212be874f6,1,13853,1.0,Eastbound,7,2.0,28.57


In [127]:
rt_stop_times5.loc[
    (rt_stop_times5.schedule_gtfs_dataset_key == "7cc0cb1871dfd558f11a2885c145d144")
    & (rt_stop_times5.shape_array_key == "955e2fc8f9f8a4be2c67c7212be874f6")
    & (rt_stop_times5.route_id == "1")
    & (rt_stop_times5.direction_id == 1)
    & (rt_stop_times5.stop_id == "13853")
][["scheduled_arrival_sec2", "rt_arrival_sec", "actual_headway", "schd_headway"]]

Unnamed: 0,scheduled_arrival_sec2,rt_arrival_sec,actual_headway,schd_headway
387874,78.0,49,0.0,0.0
387875,258.0,67,18.0,180.0
387876,71746.0,71920,71853.0,-14912.0
387877,72586.0,72522,602.0,840.0
387878,74598.0,74823,2301.0,2012.0
387879,76038.0,76135,1312.0,1440.0
387880,81738.0,81099,4964.0,5700.0


In [128]:
bunched.sort_values(by=["pct_trips_bunched"], ascending=False)

Unnamed: 0,schedule_gtfs_dataset_key,route_long_name,shape_array_key,route_id,stop_id,direction_id,route_primary_direction,all_trips,bunched_trips,pct_trips_bunched
17637,c499f905e33929a641f083dad55c521e,Uptown Oakland\ San Leandro BART Southbound OWL,26c9a0d239e56a4c303de0561bd6f1e9,1T,52574,1.0,Westbound,1,1.0,100.0
15276,7cc0cb1871dfd558f11a2885c145d144,JUDAH,4313b87d56efa7346875951c43b08094,N,15731,1.0,Eastbound,3,1.0,33.33
7839,0666caf3ec1ecc96b74f4477ee4bc939,Metro Local Line,90589eaf624d193bfc62ca073761724f,60-13172,12702,1.0,Southbound,3,1.0,33.33
7861,0666caf3ec1ecc96b74f4477ee4bc939,Metro Local Line,90589eaf624d193bfc62ca073761724f,60-13172,140842,1.0,Southbound,3,1.0,33.33
13989,7cc0cb1871dfd558f11a2885c145d144,CALIFORNIA,955e2fc8f9f8a4be2c67c7212be874f6,1,13853,1.0,Eastbound,7,2.0,28.57
13988,7cc0cb1871dfd558f11a2885c145d144,CALIFORNIA,955e2fc8f9f8a4be2c67c7212be874f6,1,13846,1.0,Eastbound,8,2.0,25.0
6404,0666caf3ec1ecc96b74f4477ee4bc939,Metro Local Line,755c581d818a061f93e6d3bf998bfa23,18-13172,7367,0.0,Eastbound,4,1.0,25.0
447,0666caf3ec1ecc96b74f4477ee4bc939,Metro Local Line,0688a14c97a2ebfe90f5674c1262d741,217-13172,13414,1.0,Southbound,4,1.0,25.0
455,0666caf3ec1ecc96b74f4477ee4bc939,Metro Local Line,0688a14c97a2ebfe90f5674c1262d741,217-13172,1438,1.0,Southbound,4,1.0,25.0
2954,0666caf3ec1ecc96b74f4477ee4bc939,Metro Local Line,33604cd768d576d5a8c87112fb4ca942,33-13172,13968,1.0,Westbound,5,1.0,20.0
