## Transit Bunching 
* `cd data-analyses/rt_segment_speeds && pip install -r requirements.txt && cd ../_shared_utils && make setup_env && cd ../gtfs_digest`
* [Issue](https://github.com/cal-itp/data-analyses/issues/1099)
### 11/8
* Figure out how to address City of Visalia: one of the buses that is scheduled to arrive earlier arrives later than another bus. 
* This leads to a negative time stamp and makes it appear like there is a lot of bunching per the Transit Matters approach.



In [1]:
import datetime as dt

import altair as alt
import geopandas as gpd
import merge_data
import numpy as np
import pandas as pd
from segment_speed_utils import gtfs_schedule_wrangling, helpers, time_series_utils
from shared_utils import catalog_utils, rt_dates, rt_utils
from update_vars import GTFS_DATA_DICT, RT_SCHED_GCS, SCHED_GCS

# https://github.com/cal-itp/data-analyses/blob/main/_shared_utils/shared_utils/gtfs_analytics_data.yml
GTFS_DATA_DICT = catalog_utils.get_catalog("gtfs_analytics_data")

from segment_speed_utils.project_vars import (
    COMPILED_CACHED_VIEWS,
    GTFS_DATA_DICT,
    PROJECT_CRS,
    RT_SCHED_GCS,
    SCHED_GCS,
    SEGMENT_GCS,
)

In [2]:
pd.options.display.max_columns = 100
pd.options.display.float_format = "{:.2f}".format
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [3]:
may_date = "2024-05-22"

In [4]:
drop_for_preview = [
    "schedule_gtfs_dataset_key",
    "trip_instance_key",
    "shape_array_key",
    "feed_key",
    "trip_id",
]

### Grab Routes

In [5]:
subset = [
    "schedule_gtfs_dataset_key",
    "route_id",
    "direction_id",
    "route_primary_direction",
    "service_date",
    "frequency",
]

In [6]:
route_dir_columns = [
    "schedule_gtfs_dataset_key",
    "route_id",
    "direction_id",
    "time_period",
    "route_primary_direction",
    "frequency",
    "service_date",
]

In [7]:
route_dir = merge_data.concatenate_schedule_by_route_direction([may_date])[
    route_dir_columns
]

In [8]:
route_dir.head()

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id,time_period,route_primary_direction,frequency,service_date
0,015d67d5b75b5cf2b710bbadadfb75f5,17,0.0,all_day,Northbound,0.92,2024-05-22
1,015d67d5b75b5cf2b710bbadadfb75f5,17,0.0,offpeak,Northbound,0.62,2024-05-22
2,015d67d5b75b5cf2b710bbadadfb75f5,17,0.0,peak,Northbound,1.5,2024-05-22
3,015d67d5b75b5cf2b710bbadadfb75f5,17,1.0,all_day,Southbound,0.92,2024-05-22
4,015d67d5b75b5cf2b710bbadadfb75f5,17,1.0,offpeak,Southbound,0.69,2024-05-22


In [9]:
# Filter for only all_day
route_dir = route_dir.loc[route_dir.time_period == "all_day"].reset_index(drop=True)

In [10]:
len(route_dir)

3417

#### Attach operators and districts

In [11]:
# Grab Crosswalk
CROSSWALK = GTFS_DATA_DICT.schedule_tables.gtfs_key_crosswalk

In [12]:
crosswalk_cols = [
    "schedule_gtfs_dataset_key",
    "organization_name",
    "name",
    "caltrans_district",
]

In [13]:
crosswalk_df = (
    time_series_utils.concatenate_datasets_across_dates(
        SCHED_GCS, CROSSWALK, [may_date], data_type="df", columns=crosswalk_cols
    )
    .sort_values(["service_date"])
    .reset_index(drop=True)
)

In [14]:
crosswalk_df.head(2)

Unnamed: 0,schedule_gtfs_dataset_key,organization_name,name,caltrans_district,service_date
0,1770249a5a2e770ca90628434d4934b1,Ventura County Transportation Commission,VCTC GMV Schedule,07 - Los Angeles,2024-05-22
1,f8102a9c0693206bf36d302540bf1bcf,City of Corona,Corona Schedule,08 - San Bernardino,2024-05-22


In [15]:
crosswalk_df.shape

(189, 5)

In [16]:
routes = pd.merge(
    route_dir,
    crosswalk_df,
    on=["schedule_gtfs_dataset_key", "service_date"],
    how="left",
)

In [17]:
# routes = pd.concat([thousand_oaks, visalia, metro, metro_33])

In [18]:
len(routes)

4922

### Add Trips

In [19]:
TABLE = GTFS_DATA_DICT.schedule_downloads.trips

In [20]:
FILE = f"{COMPILED_CACHED_VIEWS}{TABLE}_{may_date}.parquet"

In [21]:
trips_subset = [
    "gtfs_dataset_key",
    "route_id",
    "trip_instance_key",
    "shape_array_key",
    "feed_key",
    "route_long_name",
    "direction_id",
    "route_type",
]

In [22]:
trips = pd.read_parquet(FILE)[trips_subset].rename(
    columns={"gtfs_dataset_key": "schedule_gtfs_dataset_key"}
)

In [23]:
trips_routes = pd.merge(
    trips,
    routes,
    on=["schedule_gtfs_dataset_key", "route_id", "direction_id"],
    how="inner",
)

In [24]:
trips_routes.route_id.nunique()

1338

#### Help - I know we can get this from the warehouse but it seems cumbersome. Correct me if I'm wrong.

In [25]:
# https://gtfs.org/documentation/schedule/reference/#
route_type_crosswalk = {
    "route_type": ["0", "1", "2", "3", "4", "5", "6", "7", "11", "12"],
    "route_type_str": [
        "Tram, Streetcar, Light rail",
        "Subway, Metro",
        "Rail",
        "Bus",
        "Ferry.",
        "Cable tram.",
        "Aerial lift, suspended cable car (e.g., gondola lift, aerial tramway).",
        "Funicular.",
        "Trolleybus.",
        "Monorail.",
    ],
}

In [26]:
route_type_crosswalk_df = pd.DataFrame(route_type_crosswalk)

In [27]:
# Merge for route_type
trips_routes = pd.merge(
    trips_routes, route_type_crosswalk_df, on=["route_type"], how="left"
)

In [28]:
trips_routes = trips_routes.drop(columns=["route_type"])

In [29]:
trips_routes = trips_routes.rename(columns={"route_type_str": "route_type"})

In [30]:
trips_routes.head(1)

Unnamed: 0,schedule_gtfs_dataset_key,route_id,trip_instance_key,shape_array_key,feed_key,route_long_name,direction_id,time_period,route_primary_direction,frequency,service_date,organization_name,name,caltrans_district,route_type
0,1770249a5a2e770ca90628434d4934b1,3408,c256553e28c4bba693e3136240b35419,8f644f847e987de68e0cb6fcd339cf41,926867fdee73d5fbfe4f011871bcd830,Route 21,0.0,all_day,Westbound,1.17,2024-05-22,Ventura County Transportation Commission,VCTC GMV Schedule,07 - Los Angeles,Bus


### Get Stop Times 

In [31]:
rt_stop_times = pd.read_parquet(
    "gs://calitp-analytics-data/data-analyses/rt_vs_schedule/schedule_rt_stop_times_2024-05-22.parquet"
)

In [32]:
rt_stop_times.head(2)

Unnamed: 0,trip_id,stop_id,stop_sequence,scheduled_arrival_sec,schedule_gtfs_dataset_key,trip_instance_key,rt_arrival_sec
0,1d105244-776c-4b3f-af78-9c7ad78c2103,0b2443b6-b50f-452b-a749-464588ca93b8,8,60991.0,1fd2f07342d966919b15d5d37fda8cc8,45ae17540ca9fb5030c84dbb12e48e9a,61434
1,1d105244-776c-4b3f-af78-9c7ad78c2103,cd5650b0-9a18-4e78-aedc-385f3094fa0f,9,61179.0,1fd2f07342d966919b15d5d37fda8cc8,45ae17540ca9fb5030c84dbb12e48e9a,61616


In [33]:
rt_stop_times.shape

(2601262, 7)

In [34]:
len(rt_stop_times.drop_duplicates())

2601262

In [35]:
trips_routes_times = pd.merge(
    rt_stop_times,
    trips_routes,
    on=[
        "schedule_gtfs_dataset_key",
        "trip_instance_key",
    ],
    how="inner",
)

In [36]:
(trips_routes_times.scheduled_arrival_sec.isna().sum())

15058

#### Lots of duplicates??

In [37]:
len(trips_routes_times)

3687798

In [38]:
len(trips_routes_times.drop_duplicates())

3073590

In [39]:
trips_routes_times2 = trips_routes_times.drop_duplicates().reset_index(drop=True)

### Question: How Granular?
* San Diego example.

In [40]:
subset = [
    "service_date",
    "caltrans_district",
    "schedule_gtfs_dataset_key",
    "feed_key",
    "organization_name",
    "shape_array_key",
    "route_long_name",
    "route_type",
    "route_id",
    "direction_id",
    "stop_id",
    "stop_sequence",
    "rt_arrival_sec",
    "scheduled_arrival_sec",
    "trip_instance_key"
]

In [41]:
trips_routes_times3 = trips_routes_times2[subset]

In [42]:
trips_routes_times4 = trips_routes_times3.sort_values(
    by=[
        "schedule_gtfs_dataset_key",
        "shape_array_key",
        "route_id",
        "direction_id",
        "stop_id",
        "stop_sequence",
        "rt_arrival_sec",
    ],
    ascending=[True, True, True, True, True, True, True],
).reset_index(drop=True)

#### San Diego: Switches `shape_array_key` often.

In [43]:
sd_test = trips_routes_times4.loc[
    (
        trips_routes_times4.schedule_gtfs_dataset_key
        == "fb467982dcc77a7f9199bebe709bb700"
    )
    & (trips_routes_times4.route_id == "BlueN")
    & (trips_routes_times4.stop_id == "64744")
]

In [44]:
sd_test2 = trips_routes_times4.loc[
    (
        trips_routes_times4.schedule_gtfs_dataset_key
        == "fb467982dcc77a7f9199bebe709bb700"
    )
    & (trips_routes_times4.route_id == "BlueN")
    & (trips_routes_times4.direction_id == 0)
]

In [45]:
sd_test2.shape_array_key.nunique()

3

In [46]:
sd_test2

Unnamed: 0,service_date,caltrans_district,schedule_gtfs_dataset_key,feed_key,organization_name,shape_array_key,route_long_name,route_type,route_id,direction_id,stop_id,stop_sequence,rt_arrival_sec,scheduled_arrival_sec,trip_instance_key
2931295,2024-05-22,04 - Oakland,fb467982dcc77a7f9199bebe709bb700,3fccf089909fbdb3d725a5c15fb062cb,Santa Clara Valley Transportation Authority,03882e60f629e3697b060114b4c3df23,Baypointe - Santa Teresa,"Tram, Streetcar, Light rail",BlueN,0.0,64744,2,1895,88181.0,f64d646206a4d18e5ce341a55815806a
2931296,2024-05-22,04 - Oakland,fb467982dcc77a7f9199bebe709bb700,3fccf089909fbdb3d725a5c15fb062cb,Santa Clara Valley Transportation Authority,03882e60f629e3697b060114b4c3df23,Baypointe - Santa Teresa,"Tram, Streetcar, Light rail",BlueN,0.0,64744,2,86043,85601.0,c1afed6ea4ec26c09e856f7d2bc3b5dc
2931297,2024-05-22,04 - Oakland,fb467982dcc77a7f9199bebe709bb700,3fccf089909fbdb3d725a5c15fb062cb,Santa Clara Valley Transportation Authority,03882e60f629e3697b060114b4c3df23,Baypointe - Santa Teresa,"Tram, Streetcar, Light rail",BlueN,0.0,64745,3,2078,88320.0,f64d646206a4d18e5ce341a55815806a
2931298,2024-05-22,04 - Oakland,fb467982dcc77a7f9199bebe709bb700,3fccf089909fbdb3d725a5c15fb062cb,Santa Clara Valley Transportation Authority,03882e60f629e3697b060114b4c3df23,Baypointe - Santa Teresa,"Tram, Streetcar, Light rail",BlueN,0.0,64745,3,86147,85740.0,c1afed6ea4ec26c09e856f7d2bc3b5dc
2931299,2024-05-22,04 - Oakland,fb467982dcc77a7f9199bebe709bb700,3fccf089909fbdb3d725a5c15fb062cb,Santa Clara Valley Transportation Authority,03882e60f629e3697b060114b4c3df23,Baypointe - Santa Teresa,"Tram, Streetcar, Light rail",BlueN,0.0,64746,4,2224,88440.0,f64d646206a4d18e5ce341a55815806a
2931300,2024-05-22,04 - Oakland,fb467982dcc77a7f9199bebe709bb700,3fccf089909fbdb3d725a5c15fb062cb,Santa Clara Valley Transportation Authority,03882e60f629e3697b060114b4c3df23,Baypointe - Santa Teresa,"Tram, Streetcar, Light rail",BlueN,0.0,64746,4,86256,85860.0,c1afed6ea4ec26c09e856f7d2bc3b5dc
2931301,2024-05-22,04 - Oakland,fb467982dcc77a7f9199bebe709bb700,3fccf089909fbdb3d725a5c15fb062cb,Santa Clara Valley Transportation Authority,03882e60f629e3697b060114b4c3df23,Baypointe - Santa Teresa,"Tram, Streetcar, Light rail",BlueN,0.0,64747,5,2355,88574.0,f64d646206a4d18e5ce341a55815806a
2931302,2024-05-22,04 - Oakland,fb467982dcc77a7f9199bebe709bb700,3fccf089909fbdb3d725a5c15fb062cb,Santa Clara Valley Transportation Authority,03882e60f629e3697b060114b4c3df23,Baypointe - Santa Teresa,"Tram, Streetcar, Light rail",BlueN,0.0,64747,5,86369,85994.0,c1afed6ea4ec26c09e856f7d2bc3b5dc
2931303,2024-05-22,04 - Oakland,fb467982dcc77a7f9199bebe709bb700,3fccf089909fbdb3d725a5c15fb062cb,Santa Clara Valley Transportation Authority,03882e60f629e3697b060114b4c3df23,Baypointe - Santa Teresa,"Tram, Streetcar, Light rail",BlueN,0.0,64748,6,76,86160.0,c1afed6ea4ec26c09e856f7d2bc3b5dc
2931304,2024-05-22,04 - Oakland,fb467982dcc77a7f9199bebe709bb700,3fccf089909fbdb3d725a5c15fb062cb,Santa Clara Valley Transportation Authority,03882e60f629e3697b060114b4c3df23,Baypointe - Santa Teresa,"Tram, Streetcar, Light rail",BlueN,0.0,64748,6,2479,88740.0,f64d646206a4d18e5ce341a55815806a


#### Another San Diego Route
* Why is the `stop_id` and `stop_sequence` flip flopping?

In [47]:
sd_test3 = trips_routes_times4.loc[
    (trips_routes_times4.organization_name == "San Diego Metropolitan Transit System")
    & (trips_routes_times4.route_id == "834")
    & (trips_routes_times4.direction_id == 0)
]

In [48]:
sd_test3.shape_array_key.nunique()

1

In [49]:
sd_test3.direction_id.nunique()

1

In [50]:
sd_test3.stop_id.nunique()

20

In [51]:
sd_test3.shape

(62, 15)

##### Why does the number skip??

In [52]:
sd_test3

Unnamed: 0,service_date,caltrans_district,schedule_gtfs_dataset_key,feed_key,organization_name,shape_array_key,route_long_name,route_type,route_id,direction_id,stop_id,stop_sequence,rt_arrival_sec,scheduled_arrival_sec,trip_instance_key
1978668,2024-05-22,11 - San Diego,baeeb157e85a901e47b828ef9fe75091,db8c6e0cf5ece2a8cdb5bdc71d049bd1,San Diego Metropolitan Transit System,d3fdb491652ec5aa4313a64aa1c91080,West Santee Loop,Bus,834,0.0,40173,4,29344,28200.0,bd604c3ac34b6665b6e8f9e98aea14d5
1978671,2024-05-22,11 - San Diego,baeeb157e85a901e47b828ef9fe75091,db8c6e0cf5ece2a8cdb5bdc71d049bd1,San Diego Metropolitan Transit System,d3fdb491652ec5aa4313a64aa1c91080,West Santee Loop,Bus,834,0.0,40173,4,32750,31860.0,374d1596c1010a2b9e089b99eed5cda8
1978674,2024-05-22,11 - San Diego,baeeb157e85a901e47b828ef9fe75091,db8c6e0cf5ece2a8cdb5bdc71d049bd1,San Diego Metropolitan Transit System,d3fdb491652ec5aa4313a64aa1c91080,West Santee Loop,Bus,834,0.0,40173,4,49842,49920.0,cf50cec4440e8082c458f56b077d9c22
1978677,2024-05-22,11 - San Diego,baeeb157e85a901e47b828ef9fe75091,db8c6e0cf5ece2a8cdb5bdc71d049bd1,San Diego Metropolitan Transit System,d3fdb491652ec5aa4313a64aa1c91080,West Santee Loop,Bus,834,0.0,40173,4,53523,53520.0,43c788bb56c937fdfe424fde647379dc
1978680,2024-05-22,11 - San Diego,baeeb157e85a901e47b828ef9fe75091,db8c6e0cf5ece2a8cdb5bdc71d049bd1,San Diego Metropolitan Transit System,d3fdb491652ec5aa4313a64aa1c91080,West Santee Loop,Bus,834,0.0,40259,9,32164,32220.0,374d1596c1010a2b9e089b99eed5cda8
1978683,2024-05-22,11 - San Diego,baeeb157e85a901e47b828ef9fe75091,db8c6e0cf5ece2a8cdb5bdc71d049bd1,San Diego Metropolitan Transit System,d3fdb491652ec5aa4313a64aa1c91080,West Santee Loop,Bus,834,0.0,40259,9,35716,35820.0,974a3571e3567999e8e359a207163d67
1978686,2024-05-22,11 - San Diego,baeeb157e85a901e47b828ef9fe75091,db8c6e0cf5ece2a8cdb5bdc71d049bd1,San Diego Metropolitan Transit System,d3fdb491652ec5aa4313a64aa1c91080,West Santee Loop,Bus,834,0.0,40382,8,32124,32160.0,374d1596c1010a2b9e089b99eed5cda8
1978689,2024-05-22,11 - San Diego,baeeb157e85a901e47b828ef9fe75091,db8c6e0cf5ece2a8cdb5bdc71d049bd1,San Diego Metropolitan Transit System,d3fdb491652ec5aa4313a64aa1c91080,West Santee Loop,Bus,834,0.0,40382,8,35668,35760.0,974a3571e3567999e8e359a207163d67
1978692,2024-05-22,11 - San Diego,baeeb157e85a901e47b828ef9fe75091,db8c6e0cf5ece2a8cdb5bdc71d049bd1,San Diego Metropolitan Transit System,d3fdb491652ec5aa4313a64aa1c91080,West Santee Loop,Bus,834,0.0,40400,5,23956,24060.0,8332e5b584fb71def8b2a47e0ecdf086
1978695,2024-05-22,11 - San Diego,baeeb157e85a901e47b828ef9fe75091,db8c6e0cf5ece2a8cdb5bdc71d049bd1,San Diego Metropolitan Transit System,d3fdb491652ec5aa4313a64aa1c91080,West Santee Loop,Bus,834,0.0,40400,5,28337,28260.0,bd604c3ac34b6665b6e8f9e98aea14d5


#### Seeing how many `shape_array_keys` correspond with different combinations

In [53]:
shape_array_counts = (
    trips_routes_times4.groupby(
        ["organization_name", "schedule_gtfs_dataset_key", "route_id", "direction_id"]
    )
    .agg({"shape_array_key": "nunique"})
    .reset_index()
    .sort_values(by=["shape_array_key"], ascending=False)
)

In [54]:
shape_array_counts.head(20)

Unnamed: 0,organization_name,schedule_gtfs_dataset_key,route_id,direction_id,shape_array_key
1769,Los Angeles County Metropolitan Transportation Authority,0666caf3ec1ecc96b74f4477ee4bc939,10-13172,0.0,11
3213,Santa Cruz Metropolitan Transit District,43d8d305ee692724a532f30ea63a1cbe,35,1.0,11
1926,Los Angeles County Metropolitan Transportation Authority,0666caf3ec1ecc96b74f4477ee4bc939,51-13172,1.0,11
1783,Los Angeles County Metropolitan Transportation Authority,0666caf3ec1ecc96b74f4477ee4bc939,115-13172,0.0,9
1778,Los Angeles County Metropolitan Transportation Authority,0666caf3ec1ecc96b74f4477ee4bc939,108-13172,1.0,9
1782,Los Angeles County Metropolitan Transportation Authority,0666caf3ec1ecc96b74f4477ee4bc939,111-13172,1.0,9
1914,Los Angeles County Metropolitan Transportation Authority,0666caf3ec1ecc96b74f4477ee4bc939,4-13172,1.0,9
1781,Los Angeles County Metropolitan Transportation Authority,0666caf3ec1ecc96b74f4477ee4bc939,111-13172,0.0,8
3208,Santa Cruz Metropolitan Transit District,43d8d305ee692724a532f30ea63a1cbe,17,1.0,8
1925,Los Angeles County Metropolitan Transportation Authority,0666caf3ec1ecc96b74f4477ee4bc939,51-13172,0.0,8


#### Other Tests

In [55]:
la_test = trips_routes_times4.loc[
    (
        trips_routes_times4.schedule_gtfs_dataset_key
        == "0666caf3ec1ecc96b74f4477ee4bc939"
    )
    & (trips_routes_times4.route_id == "204-13172")
    & (trips_routes_times4.direction_id == 1)
]

In [56]:
sf_test = trips_routes_times4.loc[
    (
        trips_routes_times4.schedule_gtfs_dataset_key
        == "7cc0cb1871dfd558f11a2885c145d144"
    )
    & (trips_routes_times4.route_id == "30")
    & (trips_routes_times4.direction_id == 1)
]

In [57]:
sf_test.shape

(3206, 15)

In [58]:
sf_test.stop_sequence.nunique()

33

In [59]:
sf_test.shape_array_key.nunique()

7

In [60]:
sf_test.loc[sf_test.stop_id=="18027"]

Unnamed: 0,service_date,caltrans_district,schedule_gtfs_dataset_key,feed_key,organization_name,shape_array_key,route_long_name,route_type,route_id,direction_id,stop_id,stop_sequence,rt_arrival_sec,scheduled_arrival_sec,trip_instance_key
1217497,2024-05-22,04 - Oakland,7cc0cb1871dfd558f11a2885c145d144,7f69c2fdaa134642f14064a0b64d1495,City and County of San Francisco,1bf60afebc995e8a4adbc7effd4dd47e,STOCKTON,Bus,30,1.0,18027,2,22557,22184.0,b77b7f7d198fe7f9f1abd5a2058b2222
1217498,2024-05-22,04 - Oakland,7cc0cb1871dfd558f11a2885c145d144,7f69c2fdaa134642f14064a0b64d1495,City and County of San Francisco,1bf60afebc995e8a4adbc7effd4dd47e,STOCKTON,Bus,30,1.0,18027,2,23398,23039.0,cfe7c0260c51629829e6d7cd4ef87781
1217499,2024-05-22,04 - Oakland,7cc0cb1871dfd558f11a2885c145d144,7f69c2fdaa134642f14064a0b64d1495,City and County of San Francisco,1bf60afebc995e8a4adbc7effd4dd47e,STOCKTON,Bus,30,1.0,18027,2,24219,23819.0,c00665255fb6af8158ae0ba218f78643
1217500,2024-05-22,04 - Oakland,7cc0cb1871dfd558f11a2885c145d144,7f69c2fdaa134642f14064a0b64d1495,City and County of San Francisco,1bf60afebc995e8a4adbc7effd4dd47e,STOCKTON,Bus,30,1.0,18027,2,24773,24719.0,6e806f7c42d5adb04c1a7121bed23230
1217501,2024-05-22,04 - Oakland,7cc0cb1871dfd558f11a2885c145d144,7f69c2fdaa134642f14064a0b64d1495,City and County of San Francisco,1bf60afebc995e8a4adbc7effd4dd47e,STOCKTON,Bus,30,1.0,18027,2,25934,25499.0,6a9fb998b5cc89dd7826b6fe520f1c92
1217502,2024-05-22,04 - Oakland,7cc0cb1871dfd558f11a2885c145d144,7f69c2fdaa134642f14064a0b64d1495,City and County of San Francisco,1bf60afebc995e8a4adbc7effd4dd47e,STOCKTON,Bus,30,1.0,18027,2,26771,26279.0,f3edb62bac832b3caf1783e7307062b7
1217503,2024-05-22,04 - Oakland,7cc0cb1871dfd558f11a2885c145d144,7f69c2fdaa134642f14064a0b64d1495,City and County of San Francisco,1bf60afebc995e8a4adbc7effd4dd47e,STOCKTON,Bus,30,1.0,18027,2,27531,27059.0,7c48c497b722cec206ec85435f705195
1217504,2024-05-22,04 - Oakland,7cc0cb1871dfd558f11a2885c145d144,7f69c2fdaa134642f14064a0b64d1495,City and County of San Francisco,1bf60afebc995e8a4adbc7effd4dd47e,STOCKTON,Bus,30,1.0,18027,2,27953,27899.0,ee38c7dd28fd020b1b9012c90e13b7f1
1217505,2024-05-22,04 - Oakland,7cc0cb1871dfd558f11a2885c145d144,7f69c2fdaa134642f14064a0b64d1495,City and County of San Francisco,1bf60afebc995e8a4adbc7effd4dd47e,STOCKTON,Bus,30,1.0,18027,2,29942,29414.0,f071473d99cc96fc9b3286a077a3c3d8
1217506,2024-05-22,04 - Oakland,7cc0cb1871dfd558f11a2885c145d144,7f69c2fdaa134642f14064a0b64d1495,City and County of San Francisco,1bf60afebc995e8a4adbc7effd4dd47e,STOCKTON,Bus,30,1.0,18027,2,30935,30854.0,21b673594f32e9784f7b050c18616e36


In [61]:
la_test.shape_array_key.nunique()

1

### Convert scheduled and RT arrival times.

In [62]:
trips_routes_times3["rt_arrival_sec"].describe()

count   3073590.00
mean      48047.93
std       17776.31
min           0.00
25%       33386.00
50%       48221.00
75%       62248.00
max       86399.00
Name: rt_arrival_sec, dtype: float64

In [63]:
trips_routes_times3["scheduled_arrival_sec"].describe()

count   3058532.00
mean      49252.10
std       17823.17
min         720.00
25%       34140.00
50%       49020.00
75%       62880.00
max      111374.00
Name: scheduled_arrival_sec, dtype: float64

In [64]:
trips_routes_times3.loc[trips_routes_times3["scheduled_arrival_sec"] >= 86_400].shape

(46947, 15)

In [65]:
# all_columns = trips_routes_times2.copy()

In [66]:
trips_routes_times4 = trips_routes_times3[subset]

In [67]:
trips_routes_times4["converted_rt_arrival"] = pd.to_datetime(
    trips_routes_times4["service_date"]
) + pd.to_timedelta(trips_routes_times4["rt_arrival_sec"] % 86400, unit="s")

In [68]:
trips_routes_times4["converted_schd_arrival"] = pd.to_datetime(
    trips_routes_times4["service_date"]
) + pd.to_timedelta(trips_routes_times4["scheduled_arrival_sec"] % 86400, unit="s")

### Deal with  extreme values through delays.

In [69]:
300 / 60

5.0

In [70]:
percentiles = [0.01, 0.02, 0.05, 0.1, 0.9, 0.95, 0.98, 0.99]

In [71]:
trips_routes_times4["delay_min"] = (
    trips_routes_times4["converted_rt_arrival"]
    - trips_routes_times4["converted_schd_arrival"]
).dt.total_seconds() / 60

In [72]:
print(trips_routes_times4.delay_min.describe(percentiles))

count   3058532.00
mean          2.01
std          33.54
min       -1439.78
1%           -5.29
2%           -3.88
5%           -2.47
10%          -1.52
50%           1.45
90%           7.68
95%          10.87
98%          15.83
99%          20.27
max        1439.98
Name: delay_min, dtype: float64


In [73]:
trips_routes_times4["converted_rt_arrival"] = np.where(
    trips_routes_times4["delay_min"] >= 600,
    trips_routes_times4["converted_rt_arrival"] - pd.Timedelta(days=1),
    trips_routes_times4["converted_rt_arrival"],
)

In [74]:
trips_routes_times4["converted_schd_arrival"] = np.where(
    trips_routes_times4["delay_min"] <= -600,
    trips_routes_times4["converted_schd_arrival"] - pd.Timedelta(days=1),
    trips_routes_times4["converted_schd_arrival"],
)

In [75]:
trips_routes_times4["converted_schd_arrival"] = np.where(
    trips_routes_times4["delay_min"] >= 600,
    trips_routes_times4["converted_schd_arrival"] + pd.Timedelta(days=1),
    trips_routes_times4["converted_schd_arrival"],
)

In [76]:
trips_routes_times4["converted_rt_arrival"] = np.where(
    trips_routes_times4["delay_min"] <= -600,
    trips_routes_times4["converted_rt_arrival"] + pd.Timedelta(days=1),
    trips_routes_times4["converted_rt_arrival"],
)

In [77]:
# Recalc delay_min
trips_routes_times4["delay_min"] = (
    trips_routes_times4["converted_rt_arrival"]
    - trips_routes_times4["converted_schd_arrival"]
).dt.total_seconds() / 60

In [78]:
# Have to do this twice for some reason.
trips_routes_times4["converted_rt_arrival"] = np.where(
    trips_routes_times4["delay_min"] <= -600,
    trips_routes_times4["converted_rt_arrival"] + pd.Timedelta(days=1),
    trips_routes_times4["converted_rt_arrival"],
)

In [79]:
# Recalc delay_min
trips_routes_times4["delay_min"] = (
    trips_routes_times4["converted_rt_arrival"]
    - trips_routes_times4["converted_schd_arrival"]
).dt.total_seconds() / 60

#### Question: How to know when to just throw away rows?

In [80]:
print(trips_routes_times4.delay_min.describe(percentiles))

count   3058532.00
mean          3.22
std          34.06
min        -839.98
1%           -5.18
2%           -3.85
5%           -2.45
10%          -1.52
50%           1.45
90%           7.70
95%          10.90
98%          15.93
99%          20.53
max        2277.53
Name: delay_min, dtype: float64


In [81]:
trips_routes_times4.loc[trips_routes_times4.delay_min > 2277]

Unnamed: 0,service_date,caltrans_district,schedule_gtfs_dataset_key,feed_key,organization_name,shape_array_key,route_long_name,route_type,route_id,direction_id,stop_id,stop_sequence,rt_arrival_sec,scheduled_arrival_sec,trip_instance_key,converted_rt_arrival,converted_schd_arrival,delay_min
2891062,2024-05-22,11 - San Diego,baeeb157e85a901e47b828ef9fe75091,db8c6e0cf5ece2a8cdb5bdc71d049bd1,Flagship Cruises and Events Inc.,ef09d95f35ccfd5a696eae1ea220b615,Santee - 12th & Imperial,"Tram, Streetcar, Light rail",530,0.0,75044,11,49892,86040.0,24a5c851c69af5a1bd292c27d44ab6a3,2024-05-23 13:51:32,2024-05-21 23:54:00,2277.53
2891063,2024-05-22,11 - San Diego,baeeb157e85a901e47b828ef9fe75091,db8c6e0cf5ece2a8cdb5bdc71d049bd1,San Diego International Airport,ef09d95f35ccfd5a696eae1ea220b615,Santee - 12th & Imperial,"Tram, Streetcar, Light rail",530,0.0,75044,11,49892,86040.0,24a5c851c69af5a1bd292c27d44ab6a3,2024-05-23 13:51:32,2024-05-21 23:54:00,2277.53
2891064,2024-05-22,11 - San Diego,baeeb157e85a901e47b828ef9fe75091,db8c6e0cf5ece2a8cdb5bdc71d049bd1,San Diego Metropolitan Transit System,ef09d95f35ccfd5a696eae1ea220b615,Santee - 12th & Imperial,"Tram, Streetcar, Light rail",530,0.0,75044,11,49892,86040.0,24a5c851c69af5a1bd292c27d44ab6a3,2024-05-23 13:51:32,2024-05-21 23:54:00,2277.53


In [82]:
trips_routes_times4.loc[trips_routes_times4.delay_min <= -839]

Unnamed: 0,service_date,caltrans_district,schedule_gtfs_dataset_key,feed_key,organization_name,shape_array_key,route_long_name,route_type,route_id,direction_id,stop_id,stop_sequence,rt_arrival_sec,scheduled_arrival_sec,trip_instance_key,converted_rt_arrival,converted_schd_arrival,delay_min
216772,2024-05-22,04 - Oakland,b82a23bef8a501e980c086ef269ffec7,ed7a212f2a38fd8734244030b40e4d07,City of Union City,21dbb49fc4b8b008453a1bd6447f466f,Tamarack,Bus,4,1.0,3537271,7,58141,22140.0,e1f161b232c2de4d42e3de5b8658ba5f,2024-05-22 16:09:01,2024-05-23 06:09:00,-839.98


### Question: Last time, I received the suggestion to throw away things more than 2 hrs because that is not bunching. But wouldn't we want all rows to be included for this metric? 
* Filter out for just "extreme" values.

In [83]:
trips_routes_times4 = trips_routes_times4.loc[
    trips_routes_times4.delay_min < 120
].reset_index(drop=True)

In [84]:
trips_routes_times4 = trips_routes_times4.loc[
    trips_routes_times4.delay_min > -120
].reset_index(drop=True)

In [85]:
len(trips_routes_times3)

3073590

In [86]:
len(trips_routes_times4)

3055635

In [87]:
trips_routes_times5 = trips_routes_times4.drop(
    columns=[
        "service_date",
        "delay_min",
    ]
)

In [88]:
# Sort again?

### Calculate the actual & scheduled headway the `operator-route-direction_id-stop_sequence-stop_id-` grain

In [89]:
groupby_cols = [
    "caltrans_district",
    "schedule_gtfs_dataset_key",
    "feed_key",
    "organization_name",
    "route_id",
    "route_long_name",
    "route_type",
    "shape_array_key",
    "direction_id",
    "stop_id",
    "stop_sequence",
]

In [90]:
# trips_routes_times5["actual_arrival_lag_min"] = trips_routes_times5.groupby(groupby_cols)[
#    "converted_rt_arrival"
# ].transform(lambda x: (x - x.shift()).dt.total_seconds() / 60)

In [91]:
# trips_routes_times5["scheduled_arrival_lag_min"] = trips_routes_times5.groupby(groupby_cols)[
#    "converted_schd_arrival"
# ].transform(lambda x: (x - x.shift()).dt.total_seconds() / 60)

In [92]:
trips_routes_times5["actual_arrival_lag_min"] = (
    trips_routes_times5.groupby(groupby_cols)["converted_rt_arrival"]
    .diff()
    .dt.total_seconds()
    / 60
)

In [93]:
trips_routes_times5["scheduled_arrival_lag_min"] = (
    trips_routes_times5.groupby(groupby_cols)["converted_schd_arrival"]
    .diff()
    .dt.total_seconds()
    / 60
)

#### Check San Diego

In [94]:
sd_test = trips_routes_times5.loc[
    (trips_routes_times5.organization_name == "San Diego Metropolitan Transit System")
    & (trips_routes_times5.route_id == "834")
]

In [95]:
sd_test.shape

(62, 18)

In [96]:
sd_test.shape_array_key.nunique()

1

In [97]:
trips_routes_times5["scheduled_arrival_lag_min"].describe(percentiles)

count   2835545.00
mean          8.22
std         326.66
min       -1430.00
1%         -882.00
2%         -756.48
5%         -586.00
10%        -420.00
50%          30.00
90%         400.00
95%         572.00
98%         738.00
99%         848.23
max        1425.00
Name: scheduled_arrival_lag_min, dtype: float64

In [98]:
trips_routes_times5["actual_arrival_lag_min"].describe(percentiles)

count   2835545.00
mean          8.24
std         327.00
min       -1435.78
1%         -882.72
2%         -757.53
5%         -586.85
10%        -419.47
50%          31.17
90%         400.52
95%         572.43
98%         738.65
99%         848.52
max        1434.42
Name: actual_arrival_lag_min, dtype: float64

### Checking out some extreme values

In [99]:
preview_time_col = [
    "schedule_gtfs_dataset_key",
    "route_id",
    "stop_id",
    "stop_sequence",
    "converted_rt_arrival",
    "actual_arrival_lag_min",
    "converted_schd_arrival",
    "scheduled_arrival_lag_min",
]

### Many lags are actually empty b/c it's the first of that groupby-sequence.

### Transit Matters Method

In [100]:
transit_matters_df1 = trips_routes_times5.copy()

In [101]:
transit_matters_df1["pct_actual_schd_headway"] = (
    transit_matters_df1.actual_arrival_lag_min
    / transit_matters_df1.scheduled_arrival_lag_min
)

In [102]:
transit_matters_df1["bunched_y_n"] = np.where(
    transit_matters_df1["pct_actual_schd_headway"] < 0.25, "bunched", "not bunched"
)

In [103]:
transit_matters_df1.bunched_y_n.value_counts() / len(transit_matters_df1)

not bunched   1.00
bunched       0.00
Name: bunched_y_n, dtype: float64

In [104]:
transit_matters_df1.head(1)

Unnamed: 0,caltrans_district,schedule_gtfs_dataset_key,feed_key,organization_name,shape_array_key,route_long_name,route_type,route_id,direction_id,stop_id,stop_sequence,rt_arrival_sec,scheduled_arrival_sec,trip_instance_key,converted_rt_arrival,converted_schd_arrival,actual_arrival_lag_min,scheduled_arrival_lag_min,pct_actual_schd_headway,bunched_y_n
0,07 - Los Angeles,1fd2f07342d966919b15d5d37fda8cc8,e9a188003e67026bf648e639cf4b3f9d,"University of California, Los Angeles",4dbba8014c5e63cfb381249002aa683f,U4,Bus,e24126d6-fbad-46b1-a498-75026e763636,0.0,0e466c39-2ade-49f8-a79d-b929dde2cfe5,3,57978,57720.0,1bbf8cf8a0db82e6e56b9a20fe9414a6,2024-05-22 16:06:18,2024-05-22 16:02:00,,,,not bunched


#### Aggregate.
* At this point, it doesn't matter the sequence, we just care about how bunched the traffic is around one partiuclar stop. 
* See how many trips for that grain are considered "bunched" or not.

In [105]:
def bunched_not_bunched(
    df: pd.DataFrame, bunched_y_n: str, groupby_cols: list
) -> pd.DataFrame:
    df2 = df.loc[df.bunched_y_n == bunched_y_n].reset_index(drop=True)

    bunched_y_n = bunched_y_n.replace(" ", "_")
    agg1 = (
        df2.groupby(groupby_cols).agg({"trip_instance_key": "nunique"}).reset_index()
    ).rename(columns={"trip_instance_key": f"{bunched_y_n}_trips"})
    return agg1

In [106]:
def agg_final_df(df: pd.DataFrame) -> pd.DataFrame:
    groupby_cols = [
        "caltrans_district",
        "schedule_gtfs_dataset_key",
        "feed_key",
        "organization_name",
        "route_long_name",
        "route_type",
        "route_id",
        "direction_id",
        "stop_id",
        "stop_sequence",
    ]

    # Find total trips that are bunched
    bunched = bunched_not_bunched(df, "bunched", groupby_cols)

    # Find total trips that are NOT bunched
    not_bunched = bunched_not_bunched(df, "not bunched", groupby_cols)

    # Merge
    m1 = pd.merge(not_bunched, bunched, on=groupby_cols, how="outer")

    # Find the % of bunched trips
    m1 = m1.fillna(0)
    m1["all_trips"] = m1.not_bunched_trips + m1.bunched_trips
    m1["per_trip_bunched_per_stop"] = m1.bunched_trips / m1.all_trips

    # Filter out any rows with only one trip of that groupby combo
    # for that service date
    m1 = m1.loc[m1.all_trips > 1].reset_index(drop=True)
    m1 = m1.drop(columns=["not_bunched_trips", "bunched_trips"])

    return m1

In [107]:
transit_matters_m1 = agg_final_df(transit_matters_df1)

In [108]:
# transit_matters_m1 = (
#   transit_matters_m1.sort_values(by=["all_trips"], ascending=False)
#   .drop_duplicates(subset=transit_matters_agg)
#   .reset_index(drop=True)
# )

### Help: Swapped order of a bus is messing with the transit matters metric.
* How to solve for this?? 

In [109]:
preview_cols = [
    "converted_rt_arrival",
    "actual_arrival_lag_min",
    "converted_schd_arrival",
    "scheduled_arrival_lag_min",
    "pct_actual_schd_headway",
    "bunched_y_n",
]

In [110]:
example2 = transit_matters_df1.loc[
    (transit_matters_df1.stop_id == "2307719")
    & (transit_matters_df1.organization_name == "City of Visalia")
    & (transit_matters_df1.route_id == "2042")
    & (transit_matters_df1.shape_array_key == "60da59c7000ea5dcb5f845d8fa227f14")
]

#### Starting row 33484: the RT Arrival time is swapped. A bus that was scheduled to arrive at 4:27 arrived boefre the bus arrived at 3:42.
* This repeats again row 33486.

In [111]:
example2[preview_cols]

Unnamed: 0,converted_rt_arrival,actual_arrival_lag_min,converted_schd_arrival,scheduled_arrival_lag_min,pct_actual_schd_headway,bunched_y_n
2075195,2024-05-22 12:38:08,,2024-05-22 12:42:00,,,not bunched
2075445,2024-05-22 14:58:22,140.23,2024-05-22 14:57:00,135.0,1.04,not bunched
2075471,2024-05-22 20:09:43,311.35,2024-05-22 18:42:00,225.0,1.38,not bunched
2076247,2024-05-22 16:26:53,-222.83,2024-05-22 16:27:00,-135.0,1.65,not bunched
2076274,2024-05-22 17:08:47,41.9,2024-05-22 15:42:00,-45.0,-0.93,bunched
2076406,2024-05-22 12:01:01,-307.77,2024-05-22 11:57:00,-225.0,1.37,not bunched
2077765,2024-05-22 17:57:23,356.37,2024-05-22 17:57:00,360.0,0.99,not bunched
2077943,2024-05-22 06:32:47,-684.6,2024-05-22 06:42:00,-675.0,1.01,not bunched
2079315,2024-05-22 09:37:59,185.2,2024-05-22 09:42:00,180.0,1.03,not bunched
2079699,2024-05-22 18:41:37,543.63,2024-05-22 17:12:00,450.0,1.21,not bunched


### Use 2 minute benchmark
* [Source](https://static1.squarespace.com/static/533b9a24e4b01d79d0ae4376/t/645e82de1f570b31497c44dc/1683915486889/TransitMatters-Headwaymanagement.pdf)
* Justifying the use of
headway maintenance. For example, in April
2022 the 66 bus significantly bunched around
several stops. When bunching is defined as
buses that run within two minutes or less of
each other, inbound buses towards Nubian
Square bunched 10% of the time at Brigham
Circle, 9% at Brookline Village and Roxbury
Crossing, and 8% of the time at Coolidge
Corner. Bunching is even more dramatic
outbound towards Harvard Square where
buses bunched over 35% of the time at Winship
St, 13% at Coolidge Corner and Harvard Ave at
Commonwealth Ave, and 12% at North Harvard
St at Western Ave. View more data about bus
bunching through the TransitMatters Data
Dashboard here.

* To Do: add back in route  & operator information

In [112]:
two_minutes_df = trips_routes_times5.copy()

In [113]:
two_minutes_df["bunched_y_n"] = np.where(
    two_minutes_df["actual_arrival_lag_min"] <= 2, "bunched", "not bunched"
)

In [114]:
two_minutes_df.bunched_y_n.value_counts() / len(two_minutes_df)

not bunched   0.65
bunched       0.35
Name: bunched_y_n, dtype: float64

In [115]:
final_two_minute = agg_final_df(two_minutes_df)

In [116]:
final_two_minute.loc[
    (final_two_minute.stop_id == "2307695")
    & (final_two_minute.organization_name == "City of Visalia")
    & (final_two_minute.route_id == "2042")
]

Unnamed: 0,caltrans_district,schedule_gtfs_dataset_key,feed_key,organization_name,route_long_name,route_type,route_id,direction_id,stop_id,stop_sequence,all_trips,per_trip_bunched_per_stop
30698,06 - Fresno,3bda4652977200408690059ef2ec4b4d,0e89d1fd3bd2a09bbbd0d4f79ea5663b,City of Visalia,Route 9,Bus,2042,1.0,2307695,16,18.0,0.39


### Comparing both outcomes
* There are so many more bunched trips for the 2 minute approach.
* Add back in schedule_gtfs_key and then grab stop level data from the warehouse.

In [117]:
final_two_minute.per_trip_bunched_per_stop.describe(percentiles)

count   141272.00
mean         0.31
std          0.20
min          0.00
1%           0.00
2%           0.00
5%           0.00
10%          0.00
50%          0.35
90%          0.52
95%          0.56
98%          0.62
99%          0.67
max          0.88
Name: per_trip_bunched_per_stop, dtype: float64

In [118]:
final_two_minute.sort_values(by=["per_trip_bunched_per_stop"], ascending=False).head()

Unnamed: 0,caltrans_district,schedule_gtfs_dataset_key,feed_key,organization_name,route_long_name,route_type,route_id,direction_id,stop_id,stop_sequence,all_trips,per_trip_bunched_per_stop
75560,07 - Los Angeles,f1b35a50955aeb498533c1c6fdafbe44,a7d4b3eb24de3d941c939ca98d681a1a,Long Beach Transit,ALAMITOS - ORANGE,Bus,71,1.0,2842,62,8.0,0.88
75561,07 - Los Angeles,f1b35a50955aeb498533c1c6fdafbe44,a7d4b3eb24de3d941c939ca98d681a1a,Long Beach Transit,ALAMITOS - ORANGE,Bus,71,1.0,2844,63,8.0,0.88
125252,11 - San Diego,baeeb157e85a901e47b828ef9fe75091,db8c6e0cf5ece2a8cdb5bdc71d049bd1,San Diego International Airport,Old Town - UTC via Pacific Beach,Bus,30,0.0,10399,56,5.0,0.8
94411,07 - Los Angeles,f74424acf8c41e4c1e9fd42838c4875c,96358f776e5fcd8d2b6066507aed6645,Foothill Transit,Azusa- Claremont- Montclair Transit Cent,Bus,188,1.0,1227,3827,5.0,0.8
80592,07 - Los Angeles,f74424acf8c41e4c1e9fd42838c4875c,96358f776e5fcd8d2b6066507aed6645,City of Duarte,Azusa- Claremont- Montclair Transit Cent,Bus,188,1.0,1227,3827,5.0,0.8


In [119]:
transit_matters_m1.per_trip_bunched_per_stop.describe(percentiles)

count   141272.00
mean         0.00
std          0.04
min          0.00
1%           0.00
2%           0.00
5%           0.00
10%          0.00
50%          0.00
90%          0.00
95%          0.01
98%          0.04
99%          0.12
max          0.75
Name: per_trip_bunched_per_stop, dtype: float64

In [120]:
transit_matters_m1.sort_values(by=["per_trip_bunched_per_stop"], ascending=False).head(
    10
)

Unnamed: 0,caltrans_district,schedule_gtfs_dataset_key,feed_key,organization_name,route_long_name,route_type,route_id,direction_id,stop_id,stop_sequence,all_trips,per_trip_bunched_per_stop
21114,04 - Oakland,c499f905e33929a641f083dad55c521e,df01659fb8ecc1234246138e97fe36e6,Alameda-Contra Costa Transit District,Skyline High - 35th Ave.,Bus,654,0.0,55509,14,4.0,0.75
21106,04 - Oakland,c499f905e33929a641f083dad55c521e,df01659fb8ecc1234246138e97fe36e6,Alameda-Contra Costa Transit District,Skyline High - 35th Ave.,Bus,654,0.0,54045,13,4.0,0.75
21103,04 - Oakland,c499f905e33929a641f083dad55c521e,df01659fb8ecc1234246138e97fe36e6,Alameda-Contra Costa Transit District,Skyline High - 35th Ave.,Bus,654,0.0,52532,16,4.0,0.75
21112,04 - Oakland,c499f905e33929a641f083dad55c521e,df01659fb8ecc1234246138e97fe36e6,Alameda-Contra Costa Transit District,Skyline High - 35th Ave.,Bus,654,0.0,55269,12,4.0,0.75
21111,04 - Oakland,c499f905e33929a641f083dad55c521e,df01659fb8ecc1234246138e97fe36e6,Alameda-Contra Costa Transit District,Skyline High - 35th Ave.,Bus,654,0.0,55227,15,4.0,0.75
24557,04 - Oakland,fb467982dcc77a7f9199bebe709bb700,3fccf089909fbdb3d725a5c15fb062cb,Santa Clara Valley Transportation Authority,Moffett Field - West Valley Coll,Bus,51,0.0,62137,16,3.0,0.67
24578,04 - Oakland,fb467982dcc77a7f9199bebe709bb700,3fccf089909fbdb3d725a5c15fb062cb,Santa Clara Valley Transportation Authority,Moffett Field - West Valley Coll,Bus,51,0.0,64877,15,3.0,0.67
24550,04 - Oakland,fb467982dcc77a7f9199bebe709bb700,3fccf089909fbdb3d725a5c15fb062cb,Santa Clara Valley Transportation Authority,Moffett Field - West Valley Coll,Bus,51,0.0,62079,19,3.0,0.67
24575,04 - Oakland,fb467982dcc77a7f9199bebe709bb700,3fccf089909fbdb3d725a5c15fb062cb,Santa Clara Valley Transportation Authority,Moffett Field - West Valley Coll,Bus,51,0.0,64875,14,3.0,0.67
24547,04 - Oakland,fb467982dcc77a7f9199bebe709bb700,3fccf089909fbdb3d725a5c15fb062cb,Santa Clara Valley Transportation Authority,Moffett Field - West Valley Coll,Bus,51,0.0,62078,13,3.0,0.67


### Make Visuals

In [136]:
freq_range = [
    "#ccbb44",
    "#e9d868",
    "#fcb40e",
    "#ff9c42",
    "#fc5c04",
    "#dd217d",
    "#dd217d",
    "#dd217d",
]

In [137]:
trips_routes_times5["hour"] = trips_routes_times5["converted_rt_arrival"].dt.hour
trips_routes_times5["min"] = trips_routes_times5["converted_rt_arrival"].dt.minute

In [138]:
def compare_approaches(
    stop_id: str, organization_name: str, route_id: str, stop_sequence: int
):
    transit_matter = transit_matters_m1.loc[
        (transit_matters_m1.stop_id == stop_id)
        & (transit_matters_m1.organization_name == organization_name)
        & (transit_matters_m1.route_id == route_id)
        & (transit_matters_m1.stop_sequence == stop_sequence)
    ]
    print("Transit Matters")
    display(transit_matter)

    two_min = final_two_minute.loc[
        (final_two_minute.stop_id == stop_id)
        & (final_two_minute.organization_name == organization_name)
        & (final_two_minute.route_id == route_id)
        & (final_two_minute.stop_sequence == stop_sequence)
    ]
    print("Two Minutes")
    display(two_min)
    total_trips = trips_routes_times5.loc[
        (trips_routes_times5.stop_id == stop_id)
        & (trips_routes_times5.organization_name == organization_name)
        & (trips_routes_times5.route_id == route_id)
        & (trips_routes_times5.stop_sequence == stop_sequence)
    ]

    og = trips_routes_times.loc[
        (trips_routes_times.stop_id == stop_id)
        & (trips_routes_times.organization_name == organization_name)
        & (trips_routes_times.route_id == route_id)
        & (trips_routes_times.stop_sequence == stop_sequence)
    ]

    display(total_trips.trip_instance_key.nunique())

    chart = (
        alt.Chart(total_trips)
        .mark_circle(size=500)
        .encode(
            x="hour",
            y="min",
            color=alt.Color(
                "hour",
                scale=alt.Scale(range=freq_range),
            ),
            tooltip=["hour", "min", "actual_arrival_lag_min"],
        )
        .properties(width=800, height=400)
    )
    display(chart)
    return og, total_trips

In [139]:
df_stc1, df_stc2 = compare_approaches(
    stop_id="62078",
    organization_name="Santa Clara Valley Transportation Authority",
    route_id="51",
    stop_sequence=13,
)

Transit Matters


Unnamed: 0,caltrans_district,schedule_gtfs_dataset_key,feed_key,organization_name,route_long_name,route_type,route_id,direction_id,stop_id,stop_sequence,all_trips,per_trip_bunched_per_stop
24547,04 - Oakland,fb467982dcc77a7f9199bebe709bb700,3fccf089909fbdb3d725a5c15fb062cb,Santa Clara Valley Transportation Authority,Moffett Field - West Valley Coll,Bus,51,0.0,62078,13,3.0,0.67


Two Minutes


Unnamed: 0,caltrans_district,schedule_gtfs_dataset_key,feed_key,organization_name,route_long_name,route_type,route_id,direction_id,stop_id,stop_sequence,all_trips,per_trip_bunched_per_stop
24547,04 - Oakland,fb467982dcc77a7f9199bebe709bb700,3fccf089909fbdb3d725a5c15fb062cb,Santa Clara Valley Transportation Authority,Moffett Field - West Valley Coll,Bus,51,0.0,62078,13,3.0,0.33


3

In [140]:
df_duarte1, df_duarte2 = compare_approaches(
    stop_id="2665",
    organization_name="City of Duarte",
    route_id="707",
    stop_sequence=3696,
)

Transit Matters


Unnamed: 0,caltrans_district,schedule_gtfs_dataset_key,feed_key,organization_name,route_long_name,route_type,route_id,direction_id,stop_id,stop_sequence,all_trips,per_trip_bunched_per_stop
85283,07 - Los Angeles,f74424acf8c41e4c1e9fd42838c4875c,96358f776e5fcd8d2b6066507aed6645,City of Duarte,Montclair-Pomona- El Monte- L.A.,Bus,707,0.0,2665,3696,2.0,0.5


Two Minutes


Unnamed: 0,caltrans_district,schedule_gtfs_dataset_key,feed_key,organization_name,route_long_name,route_type,route_id,direction_id,stop_id,stop_sequence,all_trips,per_trip_bunched_per_stop
85283,07 - Los Angeles,f74424acf8c41e4c1e9fd42838c4875c,96358f776e5fcd8d2b6066507aed6645,City of Duarte,Montclair-Pomona- El Monte- L.A.,Bus,707,0.0,2665,3696,2.0,0.5


2

In [141]:
df_vis1, df_vis2 = compare_approaches(
    stop_id="2307469",
    organization_name="City of Visalia",
    route_id="2042",
    stop_sequence=27,
)

Transit Matters


Unnamed: 0,caltrans_district,schedule_gtfs_dataset_key,feed_key,organization_name,route_long_name,route_type,route_id,direction_id,stop_id,stop_sequence,all_trips,per_trip_bunched_per_stop
30688,06 - Fresno,3bda4652977200408690059ef2ec4b4d,0e89d1fd3bd2a09bbbd0d4f79ea5663b,City of Visalia,Route 9,Bus,2042,1.0,2307469,27,16.0,0.06


Two Minutes


Unnamed: 0,caltrans_district,schedule_gtfs_dataset_key,feed_key,organization_name,route_long_name,route_type,route_id,direction_id,stop_id,stop_sequence,all_trips,per_trip_bunched_per_stop
30688,06 - Fresno,3bda4652977200408690059ef2ec4b4d,0e89d1fd3bd2a09bbbd0d4f79ea5663b,City of Visalia,Route 9,Bus,2042,1.0,2307469,27,16.0,0.44


16

#### Looks like some examples aren't calculating the lag times correctly. Also some rows are duplicated.

In [142]:
df_sd1, df_sd2 = compare_approaches(
    stop_id="88949",
    organization_name="San Diego Metropolitan Transit System",
    route_id="834",
    stop_sequence=19,
)

Transit Matters


Unnamed: 0,caltrans_district,schedule_gtfs_dataset_key,feed_key,organization_name,route_long_name,route_type,route_id,direction_id,stop_id,stop_sequence,all_trips,per_trip_bunched_per_stop
132934,11 - San Diego,baeeb157e85a901e47b828ef9fe75091,db8c6e0cf5ece2a8cdb5bdc71d049bd1,San Diego Metropolitan Transit System,West Santee Loop,Bus,834,0.0,88949,19,2.0,0.0


Two Minutes


Unnamed: 0,caltrans_district,schedule_gtfs_dataset_key,feed_key,organization_name,route_long_name,route_type,route_id,direction_id,stop_id,stop_sequence,all_trips,per_trip_bunched_per_stop
132934,11 - San Diego,baeeb157e85a901e47b828ef9fe75091,db8c6e0cf5ece2a8cdb5bdc71d049bd1,San Diego Metropolitan Transit System,West Santee Loop,Bus,834,0.0,88949,19,2.0,0.0


2

In [143]:
df_sd2

Unnamed: 0,caltrans_district,schedule_gtfs_dataset_key,feed_key,organization_name,shape_array_key,route_long_name,route_type,route_id,direction_id,stop_id,stop_sequence,rt_arrival_sec,scheduled_arrival_sec,trip_instance_key,converted_rt_arrival,converted_schd_arrival,actual_arrival_lag_min,scheduled_arrival_lag_min,hour,min
2416126,11 - San Diego,baeeb157e85a901e47b828ef9fe75091,db8c6e0cf5ece2a8cdb5bdc71d049bd1,San Diego Metropolitan Transit System,d3fdb491652ec5aa4313a64aa1c91080,West Santee Loop,Bus,834,0.0,88949,19,32654,32760.0,374d1596c1010a2b9e089b99eed5cda8,2024-05-22 09:04:14,2024-05-22 09:06:00,,,9,4
2875628,11 - San Diego,baeeb157e85a901e47b828ef9fe75091,db8c6e0cf5ece2a8cdb5bdc71d049bd1,San Diego Metropolitan Transit System,d3fdb491652ec5aa4313a64aa1c91080,West Santee Loop,Bus,834,0.0,88949,19,36275,36360.0,974a3571e3567999e8e359a207163d67,2024-05-22 10:04:35,2024-05-22 10:06:00,60.35,60.0,10,4
