## Transit Bunching 
* Setup: `cd data-analyses/rt_segment_speeds && pip install -r requirements.txt && cd ../_shared_utils && make setup_env && cd ../gtfs_digest`
* https://github.com/cal-itp/data-analyses/issues/1099

In [1]:
import geopandas as gpd
import pandas as pd
from segment_speed_utils import gtfs_schedule_wrangling, helpers
from shared_utils import catalog_utils, rt_dates, rt_utils
# NOTE(review): GTFS_DATA_DICT, RT_SCHED_GCS and SCHED_GCS are imported again
# from segment_speed_utils.project_vars at the end of this cell, so that later
# import decides what these three names refer to afterwards.
from update_vars import GTFS_DATA_DICT, RT_SCHED_GCS, SCHED_GCS

# https://github.com/cal-itp/data-analyses/blob/main/_shared_utils/shared_utils/gtfs_analytics_data.yml
# NOTE(review): this rebinding is itself replaced by the project_vars import below.
GTFS_DATA_DICT = catalog_utils.get_catalog("gtfs_analytics_data")

# Runs last in the cell, so these are the bindings the rest of the notebook sees.
# The recorded output of this cell is a ModuleNotFoundError for
# segment_speed_utils; presumably the setup command in the notebook header
# installs it — confirm before re-running.
from segment_speed_utils.project_vars import (
    COMPILED_CACHED_VIEWS,
    GTFS_DATA_DICT,
    PROJECT_CRS,
    RT_SCHED_GCS,
    SCHED_GCS,
    SEGMENT_GCS,
)

ModuleNotFoundError: No module named 'segment_speed_utils'

In [None]:
# Notebook display settings: show up to 100 columns, format floats with two
# decimals, and lift the row-count and column-width limits.
pd.set_option("display.max_columns", 100)
pd.set_option("display.float_format", "{:.2f}".format)
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [None]:
# Candidate analysis dates: the 2024 dates followed by the 2023 dates.
# NOTE(review): the cells visible below only use `may_date`; this list is
# displayed once and not referenced again.
analysis_date_list = rt_dates.y2024_dates + rt_dates.y2023_dates

In [None]:
analysis_date_list

In [None]:
# Single service date for this exploration; it selects the route-direction
# schedule data and is interpolated into the scheduled-trips file name below.
may_date = "2024-05-22"

### Find frequent routes
* Try to figure out where `frequency` is calculated
    * `gtfs_funnel/schedule_stats_by_route`

In [None]:
# Catalog entry for the scheduled route-direction metrics table. It is only
# displayed in the next cell; the data itself is loaded through merge_data.
ROUTE_DIR_EXPORT = GTFS_DATA_DICT.rt_vs_schedule_tables.sched_route_direction_metrics

In [None]:
ROUTE_DIR_EXPORT

#### Use `merge_data.concatenate_schedule_by_route_direction()`

In [None]:
import merge_data

In [None]:
# Columns kept from the route-direction schedule table: the operator key,
# route/direction identifiers, the service date, and `frequency`, which the
# cells below convert to minutes and filter on.
subset = [
    "schedule_gtfs_dataset_key",
    "route_id",
    "direction_id",
    "route_primary_direction",
    "service_date",
    "frequency",
]

In [None]:
# Load the route-direction schedule table for `may_date` (passed as a
# one-element list) and keep only the columns named in `subset`.
route_dir = merge_data.concatenate_schedule_by_route_direction([may_date])[subset]

In [None]:
route_dir.head(2)

In [None]:
# Convert `frequency` into minutes by dividing 60 by it.
# NOTE(review): this presumes `frequency` is trips per hour, making the result
# a headway in minutes — confirm against gtfs_funnel/schedule_stats_by_route.
route_dir["frequency_in_minutes"] = route_dir["frequency"].rdiv(60)

In [None]:
# "Frequent" route-directions: those whose frequency_in_minutes is 10 or less.
route_dir2 = route_dir.loc[route_dir["frequency_in_minutes"].le(10)]

In [None]:
route_dir2.frequency_in_minutes.describe()

In [None]:
route_dir.frequency_in_minutes.describe()

#### Crosswalk stuff
* Operators that run these high-frequency routes (identified by `schedule_gtfs_dataset_key`)
* Route IDs of the high-frequency routes (`frequency_in_minutes` of 10 or less)

In [None]:
# Distinct operator keys that have at least one frequent route-direction.
# NOTE(review): not referenced by any later cell visible in this notebook.
operators_with_high_frequency_routes = list(
    route_dir2["schedule_gtfs_dataset_key"].unique()
)

In [None]:
# Distinct route_id values among the frequent route-directions.
# NOTE(review): a route_id on its own does not identify an operator, so any
# filter built from this list also matches other operators' routes that share
# the same ID.
frequent_routes = list(route_dir2["route_id"].unique())

### Look at scheduled trips

In [None]:
# Catalog name of the scheduled trips download; used to build the file path below.
TABLE = GTFS_DATA_DICT.schedule_downloads.trips

In [None]:
# Parquet path for the scheduled trips of `may_date`:
# <COMPILED_CACHED_VIEWS><TABLE>_<may_date>.parquet
FILE = f"{COMPILED_CACHED_VIEWS}{TABLE}_{may_date}.parquet"

In [None]:
# Read every column of the scheduled trips file; the next cell keeps only the
# key, route, trip-instance, shape and feed columns.
trips = pd.read_parquet(FILE)

In [None]:
# Keep only trips that belong to a frequent route. The match uses the operator
# key together with route_id, because the same route_id value can appear in
# more than one operator's feed; filtering on route_id alone would also keep
# other operators' non-frequent routes that happen to share an ID.
# route_dir2 has one row per route-direction, so the key pairs are
# de-duplicated before the merge to avoid repeating trips. The key column is
# renamed to match `trips`, which calls it gtfs_dataset_key.
trips_freq_routes = trips.merge(
    route_dir2[["schedule_gtfs_dataset_key", "route_id"]]
    .drop_duplicates()
    .rename(columns={"schedule_gtfs_dataset_key": "gtfs_dataset_key"}),
    on=["gtfs_dataset_key", "route_id"],
    how="inner",
)[
    ["gtfs_dataset_key", "route_id", "trip_instance_key", "shape_array_key", "feed_key"]
].drop_duplicates()

In [None]:
# Align the operator-key column name with the real-time stop_times table so the
# two frames can be merged on schedule_gtfs_dataset_key.
trips_freq_routes = trips_freq_routes.rename(
    {"gtfs_dataset_key": "schedule_gtfs_dataset_key"}, axis="columns"
)

In [None]:
trips_freq_routes.head(2)

### `rt_scheduled_v_ran/scripts/rt_stop_times.py`
* Tiffany already combined realtime and scheduled arrivals

In [None]:
RT_SCHED_GCS

In [None]:
GTFS_DATA_DICT.rt_vs_schedule_tables.schedule_rt_stop_times

In [None]:
# Load the combined scheduled / real-time stop_times table for 2024-05-22.
# NOTE(review): the bucket path and the date are hard-coded here. It looks like
# the path could be built from RT_SCHED_GCS,
# GTFS_DATA_DICT.rt_vs_schedule_tables.schedule_rt_stop_times and may_date —
# confirm those values reproduce this exact path before switching.
rt_stop_times = pd.read_parquet(
    "gs://calitp-analytics-data/data-analyses/rt_vs_schedule/schedule_rt_stop_times_2024-05-22.parquet"
)

In [None]:
# Diagnostic only: outer-join the stop_times rows to the frequent-route trips
# and count how many rows fall in left_only / right_only / both.
rt_stop_times.merge(
    trips_freq_routes,
    how="outer",
    on=["schedule_gtfs_dataset_key", "trip_instance_key"],
    indicator=True,
)[["_merge"]].value_counts()

In [None]:
# Keep only the stop_times rows whose trip appears in trips_freq_routes
# (matched on operator key and trip instance).
rt_stop_times2 = rt_stop_times.merge(
    trips_freq_routes,
    how="inner",
    on=["schedule_gtfs_dataset_key", "trip_instance_key"],
)

In [None]:
rt_stop_times2.sample(5)

In [None]:
# Scratch arithmetic: a difference of 159 expressed in minutes (2.65).
# NOTE(review): the two numbers look like arrival times in seconds copied from
# the sample above — confirm.
(45999-45840)/60

In [None]:
# Narrow to a single example route: every row whose route_id is "14".
one_route = rt_stop_times2.loc[rt_stop_times2["route_id"].eq("14")]

In [None]:
# Restrict the example route to one operator, since the route_id filter above
# does not pin down which feed the rows come from.
one_route = one_route.loc[
    one_route["schedule_gtfs_dataset_key"].eq("7cc0cb1871dfd558f11a2885c145d144")
]

In [None]:
# Example stop: rows of the example route at stop_sequence 2.
# .copy() makes this an independent frame, so the column assignment in a later
# cell does not write to a slice of `one_route` (which raises
# SettingWithCopyWarning and may not stick).
# NOTE(review): stop_sequence == 2 is not restricted to one direction or stop,
# so headways computed from this frame may mix directions — confirm.
one_stop = one_route.loc[one_route.stop_sequence == 2].copy()

In [None]:
len(one_stop)

In [None]:
# Arrival delay in minutes: real-time arrival minus scheduled arrival (both in
# seconds), divided by 60.
one_stop["rt_minus_schd_mins"] = (
    one_stop["rt_arrival_sec"].sub(one_stop["scheduled_arrival_sec"]).div(60)
)

In [None]:
# Order arrivals by stop and then by scheduled arrival time, so the row-to-row
# differences taken in the following cells are between consecutive scheduled trips.
one_stop = one_stop.sort_values(["stop_sequence", "scheduled_arrival_sec"])

In [None]:
# Observed headway: gap between each real-time arrival and the one on the
# previous row (same unit as rt_arrival_sec; the first row is NaN).
one_stop["actual_headway"] = one_stop["rt_arrival_sec"].diff(periods=1)

In [None]:
# Scheduled headway: gap between each scheduled arrival and the one on the
# previous row (same unit as scheduled_arrival_sec; the first row is NaN).
one_stop["schd_headway"] = one_stop["scheduled_arrival_sec"].diff(periods=1)

In [None]:
# Scratch arithmetic: difference of 12 between two values.
# NOTE(review): presumably two arrival times in seconds read off `one_stop` — confirm.
17551-17539

In [None]:
# Headway deviation: observed headway minus scheduled headway. Missing values
# (such as the first row, which has no previous arrival) are filled with 0.
one_stop["actual_headway_minus_schd"] = (
    one_stop["actual_headway"].sub(one_stop["schd_headway"]).fillna(0)
)

In [None]:
# Mean scheduled headway for this stop, repeated on every row.
one_stop["mean_schd_headway"] = one_stop["schd_headway"].mean()

In [None]:
# Root-mean-square deviation of the observed headways from the mean *scheduled*
# headway, repeated on every row. Note this is measured around the scheduled
# mean, not around the mean of the observed headways.
one_stop["std_dev"] = (
    one_stop["actual_headway"].sub(one_stop["mean_schd_headway"]).pow(2).mean() ** 0.5
)

In [None]:
one_stop.drop(columns = ['schedule_gtfs_dataset_key','trip_instance_key','shape_array_key','feed_key'])

In [None]:
# Scratch arithmetic: sum of the six example values used in the numpy
# standard-deviation check further down (60, i.e. a mean of 10).
12+8+14+6+7+13

In [None]:
# Scalar copy of the mean scheduled headway, for display in the next cell.
mean_schd_headway = one_stop["schd_headway"].mean()

In [None]:
mean_schd_headway

In [None]:
one_stop.actual_headway.std()

In [None]:
import numpy as np

# Hand-checkable example: six headway-like values.
values = [12, 8, 14, 6, 7, 13]
# Derive the mean from the data instead of hard-coding it, so it cannot drift
# out of sync if `values` is edited.
mean = sum(values) / len(values)

# Calculate standard deviation
# Subtracting a constant from every value does not change the spread, so the
# values are passed directly. np.std defaults to the population form (ddof=0),
# whereas pandas' Series.std() defaults to the sample form (ddof=1), so the two
# are not directly comparable without matching ddof.
standard_deviation = np.std(values)

print(standard_deviation)

In [None]:
# Ratio of the example standard deviation to its mean (coefficient of variation).
standard_deviation/mean
