In [None]:
%%sh
# Install the rt_segment_speeds requirements. The cd comes first because
# requirements.txt is given as a path relative to the working directory.
cd ~/data-analyses/rt_segment_speeds
pip install -r requirements.txt

Obtaining file:///home/jovyan/data-analyses/rt_segment_speeds (from -r requirements.txt (line 1))
  Preparing metadata (setup.py): started


In [None]:
from shared_utils import catalog_utils, rt_dates, gtfs_utils_v2
import geopandas as gpd
import pandas as pd
import numpy as np
import google.auth

In [None]:
from retrospective_feed_generation import *
from warehouse_utils import *
from gtfs_utils import *

In [None]:
# Obtain Google credentials via google.auth.default(); the second item of the
# returned pair is not used in this notebook.
credentials, _ = google.auth.default()

In [None]:
# Analysis date, looked up from the shared rt_dates.DATES mapping under the "feb2025" key.
TARGET_DATE = rt_dates.DATES["feb2025"]
# Schedule feed name used below to select a single example feed.
EXAMPLE_FEED_SCHEDULE_NAME = "Big Blue Bus Schedule"

In [None]:
# Look up the example feed's row for TARGET_DATE, indexed by schedule feed name.
feed_lookup_response = gtfs_utils_v2.schedule_daily_feed_to_gtfs_dataset_name(
    selected_date=TARGET_DATE,
    keep_cols=["name", "gtfs_dataset_key", "feed_key"],
).set_index("name").loc[EXAMPLE_FEED_SCHEDULE_NAME]

# Pull out the two identifiers that later cells use to filter data to this feed.
gtfs_dataset_key, feed_key = (
    feed_lookup_response[key_name] for key_name in ("gtfs_dataset_key", "feed_key")
)

In [None]:
# Fetch the table returned by get_schedule_rt_stop_times_table for the selected
# dataset key and date.
rt_vs_schedule_stop_times_table = get_schedule_rt_stop_times_table(gtfs_dataset_key, TARGET_DATE)
# Second name bound to the very same object (an alias, not a copy).
schedule_rt_stop_times_single_agency = rt_vs_schedule_stop_times_table

In [None]:
# Order stop times by feed, trip, and stop sequence so that the previous row
# within a trip is the previous stop.
rt_vs_schedule_stop_times_table_sorted = rt_vs_schedule_stop_times_table.sort_values(
    by=["schedule_gtfs_dataset_key", "trip_instance_key", "stop_sequence"],
    kind="stable",
)
grouped_by_trip = rt_vs_schedule_stop_times_table_sorted.groupby(
    by=["schedule_gtfs_dataset_key", "trip_instance_key"]
)
# Previous stop's arrival times within each trip (NaN for the first stop of a trip).
shifted_grouped = grouped_by_trip[["scheduled_arrival_sec", "rt_arrival_sec"]].shift(periods=1)

# Flag a stop time when it arrives strictly earlier than the previous stop did.
# Comparisons against NaN evaluate to False, so first stops are never flagged.
rt_vs_schedule_stop_times_table_sorted["non_sequential_rt_arrival"] = (
    rt_vs_schedule_stop_times_table_sorted["rt_arrival_sec"] < shifted_grouped["rt_arrival_sec"]
)
rt_vs_schedule_stop_times_table_sorted["non_sequential_scheduled_arrival"] = (
    rt_vs_schedule_stop_times_table_sorted["scheduled_arrival_sec"]
    < shifted_grouped["scheduled_arrival_sec"]
)

## Exploring non-sequential stops

In [None]:
# Is any scheduled arrival earlier than the previous stop's scheduled arrival
# within the same trip?
rt_vs_schedule_stop_times_table_sorted.non_sequential_scheduled_arrival.any()

In [None]:
# The RT stop times do appear to contain non-sequential arrivals: keep only the
# flagged rows (as an independent copy), then count flagged rows per trip_id.
non_sequential_rt_subset = rt_vs_schedule_stop_times_table_sorted[
    rt_vs_schedule_stop_times_table_sorted["non_sequential_rt_arrival"]
].copy()
non_sequential_rt_subset["trip_id"].value_counts()

In [None]:
# Attach per-stop counts of non-sequential RT arrivals to the stop geometries,
# so a map can show whether affected stops are random or follow a pattern.
gtfs_data_dict = catalog_utils.get_catalog("gtfs_analytics_data")

# Shared reader options: authenticate and restrict rows to the example feed.
# NOTE(review): credentials.token is read without an explicit refresh — confirm
# it holds the value the storage backend expects at this point.
read_parquet_kwargs = dict(
    storage_options={"token": credentials.token},
    filters=[("feed_key", "=", feed_key)],
)

stops_uri = f"{gtfs_data_dict.schedule_downloads.dir}{gtfs_data_dict.schedule_downloads.stops}_{TARGET_DATE}.parquet"
stops_response = gpd.read_parquet(stops_uri, **read_parquet_kwargs)

# Left merge keeps every stop; validate guards against duplicate stop_id rows.
stops_merged = stops_response.merge(
    non_sequential_rt_subset["stop_id"].value_counts().rename("nonsequential_counts"),
    how="left",
    left_on="stop_id",
    right_index=True,
    validate="one_to_one",
)
# Stops that never appear in the flagged subset get a count of zero.
stops_merged["nonsequential_counts"] = stops_merged["nonsequential_counts"].fillna(0)

### Map non-sequential stops

In [None]:
# Map the stops, colored by the number of non-sequential RT arrivals at each stop.
stops_merged[["stop_id", "stop_name", "nonsequential_counts", "geometry"]].explore(column="nonsequential_counts")

### Do any routes have a large number of non-sequential stops?

In [None]:
# Read the trip -> route/shape lookup for the example feed from the trips download.
trips_uri = (
    f"{gtfs_data_dict.schedule_downloads.dir}{gtfs_data_dict.schedule_downloads.trips}_{TARGET_DATE}.parquet"
)
trips_response = pd.read_parquet(
    trips_uri,
    columns=["trip_id", "route_id", "shape_id"],
    **read_parquet_kwargs
)

# One row per trip that has at least one non-sequential RT arrival, carrying
# that trip's count of flagged stop times in `nonsequential_counts`.
trips_with_nonsequential_stops = trips_response.merge(
    non_sequential_rt_subset.trip_id.value_counts().rename("nonsequential_counts"),
    left_on="trip_id",
    right_index=True,
    how="inner",
    validate="one_to_one"
)

# Attach route_id to every stop time; the per-route row count is the denominator.
stop_times_with_route = rt_vs_schedule_stop_times_table_sorted.merge(
    trips_response,
    on="trip_id",
    how="left",
    validate="many_to_one"
)
route_total_stop_times = stop_times_with_route.route_id.value_counts()

# Sum the per-trip counts within each route. Counting rows instead
# (route_id.value_counts()) would give the number of affected *trips*, which is
# not comparable to a stop-time total.
route_total_nonsequential_stops = (
    trips_with_nonsequential_stops.groupby("route_id")["nonsequential_counts"].sum()
)

# Share of each route's stop times that are non-sequential. Routes with no
# flagged stop times are absent from the numerator and therefore come out as NaN.
non_sequential_stop_proportion = (
    route_total_nonsequential_stops / route_total_stop_times
).sort_values(ascending=False)

In [None]:
# Display the per-route proportions, sorted in descending order.
non_sequential_stop_proportion

### Exploring skipped stops

In [None]:
from segment_speed_utils import helpers, segment_calcs
from update_vars import GTFS_DATA_DICT, SEGMENT_GCS, RT_SCHED_GCS

def assemble_scheduled_rt_stop_times_outer_merge(
    analysis_date: str,
    trip_stop_cols: list
) -> pd.DataFrame:
    """
    Outer-merge scheduled and RT stop times so we can compare
    scheduled arrival (seconds) and RT arrival (seconds), while also
    keeping stop times that appear on only one side.

    Parameters
    ----------
    analysis_date : str
        Date passed through to prep_scheduled_stop_times and prep_rt_stop_times.
    trip_stop_cols : list
        Column names used as the merge keys; also passed to prep_rt_stop_times.

    Returns
    -------
    pd.DataFrame
        One row per key combination found in either input. Columns coming
        from the side that has no match are filled with missing values.
    """
    sched_stop_times = prep_scheduled_stop_times(analysis_date)
    rt_stop_times = prep_rt_stop_times(analysis_date, trip_stop_cols)

    # An inner merge would silently drop exactly the unmatched rows (e.g. a
    # scheduled stop with no RT arrival) that this function exists to expose.
    df = pd.merge(
        sched_stop_times,
        rt_stop_times,
        on=trip_stop_cols,
        how="outer"
    )

    return df

def shortcut_assemble_scheduled_rt_stop_times_outer_merge(analysis_date: str) -> pd.DataFrame:
    """
    Call assemble_scheduled_rt_stop_times_outer_merge for `analysis_date`,
    using the key columns listed under rt_stop_times.trip_stop_cols in the
    notebook-level gtfs_data_dict catalog.
    """
    # Materialize the catalog entry as a plain list before passing it on.
    merge_keys = list(gtfs_data_dict.rt_stop_times.trip_stop_cols)
    return assemble_scheduled_rt_stop_times_outer_merge(analysis_date, merge_keys)

In [None]:
# Build the merged scheduled/RT stop-times table for the target date.
outer_merged_stop_times = shortcut_assemble_scheduled_rt_stop_times_outer_merge(TARGET_DATE)