In [None]:
# Disabled setup step: the shell commands below are wrapped in a string literal,
# so this cell only evaluates (and displays) a string and installs nothing.
# To actually install the requirements, restore the content as a `%%sh` cell.
"""%%sh
cd ~/data-analyses/rt_segment_speeds
pip install -r requirements.txt"""

In [None]:
from shared_utils import catalog_utils, rt_dates, gtfs_utils_v2
import geopandas as gpd
import pandas as pd
import numpy as np
import google.auth

In [None]:
from retrospective_feed_generation import *
from warehouse_utils import *
from gtfs_utils import *

In [None]:
# Application-default Google credentials; `credentials.token` is later passed as
# the storage token when reading parquet files.
credentials, _ = google.auth.default()

In [None]:
# Analysis date, looked up by key in rt_dates.DATES; used for the feed lookup and
# for every date-suffixed parquet path below.
TARGET_DATE = rt_dates.DATES["apr2025"]
# Name of the schedule feed whose gtfs_dataset_key / feed_key are looked up below.
EXAMPLE_FEED_SCHEDULE_NAME = "LA Metro Bus Schedule"

In [None]:
# Look up the example feed's identifiers for TARGET_DATE by its schedule name.
feed_name_lookup = gtfs_utils_v2.schedule_daily_feed_to_gtfs_dataset_name(
    selected_date=TARGET_DATE,
    keep_cols=["name", "gtfs_dataset_key", "feed_key"],
)
feed_lookup_response = feed_name_lookup.set_index("name").loc[EXAMPLE_FEED_SCHEDULE_NAME]
gtfs_dataset_key = feed_lookup_response["gtfs_dataset_key"]
feed_key = feed_lookup_response["feed_key"]

In [None]:
# Fetch the schedule-vs-RT stop times for the example feed on TARGET_DATE.
rt_vs_schedule_stop_times_table = get_schedule_rt_stop_times_table(gtfs_dataset_key, TARGET_DATE)
# Second name bound to the very same DataFrame object (an alias, not a copy).
schedule_rt_stop_times_single_agency = rt_vs_schedule_stop_times_table

In [None]:
# Order stop times within each trip by stop_sequence, then flag every row whose
# arrival is earlier than the arrival at the previous stop of the same trip.
rt_vs_schedule_stop_times_table_sorted = rt_vs_schedule_stop_times_table.sort_values(
    by=["schedule_gtfs_dataset_key", "trip_instance_key", "stop_sequence"],
    kind="stable",
)
grouped_by_trip = rt_vs_schedule_stop_times_table_sorted.groupby(
    ["schedule_gtfs_dataset_key", "trip_instance_key"]
)
# Previous stop's arrivals within the trip; the first stop of each trip gets NaN,
# and any comparison against NaN evaluates to False.
shifted_grouped = grouped_by_trip[["scheduled_arrival_sec", "rt_arrival_sec"]].shift(1)
for flag_col, arrival_col in (
    ("non_sequential_rt_arrival", "rt_arrival_sec"),
    ("non_sequential_scheduled_arrival", "scheduled_arrival_sec"),
):
    rt_vs_schedule_stop_times_table_sorted[flag_col] = (
        shifted_grouped[arrival_col] > rt_vs_schedule_stop_times_table_sorted[arrival_col]
    )

## Exploring non-sequential stops

In [None]:
# Are there any non-sequential scheduled stop times? (True if at least one row is flagged.)
rt_vs_schedule_stop_times_table_sorted.non_sequential_scheduled_arrival.any()

In [None]:
# Looks like there are non-sequential RT stop times: keep only the flagged rows
# and count how many each trip_id has.
is_non_sequential_rt = rt_vs_schedule_stop_times_table_sorted["non_sequential_rt_arrival"]
non_sequential_rt_subset = rt_vs_schedule_stop_times_table_sorted.loc[is_non_sequential_rt].copy()
non_sequential_rt_subset["trip_id"].value_counts()

In [None]:
# Attach the number of non-sequential RT arrivals to each stop, so a map can show
# whether they are scattered randomly or follow a pattern.
gtfs_data_dict = catalog_utils.get_catalog("gtfs_analytics_data")
# Shared reader options: authenticate with the credential token and only read
# rows belonging to the example feed.
read_parquet_kwargs = dict(
    storage_options={"token": credentials.token},
    filters=[("feed_key", "=", feed_key)],
)
stops_uri = (
    f"{gtfs_data_dict.schedule_downloads.dir}{gtfs_data_dict.schedule_downloads.stops}_{TARGET_DATE}.parquet"
)
stops_response = gpd.read_parquet(stops_uri, **read_parquet_kwargs)
nonsequential_counts_by_stop = (
    non_sequential_rt_subset["stop_id"].value_counts().rename("nonsequential_counts")
)
stops_merged = stops_response.merge(
    nonsequential_counts_by_stop,
    how="left",
    left_on="stop_id",
    right_index=True,
    validate="one_to_one",
)
# Stops that never appear in the non-sequential subset get a count of 0.
stops_merged["nonsequential_counts"] = stops_merged["nonsequential_counts"].fillna(0)

### Map non-sequential stops

In [None]:
# Interactive map of the stops, colored by their count of non-sequential RT arrivals.
stops_merged[["stop_id", "stop_name", "nonsequential_counts", "geometry"]].explore(column="nonsequential_counts")

### Do any routes have a large number of non-sequential stops?

In [None]:
# Per-route share of stop-time rows that have a non-sequential RT arrival.
trips_uri = (
    f"{gtfs_data_dict.schedule_downloads.dir}{gtfs_data_dict.schedule_downloads.trips}_{TARGET_DATE}.parquet"
)
trips_response = pd.read_parquet(
    trips_uri,
    columns=["trip_id", "route_id", "shape_id"],
    **read_parquet_kwargs
)
# One row per trip that has at least one non-sequential RT arrival, carrying the
# number of such arrivals in `nonsequential_counts`.
trips_with_nonsequential_stops = trips_response.merge(
    non_sequential_rt_subset.trip_id.value_counts().rename("nonsequential_counts"),
    left_on="trip_id",
    right_index=True,
    how="inner",
    validate="one_to_one"
)
stop_times_with_route = rt_vs_schedule_stop_times_table_sorted.merge(
    trips_response,
    on="trip_id",
    how="left",
    validate="many_to_one"
)
# Denominator: number of stop-time rows per route.
route_total_stop_times = stop_times_with_route.route_id.value_counts()
# Numerator: number of non-sequential stop-time rows per route. The per-trip counts
# must be summed; counting rows of the per-trip frame would give the number of
# affected trips, which is not comparable with a stop-time total.
route_total_nonsequential_stops = (
    trips_with_nonsequential_stops.groupby("route_id")["nonsequential_counts"].sum()
)
# Routes without any non-sequential stop time have no numerator and come out as NaN.
non_sequential_stop_proportion = (
    route_total_nonsequential_stops / route_total_stop_times
).sort_values(ascending=False)

In [None]:
# Display the per-route ratio computed in the previous cell (sorted in descending order).
non_sequential_stop_proportion

In [None]:
# Map the stops of one example trip on route "720" that has non-sequential RT
# arrivals. This cell has to run as code: the next cell displays
# `gdf_one_trip_stops`, which is only defined here.
# `.iloc[0]` raises IndexError if route "720" has no such trip on TARGET_DATE.
example_17_trip_id = trips_with_nonsequential_stops.loc[
    (trips_with_nonsequential_stops.route_id == "720"),
    "trip_id"
].iloc[0]
example_trip = rt_vs_schedule_stop_times_table_sorted.loc[
    rt_vs_schedule_stop_times_table_sorted.trip_id == example_17_trip_id
]
# Name the geometry column explicitly so the GeoDataFrame is valid whatever the
# stops geometry column is called.
gdf_one_trip_stops = gpd.GeoDataFrame(
    example_trip.merge(
        stops_response[["stop_id", stops_response.geometry.name]],
        how="left",
        on="stop_id"
    ),
    geometry=stops_response.geometry.name,
)
gdf_one_trip_stops.explore(column="non_sequential_rt_arrival")

In [None]:
# Display the single-trip stops GeoDataFrame. The name is only assigned by the
# example-trip code in the preceding cell, so that code must have been executed.
gdf_one_trip_stops

### Exploring skipped stops

In [None]:
from segment_speed_utils import helpers, segment_calcs

# GCS path prefixes read from the catalog object `GTFS_DATA_DICT`.
# NOTE(review): `GTFS_DATA_DICT` is not defined in any visible cell (only the
# lowercase `gtfs_data_dict` is) — presumably it arrives through one of the
# wildcard imports; confirm.
SEGMENT_GCS = GTFS_DATA_DICT.gcs_paths.SEGMENT_GCS
# Not referenced by the code visible in this section.
RT_SCHED_GCS = GTFS_DATA_DICT.gcs_paths.RT_SCHED_GCS

# Unchanged from rt_scheduled_v_ran, but isn't in a package so we have to copy paste for now
def prep_scheduled_stop_times(
    analysis_date: str
) -> pd.DataFrame: 
    """
    Import scheduled stop times and merge in 
    gtfs_dataset_key and trip_instance_key.

    Args:
        analysis_date: date string forwarded to both `helpers` import calls.

    Returns:
        One row per scheduled stop time that has a matching trip
        (inner merge on feed_key + trip_id), with `feed_key` dropped and
        `arrival_sec` renamed to `scheduled_arrival_sec`.
    """
    # NOTE(review): later cells filter on `schedule_gtfs_dataset_key`, so the
    # helper presumably renames `gtfs_dataset_key` on import — confirm.
    trips = helpers.import_scheduled_trips(
        analysis_date,
        columns = ["feed_key", "gtfs_dataset_key",
                   "trip_id", "trip_instance_key"],
        get_pandas = True
    )

    # feed_key is only needed as a join key and is dropped right after the merge.
    stop_times = helpers.import_scheduled_stop_times(
        analysis_date,
        columns = ["feed_key", "trip_id", 
                   "stop_id", "stop_sequence",
                   "arrival_sec",
                  ],
        get_pandas = True,
        with_direction = False
    ).merge(
        trips,
        on = ["feed_key", "trip_id"],
        how = "inner"
    ).drop(
        columns = ["feed_key"]
    ).rename(
        columns = {"arrival_sec": "scheduled_arrival_sec"}
    )
    
    return stop_times

# Unchanged from rt_scheduled_v_ran, but isn't in a package so we have to copy paste for now
def prep_rt_stop_times(
    analysis_date: str,
    trip_stop_cols: list
) -> pd.DataFrame: 
    """
    For RT stop arrivals, drop duplicates based on interpolated
    arrival times. Keep the first arrival time,
    the rest would violate a monotonically increasing condition.

    Args:
        analysis_date: date string used as the suffix of the parquet file name.
        trip_stop_cols: columns to read and to sort by. Must include
            `trip_instance_key`, which is part of the de-duplication subset.

    Returns:
        The de-duplicated stop arrivals after
        `segment_calcs.convert_timestamp_to_seconds`, with the `rt_arrival`
        timestamp column dropped.
    """
    # Reads the module-level GTFS_DATA_DICT and SEGMENT_GCS.
    STOP_ARRIVALS = GTFS_DATA_DICT.rt_stop_times.stage3
    
    df = pd.read_parquet(
        f"{SEGMENT_GCS}{STOP_ARRIVALS}_{analysis_date}.parquet",
        columns = trip_stop_cols + ["arrival_time"]
    ).rename(columns = {"arrival_time": "rt_arrival"})

    # Sort first so that drop_duplicates (default keep="first") retains the
    # earliest row in trip_stop_cols order for each repeated arrival time.
    df2 = df.sort_values(
        trip_stop_cols
    ).drop_duplicates(
        subset=["trip_instance_key", "rt_arrival"]
    ).reset_index(drop=True)
    
    # NOTE(review): later cells read `rt_arrival_sec`, so the helper presumably
    # adds that seconds column — confirm.
    df2 = segment_calcs.convert_timestamp_to_seconds(
        df2, ["rt_arrival"]
    ).drop(columns = "rt_arrival")
    
    return df2

def assemble_scheduled_rt_stop_times_outer_merge(
    analysis_date: str,
    trip_stop_cols: list
) -> pd.DataFrame:
    """
    Combine scheduled and RT stop times for one date with an outer merge.

    Because the merge is outer, stop times present on only one side are kept,
    with the other side's columns left empty.

    Args:
        analysis_date: date string passed to both prep functions.
        trip_stop_cols: join keys; also the columns read for the RT side.

    Returns:
        The merged scheduled + RT stop times.
    """
    scheduled = prep_scheduled_stop_times(analysis_date)
    realtime = prep_rt_stop_times(analysis_date, trip_stop_cols)
    return scheduled.merge(realtime, how="outer", on=trip_stop_cols)

def shortcut_assemble_scheduled_rt_stop_times_outer_merge(analysis_date: str) -> pd.DataFrame:
    """
    Run the outer merge using the trip/stop key columns listed in the
    module-level catalog (`gtfs_data_dict.rt_stop_times.trip_stop_cols`).
    """
    # Copy the catalog entry into a plain list before handing it on.
    trip_stop_cols = list(gtfs_data_dict.rt_stop_times.trip_stop_cols)
    return assemble_scheduled_rt_stop_times_outer_merge(analysis_date, trip_stop_cols)

In [None]:
# Scheduled + RT stop times for TARGET_DATE, outer-merged so that rows present on
# only one side are kept. The frame is restricted to the example feed in the next cell.
outer_merged_stop_times = shortcut_assemble_scheduled_rt_stop_times_outer_merge(TARGET_DATE)

In [None]:
# Restrict the outer-merged stop times to the example feed.
outer_merged_stop_times_filtered = outer_merged_stop_times.loc[
    outer_merged_stop_times.schedule_gtfs_dataset_key == gtfs_dataset_key
].copy()
# A stop counts as skipped in RT when it has a scheduled arrival but no RT arrival.
# Both operands come from the filtered frame, so the mask is built row-for-row
# instead of relying on index alignment against the larger unfiltered frame.
outer_merged_stop_times_filtered["rt_skipped"] = (
    outer_merged_stop_times_filtered.rt_arrival_sec.isna()
    & outer_merged_stop_times_filtered.scheduled_arrival_sec.notna()
)
outer_merged_stop_times_no_rt_time = outer_merged_stop_times_filtered.loc[
    outer_merged_stop_times_filtered.rt_skipped
]
n_skipped_stops_by_trip = outer_merged_stop_times_no_rt_time.trip_instance_key.value_counts()
n_stop_times_by_trip = outer_merged_stop_times_filtered.trip_instance_key.value_counts()
# Keep trips where only some stops were skipped; trips whose skipped count equals
# their total stop-time count (no RT arrival at any stop) are left out.
rt_trips_with_skipped_stops = n_skipped_stops_by_trip.loc[
    n_skipped_stops_by_trip != n_stop_times_by_trip.loc[n_skipped_stops_by_trip.index]
]
outer_merged_stop_times_no_rt_time

In [None]:
# Map one example trip that has some (but not all) stops skipped in RT.
# Position of the example within rt_trips_with_skipped_stops; clamped to the last
# available position so the cell still runs when fewer than 501 trips qualify.
# It still raises IndexError when no trip qualifies at all.
EXAMPLE_SKIPPED_TRIP_POSITION = 500
example_trip_position = min(EXAMPLE_SKIPPED_TRIP_POSITION, len(rt_trips_with_skipped_stops) - 1)
example_trip = outer_merged_stop_times_filtered.loc[
    outer_merged_stop_times_filtered.trip_instance_key
    == rt_trips_with_skipped_stops.index[example_trip_position]
]
gpd.GeoDataFrame(
    example_trip.merge(
        stops_response,
        how="left",
        on="stop_id"
    )[["geometry", "stop_id", "rt_arrival_sec", "rt_skipped"]]
).explore(column="rt_skipped")

##### stops_response