In [None]:
import datetime as dt
import pathlib

import geopandas as gpd
import google.auth
import numpy as np
import pandas as pd
from gtfs_utils import *
from gtfslite import GTFS
from retrospective_feed_generation import *
from retrospective_feed_generation import _filter_na_stop_times, _filter_non_rt_trips
from shared_utils import catalog_utils, gtfs_utils_v2, rt_dates
from warehouse_utils import *
import columns as col

### Get RT Data

In [None]:
# google.auth.default() is unpacked as a two-element tuple; only the first element is kept.
# NOTE(review): CREDENTIALS is not referenced in the visible cells — confirm it is still needed.
CREDENTIALS, _ = google.auth.default()

In [None]:
# Analysis date, looked up from the shared rt_dates.DATES mapping under the "apr2025" key.
SAMPLE_DATE_STR = rt_dates.DATES["apr2025"]
# Feed name used to look up this agency's gtfs_dataset_key.
FEED_NAME = "Big Blue Bus Schedule"

In [None]:
# Echo the resolved analysis date as a visual check.
SAMPLE_DATE_STR

In [None]:
# Build a name -> gtfs_dataset_key lookup for the sample date, then read the
# entry for the feed under study.
feed_key_by_name = gtfs_utils_v2.schedule_daily_feed_to_gtfs_dataset_name(
    selected_date=SAMPLE_DATE_STR, keep_cols=["name", "gtfs_dataset_key"]
).set_index("name")
gtfs_dataset_key = feed_key_by_name.at[FEED_NAME, "gtfs_dataset_key"]
gtfs_dataset_key

In [None]:
# Pull the schedule/RT stop_times table for this feed and date, pass it through
# _filter_non_rt_trips, and renumber the index.
unfiltered_stop_times = get_schedule_rt_stop_times_table(gtfs_dataset_key, SAMPLE_DATE_STR)
rt_trip_stop_times = _filter_non_rt_trips(unfiltered_stop_times, col.DEFAULT_COLUMN_MAP)
schedule_rt_stop_times_single_agency = rt_trip_stop_times.reset_index(drop=True)

In [None]:
# Save the filtered table to a local parquet file (read back by the following cell).
# NOTE(review): presumably a cache to avoid re-running the query above — confirm.
schedule_rt_stop_times_single_agency.to_parquet("test.parquet")

In [None]:
# Reload the table from the local parquet file and re-apply the same filter,
# so the notebook can be resumed from the saved file.
stop_times_from_parquet = pd.read_parquet("test.parquet")
schedule_rt_stop_times_single_agency = _filter_non_rt_trips(
    stop_times_from_parquet, columns=col.DEFAULT_COLUMN_MAP
).reset_index(drop=True)

In [None]:
# Preview the output of impute_unrealistic_rt_times before storing it on the table.
imputed_preview = impute_unrealistic_rt_times(
    schedule_rt_stop_times_single_agency,
    max_gap_length=5,
    columns=col.DEFAULT_COLUMN_MAP,
)
imputed_preview

In [None]:
# Store the imputed values in a "gap_imputed_sec" column; the merge step later
# maps col.RT_ARRIVAL_SEC onto this column.
gap_imputed_values = impute_unrealistic_rt_times(
    schedule_rt_stop_times_single_agency,
    max_gap_length=5,
    columns=col.DEFAULT_COLUMN_MAP,
)
schedule_rt_stop_times_single_agency["gap_imputed_sec"] = gap_imputed_values

In [None]:
# Display the table, which now includes the "gap_imputed_sec" column.
schedule_rt_stop_times_single_agency

### Get schedule feed

In [None]:
# TODO: the feed is currently a manual download using the URL recorded in Airtable;
# its provenance needs to be made traceable instead.
# Directory holding the downloaded feeds for the sample date.
GTFS_FEED_PARENT = f"../conveyal_update/feeds_{SAMPLE_DATE_STR}/socal/"
# Glob (relative to GTFS_FEED_PARENT) expected to match exactly one feed zip.
GTFS_FEED_GLOB = "Big_Blue_Bus_Schedule_*.zip/*.zip"

# Alternative local feed location, kept for reference:
# GTFS_FEED_PARENT = "./feeds/"
# GTFS_FEED_GLOB = "big_blue_bus_2025-03*.zip"

# NOTE(review): the two constants below are not referenced in the visible cells —
# confirm they are still needed.
ARBITRARY_SERVICE_ID = "0"
# strftime pattern that formats a date as YYYYMMDD.
GTFS_DATE_STRFTIME_CODE = "%Y%m%d"

In [None]:
# Resolve exactly one schedule feed zip under GTFS_FEED_PARENT.
# Sorting into a list makes the match order deterministic and lets both failure
# modes (no match / several matches) be reported explicitly, instead of a bare
# StopIteration when the glob matches nothing.
feed_paths = sorted(pathlib.Path(GTFS_FEED_PARENT).glob(GTFS_FEED_GLOB))
if not feed_paths:
    raise FileNotFoundError(
        f"No schedule feed matching {GTFS_FEED_GLOB!r} found under {GTFS_FEED_PARENT!r}"
    )
assert len(feed_paths) == 1, "Ambiguous Schedule Feed"
feed_path = feed_paths[0]

# Load the feed and restrict it to the single analysis date.
feed = GTFS.load_zip(feed_path)
feed_filtered = subset_schedule_feed_to_one_date(
    feed, dt.date.fromisoformat(SAMPLE_DATE_STR)
)

### Merge schedule / RT

In [None]:
# Build the retrospective feed from the date-filtered schedule feed and the
# schedule/RT stop_times table.
output_feed = make_retrospective_feed_single_date(
    filtered_input_feed=feed_filtered,
    stop_times_table=schedule_rt_stop_times_single_agency,
    # Each column name must be its own list element: adjacent string literals
    # without a comma are silently concatenated into a single name.
    stop_times_desired_columns=[
        "trip_id",
        "arrival_time",
        "departure_time",
        "drop_off_type",
        "pickup_type",
        "stop_headsign",
        "stop_id",
        "stop_sequence",
    ],
    # Default column mapping, except that the RT arrival column is pointed at
    # the "gap_imputed_sec" column added earlier in the notebook.
    stop_times_table_columns={
        **col.DEFAULT_COLUMN_MAP,
        col.RT_ARRIVAL_SEC: "gap_imputed_sec",
    },
)

In [None]:
# Display the generated feed object as a quick check.
output_feed

In [None]:
# Create the output directory first so the write also works on a fresh checkout.
pathlib.Path("output_feeds").mkdir(parents=True, exist_ok=True)
output_feed.write_zip(f"output_feeds/bbb_test_{SAMPLE_DATE_STR}.zip")

### Dropped shapes and stops

In [None]:
print("Get dropped shapes by their frequency")
# Shapes referenced by trips in the filtered schedule feed but by no trip in the
# output feed, counted by the number of schedule trips that use them.
retained_shape_ids = output_feed.trips.shape_id.unique()
is_dropped_shape = ~feed_filtered.trips.shape_id.isin(retained_shape_ids)
feed_filtered.trips.loc[is_dropped_shape, "shape_id"].value_counts()

In [None]:
print("Get dropped stops by the number of trips serving them in the original feed")
# Stops present in the filtered schedule stop_times but absent from the output
# feed's stop_times, counted by their number of schedule stop_times rows.
retained_stop_ids = output_feed.stop_times.stop_id.unique()
is_dropped_stop = ~feed_filtered.stop_times.stop_id.isin(retained_stop_ids)
dropped_stop_counts = (
    feed_filtered.stop_times.loc[is_dropped_stop, "stop_id"]
    .value_counts()
    .rename("stop_count")
)
# Attach the stop names by stop_id for readability.
stop_names_by_id = feed_filtered.stops.set_index("stop_id")["stop_name"]
pd.DataFrame(dropped_stop_counts).merge(
    stop_names_by_id,
    how="left",
    left_index=True,
    right_index=True,
).head()

### Sample Trip

In [None]:
# Rows of the schedule/RT table belonging to the sample trip.
is_sample_trip_rt = schedule_rt_stop_times_single_agency.trip_id == "902110"
schedule_rt_stop_times_single_agency.loc[is_sample_trip_rt]

In [None]:
# Show the same sample trip ("902110") that the neighboring cells inspect in the
# schedule/RT table and the filtered schedule feed, so the three views can be
# compared. This cell previously filtered on a different trip_id.
output_feed.stop_times.loc[output_feed.stop_times["trip_id"] == "902110"]

In [None]:
# Stop times of the sample trip in the filtered schedule feed, in stop order.
is_sample_trip_schedule = feed_filtered.stop_times["trip_id"] == "902110"
feed_filtered.stop_times.loc[is_sample_trip_schedule].sort_values("stop_sequence")