In [None]:
import datetime as dt
import pathlib

import columns as col
import geopandas as gpd
import google.auth
import numpy as np
import pandas as pd
from gtfs_utils import *
#  pip install gtfs-lite
from gtfslite import GTFS
from retrospective_feed_generation import *
from retrospective_feed_generation import _filter_na_stop_times, _filter_non_rt_trips
from shared_utils import catalog_utils, gtfs_utils_v2, rt_dates
from warehouse_utils import *

### Edit these values to change output

In [None]:
# Service date to generate the feed for. This value is later parsed with
# dt.date.fromisoformat, so it must be an ISO-format date string.
TARGET_DATE = rt_dates.DATES["apr2025"]
# Name (from airtable) of the schedule feed; used to look up its gtfs_dataset_key
FEED_NAME = "Big Blue Bus Schedule"
# Local path to the parent directory of the schedule feed (depends on TARGET_DATE)
GTFS_FEED_PARENT = f"../conveyal_update/feeds_{TARGET_DATE}/socal/"
# Glob, evaluated within GTFS_FEED_PARENT, that leads to the schedule feed.
# It should produce exactly one result; more than one is treated as ambiguous.
GTFS_FEED_GLOB = "Big_Blue_Bus_Schedule_*.zip/*.zip"
# Maximum number of stops where a gap should be imputed
# (passed to impute_unrealistic_rt_times as max_gap_length)
MAX_STOP_GAP = 5
# Path that the generated output feed is written to (depends on TARGET_DATE)
OUTPUT_FEED_PATH = f"output_feeds/bbb_test_{TARGET_DATE}.zip"

In [None]:
# Make sure the output directory exists before any feed is written.
# exist_ok=True makes the cell safe to re-run without a separate existence check,
# and parents=True creates any missing parent directories as well.
path = pathlib.Path('./output_feeds')
path.mkdir(parents=True, exist_ok=True)

### Get RT Data

In [None]:
# Look up the gtfs_dataset_key of the schedule feed named FEED_NAME on TARGET_DATE
feed_keys_by_name = gtfs_utils_v2.schedule_daily_feed_to_gtfs_dataset_name(
    selected_date=TARGET_DATE,
    keep_cols=["name", "gtfs_dataset_key"],
).set_index("name")
gtfs_dataset_key = feed_keys_by_name.at[FEED_NAME, "gtfs_dataset_key"]
# Display the key as the cell output
gtfs_dataset_key

In [None]:
# Fetch the merged schedule/rt stop times table for this feed and date
schedule_rt_stop_times_unfiltered = get_schedule_rt_stop_times_table(
    gtfs_dataset_key, TARGET_DATE
)
# Apply the non-rt trip filter, then renumber the rows from zero
schedule_rt_stop_times_single_agency = _filter_non_rt_trips(
    schedule_rt_stop_times_unfiltered, col.DEFAULT_COLUMN_MAP
).reset_index(drop=True)

In [None]:
# Optional: uncomment to cache the merged table locally as a parquet file
#schedule_rt_stop_times_single_agency.to_parquet("cached_feed.parquet")

In [None]:
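# Optional: uncomment to load the merged table from a local parquet cache
# ("cached_feed.parquet") instead of rebuilding it
#schedule_rt_stop_times_single_agency = _filter_non_rt_trips(
#    pd.read_parquet("cached_feed.parquet"), columns=col.DEFAULT_COLUMN_MAP
#).reset_index(drop=True)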

In [None]:
# Replace certain unrealistic rt stop times (first/last, nonmonotonic, short gap)
# with imputed values. The imputation logic is still a work in progress.
gap_imputed_sec = impute_unrealistic_rt_times(
    schedule_rt_stop_times_single_agency,
    max_gap_length=MAX_STOP_GAP,
    columns=col.DEFAULT_COLUMN_MAP,
)
# Store the imputed times alongside the original columns
schedule_rt_stop_times_single_agency["gap_imputed_sec"] = gap_imputed_sec

### Get schedule feed

In [None]:
# Get the path to the schedule feed: the glob must match exactly one path
feed_paths = sorted(pathlib.Path(GTFS_FEED_PARENT).glob(GTFS_FEED_GLOB))
if not feed_paths:
    # Fail with a descriptive error rather than a bare StopIteration from next()
    raise FileNotFoundError(
        f"No schedule feed matching {GTFS_FEED_GLOB!r} found in {GTFS_FEED_PARENT!r}"
    )
assert len(feed_paths) == 1, "Ambiguous Schedule Feed"
feed_path = feed_paths[0]

# Load the schedule feed using gtfs-lite and subset it to TARGET_DATE
feed = GTFS.load_zip(feed_path)
feed_filtered = subset_schedule_feed_to_one_date(
    feed, dt.date.fromisoformat(TARGET_DATE)
)

### Merge schedule / rt

In [None]:
# Generate the feed based on the imputed rt times and the downloaded schedule feed
output_feed = make_retrospective_feed_single_date(
    filtered_input_feed=feed_filtered,
    stop_times_table=schedule_rt_stop_times_single_agency,
    # Each column name must be its own list element: adjacent string literals
    # without a comma are silently concatenated into a single (wrong) name.
    stop_times_desired_columns=[
        "trip_id",
        "arrival_time",
        "departure_time",
        "drop_off_type",
        "pickup_type",
        "stop_headsign",
        "stop_id",
        "stop_sequence",
    ],
    stop_times_table_columns={
        **col.DEFAULT_COLUMN_MAP,
        # Override the default mapping so the rt arrival entry points at the
        # "gap_imputed_sec" column computed during imputation
        col.RT_ARRIVAL_SEC: "gap_imputed_sec",
    },
)

In [None]:
# Save the generated output feed to a zip file at OUTPUT_FEED_PATH
output_feed.write_zip(OUTPUT_FEED_PATH)

### Dropped shapes and stops

In [None]:
print("Get dropped shapes by their frequency")
# Shapes still referenced by at least one trip in the output feed
retained_shape_ids = output_feed.trips.shape_id.unique()
# Trips in the filtered schedule feed whose shape is absent from the output feed
has_dropped_shape = ~feed_filtered.trips.shape_id.isin(retained_shape_ids)
feed_filtered.trips.loc[has_dropped_shape, "shape_id"].value_counts()

In [None]:
print("Get dropped stops by the number of trips serving them in the original feed")
# Stops still present in the output feed's stop_times
retained_stop_ids = output_feed.stop_times.stop_id.unique()
# Stop_times rows in the filtered schedule feed whose stop is absent from the output feed
has_dropped_stop = ~feed_filtered.stop_times.stop_id.isin(retained_stop_ids)
dropped_stop_counts = (
    feed_filtered.stop_times.loc[has_dropped_stop, "stop_id"]
    .value_counts()
    .rename("stop_count")
)
# Attach stop names by joining on the stop_id index, then preview the first rows
stop_names = feed_filtered.stops.set_index("stop_id")["stop_name"]
dropped_stop_counts.to_frame().merge(
    stop_names,
    how="left",
    left_index=True,
    right_index=True,
).head()

### Sample Trip

In [None]:
# Rows of the merged schedule/rt table that belong to the sample trip
is_sample_trip = schedule_rt_stop_times_single_agency.trip_id == "902110"
schedule_rt_stop_times_single_agency.loc[is_sample_trip]

In [None]:
# Show the same sample trip ("902110") that the other sample-trip cells inspect,
# so the generated stop times can be compared against the rt and schedule tables
output_feed.stop_times.loc[output_feed.stop_times["trip_id"] == "902110"]

In [None]:
# Stop times of the sample trip in the filtered schedule feed, ordered by stop_sequence
scheduled_stop_times = feed_filtered.stop_times
is_sample_trip_scheduled = scheduled_stop_times["trip_id"] == "902110"
scheduled_stop_times.loc[is_sample_trip_scheduled].sort_values("stop_sequence")