In [1]:
import datetime as dt
import pathlib

import retrospective_feed_generation.columns as col
import geopandas as gpd
import numpy as np
import pandas as pd
from retrospective_feed_generation.gtfs_utils import *
# GTFS comes from the gtfs-lite package (install with: pip install gtfs-lite)
from gtfslite import GTFS
from retrospective_feed_generation.retrospective_feed_generation import *
from retrospective_feed_generation.retrospective_feed_generation import _filter_non_rt_trips
from shared_utils import gtfs_utils_v2, rt_dates
from retrospective_feed_generation.warehouse_utils import *

### Edit these values to change output

In [2]:
# The target date for feed generation. It is later parsed with
# dt.date.fromisoformat, so it must be an ISO-format date string.
TARGET_DATE = rt_dates.DATES["apr2025"]
# The name (from Airtable) of the schedule feed; used below to look up the
# feed's gtfs_dataset_key by its "name" column.
FEED_NAME = "Big Blue Bus Schedule"
# The local path to the parent directory of the schedule feed.
# Alternative location (kept for reference):
#GTFS_FEED_PARENT = f"../conveyal_update/feeds_{TARGET_DATE}/socal/"
GTFS_FEED_PARENT = "./"
# A glob, evaluated inside GTFS_FEED_PARENT, that must produce exactly one
# result: the schedule feed.
# Alternative glob (kept for reference):
#GTFS_FEED_GLOB = "Big_Blue_Bus_Schedule_*.zip/*.zip"
GTFS_FEED_GLOB = "bbb_original.zip"
# The maximum number of stops where a gap should be imputed
# (passed to impute_unrealistic_rt_times as max_gap_length).
MAX_STOP_GAP = 5
# The path of the output feed zip; the file name embeds TARGET_DATE.
OUTPUT_FEED_PATH = f"output_feeds/bbb_test_{TARGET_DATE}.zip"

In [3]:
# Make sure the output directory exists before the feed is written.
# exist_ok=True removes the check-then-create race and keeps the cell safe to
# re-run; parents=True also creates any missing intermediate directories.
path = pathlib.Path('./output_feeds')
path.mkdir(parents=True, exist_ok=True)

### Get RT Data

In [4]:
# Look up the schedule feed's gtfs_dataset_key for the target date:
# fetch the name -> key table, index it by feed name, then pick FEED_NAME.
feed_keys_by_name = gtfs_utils_v2.schedule_daily_feed_to_gtfs_dataset_name(
    selected_date=TARGET_DATE, keep_cols=["name", "gtfs_dataset_key"]
).set_index("name")
gtfs_dataset_key = feed_keys_by_name.at[FEED_NAME, "gtfs_dataset_key"]
gtfs_dataset_key

'c65bd95ac0009a74df9ff840fc416771'

In [5]:
# Build the merged schedule / RT stop times table for this feed and date,
# pass it through _filter_non_rt_trips, and reset to a fresh default index.
unfiltered_schedule_rt_stop_times = get_schedule_rt_stop_times_table(
    gtfs_dataset_key, TARGET_DATE
)
schedule_rt_stop_times_single_agency = _filter_non_rt_trips(
    unfiltered_schedule_rt_stop_times, col.DEFAULT_COLUMN_MAP
).reset_index(drop=True)

In [6]:
# Optional: uncomment to save the merged stop times table to a local parquet
# file ("cached_feed.parquet") for reuse.
#schedule_rt_stop_times_single_agency.to_parquet("cached_feed.parquet")

In [7]:
# Optional: uncomment to rebuild the merged stop times table from the local
# parquet file ("cached_feed.parquet") instead of calling
# get_schedule_rt_stop_times_table.
# NOTE(review): this call passes the column map as `columns=`, while the live
# call to _filter_non_rt_trips passes it positionally — confirm the keyword
# name before uncommenting.
#schedule_rt_stop_times_single_agency = _filter_non_rt_trips(
#    pd.read_parquet("cached_feed.parquet"), columns=col.DEFAULT_COLUMN_MAP
#).reset_index(drop=True)

In [8]:
# Replace certain unrealistic stop times (first/last, nonmonotonic, short gap)
# with imputed values, stored in a new "gap_imputed_sec" column.
# The imputation logic is still a work in progress.
imputed_arrival_sec = impute_unrealistic_rt_times(
    schedule_rt_stop_times_single_agency,
    max_gap_length=MAX_STOP_GAP,
    columns=col.DEFAULT_COLUMN_MAP,
)
schedule_rt_stop_times_single_agency["gap_imputed_sec"] = imputed_arrival_sec

### Get schedule feed

In [9]:
# Resolve the path to the schedule feed. The glob must match exactly one file;
# fail with an explicit, descriptive error otherwise (a bare next() would raise
# an uninformative StopIteration on no match, and assert is skipped under -O).
feed_paths = sorted(pathlib.Path(GTFS_FEED_PARENT).glob(GTFS_FEED_GLOB))
if not feed_paths:
    raise FileNotFoundError(
        f"No schedule feed matches {GTFS_FEED_GLOB!r} in {GTFS_FEED_PARENT!r}"
    )
if len(feed_paths) > 1:
    raise ValueError(
        f"Ambiguous Schedule Feed: {[str(match) for match in feed_paths]}"
    )
feed_path = feed_paths[0]

# Load the schedule feed using gtfs-lite and filter it to the target date
feed = GTFS.load_zip(feed_path)
feed_filtered = subset_schedule_feed_to_one_date(
    feed, dt.date.fromisoformat(TARGET_DATE)
)

### Merge schedule / RT

In [10]:
# Columns requested for the stop_times table of the generated feed
output_stop_times_columns = [
    "trip_id",
    "arrival_time",
    "departure_time",
    "drop_off_type",
    "pickup_type",
    "stop_headsign",
    "stop_id",
    "stop_sequence",
]
# Default column map, with the RT arrival entry pointed at the imputed column
imputed_column_map = {
    **col.DEFAULT_COLUMN_MAP,
    col.RT_ARRIVAL_SEC: "gap_imputed_sec",
}
# Generate the feed from the imputed RT times and the filtered schedule feed
output_feed = make_retrospective_feed_single_date(
    filtered_input_feed=feed_filtered,
    stop_times_table=schedule_rt_stop_times_single_agency,
    stop_times_desired_columns=output_stop_times_columns,
    stop_times_table_columns=imputed_column_map,
)

merged_filtered columns Index(['trip_id', 'arrival_time', 'departure_time', 'stop_id', 'stop_sequence',
       'stop_headsign', 'pickup_type', 'drop_off_type', 'shape_dist_traveled',
       'timepoint', 'feed_arrival_sec', 'warehouse_stop_id',
       'warehouse_scheduled_arrival_sec', 'schedule_gtfs_dataset_key',
       'trip_instance_key', 'rt_arrival_sec', 'gap_imputed_sec',
       'rt_arrival_gtfs_time'],
      dtype='object')
       trip_id arrival_time departure_time stop_id  stop_sequence  \
0         6110     17:20:00       17:20:00     963              1   
1         6110     17:20:25       17:20:25     484              2   
2         6110     17:21:13       17:21:13      92              3   
3         6110     17:22:23       17:22:23      93              4   
4         6110     17:23:57       17:23:57     293              5   
...        ...          ...            ...     ...            ...   
48624  3197110     07:43:18       07:43:18     545             13   
48625  3197110

In [11]:
# Write the generated feed to OUTPUT_FEED_PATH as a zip file
output_feed.write_zip(OUTPUT_FEED_PATH)

### Dropped shapes and stops

In [12]:
print("Get dropped shapes by their frequency")
# Shapes referenced by trips in the filtered feed but by no trip in the output
# feed, counted by how many filtered-feed trips used each one.
retained_shape_ids = output_feed.trips.shape_id.unique()
is_dropped_shape = ~feed_filtered.trips.shape_id.isin(retained_shape_ids)
feed_filtered.trips.loc[is_dropped_shape, "shape_id"].value_counts()

Get dropped shapes by their frequency


shp-009-52    32
shp-009-01    28
shp-009-03     4
shp-009-51     2
Name: shape_id, dtype: int64

In [13]:
print("Get dropped stops by the number of trips serving them in the original feed")
# Stops present in the filtered feed's stop_times but absent from the output
# feed's stop_times, counted by their number of stop_times rows.
retained_stop_ids = output_feed.stop_times.stop_id.unique()
is_dropped_stop = ~feed_filtered.stop_times.stop_id.isin(retained_stop_ids)
dropped_stop_counts = (
    feed_filtered.stop_times.loc[is_dropped_stop, "stop_id"]
    .value_counts()
    .rename("stop_count")
)
# Attach stop names by stop_id and show the first few rows
stop_names = feed_filtered.stops.set_index("stop_id")["stop_name"]
pd.DataFrame(dropped_stop_counts).merge(
    stop_names,
    how="left",
    left_index=True,
    right_index=True,
).head()

Get dropped stops by the number of trips serving them in the original feed


Unnamed: 0,stop_count,stop_name
125,64,MARQUEZ LOOP
862,34,CHAUTAUQUA BLVD & LA CUMBRE DR
853,34,SUNSET BLVD & BAYLOR ST
854,34,SUNSET BLVD & BIENVENIDA AVE
855,34,SUNSET BLVD & EL MEDIO AVE


### Sample Trip

In [14]:
# Rows of the merged schedule / RT table for the sample trip
is_sample_trip = schedule_rt_stop_times_single_agency["trip_id"] == "902110"
schedule_rt_stop_times_single_agency.loc[is_sample_trip]

Unnamed: 0,trip_id,stop_id,stop_sequence,scheduled_arrival_sec,schedule_gtfs_dataset_key,trip_instance_key,rt_arrival_sec,gap_imputed_sec
22098,902110,962,1,37740.0,c65bd95ac0009a74df9ff840fc416771,75a9cd5a7d094c8b1dcd5ee58b632abf,,37599.0
22099,902110,112,2,37800.0,c65bd95ac0009a74df9ff840fc416771,75a9cd5a7d094c8b1dcd5ee58b632abf,37707.0,37659.0
22100,902110,495,3,37832.0,c65bd95ac0009a74df9ff840fc416771,75a9cd5a7d094c8b1dcd5ee58b632abf,37691.0,37691.0
22101,902110,496,4,37876.0,c65bd95ac0009a74df9ff840fc416771,75a9cd5a7d094c8b1dcd5ee58b632abf,,37747.0
22102,902110,497,5,37931.0,c65bd95ac0009a74df9ff840fc416771,75a9cd5a7d094c8b1dcd5ee58b632abf,37818.0,37818.0
22103,902110,498,6,37965.0,c65bd95ac0009a74df9ff840fc416771,75a9cd5a7d094c8b1dcd5ee58b632abf,37912.0,37912.0
22104,902110,55,7,38010.0,c65bd95ac0009a74df9ff840fc416771,75a9cd5a7d094c8b1dcd5ee58b632abf,37963.0,37963.0
22105,902110,386,8,38082.0,c65bd95ac0009a74df9ff840fc416771,75a9cd5a7d094c8b1dcd5ee58b632abf,38031.0,38031.0
22106,902110,474,9,38116.0,c65bd95ac0009a74df9ff840fc416771,75a9cd5a7d094c8b1dcd5ee58b632abf,38065.0,38065.0
22107,902110,365,10,38182.0,c65bd95ac0009a74df9ff840fc416771,75a9cd5a7d094c8b1dcd5ee58b632abf,38122.0,38122.0


In [15]:
# Stop times of one trip in the generated feed.
# NOTE(review): this inspects trip "143110", while the other sample-trip cells
# inspect "902110" — confirm the different trip_id is intentional.
output_stop_times = output_feed.stop_times
output_stop_times.loc[output_stop_times["trip_id"] == "143110"]

Unnamed: 0,arrival_time,departure_time,drop_off_type,pickup_type,stop_headsign,stop_id,stop_sequence,trip_id
2214,12:05:16,12:05:16,0,0,14 BRENTWOOD,1015,1,143110
2215,12:08:34,12:08:34,0,0,14 BRENTWOOD,1016,2,143110
2216,12:11:23,12:11:23,0,0,14 BRENTWOOD,1017,3,143110
2217,12:12:19,12:12:19,0,0,14 BRENTWOOD,1019,4,143110
2218,12:13:48,12:13:48,0,0,14 BRENTWOOD,1000,5,143110
2219,12:16:58,12:16:58,0,0,14 BRENTWOOD,144,6,143110
2220,12:17:36,12:17:36,0,0,14 BRENTWOOD,145,7,143110
2221,12:19:01,12:19:01,0,0,14 BRENTWOOD,146,8,143110
2222,12:19:45,12:19:45,0,0,14 BRENTWOOD,147,9,143110
2223,12:21:04,12:21:04,0,0,14 BRENTWOOD,148,10,143110


In [16]:
# Stop times of the sample trip in the filtered schedule feed, in stop order
scheduled_stop_times = feed_filtered.stop_times
is_sample_trip_scheduled = scheduled_stop_times["trip_id"] == "902110"
scheduled_stop_times.loc[is_sample_trip_scheduled].sort_values("stop_sequence")

Unnamed: 0,trip_id,arrival_time,departure_time,stop_id,stop_sequence,stop_headsign,pickup_type,drop_off_type,shape_dist_traveled,timepoint
16151,902110,10:29:00,10:29:00,962,1,1 UCLA,0,0,0.0,1
16152,902110,10:30:00,10:30:00,112,2,1 UCLA,0,0,201.46,1
16153,902110,10:30:32,10:30:32,495,3,1 UCLA,0,0,421.59,0
16154,902110,10:31:16,10:31:16,496,4,1 UCLA,0,0,670.85,0
16155,902110,10:32:11,10:32:11,497,5,1 UCLA,0,0,1014.24,0
16156,902110,10:32:45,10:32:45,498,6,1 UCLA,0,0,1171.95,0
16157,902110,10:33:30,10:33:30,55,7,1 UCLA,0,0,1451.83,0
16158,902110,10:34:42,10:34:42,386,8,1 UCLA,0,0,1865.59,0
16159,902110,10:35:16,10:35:16,474,9,1 UCLA,0,0,2079.43,0
16160,902110,10:36:22,10:36:22,365,10,1 UCLA,0,0,2445.39,0
