In [1]:
import google.auth
import datetime as dt
import geopandas as gpd
import pandas as pd
import numpy as np
from gtfslite import GTFS
from shared_utils import rt_dates, catalog_utils, gtfs_utils_v2
import pathlib

In [2]:
# Get RT data

In [3]:
# Resolve Google application-default credentials. google.auth.default() returns a
# two-element tuple; only the first element (the credentials) is kept.
CREDENTIALS, _ = google.auth.default()
# NOTE: not used by any of the cells visible in this notebook
def safe_read_geoparquet(*args, **kwargs):
    """Read a GeoParquet file with geopandas using the module-level CREDENTIALS.

    All positional and keyword arguments are forwarded to ``gpd.read_parquet``.
    ``storage_options`` is always supplied here (a dict holding the token from
    ``CREDENTIALS``), so callers must not pass it themselves.

    Raises:
        AssertionError: if the caller passes ``storage_options``.
    """
    caller_supplied_storage_options = "storage_options" in kwargs
    assert not caller_supplied_storage_options
    token_options = {"token": CREDENTIALS.token}
    return gpd.read_parquet(*args, storage_options=token_options, **kwargs)

In [4]:
# Catalog object for the "gtfs_analytics_data" entry; later cells read table
# locations from its rt_vs_schedule_tables attribute.
GTFS_DATA_DICT = catalog_utils.get_catalog("gtfs_analytics_data")
# Service date analysed throughout the notebook, as an ISO date string
SAMPLE_DATE_STR = rt_dates.DATES["apr2025"]
# Display name of the schedule feed under study
FEED_NAME = "Big Blue Bus Schedule"

In [5]:
# Show the resolved service date
SAMPLE_DATE_STR

'2025-04-16'

In [6]:
# Look up the gtfs_dataset_key of the feed named FEED_NAME on the sample date.
# Using the FEED_NAME constant (rather than repeating the literal) keeps this lookup
# in sync with the configuration cell. `.at` raises KeyError if the name is absent.
feed_key = gtfs_utils_v2.schedule_daily_feed_to_gtfs_dataset_name(
    selected_date=SAMPLE_DATE_STR,
    keep_cols=["name", "gtfs_dataset_key"]
).set_index("name").at[FEED_NAME, "gtfs_dataset_key"]
feed_key

'c65bd95ac0009a74df9ff840fc416771'

In [7]:
# Build the location of the schedule-vs-RT stop times table for the sample date:
# <dir><table name>_<date>.parquet
rt_schedule_stop_times_uri = (
    f"{GTFS_DATA_DICT.rt_vs_schedule_tables.dir}"
    f"{GTFS_DATA_DICT.rt_vs_schedule_tables.schedule_rt_stop_times}"
    f"_{SAMPLE_DATE_STR}.parquet"
)
schedule_rt_stop_times = pd.read_parquet(path=rt_schedule_stop_times_uri)

In [8]:
# Preview the warehouse table before it is filtered to a single feed
schedule_rt_stop_times.head()

Unnamed: 0,trip_id,stop_id,stop_sequence,scheduled_arrival_sec,schedule_gtfs_dataset_key,trip_instance_key,rt_arrival_sec
0,TL-1845,TL-17,60,0.0,2f1c266fc20f9875777fb752af32a66e,ff9d64006546fcaad9e1077b5ac9c1eb,82700
1,TL-1845,TL-18,70,480.0,2f1c266fc20f9875777fb752af32a66e,ff9d64006546fcaad9e1077b5ac9c1eb,82872
2,TL-1845,TL-16,50,,2f1c266fc20f9875777fb752af32a66e,ff9d64006546fcaad9e1077b5ac9c1eb,82599
3,TL-1630,TL-6,50,3240.0,2f1c266fc20f9875777fb752af32a66e,1e84e87e6f17443ef22a689448a7c580,84142
4,TL-1630,TL-7,60,3420.0,2f1c266fc20f9875777fb752af32a66e,1e84e87e6f17443ef22a689448a7c580,84146


In [9]:
# Keep only the warehouse stop times that belong to the feed under study
schedule_rt_stop_times_single_agency = schedule_rt_stop_times[
    schedule_rt_stop_times["schedule_gtfs_dataset_key"].eq(feed_key)
]
# Display one row per trip_id (the first one encountered), ordered by trip_id
(
    schedule_rt_stop_times_single_agency
    .drop_duplicates(subset="trip_id", keep="first")
    .sort_values(by="trip_id")
)

Unnamed: 0,trip_id,stop_id,stop_sequence,scheduled_arrival_sec,schedule_gtfs_dataset_key,trip_instance_key,rt_arrival_sec
475677,1007110,94,36,51326.0,c65bd95ac0009a74df9ff840fc416771,10546ca63c494bcf9619b24f00f8273b,51837
474075,1011110,168,7,57910.0,c65bd95ac0009a74df9ff840fc416771,e786e7fc9ac4cc13f51b40c12c02b96b,59366
472221,1027110,52,20,49192.0,c65bd95ac0009a74df9ff840fc416771,bad286e64a408a75bab09c3a6a49e00d,49065
501208,1028110,610,2,26679.0,c65bd95ac0009a74df9ff840fc416771,49e8559e2caadde947e88fcc442d69ad,26903
474026,103110,790,14,32728.0,c65bd95ac0009a74df9ff840fc416771,f8a254341a1202e793d9a07dd60afb14,32777
...,...,...,...,...,...,...,...
490205,991110,738,39,39074.0,c65bd95ac0009a74df9ff840fc416771,27d83363fcd482f684f17941cad2d72d,39744
472614,994110,155,27,43191.0,c65bd95ac0009a74df9ff840fc416771,1a94ddb2415be4187cd586bd1ea9c04f,43264
495795,995110,748,31,33940.0,c65bd95ac0009a74df9ff840fc416771,0ea983cc4a042fe814c45be1bf2c82f9,33809
487387,996110,434,34,19649.0,c65bd95ac0009a74df9ff840fc416771,e877808264283b34ff246c03f8112ab8,19642


In [10]:
# Get schedule feed

In [11]:
#TODO: right now this was just a download based on the url in airtable
# Need to make it traceable instead
# Directory that is globbed (with GTFS_FEED_GLOB) to find the schedule feed archive
GTFS_FEED_PARENT = "../conveyal_update/feeds_2025-04-16/socal/Big_Blue_Bus_Schedule_7a3f513c343b16a30c135ed7d332b6d6_gtfs.zip/"
# Glob pattern applied inside GTFS_FEED_PARENT
GTFS_FEED_GLOB = "*.zip"

# service_id assigned to every trip (and to the single calendar_dates row) when a
# feed is subset to one service date
ARBITRARY_SERVICE_ID = "0"
# strftime format used to write the service date into calendar_dates (YYYYMMDD)
GTFS_DATE_STRFTIME_CODE = "%Y%m%d"

In [12]:
def copy_GTFS(feed: GTFS) -> GTFS:
    """Deep copy a gtfslite GTFS object

    Each table on ``feed`` is copied before being handed to a new ``GTFS``
    instance, so modifying a table of the returned feed in place cannot alter
    ``feed``. Tables that are ``None`` on ``feed`` are passed through as ``None``.

    Assumes every non-None table exposes a ``.copy()`` method (pandas DataFrames).

    Args:
        feed: the feed to duplicate.

    Returns:
        A newly constructed ``GTFS`` holding independent copies of the tables.
    """
    def _copy_table(table):
        # Tables may legitimately be None (this notebook sets calendar to None)
        return None if table is None else table.copy()

    return GTFS(
        agency=_copy_table(feed.agency),
        stops=_copy_table(feed.stops),
        routes=_copy_table(feed.routes),
        trips=_copy_table(feed.trips),
        stop_times=_copy_table(feed.stop_times),
        calendar=_copy_table(feed.calendar),
        calendar_dates=_copy_table(feed.calendar_dates),
        fare_attributes=_copy_table(feed.fare_attributes),
        fare_rules=_copy_table(feed.fare_rules),
        shapes=_copy_table(feed.shapes),
        frequencies=_copy_table(feed.frequencies),
        transfers=_copy_table(feed.transfers),
        pathways=_copy_table(feed.pathways),
        levels=_copy_table(feed.levels),
        translations=_copy_table(feed.translations),
        feed_info=_copy_table(feed.feed_info),
        attributions=_copy_table(feed.attributions)
    )

def subset_schedule_feed_to_one_date(feed: GTFS, service_date: dt.date) -> GTFS:
    """Build a copy of ``feed`` that only has service on ``service_date``.

    The returned feed keeps only the trips running on ``service_date`` (and their
    stop_times), assigns all of them the single service id ARBITRARY_SERVICE_ID,
    replaces calendar_dates with one row adding that service on ``service_date``,
    and clears calendar. All other tables are carried over unchanged.

    Args:
        feed: the full schedule feed.
        service_date: the date to keep; a ``dt.date`` or ``dt.datetime`` (only
            ``isoformat`` and ``strftime`` are called on it here).

    Returns:
        A new GTFS object; ``feed`` itself is not reassigned or modified here.

    Raises:
        AssertionError: if ``feed.valid_date(service_date)`` is falsy.
    """
    assert feed.valid_date(service_date), f"Feed not valid on {service_date.isoformat()}"
    # Define a new calendar dates, since the synthetic feed will only be valid on the service date
    new_calendar_dates = pd.DataFrame(
        {
            "service_id": [ARBITRARY_SERVICE_ID],
            "date": [service_date.strftime(GTFS_DATE_STRFTIME_CODE)],
            "exception_type": [1]
        },
        index=[0]
    )
    # Get only trips on the calendar date, and update their service id to match the new_calendar_dates
    # (reset_index returns a new frame, so the assignment below does not touch feed.trips)
    trips_on_service_date = feed.date_trips(service_date).reset_index(drop=True)
    trips_on_service_date["service_id"] = ARBITRARY_SERVICE_ID
    # Get only stop_times on the calendar date
    stop_times_on_service_date = feed.stop_times.loc[
        feed.stop_times["trip_id"].isin(trips_on_service_date["trip_id"]) # check if this is slow
    ].reset_index(drop=True)
    #TODO: evaluate whether it is necessary to remove stops, shapes, and transfers that do not have service
    #TODO: add any additional behavior for feeds with frequencies.txt
    #TODO: update feed_info.txt
    # Copy the feed, and update it to only be valid on the service date
    schedule_feed_service_date_only = copy_GTFS(feed)
    # new_calendar_dates was created above and is not shared, so no extra copy is needed
    schedule_feed_service_date_only.calendar_dates = new_calendar_dates
    schedule_feed_service_date_only.calendar = None
    schedule_feed_service_date_only.trips = trips_on_service_date
    schedule_feed_service_date_only.stop_times = stop_times_on_service_date
    return schedule_feed_service_date_only


In [13]:
# Locate the schedule feed archive. Exactly one match is expected: fail with a clear
# message when there is none (a bare next() would raise an opaque StopIteration) or
# when there is more than one.
feed_paths = pathlib.Path(GTFS_FEED_PARENT).glob(GTFS_FEED_GLOB)
feed_path = next(feed_paths, None)
assert feed_path is not None, f"No schedule feed matching {GTFS_FEED_GLOB} found in {GTFS_FEED_PARENT}"
assert (next(feed_paths, None) is None), "Ambiguous Schedule Feed"
feed = GTFS.load_zip(feed_path)
# Reduce the feed to the single service date being analysed
feed_filtered = subset_schedule_feed_to_one_date(
    feed,
    dt.date.fromisoformat(SAMPLE_DATE_STR)
)

In [14]:
# Merge schedule / rt

In [15]:
# Renames applied to the warehouse stop times before merging with the feed's
# stop_times, so the warehouse versions of these columns stay distinguishable from
# the feed's own stop_id / arrival columns and can be compared afterwards.
RT_COLUMN_RENAME_MAP = {
    "stop_id": "warehouse_stop_id",
    "scheduled_arrival_sec": "warehouse_scheduled_arrival_sec",
}

In [16]:
# NOTE: Looks like BBB doesn't run any service after midnight (confirmed by looking at schedule pdfs), need to test with an agency that does
# Compare the hour component numerically: GTFS also accepts H:MM:SS, and a string
# max would rank "9" above "23" or "25", hiding any after-midnight service.
feed_filtered.stop_times.arrival_time.str.split(":").str[0].astype(int).max()

'23'

In [17]:
def time_string_to_time_since_midnight(column: pd.Series) -> pd.Series:
    """Convert GTFS ``HH:MM:SS`` time strings to seconds since midnight.

    Hours are not wrapped, so times past midnight (e.g. ``25:01:25``) map to
    values above 86400. Missing times are passed through as missing rather than
    raising, which is what the later validation of ``feed_arrival_sec`` expects.

    Args:
        column: a Series of colon-separated time strings (may contain NaN).

    Returns:
        A Series aligned with ``column`` holding hours*3600 + minutes*60 + seconds.
    """
    def _parts_to_seconds(parts):
        return int(parts[0]) * 3600 + int(parts[1]) * 60 + int(parts[2])

    # na_action="ignore" skips missing entries instead of calling int() on them
    return column.str.split(":").map(_parts_to_seconds, na_action="ignore")

In [18]:
# Schedule trips keyed by trip_id
schedule_trips_original = feed_filtered.trips.set_index("trip_id")
# Schedule stop times with departure/arrival also expressed as seconds since midnight
schedule_stop_times_original = feed_filtered.stop_times.assign(
    feed_departure_sec=lambda stop_times: time_string_to_time_since_midnight(
        stop_times["departure_time"]
    ),
    feed_arrival_sec=lambda stop_times: time_string_to_time_since_midnight(
        stop_times["arrival_time"]
    ),
)
# Trip ids present in the warehouse data for this feed, in order of first appearance
rt_trip_ids = schedule_rt_stop_times_single_agency["trip_id"].drop_duplicates(keep="first")

# Schedule trips that also appear in the warehouse data (KeyError if any id is unknown)
schedule_trips_in_rt = schedule_trips_original.loc[rt_trip_ids]
# Attach the warehouse columns to each scheduled stop time
stop_times_merged = pd.merge(
    schedule_stop_times_original,
    schedule_rt_stop_times_single_agency.rename(columns=RT_COLUMN_RENAME_MAP),
    how="left", #TODO: left for proof of concept to simplify, should be outer
    on=["trip_id", "stop_sequence"],
    validate="one_to_one",
)

In [19]:
# Validation
# Wherever a warehouse stop_id is present, it equals the feed's stop_id
assert (
    stop_times_merged["warehouse_stop_id"].isna()
    | stop_times_merged["stop_id"].eq(stop_times_merged["warehouse_stop_id"])
).all()
# Scheduled arrival seconds agree between feed and warehouse unless either is missing
assert (
    stop_times_merged["feed_arrival_sec"].isna()
    | stop_times_merged["warehouse_scheduled_arrival_sec"].isna()
    | stop_times_merged["feed_arrival_sec"].eq(stop_times_merged["warehouse_scheduled_arrival_sec"])
).all()
# Every row that matched a warehouse record (schedule_gtfs_dataset_key present)
# has a feed arrival time
assert (
    stop_times_merged["schedule_gtfs_dataset_key"].isna()
    | stop_times_merged["feed_arrival_sec"].notna()
).all()

In [20]:
def seconds_to_gtfs_format_time(time_column: pd.Series) -> pd.Series:
    """Format seconds since midnight (as stored in the warehouse) as HH:MM:SS strings.

    Hours are not wrapped at 24, so 90085 becomes "25:01:25". Each component is
    left-padded with zeros to at least two characters.
    """
    #TODO: this will not handle dst correctly
    def _two_digits(component: pd.Series) -> pd.Series:
        # Integer-cast, stringify, then zero-pad on the left to width 2
        return component.astype(int).astype(str).str.pad(width=2, side="left", fillchar="0")

    whole_hours = time_column // 3600
    leftover_minutes = (time_column % 3600) // 60
    leftover_seconds = time_column % 60
    return (
        _two_digits(whole_hours)
        + ":"
        + _two_digits(leftover_minutes)
        + ":"
        + _two_digits(leftover_seconds)
    )
    
# Spot check, including two values past 24 hours (86400 seconds)
test = pd.Series([90085, 91382, 62425])
seconds_to_gtfs_format_time(test)

0    25:01:25
1    25:23:02
2    17:20:25
dtype: object

In [21]:
# Renames the RT-derived time column to the GTFS stop_times column it replaces
WAREHOUSE_TO_GTFS_FORMAT_COLUMN_MAP = {
    "rt_arrival_gtfs_time": "arrival_time",
}
# stop_times columns carried into the synthetic feed (only those present are kept)
GTFS_FORMAT_KEEP_COLUMNS = [
    "trip_id",
    "arrival_time",
    "stop_id",
    "stop_sequence",
    "stop_headsign", # TODO: included to make testing a little easier, remove after
    "pickup_type",
    "drop_off_type",
    "continuous_pickup",
    "continuous_drop_off"
]
# For now we just remove stop times that did not match a warehouse record
# (schedule_gtfs_dataset_key is null after the left merge). This should be fixed, since
# right now termini seem to always be dropped, as well as other random stops
# Probably need to figure out a way to interpolate these
stop_times_merged_filtered = stop_times_merged.loc[
    ~stop_times_merged["schedule_gtfs_dataset_key"].isna()
].reset_index(drop=True)
stop_times_merged_filtered["rt_arrival_gtfs_time"] = seconds_to_gtfs_format_time(
    stop_times_merged_filtered["rt_arrival_sec"]
)
# Swap the scheduled times for the RT-derived arrival time
stop_times_with_rt_arrival = stop_times_merged_filtered.drop(
    ["arrival_time", "departure_time"], axis=1
).rename(
    columns=WAREHOUSE_TO_GTFS_FORMAT_COLUMN_MAP
)
# Select keep-columns against the *renamed* frame. Checking the pre-rename frame only
# worked because it happened to contain the scheduled arrival_time column.
# np.intersect1d returns the names sorted, so the output columns are alphabetical.
stop_times_gtfs_format_with_rt_times = stop_times_with_rt_arrival[
    np.intersect1d(GTFS_FORMAT_KEEP_COLUMNS, stop_times_with_rt_arrival.columns)
].copy()
# TODO: not sure if this is the correct thing to do, for first trips
stop_times_gtfs_format_with_rt_times["departure_time"] = stop_times_gtfs_format_with_rt_times["arrival_time"].copy()

In [22]:
# Display the synthetic stop_times (RT-derived arrival/departure times)
stop_times_gtfs_format_with_rt_times

Unnamed: 0,arrival_time,drop_off_type,pickup_type,stop_headsign,stop_id,stop_sequence,trip_id,departure_time
0,17:20:37,0,0,41 17TH ST/SMC STATION E LINE,484,2,6110,17:20:37
1,17:21:22,0,0,41 17TH ST/SMC STATION E LINE,92,3,6110,17:21:22
2,17:21:45,0,0,41 17TH ST/SMC STATION E LINE,93,4,6110,17:21:45
3,17:24:16,0,0,41 17TH ST/SMC STATION E LINE,293,5,6110,17:24:16
4,17:25:36,0,0,41 17TH ST/SMC STATION E LINE,305,6,6110,17:25:36
...,...,...,...,...,...,...,...,...
41063,07:40:13,0,0,R3 DOWNTOWN SANTA MONICA,85,11,3197110,07:40:13
41064,07:42:03,0,0,R3 DOWNTOWN SANTA MONICA,625,12,3197110,07:42:03
41065,07:44:35,0,0,R3 DOWNTOWN SANTA MONICA,545,13,3197110,07:44:35
41066,07:47:21,0,0,R3 DOWNTOWN SANTA MONICA,743,14,3197110,07:47:21


In [23]:
# Output a new synthetic feed!

# Alter the feed with the new trips and stop times
altered_feed = copy_GTFS(feed_filtered)
altered_feed.trips = schedule_trips_in_rt.reset_index()
altered_feed.stop_times = stop_times_gtfs_format_with_rt_times

# Not sure if this is appropriate or not, since we're altering. Leaving commented out for now
# Possibly should go in subset_schedule_feed_to_one_date
# (Kept as real comments instead of a bare string literal. Compared with the earlier
#  draft: the `feed_info is not None` branches were inverted, the values are wrapped in
#  lists so the DataFrame constructor accepts them, and feed_version reads a single value.)
# TODO: GTFS dates are YYYYMMDD, while SAMPLE_DATE_STR is an ISO string with dashes
# has_feed_info = altered_feed.feed_info is not None
# new_feed_info = pd.DataFrame({
#     "feed_publisher_name": ["California Department of Transportation"],
#     "feed_publisher_url": ["https://dot.ca.gov"],
#     "feed_lang": [altered_feed.feed_info["feed_lang"].iloc[0] if has_feed_info else np.nan],
#     "feed_start_date": [SAMPLE_DATE_STR],
#     "feed_end_date": [SAMPLE_DATE_STR],
#     "feed_version": [
#         f"retrospective_{altered_feed.feed_info['feed_version'].iloc[0]}_{SAMPLE_DATE_STR}"
#         if has_feed_info else f"retrospective_{SAMPLE_DATE_STR}"
#     ],
# })

# Copy the feed - this is necessary to validate the feed meets the standard since gtfs-lite only validates feeds on creation
output_feed = copy_GTFS(altered_feed)

# Save the feed to a file; create the output directory first in case write_zip does
# not create it (e.g. on a fresh checkout)
pathlib.Path("output_feeds").mkdir(parents=True, exist_ok=True)
output_feed.write_zip(f"output_feeds/bbb_test_{SAMPLE_DATE_STR}.zip")

In [30]:
# Inspect the synthetic stop_times of one trip. NOTE: in the recorded output,
# stop_sequence 3 arrives (10:28:11) before stop_sequence 2 (10:28:27), and
# stop_sequence 1 and 4 are absent.
output_feed.stop_times.loc[output_feed.stop_times["trip_id"] == "902110"].sort_values("stop_sequence")

Unnamed: 0,arrival_time,drop_off_type,pickup_type,stop_headsign,stop_id,stop_sequence,trip_id,departure_time
12871,10:28:27,0,0,1 UCLA,112,2,902110,10:28:27
12872,10:28:11,0,0,1 UCLA,495,3,902110,10:28:11
12873,10:30:18,0,0,1 UCLA,497,5,902110,10:30:18
12874,10:31:52,0,0,1 UCLA,498,6,902110,10:31:52
12875,10:32:43,0,0,1 UCLA,55,7,902110,10:32:43
12876,10:33:51,0,0,1 UCLA,386,8,902110,10:33:51
12877,10:34:25,0,0,1 UCLA,474,9,902110,10:34:25
12878,10:35:22,0,0,1 UCLA,365,10,902110,10:35:22
12879,10:36:11,0,0,1 UCLA,366,11,902110,10:36:11
12880,10:37:01,0,0,1 UCLA,434,12,902110,10:37:01


In [34]:
# Look up stops 112 and 495 (stop_sequence 2 and 3 of trip 902110 in the output feed)
output_feed.stops.set_index("stop_id").loc[["112", "495"]]

Unnamed: 0_level_0,stop_code,stop_name,stop_desc,stop_lat,stop_lon,zone_id,stop_url,location_type,parent_station,stop_timezone,wheelchair_boarding
stop_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
112,2784,MAIN ST & MARKET ST,,33.988651,-118.471372,,,,,,2
495,MNSWSMNF,MAIN ST & WESTMINSTER AVE,,33.990316,-118.472653,,,,,,2


In [31]:
# The same trip in the schedule-only feed, for comparison with the synthetic stop_times
feed_filtered.stop_times.loc[feed_filtered.stop_times["trip_id"] == "902110"].sort_values("stop_sequence")

Unnamed: 0,trip_id,arrival_time,departure_time,stop_id,stop_sequence,stop_headsign,pickup_type,drop_off_type,shape_dist_traveled,timepoint
16151,902110,10:29:00,10:29:00,962,1,1 UCLA,0,0,0.0,1
16152,902110,10:30:00,10:30:00,112,2,1 UCLA,0,0,201.46,1
16153,902110,10:30:32,10:30:32,495,3,1 UCLA,0,0,421.59,0
16154,902110,10:31:16,10:31:16,496,4,1 UCLA,0,0,670.85,0
16155,902110,10:32:11,10:32:11,497,5,1 UCLA,0,0,1014.24,0
16156,902110,10:32:45,10:32:45,498,6,1 UCLA,0,0,1171.95,0
16157,902110,10:33:30,10:33:30,55,7,1 UCLA,0,0,1451.83,0
16158,902110,10:34:42,10:34:42,386,8,1 UCLA,0,0,1865.59,0
16159,902110,10:35:16,10:35:16,474,9,1 UCLA,0,0,2079.43,0
16160,902110,10:36:22,10:36:22,365,10,1 UCLA,0,0,2445.39,0


In [26]:
# Count scheduled trips per shape_id for shapes that no longer appear in the altered feed
print("Get dropped shapes by their frequency")
feed_filtered.trips["shape_id"].loc[
    ~feed_filtered.trips["shape_id"].isin(altered_feed.trips["shape_id"].unique())
].value_counts()

Get dropped shapes by their frequency


shp-009-52    32
shp-009-01    28
shp-009-03     4
shp-009-51     2
Name: shape_id, dtype: int64

In [27]:
# Count scheduled stop_times rows per stop_id for stops absent from the altered feed,
# then attach each stop's name
print("Get dropped stops by the number of trips serving them in the original feed")
(
    feed_filtered.stop_times["stop_id"]
    .loc[
        ~feed_filtered.stop_times["stop_id"].isin(
            altered_feed.stop_times["stop_id"].unique()
        )
    ]
    .value_counts()
    .rename("stop_count")
    .to_frame()
    .merge(
        feed_filtered.stops.set_index("stop_id")["stop_name"],
        left_index=True,
        right_index=True,
        how="left",
    )
)

Get dropped stops by the number of trips serving them in the original feed


Unnamed: 0,stop_count,stop_name
130,303,7TH ST & OLYMPIC BLVD
786,285,UCLA HILGARD TERMINAL
77,212,WESTWOOD PLAZA & STRATHMORE (Gateway Plaza)
962,170,GRAND BLVD & RIVIERA AVE
969,168,MAIN ST & OLYMPIC DR
...,...,...
710,1,SAN VICENTE BLVD & AVONDALE AVE
711,1,SAN VICENTE BLVD & BRISTOL AVE
72,1,SAN VICENTE BLVD & ANITA AVE
689,1,SAN VICENTE BLVD & BUNDY DR


In [28]:
# List the columns present in this feed's stops table
feed_filtered.stops.columns

Index(['stop_id', 'stop_code', 'stop_name', 'stop_desc', 'stop_lat',
       'stop_lon', 'zone_id', 'stop_url', 'location_type', 'parent_station',
       'stop_timezone', 'wheelchair_boarding'],
      dtype='object')