In [1]:
"""
%%sh
pip install gtfs-lite
cd ~/data-analyses/rt_segment_speeds
pip install -r requirements.txt
"""

'\n%%sh\npip install gtfs-lite\ncd ~/data-analyses/rt_segment_speeds\npip install -r requirements.txt\n'

In [2]:
import datetime as dt
import pathlib

import geopandas as gpd
import google.auth
import numpy as np
import pandas as pd
from gtfslite import GTFS
from shared_utils import catalog_utils, gtfs_utils_v2, rt_dates

In [3]:
from retrospective_feed_generation import *
from warehouse_utils import *
from gtfs_utils import *

### Get RT Data

In [4]:
CREDENTIALS, _ = google.auth.default()

In [5]:
SAMPLE_DATE_STR = rt_dates.DATES["apr2025"]
FEED_NAME = "Big Blue Bus Schedule"

In [6]:
SAMPLE_DATE_STR

'2025-04-16'

In [7]:
gtfs_dataset_key = (
    gtfs_utils_v2.schedule_daily_feed_to_gtfs_dataset_name(
        selected_date=SAMPLE_DATE_STR, keep_cols=["name", "gtfs_dataset_key"]
    )
    .set_index("name")
    .at[FEED_NAME, "gtfs_dataset_key"]
)
gtfs_dataset_key

'c65bd95ac0009a74df9ff840fc416771'

In [8]:
schedule_rt_stop_times_single_agency = get_schedule_rt_stop_times_table(
    gtfs_dataset_key,
    SAMPLE_DATE_STR
)

In [9]:
schedule_rt_stop_times_single_agency.to_parquet("test.parquet")

In [10]:
schedule_rt_stop_times_single_agency = pd.read_parquet("test.parquet")

In [11]:
flagged_stop_times = flag_nonsequential_stops(schedule_rt_stop_times_single_agency)
flagged_trips = flagged_stop_times.loc[
    flagged_stop_times["non_sequential_rt_arrival"],
    "trip_instance_key"
].drop_duplicates()

In [12]:
imputed_stop_times = impute_first_last(flagged_stop_times).dropna(subset=["imputed_arrival_sec"])

In [13]:
schedule_rt_stop_times_single_agency.trip_id.isna().sum()

0

In [14]:
trip_ids = schedule_rt_stop_times_single_agency.trip_instance_key.drop_duplicates()

In [15]:
test = imputed_stop_times.loc[
    imputed_stop_times.trip_instance_key == flagged_trips.iloc[1]
]
test

Unnamed: 0,trip_id,stop_id,stop_sequence,scheduled_arrival_sec,schedule_gtfs_dataset_key,trip_instance_key,rt_arrival_sec,non_sequential_rt_arrival,flag_surrounding_non_sequential_rt_arrival,imputed_arrival_sec
108,630110,311,1,21720.0,c65bd95ac0009a74df9ff840fc416771,0054055035c4602b628f8ac281549c3d,,False,False,21924.0
109,630110,312,2,21788.0,c65bd95ac0009a74df9ff840fc416771,0054055035c4602b628f8ac281549c3d,21992.0,False,False,21992.0
110,630110,646,3,21842.0,c65bd95ac0009a74df9ff840fc416771,0054055035c4602b628f8ac281549c3d,22063.0,False,False,22063.0
111,630110,641,4,21896.0,c65bd95ac0009a74df9ff840fc416771,0054055035c4602b628f8ac281549c3d,22126.0,False,False,22126.0
112,630110,885,5,21970.0,c65bd95ac0009a74df9ff840fc416771,0054055035c4602b628f8ac281549c3d,22201.0,False,False,22201.0
113,630110,412,6,22030.0,c65bd95ac0009a74df9ff840fc416771,0054055035c4602b628f8ac281549c3d,22266.0,False,False,22266.0
114,630110,883,7,22074.0,c65bd95ac0009a74df9ff840fc416771,0054055035c4602b628f8ac281549c3d,22296.0,False,False,22296.0
115,630110,884,8,22112.0,c65bd95ac0009a74df9ff840fc416771,0054055035c4602b628f8ac281549c3d,22359.0,False,False,22359.0
116,630110,879,9,22151.0,c65bd95ac0009a74df9ff840fc416771,0054055035c4602b628f8ac281549c3d,22397.0,False,False,22397.0
117,630110,881,10,22216.0,c65bd95ac0009a74df9ff840fc416771,0054055035c4602b628f8ac281549c3d,22455.0,False,False,22455.0


### Get schedule feed

In [16]:
#TODO: right now this was just a download based on the url in airtable
# Need to make it traceable instead
GTFS_FEED_PARENT = "../conveyal_update/feeds_2025-04-16/socal/Big_Blue_Bus_Schedule_7a3f513c343b16a30c135ed7d332b6d6_gtfs.zip/"
GTFS_FEED_GLOB = "*.zip"

#GTFS_FEED_PARENT = "./feeds/"
#GTFS_FEED_GLOB = "big_blue_bus_2025-03*.zip"

ARBITRARY_SERVICE_ID = "0"
GTFS_DATE_STRFTIME_CODE = "%Y%m%d"

In [17]:
feed_paths = pathlib.Path(GTFS_FEED_PARENT).glob(GTFS_FEED_GLOB)
feed_path = next(feed_paths)
assert next(feed_paths, None) is None, "Ambiguous Schedule Feed"
feed = GTFS.load_zip(feed_path)
feed_filtered = subset_schedule_feed_to_one_date(
    feed, dt.date.fromisoformat(SAMPLE_DATE_STR)
)

### Merge schedule / rt

In [18]:
output_feed = make_retrospective_feed_single_date(
    filtered_input_feed=feed_filtered,
    stop_times_table=imputed_stop_times,
    stop_times_desired_columns=[
        "trip_id",
        "arrival_time",
        "departure_time"
        "drop_off_type",
        "pickup_type",
        "stop_headsign",
        "stop_id",
        "stop_sequence",
    ]
)

In [19]:
output_feed.write_zip(f"output_feeds/bbb_test_{SAMPLE_DATE_STR}.zip")

### Dropped shapes and stops

In [20]:
print("Get dropped shapes by their frequency")
feed_filtered.trips.loc[
    ~feed_filtered.trips.shape_id.isin(output_feed.trips.shape_id.unique()), "shape_id"
].value_counts()

Get dropped shapes by their frequency


shp-009-52    32
shp-009-01    28
shp-009-03     4
shp-009-51     2
Name: shape_id, dtype: int64

In [21]:
print("Get dropped stops by the number of trips serving them in the original feed")
pd.DataFrame(
    feed_filtered.stop_times.loc[
        ~feed_filtered.stop_times.stop_id.isin(
            output_feed.stop_times.stop_id.unique()
        ),
        "stop_id",
    ]
    .value_counts()
    .rename("stop_count")
).merge(
    feed_filtered.stops.set_index("stop_id")["stop_name"],
    how="left",
    left_index=True,
    right_index=True,
)

Get dropped stops by the number of trips serving them in the original feed


Unnamed: 0,stop_count,stop_name
125,64,MARQUEZ LOOP
853,34,SUNSET BLVD & BAYLOR ST
852,34,SUNSET BLVD & ARNO WAY
854,34,SUNSET BLVD & BIENVENIDA AVE
855,34,SUNSET BLVD & EL MEDIO AVE
856,34,SUNSET BLVD & TEMESCAL CANYON RD
902,34,SUNSET BLVD & VIA DE LA PAZ
857,34,SUNSET BLVD & SWARTHMORE AVE
858,34,SUNSET BLVD & CAREY ST
859,34,SUNSET BLVD & DRUMMOND ST


### Sample Trip

In [22]:
output_feed.stop_times.loc[output_feed.stop_times["trip_id"] == "902110"].sort_values(
    "stop_sequence"
)

Unnamed: 0,arrival_time,pickup_type,stop_headsign,stop_id,stop_sequence,trip_id,departure_time
13642,10:27:07,0,1 UCLA,962,1,902110,10:27:07
13643,10:28:27,0,1 UCLA,112,2,902110,10:28:27
13644,10:28:11,0,1 UCLA,495,3,902110,10:28:11
13645,10:30:18,0,1 UCLA,497,5,902110,10:30:18
13646,10:31:52,0,1 UCLA,498,6,902110,10:31:52
13647,10:32:43,0,1 UCLA,55,7,902110,10:32:43
13648,10:33:51,0,1 UCLA,386,8,902110,10:33:51
13649,10:34:25,0,1 UCLA,474,9,902110,10:34:25
13650,10:35:22,0,1 UCLA,365,10,902110,10:35:22
13651,10:36:11,0,1 UCLA,366,11,902110,10:36:11


In [23]:
feed_filtered.stop_times.loc[
    feed_filtered.stop_times["trip_id"] == "902110"
].sort_values("stop_sequence")

Unnamed: 0,trip_id,arrival_time,departure_time,stop_id,stop_sequence,stop_headsign,pickup_type,drop_off_type,shape_dist_traveled,timepoint
16151,902110,10:29:00,10:29:00,962,1,1 UCLA,0,0,0.0,1
16152,902110,10:30:00,10:30:00,112,2,1 UCLA,0,0,201.46,1
16153,902110,10:30:32,10:30:32,495,3,1 UCLA,0,0,421.59,0
16154,902110,10:31:16,10:31:16,496,4,1 UCLA,0,0,670.85,0
16155,902110,10:32:11,10:32:11,497,5,1 UCLA,0,0,1014.24,0
16156,902110,10:32:45,10:32:45,498,6,1 UCLA,0,0,1171.95,0
16157,902110,10:33:30,10:33:30,55,7,1 UCLA,0,0,1451.83,0
16158,902110,10:34:42,10:34:42,386,8,1 UCLA,0,0,1865.59,0
16159,902110,10:35:16,10:35:16,474,9,1 UCLA,0,0,2079.43,0
16160,902110,10:36:22,10:36:22,365,10,1 UCLA,0,0,2445.39,0
