In [2]:
import geopandas as gpd
import pandas as pd
from calitp_data_analysis.gcs_geopandas import GCSGeoPandas
from shared_utils import catalog_utils, rt_dates, portfolio_utils
from segment_speed_utils import helpers
# Shared GCSGeoPandas client; the export cell below takes its `gcs_filesystem` handle from it.
g = GCSGeoPandas()

# Copy speedmaps data into public bucket to support MTC

In [3]:
# Service dates to publish, looked up by their keys in rt_dates.DATES:
# three consecutive days in September 2025 and three in October 2025.
relevant_dates = [
    rt_dates.DATES[date_key]
    for date_key in (
        "sep2025a",
        "sep2025",
        "sep2025b",
        "oct2025a",
        "oct2025",
        "oct2025b",
    )
]
relevant_dates

['2025-09-23',
 '2025-09-24',
 '2025-09-25',
 '2025-10-14',
 '2025-10-15',
 '2025-10-16']

In [4]:
# Read the GTFS analytics catalog and build the path prefixes (directory + file
# stem, without the date suffix) for the two speedmap datasets handled below.
gtfs_yml = catalog_utils.get_catalog("gtfs_analytics_data")
speedmap_config = gtfs_yml.speedmap_segments

# Segment speeds aggregated by shape and time of day.
aggregated_parent = f"{speedmap_config.dir}{speedmap_config.segment_timeofday}"
# Trip-level speeds that the aggregation is built from.
trip_speeds_parent = f"{speedmap_config.dir}{speedmap_config.stage4}"

In [5]:
# One aggregated speedmap segments parquet path per analysis date
# (path prefix + "_<date>.parquet").
aggregated_uris = [
    f"{aggregated_parent}_{analysis_date}.parquet"
    for analysis_date in relevant_dates
]
aggregated_uris

['gs://calitp-analytics-data/data-analyses/rt_segment_speeds/rollup_singleday/speeds_shape_timeofday_speedmap_segments_2025-09-23.parquet',
 'gs://calitp-analytics-data/data-analyses/rt_segment_speeds/rollup_singleday/speeds_shape_timeofday_speedmap_segments_2025-09-24.parquet',
 'gs://calitp-analytics-data/data-analyses/rt_segment_speeds/rollup_singleday/speeds_shape_timeofday_speedmap_segments_2025-09-25.parquet',
 'gs://calitp-analytics-data/data-analyses/rt_segment_speeds/rollup_singleday/speeds_shape_timeofday_speedmap_segments_2025-10-14.parquet',
 'gs://calitp-analytics-data/data-analyses/rt_segment_speeds/rollup_singleday/speeds_shape_timeofday_speedmap_segments_2025-10-15.parquet',
 'gs://calitp-analytics-data/data-analyses/rt_segment_speeds/rollup_singleday/speeds_shape_timeofday_speedmap_segments_2025-10-16.parquet']

In [6]:
# Trip-level speeds: keep each analysis date next to the path of its parquet
# file, since the date is needed again when the file is enriched for export.
trip_speeds_uris = [
    dict(date=analysis_date, uri=f"{trip_speeds_parent}_{analysis_date}.parquet")
    for analysis_date in relevant_dates
]
trip_speeds_uris

[{'date': '2025-09-23',
  'uri': 'gs://calitp-analytics-data/data-analyses/rt_segment_speeds/speedmap/speeds_2025-09-23.parquet'},
 {'date': '2025-09-24',
  'uri': 'gs://calitp-analytics-data/data-analyses/rt_segment_speeds/speedmap/speeds_2025-09-24.parquet'},
 {'date': '2025-09-25',
  'uri': 'gs://calitp-analytics-data/data-analyses/rt_segment_speeds/speedmap/speeds_2025-09-25.parquet'},
 {'date': '2025-10-14',
  'uri': 'gs://calitp-analytics-data/data-analyses/rt_segment_speeds/speedmap/speeds_2025-10-14.parquet'},
 {'date': '2025-10-15',
  'uri': 'gs://calitp-analytics-data/data-analyses/rt_segment_speeds/speedmap/speeds_2025-10-15.parquet'},
 {'date': '2025-10-16',
  'uri': 'gs://calitp-analytics-data/data-analyses/rt_segment_speeds/speedmap/speeds_2025-10-16.parquet'}]

### validation

The validation below doesn't exactly match the script's aggregation, but we're comfortable these are the right files. It is left commented out and could be revised later.


In [None]:

# # Validate that aggregated speeds match trip speeds in a majority of cases
# GROUP_COLUMNS = ["time_of_day", "segment_id", "shape_id"]
# quantile_to_speed_column = {0.2: "p20_mph", 0.5: "p50_mph", 0.8: "p80_mph"}
# for aggregated_uri, trip_speeds_uri in zip(aggregated_uris, trip_speeds_uris):
#     # read aggregated and precursor data
#     aggregated = g.read_parquet(aggregated_uri)
#     trip_speeds = pd.read_parquet(trip_speeds_uri["uri"])  # trip_speeds_uris entries are dicts; the path is under "uri"
#     # reindex aggregated_speeds and group trip_speeds_grouped so they theoretically have the same data
#     aggregated_speeds = aggregated.set_index(GROUP_COLUMNS)[
#         list(quantile_to_speed_column.values())
#     ]
#     trip_speeds_grouped = trip_speeds.groupby(GROUP_COLUMNS)
#     trip_speeds_percentiles = pd.concat(
#         [
#             trip_speeds_grouped["speed_mph"].quantile(q).rename(name).round(2)
#             for q, name in quantile_to_speed_column.items()
#         ],
#         axis=1,
#     )
#     # the aggregated data appears to contain a subset of the trip speeds data, so only look at that subset
#     trip_speeds_percentiles_subset = trip_speeds_percentiles.loc[
#         aggregated_speeds.index
#     ]
#     # find the rows where at least one speed doesn't match
#     non_equal = (trip_speeds_percentiles_subset != aggregated_speeds).any(axis=1)
    
#     print(
#         f"for {aggregated_uri}, {non_equal.sum()} out of {len(aggregated_speeds.index)} speeds did not match"
#     )

## export

In [7]:
# Destination in the public bucket for every aggregated file, keyed by its
# source URI. The original file name (last path component) is kept.
output_aggregated_uris = {
    source_uri: (
        "gs://calitp-publish-data-analysis/mtc_collab_2025/speeds/"
        f"{source_uri.rsplit('/', 1)[-1]}"
    )
    for source_uri in aggregated_uris
}
# Destinations for the trip-level speeds are not kept in a separate mapping;
# they are stored on the trip_speeds_uris entries under "output_uri".

In [13]:
# Record, on each trip speeds entry, where its enriched copy will be written in
# the public bucket. The source file name (last path component) is reused.
for entry in trip_speeds_uris:
    entry["output_uri"] = (
        "gs://calitp-publish-data-analysis/mtc_collab_2025/speeds/"
        f"{entry['uri'].rsplit('/', 1)[-1]}"
    )

In [14]:
# Show the entries again to confirm each one now carries an "output_uri".
trip_speeds_uris

[{'date': '2025-09-23',
  'uri': 'gs://calitp-analytics-data/data-analyses/rt_segment_speeds/speedmap/speeds_2025-09-23.parquet',
  'output_uri': 'gs://calitp-publish-data-analysis/mtc_collab_2025/speeds/speeds_2025-09-23.parquet'},
 {'date': '2025-09-24',
  'uri': 'gs://calitp-analytics-data/data-analyses/rt_segment_speeds/speedmap/speeds_2025-09-24.parquet',
  'output_uri': 'gs://calitp-publish-data-analysis/mtc_collab_2025/speeds/speeds_2025-09-24.parquet'},
 {'date': '2025-09-25',
  'uri': 'gs://calitp-analytics-data/data-analyses/rt_segment_speeds/speedmap/speeds_2025-09-25.parquet',
  'output_uri': 'gs://calitp-publish-data-analysis/mtc_collab_2025/speeds/speeds_2025-09-25.parquet'},
 {'date': '2025-10-14',
  'uri': 'gs://calitp-analytics-data/data-analyses/rt_segment_speeds/speedmap/speeds_2025-10-14.parquet',
  'output_uri': 'gs://calitp-publish-data-analysis/mtc_collab_2025/speeds/speeds_2025-10-14.parquet'},
 {'date': '2025-10-15',
  'uri': 'gs://calitp-analytics-data/data-an

In [15]:
def add_speeds_identifiers(speeds_df, analysis_date):
    """Attach scheduled-trip and operator identifiers to trip-level speeds.

    Parameters
    ----------
    speeds_df : DataFrame
        Trip-level speeds. Must contain the merge keys ``trip_instance_key``
        and ``schedule_gtfs_dataset_key``.
    analysis_date : str
        Service date; passed to the scheduled-trips lookup and to the operator
        standardization step.

    Returns
    -------
    DataFrame
        ``speeds_df`` merged with the scheduled trips, then passed through
        ``portfolio_utils.standardize_operator_info_for_exports``.

    Notes
    -----
    The merge uses pandas' default inner join, so speed rows without a matching
    scheduled trip are not present in the result.
    """
    # NOTE(review): `gtfs_dataset_key` is requested here while the merge key is
    # `schedule_gtfs_dataset_key`; presumably the helper renames the column on
    # import -- confirm against helpers.import_scheduled_trips.
    scheduled_trips = helpers.import_scheduled_trips(
        analysis_date,
        columns=["trip_instance_key", "trip_id", "gtfs_dataset_key"],
    )
    merge_keys = ["trip_instance_key", "schedule_gtfs_dataset_key"]
    speeds_with_trips = speeds_df.merge(scheduled_trips, on=merge_keys)
    return portfolio_utils.standardize_operator_info_for_exports(
        speeds_with_trips, date=analysis_date
    )

In [16]:
# Publish data from the private bucket to the public / MTC-visible bucket.
fs = g.gcs_filesystem

# Straight copy of the aggregated files; currently disabled.
# for old_uri, new_uri in output_aggregated_uris.items():
#     fs.copy(old_uri, new_uri)

# Trip-level speeds are not copied verbatim: each file is read, enriched with
# trip and operator identifiers, and then written to its public destination.
for uri_group in trip_speeds_uris:
    speeds_df = add_speeds_identifiers(
        pd.read_parquet(uri_group["uri"]),
        analysis_date=uri_group["date"],
    )
    speeds_df.to_parquet(uri_group["output_uri"])
    # Log each destination as it is written.
    print(uri_group["output_uri"])

gs://calitp-publish-data-analysis/mtc_collab_2025/speeds/speeds_2025-09-23.parquet
gs://calitp-publish-data-analysis/mtc_collab_2025/speeds/speeds_2025-09-24.parquet
gs://calitp-publish-data-analysis/mtc_collab_2025/speeds/speeds_2025-09-25.parquet
gs://calitp-publish-data-analysis/mtc_collab_2025/speeds/speeds_2025-10-14.parquet
gs://calitp-publish-data-analysis/mtc_collab_2025/speeds/speeds_2025-10-15.parquet
gs://calitp-publish-data-analysis/mtc_collab_2025/speeds/speeds_2025-10-16.parquet
