In [13]:
import geopandas as gpd
import pandas as pd
from calitp_data_analysis.gcs_geopandas import GCSGeoPandas
from calitp_data_analysis.gcs_pandas import GCSPandas

from shared_utils import catalog_utils, rt_dates, portfolio_utils
from segment_speed_utils import helpers
from functools import cache

@cache
def gcs_pandas():
    """Return a GCSPandas client.

    The function is wrapped in ``functools.cache``, so the client is built on
    the first call and the same object is handed back on every later call.
    """
    client = GCSPandas()
    return client
    
# GCSGeoPandas client; later cells use its `gcs_filesystem` handle to copy files between buckets.
g = GCSGeoPandas()

# Copy speedmaps data into public bucket to support MTC

In [2]:
# Alternative key sets, left commented out for reference:
# date_keys = ["sep2025d", "sep2025e", "sep2025f"]
# date_keys = ["sep2025c", "oct2025c", "oct2025d", "oct2025e", "oct2025f", "oct2025g"]
# date_keys = ["sep2025a", "sep2025", "sep2025b", "oct2025a", "oct2025", "oct2025b"]

# Active selection: one key per month, January through November 2025.
# Each key is looked up in `rt_dates.DATES` in the next cell.
date_keys = [
    'jan2025',
    'feb2025',
    'mar2025',
    'apr2025',
    'may2025',
    'jun2025',
    'jul2025',
    'aug2025',
    'sep2025',
    'oct2025',
    'nov2025',
]

In [3]:
# Resolve every selected key to its entry in `rt_dates.DATES`, preserving order.
relevant_dates = [rt_dates.DATES[date_key] for date_key in date_keys]
relevant_dates

['2025-01-15',
 '2025-02-12',
 '2025-03-12',
 '2025-04-16',
 '2025-05-14',
 '2025-06-11',
 '2025-07-16',
 '2025-08-20',
 '2025-09-24',
 '2025-10-15',
 '2025-11-05']

In [4]:
# Load the "gtfs_analytics_data" catalog; the speedmap_segments section supplies the path pieces below.
gtfs_yml = catalog_utils.get_catalog("gtfs_analytics_data")

# Path prefix of the aggregated (segment x time-of-day) files; "_<date>.parquet" is appended later.
aggregated_parent = f"{gtfs_yml.speedmap_segments.dir}{gtfs_yml.speedmap_segments.segment_timeofday}"

# Path prefix of the trip-level speeds files; "_<date>.parquet" is appended later.
trip_speeds_parent = f"{gtfs_yml.speedmap_segments.dir}{gtfs_yml.speedmap_segments.stage4}"

In [5]:
# Get aggregated speedmaps segments paths
aggregated_uris = [f"{aggregated_parent}_{date}.parquet" for date in relevant_dates]
aggregated_uris

['gs://calitp-analytics-data/data-analyses/rt_segment_speeds/rollup_singleday/speeds_shape_timeofday_speedmap_segments_2025-01-15.parquet',
 'gs://calitp-analytics-data/data-analyses/rt_segment_speeds/rollup_singleday/speeds_shape_timeofday_speedmap_segments_2025-02-12.parquet',
 'gs://calitp-analytics-data/data-analyses/rt_segment_speeds/rollup_singleday/speeds_shape_timeofday_speedmap_segments_2025-03-12.parquet',
 'gs://calitp-analytics-data/data-analyses/rt_segment_speeds/rollup_singleday/speeds_shape_timeofday_speedmap_segments_2025-04-16.parquet',
 'gs://calitp-analytics-data/data-analyses/rt_segment_speeds/rollup_singleday/speeds_shape_timeofday_speedmap_segments_2025-05-14.parquet',
 'gs://calitp-analytics-data/data-analyses/rt_segment_speeds/rollup_singleday/speeds_shape_timeofday_speedmap_segments_2025-06-11.parquet',
 'gs://calitp-analytics-data/data-analyses/rt_segment_speeds/rollup_singleday/speeds_shape_timeofday_speedmap_segments_2025-07-16.parquet',
 'gs://calitp-analyt

In [6]:
# Build the trip-level speeds parquet URI for each analysis date.
# Each entry is a dict that keeps the date next to its URI, so later cells can
# read both from the same record instead of re-deriving the date from the path.
trip_speeds_uris = [
    {"date": date, "uri": f"{trip_speeds_parent}_{date}.parquet"}
    for date in relevant_dates
]
trip_speeds_uris

[{'date': '2025-01-15',
  'uri': 'gs://calitp-analytics-data/data-analyses/rt_segment_speeds/speedmap/speeds_2025-01-15.parquet'},
 {'date': '2025-02-12',
  'uri': 'gs://calitp-analytics-data/data-analyses/rt_segment_speeds/speedmap/speeds_2025-02-12.parquet'},
 {'date': '2025-03-12',
  'uri': 'gs://calitp-analytics-data/data-analyses/rt_segment_speeds/speedmap/speeds_2025-03-12.parquet'},
 {'date': '2025-04-16',
  'uri': 'gs://calitp-analytics-data/data-analyses/rt_segment_speeds/speedmap/speeds_2025-04-16.parquet'},
 {'date': '2025-05-14',
  'uri': 'gs://calitp-analytics-data/data-analyses/rt_segment_speeds/speedmap/speeds_2025-05-14.parquet'},
 {'date': '2025-06-11',
  'uri': 'gs://calitp-analytics-data/data-analyses/rt_segment_speeds/speedmap/speeds_2025-06-11.parquet'},
 {'date': '2025-07-16',
  'uri': 'gs://calitp-analytics-data/data-analyses/rt_segment_speeds/speedmap/speeds_2025-07-16.parquet'},
 {'date': '2025-08-20',
  'uri': 'gs://calitp-analytics-data/data-analyses/rt_segme

### validation

The validation below doesn't exactly match the script's aggregation, but we're comfortable that these are the right files. It is currently commented out and could be revised.


In [7]:

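# NOTE: this check is disabled. `trip_speeds_uris` now holds dicts, so before re-enabling it
# the read below must use `trip_speeds_uri["uri"]` instead of passing the dict to `pd.read_parquet`.
# # Validate that aggregated speeds match trip speeds in a majority of cases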
# GROUP_COLUMNS = ["time_of_day", "segment_id", "shape_id"]
# quantile_to_speed_column = {0.2: "p20_mph", 0.5: "p50_mph", 0.8: "p80_mph"}
# for aggregated_uri, trip_speeds_uri in zip(aggregated_uris, trip_speeds_uris):
#     # read aggregated and precursor data
#     aggregated = g.read_parquet(aggregated_uri)
#     trip_speeds = pd.read_parquet(trip_speeds_uri)
#     # reindex aggregated_speeds and group trip_speeds_grouped so they theoretically have the same data
#     aggregated_speeds = aggregated.set_index(GROUP_COLUMNS)[
#         list(quantile_to_speed_column.values())
#     ]
#     trip_speeds_grouped = trip_speeds.groupby(GROUP_COLUMNS)
#     trip_speeds_percentiles = pd.concat(
#         [
#             trip_speeds_grouped["speed_mph"].quantile(q).rename(name).round(2)
#             for q, name in quantile_to_speed_column.items()
#         ],
#         axis=1,
#     )
#     # the aggregated data appears to contain a subset of the trip speeds data, so only look at that subset
#     trip_speeds_percentiles_subset = trip_speeds_percentiles.loc[
#         aggregated_speeds.index
#     ]
#     # find the rows where at least one speed doesn't match
#     non_equal = (trip_speeds_percentiles_subset != aggregated_speeds).any(axis=1)
    
#     print(
#         f"for {aggregated_uri}, {non_equal.sum()} out of {len(aggregated_speeds.index)} speeds did not match"
#     )

## export

In [7]:
# Map each source URI of an aggregated file to its destination in the publish bucket,
# keeping the original file name (the last path component).
output_aggregated_uris = dict(
    (old_uri, f"gs://calitp-publish-data-analysis/mtc_collab_2025/speeds/{old_uri.split('/')[-1]}")
    for old_uri in aggregated_uris
)
# Trip-level speeds do not get a separate mapping here: their destination is attached
# to each `trip_speeds_uris` entry as `output_uri` in the following cell.

In [8]:
# Attach the destination URI (publish bucket, same file name) to every trip-speeds entry.
for entry in trip_speeds_uris:
    entry.update(
        {'output_uri': f"gs://calitp-publish-data-analysis/mtc_collab_2025/speeds/{entry['uri'].split('/')[-1]}"}
    )

In [9]:
# Show the entries again, now that each one carries an `output_uri`.
trip_speeds_uris

[{'date': '2025-01-15',
  'uri': 'gs://calitp-analytics-data/data-analyses/rt_segment_speeds/speedmap/speeds_2025-01-15.parquet',
  'output_uri': 'gs://calitp-publish-data-analysis/mtc_collab_2025/speeds/speeds_2025-01-15.parquet'},
 {'date': '2025-02-12',
  'uri': 'gs://calitp-analytics-data/data-analyses/rt_segment_speeds/speedmap/speeds_2025-02-12.parquet',
  'output_uri': 'gs://calitp-publish-data-analysis/mtc_collab_2025/speeds/speeds_2025-02-12.parquet'},
 {'date': '2025-03-12',
  'uri': 'gs://calitp-analytics-data/data-analyses/rt_segment_speeds/speedmap/speeds_2025-03-12.parquet',
  'output_uri': 'gs://calitp-publish-data-analysis/mtc_collab_2025/speeds/speeds_2025-03-12.parquet'},
 {'date': '2025-04-16',
  'uri': 'gs://calitp-analytics-data/data-analyses/rt_segment_speeds/speedmap/speeds_2025-04-16.parquet',
  'output_uri': 'gs://calitp-publish-data-analysis/mtc_collab_2025/speeds/speeds_2025-04-16.parquet'},
 {'date': '2025-05-14',
  'uri': 'gs://calitp-analytics-data/data-an

In [10]:
# Show the source -> destination mapping for the aggregated files.
output_aggregated_uris

{'gs://calitp-analytics-data/data-analyses/rt_segment_speeds/rollup_singleday/speeds_shape_timeofday_speedmap_segments_2025-01-15.parquet': 'gs://calitp-publish-data-analysis/mtc_collab_2025/speeds/speeds_shape_timeofday_speedmap_segments_2025-01-15.parquet',
 'gs://calitp-analytics-data/data-analyses/rt_segment_speeds/rollup_singleday/speeds_shape_timeofday_speedmap_segments_2025-02-12.parquet': 'gs://calitp-publish-data-analysis/mtc_collab_2025/speeds/speeds_shape_timeofday_speedmap_segments_2025-02-12.parquet',
 'gs://calitp-analytics-data/data-analyses/rt_segment_speeds/rollup_singleday/speeds_shape_timeofday_speedmap_segments_2025-03-12.parquet': 'gs://calitp-publish-data-analysis/mtc_collab_2025/speeds/speeds_shape_timeofday_speedmap_segments_2025-03-12.parquet',
 'gs://calitp-analytics-data/data-analyses/rt_segment_speeds/rollup_singleday/speeds_shape_timeofday_speedmap_segments_2025-04-16.parquet': 'gs://calitp-publish-data-analysis/mtc_collab_2025/speeds/speeds_shape_timeofday

In [11]:
def add_speeds_identifiers(speeds_df, analysis_date):
    """Add trip and operator identifiers to a trip-level speeds frame.

    Scheduled trips for ``analysis_date`` are loaded and merged onto
    ``speeds_df``, then the result is passed through
    ``portfolio_utils.standardize_operator_info_for_exports``.

    Args:
        speeds_df: trip-level speeds; must contain the merge keys
            ``trip_instance_key`` and ``schedule_gtfs_dataset_key``.
        analysis_date: date passed both to the scheduled-trips import and to
            the operator-info standardization.

    Returns:
        The merged and standardized frame.
    """
    # NOTE(review): `gtfs_dataset_key` is requested here but the merge uses
    # `schedule_gtfs_dataset_key`; presumably the helper renames it - confirm.
    trip_columns = ['trip_instance_key', 'trip_id', 'gtfs_dataset_key']
    scheduled_trips = helpers.import_scheduled_trips(analysis_date, columns=trip_columns)

    # Default (inner) merge: speed rows with no matching scheduled trip are dropped.
    merge_keys = ['trip_instance_key', 'schedule_gtfs_dataset_key']
    with_trip_ids = speeds_df.merge(scheduled_trips, on=merge_keys)

    return portfolio_utils.standardize_operator_info_for_exports(
        with_trip_ids, date=analysis_date
    )

In [14]:
# Publish both datasets from the private bucket to the public / mtc-visible bucket.
fs = g.gcs_filesystem

# Aggregated segment files are copied across unchanged.
for old_uri, new_uri in output_aggregated_uris.items():
    fs.copy(old_uri, new_uri)

# Trip-level speeds are read, passed through add_speeds_identifiers, and written
# to their destination; each destination URI is printed once its file is written.
for uri_group in trip_speeds_uris:
    speeds_df = add_speeds_identifiers(
        gcs_pandas().read_parquet(uri_group['uri']),
        analysis_date=uri_group['date'],
    )
    gcs_pandas().data_frame_to_parquet(speeds_df, uri_group['output_uri'])
    print(uri_group['output_uri'])

gs://calitp-publish-data-analysis/mtc_collab_2025/speeds/speeds_2025-01-15.parquet
gs://calitp-publish-data-analysis/mtc_collab_2025/speeds/speeds_2025-02-12.parquet
gs://calitp-publish-data-analysis/mtc_collab_2025/speeds/speeds_2025-03-12.parquet
gs://calitp-publish-data-analysis/mtc_collab_2025/speeds/speeds_2025-04-16.parquet
gs://calitp-publish-data-analysis/mtc_collab_2025/speeds/speeds_2025-05-14.parquet
gs://calitp-publish-data-analysis/mtc_collab_2025/speeds/speeds_2025-06-11.parquet
gs://calitp-publish-data-analysis/mtc_collab_2025/speeds/speeds_2025-07-16.parquet
gs://calitp-publish-data-analysis/mtc_collab_2025/speeds/speeds_2025-08-20.parquet
gs://calitp-publish-data-analysis/mtc_collab_2025/speeds/speeds_2025-09-24.parquet
gs://calitp-publish-data-analysis/mtc_collab_2025/speeds/speeds_2025-10-15.parquet
gs://calitp-publish-data-analysis/mtc_collab_2025/speeds/speeds_2025-11-05.parquet


In [17]:
# List the destination URIs of the aggregated files; the export loop only prints
# the trip-level destinations, so these are shown separately.
for old_uri, new_uri in output_aggregated_uris.items():
    print(new_uri)

gs://calitp-publish-data-analysis/mtc_collab_2025/speeds/speeds_shape_timeofday_speedmap_segments_2025-01-15.parquet
gs://calitp-publish-data-analysis/mtc_collab_2025/speeds/speeds_shape_timeofday_speedmap_segments_2025-02-12.parquet
gs://calitp-publish-data-analysis/mtc_collab_2025/speeds/speeds_shape_timeofday_speedmap_segments_2025-03-12.parquet
gs://calitp-publish-data-analysis/mtc_collab_2025/speeds/speeds_shape_timeofday_speedmap_segments_2025-04-16.parquet
gs://calitp-publish-data-analysis/mtc_collab_2025/speeds/speeds_shape_timeofday_speedmap_segments_2025-05-14.parquet
gs://calitp-publish-data-analysis/mtc_collab_2025/speeds/speeds_shape_timeofday_speedmap_segments_2025-06-11.parquet
gs://calitp-publish-data-analysis/mtc_collab_2025/speeds/speeds_shape_timeofday_speedmap_segments_2025-07-16.parquet
gs://calitp-publish-data-analysis/mtc_collab_2025/speeds/speeds_shape_timeofday_speedmap_segments_2025-08-20.parquet
gs://calitp-publish-data-analysis/mtc_collab_2025/speeds/speeds_

In [16]:
# `speeds_df` still holds the frame from the last iteration of the export loop;
# list its columns to check which identifiers ended up in the exported files.
speeds_df.columns

Index(['trip_instance_key', 'stop_sequence', 'stop_sequence1',
       'shape_array_key', 'stop_meters', 'arrival_time', 'arrival_time_sec',
       'subseq_arrival_time_sec', 'subseq_stop_meters', 'meters_elapsed',
       'sec_elapsed', 'speed_mph', 'schedule_gtfs_dataset_key', 'route_id',
       'direction_id', 'route_short_name', 'scheduled_service_minutes',
       'sched_rt_category', 'time_of_day', 'shape_id', 'segment_id',
       'stop_pair_name', 'stop_pair', 'trip_id', 'name', 'base64_url',
       'caltrans_district', 'analysis_name', 'source_record_id'],
      dtype='object')