In [3]:
import pandas as pd
import geopandas as gpd
from siuba import *
import shared_utils

from calitp_data_analysis import get_fs, geography_utils
from segment_speed_utils import helpers, time_series_utils, gtfs_schedule_wrangling
from segment_speed_utils.project_vars import SCHED_GCS, SEGMENT_GCS, GTFS_DATA_DICT, analysis_date

In [4]:
import dask.dataframe as dd
import dask_geopandas as dg

# Scratchpad for displaying excluded shapes on speedmaps

* implemented via `ca_transit_speedmaps/speedmap_utils.py` for now
* TODO: make this more systematic, and clarify whether a shape is excluded because of a gap in the speed data or because the shape does not run in the time period

In [5]:
# Load the analytics data catalog by name; later cells read GCS paths and table names from it.
catalog_name = 'gtfs_analytics_data'
catalog = shared_utils.catalog_utils.get_catalog(catalog_name)

In [6]:
# Display the loaded catalog to see which path groups and table names it defines.
catalog

{'gcs_paths': {'GCS': 'gs://calitp-analytics-data/data-analyses/', 'COMPILED_CACHED_VIEWS': '${.GCS}rt_delay/compiled_cached_views/', 'SEGMENT_GCS': '${.GCS}rt_segment_speeds/', 'SCHED_GCS': '${.GCS}gtfs_schedule/', 'RT_SCHED_GCS': '${.GCS}rt_vs_schedule/', 'SHARED_GCS': '${.GCS}shared_data/', 'PREDICTIONS_GCS': '${.GCS}rt_predictions/', 'PUBLIC_GCS': 'gs://calitp-publish-data-analysis/'}, 'speed_vars': {'timestamp_col': 'location_timestamp_local', 'max_speed': 80, 'time_min_cutoff': 10, 'timestamp_cols': ['location_timestamp_local', 'moving_timestamp_local']}, 'schedule_downloads': {'dir': '${gcs_paths.COMPILED_CACHED_VIEWS}', 'trips': 'trips', 'shapes': 'routelines', 'stops': 'stops', 'stop_times': 'st'}, 'speeds_tables': {'dir': '${gcs_paths.SEGMENT_GCS}', 'raw_vp': 'vp', 'usable_vp': 'vp_usable', 'vp_dwell': 'vp_usable_dwell', 'vp_condensed_line': 'condensed/vp_condensed', 'timestamp_col': '${speed_vars.timestamp_col}', 'time_min_cutoff': '${speed_vars.time_min_cutoff}'}, 'schedule

In [7]:
# Keep only the speeds_tables entries whose key mentions 'vp' (the vehicle position tables).
vp_tables = list(filter(lambda table_key: 'vp' in table_key, catalog.speeds_tables.keys()))

In [8]:
# Show the speeds_tables keys that were selected because their name contains 'vp'.
vp_tables

['raw_vp', 'usable_vp', 'vp_dwell', 'vp_condensed_line']

In [17]:
# Path stems (values, not keys) of every speeds_tables entry whose key mentions 'vp'.
paths = [
    table_path
    for table_key, table_path in catalog.speeds_tables.items()
    if 'vp' in table_key
]

In [22]:
def diff_vp_tables(paths: list, dates: tuple):
    """
    Print a quick summary of each vp table on each date so the tables can be
    compared by eye.

    For every table/date pair the function prints the path that was read, a
    divider, the frame info and the number of distinct `trip_instance_key`
    values. A single-file parquet (`<dir><table>_<date>.parquet`) is tried
    first with pandas; if that read fails, the directory
    `<dir><table>_<date>/` is read with dask instead.

    Parameters
    ----------
    paths : list
        Table path stems, appended to `catalog.speeds_tables.dir`
        (e.g. 'vp', 'condensed/vp_condensed').
    dates : tuple
        Date strings as they appear in the file names (e.g. '2024-12-11').

    Returns
    -------
    None. All results are printed.

    Raises
    ------
    Whatever `dd.read_parquet` raises if the dask fallback also fails, and
    any error raised while summarising a frame that was read successfully.
    """
    divider = '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~'
    for table in paths:
        for date in dates:
            stem = f'{catalog.speeds_tables.dir}{table}_{date}'
            file_path = f'{stem}.parquet'
            # Only the read is guarded: a failure while summarising a frame
            # that loaded fine should surface, not silently trigger the fallback.
            try:
                df = pd.read_parquet(file_path)
            except Exception as err:
                # `Exception` (not a bare except) so KeyboardInterrupt/SystemExit
                # still stop the loop; report why the fallback was needed.
                print(f'could not read {file_path} ({type(err).__name__}), trying dask')
                dir_path = f'{stem}/'
                ddf = dd.read_parquet(dir_path)
                print(dir_path)
                print(divider)
                print(f'length: {ddf.shape[0].compute()}')
                print(f'unique trips: {len(ddf.trip_instance_key.unique().compute())}')
                # .info() writes to stdout itself and returns None, so don't print() it.
                ddf.info()
            else:
                print(file_path)
                print(divider)
                # .info() writes to stdout itself and returns None, so don't print() it.
                df.info()
                print(f'unique trips: {len(df.trip_instance_key.unique())}')

In [23]:
# The two analysis dates being compared, looked up by month key.
dates = tuple(
    shared_utils.rt_dates.DATES[month_key]
    for month_key in ('dec2024', 'jan2025')
)

In [24]:
# Show the table path stems that get appended to catalog.speeds_tables.dir when reading.
paths

['vp', 'vp_usable', 'vp_usable_dwell', 'condensed/vp_condensed']

In [25]:
# Print the summary of every vp table for both comparison dates.
diff_vp_tables(paths, dates)

gs://calitp-analytics-data/data-analyses/rt_segment_speeds/vp_2024-12-11.parquet
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15204207 entries, 0 to 15204206
Data columns (total 8 columns):
 #   Column                     Dtype              
---  ------                     -----              
 0   gtfs_dataset_name          object             
 1   schedule_gtfs_dataset_key  object             
 2   trip_id                    object             
 3   trip_instance_key          object             
 4   location_timestamp         datetime64[ns, UTC]
 5   location_timestamp_local   datetime64[ns]     
 6   gtfs_dataset_key           category           
 7   geometry                   object             
dtypes: category(1), datetime64[ns, UTC](1), datetime64[ns](1), object(5)
memory usage: 826.5+ MB
None
unique trips: 93389
gs://calitp-analytics-data/data-analyses/rt_segment_speeds/vp_2025-01-15.parquet
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

In [30]:
# Condensed vp table for the Dec 2024 analysis date.
# The path is built from the catalog and rt_dates instead of being hardcoded,
# so it stays in sync with the locations read by diff_vp_tables.
cond1 = gpd.read_parquet(
    f"{catalog.speeds_tables.dir}{catalog.speeds_tables.vp_condensed_line}"
    f"_{shared_utils.rt_dates.DATES['dec2024']}.parquet"
)

In [31]:
# Condensed vp table for the Jan 2025 analysis date.
# The path is built from the catalog and rt_dates instead of being hardcoded,
# so it stays in sync with the locations read by diff_vp_tables.
cond2 = gpd.read_parquet(
    f"{catalog.speeds_tables.dir}{catalog.speeds_tables.vp_condensed_line}"
    f"_{shared_utils.rt_dates.DATES['jan2025']}.parquet"
)

In [33]:
# Each row holds a list of directions; flatten to one row per vp, then tally.
cond1_directions = cond1['vp_primary_direction'].explode()
cond1_directions.value_counts()

Eastbound     2804652
Westbound     2771695
Southbound    2679640
Northbound    2672020
Unknown       1627534
Name: vp_primary_direction, dtype: int64

In [34]:
# Each row holds a list of directions; flatten to one row per vp, then tally.
cond2_directions = cond2['vp_primary_direction'].explode()
cond2_directions.value_counts()

Eastbound     2825746
Westbound     2792007
Southbound    2703923
Northbound    2692053
Unknown       1628248
Name: vp_primary_direction, dtype: int64

In [35]:
# Inspect the geometry of the Jan 2025 condensed vp table.
# NOTE(review): the saved output shows every column of the frame rather than
# only the geometry column — re-run this cell to confirm the output is current.
cond2.geometry

Unnamed: 0,trip_instance_key,geometry,vp_idx,location_timestamp_local,moving_timestamp_local,vp_primary_direction
0,00001f273308970887743ce0c3acf068,"LINESTRING (-117.06536 32.56955, -117.06539 32...","[8622806, 8622807, 8622808, 8622809, 8622812, ...","[2025-01-15T18:02:45.000000, 2025-01-15T18:03:...","[2025-01-15T18:02:45.000000, 2025-01-15T18:03:...","[Unknown, Westbound, Northbound, Unknown, East..."
1,00005ffae19105cd3addbd6642d7873a,"LINESTRING (-122.21661 37.79909, -122.21661 37...","[12821471, 12821472, 12821474, 12821475, 12821...","[2025-01-15T18:16:54.000000, 2025-01-15T18:17:...","[2025-01-15T18:16:54.000000, 2025-01-15T18:17:...","[Unknown, Unknown, Southbound, Northbound, Unk..."
2,0000b6cbbfc0fb65ecefcc495349015e,"LINESTRING (-122.52319 37.97165, -122.52321 37...","[14157063, 14157064, 14157065, 14157066, 14157...","[2025-01-15T07:47:56.000000, 2025-01-15T07:48:...","[2025-01-15T07:47:56.000000, 2025-01-15T07:48:...","[Unknown, Southbound, Southbound, Northbound, ..."
3,00012e56c8260fd35926619339b4fe17,"LINESTRING (-122.46008 37.70635, -122.46008 37...","[10710639, 10710640, 10710648, 10710649, 10710...","[2025-01-15T13:26:18.000000, 2025-01-15T13:26:...","[2025-01-15T13:26:18.000000, 2025-01-15T13:28:...","[Unknown, Unknown, Eastbound, Unknown, Eastbou..."
4,0002cdd1d51f33e7543289688c01c6e5,"LINESTRING (-117.88335 33.91635, -117.88369 33...","[9327071, 9327072, 9327073, 9327076, 9327077, ...","[2025-01-15T16:10:31.000000, 2025-01-15T16:10:...","[2025-01-15T16:10:31.000000, 2025-01-15T16:10:...","[Unknown, Southbound, Unknown, Southbound, Sou..."
...,...,...,...,...,...,...
90905,fffdd30816e4022838ee42dd45e59318,"LINESTRING (-122.29901 37.90289, -122.29901 37...","[14686171, 14686172, 14686193, 14686194, 14686...","[2025-01-15T12:49:16.000000, 2025-01-15T12:49:...","[2025-01-15T12:49:16.000000, 2025-01-15T12:56:...","[Unknown, Unknown, Westbound, Westbound, Westb..."
90906,fffe75702726c5c94333254be21e475f,"LINESTRING (-117.15356 32.71582, -117.15356 32...","[8085607, 8085608, 8085611, 8085612, 8085614, ...","[2025-01-15T07:57:40.000000, 2025-01-15T07:58:...","[2025-01-15T07:57:40.000000, 2025-01-15T07:59:...","[Unknown, Unknown, Eastbound, Unknown, Westbou..."
90907,fffed00f13282ae103eb572cd9cb9d6b,"LINESTRING (-117.10793 32.66203, -117.10793 32...","[8197030, 8197031, 8197032, 8197033, 8197034, ...","[2025-01-15T17:58:12.000000, 2025-01-15T17:58:...","[2025-01-15T17:58:12.000000, 2025-01-15T17:58:...","[Unknown, Unknown, Southbound, Northbound, Nor..."
90908,fffed04d5fcb59b478ff147c07f5e948,"LINESTRING (-122.50857 37.76033, -122.50857 37...","[11921403, 11921404, 11921410, 11921411, 11921...","[2025-01-15T21:07:00.000000, 2025-01-15T21:07:...","[2025-01-15T21:07:00.000000, 2025-01-15T21:08:...","[Unknown, Unknown, Eastbound, Unknown, Northbo..."
