In [2]:
import pandas as pd
import geopandas as gpd
from siuba import *
import shared_utils

from calitp_data_analysis import get_fs, geography_utils
from segment_speed_utils import helpers, time_series_utils, gtfs_schedule_wrangling
from segment_speed_utils.project_vars import SCHED_GCS, SEGMENT_GCS, GTFS_DATA_DICT, analysis_date

In [3]:
import dask.dataframe as dd
import dask_geopandas as dg

# What's up with January?

* Follow Tiffany's suggestion to look at the speed tables.
* https://github.com/cal-itp/data-analyses/pull/1356

In [4]:
# Load the 'gtfs_analytics_data' catalog; later cells read table names and GCS directories from it.
catalog = shared_utils.catalog_utils.get_catalog('gtfs_analytics_data')

In [5]:
# Keep only the speeds_tables entries whose catalog key mentions 'vp'.
vp_tables = [
    table_key
    for table_key in catalog.speeds_tables.keys()
    if 'vp' in table_key
]

In [6]:
vp_tables

['raw_vp', 'usable_vp', 'vp_dwell', 'vp_condensed_line']

In [7]:
# Look up the catalog value for each 'vp' key selected in vp_tables
# (same filter as before, without repeating the condition).
paths = [catalog.speeds_tables[table_key] for table_key in vp_tables]

In [8]:
paths

['vp', 'vp_usable', 'vp_usable_dwell', 'condensed/vp_condensed']

In [19]:
def diff_tables(paths: list, dates: tuple, gcs_dir = catalog.speeds_tables.dir):
    """
    Print a short summary of each table for each date so the dates can be
    compared by eye.

    For every (table, date) pair, `{gcs_dir}{table}_{date}.parquet` is read
    with pandas. If that file cannot be opened, `{gcs_dir}{table}_{date}/`
    is read with dask instead (presumably a partitioned dataset -- confirm
    against how the tables are exported).

    Parameters
    ----------
    paths : list
        Table path stems, relative to `gcs_dir` (e.g. 'vp', 'vp_usable').
    dates : tuple
        Date strings that get appended to each table stem.
    gcs_dir : str
        Directory prefix; defaults to `catalog.speeds_tables.dir`, evaluated
        once when this function is defined.

    Returns
    -------
    None. Everything is printed: the path read, the frame's `.info()`, and
    the number of unique `trip_instance_key` values (plus the row count for
    the dask fallback).

    Raises
    ------
    Any error from the dask fallback read, and any non-OSError raised by the
    pandas read or by the reporting itself (e.g. a table without a
    `trip_instance_key` column). These are no longer swallowed.
    """
    divider = '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~'

    for table in paths:
        for date in dates:
            path = f'{gcs_dir}{table}_{date}.parquet'
            # Only the read is inside the try: a failure while reporting should
            # surface as-is rather than trigger a second read from another path.
            try:
                df = pd.read_parquet(path)
            except OSError:
                # FileNotFoundError is an OSError: no single-file export here,
                # so try the directory-style path with dask.
                path = f'{gcs_dir}{table}_{date}/'
                ddf = dd.read_parquet(path)
                print(path)
                print(divider)
                print(f'length: {ddf.shape[0].compute()}')
                print(f'unique trips: {len(ddf.trip_instance_key.unique().compute())}')
                # .info() prints its own report and returns None, so don't wrap it in print()
                ddf.info()
            else:
                print(path)
                print(divider)
                # .info() prints its own report and returns None, so don't wrap it in print()
                df.info()
                print(f'unique trips: {len(df.trip_instance_key.unique())}')

In [20]:
# Analysis dates to compare: the December 2024 and January 2025 entries of rt_dates.DATES.
dates = tuple(
    shared_utils.rt_dates.DATES[month_key]
    for month_key in ('dec2024', 'jan2025')
)

In [21]:
paths

['vp', 'vp_usable', 'vp_usable_dwell', 'condensed/vp_condensed']

In [22]:
# Summarize every vp table for both dates, using the default speeds_tables directory.
diff_tables(paths=paths, dates=dates)

gs://calitp-analytics-data/data-analyses/rt_segment_speeds/vp_2024-12-11.parquet
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15204207 entries, 0 to 15204206
Data columns (total 8 columns):
 #   Column                     Dtype              
---  ------                     -----              
 0   gtfs_dataset_name          object             
 1   schedule_gtfs_dataset_key  object             
 2   trip_id                    object             
 3   trip_instance_key          object             
 4   location_timestamp         datetime64[ns, UTC]
 5   location_timestamp_local   datetime64[ns]     
 6   gtfs_dataset_key           category           
 7   geometry                   object             
dtypes: category(1), datetime64[ns, UTC](1), datetime64[ns](1), object(5)
memory usage: 826.5+ MB
None
unique trips: 93389
gs://calitp-analytics-data/data-analyses/rt_segment_speeds/vp_2025-01-15.parquet
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

## compare stage4 (trip speeds)

In [15]:
catalog.stop_segments.dir

'gs://calitp-analytics-data/data-analyses/rt_segment_speeds/'

In [16]:
catalog.stop_segments.stage4

'speeds_stop_segments'

In [23]:
# Summarize the stop_segments stage2 table for both dates.
# NOTE(review): the section header says stage4, but stage2 is what gets passed here -- confirm which was intended.
diff_tables(
    paths=[catalog.stop_segments.stage2],
    dates=dates,
    gcs_dir=catalog.stop_segments.dir,
)

gs://calitp-analytics-data/data-analyses/rt_segment_speeds/nearest/nearest_vp_shape_segments_2024-12-11.parquet
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
<class 'pandas.core.frame.DataFrame'>
Int64Index: 3120555 entries, 0 to 3120554
Data columns (total 8 columns):
 #   Column             Dtype  
---  ------             -----  
 0   trip_instance_key  object 
 1   stop_sequence      int64  
 2   shape_array_key    object 
 3   stop_meters        float64
 4   prior_vp_idx       int64  
 5   subseq_vp_idx      int64  
 6   prior_vp_meters    int64  
 7   subseq_vp_meters   float64
dtypes: float64(2), int64(4), object(2)
memory usage: 214.3+ MB
None
unique trips: 79652
gs://calitp-analytics-data/data-analyses/rt_segment_speeds/nearest/nearest_vp_shape_segments_2025-01-15.parquet
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
<class 'pandas.core.frame.DataFrame'>
Int64Index: 235003 entries, 0 to 235002
Data columns (total 8 columns):
 #   Column             Non-Null Count   Dt

In [24]:
# Summarize the speedmap_segments stage2 table for both dates.
diff_tables(
    paths=[catalog.speedmap_segments.stage2],
    dates=dates,
    gcs_dir=catalog.speedmap_segments.dir,
)

gs://calitp-analytics-data/data-analyses/rt_segment_speeds/nearest/nearest_vp_speedmap_proxy_2024-12-11.parquet
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
<class 'pandas.core.frame.DataFrame'>
Int64Index: 577121 entries, 0 to 577120
Data columns (total 9 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   trip_instance_key  577121 non-null  object 
 1   stop_sequence      577121 non-null  int64  
 2   stop_sequence1     577121 non-null  float64
 3   shape_array_key    577121 non-null  object 
 4   stop_meters        577121 non-null  float64
 5   prior_vp_idx       577121 non-null  int64  
 6   subseq_vp_idx      577121 non-null  int64  
 7   prior_vp_meters    577121 non-null  int64  
 8   subseq_vp_meters   577121 non-null  float64
dtypes: float64(3), int64(4), object(2)
memory usage: 44.0+ MB
None
unique trips: 51468
gs://calitp-analytics-data/data-analyses/rt_segment_speeds/nearest/nearest_vp_speedmap_proxy_2025-01