In [1]:
import os
os.environ['USE_PYGEOS'] = '0'

import pandas as pd
import geopandas as gpd
from siuba import *

import datetime as dt
import os

# Example 4-day aggregation using speedmap data

* Export 4 days in April to support MTC Plan Bay Area 2050
* First, run the speedmap pipeline for any days that have not already been run (this can be limited to a subset of operators/districts by using additional scripts and editing the Makefile; see the d4 example)
* Export 4 Days in October 2022 to support D4 Transit Plan
    * filter out mislabeled NCTD (itp_id 226)...

In [2]:
# Analysis dates as YYYY-MM-DD strings.
# To aggregate the April 2023 dates instead, use:
# dates = [f'2023-04-{day}' for day in range(10, 14)]
first_day, last_day = 3, 6
dates = [f'2022-10-0{day}' for day in range(first_day, last_day + 1)]

In [3]:
dates

['2022-10-03', '2022-10-04', '2022-10-05', '2022-10-06']

In [4]:
# Stack the '04 - Oakland' rows of every date's progress parquet into one frame.
progress_all_dates = []
for date in dates:
    df = (
        pd.read_parquet(f'./_rt_progress_{date}.parquet')
        >> filter(_.caltrans_district == '04 - Oakland')
    )
    progress_all_dates.append(df)
four_days = pd.concat(progress_all_dates)

Basic checks on which organizations ran

In [6]:
# four_days >> count(_.organization_name, _.status, _.organization_itp_id)

In [7]:
# four_days >> filter(_.status == 'already_ran')

In [8]:
# Time-of-day windows: 'period' labels the output, 'filter_args' is the dict
# handed to signal_tools.concatenate_speedmap_segments for that window.
am_filter = {
    'period': 'am',
    'filter_args': {'start_time': '06:00', 'end_time': '09:00'},
}
mid_filter = {
    'period': 'mid',
    'filter_args': {'start_time': '10:00', 'end_time': '14:00'},
}
pm_filter = {
    'period': 'pm',
    'filter_args': {'start_time': '15:00', 'end_time': '19:00'},
}
all_filters = [am_filter, mid_filter, pm_filter]

In [9]:
from tqdm.notebook import tqdm

In [10]:
from rt_analysis import signal_tools

In [11]:
signal_tools.concatenate_speedmap_segments?

[0;31mSignature:[0m
[0msignal_tools[0m[0;34m.[0m[0mconcatenate_speedmap_segments[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mprogress_df[0m[0;34m:[0m [0mpandas[0m[0;34m.[0m[0mcore[0m[0;34m.[0m[0mframe[0m[0;34m.[0m[0mDataFrame[0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mitp_id_list[0m[0;34m:[0m [0mlist[0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0manalysis_date[0m[0;34m:[0m [0mdatetime[0m[0;34m.[0m[0mdatetime[0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mpbar[0m[0;34m:[0m [0mtqdm[0m[0;34m.[0m[0mnotebook[0m[0;34m.[0m[0mtqdm_notebook[0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mfilter_args[0m[0;34m:[0m [0mdict[0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
get polygon segments from legacy speedmap workflow, with relevant ids attached
relati

In [12]:
# One progress bar, passed as `pbar` to every concatenate_speedmap_segments call in the export loop.
pbar = tqdm()

0it [00:00, ?it/s]

## loop over dates, using `signal_tools` to aggregate all operators in each time period

In [14]:
for date in dates:
    # Progress rows for this date: '04 - Oakland' district with status 'map_confirmed'.
    progress_df = (
        pd.read_parquet(f'./_rt_progress_{date}.parquet')
        >> filter(_.caltrans_district == '04 - Oakland')
        >> filter(_.status == 'map_confirmed')
    )

    # Build the concatenated segments for every time-of-day window first...
    gdfs = {}
    for time_filter in all_filters:
        period_segments = signal_tools.concatenate_speedmap_segments(
            progress_df=progress_df,
            pbar=pbar,
            filter_args=time_filter['filter_args'],
        )
        gdfs[time_filter['period']] = period_segments

    # ...then write one parquet per (date, period).
    for period, period_gdf in gdfs.items():
        period_gdf.to_parquet(f'_{date}_{period}.parquet')

218
found shapes parquet at gs://calitp-analytics-data/data-analyses/rt_delay/v2_cached_views/shapes_218_2022-10-03.parquet
290
found shapes parquet at gs://calitp-analytics-data/data-analyses/rt_delay/v2_cached_views/shapes_290_2022-10-03.parquet
4
found shapes parquet at gs://calitp-analytics-data/data-analyses/rt_delay/v2_cached_views/shapes_4_2022-10-03.parquet
315
found shapes parquet at gs://calitp-analytics-data/data-analyses/rt_delay/v2_cached_views/shapes_315_2022-10-03.parquet
282
found shapes parquet at gs://calitp-analytics-data/data-analyses/rt_delay/v2_cached_views/shapes_282_2022-10-03.parquet
247
found shapes parquet at gs://calitp-analytics-data/data-analyses/rt_delay/v2_cached_views/shapes_247_2022-10-03.parquet
246
found shapes parquet at gs://calitp-analytics-data/data-analyses/rt_delay/v2_cached_views/shapes_246_2022-10-03.parquet
310
found shapes parquet at gs://calitp-analytics-data/data-analyses/rt_delay/v2_cached_views/shapes_310_2022-10-03.parquet
167
found sh

## concatenate/aggregate 4 day period

* start by aggregating metrics for each time of day period across all dates/operators
* then prefix am/mid/pm observations and concat

In [15]:
def read_all_dates(period):
    """Load the saved `_{date}_{period}.parquet` file for every entry in `dates` and stack them.

    period: time-of-day label used in the file name ('am', 'mid' or 'pm' in this notebook).
    Returns the row-wise concatenation of the per-date frames.
    """
    per_date_paths = [f'_{date}_{period}.parquet' for date in dates]
    return pd.concat([gpd.read_parquet(path) for path in per_date_paths])

In [16]:
# 'am' period segments for every date in `dates`
all_am = read_all_dates(period='am')

In [17]:
# 'pm' period segments for every date in `dates`
all_pm = read_all_dates(period='pm')

In [18]:
# 'mid' period segments for every date in `dates`
all_mid = read_all_dates(period='mid')

In [19]:
# Columns that identify a single segment; used as the groupby keys when
# aggregating across dates and as the join keys when combining periods.
group_cols = [
    'geometry',
    'shape_id',
    'stop_sequence',
    'route_id',
    'route_short_name',
    'direction_id',
    'gtfs_dataset_key',
    'organization_name',
    'miles_from_last',
]

In [20]:
# Per-period columns that get an 'am_' / 'mid_' / 'pm_' prefix after aggregation.
# A name listed here that is absent from the aggregated frame is simply not renamed.
prefix_cols = [
    'p50_mph',
    'p20_mph',
    'p80_mph',
    'fast_slow_ratio',
    'trips_per_hour',
    'time_formatted',
    'system_p50_median',
    'n_weekdays',
]

In [21]:
def aggregate_prefix(period_concat_df, period):
    """Aggregate one time-of-day period across all dates and prefix its metric columns.

    Parameters
    ----------
    period_concat_df : DataFrame / GeoDataFrame
        All per-date segment rows for a single period (the notebook passes the
        result of `read_all_dates`).
    period : str
        Period label ('am', 'mid' or 'pm' in this notebook). `period + '_'` is
        prepended to every column named in `prefix_cols`.

    Returns
    -------
    DataFrame
        One row per `group_cols` combination holding the median of each numeric
        column, plus `<period>_n_weekdays`: the number of input rows sharing that
        shape_id / stop_sequence / gtfs_dataset_key / organization_name
        (presumably one row per analysis date -- TODO confirm).
    """
    count_keys = ['shape_id', 'stop_sequence', 'gtfs_dataset_key', 'organization_name']

    # siuba's select(_.new == _.old) renames, so the row count `n` becomes `n_weekdays`.
    counts = (period_concat_df >> count(_.shape_id, _.stop_sequence, _.gtfs_dataset_key, _.organization_name)
          >> select(_.shape_id, _.stop_sequence, _.gtfs_dataset_key, _.organization_name, _.n_weekdays == _.n))

    # numeric_only=True makes the exclusion of non-numeric columns explicit. Without it
    # pandas 1.x emits a FutureWarning while silently dropping them, and pandas 2.x
    # raises a TypeError when it reaches a string column.
    # NOTE(review): groupby drops rows whose key columns contain nulls by default --
    # confirm that none of `group_cols` can be null for segments that should be kept.
    all_period_agged = period_concat_df.groupby(group_cols).median(numeric_only=True).reset_index()
    all_period_agged = all_period_agged >> inner_join(_, counts, on = count_keys)

    # Prefix the metric columns so the am/mid/pm frames can be joined side by side.
    prefix = period + '_'
    rename_dict = {col: prefix + col for col in prefix_cols}
    all_period_agged = all_period_agged.rename(columns=rename_dict)

    return all_period_agged

In [22]:
# Median 'am' metrics per segment, columns prefixed with 'am_'
am_prefixed = aggregate_prefix(period_concat_df=all_am, period='am')

  all_period_agged = period_concat_df.groupby(group_cols).median().reset_index()


In [23]:
# Median 'pm' metrics per segment, columns prefixed with 'pm_'
pm_prefixed = aggregate_prefix(period_concat_df=all_pm, period='pm')

  all_period_agged = period_concat_df.groupby(group_cols).median().reset_index()


In [24]:
# Median 'mid' metrics per segment, columns prefixed with 'mid_'
mid_prefixed = aggregate_prefix(period_concat_df=all_mid, period='mid')

  all_period_agged = period_concat_df.groupby(group_cols).median().reset_index()


In [25]:
# Full (outer) joins on the segment identity columns: a segment that appears in
# only some of the periods is still kept, with nulls for the missing periods.
am_mid = am_prefixed >> full_join(_, mid_prefixed, on=group_cols)
joined = am_mid >> full_join(_, pm_prefixed, on=group_cols)

In [26]:
joined.columns

Index(['geometry', 'shape_id', 'stop_sequence', 'route_id', 'route_short_name',
       'direction_id', 'gtfs_dataset_key', 'organization_name',
       'miles_from_last', 'am_p50_mph', 'am_p20_mph', 'am_p80_mph',
       'am_fast_slow_ratio', 'am_trips_per_hour', 'am_system_p50_median',
       'am_n_weekdays', 'mid_p50_mph', 'mid_p20_mph', 'mid_p80_mph',
       'mid_fast_slow_ratio', 'mid_trips_per_hour', 'mid_system_p50_median',
       'mid_n_weekdays', 'pm_p50_mph', 'pm_p20_mph', 'pm_p80_mph',
       'pm_fast_slow_ratio', 'pm_trips_per_hour', 'pm_system_p50_median',
       'pm_n_weekdays'],
      dtype='object')

## re-add stop id and stop name

* can't group on these since they are not present for virtual segments!

In [27]:
# Lookup of stop_id / stop_name per (shape_id, stop_sequence, gtfs_dataset_key),
# drawn from every period and date.
# NOTE(review): if one segment ever carries more than one distinct stop_id/stop_name
# pair, the left join that uses this lookup will duplicate that segment -- confirm.
all_periods = pd.concat([all_am, all_mid, all_pm])
df = all_periods >> distinct(
    _.shape_id, _.stop_sequence, _.gtfs_dataset_key, _.stop_id, _.stop_name
)

In [28]:
# Attach stop_id / stop_name to each aggregated segment.
# Not idempotent: running this cell twice joins the stop columns a second time.
stop_keys = ['shape_id', 'stop_sequence', 'gtfs_dataset_key']
joined = joined >> left_join(_, df, on=stop_keys)

In [29]:
joined >> count(_.stop_id, _.stop_name) >> arrange(-_.n)

Unnamed: 0,stop_id,stop_name,n
13930,,,5178
7687,55120,Decoto Rd & Brookmill Dr,20
8217,55877,Decoto Rd & Alvarado-Niles Rd,20
9180,58033,Mission Bell Dr & College Ln,20
11659,821017,Curtola Park & Ride,19
...,...,...,...
13914,893219,Old Sonoma Rd at Underhill Dr NE,1
13915,893220,Old Sonoma Rd at S Seymour St,1
13922,89328,Foothill Blvd at Elm St,1
13923,89329,Laurel St at Jade Ct,1


In [30]:
# Re-wrap as a GeoDataFrame so geo attributes such as `.crs` are available
# (presumably the siuba joins return a plain DataFrame -- TODO confirm).
joined = gpd.GeoDataFrame(joined)

In [31]:
joined.crs

<Projected CRS: EPSG:3310>
Name: NAD83 / California Albers
Axis Info [cartesian]:
- X[east]: Easting (metre)
- Y[north]: Northing (metre)
Area of Use:
- name: United States (USA) - California.
- bounds: (-124.45, 32.53, -114.12, 42.01)
Coordinate Operation:
- name: California Albers
- method: Albers Equal Area
Datum: North American Datum 1983
- Ellipsoid: GRS 1980
- Prime Meridian: Greenwich

In [32]:
# (joined >> filter(_.stop_id.isna())).explore()

In [33]:
# Persist the combined am/mid/pm aggregation.
# NOTE(review): the file name says oct3_oct7, but `dates` covers 2022-10-03 through
# 2022-10-06 -- confirm the intended name before renaming, since other code may read it.
output_path = 'd4_oct3_oct7.parquet'
joined.to_parquet(output_path)