In [1]:
import os
os.environ['USE_PYGEOS'] = '0'

import pandas as pd
import geopandas as gpd
from siuba import *

import datetime as dt
import os

In [2]:
from rt_analysis import rt_parser

In [3]:
from tqdm.notebook import tqdm

In [4]:
from rt_analysis import signal_tools

In [5]:
pbar = tqdm()

0it [00:00, ?it/s]

In [6]:
from shared_utils import rt_utils, rt_dates

# Example 4-day aggregation using speedmap data

* Export 4 days in April to support MTC Plan Bay Area 2050
* First, run the speedmap pipeline for any days not already run (can filter to a subset of operators/districts if desired by using additional scripts and editing the Makefile; see the d4 example)
* Export 4 days in November 2022 (Nov 7–10) to support D4 Transit Plan
    * filter out mislabeled NCTD (itp_id 226)...

In [7]:
# Keys into the shared `rt_dates.DATES` mapping for the four analysis days.
keys = [
    'nov2022a',
    'nov2022b',
    'nov2022c',
    'nov2022d',
]
# Resolve each key to its date value, preserving the order of `keys`.
dates = [rt_dates.DATES[k] for k in keys]

In [8]:
dates

['2022-11-07', '2022-11-08', '2022-11-09', '2022-11-10']

In [9]:
# Collect each date's progress table, restricted to Caltrans District 4,
# then stack them into a single frame.
progress_all_dates = []
for date in dates:
    df = (
        pd.read_parquet(f'./_rt_progress_{date}.parquet')
        >> filter(_.caltrans_district == '04 - Oakland')
    )
    progress_all_dates.append(df)
four_days = pd.concat(progress_all_dates)

Basic checks on which organizations ran

In [None]:
# four_days >> filter(_.organization_name.str.contains('Valley'))

In [None]:
# four_days >> count(_.organization_name, _.status, _.organization_itp_id)

In [None]:
# four_days >> filter(_.status == 'already_ran')

In [None]:
# One entry per time-of-day period: a label plus the start/end times that are
# passed along as `filter_args`.
am_filter = dict(period='am', filter_args=dict(start_time='06:00', end_time='09:00'))
mid_filter = dict(period='mid', filter_args=dict(start_time='10:00', end_time='14:00'))
pm_filter = dict(period='pm', filter_args=dict(start_time='15:00', end_time='19:00'))
all_filters = [am_filter, mid_filter, pm_filter]

In [None]:
signal_tools.concatenate_speedmap_segments?

In [None]:
pbar = tqdm()

## loop over dates, using `signal_tools` to aggregate all operators in each time period

In [None]:
# for date in dates:
#     progress_df = pd.read_parquet(f'./_rt_progress_{date}.parquet')
#     progress_df = progress_df >> filter(_.caltrans_district == '04 - Oakland') >> filter(_.status == 'map_confirmed')
#     gdfs = {}
#     for time_filter in all_filters:
#         gdfs[time_filter['period']] = signal_tools.concatenate_speedmap_segments(progress_df = progress_df, pbar=pbar, filter_args=time_filter['filter_args'])
#     for period in gdfs.keys():
#         gdfs[period].to_parquet(f'_{date}_{period}.parquet')

## concatenate/aggregate 4 day period

* start by aggregating metrics for each time of day period across all dates/operators
* then prefix am/mid/pm observations and concat

In [None]:
def read_all_dates(period):
    """Load the saved segments for `period` on every date and stack them.

    Reads `_{date}_{period}.parquet` for each entry of the notebook-level
    `dates` list and returns the row-wise concatenation of the results.
    """
    per_date = [gpd.read_parquet(f'_{day}_{period}.parquet') for day in dates]
    return pd.concat(per_date)

In [None]:
all_am = read_all_dates('am')

In [None]:
all_pm = read_all_dates('pm')

In [None]:
all_mid = read_all_dates('mid')

In [None]:
# Columns used as the groupby keys in `aggregate_prefix` and as the join keys
# when the am/mid/pm tables are combined.
group_cols = [
    'geometry',
    'shape_id',
    'stop_sequence',
    'route_id',
    'route_short_name',
    'direction_id',
    'gtfs_dataset_key',
    'organization_name',
    'miles_from_last',
]

In [None]:
# Columns that `aggregate_prefix` renames with a period prefix (e.g. `am_`).
prefix_cols = [
    'p50_mph',
    'p20_mph',
    'p80_mph',
    'fast_slow_ratio',
    'trips_per_hour',
    'time_formatted',
    'system_p50_median',
    'n_weekdays',
]

In [None]:
def aggregate_prefix(period_concat_df, period):
    """Aggregate one time-of-day period across all dates and prefix its metrics.

    Parameters
    ----------
    period_concat_df : DataFrame or GeoDataFrame
        Concatenated rows for a single period, as returned by
        `read_all_dates`.
    period : str
        Period label (the notebook uses 'am', 'mid', 'pm'). `period + '_'` is
        prepended to every column listed in `prefix_cols`.

    Returns
    -------
    DataFrame
        One row per `group_cols` combination holding the median of each
        numeric column, joined to a per-segment row count named `n_weekdays`,
        with the `prefix_cols` columns renamed to carry the period prefix.
    """
    segment_keys = ['shape_id', 'stop_sequence', 'gtfs_dataset_key', 'organization_name']

    # Rows observed per segment; siuba's `select(new == old)` renames `n` to
    # `n_weekdays` (presumably one row per date -- confirm upstream).
    counts = (period_concat_df
              >> count(_.shape_id, _.stop_sequence, _.gtfs_dataset_key, _.organization_name)
              >> select(_.shape_id, _.stop_sequence, _.gtfs_dataset_key, _.organization_name,
                        _.n_weekdays == _.n))

    # numeric_only=True makes the dropping of non-numeric columns explicit.
    # Older pandas dropped them silently; pandas >= 2.0 raises TypeError when
    # asked for the median of an object column.
    all_period_agged = (period_concat_df
                        .groupby(group_cols)
                        .median(numeric_only=True)
                        .reset_index())
    all_period_agged = all_period_agged >> inner_join(_, counts, on=segment_keys)

    # Columns in `prefix_cols` that are absent from the frame are ignored by
    # `rename`.
    rename_dict = {col: period + '_' + col for col in prefix_cols}
    return all_period_agged.rename(columns=rename_dict)

In [None]:
am_prefixed = aggregate_prefix(all_am, 'am')

In [None]:
pm_prefixed = aggregate_prefix(all_pm, 'pm')

In [None]:
mid_prefixed = aggregate_prefix(all_mid, 'mid')

In [None]:
# Combine the three period tables side by side; full joins keep segments that
# appear in only some of the periods.
joined = (
    am_prefixed
    >> full_join(_, mid_prefixed, on=group_cols)
    >> full_join(_, pm_prefixed, on=group_cols)
)

In [None]:
joined.columns

## re-add stop id and stop name

* can't group on these since not present for virtual segments!

In [None]:
# Distinct stop id/name combinations per segment key across all three periods,
# used to re-attach stop details after aggregation.
df = (
    pd.concat([all_am, all_mid, all_pm])
    >> distinct(
        _.shape_id,
        _.stop_sequence,
        _.gtfs_dataset_key,
        _.stop_id,
        _.stop_name,
    )
)

In [None]:
joined = joined >> left_join(_, df, on=['shape_id', 'stop_sequence', 'gtfs_dataset_key'])

In [None]:
joined >> count(_.stop_id, _.stop_name) >> arrange(-_.n)

In [None]:
joined = gpd.GeoDataFrame(joined)

In [None]:
joined.crs

In [None]:
# (joined >> filter(_.stop_id.isna())).explore()

In [None]:
joined.to_parquet('d4_nov7_nov10.parquet')

In [10]:
joined = gpd.read_parquet('d4_nov7_nov10.parquet')

In [11]:
mtc = gpd.read_parquet('./d4_apr10_apr13_express_fixed2.parquet')

In [12]:
# Organizations present in the November export but missing from the MTC export.
# `~` is the logical-NOT operator for boolean masks; unary `-` only works
# because pandas special-cases it for bool data.
joined >> distinct(_.organization_name) >> filter(~_.organization_name.isin(mtc.organization_name))

Unnamed: 0,organization_name
9,City of Petaluma
13,Contra Costa County Transportation Authority
15,Livermore / Amador\n Valley Transit Authority


In [13]:
# Organizations present in the MTC export but missing from the November export.
# `~` is the logical-NOT operator for boolean masks; unary `-` only works
# because pandas special-cases it for bool data.
mtc >> distinct(_.organization_name) >> filter(~_.organization_name.isin(joined.organization_name))

Unnamed: 0,organization_name
10,Cloverdale Transit
12,City of Santa Rosa
13,Napa Valley Transportation Authority
15,Solano Transportation Authority
16,Western Contra Costa Transit Authority
17,Central Contra Costa Transit Authority
19,Livermore / Amador Valley Transit Authority


In [14]:
mtc.shape

(66602, 32)

In [15]:
joined.shape

(52315, 32)