In [1]:
import geopandas as gpd
import pandas as pd
from siuba import *

from segment_speed_utils import helpers, gtfs_schedule_wrangling
from shared_utils import rt_dates

In [2]:
from update_vars import analysis_date, AM_PEAK, PM_PEAK

In [3]:
AM_PEAK

(datetime.time(6, 0), datetime.time(9, 0))

In [4]:
PM_PEAK

(datetime.time(15, 0), datetime.time(19, 0))

In [5]:
am_peak_hrs = list(range(AM_PEAK[0].hour, AM_PEAK[1].hour))
pm_peak_hrs = list(range(PM_PEAK[0].hour, PM_PEAK[1].hour))

In [6]:
both_peaks_hrs = am_peak_hrs + pm_peak_hrs

In [7]:
peaks_dict = {key: 'am_peak' for key in am_peak_hrs} | {key: 'pm_peak' for key in pm_peak_hrs}

In [8]:
peaks_dict

{6: 'am_peak',
 7: 'am_peak',
 8: 'am_peak',
 15: 'pm_peak',
 16: 'pm_peak',
 17: 'pm_peak',
 18: 'pm_peak'}

## sjoin stops to segs -- subset peak?

In [9]:
import sjoin_stops_to_segments

In [10]:
imported_st = helpers.import_scheduled_stop_times(
    analysis_date,
    get_pandas = True,
)

In [11]:
#  happens in st agg...
imported_st = imported_st.assign(
        departure_hour = pd.to_datetime(
            imported_st.departure_sec, unit="s").dt.hour
    )

In [12]:
imported_st.dtypes

feed_key           object
feed_timezone      object
base64_url         object
trip_id            object
stop_id            object
stop_sequence       int64
timepoint         float64
arrival_sec       float64
departure_sec     float64
arrival_hour      float64
departure_hour    float64
dtype: object

In [13]:
imported_st >> head(3)

Unnamed: 0,feed_key,feed_timezone,base64_url,trip_id,stop_id,stop_sequence,timepoint,arrival_sec,departure_sec,arrival_hour,departure_hour
0,55d8763ca8845a4efa50e40a9b972e1c,America/Los_Angeles,aHR0cHM6Ly9hcGkuNTExLm9yZy90cmFuc2l0L2RhdGFmZW...,dgxbvk5qu,11,1,1.0,43200.0,43200.0,12.0,12.0
1,55d8763ca8845a4efa50e40a9b972e1c,America/Los_Angeles,aHR0cHM6Ly9hcGkuNTExLm9yZy90cmFuc2l0L2RhdGFmZW...,992,GA,1,1.0,85200.0,85200.0,23.0,23.0
2,55d8763ca8845a4efa50e40a9b972e1c,America/Los_Angeles,aHR0cHM6Ly9hcGkuNTExLm9yZy90cmFuc2l0L2RhdGFmZW...,ogkycex13,11,1,1.0,15000.0,15000.0,4.0,4.0


In [14]:
both_peak_filtered = imported_st[imported_st['arrival_hour'].isin(both_peaks_hrs)]

In [15]:
both_peak_filtered['peak'] = both_peak_filtered['arrival_hour'].map(peaks_dict)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  both_peak_filtered['peak'] = both_peak_filtered['arrival_hour'].map(peaks_dict)


In [16]:
both_peak_filtered >> head(2)

Unnamed: 0,feed_key,feed_timezone,base64_url,trip_id,stop_id,stop_sequence,timepoint,arrival_sec,departure_sec,arrival_hour,departure_hour,peak
3,55d8763ca8845a4efa50e40a9b972e1c,America/Los_Angeles,aHR0cHM6Ly9hcGkuNTExLm9yZy90cmFuc2l0L2RhdGFmZW...,hcm9spuol,11,1,1.0,59400.0,59400.0,16.0,16.0,pm_peak
5,55d8763ca8845a4efa50e40a9b972e1c,America/Los_Angeles,aHR0cHM6Ly9hcGkuNTExLm9yZy90cmFuc2l0L2RhdGFmZW...,1335,LTP,1,1.0,64080.0,64080.0,17.0,17.0,pm_peak


In [17]:
# (1) Aggregate stop times - by stop_id, find max trips in AM/PM peak
# takes 1 min
max_arrivals_by_stop = both_peak_filtered.pipe(sjoin_stops_to_segments.stop_times_aggregation_max_by_stop, analysis_date)

In [18]:
#  peak filtered but old methodology...
max_arrivals_by_stop >> head(2)

Unnamed: 0,schedule_gtfs_dataset_key,stop_id,am_max_trips,pm_max_trips,n_trips
0,0139b1253130b33adcd4b3a4490530d2,00eb15cb-1430-4964-b8ae-ca6183e1d0ef,1,1,2.0
1,0139b1253130b33adcd4b3a4490530d2,02a30e39-496f-45d4-ba1c-ac8f3c66b621,3,4,7.0


In [19]:
def dev_stop_times_aggregation_max_by_stop(
    stop_times: pd.DataFrame, 
    analysis_date: str,
    single_route_dir: bool = False,
) -> pd.DataFrame:
    """
    Take the stop_times table 
    and group by stop_id-departure hour
    and count how many trips occur.
    """
    stop_cols = ["schedule_gtfs_dataset_key", "stop_id"]
    trips_per_hour_cols = ["peak"]
    
    if single_route_dir:
        trips_per_hour_cols += ["route_id", "direction_id"]

    gtfs_key = helpers.import_scheduled_trips(
        analysis_date,
        # columns = ["feed_key", "gtfs_dataset_key", "route_id", "direction_id"],
        columns = ["feed_key", "gtfs_dataset_key"],
        get_pandas = True
    )
    
    stop_times = stop_times.assign(
        departure_hour = pd.to_datetime(
            stop_times.departure_sec, unit="s").dt.hour
    ).merge(
        gtfs_key,
        on = "feed_key"
    )
    # display(stop_times.head(2))
            
    # Aggregate how many trips are made at that stop by departure hour
    trips_per_hour = gtfs_schedule_wrangling.stop_arrivals_per_stop(
        stop_times,
        group_cols = stop_cols + trips_per_hour_cols, #  TODO update script
        count_col = "trip_id"
    ).rename(columns = {"n_arrivals": "n_trips"})
    
    # return trips_per_hour
    
    # Subset to departure hour before or after 12pm
    am_trips = sjoin_stops_to_segments.max_trips_by_group(
        trips_per_hour[trips_per_hour.peak == 'am_peak'], 
        group_cols = stop_cols,
        max_col = "n_trips"
    ).rename(columns = {"n_trips": "am_max_trips"})
    
    pm_trips = sjoin_stops_to_segments.max_trips_by_group(
        trips_per_hour[trips_per_hour.peak == 'pm_peak'], 
        group_cols = stop_cols,
        max_col = "n_trips"
    ).rename(columns = {"n_trips": "pm_max_trips"})
    
    max_trips_by_stop = pd.merge(
        am_trips, 
        pm_trips,
        on = stop_cols,
        how = "left"
    )
    #  divide by length of peak to get trips/hr, keep n_trips a raw sum
    max_trips_by_stop = max_trips_by_stop.assign(
        am_max_trips_hr = (max_trips_by_stop.am_max_trips.fillna(0) / len(am_peak_hrs)).astype(int),
        pm_max_trips_hr = (max_trips_by_stop.pm_max_trips.fillna(0) / len(pm_peak_hrs)).astype(int),
        n_trips = (max_trips_by_stop.am_max_trips.fillna(0) + 
                   max_trips_by_stop.pm_max_trips.fillna(0))
    )
        
    return max_trips_by_stop

In [20]:
test = dev_stop_times_aggregation_max_by_stop(both_peak_filtered, analysis_date, single_route_dir=False)

In [21]:
max_arrivals_by_stop >> filter(_.am_max_trips > 4, _.pm_max_trips > 4) # old HQ corridor + major stop precursor

Unnamed: 0,schedule_gtfs_dataset_key,stop_id,am_max_trips,pm_max_trips,n_trips
96,0139b1253130b33adcd4b3a4490530d2,52c2636c-34a3-434c-99ae-cdf3dc36d15c,22,28,50.0
179,0139b1253130b33adcd4b3a4490530d2,98d2a60c-86b1-45d6-b5d6-39b273c9eb46,8,8,16.0
180,0139b1253130b33adcd4b3a4490530d2,999ff07b-4a27-4c80-9a1c-e868038ce097,13,12,25.0
250,0139b1253130b33adcd4b3a4490530d2,e37ef534-cce7-4470-84b4-25ba98c51114,5,5,10.0
296,015d67d5b75b5cf2b710bbadadfb75f5,40103,5,5,10.0
...,...,...,...,...,...
77910,ff1bc5dde661d62c877165421e9ca257,LO_1,5,5,10.0
77916,ff1bc5dde661d62c877165421e9ca257,LO_15,5,5,10.0
77918,ff1bc5dde661d62c877165421e9ca257,LO_17,5,5,10.0
77919,ff1bc5dde661d62c877165421e9ca257,LO_18,5,5,10.0


In [22]:
test >> filter(_.am_max_trips_hr > 4, _.pm_max_trips_hr > 4) #  new HQ corridor (still multi-route)

Unnamed: 0,schedule_gtfs_dataset_key,stop_id,am_max_trips,pm_max_trips,am_max_trips_hr,pm_max_trips_hr,n_trips
96,0139b1253130b33adcd4b3a4490530d2,52c2636c-34a3-434c-99ae-cdf3dc36d15c,39,58.0,13,14,97.0
179,0139b1253130b33adcd4b3a4490530d2,98d2a60c-86b1-45d6-b5d6-39b273c9eb46,16,24.0,5,6,40.0
180,0139b1253130b33adcd4b3a4490530d2,999ff07b-4a27-4c80-9a1c-e868038ce097,25,44.0,8,11,69.0
297,015d67d5b75b5cf2b710bbadadfb75f5,40113,30,48.0,10,12,78.0
332,015d67d5b75b5cf2b710bbadadfb75f5,40158,21,34.0,7,8,55.0
...,...,...,...,...,...,...,...
77686,fb467982dcc77a7f9199bebe709bb700,66001,20,27.0,6,6,47.0
77713,fb746afc72ff40405cfefa6d23ab58a0,53000,22,27.0,7,6,49.0
77721,fb746afc72ff40405cfefa6d23ab58a0,53129,28,42.0,9,10,70.0
77723,fb746afc72ff40405cfefa6d23ab58a0,53163,20,27.0,6,6,47.0


In [23]:
test >> filter(_.am_max_trips_hr > 3, _.pm_max_trips_hr > 3) #  new major stop precursor (still multi-route)

Unnamed: 0,schedule_gtfs_dataset_key,stop_id,am_max_trips,pm_max_trips,am_max_trips_hr,pm_max_trips_hr,n_trips
96,0139b1253130b33adcd4b3a4490530d2,52c2636c-34a3-434c-99ae-cdf3dc36d15c,39,58.0,13,14,97.0
179,0139b1253130b33adcd4b3a4490530d2,98d2a60c-86b1-45d6-b5d6-39b273c9eb46,16,24.0,5,6,40.0
180,0139b1253130b33adcd4b3a4490530d2,999ff07b-4a27-4c80-9a1c-e868038ce097,25,44.0,8,11,69.0
296,015d67d5b75b5cf2b710bbadadfb75f5,40103,12,17.0,4,4,29.0
297,015d67d5b75b5cf2b710bbadadfb75f5,40113,30,48.0,10,12,78.0
...,...,...,...,...,...,...,...
77721,fb746afc72ff40405cfefa6d23ab58a0,53129,28,42.0,9,10,70.0
77723,fb746afc72ff40405cfefa6d23ab58a0,53163,20,27.0,6,6,47.0
77767,fc6cd27871cce0092a08ccf68fb240a2,132744,19,23.0,6,5,42.0
77790,fe4aab1717eca5a2935c32c85a35a5bf,115,13,22.0,4,5,35.0


## debug kernel bomb

### this merge runs out of memory...

In [None]:
gtfs_key = helpers.import_scheduled_trips(
    analysis_date,
    # columns = ["feed_key", "gtfs_dataset_key"],
    columns = ["feed_key", "gtfs_dataset_key", "route_id", "direction_id"],
    get_pandas = True
)

stop_times = stop_times.assign(
    departure_hour = pd.to_datetime(
        stop_times.departure_sec, unit="s").dt.hour
).merge(
    gtfs_key,
    on = "feed_key"
)

### try out Dask dd.multi??

In [19]:
stop_times = both_peak_filtered

In [20]:
import dask.dataframe as dd

In [21]:
stop_cols = ["schedule_gtfs_dataset_key", "stop_id"]
trips_per_hour_cols = ["peak"]

# if single_route_dir:
#     trips_per_hour_cols += ["route_id", "direction_id"]

gtfs_key = helpers.import_scheduled_trips(
    analysis_date,
    # columns = ["feed_key", "gtfs_dataset_key"],
    columns = ["feed_key", "gtfs_dataset_key", "route_id", "direction_id"],
    get_pandas = False
).set_index('feed_key')

stop_times = stop_times.assign(
    departure_hour = pd.to_datetime(
        stop_times.departure_sec, unit="s").dt.hour
)

In [22]:
stop_times = dd.from_pandas(stop_times, npartitions=10).set_index('feed_key')

In [23]:
gtfs_key

Unnamed: 0_level_0,schedule_gtfs_dataset_key,route_id,direction_id
npartitions=1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
002790c3b23889605c337dc8129ee1c2,object,object,float64
ffd535d03c203cf59162dc35497b85d9,...,...,...


In [24]:
stop_times

Unnamed: 0_level_0,feed_timezone,base64_url,trip_id,stop_id,stop_sequence,timepoint,arrival_sec,departure_sec,arrival_hour,departure_hour,peak
npartitions=10,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
002790c3b23889605c337dc8129ee1c2,object,object,object,object,int64,float64,float64,float64,float64,int64,object
0110e66b44a8f07126af415097613bb6,...,...,...,...,...,...,...,...,...,...,...
...,...,...,...,...,...,...,...,...,...,...,...
c3754676d6867a6d6c81e62e75ff285f,...,...,...,...,...,...,...,...,...,...,...
ffd535d03c203cf59162dc35497b85d9,...,...,...,...,...,...,...,...,...,...,...


In [25]:
aligned = dd.multi.align_partitions(gtfs_key, stop_times)

In [31]:
gtfs_key = aligned[0][0]

In [32]:
stop_times = aligned[0][1]

In [33]:
dask_merged = dd.multi.merge_indexed_dataframes(
    stop_times, gtfs_key)

In [None]:
stop_times = dask_merged.compute()

In [None]:
stop_times