In [None]:
#https://stackoverflow.com/questions/35064304/runtimeerror-make-sure-the-graphviz-executables-are-on-your-systems-path-aft
#!pip install graphviz
#!conda install --yes -c conda-forge graphviz

# Minimal example to get `dask.distributed` working

* `dask_geopandas` is not installed in `dask.distributed`; will have to ask Andrew. For now, test `dask.dataframe` only.
* Move all functions into this notebook to simplify for now.
* Read in multiple tabular datasets, merge, and do aggregation.
* Get task graph loaded (`bus_service_utils` has the `pip` and `conda` requirements)
* Can we test `dask.delayed` and `dask futures`?

In [1]:
import operator

import dask.dataframe as dd
import pandas as pd

#from dask.distributed import Client

# GCS prefix for the rt_delay cached views.
# NOTE: import_data_combined() uses its own, different prefix and does not read this constant.
RT_GCS = "gs://calitp-analytics-data/data-analyses/rt_delay/cached_views/"

# Date string interpolated into the parquet file names that get loaded.
analysis_date = "2022-10-12"

In [2]:
def categorize_time_of_day(value: int) -> str:
    """Bin an hour into a named time-of-day period.

    Parameters
    ----------
    value : int
        Hour to categorize. Any integer-like object implementing
        ``__index__`` is accepted (built-in ``int``, NumPy integer scalars
        such as ``numpy.int64``).

    Returns
    -------
    str
        ``"Owl"`` for hours below 4, ``"Early AM"`` below 7, ``"AM Peak"``
        below 10, ``"Midday"`` below 15, ``"PM Peak"`` below 20, and
        ``"Evening"`` for everything else.

    Raises
    ------
    TypeError
        If ``value`` is not integer-like (e.g. a float, NaN, ``None`` or a
        string).
    """
    # operator.index() accepts every true integer type, including NumPy
    # integer scalars, which are NOT instances of the built-in int and
    # previously left `hour` unbound.
    try:
        hour = operator.index(value)
    except TypeError:
        raise TypeError(
            "categorize_time_of_day expects an integer hour, got "
            f"{type(value).__name__}: {value!r}"
        ) from None

    # (exclusive upper bound, label) pairs, checked in ascending order.
    period_bounds = (
        (4, "Owl"),
        (7, "Early AM"),
        (10, "AM Peak"),
        (15, "Midday"),
        (20, "PM Peak"),
    )
    for upper_bound, label in period_bounds:
        if hour < upper_bound:
            return label
    return "Evening"

In [3]:
def merge_stop_times_to_trips(stop_times: dd.DataFrame,
                              trips: dd.DataFrame) -> dd.DataFrame:
    """Attach shape_id to stop_times and count stop_id rows by time-of-day.

    Parameters
    ----------
    stop_times : dd.DataFrame
        Must contain the columns ``calitp_itp_id``, ``trip_id``,
        ``departure_hour`` and ``stop_id``.
    trips : dd.DataFrame
        Must contain the columns ``calitp_itp_id``, ``shape_id`` and
        ``trip_id``.

    Returns
    -------
    dd.DataFrame
        One row per ``calitp_itp_id`` / ``shape_id`` / ``time_of_day``
        combination. The ``stop_id`` column holds the count of ``stop_id``
        values in that group, not an identifier.
    """
    shape_id_cols = ["calitp_itp_id", "shape_id"]

    # De-duplicate the trip -> shape lookup so the join does not multiply
    # stop_times rows.
    trip_shapes = trips[shape_id_cols + ["trip_id"]].drop_duplicates()

    merged = dd.merge(
        stop_times,
        trip_shapes,
        on=["calitp_itp_id", "trip_id"],
    )

    # Map to time-of-day. Only departure_hour is needed, so map over that one
    # column instead of a row-wise apply(axis=1) across the whole frame.
    stop_times_binned = merged.assign(
        time_of_day=merged["departure_hour"].map(
            categorize_time_of_day,
            meta=("time_of_day", "str"),
        )
    )

    # Calculate the number of arrivals by time-of-day
    arrivals = (
        stop_times_binned.groupby(shape_id_cols + ["time_of_day"])
        .agg({"stop_id": "count"})
        .reset_index()
    )

    return arrivals

In [None]:
# The top-of-notebook import of Client is commented out, so import it here;
# otherwise this cell raises NameError on a fresh kernel.
from dask.distributed import Client

# Connect to the dask scheduler at this address; the bare `client` below
# displays the connection summary as the cell output.
client = Client("dask-scheduler.dask.svc.cluster.local:8786")
client

Show task graph for just 1 operator.

* low-level: `.visualize()` 
* high-level: `.dask.visualize()`

In [4]:
def import_data_combined(
    date,
    gcs_dir="gs://calitp-analytics-data/data-analyses/rt_delay/compiled_cached_views/",
):
    """Read the stop_times and trips parquet tables for one date.

    Parameters
    ----------
    date : str
        Date string interpolated into the file names
        ``st_{date}.parquet`` and ``trips_{date}.parquet``.
    gcs_dir : str, optional
        Path prefix (ending in ``/``) that holds both files. Defaults to the
        compiled cached views folder. Exposed as a parameter so the function
        no longer shadows the notebook-level ``RT_GCS`` constant with a
        different value.

    Returns
    -------
    tuple of dd.DataFrame
        ``(stop_times, trips)``.
    """
    stop_times = dd.read_parquet(f"{gcs_dir}st_{date}.parquet")
    trips = dd.read_parquet(f"{gcs_dir}trips_{date}.parquet")

    return stop_times, trips

In [5]:
# Load the stop_times and trips tables for analysis_date as dask dataframes.
all_stop_times, all_trips = import_data_combined(analysis_date)

In [6]:
# Repartition both inputs to 5 partitions before merging.
# The names are rebound, so the originally loaded frames are no longer reachable.
all_stop_times = all_stop_times.repartition(npartitions=5)
all_trips = all_trips.repartition(npartitions=5)

In [7]:
# Join stop_times to trips and aggregate stop_id counts by
# calitp_itp_id / shape_id / time_of_day.
merged = merge_stop_times_to_trips(all_stop_times, all_trips)

In [9]:
# Repartition the aggregated result to 3 partitions (rebinds `merged`).
merged = merged.repartition(npartitions=3)

In [None]:
# Low-level graph
# Rendering relies on graphviz (see the install notes in the first cell).
merged.visualize(optimize_graph=True)

In [None]:
# High-level graph
# Called on the `.dask` graph attribute rather than on the dataframe itself.
merged.dask.visualize()

In [None]:
# Serialization issue comes up when compute() is called
# to turn dask.dataframe into pandas df...and here, to create a list of 1 value
# NOTE(review): 182 is presumably the calitp_itp_id of the single operator
# used for this test -- confirm and consider naming it as a constant.
merged[merged.calitp_itp_id==182].calitp_itp_id.unique().compute()

In [None]:
# Write the aggregated result to the dask_test folder in the GCS bucket.
# NOTE(review): `gcs` is not defined in any visible cell, so this raises
# NameError as written. Presumably it is an authenticated GCS filesystem
# object -- define or import it before running this cell.
merged.to_parquet("gs://calitp-analytics-data/data-analyses/dask_test/test.parquet", 
                  storage_options = {'token': gcs.session.credentials}
                 )

In [10]:
# Write the aggregated result to the relative path "test" instead of GCS;
# this call triggers the actual computation.
merged.to_parquet("test")