In [None]:
#https://stackoverflow.com/questions/35064304/runtimeerror-make-sure-the-graphviz-executables-are-on-your-systems-path-aft
#!pip install graphviz
#!conda install --yes -c conda-forge graphviz

# Minimal example to get `dask.distributed` working

* `dask_geopandas` is not installed on the `dask.distributed` cluster; we will have to ask Andrew. For now, test `dask.dataframe` only.
* Move all functions into this notebook to simplify for now.
* Read in multiple tabular datasets, merge, and do aggregation.
* Get task graph loaded (`bus_service_utils` has the `pip` and `conda` requirements)
* Can we also test `dask.delayed` and Dask futures?

In [1]:
import dask.dataframe as dd
import pandas as pd

from dask.distributed import Client

# Date string passed to the data-loading helpers to pick which parquet
# files to read.
analysis_date = "2022-10-12"
# GCS folder prefix that `import_data` prepends to the per-operator
# stop_times / trips parquet file names.
RT_GCS = 'gs://calitp-analytics-data/data-analyses/rt_delay/cached_views/'

In [2]:
def import_data(itp_id, date):
    """Open the stop_times and trips parquet files for one `itp_id` and `date`.

    File paths are built from the module-level `RT_GCS` prefix as
    `st_{itp_id}_{date}.parquet` and `trips_{itp_id}_{date}.parquet`.

    Returns
    -------
    tuple
        `(stop_times, trips)`, each the result of `dd.read_parquet`.
    """
    stop_times_path = f"{RT_GCS}st_{itp_id}_{date}.parquet"
    trips_path = f"{RT_GCS}trips_{itp_id}_{date}.parquet"

    return dd.read_parquet(stop_times_path), dd.read_parquet(trips_path)

In [3]:
def categorize_time_of_day(value: int) -> str:
    """Map an hour number to a named time-of-day bin.

    Bins (upper bounds are exclusive):

    * hour < 4   -> "Owl"
    * hour < 7   -> "Early AM"
    * hour < 10  -> "AM Peak"
    * hour < 15  -> "Midday"
    * hour < 20  -> "PM Peak"
    * otherwise  -> "Evening"

    Parameters
    ----------
    value
        The hour as any real number (Python `int`/`float` or a type
        registered with `numbers.Real`). Previously only exact `int`
        instances were handled; anything else left `hour` unbound and
        raised `UnboundLocalError`.

    Returns
    -------
    str
        The label of the first bin whose upper bound exceeds `value`.

    Raises
    ------
    TypeError
        If `value` is not a real number (e.g. `None` or a string).
    ValueError
        If `value` is NaN, which cannot be placed in any bin.
    """
    # Imported locally so the function stays self-contained when it is
    # serialized and shipped to workers.
    import numbers

    if not isinstance(value, numbers.Real):
        raise TypeError(
            f"hour must be a real number, got {type(value).__name__}"
        )
    # NaN is the only real number that is not equal to itself; without this
    # guard every `<` comparison would be False and NaN would fall through
    # to "Evening".
    if value != value:
        raise ValueError("hour is NaN and cannot be categorized")

    hour = value
    if hour < 4:
        return "Owl"
    elif hour < 7:
        return "Early AM"
    elif hour < 10:
        return "AM Peak"
    elif hour < 15:
        return "Midday"
    elif hour < 20:
        return "PM Peak"
    else:
        return "Evening"

In [4]:
def merge_stop_times_to_trips(stop_times: dd.DataFrame,
                              trips: dd.DataFrame) -> dd.DataFrame:
    """Count stop_times rows per shape and time-of-day bin.

    Steps:

    1. Merge `stop_times` with the distinct
       (`calitp_itp_id`, `shape_id`, `trip_id`) rows of `trips` on
       `calitp_itp_id` and `trip_id` (dask's default join type).
    2. Add a `time_of_day` column by passing `departure_hour` through
       `categorize_time_of_day`.
    3. Group by `calitp_itp_id`, `shape_id` and `time_of_day` and count
       `stop_id`.

    Parameters
    ----------
    stop_times
        Must provide `calitp_itp_id`, `trip_id`, `departure_hour` and
        `stop_id` columns.
    trips
        Must provide `calitp_itp_id`, `shape_id` and `trip_id` columns.

    Returns
    -------
    dd.DataFrame
        Lazy frame with columns `calitp_itp_id`, `shape_id`,
        `time_of_day` and `stop_id` (the count).
    """
    shape_id_cols = ["calitp_itp_id", "shape_id"]

    # De-duplicate so each trip contributes a single shape row to the merge.
    trip_shapes = trips[shape_id_cols + ["trip_id"]].drop_duplicates()

    merged = dd.merge(
        stop_times,
        trip_shapes,
        on=["calitp_itp_id", "trip_id"]
    )

    # Map to time-of-day. Only `departure_hour` is needed, so map over that
    # single column instead of a row-wise `apply(axis=1)`, which has to
    # materialize every row as its own object.
    time_of_day = merged["departure_hour"].map(
        categorize_time_of_day,
        meta=("time_of_day", "str")
    )
    stop_times_binned = merged.assign(time_of_day=time_of_day)

    # Calculate the number of arrivals by time-of-day
    arrivals = (
        stop_times_binned
        .groupby(shape_id_cols + ["time_of_day"])
        .agg({"stop_id": "count"})
        .reset_index()
    )

    return arrivals

In [6]:
# Connect to the Dask scheduler at this address. The bare `client`
# expression on the last line makes the notebook display the client /
# cluster summary.
# NOTE(review): the saved output of this cell lists different dask,
# distributed, pandas and python versions on the client than on the
# scheduler and workers -- presumably these should be aligned; confirm.
client = Client("dask-scheduler.dask.svc.cluster.local:8786")
client


+-------------+----------------+----------------+----------------+
| Package     | client         | scheduler      | workers        |
+-------------+----------------+----------------+----------------+
| cloudpickle | 2.2.0          | 2.1.0          | 2.1.0          |
| dask        | 2022.05.2      | 2022.8.0       | 2022.8.0       |
| distributed | 2022.5.2       | 2022.8.0       | 2022.8.0       |
| numpy       | 1.23.3         | 1.23.1         | 1.23.1         |
| pandas      | 1.5.0          | 1.4.3          | 1.4.3          |
| python      | 3.10.6.final.0 | 3.8.13.final.0 | 3.8.13.final.0 |
+-------------+----------------+----------------+----------------+


0,1
Connection method: Direct,
Dashboard: http://dask-scheduler.dask.svc.cluster.local:8787/status,

0,1
Comm: tcp://10.96.3.216:8786,Workers: 3
Dashboard: http://10.96.3.216:8787/status,Total threads: 6
Started: 1 week ago,Total memory: 46.91 GiB

0,1
Comm: tcp://10.96.11.27:40653,Total threads: 2
Dashboard: http://10.96.11.27:8790/status,Memory: 15.64 GiB
Nanny: tcp://10.96.11.27:35095,
Local directory: /tmp/dask-worker-space/worker-fsinsm12,Local directory: /tmp/dask-worker-space/worker-fsinsm12
Tasks executing: 0,Tasks in memory: 0
Tasks ready: 0,Tasks in flight: 0
CPU usage: 2.0%,Last seen: Just now
Memory usage: 88.50 MiB,Spilled bytes: 0 B
Read bytes: 286.11542017243664 B,Write bytes: 1.07 kiB

0,1
Comm: tcp://10.96.5.23:42471,Total threads: 2
Dashboard: http://10.96.5.23:8790/status,Memory: 15.64 GiB
Nanny: tcp://10.96.5.23:41529,
Local directory: /tmp/dask-worker-space/worker-3yv4ipua,Local directory: /tmp/dask-worker-space/worker-3yv4ipua
Tasks executing: 0,Tasks in memory: 0
Tasks ready: 0,Tasks in flight: 0
CPU usage: 2.0%,Last seen: Just now
Memory usage: 115.80 MiB,Spilled bytes: 0 B
Read bytes: 285.98213593492153 B,Write bytes: 1.07 kiB

0,1
Comm: tcp://10.96.6.56:42509,Total threads: 2
Dashboard: http://10.96.6.56:8790/status,Memory: 15.64 GiB
Nanny: tcp://10.96.6.56:38591,
Local directory: /tmp/dask-worker-space/worker-8128i5zj,Local directory: /tmp/dask-worker-space/worker-8128i5zj
Tasks executing: 0,Tasks in memory: 0
Tasks ready: 0,Tasks in flight: 0
CPU usage: 6.0%,Last seen: Just now
Memory usage: 100.98 MiB,Spilled bytes: 0 B
Read bytes: 285.15723583806783 B,Write bytes: 1.06 kiB


Show task graph for just 1 operator.

* low-level: `.visualize()` 
* high-level: `.dask.visualize()`

In [7]:
def import_data_combined(date):
    """Open the combined stop_times and trips parquet files for `date`.

    Reads `st_{date}.parquet` and `trips_{date}.parquet` from the
    `compiled_cached_views/` folder. The folder prefix is held in a local
    variable, so the module-level `RT_GCS` is not used here.

    Returns
    -------
    tuple
        `(stop_times, trips)`, each the result of `dd.read_parquet`.
    """
    compiled_dir = 'gs://calitp-analytics-data/data-analyses/rt_delay/compiled_cached_views/'

    stop_times_path = f"{compiled_dir}st_{date}.parquet"
    trips_path = f"{compiled_dir}trips_{date}.parquet"

    return dd.read_parquet(stop_times_path), dd.read_parquet(trips_path)

In [9]:
# Open the combined stop_times and trips tables for the analysis date.
all_stop_times, all_trips = import_data_combined(analysis_date)

In [10]:
# Repartition both tables into 5 partitions each. The results are bound
# back to the same names, replacing the frames loaded in the previous cell.
all_stop_times = all_stop_times.repartition(npartitions=5)
all_trips = all_trips.repartition(npartitions=5)

In [11]:
# Build the arrivals-by-shape-and-time-of-day frame from the repartitioned
# inputs.
merged = merge_stop_times_to_trips(all_stop_times, all_trips)

In [14]:
# Smoke test: filter to rows where calitp_itp_id is 182 and compute the
# distinct calitp_itp_id values; `.compute()` is what triggers execution.
# NOTE(review): the saved output of this cell shows the run failing with a
# pickled `KilledWorker` error that the client could not deserialize --
# presumably a worker died during the aggregation; confirm the cause.
merged[merged.calitp_itp_id==182].calitp_itp_id.unique().compute()

2022-10-24 16:58:49,996 - distributed.protocol.pickle - INFO - Failed to deserialize b'\x80\x04\x95\xfb\x03\x00\x00\x00\x00\x00\x00\x8c\x15distributed.scheduler\x94\x8c\x0cKilledWorker\x94\x93\x94\x8cX(\'aggregate-chunk-2522b12a5bbe27d1e0b9332860f7f246-7f565acfab557fbc1e9d83df5496e1b5\', 0)\x94h\x00\x8c\x0bWorkerState\x94\x93\x94)\x81\x94N}\x94(\x8c\x07address\x94\x8c\x16tcp://10.96.5.23:32827\x94\x8c\x03pid\x94Mq\x01\x8c\x04name\x94\x8c\x16tcp://10.96.5.23:32827\x94\x8c\x08nthreads\x94K\x02\x8c\x0cmemory_limit\x94\x8a\x05\x00p\xae\xe8\x03\x8c\x0flocal_directory\x94\x8c&/tmp/dask-worker-space/worker-n_8jfdtc\x94\x8c\x08services\x94}\x94\x8c\tdashboard\x94MV"s\x8c\x08versions\x94}\x94\x8c\x05nanny\x94\x8c\x16tcp://10.96.5.23:41529\x94\x8c\x06status\x94\x8c\x10distributed.core\x94\x8c\x06Status\x94\x93\x94\x8c\x06closed\x94\x85\x94R\x94\x8c\x05_hash\x94\x8a\x08A=)\x9e\xaf\'\xcc\x87\x8c\x06nbytes\x94K\x00\x8c\toccupancy\x94K\x00\x8c\x15_memory_unmanaged_old\x94K\x00\x8c\x19_memory_unmanag

Exception: b'\x80\x04\x95\xfb\x03\x00\x00\x00\x00\x00\x00\x8c\x15distributed.scheduler\x94\x8c\x0cKilledWorker\x94\x93\x94\x8cX(\'aggregate-chunk-2522b12a5bbe27d1e0b9332860f7f246-7f565acfab557fbc1e9d83df5496e1b5\', 0)\x94h\x00\x8c\x0bWorkerState\x94\x93\x94)\x81\x94N}\x94(\x8c\x07address\x94\x8c\x16tcp://10.96.5.23:32827\x94\x8c\x03pid\x94Mq\x01\x8c\x04name\x94\x8c\x16tcp://10.96.5.23:32827\x94\x8c\x08nthreads\x94K\x02\x8c\x0cmemory_limit\x94\x8a\x05\x00p\xae\xe8\x03\x8c\x0flocal_directory\x94\x8c&/tmp/dask-worker-space/worker-n_8jfdtc\x94\x8c\x08services\x94}\x94\x8c\tdashboard\x94MV"s\x8c\x08versions\x94}\x94\x8c\x05nanny\x94\x8c\x16tcp://10.96.5.23:41529\x94\x8c\x06status\x94\x8c\x10distributed.core\x94\x8c\x06Status\x94\x93\x94\x8c\x06closed\x94\x85\x94R\x94\x8c\x05_hash\x94\x8a\x08A=)\x9e\xaf\'\xcc\x87\x8c\x06nbytes\x94K\x00\x8c\toccupancy\x94K\x00\x8c\x15_memory_unmanaged_old\x94K\x00\x8c\x19_memory_unmanaged_history\x94\x8c\x0bcollections\x94\x8c\x05deque\x94\x93\x94)R\x94\x8c\x07metrics\x94}\x94\x8c\tlast_seen\x94K\x00\x8c\ntime_delay\x94K\x00\x8c\tbandwidth\x94J\x00\xe1\xf5\x05\x8c\x06actors\x94\x8f\x94\x8c\t_has_what\x94}\x94\x8c\nprocessing\x94}\x94(\x8c5(\'repartition-5-401b4778cced9d03cab1dc347df981bb\', 4)\x94G?Z\x90-\xa8\xb8\xc5(h\x03G@G\xec\xdd\xd4\x00\x00\x00\x8cX(\'aggregate-chunk-2522b12a5bbe27d1e0b9332860f7f246-7f565acfab557fbc1e9d83df5496e1b5\', 1)\x94G@G\xec\xd30\x00\x00\x00u\x8c\x0clong_running\x94\x8f\x94\x8c\texecuting\x94}\x94(h\x03G@7\xec\xdd\xd4\x00\x00\x00h4G@7\xec\xd30\x00\x00\x00u\x8c\tresources\x94}\x94\x8c\x0eused_resources\x94}\x94\x8c\x05extra\x94}\x94\x8c\tserver_id\x94\x8c+Worker-413503ec-8643-4958-80dd-6b9a6fabb594\x94u\x86\x94b\x86\x94R\x94.'

In [None]:
#merged.visualize()
#merged.to_parquet("gs://calitp-analytics-data/data-analyses/dask_test/test.parquet")