### Instructions 

Before running this notebook, create a local environment that matches the environment in the cloud. Dask Cloud Provider also lets you specify a custom Docker image to run on your cluster ([see Dask Cloud Provider's documentation](https://cloudprovider.dask.org/en/latest/aws.html#elastic-compute-cloud-ec2) for more details). This benchmark was run using [Dask 2024.9.1](https://hub.docker.com/r/daskdev/dask/tags).

You can use the provided `env.yml` to create your environment locally.

``` shell
cd benchmarks/dask
conda env create -f env.yml
```
This creates the conda environment `benchmark_dask`. Attach it to this notebook (select it as the notebook's kernel) before running the cells below.

In [1]:
import time

import dask.dataframe as dd
from dask.distributed import Client
from dask_cloudprovider.aws import EC2Cluster

In [34]:
# Environment variables forwarded to the cluster's containers; the debug output
# of this cell shows them passed to `docker run` via `-e`.
env_vars = {"EXTRA_CONDA_PACKAGES": "s3fs==2024.10.0"}
cluster = EC2Cluster(
    # Region for accessing bodo-example-data
    region="us-east-2",
    # Cluster shape: one scheduler instance plus four worker instances.
    scheduler_instance_type="c6i.xlarge",
    worker_instance_type="r6i.16xlarge",
    n_workers=4,
    env_vars=env_vars,
    # NOTE: Setting security = False to avoid large config size
    # https://github.com/dask/dask-cloudprovider/issues/249
    security=False,
    # NOTE(review): debug=True appears to be what prints the generated
    # cloud-init config shown in this cell's output -- confirm before disabling.
    debug=True,
)

Creating scheduler instance

Cloud init


#cloud-config


# Bootstrap
packages:
  - apt-transport-https
  - ca-certificates
  - curl
  - gnupg-agent
  - software-properties-common
  - ubuntu-drivers-common

# Enable ipv4 forwarding, required on CIS hardened machines
write_files:
  - path: /etc/sysctl.d/enabled_ipv4_forwarding.conf
    content: |
      net.ipv4.conf.all.forwarding=1

# create the docker group
groups:
  - docker

# Add default auto created user to docker group
system_info:
  default_user:
    groups: [docker]


runcmd:
  
  # Install Docker
  - curl -fsSL https://download.docker.com/linux/ubuntu/gpg | apt-key add -
  - add-apt-repository "deb [arch=amd64] https://download.docker.com/linux/ubuntu $(lsb_release -cs) stable"
  - apt-get update -y
  - apt-get install -y docker-ce docker-ce-cli containerd.io
  - systemctl start docker
  - systemctl enable docker
  

  

  

  # Run container
  - 'docker run --net=host   -e EXTRA_CONDA_PACKAGES="s3fs==2024.10.0"  daskdev/dask:

  next(self.gen)



Cloud init


#cloud-config


# Bootstrap
packages:
  - apt-transport-https
  - ca-certificates
  - curl
  - gnupg-agent
  - software-properties-common
  - ubuntu-drivers-common

# Enable ipv4 forwarding, required on CIS hardened machines
write_files:
  - path: /etc/sysctl.d/enabled_ipv4_forwarding.conf
    content: |
      net.ipv4.conf.all.forwarding=1

# create the docker group
groups:
  - docker

# Add default auto created user to docker group
system_info:
  default_user:
    groups: [docker]


runcmd:
  
  # Install Docker
  - curl -fsSL https://download.docker.com/linux/ubuntu/gpg | apt-key add -
  - add-apt-repository "deb [arch=amd64] https://download.docker.com/linux/ubuntu $(lsb_release -cs) stable"
  - apt-get update -y
  - apt-get install -y docker-ce docker-ce-cli containerd.io
  - systemctl start docker
  - systemctl enable docker
  

  

  

  # Run container
  - 'docker run --net=host   -e EXTRA_CONDA_PACKAGES="s3fs==2024.10.0"  daskdev/dask:latest env DASK_INTERNAL_INH

In [35]:
# Connect a client to the EC2 cluster and print the scheduler dashboard URL.
client = Client(cluster)
print(client.dashboard_link)

http://18.220.177.101:8787/status



+---------+-----------------+-----------------+---------+
| Package | Client          | Scheduler       | Workers |
+---------+-----------------+-----------------+---------+
| python  | 3.10.15.final.0 | 3.10.12.final.0 | None    |
+---------+-----------------+-----------------+---------+


In [22]:
# Subset of the data: the six monthly FHVHV trip files for 2019-02 through 2019-07.
dataset = [
    "s3://bodo-example-data/nyc-taxi/fhvhv_tripdata/"
    f"fhvhv_tripdata_2019-{month:02}.parquet"
    for month in range(2, 8)
]

In [36]:
# Run on the entire dataset: point at the whole trip-data directory.
# This rebinds `dataset`, replacing the list of individual monthly files
# defined in the previous cell; the benchmark function reads whichever
# value is bound when it runs.
dataset = "s3://bodo-example-data/nyc-taxi/fhvhv_tripdata/"

In [37]:
def get_monthly_travels_weather():
    """Run the trips-by-weather benchmark query and return how long it took.

    Reads the Central Park weather CSV and the FHVHV trip data found at the
    notebook-level ``dataset`` path, inner-joins them on calendar date, derives
    month / weekday / precipitation / time-of-day columns, and then counts
    trips and averages trip miles for every combination of pickup location,
    drop-off location and derived column. The sorted result is materialized
    with ``compute()`` but is not returned.

    Returns:
        float: Seconds elapsed, measured with ``time.time()``, from before the
        reads are set up until the result has been computed.
    """
    started_at = time.time()

    # Columns used both as the group-by keys and as the final sort order.
    group_keys = (
        "PULocationID",
        "DOLocationID",
        "month",
        "weekday",
        "date_with_precipitation",
        "time_bucket",
    )

    # Weather observations: normalize column names and reduce the parsed
    # timestamp to a plain date so it can serve as the join key.
    weather = dd.read_csv(
        "s3://bodo-example-data/nyc-taxi/central_park_weather.csv",
        parse_dates=["DATE"],
        storage_options={"anon": True},
    ).rename(columns={"DATE": "date", "PRCP": "precipitation"})
    weather["date"] = weather["date"].dt.date

    # Trip records: derive the join key and calendar features from the
    # pickup timestamp.
    trips = dd.read_parquet(dataset, storage_options={"anon": True})
    trips["date"] = trips["pickup_datetime"].dt.date
    trips["month"] = trips["pickup_datetime"].dt.month
    trips["hour"] = trips["pickup_datetime"].dt.hour
    # dayofweek values 0-4 are flagged as weekdays.
    trips["weekday"] = trips["pickup_datetime"].dt.dayofweek.isin(
        [0, 1, 2, 3, 4]
    )

    trips_with_weather = trips.merge(weather, on="date", how="inner")
    trips_with_weather["date_with_precipitation"] = (
        trips_with_weather["precipitation"] > 0.1
    )

    def get_time_bucket(t):
        """Map an hour of day to its named bucket, or "other" if none applies."""
        if t in (8, 9, 10):
            return "morning"
        if t in (11, 12, 13, 14, 15):
            return "midday"
        if t in (16, 17, 18):
            return "afternoon"
        if t in (19, 20, 21):
            return "evening"
        return "other"

    trips_with_weather["time_bucket"] = trips_with_weather["hour"].map(
        get_time_bucket, meta=("hour", "object")
    )

    # Aggregate per key combination, order the rows, then give the
    # aggregated columns their report names.
    summary = (
        trips_with_weather.groupby(list(group_keys))
        .agg({"hvfhs_license_num": "count", "trip_miles": "mean"})
        .reset_index()
        .sort_values(by=list(group_keys), ascending=True)
        .rename(
            columns={
                "hvfhs_license_num": "trips",
                "trip_miles": "avg_distance",
            },
        )
    )

    # TODO: Write output to S3 once permissions issue is resolved.
    # The computed frame is kept only so the work is forced to run inside
    # the timed region; it is intentionally not returned.
    summary = summary.compute()

    finished_at = time.time()
    return finished_at - started_at

In [39]:
# Run the benchmark three times and report each run's duration.
for _ in range(3):
    # The function returns a wall-clock measurement, so it is not pure.
    # pure=False gives every submission its own task key instead of letting
    # identical submissions share a key (and potentially a cached result).
    future = client.submit(get_monthly_travels_weather, pure=False)
    # Block until the run finishes; the result is the elapsed seconds.
    total_time = future.result()
    # Restart the workers so the next run starts from a clean cluster state.
    client.restart()
    print("Total time for IO and compute:", total_time)

Total time for IO and compute: 932.2804353237152
Total time for IO and compute: 888.8601548671722
Total time for IO and compute: 885.689935207367


In [40]:
# Tear down: disconnect the client, then close the cluster. The output of this
# cell shows the worker and scheduler EC2 instances being terminated.
client.close()
cluster.close()

Terminated dask-d7369884-worker-87177d09 (i-0e2f3a60157449305)
Terminated dask-d7369884-worker-592c886c (i-03c7260fe1621977b)
Terminated dask-d7369884-worker-f9596efa (i-08464c37bec5899d5)
Terminated dask-d7369884-worker-a09fb848 (i-01836314d011735d9)
Terminated dask-d7369884-scheduler (i-0311fa1be461bd2a9)
