### Instructions 

Before running this notebook, set up your local environment to match the environment on the cloud. Dask Cloud Provider also lets you specify a custom Docker image to run on your cluster. [See Dask Cloud Provider's documentation](https://cloudprovider.dask.org/en/latest/aws.html#elastic-compute-cloud-ec2) for more details. This benchmark was run on the default image.

``` shell
cd benchmarks/dask
conda env create -f env.yml
```
This will create the conda environment `benchmark_dask`. Attach that environment to this notebook before running the cells below.

In [1]:
import time

import dask.dataframe as dd
from dask.distributed import Client
from dask_cloudprovider.aws import EC2Cluster

In [None]:
# Environment variables forwarded to the cluster through `env_vars=` below.
# NOTE(review): EXTRA_CONDA_PACKAGES is presumably read by the default Dask
# image at startup to install the pinned s3fs needed for S3 access - confirm.
env_vars = {"EXTRA_CONDA_PACKAGES": "s3fs==2024.10.0"}
# Provision the benchmark cluster on EC2: 4 workers of type r6i.8xlarge.
cluster = EC2Cluster(
    # NOTE: Setting security = False to avoid large config size
    # https://github.com/dask/dask-cloudprovider/issues/249
    security=False,
    n_workers=4,
    instance_type="r6i.8xlarge",
    # Region for accessing bodo-example-data
    region="us-east-2",
    env_vars=env_vars,
)

In [None]:
# Connect a distributed client to the cluster created above; later cells use
# this `client` to restart workers and submit the benchmark's write task.
client = Client(cluster)
# Print the dashboard URL so the run can be monitored from a browser.
print(client.dashboard_link)

In [63]:
# Small subset for quick iteration: range(2, 3) yields only month 02, so this
# list holds a single parquet file (2019-02). Widen the range to add months.
dataset = [
    f"s3://bodo-example-data/nyc-taxi/fhvhv_tripdata/fhvhv_tripdata_2019-{month:02}.parquet"
    for month in range(2, 3)
]

In [70]:
# run on entire dataset
# NOTE: this rebinds `dataset`, replacing the file list assigned in the
# previous cell whenever both cells are executed in order.
dataset = "s3://bodo-example-data/nyc-taxi/fhvhv_tripdata/"

In [118]:
# Restart the cluster's workers before the timed benchmark cell below.
# NOTE(review): presumably done to clear worker state left over from earlier
# runs so the measurement starts clean - confirm.
client.restart()

In [71]:
def get_monthly_travels_weather():
    """Run the NYC taxi / weather aggregation on the Dask cluster and time it.

    Relies on two module-level names defined in earlier cells:
    ``dataset`` (the parquet path or list of paths to read) and ``client``
    (the distributed client used to submit the final write task).

    Pipeline:
      1. Read the Central Park weather CSV and the trip parquet data from S3
         (``storage_options={"anon": True}``).
      2. Derive ``date``, ``month``, ``hour`` and ``weekday`` from
         ``pickup_datetime`` and inner-join trips with weather on ``date``.
      3. Flag rows with precipitation above 0.1 and bucket the pickup hour.
      4. Group by pickup/drop-off location, month, weekday flag,
         precipitation flag and time bucket; count trips and average
         ``trip_miles``; sort by the group keys.
      5. Inside a task submitted to the cluster, materialize the result and
         write it to ``/tmp/data/dask_results.pq``.

    Prints the elapsed time in seconds covering graph construction, compute
    and the parquet write.

    Returns:
        The aggregated Dask DataFrame. It is returned un-materialized: the
        ``.compute()`` call happens only inside the write task, so computing
        the returned object re-runs the pipeline.
    """
    # perf_counter is a monotonic clock, so the measured interval cannot be
    # skewed by system clock adjustments during a long run.
    start = time.perf_counter()
    central_park_weather_observations = dd.read_csv(
        "s3://bodo-example-data/nyc-taxi/central_park_weather.csv",
        parse_dates=["DATE"],
        storage_options={"anon": True},
    )
    central_park_weather_observations = central_park_weather_observations.rename(
        columns={"DATE": "date", "PRCP": "precipitation"}
    )

    fhvhv_tripdata = dd.read_parquet(dataset, storage_options={"anon": True})

    # Reduce both sides to a plain calendar date so they can be joined on it.
    central_park_weather_observations["date"] = central_park_weather_observations[
        "date"
    ].dt.date
    fhvhv_tripdata["date"] = fhvhv_tripdata["pickup_datetime"].dt.date
    fhvhv_tripdata["month"] = fhvhv_tripdata["pickup_datetime"].dt.month
    fhvhv_tripdata["hour"] = fhvhv_tripdata["pickup_datetime"].dt.hour
    # dayofweek 0-4 is Monday-Friday, so `weekday` is True on working days.
    fhvhv_tripdata["weekday"] = fhvhv_tripdata["pickup_datetime"].dt.dayofweek.isin(
        [0, 1, 2, 3, 4]
    )
    monthly_trips_weather = fhvhv_tripdata.merge(
        central_park_weather_observations, on="date", how="inner"
    )
    monthly_trips_weather["date_with_precipitation"] = (
        monthly_trips_weather["precipitation"] > 0.1
    )

    def get_time_bucket(t):
        """Map an hour of day to a named bucket; unlisted hours are "other"."""
        bucket = "other"
        if t in (8, 9, 10):
            bucket = "morning"
        elif t in (11, 12, 13, 14, 15):
            bucket = "midday"
        elif t in (16, 17, 18):
            bucket = "afternoon"
        elif t in (19, 20, 21):
            bucket = "evening"
        return bucket

    # `meta` declares the mapped output as an object-dtype series so Dask does
    # not have to infer it.
    monthly_trips_weather["time_bucket"] = monthly_trips_weather.hour.map(
        get_time_bucket, meta=("hour", "object")
    )
    monthly_trips_weather = (
        monthly_trips_weather.groupby(
            [
                "PULocationID",
                "DOLocationID",
                "month",
                "weekday",
                "date_with_precipitation",
                "time_bucket",
            ],
        )
        .agg({"hvfhs_license_num": "count", "trip_miles": "mean"})
        .reset_index()
    )
    monthly_trips_weather = monthly_trips_weather.sort_values(
        by=[
            "PULocationID",
            "DOLocationID",
            "month",
            "weekday",
            "date_with_precipitation",
            "time_bucket",
        ],
        ascending=True,
    )
    monthly_trips_weather = monthly_trips_weather.rename(
        columns={
            "hvfhs_license_num": "trips",
            "trip_miles": "avg_distance",
        },
    )

    # convert to pandas and then write result dataframe (8 columns: six group
    # keys plus trips and avg_distance).
    # NOTE(review): the earlier comment cited "~11 rows" - confirm the size.
    def write_results():
        """Materialize the aggregate and write it to parquet where the task runs."""
        # Imported here so the submitted task carries its own dependency.
        import os

        output_path = "/tmp/data/dask_results.pq"
        # The parent directory may not exist on the machine executing the
        # task; create it so the write does not fail after the long compute.
        os.makedirs(os.path.dirname(output_path), exist_ok=True)
        monthly_trips_weather.compute().to_parquet(output_path)

    # Submit the write as a cluster task and block until it finishes.
    future = client.submit(write_results)
    future.result()

    end = time.perf_counter()
    print("Total IO and compute time: ", (end - start))

    return monthly_trips_weather


# Execute the benchmark; the function prints the elapsed time in seconds and
# returns the aggregated Dask DataFrame.
dask_result = get_monthly_travels_weather()

Total IO and compute time:  924.3090829849243


In [None]:
# Disconnect the client first, then shut the cluster down.
# NOTE(review): cluster.close() presumably terminates the EC2 instances -
# confirm they are gone after running this cell to avoid ongoing charges.
client.close()
cluster.close()