### Instructions 

Before running this notebook, set up your local environment to match the environment on the cloud. Dask Cloud Provider also lets you specify a custom Docker image to run on your cluster. [See Dask Cloud Provider's documentation](https://cloudprovider.dask.org/en/latest/aws.html#elastic-compute-cloud-ec2) for more details. This benchmark was run on the default image.

``` shell
cd benchmarks/dask
conda env create -f env.yml
```
This will create the conda environment `benchmark_dask`. Attach it to this notebook before running the cells below.

In [2]:
import time

import dask.dataframe as dd
from dask.distributed import Client
from dask_cloudprovider.aws import EC2Cluster

In [6]:
# Environment variables handed to the Dask container; this one requests a
# pinned s3fs so the cluster can read from S3.
env_vars = dict(EXTRA_CONDA_PACKAGES="s3fs==2024.10.0")

cluster = EC2Cluster(
    # Cluster size and hardware.
    n_workers=4,
    instance_type="r6i.8xlarge",
    # Region for accessing bodo-example-data
    region="us-east-2",
    env_vars=env_vars,
    # NOTE: security is disabled to avoid a large config size, see
    # https://github.com/dask/dask-cloudprovider/issues/249
    security=False,
    # debug=True prints the generated cloud-init config while provisioning.
    debug=True,
)

Creating scheduler instance

Cloud init


#cloud-config


# Bootstrap
packages:
  - apt-transport-https
  - ca-certificates
  - curl
  - gnupg-agent
  - software-properties-common
  - ubuntu-drivers-common

# Enable ipv4 forwarding, required on CIS hardened machines
write_files:
  - path: /etc/sysctl.d/enabled_ipv4_forwarding.conf
    content: |
      net.ipv4.conf.all.forwarding=1

# create the docker group
groups:
  - docker

# Add default auto created user to docker group
system_info:
  default_user:
    groups: [docker]


runcmd:
  
  # Install Docker
  - curl -fsSL https://download.docker.com/linux/ubuntu/gpg | apt-key add -
  - add-apt-repository "deb [arch=amd64] https://download.docker.com/linux/ubuntu $(lsb_release -cs) stable"
  - apt-get update -y
  - apt-get install -y docker-ce docker-ce-cli containerd.io
  - systemctl start docker
  - systemctl enable docker
  

  

  

  # Run container
  - 'docker run --net=host   -e EXTRA_CONDA_PACKAGES="s3fs==2024.10.0"  daskdev/dask:

In [7]:
# Connect a distributed client to the EC2 cluster created above and print the
# dashboard URL so the run can be monitored from a browser.
client = Client(cluster)
print(client.dashboard_link)

http://13.58.45.89:8787/status



+---------+-----------------+-----------------+---------+
| Package | Client          | Scheduler       | Workers |
+---------+-----------------+-----------------+---------+
| python  | 3.10.15.final.0 | 3.10.12.final.0 | None    |
+---------+-----------------+-----------------+---------+


In [8]:
# Location of the trip data: run on the entire dataset.
# NOTE(review): no other cell visible in this notebook reads `parquet_files`;
# get_monthly_travels_weather() does not use this variable. Confirm whether it
# should be passed in or removed.
parquet_files = "s3://bodo-example-data/nyc-taxi/fhvhv_tripdata/"

In [9]:
def get_monthly_travels_weather(
    weather_path="s3://bodo-example-data/nyc-taxi/central_park_weather.csv",
    trips_path="s3://bodo-example-data/nyc-taxi/fhvhv_tripdata",
):
    """Run the trips-vs-weather benchmark query and report its elapsed time.

    Reads the weather CSV and the trip Parquet data, joins them on calendar
    date, derives a precipitation flag and a time-of-day bucket, and then
    aggregates trip count and mean trip distance per
    (pickup zone, drop-off zone, month, weekday flag, precipitation flag,
    time bucket). The elapsed time covers both reading and computing.

    Parameters
    ----------
    weather_path : str
        Location of the weather CSV; must provide ``DATE`` and ``PRCP`` columns.
    trips_path : str
        Location of the trip Parquet data; must provide ``pickup_datetime``,
        ``PULocationID``, ``DOLocationID``, ``hvfhs_license_num`` and
        ``trip_miles`` columns.

    Returns
    -------
    The computed (materialised) result, sorted by the grouping keys, with the
    aggregates named ``trips`` and ``avg_distance``.
    """
    # perf_counter() is monotonic, so the measured duration cannot be skewed
    # by system clock adjustments during a long-running job.
    start = time.perf_counter()

    central_park_weather_observations = dd.read_csv(
        weather_path,
        parse_dates=["DATE"],
        storage_options={"anon": True},
    )
    central_park_weather_observations = central_park_weather_observations.rename(
        columns={"DATE": "date", "PRCP": "precipitation"}
    )

    fhvhv_tripdata = dd.read_parquet(
        trips_path,
        storage_options={"anon": True},
    )

    # Reduce both sides to a plain calendar date so they can be joined on it.
    central_park_weather_observations["date"] = central_park_weather_observations[
        "date"
    ].dt.date
    fhvhv_tripdata["date"] = fhvhv_tripdata["pickup_datetime"].dt.date
    fhvhv_tripdata["month"] = fhvhv_tripdata["pickup_datetime"].dt.month
    fhvhv_tripdata["hour"] = fhvhv_tripdata["pickup_datetime"].dt.hour
    # NOTE(review): pandas-style `dayofweek` numbers Monday as 0 and Sunday as
    # 6, so this set appears to select Tuesday-Saturday rather than
    # Monday-Friday. Left unchanged so results stay comparable with the
    # recorded output of this benchmark -- confirm the intended definition.
    fhvhv_tripdata["weekday"] = fhvhv_tripdata["pickup_datetime"].dt.dayofweek.isin(
        [1, 2, 3, 4, 5]
    )

    # Inner join: trips on dates without a weather observation are dropped.
    monthly_trips_weather = fhvhv_tripdata.merge(
        central_park_weather_observations, on="date", how="inner"
    )
    monthly_trips_weather["date_with_precipitation"] = (
        monthly_trips_weather["precipitation"] > 0.1
    )

    def get_time_bucket(t):
        """Map an hour of day to a named bucket; unlisted hours are "other"."""
        bucket = "other"
        if t in (8, 9, 10):
            bucket = "morning"
        elif t in (11, 12, 13, 14, 15):
            bucket = "midday"
        elif t in (16, 17, 18):
            bucket = "afternoon"
        elif t in (19, 20, 21):
            bucket = "evening"
        return bucket

    # `meta` declares the name and dtype of the mapped series up front.
    monthly_trips_weather["time_bucket"] = monthly_trips_weather.hour.map(
        get_time_bucket, meta=("hour", "object")
    )

    group_keys = [
        "PULocationID",
        "DOLocationID",
        "month",
        "weekday",
        "date_with_precipitation",
        "time_bucket",
    ]
    monthly_trips_weather = (
        monthly_trips_weather.groupby(group_keys)
        .agg({"hvfhs_license_num": "count", "trip_miles": "mean"})
        .reset_index()
    )
    monthly_trips_weather = monthly_trips_weather.sort_values(
        by=group_keys,
        ascending=True,
    )
    monthly_trips_weather = monthly_trips_weather.rename(
        columns={
            "hvfhs_license_num": "trips",
            "trip_miles": "avg_distance",
        },
    )
    # Everything above only builds the task graph; compute() executes it.
    monthly_trips_weather = monthly_trips_weather.compute()

    end = time.perf_counter()
    print("Total read and compute time: ", end - start)

    print(monthly_trips_weather.head())
    return monthly_trips_weather


get_monthly_travels_weather()

Total read and compute time:  1284.0738711357117
         PULocationID  DOLocationID  month  weekday  date_with_precipitation  \
987737              1             1      1    False                    False   
985918              1             1      1    False                    False   
974350              1             1      1    False                    False   
1371239             1             1      1     True                    False   
979183              1             1      1     True                    False   

        time_bucket  trips  avg_distance  
987737    afternoon      3     16.386667  
985918      morning      2      8.985000  
974350        other      2     11.704500  
1371239   afternoon      1      0.010000  
979183       midday      4     21.752750  


Unnamed: 0,PULocationID,DOLocationID,month,weekday,date_with_precipitation,time_bucket,trips,avg_distance
987737,1,1,1,False,False,afternoon,3,16.386667
985918,1,1,1,False,False,morning,2,8.985000
974350,1,1,1,False,False,other,2,11.704500
1371239,1,1,1,True,False,afternoon,1,0.010000
979183,1,1,1,True,False,midday,4,21.752750
...,...,...,...,...,...,...,...,...
892593,265,265,12,True,True,afternoon,64,5.309078
894493,265,265,12,True,True,evening,33,10.278848
889331,265,265,12,True,True,midday,78,4.747564
897028,265,265,12,True,True,morning,44,6.877977


In [None]:
# Tear down: close the client connection first, then the cluster.
# NOTE(review): closing the cluster presumably shuts down the EC2 instances it
# created -- confirm in the AWS console that nothing is left running.
client.close()
cluster.close()