In [1]:
from dask_cloudprovider.aws import EC2Cluster
from dask.distributed import Client
import dask.dataframe as dd
import time

In [2]:
# Environment variables handed to the Dask containers on every instance.
# EXTRA_CONDA_PACKAGES pins the s3fs release to be installed in the container.
env_vars = {"EXTRA_CONDA_PACKAGES": "s3fs==2024.10.0"}

cluster_options = {
    # NOTE: security is switched off to avoid a large config size, see
    # https://github.com/dask/dask-cloudprovider/issues/249
    "security": False,
    "n_workers": 4,
    "instance_type": "c6i.8xlarge",
    # Region used for accessing bodo-example-data.
    "region": "us-east-2",
    # NOTE(review): with debug on, the generated cloud-init config shows up in
    # the cell output -- presumably that is what this flag controls; confirm.
    "debug": True,
    "env_vars": env_vars,
}
cluster = EC2Cluster(**cluster_options)

Creating scheduler instance

Cloud init


#cloud-config


# Bootstrap
packages:
  - apt-transport-https
  - ca-certificates
  - curl
  - gnupg-agent
  - software-properties-common
  - ubuntu-drivers-common

# Enable ipv4 forwarding, required on CIS hardened machines
write_files:
  - path: /etc/sysctl.d/enabled_ipv4_forwarding.conf
    content: |
      net.ipv4.conf.all.forwarding=1

# create the docker group
groups:
  - docker

# Add default auto created user to docker group
system_info:
  default_user:
    groups: [docker]


runcmd:
  
  # Install Docker
  - curl -fsSL https://download.docker.com/linux/ubuntu/gpg | apt-key add -
  - add-apt-repository "deb [arch=amd64] https://download.docker.com/linux/ubuntu $(lsb_release -cs) stable"
  - apt-get update -y
  - apt-get install -y docker-ce docker-ce-cli containerd.io
  - systemctl start docker
  - systemctl enable docker
  

  

  

  # Run container
  - 'docker run --net=host   -e EXTRA_CONDA_PACKAGES="s3fs==2024.10.0"  daskdev/dask:

  next(self.gen)



Cloud init


#cloud-config


# Bootstrap
packages:
  - apt-transport-https
  - ca-certificates
  - curl
  - gnupg-agent
  - software-properties-common
  - ubuntu-drivers-common

# Enable ipv4 forwarding, required on CIS hardened machines
write_files:
  - path: /etc/sysctl.d/enabled_ipv4_forwarding.conf
    content: |
      net.ipv4.conf.all.forwarding=1

# create the docker group
groups:
  - docker

# Add default auto created user to docker group
system_info:
  default_user:
    groups: [docker]


runcmd:
  
  # Install Docker
  - curl -fsSL https://download.docker.com/linux/ubuntu/gpg | apt-key add -
  - add-apt-repository "deb [arch=amd64] https://download.docker.com/linux/ubuntu $(lsb_release -cs) stable"
  - apt-get update -y
  - apt-get install -y docker-ce docker-ce-cli containerd.io
  - systemctl start docker
  - systemctl enable docker
  

  

  

  # Run container
  - 'docker run --net=host   -e EXTRA_CONDA_PACKAGES="s3fs==2024.10.0"  daskdev/dask:latest env DASK_INTERNAL_INH

In [3]:
# Connect this notebook session to the scheduler of the EC2 cluster.
client = Client(cluster)
print(client.dashboard_link)

# Block until all 4 workers requested from EC2Cluster (n_workers=4) have
# registered with the scheduler. Without this, work submitted right after
# connecting can start while no worker is up yet, so any wall-clock timing of
# that work would include instance/container start-up time.
client.wait_for_workers(4)

http://52.15.169.79:8787/status



+---------+-----------------+-----------------+---------+
| Package | Client          | Scheduler       | Workers |
+---------+-----------------+-----------------+---------+
| python  | 3.10.15.final.0 | 3.10.12.final.0 | None    |
+---------+-----------------+-----------------+---------+


In [4]:
def run_dask():
    """Run the NYC taxi / Central Park weather aggregation with Dask and time it.

    Reads the Central Park weather CSV and the FHVHV trip Parquet dataset from
    the ``bodo-example-data`` S3 bucket using anonymous access, joins them on
    the calendar date, derives per-trip features (month, hour, weekday flag,
    precipitation flag, time-of-day bucket) and aggregates the trip count and
    the mean trip distance per pickup/drop-off location and feature
    combination.

    Prints the elapsed wall-clock time (graph construction plus compute) and
    the first rows of the result.

    Returns:
        The aggregated result materialised by ``.compute()``, sorted by the
        grouping keys, with the aggregate columns named ``trips`` and
        ``avg_distance``.
    """
    # perf_counter is monotonic, so the elapsed time cannot be skewed by
    # system clock adjustments during a long run.
    start = time.perf_counter()

    storage_options = {"anon": True}
    # Keys used both for the aggregation and for ordering the final result.
    group_keys = [
        "PULocationID",
        "DOLocationID",
        "month",
        "weekday",
        "date_with_precipitation",
        "time_bucket",
    ]

    central_park_weather_observations = dd.read_csv(
        "s3://bodo-example-data/nyc-taxi/central_park_weather.csv",
        parse_dates=["DATE"],
        storage_options=storage_options,
    )
    central_park_weather_observations = central_park_weather_observations.rename(
        columns={"DATE": "date", "PRCP": "precipitation"}
    )

    fhvhv_tripdata = dd.read_parquet(
        "s3://bodo-example-data/nyc-taxi/fhvhv_tripdata",
        storage_options=storage_options,
    )

    # Reduce both sides to plain calendar dates so they can be joined on "date".
    central_park_weather_observations["date"] = central_park_weather_observations[
        "date"
    ].dt.date
    pickup = fhvhv_tripdata["pickup_datetime"]
    fhvhv_tripdata["date"] = pickup.dt.date
    fhvhv_tripdata["month"] = pickup.dt.month
    fhvhv_tripdata["hour"] = pickup.dt.hour
    # pandas numbers days Monday=0 ... Sunday=6, so Monday-Friday is 0-4.
    # (The list 1-5 would select Tuesday-Saturday instead.)
    fhvhv_tripdata["weekday"] = pickup.dt.dayofweek.isin([0, 1, 2, 3, 4])

    monthly_trips_weather = fhvhv_tripdata.merge(
        central_park_weather_observations, on="date", how="inner"
    )
    monthly_trips_weather["date_with_precipitation"] = (
        monthly_trips_weather["precipitation"] > 0.1
    )

    def get_time_bucket(t):
        """Map an hour of day to a coarse time-of-day label ("other" if unmatched)."""
        bucket = "other"
        if t in (8, 9, 10):
            bucket = "morning"
        elif t in (11, 12, 13, 14, 15):
            bucket = "midday"
        elif t in (16, 17, 18):
            bucket = "afternoon"
        elif t in (19, 20, 21):
            bucket = "evening"
        return bucket

    # meta tells Dask the name/dtype of the mapped series up front.
    monthly_trips_weather["time_bucket"] = monthly_trips_weather.hour.map(
        get_time_bucket, meta=("hour", "object")
    )

    monthly_trips_weather = (
        monthly_trips_weather.groupby(group_keys)
        .agg({"hvfhs_license_num": "count", "trip_miles": "mean"})
        .reset_index()
    )
    monthly_trips_weather = monthly_trips_weather.sort_values(
        by=group_keys, ascending=True
    )
    monthly_trips_weather = monthly_trips_weather.rename(
        columns={
            "hvfhs_license_num": "trips",
            "trip_miles": "avg_distance",
        },
    )
    # Everything above only builds the task graph; this triggers execution.
    monthly_trips_weather = monthly_trips_weather.compute()

    end = time.perf_counter()
    print("Total read and compute time: ", end - start)

    print(monthly_trips_weather.head())
    return monthly_trips_weather

run_dask()

KilledWorker: Attempted to run task ('read_csv-operation-0709f81d79b82a4af639efb6f62fbe15', 13) on 4 different workers, but all those workers died while running it. The last worker that attempt to run the task was tcp://172.31.64.224:46769. Inspecting worker logs is often a good next step to diagnose what went wrong. For more information see https://distributed.dask.org/en/stable/killed.html.

In [5]:
# Tear down in order: disconnect the client first, then close the cluster.
for handle in (client, cluster):
    handle.close()

Terminated dask-9d886db7-worker-a63c94d1 (i-0fd9f31f025cb4c04)
Terminated dask-9d886db7-worker-6418608b (i-0ca579a923a2ae0be)
Terminated dask-9d886db7-worker-5d61b4ba (i-024b8d0d619e3ef53)
Terminated dask-9d886db7-worker-3ad1b3e7 (i-0fec71d5255c5d50e)
Terminated dask-9d886db7-scheduler (i-0c66bddc13ed0b823)
