In [1]:
import os
import time

import boto3
import dask.dataframe as dd
from botocore import UNSIGNED
from botocore.config import Config
from dask.distributed import Client

### Setup 

Download the datasets from the public S3 bucket.

In [2]:
# Public S3 bucket that hosts the example datasets.
bucket_name = "bodo-example-data"

# Object keys (paths inside the bucket) of the two input files.
hvfhv_5M_path_s3 = "nyc-taxi/fhvhv_5M_rows.pq"
central_park_weather_path_s3 = "nyc-taxi/central_park_weather.csv"

In [3]:
def download_data_s3(path_to_s3: str, local_data_dir: str = "data") -> str:
    """Download an object from the S3 bucket unless a local copy already exists.

    The object is stored under ``local_data_dir`` using the last component of
    its S3 key as the file name. If a file with that name is already present
    there, the download is skipped and the existing path is returned.

    Args:
        path_to_s3: Key of the object inside the bucket named by the
            module-level ``bucket_name``.
        local_data_dir: Directory that holds the local copy. It is created,
            including any missing parent directories, when needed.

    Returns:
        Path of the local copy of the file.

    Raises:
        ValueError: If ``path_to_s3`` has no file-name component (it is empty
            or ends with ``/``).
    """
    file_name = path_to_s3.split("/")[-1]
    if not file_name:
        # Without a file name the local path would be the directory itself,
        # which the existence check below would mistake for a cached file.
        raise ValueError(f"S3 path does not name a file: {path_to_s3!r}")

    local_path = os.path.join(local_data_dir, file_name)

    if os.path.exists(local_path):
        return local_path

    print("Downloading dataset from S3...")

    # Requests are sent unsigned, i.e. without AWS credentials.
    s3 = boto3.client("s3", config=Config(signature_version=UNSIGNED))

    # makedirs copes with nested directories and, thanks to exist_ok, with a
    # directory that appears between an existence check and the creation.
    if local_data_dir:
        os.makedirs(local_data_dir, exist_ok=True)

    s3.download_file(bucket_name, path_to_s3, local_path)
    return local_path

In [4]:
# Fetch both input files into the local data directory; a file that is
# already present locally is not downloaded again.
weather_path = download_data_s3(central_park_weather_path_s3)
hvfhv_5M_path = download_data_s3(hvfhv_5M_path_s3)

In [5]:
def get_monthly_travels_weather(
    weather_dataset,
    hvfhv_dataset,
    storage_options=None,
    output_path="dask_monthly_trips_weather.pq",
):
    """Join trips with daily weather, aggregate them, and write the result to Parquet.

    Steps:
      1. Read the weather CSV and the trip Parquet data with Dask.
      2. Derive ``date``, ``month``, ``hour`` and ``weekday`` from each trip's
         ``pickup_datetime`` and inner-join trips to weather on ``date``.
      3. Flag rows whose ``precipitation`` exceeds 0.1 and label each trip
         with a time-of-day bucket based on its pickup hour.
      4. Group by pickup/drop-off location, month, weekday flag,
         precipitation flag and time bucket; count trips and average
         ``trip_miles``.
      5. Sort by the grouping columns and write the result to ``output_path``.

    Args:
        weather_dataset: Path/URL of the weather CSV; it must provide ``DATE``
            and ``PRCP`` columns.
        hvfhv_dataset: Path/URL of the trip Parquet data; the columns used are
            ``pickup_datetime``, ``PULocationID``, ``DOLocationID``,
            ``hvfhs_license_num`` and ``trip_miles``.
        storage_options: Passed through to ``dd.read_csv`` and
            ``dd.read_parquet``.
        output_path: Destination handed to ``to_parquet``. Defaults to the
            path this function has always written to.

    Returns:
        Elapsed time in seconds (float) covering graph construction, reading,
        computing and writing.
    """
    # perf_counter is monotonic, so the measured duration cannot be skewed by
    # system clock adjustments the way a time.time() difference can.
    start = time.perf_counter()

    central_park_weather_observations = dd.read_csv(
        weather_dataset, parse_dates=["DATE"], storage_options=storage_options
    )
    central_park_weather_observations = central_park_weather_observations.rename(
        columns={"DATE": "date", "PRCP": "precipitation"}
    )

    fhvhv_tripdata = dd.read_parquet(hvfhv_dataset, storage_options=storage_options)

    # Reduce both sides to calendar dates so they can be joined day by day.
    central_park_weather_observations["date"] = central_park_weather_observations[
        "date"
    ].dt.date
    fhvhv_tripdata["date"] = fhvhv_tripdata["pickup_datetime"].dt.date
    fhvhv_tripdata["month"] = fhvhv_tripdata["pickup_datetime"].dt.month
    fhvhv_tripdata["hour"] = fhvhv_tripdata["pickup_datetime"].dt.hour
    # dayofweek counts Monday as 0, so 0-4 selects Monday through Friday.
    fhvhv_tripdata["weekday"] = fhvhv_tripdata["pickup_datetime"].dt.dayofweek.isin(
        [0, 1, 2, 3, 4]
    )
    monthly_trips_weather = fhvhv_tripdata.merge(
        central_park_weather_observations, on="date", how="inner"
    )
    monthly_trips_weather["date_with_precipitation"] = (
        monthly_trips_weather["precipitation"] > 0.1
    )

    def get_time_bucket(t):
        """Map a pickup hour to a coarse time-of-day label."""
        bucket = "other"
        if t in (8, 9, 10):
            bucket = "morning"
        elif t in (11, 12, 13, 14, 15):
            bucket = "midday"
        elif t in (16, 17, 18):
            bucket = "afternoon"
        elif t in (19, 20, 21):
            bucket = "evening"
        return bucket

    # meta tells Dask the name and dtype of the mapped series up front.
    monthly_trips_weather["time_bucket"] = monthly_trips_weather.hour.map(
        get_time_bucket, meta=("hour", "object")
    )

    # The same columns define both the groups and the final sort order.
    group_keys = [
        "PULocationID",
        "DOLocationID",
        "month",
        "weekday",
        "date_with_precipitation",
        "time_bucket",
    ]
    monthly_trips_weather = (
        monthly_trips_weather.groupby(group_keys)
        .agg({"hvfhs_license_num": "count", "trip_miles": "mean"})
        .reset_index()
    )
    monthly_trips_weather = monthly_trips_weather.sort_values(
        by=group_keys,
        ascending=True,
    )
    monthly_trips_weather = monthly_trips_weather.rename(
        columns={
            "hvfhs_license_num": "trips",
            "trip_miles": "avg_distance",
        },
    )

    # compute=True triggers execution of the whole graph here, so the timing
    # below includes the actual work and the write.
    # NOTE(review): repeated runs write into the same output_path; confirm
    # whether part files from an earlier run can be left behind (to_parquet
    # has an `overwrite` option) or pass a distinct output_path per run.
    monthly_trips_weather.to_parquet(output_path, compute=True)

    return time.perf_counter() - start


def local_get_monthly_travels_weather(weather_dataset, hvfhv_dataset):
    """Time the trips/weather workload on a local Dask cluster.

    A ``Client()`` created with no arguments is used as a context manager for
    the duration of the run; the elapsed seconds returned by
    ``get_monthly_travels_weather`` are printed.
    """
    with Client():
        elapsed = get_monthly_travels_weather(
            weather_dataset=weather_dataset,
            hvfhv_dataset=hvfhv_dataset,
        )
        print("Total time for IO and compute:", elapsed)

In [None]:
# Run the workload on the smaller trip file (named "5M rows") and print the total time.
local_get_monthly_travels_weather(weather_path, hvfhv_5M_path)

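### Running on a Larger Dataset

Run the same workload on a larger input, the February 2019 trip data file, to see how the total time changes.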

In [7]:
# Key of the February 2019 trip file (the variable name suggests roughly 20M
# rows); it is downloaded unless a local copy already exists.
hvfhv_20M_path_s3 = "nyc-taxi/fhvhv_tripdata/fhvhv_tripdata_2019-02.parquet"
hvfhv_20M_path = download_data_s3(hvfhv_20M_path_s3)

In [None]:
# Repeat the timed run with the larger trip file.
local_get_monthly_travels_weather(weather_path, hvfhv_20M_path)