In [11]:
import os
import time

import boto3
import daft
import pandas as pd
from botocore import UNSIGNED
from botocore.config import Config
from daft import col

### Setup 

Download the datasets from the public S3 bucket.

In [2]:
# Public S3 bucket that hosts the example datasets.
bucket_name = "bodo-example-data"

# Object keys inside the bucket: daily Central Park weather observations and
# the FHVHV trip sample.
central_park_weather_path_s3 = "nyc-taxi/central_park_weather.csv"
hvfhv_5M_path_s3 = "nyc-taxi/fhvhv_5M_rows.pq"

In [3]:
def download_data_s3(path_to_s3: str, local_data_dir: str = "data") -> str:
    """Return the local path of an S3 object, downloading it on first use.

    The object is fetched from the module-level ``bucket_name`` bucket using
    unsigned (anonymous) requests. If a file with the same name already exists
    in ``local_data_dir`` the download is skipped.

    Args:
        path_to_s3: Object key inside the bucket, e.g.
            ``"nyc-taxi/central_park_weather.csv"``.
        local_data_dir: Directory in which the file is stored locally. It is
            created (including missing parents) when needed.

    Returns:
        Path of the local copy: ``local_data_dir`` joined with the last
        component of ``path_to_s3``.
    """
    # Use the LAST key component as the file name. Indexing with [1] picked
    # the second component, which is a directory name for keys nested more
    # than one level deep and fails for keys without any "/".
    file_name = path_to_s3.rsplit("/", 1)[-1]
    local_path = os.path.join(local_data_dir, file_name)

    if os.path.exists(local_path):
        return local_path

    print("Downloading dataset from S3...")

    # Unsigned requests: no AWS credentials are sent.
    s3 = boto3.client("s3", config=Config(signature_version=UNSIGNED))

    # exist_ok avoids a race between an existence check and the creation,
    # and makedirs also handles nested target directories.
    os.makedirs(local_data_dir, exist_ok=True)

    s3.download_file(bucket_name, path_to_s3, local_path)
    return local_path

In [4]:
# Fetch (or reuse the local copies of) the weather observations and the
# 5M-row trip sample.
weather_path = download_data_s3(path_to_s3=central_park_weather_path_s3)
hvfhv_5M_path = download_data_s3(path_to_s3=hvfhv_5M_path_s3)

In [12]:
def get_monthly_travels_weather(
    weather_dataset_path, hvfhv_dataset_path, output_path="daft_result.pq"
):
    """Join trips with daily weather and aggregate trip counts and distances.

    Reads the weather CSV and the trip Parquet data with Daft, joins them on
    the calendar date, and groups by pickup/drop-off location, month, weekday
    flag, precipitation flag and time-of-day bucket. The aggregated frame is
    sorted by the same keys, materialized, written to ``output_path``
    (overwriting existing output) and returned. The elapsed time of the whole
    run is printed.

    Args:
        weather_dataset_path: Path to the weather CSV; the code reads its
            ``DATE`` and ``PRCP`` columns.
        hvfhv_dataset_path: Path to the trip Parquet data; the code reads its
            ``pickup_datetime``, ``PULocationID``, ``DOLocationID``,
            ``hvfhs_license_num`` and ``trip_miles`` columns.
        output_path: Destination of the Parquet result. Defaults to
            ``"daft_result.pq"``.

    Returns:
        The collected Daft DataFrame holding the grouping keys plus the
        ``trips`` (count) and ``avg_distance`` (mean of ``trip_miles``)
        columns.
    """
    # perf_counter is monotonic, so the measured duration cannot be skewed by
    # system clock adjustments.
    start = time.perf_counter()

    # Keys shared by the group-by and the final sort; defined once so the two
    # steps cannot drift apart.
    group_keys = [
        "PULocationID",
        "DOLocationID",
        "month",
        "weekday",
        "date_with_precipitation",
        "time_bucket",
    ]

    # read data, rename some columns
    central_park_weather_observations = daft.read_csv(weather_dataset_path)
    central_park_weather_observations = (
        central_park_weather_observations.with_columns_renamed(
            {"DATE": "date", "PRCP": "precipitation"}
        )
    )
    hvfhv_dataset = daft.read_parquet(hvfhv_dataset_path)

    # datetime manipulation
    central_park_weather_observations = central_park_weather_observations.with_column(
        "date",
        central_park_weather_observations["date"].dt.date(),
    )
    hvfhv_dataset = hvfhv_dataset.with_columns(
        {
            "date": col("pickup_datetime").dt.date(),
            "month": col("pickup_datetime").dt.month(),
            "hour": col("pickup_datetime").dt.hour(),
            # True for day-of-week values 0-4 (Monday-Friday, assuming Daft's
            # Monday=0 numbering).
            "weekday": col("pickup_datetime").dt.day_of_week().is_in([0, 1, 2, 3, 4]),
        }
    )

    # combine NYC taxi dataset with weather observations
    monthly_trips_weather = hvfhv_dataset.join(
        central_park_weather_observations, on="date", how="inner"
    )

    # flag days whose recorded precipitation exceeds 0.1
    monthly_trips_weather = monthly_trips_weather.with_column(
        "date_with_precipitation", col("precipitation") > 0.1
    )

    def get_time_bucket(t):
        """Map an hour of day to a coarse time-of-day label."""
        bucket = "other"
        if t in (8, 9, 10):
            bucket = "morning"
        elif t in (11, 12, 13, 14, 15):
            bucket = "midday"
        elif t in (16, 17, 18):
            bucket = "afternoon"
        elif t in (19, 20, 21):
            bucket = "evening"
        return bucket

    monthly_trips_weather = monthly_trips_weather.with_column(
        "time_bucket",
        col("hour").apply(get_time_bucket, return_dtype=daft.DataType.string()),
    )

    # trip count and mean distance per key combination
    monthly_trips_weather = monthly_trips_weather.groupby(group_keys).agg(
        col("hvfhs_license_num").count(), col("trip_miles").mean()
    )

    monthly_trips_weather = monthly_trips_weather.sort(by=group_keys)

    monthly_trips_weather = monthly_trips_weather.with_columns_renamed(
        {
            "hvfhs_license_num": "trips",
            "trip_miles": "avg_distance",
        },
    )
    # Materialize once so the write below and the caller reuse the same result.
    monthly_trips_weather = monthly_trips_weather.collect()

    monthly_trips_weather.write_parquet(output_path, write_mode="overwrite")

    end = time.perf_counter()
    print("Total E2E time:", (end - start))

    return monthly_trips_weather

In [None]:
# Run the pipeline on the 5M-row sample and preview the first rows.
result = get_monthly_travels_weather(
    weather_dataset_path=weather_path,
    hvfhv_dataset_path=hvfhv_5M_path,
)
result.to_pandas().head()

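### Running on a Larger Dataset

Re-run the same pipeline on the full February 2019 FHVHV trip file to see how it behaves on a larger input.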



In [13]:
# Object key of the February 2019 FHVHV trip file, then fetch it (or reuse
# the local copy).
hvfhv_20M_path_s3 = "nyc-taxi/fhvhv_tripdata/fhvhv_tripdata_2019-02.parquet"
hvfhv_20M_path = download_data_s3(path_to_s3=hvfhv_20M_path_s3)

In [14]:
# Same pipeline, this time on the larger February 2019 trip file.
result = get_monthly_travels_weather(
    weather_dataset_path=weather_path,
    hvfhv_dataset_path=hvfhv_20M_path,
)
result.to_pandas().head()













[A[A[A[A[A[A[A[A[A[A[A[A











[A[A[A[A[A[A[A[A[A[A[A[A










[A[A[A[A[A[A[A[A[A[A[A










[A[A[A[A[A[A[A[A[A[A[A









[A[A[A[A[A[A[A[A[A[A









[A[A[A[A[A[A[A[A[A[A


[A[A[A


[A[A[A








[A[A[A[A[A[A[A[A[A








[A[A[A[A[A[A[A[A[A

[A[A

[A[A



[A[A[A[A



[A[A[A[A











[A[A[A[A[A[A[A[A[A[A[A[A


[A[A[A










[A[A[A[A[A[A[A[A[A[A[A

[A[A



[A[A[A[A








[A[A[A[A[A[A[A[A[A









[A[A[A[A[A[A[A[A[A[A









[A[A[A[A[A[A[A[A[A[A











[A[A[A[A[A[A[A[A[A[A[A[A










[A[A[A[A[A[A[A[A[A[A[A


[A[A[A

[A[A



[A[A[A[A








[A[A[A[A[A[A[A[A[A











[A[A[A[A[A[A[A[A[A[A[A[A


[A[A[A

[A[A



[A[A[A[A








[A[A[A[A[A[A[A[A[A








[A[A[A[A[A[A[A[A[A


[A[A[A

[A[A



[A

In [15]:
# Read the Parquet output back with pandas to inspect what the pipeline wrote.
# `pd` is imported in the import cell at the top of the notebook.
out_df = pd.read_parquet("daft_result.pq")

out_df

Unnamed: 0,PULocationID,DOLocationID,month,weekday,date_with_precipitation,time_bucket,trips,avg_distance
0,1,1,2,False,False,midday,1,19.680000
1,1,1,2,False,False,other,1,0.210000
2,1,1,2,True,False,afternoon,2,4.955000
3,1,1,2,True,False,evening,1,25.250000
4,1,1,2,True,False,midday,1,18.870000
...,...,...,...,...,...,...,...,...
640985,223,160,2,False,True,afternoon,1,6.890000
640986,223,160,2,False,True,midday,1,5.910000
640987,223,160,2,False,True,other,1,5.800000
640988,223,160,2,True,False,afternoon,35,6.482286
