In [7]:
import os
import time

import boto3
import polars as pl
from botocore import UNSIGNED
from botocore.config import Config

### Setup 

Download the datasets from the public S3 bucket.

In [8]:
# Public S3 bucket that hosts the example datasets.
bucket_name = "bodo-example-data"

# Object keys of the two input files inside that bucket.
hvfhv_5M_path_s3 = "nyc-taxi/fhvhv_5M_rows.pq"
central_park_weather_path_s3 = "nyc-taxi/central_park_weather.csv"

In [9]:
def download_data_s3(path_to_s3: str, local_data_dir: str = "data") -> str:
    """Download an object from the public S3 bucket unless a local copy exists.

    Args:
        path_to_s3: Object key inside ``bucket_name``, e.g.
            ``"nyc-taxi/central_park_weather.csv"``.
        local_data_dir: Directory the file is stored in; created if missing.

    Returns:
        Path of the local file: ``local_data_dir`` joined with the final
        component of ``path_to_s3``.

    Raises:
        ValueError: If ``path_to_s3`` has no file-name component (empty string
            or a key ending in ``"/"``).
    """
    # Use the last component of the key as the file name. Picking a fixed index
    # after splitting on "/" selects a directory name for keys nested more than
    # one level deep (e.g. "nyc-taxi/fhvhv_tripdata/<file>.parquet") and fails
    # for keys that contain no "/" at all.
    file_name = path_to_s3.rsplit("/", 1)[-1]
    if not file_name:
        raise ValueError(f"S3 key has no file name component: {path_to_s3!r}")
    local_path = os.path.join(local_data_dir, file_name)

    # A previously downloaded copy is reused as-is; nothing is re-fetched.
    if os.path.exists(local_path):
        return local_path

    print("Downloading dataset from S3...")

    # The bucket is public, so requests are sent unsigned (no credentials).
    s3 = boto3.client("s3", config=Config(signature_version=UNSIGNED))

    # makedirs + exist_ok also handles nested directories and a directory that
    # appears between the check and the creation.
    os.makedirs(local_data_dir, exist_ok=True)

    s3.download_file(bucket_name, path_to_s3, local_path)
    return local_path

In [10]:
# Fetch both inputs. download_data_s3 returns the existing local path when the
# file is already present, so re-running this cell does not download again.
weather_path = download_data_s3(central_park_weather_path_s3)
hvfhv_5M_path = download_data_s3(hvfhv_5M_path_s3)

In [35]:
def get_monthly_travels_weather(
    weather_dataset_path: str,
    hvfhv_dataset_path: str,
    output_path: str = "polars_monthly_trips_weather.pq",
) -> "pl.LazyFrame":
    """Aggregate trips by route, month, weekday flag, precipitation flag and time bucket.

    The query is built lazily, executed once by ``sink_parquet`` and written to
    ``output_path``. The elapsed time (including the write) is printed.

    Args:
        weather_dataset_path: CSV file with at least ``DATE`` and ``PRCP`` columns.
        hvfhv_dataset_path: Parquet file with at least ``pickup_datetime``,
            ``PULocationID``, ``DOLocationID``, ``hvfhs_license_num`` and
            ``trip_miles`` columns.
        output_path: Destination Parquet file. Defaults to the file name this
            notebook has always written, so existing calls behave the same.

    Returns:
        The lazy query that was sunk to ``output_path``. This is the
        un-collected ``LazyFrame`` itself, not the materialized rows; read
        ``output_path`` to get the computed result without re-running the query.
    """
    start = time.perf_counter()  # monotonic clock, intended for measuring intervals

    # Read data lazily; nothing is loaded until the sink below runs.
    hvfhv_dataset = pl.scan_parquet(hvfhv_dataset_path)
    weather_dataset = pl.scan_csv(weather_dataset_path, try_parse_dates=True)

    # Keep only the date and a boolean "precipitation above 0.1" flag.
    weather_dataset = weather_dataset.select(
        pl.col("DATE").alias("date"),
        (pl.col("PRCP") > 0.1).alias("date_with_precipitation"),
    )

    pickup = pl.col("pickup_datetime")
    hvfhv_dataset = hvfhv_dataset.with_columns(
        pickup.dt.date().alias("date"),
        pickup.dt.month().alias("month"),
        pickup.dt.hour().alias("hour"),
        # dt.weekday() is ISO-numbered (Monday=1 ... Sunday=7), so 1-5 marks
        # Monday through Friday. The resulting "weekday" column is boolean.
        pickup.dt.weekday().is_in([1, 2, 3, 4, 5]).alias("weekday"),
    )

    # Merge with weather observations (default inner join: trips whose date has
    # no weather row are dropped).
    monthly_trips_weather = hvfhv_dataset.join(weather_dataset, on="date")

    # Place rides in a bucket determined by the hour of the day. Written as a
    # native expression rather than a per-row Python callback via map_elements,
    # so the engine can evaluate it without calling back into Python.
    # is_between is inclusive on both ends by default.
    hour = pl.col("hour")
    time_bucket = (
        pl.when(hour.is_between(8, 10))
        .then(pl.lit("morning"))
        .when(hour.is_between(11, 15))
        .then(pl.lit("midday"))
        .when(hour.is_between(16, 18))
        .then(pl.lit("afternoon"))
        .when(hour.is_between(19, 21))
        .then(pl.lit("evening"))
        .otherwise(pl.lit("other"))
        .alias("time_bucket")
    )
    monthly_trips_weather = monthly_trips_weather.with_columns(time_bucket)

    # The same keys define both the groups and the output ordering.
    group_keys = [
        "PULocationID",
        "DOLocationID",
        "month",
        "weekday",
        "date_with_precipitation",
        "time_bucket",
    ]

    # Get total trips and average distance for all trips in each group.
    monthly_trips_weather = (
        monthly_trips_weather.group_by(group_keys)
        .agg(
            pl.col("hvfhs_license_num").count().alias("count"),
            pl.col("trip_miles").mean().alias("avg_distance"),
        )
        .sort(group_keys)
    )

    # Executes the whole query and writes the result to disk.
    monthly_trips_weather.sink_parquet(output_path)

    end = time.perf_counter()
    print("Monthly Taxi Travel Times Computation Time: ", end - start)

    return monthly_trips_weather

In [None]:
# Run the pipeline on the fhvhv_5M_rows.pq sample downloaded above.
get_monthly_travels_weather(weather_path, hvfhv_5M_path)

### Running on a Larger Dataset


In [19]:
# Key of the February 2019 trip file; download it (or reuse the local copy if
# one already exists) and keep the local path for the run below.
hvfhv_20M_path_s3 = "nyc-taxi/fhvhv_tripdata/fhvhv_tripdata_2019-02.parquet"
hvfhv_20M_path = download_data_s3(hvfhv_20M_path_s3)

In [31]:
# NOTE(review): this only rebinds the S3 key. No visible cell passes the new
# value to download_data_s3, and the run below uses hvfhv_20M_path, which was
# derived from the earlier key. Confirm which file is intended.
hvfhv_20M_path_s3 = "nyc-taxi/fhvhv_tripdata_rewrite/fhvhv_tripdata_2019-02.parquet"

In [32]:
# Same pipeline, this time on the larger February 2019 file.
get_monthly_travels_weather(weather_path, hvfhv_20M_path)

Monthly Taxi Travel Times Computation Time:  4.458498239517212


PULocationID,DOLocationID,month,weekday,date_with_precipitation,time_bucket,count,avg_distance
i64,i64,i8,bool,bool,str,u32,f64
1,1,2,false,false,"""midday""",1,19.68
1,1,2,false,false,"""other""",1,0.21
1,1,2,true,false,"""afternoon""",2,4.955
1,1,2,true,false,"""evening""",1,25.25
1,1,2,true,false,"""midday""",1,18.87
…,…,…,…,…,…,…,…
265,265,2,true,true,"""afternoon""",17,5.488824
265,265,2,true,true,"""evening""",14,4.835714
265,265,2,true,true,"""midday""",20,5.571
265,265,2,true,true,"""morning""",7,1.488571
