In [1]:
import os
import time

import boto3
import pandas as pd
from botocore import UNSIGNED
from botocore.config import Config

import bodo

### Setup 

Download the datasets from the public S3 bucket.

In [2]:
# Bucket that holds the example data.
bucket_name = "bodo-example-data"

# Object keys inside that bucket.
hvfhv_5M_path_s3 = "nyc-taxi/fhvhv_5M_rows.pq"
central_park_weather_path_s3 = "nyc-taxi/central_park_weather.csv"

In [12]:
def download_data_s3(path_to_s3: str, local_data_dir: str = "data") -> str:
    """Download an object from the S3 bucket unless a local copy already exists.

    The object is stored in ``local_data_dir`` under the last component of its
    key. If a file is already present at that location the download is skipped.

    Args:
        path_to_s3: Key of the object inside the bucket named by the global
            ``bucket_name``.
        local_data_dir: Directory that holds the local copy; created if missing.

    Returns:
        Path of the local copy.

    Raises:
        ValueError: If ``path_to_s3`` does not end in a file name.
    """
    # Take the LAST key component. Indexing the split result with 1 picks a
    # directory name for keys nested more than one level deep
    # (e.g. "a/b/file.parquet" -> "b").
    file_name = path_to_s3.rsplit("/", 1)[-1]
    if not file_name:
        raise ValueError(f"S3 key does not name a file: {path_to_s3!r}")
    local_path = os.path.join(local_data_dir, file_name)

    if os.path.exists(local_path):
        return local_path

    print("Downloading dataset from S3...")

    # Unsigned requests: no AWS credentials are attached to the calls.
    s3 = boto3.client("s3", config=Config(signature_version=UNSIGNED))

    # Creates missing parent directories too and tolerates an existing directory.
    os.makedirs(local_data_dir, exist_ok=True)

    s3.download_file(bucket_name, path_to_s3, local_path)
    return local_path

In [4]:
# Fetch (or reuse the local copies of) the weather CSV and the 5M-row trip sample;
# each call returns the local file path.
weather_path = download_data_s3(central_park_weather_path_s3)
hvfhv_5M_path = download_data_s3(hvfhv_5M_path_s3)

In [5]:
def get_monthly_travels_weather(weather_dataset, hvfhv_dataset):
    """Join trip records with daily weather and aggregate trip statistics.

    Reads the weather CSV and the trip parquet file, joins them on the
    calendar date, and groups by pickup/drop-off location, month, weekday
    flag, precipitation flag and time-of-day bucket. The result is sorted by
    the grouping keys, written to ``bodo_monthly_trips_weather.pq`` and
    returned. The elapsed wall-clock time, including the parquet write, is
    printed.

    Args:
        weather_dataset: CSV source with at least ``DATE`` and ``PRCP`` columns.
        hvfhv_dataset: Parquet source with at least ``pickup_datetime``,
            ``PULocationID``, ``DOLocationID``, ``hvfhs_license_num`` and
            ``trip_miles`` columns.

    Returns:
        DataFrame with the six grouping columns plus ``trips`` (row count per
        group) and ``avg_distance`` (mean of ``trip_miles`` per group).
    """
    start = time.time()
    central_park_weather_observations = pd.read_csv(
        weather_dataset,
        parse_dates=["DATE"],
    )
    central_park_weather_observations = central_park_weather_observations.rename(
        columns={"DATE": "date", "PRCP": "precipitation"}, copy=False
    )
    fhvhv_tripdata = pd.read_parquet(hvfhv_dataset)

    # Reduce both timestamps to calendar dates so they can serve as the join key.
    central_park_weather_observations["date"] = central_park_weather_observations[
        "date"
    ].dt.date
    fhvhv_tripdata["date"] = fhvhv_tripdata["pickup_datetime"].dt.date
    fhvhv_tripdata["month"] = fhvhv_tripdata["pickup_datetime"].dt.month
    fhvhv_tripdata["hour"] = fhvhv_tripdata["pickup_datetime"].dt.hour
    # dayofweek counts from Monday=0, so 0-4 flags Monday through Friday.
    fhvhv_tripdata["weekday"] = fhvhv_tripdata["pickup_datetime"].dt.dayofweek.isin(
        [0, 1, 2, 3, 4]
    )

    # Inner join: trips on dates without a weather observation are dropped.
    monthly_trips_weather = fhvhv_tripdata.merge(
        central_park_weather_observations, on="date", how="inner"
    )
    monthly_trips_weather["date_with_precipitation"] = (
        monthly_trips_weather["precipitation"] > 0.1
    )

    # Maps a pickup hour (0-23) to a coarse label; hours outside the listed
    # ranges (22-7) fall into "other".
    def get_time_bucket(t):
        bucket = "other"
        if t in (8, 9, 10):
            bucket = "morning"
        elif t in (11, 12, 13, 14, 15):
            bucket = "midday"
        elif t in (16, 17, 18):
            bucket = "afternoon"
        elif t in (19, 20, 21):
            bucket = "evening"
        return bucket

    monthly_trips_weather["time_bucket"] = monthly_trips_weather.hour.map(
        get_time_bucket
    )
    # Count rows (via the license column) and average the distance per group.
    monthly_trips_weather = monthly_trips_weather.groupby(
        [
            "PULocationID",
            "DOLocationID",
            "month",
            "weekday",
            "date_with_precipitation",
            "time_bucket",
        ],
        as_index=False,
    ).agg({"hvfhs_license_num": "count", "trip_miles": "mean"})
    monthly_trips_weather = monthly_trips_weather.sort_values(
        by=[
            "PULocationID",
            "DOLocationID",
            "month",
            "weekday",
            "date_with_precipitation",
            "time_bucket",
        ]
    )
    monthly_trips_weather = monthly_trips_weather.rename(
        columns={
            "hvfhs_license_num": "trips",
            "trip_miles": "avg_distance",
        },
        copy=False,
    )

    monthly_trips_weather.to_parquet("bodo_monthly_trips_weather.pq")
    # Timestamp is taken after the write so the reported time covers it.
    end = time.time()
    print("Total E2E time:", (end - start))
    return monthly_trips_weather

In [6]:
# Wrap the function with bodo.jit and call the wrapped version right away on the
# `fhvhv_5M_rows.pq` sample; the returned DataFrame is the cell's displayed value.
# NOTE(review): cache=True presumably persists the compiled code between runs — confirm.
bodo.jit(get_monthly_travels_weather, cache=True)(weather_path, hvfhv_5M_path)



Total E2E time: 1.5309290000000146




Unnamed: 0,PULocationID,DOLocationID,month,weekday,date_with_precipitation,time_bucket,trips,avg_distance
287389,1,1,2,True,False,midday,1,18.87
264417,1,1,2,True,False,morning,3,2.47
100659,1,1,2,True,False,other,1,14.23
293113,1,1,2,True,True,midday,1,17.07
20376,1,3,2,False,False,evening,1,30.31
...,...,...,...,...,...,...,...,...
368566,265,265,2,True,True,afternoon,7,3.68
371052,265,265,2,True,True,evening,2,3.615
106349,265,265,2,True,True,midday,4,2.9575
104961,265,265,2,True,True,morning,2,0.88


### Running on a Larger Dataset

In [13]:
# Key of the larger trip file (the name indicates February 2019 data; the "20M"
# in the variable name presumably refers to its row count — not verified here).
hvfhv_20M_path_s3 = "nyc-taxi/fhvhv_tripdata/fhvhv_tripdata_2019-02.parquet"
# Download it (or reuse the local copy) and keep the local path.
hvfhv_20M_path = download_data_s3(hvfhv_20M_path_s3)

Downloading dataset from S3...


In [8]:
# Same jitted pipeline as for the 5M-row sample, now fed the larger trip file.
bodo.jit(get_monthly_travels_weather, cache=True)(weather_path, hvfhv_20M_path)



Total E2E time: 4.952475999999933




Unnamed: 0,PULocationID,DOLocationID,month,weekday,date_with_precipitation,time_bucket,trips,avg_distance
427387,1,1,2,False,False,midday,1,19.68
232987,1,1,2,False,False,other,1,0.21
303633,1,1,2,True,False,afternoon,2,4.955
303811,1,1,2,True,False,evening,1,25.25
474786,1,1,2,True,False,midday,1,18.87
...,...,...,...,...,...,...,...,...
609599,265,265,2,True,True,afternoon,17,5.488824
612085,265,265,2,True,True,evening,14,4.835714
159802,265,265,2,True,True,midday,20,5.571
158414,265,265,2,True,True,morning,7,1.488571


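### Run using Bodo DataFrames

Rebinding `pd` to `bodo.pandas` makes the same function, called without `bodo.jit`, go through the Bodo DataFrames API instead of pandas.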

In [1]:
# Deliberately rebinds the global name `pd` (previously pandas) to bodo.pandas.
# Functions defined in this notebook look up `pd` when they are called, so every
# later call to get_monthly_travels_weather uses bodo.pandas.
import bodo.pandas as pd

In [2]:
# Plain call (no bodo.jit) on the 5M-row sample. The function must already be
# defined in the running kernel, so run the definition cell first.
get_monthly_travels_weather(weather_path, hvfhv_5M_path)

NameError: name 'get_monthly_travels_weather' is not defined

In [16]:
# Plain call (no bodo.jit) on the larger trip file.
get_monthly_travels_weather(weather_path, hvfhv_20M_path)



Total E2E time: 9.646630764007568




Unnamed: 0,PULocationID,DOLocationID,month,weekday,date_with_precipitation,time_bucket,trips,avg_distance
0,1,1,2,False,False,midday,1,19.68
1,1,1,2,False,False,other,1,0.21
2,1,1,2,True,False,afternoon,2,4.955
3,1,1,2,True,False,evening,1,25.25
4,1,1,2,True,False,midday,1,18.87
...,...,...,...,...,...,...,...,...
640985,265,265,2,True,True,afternoon,17,5.488824
640986,265,265,2,True,True,evening,14,4.835714
640987,265,265,2,True,True,midday,20,5.571
640988,265,265,2,True,True,morning,7,1.488571
