In [2]:
import os
import time

import boto3
import pandas as pd
from botocore import UNSIGNED
from botocore.config import Config

### Setup 

Download the datasets from the public S3 bucket.

In [3]:
# Public S3 bucket holding the example data, followed by the object keys
# (paths inside that bucket) of the files this notebook downloads.
bucket_name = "bodo-example-data"
central_park_weather_path_s3 = "nyc-taxi/central_park_weather.csv"
hvfhv_5M_path_s3 = "nyc-taxi/fhvhv_5M_rows.pq"

In [4]:
def download_data_s3(path_to_s3: str, local_data_dir: str = "data") -> str:
    """Download an object from the S3 bucket unless a local copy already exists.

    Args:
        path_to_s3: Object key inside ``bucket_name``, for example
            ``"nyc-taxi/central_park_weather.csv"``.
        local_data_dir: Directory in which the downloaded file is stored. It is
            created if it does not exist yet.

    Returns:
        The local path of the file: ``local_data_dir`` joined with the last
        component of ``path_to_s3``. If that path already exists, it is
        returned immediately and nothing is downloaded.
    """
    # The file name is the LAST key component. Taking index 1 of a full split
    # returned a directory name for keys nested more than one level deep and
    # raised IndexError for keys without any "/".
    file_name = path_to_s3.rsplit("/", 1)[-1]
    local_path = os.path.join(local_data_dir, file_name)

    if os.path.exists(local_path):
        return local_path

    print("Downloading dataset from S3...")

    # Unsigned requests: the bucket is read anonymously, without AWS credentials.
    s3 = boto3.client("s3", config=Config(signature_version=UNSIGNED))

    # exist_ok avoids the check-then-create race and also handles nested paths.
    os.makedirs(local_data_dir, exist_ok=True)

    s3.download_file(bucket_name, path_to_s3, local_path)
    return local_path

In [5]:
# Fetch both input files; a file that is already present locally is reused.
weather_path = download_data_s3(path_to_s3=central_park_weather_path_s3)
hvfhv_5M_path = download_data_s3(path_to_s3=hvfhv_5M_path_s3)

Downloading dataset from S3...
Downloading dataset from S3...


In [6]:
def get_monthly_travels_weather(weather_dataset, hvfhv_dataset):
    """Summarise trips per route, month, day type, precipitation flag and time bucket.

    The weather CSV and the trip parquet file are loaded, inner-joined on the
    calendar date, and grouped. For every group the number of
    ``hvfhs_license_num`` values (``trips``) and the mean of ``trip_miles``
    (``avg_distance``) are computed. The sorted result is written to
    ``pandas_monthly_trips_weather.pq`` and returned. Read, compute, write and
    end-to-end wall-clock timings are printed along the way.

    Args:
        weather_dataset: Source passed to ``pd.read_csv``; must provide the
            ``DATE`` and ``PRCP`` columns.
        hvfhv_dataset: Source passed to ``pd.read_parquet``; must provide
            ``pickup_datetime``, ``PULocationID``, ``DOLocationID``,
            ``hvfhs_license_num`` and ``trip_miles``.

    Returns:
        The aggregated DataFrame, one row per group.
    """
    # Columns that define a group; the same list drives the final ordering.
    group_keys = [
        "PULocationID",
        "DOLocationID",
        "month",
        "weekday",
        "date_with_precipitation",
        "time_bucket",
    ]
    # Pickup hours belonging to each named part of the day.
    hour_buckets = (
        ((8, 9, 10), "morning"),
        ((11, 12, 13, 14, 15), "midday"),
        ((16, 17, 18), "afternoon"),
        ((19, 20, 21), "evening"),
    )

    def bucket_for_hour(hour):
        """Return the label for a pickup hour, or "other" outside every bucket."""
        for hours, label in hour_buckets:
            if hour in hours:
                return label
        return "other"

    # --- read -----------------------------------------------------------
    start_read = time.time()
    weather = pd.read_csv(weather_dataset, parse_dates=["DATE"]).rename(
        columns={"DATE": "date", "PRCP": "precipitation"}, copy=False
    )
    trips = pd.read_parquet(hvfhv_dataset)
    end = time.time()
    print("Reading Time: ", (end - start_read))

    # --- compute --------------------------------------------------------
    start_compute = time.time()

    # Reduce both sides to plain calendar dates so they can be joined per day.
    weather["date"] = weather["date"].dt.date
    trips["date"] = trips["pickup_datetime"].dt.date
    trips["month"] = trips["pickup_datetime"].dt.month
    trips["hour"] = trips["pickup_datetime"].dt.hour
    # dayofweek 0-4 is Monday-Friday, so this flag is True on working days.
    trips["weekday"] = trips["pickup_datetime"].dt.dayofweek.isin([0, 1, 2, 3, 4])

    joined = trips.merge(weather, on="date", how="inner")
    joined["date_with_precipitation"] = joined["precipitation"] > 0.1
    joined["time_bucket"] = joined["hour"].map(bucket_for_hour)

    summary = (
        joined.groupby(group_keys, as_index=False)
        .agg({"hvfhs_license_num": "count", "trip_miles": "mean"})
        .sort_values(by=group_keys)
        .rename(
            columns={"hvfhs_license_num": "trips", "trip_miles": "avg_distance"},
            copy=False,
        )
    )
    end = time.time()
    print("Monthly Taxi Travel Times Computation Time: ", end - start_compute)

    # --- write ----------------------------------------------------------
    start_write = time.time()
    summary.to_parquet("pandas_monthly_trips_weather.pq")
    end = time.time()
    print("Writing time:", (end - start_write))
    print("Total E2E time:", (end - start_read))
    return summary

In [7]:
# Run the pipeline on the 5M-row trip sample; the returned frame is the cell output.
get_monthly_travels_weather(
    weather_dataset=weather_path,
    hvfhv_dataset=hvfhv_5M_path,
)

Reading Time:  0.4263150691986084
Monthly Taxi Travel Times Computation Time:  3.1104328632354736
Writing time: 0.04564404487609863
Total E2E time: 3.5826308727264404


Unnamed: 0,PULocationID,DOLocationID,month,weekday,date_with_precipitation,time_bucket,trips,avg_distance
0,1,1,2,True,False,midday,1,18.8700
1,1,1,2,True,False,morning,3,2.4700
2,1,1,2,True,False,other,1,14.2300
3,1,1,2,True,True,midday,1,17.0700
4,1,3,2,False,False,evening,1,30.3100
...,...,...,...,...,...,...,...,...
373163,265,265,2,True,True,afternoon,7,3.6800
373164,265,265,2,True,True,evening,2,3.6150
373165,265,265,2,True,True,midday,4,2.9575
373166,265,265,2,True,True,morning,2,0.8800


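### Running on a Larger Dataset

The same pipeline is run again on a larger trip file (the `2019-02` parquet file, referred to as `hvfhv_20M` below) to see how the timings change with more data.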

In [8]:
# Object key of the larger trip file in the same bucket.
hvfhv_20M_path_s3 = "nyc-taxi/fhvhv_tripdata/fhvhv_tripdata_2019-02.parquet"
# Download it, or reuse the local copy if one is already present.
hvfhv_20M_path = download_data_s3(path_to_s3=hvfhv_20M_path_s3)

Downloading dataset from S3...


In [9]:
# Same pipeline, this time on the larger trip file downloaded above.
get_monthly_travels_weather(
    weather_dataset=weather_path,
    hvfhv_dataset=hvfhv_20M_path,
)

Reading Time:  2.4875190258026123
Monthly Taxi Travel Times Computation Time:  15.4218168258667
Writing time: 0.08026504516601562
Total E2E time: 17.990182876586914


Unnamed: 0,PULocationID,DOLocationID,month,weekday,date_with_precipitation,time_bucket,trips,avg_distance
0,1,1,2,False,False,midday,1,19.680000
1,1,1,2,False,False,other,1,0.210000
2,1,1,2,True,False,afternoon,2,4.955000
3,1,1,2,True,False,evening,1,25.250000
4,1,1,2,True,False,midday,1,18.870000
...,...,...,...,...,...,...,...,...
640985,265,265,2,True,True,afternoon,17,5.488824
640986,265,265,2,True,True,evening,14,4.835714
640987,265,265,2,True,True,midday,20,5.571000
640988,265,265,2,True,True,morning,7,1.488571
