# In [None]:
import warnings
from pathlib import Path

import pandas as pd

# Half-open time window [START, END): END itself is excluded, so the window
# covers 1-14 June 2023, which is what the "20230601_14" output names encode.
START = "2023-06-01"
END = "2023-06-15"

# Lower-cased start-time column names to look for, in order of preference.
TIME_COLUMNS = ("started_at", "starttime")

data_path = Path("data/raw/citibike")
output_dir = Path("data/processed")


def load_trip_times(file, start=START, end=END):
    """Return the trip start dates/hours of one CSV that fall in [start, end).

    Only the start-time column is read from disk; the header is inspected
    first so that files without such a column are skipped cheaply.

    Args:
        file: Path of the CSV file to read.
        start: Inclusive lower bound of the window (date-like string).
        end: Exclusive upper bound of the window (date-like string).

    Returns:
        A DataFrame with columns ``trip_date`` (``datetime.date``) and
        ``trip_hour`` (timestamp floored to the hour), one row per trip, or
        ``None`` when the file has none of the ``TIME_COLUMNS`` (matched
        case-insensitively).
    """
    header = pd.read_csv(file, nrows=0).columns
    by_lower = {str(name).lower(): name for name in header}
    source_col = next(
        (by_lower[name] for name in TIME_COLUMNS if name in by_lower), None
    )
    if source_col is None:
        return None

    raw = pd.read_csv(file, usecols=[source_col], low_memory=False)[source_col]
    # Unparseable timestamps become NaT and are dropped.
    started = pd.to_datetime(raw, errors="coerce").dropna()
    started = started[(started >= start) & (started < end)]
    # Build a fresh frame instead of assigning columns onto a filtered slice,
    # which is what triggers pandas' SettingWithCopyWarning.
    return pd.DataFrame(
        {"trip_date": started.dt.date, "trip_hour": started.dt.floor("h")}
    )


def count_trips(trips, key, count_name):
    """Count the rows of *trips* per distinct value of column *key*.

    Returns a DataFrame with columns ``key`` and ``count_name``, sorted by
    ``key``.
    """
    return (
        trips.groupby(key).size().reset_index(name=count_name).sort_values(key)
    )


csv_files = sorted(data_path.glob("*.csv"))

frames = []
for file in csv_files:
    trips = load_trip_times(file, START, END)
    if trips is not None:
        frames.append(trips)

if frames:
    all_trips = pd.concat(frames, ignore_index=True)
else:
    # pd.concat([]) raises "No objects to concatenate"; fall back to an empty
    # frame with the expected schema and make the situation visible.
    warnings.warn(
        f"No CSV file under {data_path} has a 'started_at' or 'starttime' "
        "column; writing empty outputs."
    )
    all_trips = pd.DataFrame(
        {
            "trip_date": pd.Series(dtype="object"),
            "trip_hour": pd.Series(dtype="datetime64[ns]"),
        }
    )

daily_counts = count_trips(all_trips, "trip_date", "trip_count_daily")
hourly_counts = count_trips(all_trips, "trip_hour", "trip_count_hourly")

# to_csv does not create missing parent directories.
output_dir.mkdir(parents=True, exist_ok=True)
# NOTE: the file names hard-code the window; update them together with
# START / END.
daily_counts.to_csv(output_dir / "citibike_daily_20230601_14.csv", index=False)
hourly_counts.to_csv(output_dir / "citibike_hourly_20230601_14.csv", index=False)

daily_counts.head(), hourly_counts.head()
