In [1]:
# Show all output for a cell: with ast_node_interactivity set to "all",
# IPython displays the value of each expression statement in a cell rather
# than only the final one. The attribute is set on the InteractiveShell class.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [3]:
# Imports and data loading
import pandas as pd
from pathlib import Path
import pyarrow.parquet as pq

# Reporting period to load (used to build the input file name)
year = 2024
month = 1

# Read the processed Citi Bike rides for that period into a DataFrame.
# The month is zero-padded to two digits in the file name.
filename = f"citi_bike_rides_processed_{year}_{month:02}.parquet"
path = Path("..", "data", "processed", filename)
table = pq.read_table(path)
rides = table.to_pandas()

# Preview the first rows
rides.head()

Unnamed: 0,started_at,start_station_id
0,2024-01-22 18:43:19.012,7954.12
1,2024-01-11 19:19:18.721,6771.13
2,2024-01-30 19:17:41.693,5659.11
3,2024-01-27 11:27:01.759,6771.13
4,2024-01-16 15:15:41.000,7443.01


In [4]:
# Parse ride start times (values that cannot be parsed become NaT because of
# errors='coerce'), then derive each ride's hourly bin by truncating the
# parsed timestamp down to the start of its hour.
rides = rides.assign(
    started_at=pd.to_datetime(rides['started_at'], errors='coerce')
).assign(
    hour=lambda frame: frame['started_at'].dt.floor('H')
)

In [14]:
# Step 1: Count rides per start station and keep the three busiest.
# The result is a Series indexed by station ID whose values are ride counts.
station_counts = rides['start_station_id'].value_counts()
top_stations = station_counts.nlargest(3)
print("Top 3 stations and their ride counts:", top_stations, sep="\n")

# Station IDs only (the Series index), as a plain list for later use
top_station_ids = top_stations.index.to_list()

Top 3 stations and their ride counts:
start_station_id
6140.05    8308
6822.09    6554
6450.05    6318
Name: count, dtype: int64


In [None]:
# Build one zero-filled hourly ride-count series per top station and stack
# them into a single long-format frame with columns: hour, ride_count,
# station_id.

# Create full hourly range based strictly on data bounds
start_date = rides['hour'].min().floor('H')  # Start at the first actual hour
end_date = rides['hour'].max().floor('H')    # End at the last actual hour
full_range = pd.date_range(start=start_date, end=end_date, freq='H')

# Collect the per-station frames and concatenate once at the end, instead of
# growing a DataFrame inside the loop (which re-copies all rows every pass).
station_frames = []

# NOTE: `top_stations` is a Series of ride counts indexed by station ID.
# Iterating the Series itself yields the counts, so the station IDs must be
# taken from its index; otherwise the filter below compares IDs to counts.
for station in top_stations.index:
    station_rides = rides[rides['start_station_id'] == station]
    hourly_counts = (
        station_rides.groupby('hour')
        .size()
        .reindex(full_range, fill_value=0)  # Fill missing hours with 0
        .rename_axis('hour')                # Name the index explicitly so reset_index yields 'hour'
        .rename('ride_count')
        .reset_index()
    )
    hourly_counts['station_id'] = station
    station_frames.append(hourly_counts)

if station_frames:
    ts_combined = pd.concat(station_frames, ignore_index=True)
else:
    # pd.concat raises on an empty list; keep an empty frame when there are no stations
    ts_combined = pd.DataFrame(columns=['hour', 'ride_count', 'station_id'])

# Preview the result
ts_combined.head()
ts_combined.tail()

In [None]:
import pandas as pd

# Step 1: Build the reference grid of hours spanning the earliest to the
# latest timestamp present in the combined time series.
hour_col = ts_combined['hour']
expected_hours = pd.date_range(
    hour_col.min().floor('H'),
    hour_col.max().ceil('H'),
    freq='H',
)

# Step 2: For each top station, compare the hours it actually has against
# the reference grid and report the differences in both directions.
for station_id in top_station_ids:
    station_data = ts_combined.loc[ts_combined['station_id'] == station_id]
    # Index.difference needs an Index, so wrap the 'hour' column
    actual_hours = pd.DatetimeIndex(station_data['hour'])

    missing_hours = expected_hours.difference(actual_hours)
    extra_hours = actual_hours.difference(expected_hours)

    report_lines = [
        f"Station {station_id}:",
        f"  Total expected hours: {len(expected_hours)}",
        f"  Total actual hours:   {len(actual_hours)}",
        f"  Missing hours:        {len(missing_hours)}",
        f"  Extra hours:          {len(extra_hours)}\n",
    ]
    print("\n".join(report_lines))

    # Optional: list the individual gaps when there are any
    if len(missing_hours) > 0:
        print(f"  ❗ Missing hours for station {station_id}:")
        print(missing_hours.to_list())
        print()

In [15]:
from typing import Optional, List
import plotly.express as px
import pandas as pd

def plot_rides_top3(
    rides: pd.DataFrame,
    top_station_ids: List[str]
):
    """Show an hourly ride-count line chart for the given stations.

    Rows of ``rides`` whose ``station_id`` is in ``top_station_ids`` are
    plotted with Plotly Express: ``hour`` on the x-axis, ``ride_count`` on
    the y-axis, and one coloured line per ``station_id``.

    Args:
        rides: Frame with ``hour``, ``ride_count`` and ``station_id`` columns.
        top_station_ids: Station IDs to keep in the chart.

    Returns:
        None. The figure is displayed via ``fig.show()`` and not returned.
    """
    # Keep only the rows belonging to the requested stations
    is_top_station = rides.station_id.isin(top_station_ids)
    rides_to_plot = rides[is_top_station]

    # Chart settings gathered in one place, then handed to Plotly Express
    line_options = dict(
        x="hour",
        y="ride_count",
        color="station_id",
        template="none",
        title="Hourly Ride Counts for Top 3 Citi Bike Stations",
    )
    px.line(rides_to_plot, **line_options).show()

In [26]:
# Render the hourly ride-count chart for the selected stations
plot_rides_top3(rides=ts_combined, top_station_ids=top_station_ids)

In [28]:
# Persist the combined per-station hourly series as one parquet file,
# creating the target directory first if it does not exist yet.
output_dir = Path("..", "data", "processed")
output_dir.mkdir(parents=True, exist_ok=True)

# File name carries the year and zero-padded month of the data
output_path = output_dir.joinpath(f"citi_bike_ts_top3_{year}_{month:02}.parquet")
ts_combined.to_parquet(output_path, engine='pyarrow', index=False)
print(f"Saved combined file: {output_path}")

Saved combined file: ../data/processed/citi_bike_ts_top3_2024_01.parquet
