In [1]:
# Notebook display setting: with ast_node_interactivity set to "all", IPython
# shows the result of every expression statement in a cell rather than only
# the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [3]:
import pandas as pd
from pathlib import Path
import pyarrow.parquet as pq

# Choose which monthly extract of the top-3-station Citi Bike time series to read.
year = 2024
month = 1

# The processed Parquet file lives one directory above the notebook, under
# data/processed; the month is zero-padded to two digits in the file name.
path = Path("..", "data", "processed", f"citi_bike_ts_top3_{year}_{month:02}.parquet")

# Read the file with pyarrow, convert it to a pandas DataFrame, and preview it.
ts_data = pq.read_table(path).to_pandas()
ts_data.head()

Unnamed: 0,hour,ride_count,station_id
0,2024-01-01 00:00:00,2,6140.05
1,2024-01-01 01:00:00,2,6140.05
2,2024-01-01 02:00:00,5,6140.05
3,2024-01-01 03:00:00,2,6140.05
4,2024-01-01 04:00:00,0,6140.05


In [5]:
# Take the first station ID in order of appearance and preview the first rows
# of that station's time series.
sample_station = ts_data["station_id"].unique()[0]
ts_data.loc[ts_data["station_id"].eq(sample_station)].head()

Unnamed: 0,hour,ride_count,station_id
0,2024-01-01 00:00:00,2,6140.05
1,2024-01-01 01:00:00,2,6140.05
2,2024-01-01 02:00:00,5,6140.05
3,2024-01-01 03:00:00,2,6140.05
4,2024-01-01 04:00:00,0,6140.05


In [6]:
import numpy as np

def transform_time_series_to_tabular(df, station_id, feature_col="ride_count", window_size=12, step_size=1, time_col=None):
    """
    Transforms time series data for a given station ID into a tabular format.

    Each output row holds ``window_size`` consecutive observations as features
    and the observation immediately following them as the target.

    Parameters:
        df (pd.DataFrame): Input time series data; must contain a "station_id"
            column and the ``feature_col`` column.
        station_id: Station ID to extract features for. It is compared with
            ``==`` against the "station_id" column, so it must have a type
            that matches the values stored there.
        feature_col (str): Column to use as features/target (default: ride_count).
        window_size (int): Number of past time steps to use as input (>= 1).
        step_size (int): Step size for sliding window (>= 1).
        time_col (str | None): Optional column to sort the station's rows by
            before windowing. When None (default), rows are used in the order
            they appear in ``df``, so the caller must supply them already in
            chronological order.

    Returns:
        pd.DataFrame: Tabular dataset with columns feature_1..feature_<window_size>
        followed by target.

    Raises:
        ValueError: If ``window_size`` or ``step_size`` is less than 1, or if
            the station has no more than ``window_size`` observations
            (including the case where the station ID is not present).
    """
    # Reject degenerate window parameters up front: a zero step would otherwise
    # surface as an unrelated range() error and a negative one would silently
    # produce an empty result.
    if window_size < 1:
        raise ValueError(f"window_size must be at least 1, got {window_size!r}.")
    if step_size < 1:
        raise ValueError(f"step_size must be at least 1, got {step_size!r}.")

    station_data = df[df["station_id"] == station_id]
    if time_col is not None:
        # Stable sort keeps the incoming order for rows with equal timestamps.
        station_data = station_data.sort_values(time_col, kind="stable")
    values = station_data[feature_col].to_numpy()

    if len(values) <= window_size:
        raise ValueError("Not enough data to create even one training example.")

    # Every window spans window_size features plus the one target that follows
    # them; taking every step_size-th window reproduces a strided sliding loop.
    windows = np.lib.stride_tricks.sliding_window_view(values, window_size + 1)[::step_size]

    columns = [f"feature_{i+1}" for i in range(window_size)] + ["target"]
    # sliding_window_view returns a read-only view; np.array() copies it so the
    # resulting DataFrame owns ordinary, writeable data.
    return pd.DataFrame(np.array(windows), columns=columns)


In [7]:
# Example usage: build the sliding-window feature table for the sampled
# station (12 lagged values per row, advancing one time step at a time).
features_df = transform_time_series_to_tabular(
    ts_data,
    station_id=sample_station,
    window_size=12,
    step_size=1,
)
features_df.head()

Unnamed: 0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,feature_11,feature_12,target
0,2,2,5,2,0,1,0,0,1,3,3,8,20
1,2,5,2,0,1,0,0,1,3,3,8,20,8
2,5,2,0,1,0,0,1,3,3,8,20,8,9
3,2,0,1,0,0,1,3,3,8,20,8,9,11
4,0,1,0,0,1,3,3,8,20,8,9,11,9
