# 📕 03 - Transform Time Series Data into Features and Targets

## Introduction

In the previous steps we structured the raw data into a time-series format. The next step is to turn that aggregated dataset into features and targets. This transformation sets up the machine learning work that follows by giving us a well-defined problem (what we predict) and a well-defined set of inputs (what we predict it from).

## Notebook Overview:

1. **Data Subset**: Extract data for a specific location.
2. **Utility Development**: Create a function to identify cutoff indices.
3. **Indexing Strategy**: Define the start, middle, and end indices for each sub-sequence.
4. **Data Transformation**: Convert time series data into features and targets.



In [1]:
# import libraries
import pandas as pd
import numpy as np
from tqdm import tqdm
from typing import Tuple

# Read the rides time series from its parquet file and display it
ts_data_path = '../data/transformed/ts_data_rides_2022_01.parquet'
ts_data = pd.read_parquet(ts_data_path)
ts_data

Unnamed: 0,pickup_hour,rides,pickup_location_id
0,2022-01-01 00:00:00,11,4
1,2022-01-01 01:00:00,15,4
2,2022-01-01 02:00:00,26,4
3,2022-01-01 03:00:00,8,4
4,2022-01-01 04:00:00,9,4
...,...,...,...
191203,2022-01-31 19:00:00,0,176
191204,2022-01-31 20:00:00,0,176
191205,2022-01-31 21:00:00,0,176
191206,2022-01-31 22:00:00,0,176


In [2]:
# Keep only the rows recorded for pickup location 43 and renumber them from 0
is_location_43 = ts_data['pickup_location_id'] == 43
ts_data_one_location = ts_data[is_location_43].reset_index(drop=True)
ts_data_one_location.head(15)

Unnamed: 0,pickup_hour,rides,pickup_location_id
0,2022-01-01 00:00:00,97,43
1,2022-01-01 01:00:00,60,43
2,2022-01-01 02:00:00,22,43
3,2022-01-01 03:00:00,8,43
4,2022-01-01 04:00:00,6,43
5,2022-01-01 05:00:00,5,43
6,2022-01-01 06:00:00,3,43
7,2022-01-01 07:00:00,10,43
8,2022-01-01 08:00:00,7,43
9,2022-01-01 09:00:00,19,43


In [3]:
def get_cutoff_indices(
        data: pd.DataFrame,
        n_features: int,
        step_size: int
) -> list:
    """
    Compute the (first, mid, last) index triplets of every sliding window over a time series.

    For each window, ``first`` is the position where the window starts, ``mid`` equals
    ``first + n_features`` and ``last`` equals ``mid + 1``. The first window starts at
    position 0 and each following window is shifted by ``step_size`` positions. A window is
    emitted only while ``last <= len(data) - 1``.

    :param data: A pandas DataFrame containing the time series data (only its length is used).
    :param n_features: An integer representing the number of features to use in each sub-sequence.
    :param step_size: An integer representing the step size between each sub-sequence. Must be >= 1.
    :return: A list of tuples containing the start, middle and end indices of each sub-sequence.
    :raises ValueError: If ``step_size`` is smaller than 1, because the window would never
        advance and the indices could not be enumerated.
    """
    if step_size < 1:
        raise ValueError(f"step_size must be a positive integer, got {step_size}")

    stop_position = len(data) - 1

    # `last = first + n_features + 1` must not exceed `stop_position`, hence the
    # exclusive upper bound `stop_position - n_features` on `first`.
    return [
        (first_idx, first_idx + n_features, first_idx + n_features + 1)
        for first_idx in range(0, stop_position - n_features, step_size)
    ]


In [4]:
# Window configuration: 24 past values per example, sliding one row at a time
n_features, step_size = 24, 1

# Build the (first, mid, last) index triplets for the selected location
indices = get_cutoff_indices(
    data=ts_data_one_location,
    n_features=n_features,
    step_size=step_size,
)
n_examples = len(indices)

# Peek at the first five triplets
indices[:5]

[(0, 24, 25), (1, 25, 26), (2, 26, 27), (3, 27, 28), (4, 28, 29)]

In [5]:
# Pre-allocate one feature row and one target value per window
x = np.empty((n_examples, n_features), dtype=np.float32)
y = np.empty(n_examples, dtype=np.float32)
pickup_hours = []

# Pull the two columns out once instead of slicing the DataFrame on every iteration
rides = ts_data_one_location['rides'].values
hours = ts_data_one_location['pickup_hour']

# Fill features, target and target timestamp for every window of the selected location
for row, bounds in enumerate(indices):
    first, mid = bounds[0], bounds[1]
    x[row, :] = rides[first:mid]            # the n_features values preceding position `mid`
    y[row] = rides[mid]                     # the value at position `mid` is the target
    pickup_hours.append(hours.iloc[mid])    # timestamp of the target row

In [6]:
# Display shape and values of the features and targets for validation.
# The `=` specifier inside each f-string prints the expression text followed by its repr,
# e.g. `x.shape=(..., ...)`.
print(f"{x.shape=}")
print(f"{x=}")
print(f"{y=}")
# Only the first five target timestamps are shown
print(f"{pickup_hours[:5]=}")

x.shape=(719, 24)
x=array([[ 97.,  60.,  22., ...,  16.,  18.,   6.],
       [ 60.,  22.,   8., ...,  18.,   6.,   3.],
       [ 22.,   8.,   6., ...,   6.,   3.,   1.],
       ...,
       [ 28.,  16.,  13., ..., 102.,  66.,  61.],
       [ 16.,  13.,   8., ...,  66.,  61.,  73.],
       [ 13.,   8.,   1., ...,  61.,  73.,  33.]], dtype=float32)
y=array([  3.,   1.,   1.,   0.,   0.,   3.,   4.,  11.,  19.,  31.,  44.,
        61.,  80., 114., 125., 122., 143.,  86.,  34.,  29.,  25.,  23.,
        25.,   7.,   1.,   0.,   0.,   1.,   0.,   3.,  12.,  29.,  62.,
        72.,  78.,  81.,  97., 112., 138., 149., 138., 117.,  50.,  36.,
        23.,  15.,  24.,   8.,   3.,   0.,   0.,   0.,   2.,   3.,   6.,
        43.,  90.,  89.,  98.,  95.,  84.,  97., 135., 129., 156., 132.,
        48.,  45.,  17.,  25.,  21.,  16.,   2.,   5.,   0.,   0.,   1.,
         6.,  10.,  44.,  93.,  76., 121., 132.,  81.,  83., 106., 122.,
       114.,  98.,  64.,  56.,  36.,  27.,  20.,   9.,   4.,   2.,

In [7]:
# Label the columns from the oldest value (rides_previous_<n_features>_hour)
# down to the most recent one (rides_previous_1_hour)
feature_columns = [f'rides_previous_{i+1}_hour' for i in range(n_features - 1, -1, -1)]

# Wrap the feature matrix in a dataframe with those labels
features_one_location = pd.DataFrame(x, columns=feature_columns)

features_one_location

Unnamed: 0,rides_previous_24_hour,rides_previous_23_hour,rides_previous_22_hour,rides_previous_21_hour,rides_previous_20_hour,rides_previous_19_hour,rides_previous_18_hour,rides_previous_17_hour,rides_previous_16_hour,rides_previous_15_hour,...,rides_previous_10_hour,rides_previous_9_hour,rides_previous_8_hour,rides_previous_7_hour,rides_previous_6_hour,rides_previous_5_hour,rides_previous_4_hour,rides_previous_3_hour,rides_previous_2_hour,rides_previous_1_hour
0,97.0,60.0,22.0,8.0,6.0,5.0,3.0,10.0,7.0,19.0,...,70.0,94.0,87.0,73.0,34.0,32.0,22.0,16.0,18.0,6.0
1,60.0,22.0,8.0,6.0,5.0,3.0,10.0,7.0,19.0,24.0,...,94.0,87.0,73.0,34.0,32.0,22.0,16.0,18.0,6.0,3.0
2,22.0,8.0,6.0,5.0,3.0,10.0,7.0,19.0,24.0,39.0,...,87.0,73.0,34.0,32.0,22.0,16.0,18.0,6.0,3.0,1.0
3,8.0,6.0,5.0,3.0,10.0,7.0,19.0,24.0,39.0,35.0,...,73.0,34.0,32.0,22.0,16.0,18.0,6.0,3.0,1.0,1.0
4,6.0,5.0,3.0,10.0,7.0,19.0,24.0,39.0,35.0,77.0,...,34.0,32.0,22.0,16.0,18.0,6.0,3.0,1.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
714,52.0,36.0,28.0,16.0,13.0,8.0,1.0,1.0,2.0,1.0,...,78.0,74.0,66.0,91.0,117.0,100.0,106.0,147.0,121.0,102.0
715,36.0,28.0,16.0,13.0,8.0,1.0,1.0,2.0,1.0,1.0,...,74.0,66.0,91.0,117.0,100.0,106.0,147.0,121.0,102.0,66.0
716,28.0,16.0,13.0,8.0,1.0,1.0,2.0,1.0,1.0,4.0,...,66.0,91.0,117.0,100.0,106.0,147.0,121.0,102.0,66.0,61.0
717,16.0,13.0,8.0,1.0,1.0,2.0,1.0,1.0,4.0,9.0,...,91.0,117.0,100.0,106.0,147.0,121.0,102.0,66.0,61.0,73.0


In [8]:
# Wrap the target vector in a single-column dataframe
targets_one_location = pd.DataFrame({'target_rides_next_hour': y})

targets_one_location

Unnamed: 0,target_rides_next_hour
0,3.0
1,1.0
2,1.0
3,0.0
4,0.0
...,...
714,66.0
715,61.0
716,73.0
717,33.0


For our taxi demand prediction task, we transform our time series dataset into features and targets suitable for training machine learning models. This involves creating sequences or "windows" of historical data as features and the subsequent hour's data as the target.

Parameters used:
- **`ts_data`**: The entire time series dataset.
- **`n_features`**: This specifies the length of the historical window we are considering. A value of `24*7` means we are using the past week (7 days, with 24 hours each) of data to predict the taxi demand of the subsequent hour.
- **`step_size`**: The number of hours the window moves forward for each subsequent data point. A value of `24` means we move forward by a day for each step, creating a new feature set for every day in our dataset.

The output consists of two parts:
- **`features`**: This contains the sequences of historical data. Each row represents a sequence of the past week's taxi demand data, and each column in that row corresponds to an hour's data.
- **`targets`**: This contains the actual taxi demand values for the hour immediately following each historical sequence in `features`.


In [9]:
def transform_ts_data_into_features_and_target(
    ts_data: pd.DataFrame,
    n_features: int,
    step_size: int
) -> Tuple[pd.DataFrame, pd.Series]:
    """
    Transforms a time series dataset into features and targets suitable for machine learning.

    Given a time series dataset with columns 'pickup_hour', 'rides' and 'pickup_location_id', this function
    extracts the data for each location, and for each location it creates a set of features and targets. The
    features are created by selecting a window of n_features consecutive rows of rides data, and the target is
    the number of rides in the row immediately following that window. The window is moved step_size rows at a
    time, creating multiple examples.

    Each feature row also carries the 'pickup_hour' of its target row and the 'pickup_location_id' it belongs to.

    NOTE(review): rows are used in the order they appear in ts_data; this presumably expects each location's
    rows to be sorted by 'pickup_hour' with one row per hour - confirm against the upstream step.

    :param ts_data: A pandas DataFrame containing the time series data.
    :param n_features: An integer representing the number of features to use in each sub-sequence.
    :param step_size: An integer representing the step size between each sub-sequence.
    :return: A tuple (features, targets): features is a pandas DataFrame with n_features
            'rides_previous_<k>_hour' columns plus 'pickup_hour' and 'pickup_location_id'; targets is a
            pandas Series named 'target_rides_next_hour', aligned row by row with features.
    """
    # check that the ts_data DataFrame has the expected columns
    assert set(ts_data.columns) == {'pickup_hour', 'rides', 'pickup_location_id'}

    # oldest value first: rides_previous_<n_features>_hour ... rides_previous_1_hour
    feature_columns = [f'rides_previous_{i+1}_hour' for i in reversed(range(n_features))]

    # per-location results are collected and concatenated once at the end, instead of
    # re-copying the accumulated frames on every iteration
    features_per_location = []
    targets_per_location = []

    for location_id in tqdm(ts_data["pickup_location_id"].unique()):
        ts_data_one_location = ts_data.loc[ts_data['pickup_location_id'] == location_id, ['pickup_hour', 'rides']]

        # pre-compute indices for one location
        indices = get_cutoff_indices(ts_data_one_location, n_features, step_size)
        n_examples = len(indices)

        # extract the columns once rather than slicing the DataFrame for every window
        rides = ts_data_one_location['rides'].to_numpy()
        hours = ts_data_one_location['pickup_hour']

        # transform time series data into features and targets
        x = np.empty((n_examples, n_features), dtype=np.float32)
        y = np.empty(n_examples)
        pickup_hours = []

        for i, idx in enumerate(indices):
            first_idx, mid_idx = idx[0], idx[1]
            x[i] = rides[first_idx:mid_idx]
            # The target is the row right after the feature window, i.e. position `mid_idx`.
            # It is the same row whose pickup_hour is stored below, so the timestamp and the
            # target always describe the same hour.
            y[i] = rides[mid_idx]
            pickup_hours.append(hours.iloc[mid_idx])

        features_one_location = pd.DataFrame(x, columns=feature_columns)
        features_one_location['pickup_hour'] = pickup_hours
        features_one_location['pickup_location_id'] = location_id

        targets_one_location = pd.DataFrame(y, columns=["target_rides_next_hour"])

        features_per_location.append(features_one_location)
        targets_per_location.append(targets_one_location)

    if not features_per_location:
        # no locations at all: return empty outputs with the expected column names
        empty_features = pd.DataFrame(columns=feature_columns + ['pickup_hour', 'pickup_location_id'])
        empty_targets = pd.Series(name='target_rides_next_hour', dtype=np.float64)
        return empty_features, empty_targets

    # ignore_index=True renumbers the rows 0..n-1 across all locations
    features = pd.concat(features_per_location, ignore_index=True)
    targets = pd.concat(targets_per_location, ignore_index=True)

    return features, targets['target_rides_next_hour']




In [10]:
# Apply the transformation to the entire dataset (every pickup location in ts_data)
features, targets = transform_ts_data_into_features_and_target(
    ts_data=ts_data,
    n_features=24*7*1,  # 168 past values per example (24 hours x 7 days x 1 week)
    step_size=24        # slide the window 24 rows between consecutive examples
)

# Display shapes for validation; features and targets should have the same number of rows
print(f'{features.shape=}')
print(f'{targets.shape=}')

100%|██████████| 257/257 [00:02<00:00, 118.21it/s]

features.shape=(6168, 170)
targets.shape=(6168,)





In [11]:
# Display the first few rows of the transformed features
# (head() with no argument returns the first 5 rows)
features.head()

Unnamed: 0,rides_previous_168_hour,rides_previous_167_hour,rides_previous_166_hour,rides_previous_165_hour,rides_previous_164_hour,rides_previous_163_hour,rides_previous_162_hour,rides_previous_161_hour,rides_previous_160_hour,rides_previous_159_hour,...,rides_previous_8_hour,rides_previous_7_hour,rides_previous_6_hour,rides_previous_5_hour,rides_previous_4_hour,rides_previous_3_hour,rides_previous_2_hour,rides_previous_1_hour,pickup_hour,pickup_location_id
0,11.0,15.0,26.0,8.0,9.0,7.0,3.0,1.0,0.0,3.0,...,2.0,3.0,3.0,7.0,4.0,4.0,7.0,10.0,2022-01-08,4
1,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,3.0,3.0,5.0,7.0,8.0,6.0,7.0,14.0,2022-01-09,4
2,0.0,1.0,0.0,0.0,1.0,1.0,1.0,3.0,2.0,3.0,...,6.0,4.0,3.0,5.0,1.0,1.0,1.0,0.0,2022-01-10,4
3,1.0,1.0,0.0,0.0,0.0,3.0,2.0,3.0,4.0,5.0,...,6.0,3.0,2.0,4.0,1.0,0.0,1.0,2.0,2022-01-11,4
4,0.0,0.0,0.0,0.0,0.0,0.0,3.0,4.0,1.0,2.0,...,1.0,6.0,3.0,2.0,3.0,2.0,4.0,1.0,2022-01-12,4
