In [2]:
# import necessary libraries
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from typing import List, Tuple, Optional

In [3]:
# Read the transformed rides time series from its parquet file and preview the first rows.
# NOTE(review): the file name says 2022_01 while the previewed rows are dated 2023-01 - confirm the name.
ts_data = pd.read_parquet(
    '../data/transformed/ts_data_rides_2022_01.parquet'
)
ts_data.head()

Unnamed: 0,pickup_datetime,rides,pickup_location_id
0,2023-01-01 00:00:00,19,4
1,2023-01-01 01:00:00,28,4
2,2023-01-01 02:00:00,43,4
3,2023-01-01 03:00:00,33,4
4,2023-01-01 04:00:00,12,4


In [5]:
# Show every row recorded for pickup location 43
ts_data[ts_data['pickup_location_id'] == 43]

Unnamed: 0,pickup_datetime,rides,pickup_location_id
9672,2023-01-01 00:00:00,93,43
9673,2023-01-01 01:00:00,81,43
9674,2023-01-01 02:00:00,30,43
9675,2023-01-01 03:00:00,15,43
9676,2023-01-01 04:00:00,4,43
...,...,...,...
10411,2023-01-31 19:00:00,81,43
10412,2023-01-31 20:00:00,49,43
10413,2023-01-31 21:00:00,44,43
10414,2023-01-31 22:00:00,35,43


In [8]:
# Keep only pickup location 43 and renumber its rows from 0 so that
# positional and label-based indexing agree in the cells below
ts_data_one_location = (
    ts_data[ts_data['pickup_location_id'] == 43]
    .reset_index(drop=True)
)
ts_data_one_location.head(30)

Unnamed: 0,pickup_datetime,rides,pickup_location_id
0,2023-01-01 00:00:00,93,43
1,2023-01-01 01:00:00,81,43
2,2023-01-01 02:00:00,30,43
3,2023-01-01 03:00:00,15,43
4,2023-01-01 04:00:00,4,43
5,2023-01-01 05:00:00,4,43
6,2023-01-01 06:00:00,4,43
7,2023-01-01 07:00:00,12,43
8,2023-01-01 08:00:00,12,43
9,2023-01-01 09:00:00,23,43


In [9]:
def get_cutoff_indices(
        data: pd.DataFrame,
        n_features: int,
        step_size: int
) -> List[Tuple[int, int, int]]:
    """
    Compute the positional cut-off indices that slice a time series into sliding-window sub-sequences.

    Each tuple is ``(first, mid, last)`` with ``mid == first + n_features`` and ``last == mid + 1``.
    The first window starts at position 0 and every following window is shifted by ``step_size``
    positions. Windows are emitted while ``last <= len(data) - 1``, so ``last`` is always a valid row
    position and can be used either as an exclusive slice end or as a direct positional index.
    As a consequence the final row of ``data`` is never selected as a ``mid`` position.

    :param data: A pandas DataFrame containing the time series data (only its length is used).
    :param n_features: An integer representing the number of features to use in each sub-sequence.
    :param step_size: An integer representing the step size between each sub-sequence; must be >= 1.
    :return: A list of ``(first, mid, last)`` tuples, empty when ``data`` is too short for one window.
    :raises ValueError: If ``step_size`` is smaller than 1 (the window would never advance).
    """
    if step_size < 1:
        # a non-positive step would never move the window forward and loop forever
        raise ValueError(f"step_size must be a positive integer, got {step_size}")

    # largest value the `last` index of a window is allowed to take
    stop_position = len(data) - 1

    # last = first + n_features + 1 <= stop_position  <=>  first < stop_position - n_features
    return [
        (first_idx, first_idx + n_features, first_idx + n_features + 1)
        for first_idx in range(0, stop_position - n_features, step_size)
    ]


In [10]:
# window length (number of past hours per example) and shift between consecutive windows
n_features, step_size = 24, 1

# compute the cut-off indices used to split the series into examples
indices = get_cutoff_indices(
    ts_data_one_location,
    n_features,
    step_size,
)
n_examples = len(indices)



In [11]:
# Preview the first ten (first, mid, last) cut-off tuples
indices[0:10]

[(0, 24, 25),
 (1, 25, 26),
 (2, 26, 27),
 (3, 27, 28),
 (4, 28, 29),
 (5, 29, 30),
 (6, 30, 31),
 (7, 31, 32),
 (8, 32, 33),
 (9, 33, 34)]

In [13]:
# Pre-allocate the feature matrix and target vector; every entry is overwritten in the loop below
x = np.empty((n_examples, n_features), dtype=np.float32)
y = np.empty(n_examples, dtype=np.float32)
pickup_hours = []

# Pull the two columns out once instead of building a row Series on every iteration
rides_one_location = ts_data_one_location['rides'].values
datetimes_one_location = ts_data_one_location['pickup_datetime']

# Each window [first, mid) becomes one feature row; the ride count at `mid` is its target
for row, (first_pos, mid_pos, _) in enumerate(indices):
    x[row, :] = rides_one_location[first_pos:mid_pos]
    y[row] = rides_one_location[mid_pos]
    pickup_hours.append(datetimes_one_location.iloc[mid_pos].hour)

In [14]:
# Sanity-check the arrays: feature matrix shape, raw values and the first target hours
print(
    f"{x.shape=}",
    f"{x=}",
    f"{y=}",
    f"{pickup_hours[:5]=}",
    sep="\n",
)

x.shape=(719, 24)
x=array([[ 93.,  81.,  30., ...,  41.,  18.,  13.],
       [ 81.,  30.,  15., ...,  18.,  13.,   2.],
       [ 30.,  15.,   4., ...,  13.,   2.,   0.],
       ...,
       [ 50.,  33.,  16., ..., 108.,  88.,  81.],
       [ 33.,  16.,  17., ...,  88.,  81.,  49.],
       [ 16.,  17.,   3., ...,  81.,  49.,  44.]], dtype=float32)
y=array([  2.,   0.,   2.,   2.,   0.,   1.,   5.,  10.,  16.,  29.,  62.,
        51., 112., 147., 191., 197., 202.,  97.,  45.,  29.,  29.,  24.,
        16.,  14.,   1.,   0.,   0.,   0.,   1.,   4.,  17.,  49.,  54.,
        53.,  93.,  87., 123., 141., 157., 180., 179., 102.,  75.,  39.,
        29.,  52.,  20.,  18.,   3.,   2.,   0.,   0.,   1.,   1.,  11.,
        51.,  50.,  60.,  99.,  96., 125., 136., 115., 160., 133., 107.,
       117.,  78.,  34.,  49.,  25.,  11.,   6.,   0.,   0.,   1.,   0.,
         2.,  10.,  62.,  62.,  67.,  86.,  95., 165., 193., 183., 232.,
       172., 125., 134.,  82.,  59.,  78.,  27.,  18.,  10.,   0.,

In [15]:
# Wrap the feature matrix in a DataFrame; columns run from the oldest lag (n_features hours back)
# down to the most recent one (1 hour back)
lag_column_names = [f'rides_previous_{lag}_hour' for lag in range(n_features, 0, -1)]
features_one_location = pd.DataFrame(x, columns=lag_column_names)

features_one_location

Unnamed: 0,rides_previous_24_hour,rides_previous_23_hour,rides_previous_22_hour,rides_previous_21_hour,rides_previous_20_hour,rides_previous_19_hour,rides_previous_18_hour,rides_previous_17_hour,rides_previous_16_hour,rides_previous_15_hour,...,rides_previous_10_hour,rides_previous_9_hour,rides_previous_8_hour,rides_previous_7_hour,rides_previous_6_hour,rides_previous_5_hour,rides_previous_4_hour,rides_previous_3_hour,rides_previous_2_hour,rides_previous_1_hour
0,93.0,81.0,30.0,15.0,4.0,4.0,4.0,12.0,12.0,23.0,...,106.0,120.0,104.0,65.0,39.0,35.0,32.0,41.0,18.0,13.0
1,81.0,30.0,15.0,4.0,4.0,4.0,12.0,12.0,23.0,37.0,...,120.0,104.0,65.0,39.0,35.0,32.0,41.0,18.0,13.0,2.0
2,30.0,15.0,4.0,4.0,4.0,12.0,12.0,23.0,37.0,41.0,...,104.0,65.0,39.0,35.0,32.0,41.0,18.0,13.0,2.0,0.0
3,15.0,4.0,4.0,4.0,12.0,12.0,23.0,37.0,41.0,103.0,...,65.0,39.0,35.0,32.0,41.0,18.0,13.0,2.0,0.0,2.0
4,4.0,4.0,4.0,12.0,12.0,23.0,37.0,41.0,103.0,97.0,...,39.0,35.0,32.0,41.0,18.0,13.0,2.0,0.0,2.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
714,99.0,74.0,50.0,33.0,16.0,17.0,3.0,1.0,0.0,1.0,...,67.0,73.0,97.0,106.0,107.0,109.0,96.0,107.0,156.0,108.0
715,74.0,50.0,33.0,16.0,17.0,3.0,1.0,0.0,1.0,1.0,...,73.0,97.0,106.0,107.0,109.0,96.0,107.0,156.0,108.0,88.0
716,50.0,33.0,16.0,17.0,3.0,1.0,0.0,1.0,1.0,2.0,...,97.0,106.0,107.0,109.0,96.0,107.0,156.0,108.0,88.0,81.0
717,33.0,16.0,17.0,3.0,1.0,0.0,1.0,1.0,2.0,16.0,...,106.0,107.0,109.0,96.0,107.0,156.0,108.0,88.0,81.0,49.0


In [16]:
# Wrap the target vector in a single-column DataFrame
targets_one_location = pd.DataFrame({'target_rides_next_hour': y})

targets_one_location

Unnamed: 0,target_rides_next_hour
0,2.0
1,0.0
2,2.0
3,2.0
4,0.0
...,...
714,88.0
715,81.0
716,49.0
717,44.0


In [17]:
def transform_ts_data_into_features_and_target(
    ts_data: pd.DataFrame,
    n_features: int,
    step_size: int
) -> Tuple[pd.DataFrame, pd.Series]:
    """
    Transforms a time series dataset into features and targets suitable for machine learning.

    Given a time series dataset with exactly the columns 'pickup_datetime', 'rides' and
    'pickup_location_id', this function processes each location separately. For every window returned
    by ``get_cutoff_indices`` it builds one example:

    * the features are the ``n_features`` consecutive 'rides' values of the window, named
      ``rides_previous_{n_features}_hour`` (oldest) ... ``rides_previous_1_hour`` (most recent);
    * ``pickup_datetime`` is the timestamp of the row that immediately follows the window;
    * the target is the 'rides' value of that same row, so target and ``pickup_datetime`` always
      describe the same observation.

    Rows are used in the order in which they appear in ``ts_data``.
    NOTE(review): this presumes each location's rows are already in chronological order with one row
    per hour - confirm against the upstream transformation.

    :param ts_data: A pandas DataFrame containing the time series data.
    :param n_features: An integer representing the number of features to use in each sub-sequence.
    :param step_size: An integer representing the step size between each sub-sequence.
    :return: A tuple ``(features, targets)``: a DataFrame with the lag columns plus 'pickup_datetime'
            and 'pickup_location_id', and a Series named 'target_rides_next_hour' with one value per
            feature row. Both are empty when no location yields a complete window.
    :raises AssertionError: If ``ts_data`` does not have exactly the expected columns.
    """
    # check that the ts_data DataFrame has the expected columns
    assert set(ts_data.columns) == {'pickup_datetime', 'rides', 'pickup_location_id'}

    target_name = 'target_rides_next_hour'
    # column order goes from the oldest lag down to the most recent one
    feature_columns = [f'rides_previous_{lag}_hour' for lag in range(n_features, 0, -1)]

    # collect per-location results and concatenate once at the end; concatenating inside the loop
    # would copy the accumulated frame on every iteration
    features_per_location = []
    targets_per_location = []

    for location_id in tqdm(ts_data['pickup_location_id'].unique()):
        ts_data_one_location = ts_data.loc[
            ts_data['pickup_location_id'] == location_id, ['pickup_datetime', 'rides']
        ]

        # pre-compute indices for one location
        indices = get_cutoff_indices(ts_data_one_location, n_features, step_size)
        n_examples = len(indices)
        if n_examples == 0:
            # series too short for a single window: this location contributes no rows
            continue

        # transform time series data into features and targets
        rides = ts_data_one_location['rides'].to_numpy()
        x = np.empty(shape=(n_examples, n_features), dtype=np.float32)
        y = np.empty(shape=(n_examples,))

        for i, (first_idx, mid_idx, _) in enumerate(indices):
            x[i] = rides[first_idx:mid_idx]
            # the target is the hour right after the window, i.e. the same row whose
            # timestamp is stored in 'pickup_datetime' below
            y[i] = rides[mid_idx]

        target_positions = [mid_idx for _, mid_idx, _ in indices]
        pickup_datetimes = (
            ts_data_one_location['pickup_datetime']
            .iloc[target_positions]
            .reset_index(drop=True)  # align with the fresh 0..n-1 index of the feature frame
        )

        features_one_location = pd.DataFrame(x, columns=feature_columns)
        features_one_location['pickup_datetime'] = pickup_datetimes
        features_one_location['pickup_location_id'] = location_id

        features_per_location.append(features_one_location)
        targets_per_location.append(y)

    if not features_per_location:
        # nothing to concatenate: return empty containers with the expected column names
        empty_features = pd.DataFrame(
            columns=feature_columns + ['pickup_datetime', 'pickup_location_id']
        )
        return empty_features, pd.Series(name=target_name, dtype=np.float64)

    features = pd.concat(features_per_location, ignore_index=True)
    targets = pd.Series(np.concatenate(targets_per_location), name=target_name)

    return features, targets




In [18]:
# Build features and targets for every location, using one week (24 * 7 hours) of history per example
features, targets = transform_ts_data_into_features_and_target(
    ts_data=ts_data, n_features=24*7*1, step_size=1
)

# Report both shapes so the row counts can be compared at a glance
print(f'{features.shape=}', f'{targets.shape=}', sep='\n')

  0%|          | 0/257 [00:00<?, ?it/s]

features.shape=(147775, 170)
targets.shape=(147775,)


In [19]:
# Preview the first ten feature rows
features.iloc[:10]

Unnamed: 0,rides_previous_168_hour,rides_previous_167_hour,rides_previous_166_hour,rides_previous_165_hour,rides_previous_164_hour,rides_previous_163_hour,rides_previous_162_hour,rides_previous_161_hour,rides_previous_160_hour,rides_previous_159_hour,...,rides_previous_8_hour,rides_previous_7_hour,rides_previous_6_hour,rides_previous_5_hour,rides_previous_4_hour,rides_previous_3_hour,rides_previous_2_hour,rides_previous_1_hour,pickup_datetime,pickup_location_id
0,19.0,28.0,43.0,33.0,12.0,3.0,2.0,1.0,1.0,1.0,...,6.0,6.0,9.0,7.0,7.0,6.0,12.0,15.0,2023-01-08 00:00:00,4
1,28.0,43.0,33.0,12.0,3.0,2.0,1.0,1.0,1.0,2.0,...,6.0,9.0,7.0,7.0,6.0,12.0,15.0,35.0,2023-01-08 01:00:00,4
2,43.0,33.0,12.0,3.0,2.0,1.0,1.0,1.0,2.0,1.0,...,9.0,7.0,7.0,6.0,12.0,15.0,35.0,37.0,2023-01-08 02:00:00,4
3,33.0,12.0,3.0,2.0,1.0,1.0,1.0,2.0,1.0,3.0,...,7.0,7.0,6.0,12.0,15.0,35.0,37.0,33.0,2023-01-08 03:00:00,4
4,12.0,3.0,2.0,1.0,1.0,1.0,2.0,1.0,3.0,2.0,...,7.0,6.0,12.0,15.0,35.0,37.0,33.0,20.0,2023-01-08 04:00:00,4
5,3.0,2.0,1.0,1.0,1.0,2.0,1.0,3.0,2.0,2.0,...,6.0,12.0,15.0,35.0,37.0,33.0,20.0,5.0,2023-01-08 05:00:00,4
6,2.0,1.0,1.0,1.0,2.0,1.0,3.0,2.0,2.0,2.0,...,12.0,15.0,35.0,37.0,33.0,20.0,5.0,0.0,2023-01-08 06:00:00,4
7,1.0,1.0,1.0,2.0,1.0,3.0,2.0,2.0,2.0,4.0,...,15.0,35.0,37.0,33.0,20.0,5.0,0.0,0.0,2023-01-08 07:00:00,4
8,1.0,1.0,2.0,1.0,3.0,2.0,2.0,2.0,4.0,2.0,...,35.0,37.0,33.0,20.0,5.0,0.0,0.0,1.0,2023-01-08 08:00:00,4
9,1.0,2.0,1.0,3.0,2.0,2.0,2.0,4.0,2.0,5.0,...,37.0,33.0,20.0,5.0,0.0,0.0,1.0,4.0,2023-01-08 09:00:00,4
