In [2]:
import pandas as pd
from tqdm import tqdm

# Hourly ride counts per pickup location (columns: pickup_hour, rides, pickup_location_id).
ts_data = pd.read_parquet(path='../data/transformed/ts_data_2022_01.parquet')
ts_data

Unnamed: 0,pickup_hour,rides,pickup_location_id
0,2022-01-01 00:00:00,11,4
1,2022-01-01 01:00:00,15,4
2,2022-01-01 02:00:00,26,4
3,2022-01-01 03:00:00,8,4
4,2022-01-01 04:00:00,9,4
...,...,...,...
191203,2022-01-31 19:00:00,0,176
191204,2022-01-31 20:00:00,0,176
191205,2022-01-31 21:00:00,0,176
191206,2022-01-31 22:00:00,0,176


In [3]:
# Isolate a single pickup location (id 43) and renumber its rows from zero
# so the hourly series can be inspected on its own.
ts_data_one_location = (
    ts_data
    .loc[ts_data['pickup_location_id'].eq(43)]
    .reset_index(drop=True)
)
ts_data_one_location.head(25)

Unnamed: 0,pickup_hour,rides,pickup_location_id
0,2022-01-01 00:00:00,97,43
1,2022-01-01 01:00:00,60,43
2,2022-01-01 02:00:00,22,43
3,2022-01-01 03:00:00,8,43
4,2022-01-01 04:00:00,6,43
5,2022-01-01 05:00:00,5,43
6,2022-01-01 06:00:00,3,43
7,2022-01-01 07:00:00,10,43
8,2022-01-01 08:00:00,7,43
9,2022-01-01 09:00:00,19,43


In [4]:
def create_ts_dataset(data: pd.DataFrame, n_features: int, step_size: int) -> "tuple[pd.DataFrame, pd.Series]":
    """
    Build a sliding-window dataset of lagged ride counts with a next-row target.

    For every distinct ``pickup_location_id`` the rows of that location are
    taken in their existing order (the function does not sort; callers are
    expected to pass rows already ordered by ``pickup_hour``). Each sample
    consists of ``n_features`` consecutive ``rides`` values as features and
    the ``rides`` value of the row immediately after them as the target.
    Windows start at every ``step_size``-th row *of each location*, so the
    window phase never depends on how many rows other locations have.

    Parameters
    ----------
    data : pd.DataFrame
        Must contain the columns ``pickup_hour``, ``rides`` and
        ``pickup_location_id``.
    n_features : int
        Number of consecutive ``rides`` values used as features (>= 1).
    step_size : int
        Distance, in rows, between the starts of consecutive windows (>= 1).

    Returns
    -------
    features : pd.DataFrame
        Columns ``pickup_hour`` (taken from the target row),
        ``pickup_location_id``, then ``rides_previous_{n_features}_hour``
        down to ``rides_previous_1_hour``.
    target : pd.Series
        Named ``target_rides_next_hour``; row ``k`` is the target of
        ``features`` row ``k``.

    Raises
    ------
    ValueError
        If ``n_features`` or ``step_size`` is smaller than 1.
    """
    if n_features < 1:
        raise ValueError("n_features must be a positive integer")
    if step_size < 1:
        raise ValueError("step_size must be a positive integer")

    target_name = 'target_rides_next_hour'
    # Oldest observation first: rides_previous_<n>_hour ... rides_previous_1_hour.
    feature_names = [f'rides_previous_{n_features - i}_hour' for i in range(n_features)]

    frames = []
    for location_id in tqdm(data['pickup_location_id'].unique()):
        data_i = data[data['pickup_location_id'] == location_id].reset_index(drop=True)
        rides = data_i['rides']

        # Collect every column first and build the frame once. Inserting
        # columns one at a time fragments the frame, which is what used to
        # require silencing pandas' PerformanceWarning globally.
        columns = {
            # Timestamp of the row being predicted, i.e. the row after the window.
            'pickup_hour': data_i['pickup_hour'].shift(-n_features),
            'pickup_location_id': data_i['pickup_location_id'],
        }
        for i, name in enumerate(feature_names):
            columns[name] = rides.shift(-i)
        columns[target_name] = rides.shift(-n_features)

        # Subsample per location so each location's first window starts at
        # its own first row, whatever the lengths of the other locations.
        frames.append(pd.DataFrame(columns).iloc[::step_size])

    if not frames:
        # No locations at all: return empty outputs with the expected schema.
        empty_features = pd.DataFrame(columns=['pickup_hour', 'pickup_location_id', *feature_names])
        return empty_features, pd.Series(name=target_name, dtype='float64')

    # Drop incomplete windows on the combined frame so that features and
    # target always lose exactly the same rows and stay aligned.
    dataset = pd.concat(frames).dropna().reset_index(drop=True)
    return dataset.drop(columns=target_name), dataset[target_name]

In [5]:
# Window of 168 rows (one week of hourly data); a new window starts every 24 rows.
n_features = 7 * 24
step_size = 24

features, target = create_ts_dataset(
    data=ts_data,
    n_features=n_features,
    step_size=step_size,
)

100%|██████████| 257/257 [00:17<00:00, 14.55it/s]


In [6]:
# Display the feature frame returned by create_ts_dataset.
features

Unnamed: 0,pickup_hour,pickup_location_id,rides_previous_168_hour,rides_previous_167_hour,rides_previous_166_hour,rides_previous_165_hour,rides_previous_164_hour,rides_previous_163_hour,rides_previous_162_hour,rides_previous_161_hour,...,rides_previous_10_hour,rides_previous_9_hour,rides_previous_8_hour,rides_previous_7_hour,rides_previous_6_hour,rides_previous_5_hour,rides_previous_4_hour,rides_previous_3_hour,rides_previous_2_hour,rides_previous_1_hour
0,2022-01-08,4,11,15.0,26.0,8.0,9.0,7.0,3.0,1.0,...,4.0,2.0,2.0,3.0,3.0,7.0,4.0,4.0,7.0,10.0
1,2022-01-09,4,1,0.0,2.0,0.0,0.0,0.0,0.0,0.0,...,3.0,6.0,3.0,3.0,5.0,7.0,8.0,6.0,7.0,14.0
2,2022-01-10,4,0,1.0,0.0,0.0,1.0,1.0,1.0,3.0,...,2.0,1.0,6.0,4.0,3.0,5.0,1.0,1.0,1.0,0.0
3,2022-01-11,4,1,1.0,0.0,0.0,0.0,3.0,2.0,3.0,...,1.0,2.0,6.0,3.0,2.0,4.0,1.0,0.0,1.0,2.0
4,2022-01-12,4,0,0.0,0.0,0.0,0.0,0.0,3.0,4.0,...,1.0,4.0,1.0,6.0,3.0,2.0,3.0,2.0,4.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6163,2022-01-27,176,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6164,2022-01-28,176,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6165,2022-01-29,176,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6166,2022-01-30,176,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
# Display the target series returned by create_ts_dataset.
target

0       16.0
1       18.0
2        0.0
3        0.0
4        2.0
        ... 
6163     0.0
6164     0.0
6165     0.0
6166     0.0
6167     0.0
Name: target_rides_next_hour, Length: 6168, dtype: float64