In [1]:
# Configure IPython to display the result of every expression statement in a cell,
# not only the last one (IPython's default is 'last_expr')
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

In [2]:
# Import required libraries
import numpy as np
import pandas as pd
from pathlib import Path
import pyarrow.parquet as pq

In [3]:
# Load the processed taxi-rides time series for the selected year and month
data_year, data_month = 2023, 1

# Build the parquet file location relative to the notebook folder
time_series_taxi_rides_data_folder_path = Path('..', 'data', 'processed')
time_series_taxi_rides_data_file_path = time_series_taxi_rides_data_folder_path.joinpath(
    f'taxi_rides_processed_time_series_{data_year}_{data_month:02}.parquet'
)

# Read the parquet file as an Arrow table, then convert it to a pandas DataFrame
time_series_taxi_rides_table = pq.read_table(time_series_taxi_rides_data_file_path)
time_series_taxi_rides_data = time_series_taxi_rides_table.to_pandas()
time_series_taxi_rides_data.head()

Unnamed: 0,pickup_hour,pickup_location_id,num_of_rides
0,2023-01-01 00:00:00,2,0
1,2023-01-01 01:00:00,2,0
2,2023-01-01 02:00:00,2,0
3,2023-01-01 03:00:00,2,0
4,2023-01-01 04:00:00,2,0


In [None]:
# A function that converts data into feature-target table
def data_to_feature_target_table(taxi_rides_data, location_id, feature_col = 'num_of_rides', window_size = 12, step_size = 1) -> pd.DataFrame:
    '''
    Transforms time series data for a given location ID into a tabular format.
    The first `window_size` rows are used as features, and the next row is the target.
    The process slides down by `step_size` rows at a time to create the next set of features and target.

    Rows are consumed in the order in which they appear in `taxi_rides_data`; no sorting
    is performed here, so the caller must pass data that is already in chronological order.

    Parameters:
        taxi_rides_data (pd.DataFrame): The input DataFrame containing time series data.
                                        Must contain a `pickup_location_id` column and the `feature_col` column.
        location_id (int): The location ID to filter the data for.
        feature_col (str): The column name containing the values to use as features and target (default is "num_of_rides").
        window_size (int): The number of rows to use as features (default is 12). Must be at least 1.
        step_size (int): The number of rows to slide the window by (default is 1). Must be at least 1.

    Returns:
        pd.DataFrame: A transformed DataFrame where the first `window_size` columns
                      (`feature_1` ... `feature_<window_size>`) are features and the last column (`target`) is the target.

    Raises:
        ValueError: If `window_size` or `step_size` is smaller than 1, or if the given location
                    has `window_size` or fewer rows (not enough for one window plus its target).
    '''

    # Reject window/step values that cannot produce a meaningful sliding window
    if window_size < 1:
        raise ValueError('window_size must be a positive integer.')
    if step_size < 1:
        raise ValueError('step_size must be a positive integer.')

    # Filter data for the given location id and extract the feature column as a numpy array
    location_mask = taxi_rides_data['pickup_location_id'] == location_id
    feature_values = taxi_rides_data.loc[location_mask, feature_col].to_numpy()

    # Ensure there are enough rows to create at least one window
    if len(feature_values) <= window_size:
        raise ValueError('Not enough data to create even one window of features and target.')

    # Create the tabular data using a sliding window approach.
    # Each window starts at 0, step_size, 2 * step_size, ... and spans window_size + 1 values:
    # the first `window_size` values are features, and the next value is the target.
    window_starts = np.arange(0, len(feature_values) - window_size, step_size)
    window_offsets = np.arange(window_size + 1)
    # Broadcasting builds a (number of windows, window_size + 1) index grid; integer-array
    # indexing returns a fresh array, so the result does not share memory with the input
    feature_target_values = feature_values[window_starts[:, None] + window_offsets]

    # Convert the windows into a DataFrame
    feature_target_table_column_names = [f'feature_{feature_num + 1}' for feature_num in range(window_size)] + ['target']
    feature_target_table = pd.DataFrame(feature_target_values, columns = feature_target_table_column_names)

    return feature_target_table

In [5]:
# Build the feature-target table for one pickup location using a 24-row window
feature_target_data = data_to_feature_target_table(
    time_series_taxi_rides_data,
    location_id = 43,
    feature_col = 'num_of_rides',
    window_size = 24,
    step_size = 1,
)
feature_target_data.head()

Unnamed: 0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,...,feature_16,feature_17,feature_18,feature_19,feature_20,feature_21,feature_22,feature_23,feature_24,target
0,92,81,29,15,4,4,3,12,12,23,...,119,103,65,39,35,32,40,18,13,2
1,81,29,15,4,4,3,12,12,23,37,...,103,65,39,35,32,40,18,13,2,0
2,29,15,4,4,3,12,12,23,37,41,...,65,39,35,32,40,18,13,2,0,2
3,15,4,4,3,12,12,23,37,41,102,...,39,35,32,40,18,13,2,0,2,2
4,4,4,3,12,12,23,37,41,102,97,...,35,32,40,18,13,2,0,2,2,0
