Feature Dictionary
- `n_features` = number of hours per day.

In [46]:
# import libraries
from src.libs import *

# retina display
%config InlineBackend.figure_format = 'retina'

# lux-api details increase the size of the visualizations
lux.config.default_display = "lux-widget"
lux.config.plotting_backend = "plotly"
lux.config.default_display_size = "large"



Option 1 dataset

In [47]:
# using option 1 dataset
ts_data_option_1 = pd.read_parquet('../data/transformed/option_1_rides_per_hour_ts_2022_01.parquet')
ts_data_option_1

Button(description='Toggle Pandas/Lux', layout=Layout(top='5px', width='140px'), style=ButtonStyle())

Output()

In [48]:
# using Central Park location
ts_data_one_location = ts_data_option_1.loc[ts_data_option_1.pickup_location_id == 43,:].reset_index(drop=True)
ts_data_one_location.head()

Button(description='Toggle Pandas/Lux', layout=Layout(top='5px', width='140px'), style=ButtonStyle())

Output()

In [49]:
# function to create features and targets

def get_cutoff_indices(
        data: pd.DataFrame,
        n_features: int, # hours
        step_size: int,
) -> list:
    """
    This function returns the indices of the cutoff date in the dataframe.

    Input: pd.DataFrame, n_features(12 h for now), step_size (default 1 day,could be extended)
    Output: list of indices
    """
    # init
    stop_position = len(data) - 1

    # start the first sub-sequence at the beginning of the data
    subseq_first_idx = 0
    subseq_mid_idx = n_features
    subseq_last_idx = n_features + 1
    indices = []

    # loop over the data
    while subseq_last_idx <= stop_position:
        # get the indices
        indices.append((subseq_first_idx, subseq_mid_idx, subseq_last_idx))
        
        # update the indices
        subseq_first_idx += step_size
        subseq_mid_idx += step_size
        subseq_last_idx += step_size

    return indices

In [50]:
# check the output
n_features = 24 # 24 hours
step_size = 1   # 1 day

indices = get_cutoff_indices(ts_data_one_location, n_features, step_size)

In [51]:
indices[:5]

[(0, 24, 25), (1, 25, 26), (2, 26, 27), (3, 27, 28), (4, 28, 29)]

In [52]:
# implement the slicing using the indices

n_examples = len(indices)

# shape is number of examples(len of the list), number of features (columns)
X = np.ndarray(shape=(n_examples, n_features), dtype=np.float32)

# 1D vector with number of rows = number of examples
y = np.ndarray(shape=(n_examples), dtype=np.float32)
pickup_hours = []

# loop over the indices to generate the values for features and targets
for i, idx in enumerate(indices):
    # get the features
    # extracting the first and the second index of the cutoff indices
    X[i, :] = ts_data_one_location.iloc[idx[0]:idx[1]]['rides'].values

    # get the target
    # extracting the second and the third index of the cutoff indices (mid and right side ones)
    y[i] = ts_data_one_location.iloc[idx[1]:idx[2]]['rides'].values

    # store the pickup hours
    pickup_hours.append(ts_data_one_location.iloc[idx[1]]['pickup_hour'])

In [53]:
# print the shape of the features and targets
print(f'X shape: {X.shape}')
print(f'y shape: {y.shape}')
print(f'{X=}')
print(f'pickup hours: {pickup_hours[:5]=}')

X shape: (719, 24)
y shape: (719,)
X=array([[ 97.,  60.,  22., ...,  16.,  18.,   6.],
       [ 60.,  22.,   8., ...,  18.,   6.,   3.],
       [ 22.,   8.,   6., ...,   6.,   3.,   1.],
       ...,
       [ 28.,  16.,  13., ..., 102.,  66.,  61.],
       [ 16.,  13.,   8., ...,  66.,  61.,  73.],
       [ 13.,   8.,   1., ...,  61.,  73.,  33.]], dtype=float32)
pickup hours: pickup_hours[:5]=[Timestamp('2022-01-02 00:00:00'), Timestamp('2022-01-02 01:00:00'), Timestamp('2022-01-02 02:00:00'), Timestamp('2022-01-02 03:00:00'), Timestamp('2022-01-02 04:00:00')]


The output above should be the same is in the cell "# using Central Park location". See timestamps and their position within the output - they should match.

In [54]:
# convert numpy arrays to pandas dataframes for X values
ts_data_one_location = pd.DataFrame(
    X, 
    columns=[f'rides_previous_{i+1}_hour' for i in reversed(range(n_features))])
ts_data_one_location

Button(description='Toggle Pandas/Lux', layout=Layout(top='5px', width='140px'), style=ButtonStyle())

Output()

In [55]:
# transform the target into a dataframe
targets_ts_data_one_location = pd.DataFrame(y, columns=[f'target_rides_next_hour'])
targets_ts_data_one_location

Button(description='Toggle Pandas/Lux', layout=Layout(top='5px', width='140px'), style=ButtonStyle())

Output()

In [56]:
from tqdm import *

In [63]:
# define a function that transforms the entire dataset into features and targets for all locations

def transform_ts_data_into_features_and_targets(
        ts_data: pd.DataFrame,
        input_seq_len: int, # number of features
        step_size: int,
) -> pd.DataFrame:
        """
        This function transforms the time series data into features and targets.
        Slices and transposes datafrom time series to supervised learning problem(features and targets).
        """
        assert set(ts_data.columns) == {
                'pickup_location_id',
                  'pickup_hour', 
                  'rides'},'The columns of the dataframe are not correct.'


        # init
        location_ids = ts_data.pickup_location_id.unique()
        features = pd.DataFrame()
        targets = pd.DataFrame()

        # loop over the locations
        for location_id in tqdm(location_ids):
                # keep only ts data for this location id
                ts_data_one_location = ts_data.loc[ts_data.pickup_location_id == location_id,
                                                   ['pickup_hour', 'rides']]#.reset_index(drop=True)
                # pre-compute cutoff indices to split dataframe rows
                indices = get_cutoff_indices(ts_data_one_location, input_seq_len, step_size)

                # slide and transpose the data into numpy arrays for features and targets
                n_examples = len(indices)
                X = np.ndarray(shape=(n_examples, input_seq_len), dtype=np.float32)
                y = np.ndarray(shape=(n_examples), dtype=np.float32)
                pickup_hours = []

                # loop over the indices to generate the values for features and targets
                for i, idx in enumerate(indices):
                        # get the features
                        # extracting the first and the second index of the cutoff indices
                        X[i, :] = ts_data_one_location.iloc[idx[0]:idx[1]]['rides'].values

                        # get the target
                        # extracting the second and the third index of the cutoff indices (mid and right side ones)
                        y[i] = ts_data_one_location.iloc[idx[1]:idx[2]]['rides'].values

                        # store the pickup hours
                        pickup_hours.append(ts_data_one_location.iloc[idx[1]]['pickup_hour'])

                # numpy -> pandas for X
                # convert numpy arrays to pandas dataframes for X values
                ts_data_one_location = pd.DataFrame(
                X, 
                columns=[f'rides_previous_{i+1}_hour' for i in reversed(range(input_seq_len))])
                ts_data_one_location
                
                # numpy -> pandas for y
                # transform the target into a dataframe
                targets_ts_data_one_location = pd.DataFrame(y, columns=[f'target_rides_next_hour'])
                targets_ts_data_one_location
                
                # concat the results from the previous iterations
                features = pd.concat([features, ts_data_one_location], axis=0)
                targets = pd.concat([targets, targets_ts_data_one_location], axis=0)
                
        # remove the index to make the output consistent
        features.reset_index(drop=True, inplace=True)
        targets.reset_index(drop=True, inplace=True)

        return features, targets['target_rides_next_hour']

In [64]:
# calling the function
features, targets = transform_ts_data_into_features_and_targets(
    ts_data_option_1,
    input_seq_len=24*7*1, # 1 week of history data
    step_size=24, # 24 hours \ 1 day
)

print(f'features shape: {features.shape}')
print(f'targets shape: {targets.shape}')

100%|██████████| 257/257 [00:02<00:00, 115.20it/s]

features shape: (6168, 168)
targets shape: (6168,)





In [None]:
# # parallelize the function transform_ts_data_into_features_and_targets
# # using joblib
# from joblib import Parallel, delayed