In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from typing import List, Tuple

In [2]:
path = "../data/transformed/"

In [3]:
def filter_by_location(df: pd.DataFrame, location: str, location_col: str = 'pickup_location_id') -> pd.DataFrame:
    # Filter the DataFrame for the specified location
    filtered_df = df[df[location_col] == location]
    return filtered_df

In [4]:
def slice_and_slide(data: pd.DataFrame, start_position: int, n_features: int, step_size: int, target_col: str = 'ride_count') -> List[Tuple[int, int, int]]:
    indices_and_targets = [
        (start, start + n_features, start + n_features + 1)
        for start in tqdm(range(start_position, len(data) - n_features - 1, step_size), desc="Slicing data")
        if (start + n_features + 1) < len(data)
    ]
    return indices_and_targets

In [16]:

def generate_training_set(data: pd.DataFrame, start_position: int, n_features: int, step_size: int, pickup_location_id: int = None, target_col: str = 'ride_count') -> pd.DataFrame:
    """
    Generates a training set DataFrame with features, pickup_time_hour, and rides_next_hour.

    Parameters:
    data (pd.DataFrame): The input DataFrame containing the data.
    start_position (int): The starting position for slicing the data.
    n_features (int): The number of features to extract.
    step_size (int): The step size for slicing the data.
    pickup_location_id (int, optional): The pickup location ID to filter the data. Defaults to None.
    target_col (str): The name of the target column. Defaults to 'ride_count'.

    Returns:
    pd.DataFrame: A DataFrame containing the features, pickup_time_hour, and rides_next_hour.
    """
    # Filter by location if provided
    if pickup_location_id is not None:
        data = data[data['pickup_location_id'] == pickup_location_id]

    # Extract features and targets
    indices_and_targets = slice_and_slide(data, start_position, n_features, step_size, target_col=target_col)
    X = np.array([data.iloc[start:end][target_col].values for start, end, target in tqdm(indices_and_targets, desc="Extracting features for X", colour='green')])
    y = np.array([data.iloc[end:target][target_col] for start, end, target in tqdm(indices_and_targets, desc="Extracting target for y", colour='green')])

    # Extract additional columns
    pickup_time_hours = [data.iloc[target]['pickup_time'] for start, end, target in indices_and_targets]
    pickup_location_ids = [data.iloc[target]['pickup_location_id'] for start, end, target in indices_and_targets]
    
    # Create combined DataFrame
    combined_df = pd.DataFrame(X, columns=[f'rides_previous_{i+1}_hour' for i in reversed(range(n_features))])
    combined_df['pickup_time'] = pickup_time_hours
    combined_df['pickup_location_id'] = pickup_location_ids
    combined_df['rides_next_hour'] = y

    return combined_df

In [6]:
# def generate_X_y_dataframes(data: pd.DataFrame, start_position: int, n_features: int, step_size: int, pickup_location_id: str = None, target_col: str = 'ride_count') -> Tuple[pd.DataFrame, pd.DataFrame]:
 
#     # Filter by location if provided
#     if pickup_location_id is not None:
#         data = data[data['pickup_location_id'] == pickup_location_id]

#     # Extract features for X
#     indices_and_targets = slice_and_slide(data, start_position, n_features, step_size, target_col=target_col)
#     X = np.array([data.iloc[start:end][target_col].values for start, end, target in tqdm(indices_and_targets, desc="Extracting features for X")])

#     # Extract target for y
#     y = np.array([data.iloc[end:target][target_col].values for start, end, target in tqdm(indices_and_targets, desc="Extracting target for y")])

#     # Create DataFrames
#     X_df = pd.DataFrame(X, columns=[f'rides_previous_{i+1}_hour' for i in reversed(range(n_features))])
#     y_df = pd.DataFrame(y, columns=['rides_next_hour'])

#     return X_df, y_df

In [7]:
ts_data = pd.read_parquet('/Users/borja/Documents/Somniumrema/projects/ml/taxi_demand_predictor/data/silver/2024/ts_data_2024_01.parquet')
ts_data.head()

Unnamed: 0,pickup_time,pickup_location_id,ride_count
0,2024-01-01,4,24.0
1,2024-01-01,7,4.0
2,2024-01-01,9,1.0
3,2024-01-01,10,6.0
4,2024-01-01,12,4.0


In [8]:
# Filter the dataset for a specific location, e.g., location 4
filtered_data = filter_by_location(ts_data, location=43)
filtered_data[:25]

Unnamed: 0,pickup_time,pickup_location_id,ride_count
22,2024-01-01 00:00:00,43,156.0
282,2024-01-01 01:00:00,43,85.0
542,2024-01-01 02:00:00,43,36.0
802,2024-01-01 03:00:00,43,14.0
1062,2024-01-01 04:00:00,43,5.0
1322,2024-01-01 05:00:00,43,3.0
1582,2024-01-01 06:00:00,43,4.0
1842,2024-01-01 07:00:00,43,12.0
2102,2024-01-01 08:00:00,43,10.0
2362,2024-01-01 09:00:00,43,14.0


In [9]:
# Create slices with 12 previous hours as features
rolling_window = slice_and_slide(filtered_data, 0, 24*7, 1)
rolling_window


Slicing data: 100%|██████████| 575/575 [00:00<00:00, 1879754.33it/s]


[(0, 168, 169),
 (1, 169, 170),
 (2, 170, 171),
 (3, 171, 172),
 (4, 172, 173),
 (5, 173, 174),
 (6, 174, 175),
 (7, 175, 176),
 (8, 176, 177),
 (9, 177, 178),
 (10, 178, 179),
 (11, 179, 180),
 (12, 180, 181),
 (13, 181, 182),
 (14, 182, 183),
 (15, 183, 184),
 (16, 184, 185),
 (17, 185, 186),
 (18, 186, 187),
 (19, 187, 188),
 (20, 188, 189),
 (21, 189, 190),
 (22, 190, 191),
 (23, 191, 192),
 (24, 192, 193),
 (25, 193, 194),
 (26, 194, 195),
 (27, 195, 196),
 (28, 196, 197),
 (29, 197, 198),
 (30, 198, 199),
 (31, 199, 200),
 (32, 200, 201),
 (33, 201, 202),
 (34, 202, 203),
 (35, 203, 204),
 (36, 204, 205),
 (37, 205, 206),
 (38, 206, 207),
 (39, 207, 208),
 (40, 208, 209),
 (41, 209, 210),
 (42, 210, 211),
 (43, 211, 212),
 (44, 212, 213),
 (45, 213, 214),
 (46, 214, 215),
 (47, 215, 216),
 (48, 216, 217),
 (49, 217, 218),
 (50, 218, 219),
 (51, 219, 220),
 (52, 220, 221),
 (53, 221, 222),
 (54, 222, 223),
 (55, 223, 224),
 (56, 224, 225),
 (57, 225, 226),
 (58, 226, 227),
 (59, 2

In [10]:
# # Extract features (X), targets (y), and pickup hours from the DataFrame
# X = np.array([filtered_data.iloc[start:end]['ride_count'].values for start, end, target in rolling_window])
# y = np.array([filtered_data.iloc[end:target]['ride_count'].values for start, end, target in rolling_window])
# pickup_hours = [filtered_data.iloc[end]['pickup_time_hour'] for start, end, target in rolling_window]

In [19]:
xyz = generate_training_set(ts_data, start_position = 0, n_features = 24*7, step_size = 1, pickup_location_id=43, target_col='ride_count')



Slicing data: 100%|██████████| 575/575 [00:00<00:00, 1392450.81it/s]
Extracting features for X: 100%|[32m██████████[0m| 575/575 [00:00<00:00, 13512.65it/s]
Extracting target for y: 100%|[32m██████████[0m| 575/575 [00:00<00:00, 33017.88it/s]


In [12]:
xyz[xyz['pickup_location_id'] == 43]

Unnamed: 0,rides_previous_168_hour,rides_previous_167_hour,rides_previous_166_hour,rides_previous_165_hour,rides_previous_164_hour,rides_previous_163_hour,rides_previous_162_hour,rides_previous_161_hour,rides_previous_160_hour,rides_previous_159_hour,...,rides_previous_7_hour,rides_previous_6_hour,rides_previous_5_hour,rides_previous_4_hour,rides_previous_3_hour,rides_previous_2_hour,rides_previous_1_hour,pickup_time,pickup_location_id,rides_next_hour
113,1.0,1.0,20.0,1.0,48.0,4.0,5.0,2.0,1.0,1.0,...,0.0,14.0,0.0,13.0,5.0,7.0,61.0,2024-01-01 01:00:00,43,27.0
373,0.0,3.0,44.0,1.0,66.0,3.0,3.0,2.0,1.0,0.0,...,0.0,7.0,0.0,7.0,9.0,3.0,31.0,2024-01-01 02:00:00,43,15.0
633,0.0,0.0,29.0,1.0,57.0,1.0,0.0,0.0,0.0,0.0,...,0.0,6.0,0.0,6.0,9.0,1.0,6.0,2024-01-01 03:00:00,43,12.0
893,0.0,0.0,17.0,0.0,68.0,0.0,4.0,0.0,0.0,1.0,...,0.0,2.0,1.0,14.0,15.0,1.0,5.0,2024-01-01 04:00:00,43,7.0
1153,0.0,1.0,4.0,0.0,33.0,1.0,6.0,0.0,0.0,1.0,...,0.0,0.0,1.0,14.0,10.0,0.0,4.0,2024-01-01 05:00:00,43,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
191993,0.0,0.0,13.0,0.0,50.0,0.0,1.0,0.0,0.0,0.0,...,0.0,4.0,0.0,0.0,0.0,1.0,8.0,2024-01-31 19:00:00,43,2.0
192253,0.0,0.0,3.0,0.0,46.0,0.0,0.0,1.0,1.0,0.0,...,0.0,2.0,0.0,0.0,0.0,1.0,5.0,2024-01-31 20:00:00,43,3.0
192513,0.0,0.0,16.0,0.0,43.0,0.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,5.0,2024-01-31 21:00:00,43,3.0
192773,0.0,0.0,9.0,0.0,62.0,0.0,0.0,0.0,0.0,0.0,...,0.0,2.0,0.0,0.0,1.0,0.0,5.0,2024-01-31 22:00:00,43,1.0
