In [9]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [22]:
# import libraries
from typing import Optional, List
from datetime import date, timedelta
import joblib

from pytz import timezone
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold, TimeSeriesSplit
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_absolute_error
import plotly.express as px 
import optuna
import hopsworks

from src.paths import MODELS_DIR
from src.data import transform_ts_data_into_features_and_target
from src.data_split import train_test_split
from src.model import get_pipeline
import src.config as config

import warnings
warnings.filterwarnings("ignore")

In [11]:
# conect to hopsworks feature store
project = hopsworks.login(
    # Hopsworks project name
    project=config.HOPSWORKS_PROJECT_NAME,
    # Hopsworks feature store name
    api_key_value=config.HOPSWORKS_API_KEY,
)

feature_store = project.get_feature_store()

# connect to feature group
feature_group = feature_store.get_feature_group(
    name=config.FEATURE_GROUP_NAME,
    version=config.FEATURE_GROUP_VERSION
)

Connection closed.
Connected. Call `.close()` to terminate connection gracefully.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/111769
Connected. Call `.close()` to terminate connection gracefully.


In [12]:
# create feature view if does not exist yet
try:
    feature_store.create_feature_view(
        name=config.FEATURE_VIEW_NAME,
        version=config.FEATURE_VIEW_VERSION,
        query=feature_group.select_all()
    )
except:
    print('Feature view already exists. Skipping creation.')

# get feature view
feature_view = feature_store.get_feature_view(
    name=config.FEATURE_VIEW_NAME,
    version=config.FEATURE_VIEW_VERSION
)

Feature view already exists. Skipping creation.


In [13]:
ts_data, _ = feature_view.training_data(
    description="Time-series hourly taxi rides"
)



2023-10-02 08:53:43,950 INFO: USE `taxi_demand_api_featurestore`
2023-10-02 08:53:44,951 INFO: SELECT `fg0`.`pickup_hour` `pickup_hour`, `fg0`.`rides` `rides`, `fg0`.`pickup_location_id` `pickup_location_id`
FROM `taxi_demand_api_featurestore`.`time_series_hourly_feature_group_1` `fg0`
WHERE `fg0`.`pickup_hour` >= TIMESTAMP '1970-01-01 12:16:40.000' AND `fg0`.`pickup_hour` < TIMESTAMP '2023-10-02 11:53:41.000'




In [18]:
print(ts_data.head(10))

ts_data.sort_values(by=['pickup_location_id', 'pickup_hour'], inplace=True)

                pickup_hour  rides  pickup_location_id
2818921 2022-01-01 00:00:00      0                   1
1222681 2022-01-01 01:00:00      0                   1
2552147 2022-01-01 02:00:00      0                   1
280548  2022-01-01 03:00:00      0                   1
3194290 2022-01-01 04:00:00      1                   1
3360201 2022-01-01 05:00:00      1                   1
1460822 2022-01-01 06:00:00      0                   1
395374  2022-01-01 07:00:00      2                   1
2156465 2022-01-01 08:00:00      0                   1
1615181 2022-01-01 09:00:00      0                   1


In [15]:
def plot_ts(ts_data: pd.DataFrame, locations: Optional[List[int]] = None):
    """
    Plot time-series data.

    Args:
    - ts_data: pandas DataFrame containing the time-series data to plot.
    - locations: optional list of integers representing the pickup location IDs to plot.

    Returns:
    - None
    """
    ts_data_to_plot = ts_data[ts_data.pickup_location_id.isin(locations)] if locations else ts_data

    fig = px.line(
        ts_data_to_plot,
        x="pickup_hour",
        y="rides",
        color='pickup_location_id',
        template='none',
    )

    fig.show()


In [16]:
plot_ts(ts_data, locations=[43])

In [23]:
features, targets = transform_ts_data_into_features_and_target(
    ts_data=ts_data,
    input_seq_len=24*28,
    step_size=1
)

100%|██████████| 265/265 [37:29<00:00,  8.49s/it]


In [24]:
features_and_targets = features.copy()

features_and_targets['target'] = targets

print(f'{features_and_targets.shape=}')

features_and_targets.shape=(3682175, 675)


In [25]:
features_and_targets.head(10)

Unnamed: 0,rides_previous_672_hour,rides_previous_671_hour,rides_previous_670_hour,rides_previous_669_hour,rides_previous_668_hour,rides_previous_667_hour,rides_previous_666_hour,rides_previous_665_hour,rides_previous_664_hour,rides_previous_663_hour,...,rides_previous_7_hour,rides_previous_6_hour,rides_previous_5_hour,rides_previous_4_hour,rides_previous_3_hour,rides_previous_2_hour,rides_previous_1_hour,pickup_hour,pickup_location_id,target
0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,2.0,0.0,0.0,...,2.0,0.0,1.0,0.0,0.0,0.0,0.0,2022-01-29 00:00:00,1,0.0
1,0.0,0.0,0.0,1.0,1.0,0.0,2.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,2022-01-29 01:00:00,1,0.0
2,0.0,0.0,1.0,1.0,0.0,2.0,0.0,0.0,1.0,2.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2022-01-29 02:00:00,1,0.0
3,0.0,1.0,1.0,0.0,2.0,0.0,0.0,1.0,2.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2022-01-29 03:00:00,1,0.0
4,1.0,1.0,0.0,2.0,0.0,0.0,1.0,2.0,1.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2022-01-29 04:00:00,1,0.0
5,1.0,0.0,2.0,0.0,0.0,1.0,2.0,1.0,5.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2022-01-29 05:00:00,1,0.0
6,0.0,2.0,0.0,0.0,1.0,2.0,1.0,5.0,5.0,13.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2022-01-29 06:00:00,1,0.0
7,2.0,0.0,0.0,1.0,2.0,1.0,5.0,5.0,13.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2022-01-29 07:00:00,1,0.0
8,0.0,0.0,1.0,2.0,1.0,5.0,5.0,13.0,3.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2022-01-29 08:00:00,1,0.0
9,0.0,1.0,2.0,1.0,5.0,5.0,13.0,3.0,3.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2022-01-29 09:00:00,1,0.0


In [31]:
# set training date from january 2022 up until april 2023
# and test date from april 2023 up until july 2023
training_end_date = pd.to_datetime(date(2023, 4, 1))

X_train, y_train, X_test, y_test = train_test_split(
    df=features_and_targets,
    cutoff_date=training_end_date,
    target_col_name='target'
)

# print shapes
print(f'{X_train.shape=}')
print(f'{y_train.shape=}')
print(f'{X_test.shape=}')
print(f'{y_test.shape=}')

X_train.shape=(2715720, 674)
y_train.shape=(2715720,)
X_test.shape=(966455, 674)
y_test.shape=(966455,)


In [32]:
def create_objective(X_train, y_train):
    def objective(trial: optuna.trial.Trial) -> float:
        """
        Train a model with a set of hyperparameters and compute and validation error based on a TimeSeriesSplit.
        """
        # define hyperparameters to tune
        hyperparams = {
            "metric": "mae",
            "verbose": -1,
            "num_leaves": trial.suggest_int("num_leaves", 2, 256),
            "feature_fraction": trial.suggest_float("feature_fraction", 0.2, 1.0),
            "bagging_fraction": trial.suggest_float("bagging_fraction", 0.2, 1.0),
            "min_child_samples": trial.suggest_int("min_child_samples", 3, 100),    
        }

        tss = TimeSeriesSplit(n_splits=2)
        scores = []
        for train_index, val_index in tss.split(X_train):
            X_train_, X_val_ = X_train.iloc[train_index], X_train.iloc[val_index]
            y_train_, y_val_ = y_train.iloc[train_index], y_train.iloc[val_index]

            # create pipeline
            pipeline = get_pipeline(**hyperparams)
            pipeline.fit(X_train_, y_train_)

            # evaluate the model
            y_pred = pipeline.predict(X_val_)
            mae = mean_absolute_error(y_val_, y_pred)
            scores.append(mae)

        # return the mean scores
        return np.array(scores).mean()

    return objective

In [33]:
study = optuna.create_study(direction="minimize")
study.optimize(create_objective(X_train, y_train), n_trials=1)

[I 2023-10-02 22:26:28,224] A new study created in memory with name: no-name-8483684e-b0d9-468f-97cc-bf13e8e98c4e




[I 2023-10-02 22:31:20,853] Trial 0 finished with value: 3.220451304802626 and parameters: {'num_leaves': 135, 'feature_fraction': 0.32910402197251526, 'bagging_fraction': 0.9008727736790323, 'min_child_samples': 8}. Best is trial 0 with value: 3.220451304802626.


In [34]:
best_params = study.best_trial.params
print(f'{best_params=}')

pipeline = get_pipeline(**best_params)
pipeline.fit(X_train, y_train)

predictions = pipeline.predict(X_test)
test_mae = mean_absolute_error(y_test, predictions)
print(f'{test_mae=:.4f}')

best_params={'num_leaves': 135, 'feature_fraction': 0.32910402197251526, 'bagging_fraction': 0.9008727736790323, 'min_child_samples': 8}


In [None]:
joblib.dump(pipeline, MODELS_DIR / 'model.pkl')

In [None]:
from schema import Schema
