In [2]:
# autoreload
%load_ext autoreload
%autoreload 2

# set current working directory
import os
os.chdir(os.path.dirname(os.getcwd()))

import src.config as config

In [3]:
# Connect to the Hopsworks feature store that holds the hourly ride counts.
import hopsworks

# Log in to the project named in the project config.
project = hopsworks.login(
    project=config.HOPSWORKS_PROJECT_NAME,
    api_key_value=config.HOPSWORKS_API_KEY,
)

# Handle to the project's feature store.
feature_store = project.get_feature_store()

# Fetch the feature group, creating it when it does not exist yet.
# The primary key must name columns that exist in the group: the stored columns
# are pickup_hour, rides and pickup_location_id, so a row is identified by
# (pickup_location_id, pickup_hour). The previous key listed a
# 'pickup_datetime' column that the group does not contain.
feature_group = feature_store.get_or_create_feature_group(
    name=config.FEATURE_GROUP_NAME,
    version=config.FEATURE_GROUP_VERSION,
    description='Time series data at hourly frequency',
    primary_key=['pickup_location_id', 'pickup_hour'],
    event_time='pickup_hour',
)

Connected. Call `.close()` to terminate connection gracefully.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/20648
Connected. Call `.close()` to terminate connection gracefully.


In [4]:
# Create the feature view on first run, then load it.
try:
    feature_store.create_feature_view(
        name=config.FEATURE_VIEW_NAME,
        version=config.FEATURE_VIEW_VERSION,
        query=feature_group.select_all()
    )
except Exception as creation_error:
    # Best-effort creation: the call raises when the view already exists, which
    # is the expected case on every run after the first. Catch Exception (not a
    # bare except, which would also trap KeyboardInterrupt) and show the actual
    # error so an authentication or network failure is not mistaken for
    # "already exists".
    print('Feature view already exists. Skipping creation.')
    print(f'create_feature_view raised: {creation_error!r}')

# Load the (new or pre-existing) feature view.
feature_view = feature_store.get_feature_view(config.FEATURE_VIEW_NAME, config.FEATURE_VIEW_VERSION)

Feature view already exists. Skipping creation.


In [5]:
# Read the feature view's rows. training_data() returns a pair; only the first
# element (the time-series rows) is kept, the second is discarded.
ts_data, _ = feature_view.training_data(description='Time series hourly taxi rides')

2023-03-05 15:12:27,073 INFO: USE `taxi_demand_1_featurestore`
2023-03-05 15:12:27,651 INFO: SELECT `fg0`.`pickup_hour` `pickup_hour`, `fg0`.`rides` `rides`, `fg0`.`pickup_location_id` `pickup_location_id`
FROM `taxi_demand_1_featurestore`.`time_series_hourly_feature_group_1` `fg0`




In [6]:
# Order rows by location, then chronologically within each location.
# The sorted frame is rebound to the name instead of sorting with
# inplace=True, so the object handed back by the feature view is never mutated.
ts_data = ts_data.sort_values(by=['pickup_location_id', 'pickup_hour'])
ts_data

Unnamed: 0,pickup_hour,rides,pickup_location_id
593229,2022-01-01 00:00:00,0,1
2132751,2022-01-01 01:00:00,0,1
1633383,2022-01-01 02:00:00,0,1
1514964,2022-01-01 03:00:00,0,1
926635,2022-01-01 04:00:00,1,1
...,...,...,...
151506,2023-03-05 11:00:00,8,265
131159,2023-03-05 12:00:00,8,265
159169,2023-03-05 13:00:00,5,265
171399,2023-03-05 14:00:00,11,265


In [7]:
from src.data import create_ts_dataset

# Turn the hourly series into a supervised dataset (feature rows plus targets).
features, targets = create_ts_dataset(
    ts_data,
    n_features=24 * 28,  # 28 days x 24 hours, i.e. roughly one month
    step_size=23,
)

# Attach the target as an extra column on a copy, leaving `features` untouched.
features_and_target = features.assign(target_rides_next_hour=targets)

print(f'{features_and_target.shape=}')

100%|██████████| 262/262 [11:11<00:00,  2.56s/it]


features_and_target.shape=(91079, 675)


In [8]:
from datetime import date, timedelta
from pytz import timezone
import pandas as pd
from src.data_split import train_test_split

# The cutoff sits 28 days before today:
#   training data: January 2022 up to the cutoff
#   test data:     the cutoff up to the current date
cutoff_date = pd.Timestamp(date.today() - timedelta(weeks=4))

print(f'{cutoff_date=}')

X_train, y_train, X_test, y_test = train_test_split(
    df=features_and_target,
    cutoff_date=cutoff_date,
    target_column_name='target_rides_next_hour',
)

# Report the size of each split, one per line.
print(
    f'{X_train.shape=}',
    f'{y_train.shape=}',
    f'{X_test.shape=}',
    f'{y_test.shape=}',
    sep='\n',
)

cutoff_date=Timestamp('2023-02-05 00:00:00')
X_train.shape=(83659, 674)
y_train.shape=(83659,)
X_test.shape=(7420, 674)
y_test.shape=(7420,)


In [9]:
import numpy as np

from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_absolute_error
import optuna

from src import model

# define objective function
def objective(trial: optuna.trial.Trial) -> float:
    '''Score one hyperparameter candidate for the Optuna study.

    Samples a set of hyperparameters from ``trial``, fits a fresh pipeline from
    ``model.get_pipeline`` on each of 4 ``TimeSeriesSplit`` folds of the
    notebook-level ``X_train`` / ``y_train``, and returns the mean of the
    per-fold validation mean absolute errors.

    NOTE(review): ``TimeSeriesSplit`` splits by row position, so the folds are
    only chronological if ``X_train`` is ordered by time -- confirm the row
    order produced upstream.
    '''
    # Search space for this trial.
    params = {
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.2, 1.0),
        "subsample": trial.suggest_float("subsample", 0.2, 1.0),
        "min_child_samples": trial.suggest_int("min_child_samples", 3, 100),
    }

    fold_errors = []
    splitter = TimeSeriesSplit(n_splits=4)
    for fit_rows, val_rows in splitter.split(X_train):
        # A new pipeline per fold, fitted on that fold's training rows only.
        fold_pipeline = model.get_pipeline(**params)
        fold_pipeline.fit(X_train.iloc[fit_rows], y_train.iloc[fit_rows])

        # Validation error on the rows held out for this fold.
        fold_predictions = fold_pipeline.predict(X_train.iloc[val_rows])
        fold_errors.append(mean_absolute_error(y_train.iloc[val_rows], fold_predictions))

    return np.mean(fold_errors)

In [10]:
import warnings
# Silence library warnings so the trial log stays readable. This filter is
# global and stays in effect for the rest of the kernel session.
warnings.filterwarnings('ignore')

# Hyperparameter search: minimise the cross-validated MAE returned by `objective`.
# The sampler is seeded so a re-run proposes the same sequence of trials; without
# a seed every execution explores different candidates and `best_params` cannot
# be reproduced. TPESampler is Optuna's default sampler, so only the seed is new.
# NOTE(review): results are fully repeatable only if the pipeline built by
# `model.get_pipeline` is itself deterministic -- confirm.
study = optuna.create_study(
    direction='minimize',
    study_name='lightgbm',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=10)

[32m[I 2023-03-05 15:29:18,582][0m A new study created in memory with name: lightgbm[0m
[32m[I 2023-03-05 15:29:35,970][0m Trial 0 finished with value: 3.178742708964517 and parameters: {'num_leaves': 47, 'colsample_bytree': 0.38716207796448154, 'subsample': 0.6145990540916767, 'min_child_samples': 96}. Best is trial 0 with value: 3.178742708964517.[0m
[32m[I 2023-03-05 15:30:08,747][0m Trial 1 finished with value: 3.166321722250217 and parameters: {'num_leaves': 168, 'colsample_bytree': 0.5282773144964918, 'subsample': 0.22403406940430717, 'min_child_samples': 27}. Best is trial 1 with value: 3.166321722250217.[0m
[32m[I 2023-03-05 15:30:35,125][0m Trial 2 finished with value: 3.1413514109126943 and parameters: {'num_leaves': 122, 'colsample_bytree': 0.39208717886899147, 'subsample': 0.33071017552318854, 'min_child_samples': 62}. Best is trial 2 with value: 3.1413514109126943.[0m
[32m[I 2023-03-05 15:30:50,896][0m Trial 3 finished with value: 3.152496742856738 and parame

In [11]:
# Hyperparameters of the best trial found by the study.
best_params = study.best_params
print(f'{best_params=}')

best_params={'num_leaves': 189, 'colsample_bytree': 0.8228890459776981, 'subsample': 0.2990720206912918, 'min_child_samples': 61}


In [12]:
# Build a fresh pipeline with the best hyperparameters found by the study and
# fit it on the whole training set (no cross-validation folds this time).
pipeline = model.get_pipeline(**best_params)
pipeline.fit(X_train, y_train)

In [13]:
# Mean absolute error of the refit pipeline on the held-out test rows.
predictions = pipeline.predict(X_test)
test_mae = mean_absolute_error(y_true=y_test, y_pred=predictions)
print(f'{test_mae=:.4f}')

test_mae=5.8021


In [14]:
# Persist the fitted pipeline to the project's models directory.
import joblib
from src.paths import MODELS_DIR

joblib.dump(value=pipeline, filename=MODELS_DIR / 'model.pkl')

['/Users/ani/Projects/1_taxi_demand_forecasting/models/model.pkl']

In [15]:
# Describe the model's inputs and outputs for the Hopsworks model registry.
from hsml.schema import Schema
from hsml.model_schema import ModelSchema

# Schemas are derived from the training features and the training target.
input_schema = Schema(X_train)
output_schema = Schema(y_train)
model_schema = ModelSchema(input_schema=input_schema, output_schema=output_schema)


In [16]:
# Upload the trained pipeline to the Hopsworks model registry.
model_registry = project.get_model_registry()

# The registry entry is bound to `registered_model`, not `model`: `model` is the
# `src.model` module imported earlier and used by `objective` and the refit
# cell. Rebinding it here made those cells fail with an AttributeError whenever
# they were re-run after this one.
registered_model = model_registry.sklearn.create_model(
    name='taxi_demand_forecaster_next_hour',
    metrics={'test_mae': test_mae},
    description='LightGBM model that predicts the number of taxi rides in the next hour',
    model_schema=model_schema,
    input_example=X_train.sample()
)

# Save the pickled pipeline file to the registry entry; as the last expression
# in the cell, the return value is displayed.
registered_model.save(MODELS_DIR / 'model.pkl')

Connected. Call `.close()` to terminate connection gracefully.


  0%|          | 0/6 [00:00<?, ?it/s]

Model created, explore it at https://c.app.hopsworks.ai:443/p/20648/models/taxi_demand_forecaster_next_hour/1


Model(name: 'taxi_demand_forecaster_next_hour', version: 1)