In [24]:
%reload_ext autoreload
%autoreload 2

In [25]:
import src.config as config

In [26]:
import hopsworks

# Log in to the Hopsworks project named in the config module.
project = hopsworks.login(
    project=config.HOPSWORKS_PROJECT_NAME,
    api_key_value=config.HOPSWORKS_API_KEY,
)

# Handle to the project's feature store.
feature_store = project.get_feature_store()

# Fetch the hourly time-series feature group, creating it if it is missing.
feature_group = feature_store.get_or_create_feature_group(
    name=config.FEATURE_GROUP_NAME,
    version=config.FEATURE_GROUP_VERSION,
    description='Time-series data at hourly frequency',
    primary_key=['pickup_location_id', 'pickup_hour'],
    event_time='pickup_hour',
)

Connection closed.
Connected. Call `.close()` to terminate connection gracefully.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/45206
Connected. Call `.close()` to terminate connection gracefully.


In [27]:
# Create the feature view if it doesn't exist yet.
# This feature view only uses one feature group, so the query is trivial.
try:
    feature_store.create_feature_view(
        name=config.FEATURE_VIEW_NAME,
        version=config.FEATURE_VIEW_VERSION,
        query=feature_group.select_all(),
    )
except Exception as exc:
    # Best-effort: creation presumably fails when the view already exists.
    # Catch Exception (not a bare except, which would also swallow
    # KeyboardInterrupt/SystemExit) and print the real error so that other
    # failures (auth, network, bad query) are not silently reported as
    # "already existed".
    print(f'Feature view was not created ({exc!r}). Assuming it already exists; skip creation')

# Fetch the feature view (whether it was just created or already present).
feature_view = feature_store.get_feature_view(
    name=config.FEATURE_VIEW_NAME,
    version=config.FEATURE_VIEW_VERSION,
)

Feature view already existed. Skip creation


In [28]:
# training_data() is unpacked as a pair; only the first element is kept.
ts_data, _ = feature_view.training_data(description='Time-series hourly taxi rides')

2023-05-17 22:08:43,281 INFO: USE `taxi_demand_adi_featurestore`
2023-05-17 22:08:44,144 INFO: SELECT `fg0`.`pickup_hour` `pickup_hour`, `fg0`.`rides` `rides`, `fg0`.`pickup_location_id` `pickup_location_id`
FROM `taxi_demand_adi_featurestore`.`time_series_hourly_feature_group_1` `fg0`




In [29]:
# Order rows by location first, then by pickup hour within each location.
ts_data = ts_data.sort_values(['pickup_location_id', 'pickup_hour'])
ts_data.head()

Unnamed: 0,pickup_hour,rides,pickup_location_id
1542238,2022-01-01 00:00:00,0,1
600454,2022-01-01 01:00:00,0,1
1402600,2022-01-01 02:00:00,0,1
486632,2022-01-01 03:00:00,0,1
312687,2022-01-01 04:00:00,1,1


In [30]:
from src.data import transform_ts_data_into_features_and_target

# Turn the time series into a supervised-learning table.
# NOTE(review): presumably each row holds input_seq_len past values and the
# target is the value that follows them — confirm against src.data.
features, targets = transform_ts_data_into_features_and_target(
    ts_data,
    input_seq_len=24 * 28,  # 672 steps per example
    step_size=23,
)

# Single table holding the inputs together with the target column.
features_and_target = features.assign(target_rides_next_hour=targets)

print(f'{features_and_target.shape=}')

100%|██████████| 262/262 [01:24<00:00,  3.09it/s]


features_and_target.shape=(115921, 675)


In [31]:
from datetime import date, timedelta
from pytz import timezone
import pandas as pd
from src.data_split import train_test_split

# Cutoff is 4 weeks (28 days) before today's date, so it moves with the run date.
cutoff_date = pd.to_datetime(date.today() - timedelta(weeks=4))
print(f'{cutoff_date=}')

# NOTE(review): presumably rows before the cutoff go to train and the rest
# to test — confirm against src.data_split.
X_train, y_train, X_test, y_test = train_test_split(
    features_and_target,
    cutoff_date,
    target_column_name='target_rides_next_hour',
)

print(f'{X_train.shape=}')
print(f'{y_train.shape=}')
print(f'{X_test.shape=}')
print(f'{y_test.shape=}')

cutoff_date=Timestamp('2023-04-19 00:00:00')
X_train.shape=(108468, 674)
y_train.shape=(108468,)
X_test.shape=(7453, 674)
y_test.shape=(7453,)


In [32]:
import numpy as np
from sklearn.model_selection import KFold, TimeSeriesSplit
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_absolute_error
import optuna
import warnings
warnings.filterwarnings('ignore')

from src.model import get_pipeline

def objective(trial: optuna.trial.Trial) -> float:
    """
    Optuna objective: score one hyper-parameter candidate.

    Samples a candidate from ``trial``, fits a fresh pipeline on every
    ``TimeSeriesSplit`` fold of the training data and returns the mean
    validation MAE across the folds.

    Args:
        trial: Optuna trial used to sample the hyper-parameter values.

    Returns:
        Mean absolute error averaged over the 4 validation folds.
    """
    # Fixed settings first, then the values sampled for this trial.
    hyperparams = dict(
        metric='mae',
        verbose=-1,
        num_leaves=trial.suggest_int('num_leaves', 2, 256),
        feature_fraction=trial.suggest_float('feature_fraction', 0.2, 1.0),
        bagging_fraction=trial.suggest_float('bagging_fraction', 0.2, 1.0),
        min_child_samples=trial.suggest_int('min_child_samples', 3, 100),
    )

    def fold_mae(fit_idx, eval_idx):
        """Fit on one fold's training rows and return the MAE on its validation rows."""
        model = get_pipeline(**hyperparams)
        model.fit(X_train.iloc[fit_idx, :], y_train.iloc[fit_idx])
        fold_pred = model.predict(X_train.iloc[eval_idx, :])
        return mean_absolute_error(y_train.iloc[eval_idx], fold_pred)

    # NOTE(review): TimeSeriesSplit splits by row position, so this assumes the
    # rows of X_train are in chronological order — confirm.
    splitter = TimeSeriesSplit(n_splits=4)
    fold_scores = [
        fold_mae(fit_idx, eval_idx)
        for fit_idx, eval_idx in splitter.split(X_train)
    ]
    return np.mean(fold_scores)

In [33]:
# Seed the sampler so the sequence of suggested hyper-parameters is the same
# on every Restart & Run All. TPESampler is what create_study uses by default
# for single-objective studies, so only the seed is new.
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=5)

[32m[I 2023-05-17 22:23:45,290][0m A new study created in memory with name: no-name-a54c4a73-a802-472a-9320-88597a601b17[0m




[32m[I 2023-05-17 22:27:04,553][0m Trial 0 finished with value: 3.0948454520548117 and parameters: {'num_leaves': 191, 'feature_fraction': 0.6240077731212971, 'bagging_fraction': 0.8356934535326543, 'min_child_samples': 58}. Best is trial 0 with value: 3.0948454520548117.[0m




[32m[I 2023-05-17 22:28:59,826][0m Trial 1 finished with value: 3.110712600950043 and parameters: {'num_leaves': 74, 'feature_fraction': 0.7497264451442789, 'bagging_fraction': 0.2539000906005793, 'min_child_samples': 57}. Best is trial 0 with value: 3.0948454520548117.[0m




[32m[I 2023-05-17 22:30:08,966][0m Trial 2 finished with value: 3.1498409680532324 and parameters: {'num_leaves': 26, 'feature_fraction': 0.3328827279083393, 'bagging_fraction': 0.534782425635778, 'min_child_samples': 21}. Best is trial 0 with value: 3.0948454520548117.[0m




[32m[I 2023-05-17 22:32:15,884][0m Trial 3 finished with value: 3.1443451315056925 and parameters: {'num_leaves': 104, 'feature_fraction': 0.3871742610861267, 'bagging_fraction': 0.8169907452864829, 'min_child_samples': 87}. Best is trial 0 with value: 3.0948454520548117.[0m




[32m[I 2023-05-17 22:33:44,969][0m Trial 4 finished with value: 3.1351314529713665 and parameters: {'num_leaves': 25, 'feature_fraction': 0.6359670844316767, 'bagging_fraction': 0.7533804132830637, 'min_child_samples': 35}. Best is trial 0 with value: 3.0948454520548117.[0m


In [34]:
# Hyper-parameters of the best trial in the study.
best_params = study.best_params
print(f'{best_params=}')

best_params={'num_leaves': 191, 'feature_fraction': 0.6240077731212971, 'bagging_fraction': 0.8356934535326543, 'min_child_samples': 58}


In [35]:
# Build a fresh pipeline from the best hyper-parameters and fit it on the
# whole training set.
# NOTE(review): the tuning objective also passed metric='mae' and verbose=-1;
# they are not passed here — confirm that is intended.
pipeline = get_pipeline(**best_params)
pipeline.fit(X_train, y_train)



In [36]:
# Score the fitted pipeline on the test split.
predictions = pipeline.predict(X_test)
test_mae = mean_absolute_error(y_true=y_test, y_pred=predictions)
print(f'{test_mae=:.4f}')

test_mae=5.1812


In [40]:
import joblib
from src.paths import MODELS_DIR

# Serialize the fitted pipeline to MODELS_DIR / 'model.pkl'; the same path is
# passed to model.save() when the model is uploaded to the registry.
joblib.dump(pipeline, MODELS_DIR / 'model.pkl')

['C:\\Users\\Avi\\Desktop\\taxi_demand_ml_project\\models\\model.pkl']

In [41]:
from hsml.schema import Schema
from hsml.model_schema import ModelSchema

# Build the input/output schemas from the training features and target, then
# bundle them into the model schema passed to create_model().
input_schema = Schema(X_train)
output_schema = Schema(y_train)
model_schema = ModelSchema(input_schema=input_schema, output_schema=output_schema)

In [43]:
# Handle to the project's model registry.
model_registry = project.get_model_registry()

# Define the registry entry: test metric, schema and one randomly sampled
# training row as the input example.
model = model_registry.sklearn.create_model(
    name='taxi_demand_predictor_next_hour',
    metrics={'test_mae': test_mae},
    description="LightGBM Regressor with a bit of hyper-parameter tuning",
    input_example=X_train.sample(),
    model_schema=model_schema,
)

# Upload the pickled pipeline stored at MODELS_DIR / 'model.pkl'.
model.save(MODELS_DIR / 'model.pkl')

Connected. Call `.close()` to terminate connection gracefully.


  0%|          | 0/6 [00:00<?, ?it/s]

Model created, explore it at https://c.app.hopsworks.ai:443/p/45206/models/taxi_demand_predictor_next_hour/2


Model(name: 'taxi_demand_predictor_next_hour', version: 2)