In [26]:
%load_ext autoreload
%autoreload 2

import taxi_demand_predictor.config as cfg
import taxi_demand_predictor.data as data
import taxi_demand_predictor.preprocessing as preprocessing 
import taxi_demand_predictor.model as model
from taxi_demand_predictor.paths import MODELS_DIR
from sklearn.metrics import mean_absolute_error
import pandas as pd
import optuna
import hopsworks
import joblib
from hsml.schema import Schema 
from hsml.model_schema import ModelSchema

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
# Connect to the project
project = hopsworks.login(
    project=cfg.HOPSWORKS_PROJECT_NAME,
    api_key_value=cfg.HOPSWORKS_API_KEY
)

# Connect to the feature store
feature_store = project.get_feature_store()

Connected. Call `.close()` to terminate connection gracefully.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/933012
Connected. Call `.close()` to terminate connection gracefully.


In [3]:
# Connect to the feature group
feature_group = feature_store.get_or_create_feature_group(
    name=cfg.FEATURE_GROUP_NAME,
    version=cfg.FEATURE_GROUP_VERSION,
    description="ts data hourly frequency",
    primary_key = ['pickup_location_transformed', 'pickup_ts'],
    event_time='pickup_ts',
)

In [4]:
# Create a feature view in hopsworks
try:
    # Try creating it first in case it already exists
    feature_view = feature_store.create_feature_view(
        name=cfg.FEATURE_VIEW_NAME,
        version=cfg.FEATURE_VIEW_VERSION,
        description="An example feature view",
        query=feature_group.select_all()
        )
    print("The feature view already exists")
except:
    
    # Get feature view
    feature_view = feature_store.get_feature_view(
        cfg.FEATURE_VIEW_NAME,
        cfg.FEATURE_VIEW_VERSION
        )

In [5]:
# Query the feature view
ts_data, _ = feature_view.training_data(
    description="ts data hourly taxi rides",
)

# sort values to see the features in the feature store 
ts_data.sort_values(by=['pickup_location_id', 'pickup_time'], inplace=True)

# Print values
ts_data

Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (190.91s) 





Unnamed: 0,pickup_time,pickup_location_id,ride_count,pickup_ts,pickup_location_transformed
879968,2023-05-01 00:00:00+00:00,1,0.0,1682899200000,1
62734,2023-05-01 01:00:00+00:00,1,0.0,1682902800000,1
86705,2023-05-01 02:00:00+00:00,1,0.0,1682906400000,1
2242966,2023-05-01 03:00:00+00:00,1,0.0,1682910000000,1
1919984,2023-05-01 04:00:00+00:00,1,0.0,1682913600000,1
...,...,...,...,...,...
1447711,2024-05-31 19:00:00+00:00,265,0.0,1717182000000,265
1023554,2024-05-31 20:00:00+00:00,265,0.0,1717185600000,265
549407,2024-05-31 21:00:00+00:00,265,0.0,1717189200000,265
247200,2024-05-31 22:00:00+00:00,265,0.0,1717192800000,265


In [6]:
# Generate features and target
features_and_target = data.generate_training_set(ts_data, start_position = 0, n_features = 24*28, step_size = 23, pickup_location_id = None, target_col = 'ride_count')

# Make a copy of the features and target
features_and_target_copy = features_and_target.copy()

# Print the features and target
print(f"{features_and_target.shape= }")

features_and_target.shape= (101255, 675)


In [7]:
# Show features and target dataframe
features_and_target.head()

Unnamed: 0,rides_previous_672_hour,rides_previous_671_hour,rides_previous_670_hour,rides_previous_669_hour,rides_previous_668_hour,rides_previous_667_hour,rides_previous_666_hour,rides_previous_665_hour,rides_previous_664_hour,rides_previous_663_hour,...,rides_previous_7_hour,rides_previous_6_hour,rides_previous_5_hour,rides_previous_4_hour,rides_previous_3_hour,rides_previous_2_hour,rides_previous_1_hour,pickup_time,pickup_location_id,rides_next_hour
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.0,2.0,3.0,0.0,2.0,0.0,0.0,2023-05-29 00:00:00,1,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4.0,0.0,1.0,0.0,0.0,0.0,0.0,2023-05-29 23:00:00,1,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,2.0,0.0,0.0,0.0,0.0,0.0,2023-05-30 22:00:00,1,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,2.0,2.0,1.0,1.0,0.0,1.0,2023-05-31 21:00:00,1,1.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.0,0.0,0.0,0.0,0.0,0.0,2.0,2023-06-01 20:00:00,1,4.0


In [8]:
# Cutoff date leaving out two months of data
days_from_end_date = 31*2-1

# Define the cutoff date
cutoff_date = ts_data['pickup_time'].max() - pd.Timedelta(days=days_from_end_date)

# Convert cutoff date to string
cutoff_date_str = cutoff_date.strftime('%Y-%m-%d %H:%M:%S')

# Split the data into training and validation
X_train, y_train, X_test, y_test = preprocessing.split_data(features_and_target_copy, datetime_column = 'pickup_time', cutoff_datetime = cutoff_date_str, target_column = 'rides_next_hour')

# Print shapes of train and test data
print(f"{X_train.shape= }, {y_train.shape= }, {X_test.shape= }, {y_test.shape= }")

X_train.shape= (84686, 674), y_train.shape= (84686,), X_test.shape= (16569, 674), y_test.shape= (16569,)


In [42]:
X_train

Unnamed: 0,rides_previous_672_hour,rides_previous_671_hour,rides_previous_670_hour,rides_previous_669_hour,rides_previous_668_hour,rides_previous_667_hour,rides_previous_666_hour,rides_previous_665_hour,rides_previous_664_hour,rides_previous_663_hour,...,rides_previous_7_hour,rides_previous_6_hour,rides_previous_5_hour,rides_previous_4_hour,rides_previous_3_hour,rides_previous_2_hour,rides_previous_1_hour,pickup_time,pickup_location_id,4_weeks_avg
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.0,2.0,3.0,0.0,2.0,0.0,0.0,2023-05-29 00:00:00,1,0.037202
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4.0,0.0,1.0,0.0,0.0,0.0,0.0,2023-05-29 23:00:00,1,0.052083
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,2.0,0.0,0.0,0.0,0.0,0.0,2023-05-30 22:00:00,1,0.074405
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,2.0,2.0,1.0,1.0,0.0,1.0,2023-05-31 21:00:00,1,0.093750
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.0,0.0,0.0,0.0,0.0,0.0,2.0,2023-06-01 20:00:00,1,0.105655
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101187,1.0,1.0,3.0,5.0,2.0,1.0,0.0,0.0,0.0,1.0,...,4.0,4.0,3.0,6.0,3.0,5.0,8.0,2024-03-27 19:00:00,265,2.602679
101188,1.0,1.0,3.0,1.0,2.0,3.0,1.0,4.0,1.0,2.0,...,2.0,2.0,1.0,1.0,4.0,5.0,4.0,2024-03-28 18:00:00,265,2.619048
101189,2.0,0.0,2.0,4.0,5.0,1.0,0.0,2.0,3.0,5.0,...,5.0,7.0,7.0,1.0,4.0,2.0,3.0,2024-03-29 17:00:00,265,2.666667
101190,2.0,5.0,2.0,4.0,3.0,1.0,2.0,4.0,2.0,1.0,...,1.0,0.0,0.0,5.0,4.0,7.0,2.0,2024-03-30 16:00:00,265,2.700893


In [9]:
# Create an optuna study
study = optuna.create_study(direction="minimize")

# Optimize the model hyperparameters
study.optimize(lambda trial: model.objective(trial, X_train, y_train, model.get_pipeline, mean_absolute_error, 3), n_trials=5)

[I 2024-08-13 14:46:22,008] A new study created in memory with name: no-name-e8ee78ac-d67d-4a7b-ba45-896193374f6b
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,

In [10]:
# Get the best hyperparameters from Optuna optimization
best_params = study.best_trial.params

# Print the best hyperparameters
print(f'{best_params=}')

# train the model with the best hyperparameters on the entire training data
pipeline = model.get_pipeline(**best_params)
pipeline.fit(X_train, y_train)

best_params={'num_leaves': 170, 'feature_fraction': 0.7495081434716913, 'bagging_fraction': 0.2559383936723787, 'min_child_samples': 47}


In [11]:
# Compute the predictions
predictions = pipeline.predict(X_test)

# Calculate the test error
test_mae = mean_absolute_error(y_test, predictions)

# Print the test error
print(f"{test_mae = }")

test_mae = 2.645143486110632


In [12]:
pipeline

In [15]:
# Serialize the model to disk
joblib.dump(pipeline, MODELS_DIR / 'model.pkl')

['/Users/borja/Documents/Somniumrema/projects/ml/taxi_demand_predictor/models/model.pkl']

In [21]:
# Define input and output schema for the model
input_schema = Schema(X_train)
output_schema = Schema(y_train)

# Define model schema
model_schema = ModelSchema(input_schema, output_schema)

In [37]:
MODELS_DIR / 'model.pkl'

PosixPath('/Users/borja/Documents/Somniumrema/projects/ml/taxi_demand_predictor/models/model.pkl')

In [39]:
# Instantiate the pointer to the model registry
model_registry = project.get_model_registry()

# Register the model
model = model_registry.sklearn.create_model( # sklearn create model
    name='taxi_demand_predictor',            # assign a name to the model
    model_schema=model_schema,               # Define model schema
    metrics={'mae': test_mae},               # Define metrics
    description='LightGBM demand predictor', # Describe the model
    input_example=X_train.sample()           # Provide a sample of the input
)

# Register the model
model.save(str(MODELS_DIR / 'model.pkl'))

Connected. Call `.close()` to terminate connection gracefully.


  0%|          | 0/6 [00:00<?, ?it/s]

Uploading: 0.000%|          | 0/1542704 elapsed<00:00 remaining<?

Uploading: 0.000%|          | 0/3408 elapsed<00:00 remaining<?

Uploading: 0.000%|          | 0/58109 elapsed<00:00 remaining<?

Model created, explore it at https://c.app.hopsworks.ai:443/p/933012/models/taxi_demand_predictor/2


Model(name: 'taxi_demand_predictor', version: 2)

In [40]:
import taxi_demand_predictor.inference as inference

In [41]:
inference.get_hopsworks_project()

Connection closed.
Connected. Call `.close()` to terminate connection gracefully.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/933012


Project('demand_predictor_borja', 'borja.regueral@somniumrema.eu', 'Default project')