In [1]:
import os
import pickle
import click
import mlflow

from mlflow.entities import ViewType
from mlflow.tracking import MlflowClient
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

HPO_EXPERIMENT_NAME = "random-forest-hyperopt"
EXPERIMENT_NAME = "random-forest-best-models"
RF_PARAMS = ['max_depth', 'n_estimators', 'min_samples_split', 'min_samples_leaf', 'random_state', 'n_jobs']

mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.set_experiment(EXPERIMENT_NAME)
mlflow.sklearn.autolog()

In [2]:
def load_pickle(filename):
    with open(filename, "rb") as f_in:
        return pickle.load(f_in)


def train_and_log_model(data_path, params):
    X_train, y_train = load_pickle(os.path.join(data_path, "train.pkl"))
    X_val, y_val = load_pickle(os.path.join(data_path, "val.pkl"))
    X_test, y_test = load_pickle(os.path.join(data_path, "test.pkl"))

    with mlflow.start_run():
        for param in RF_PARAMS:
            params[param] = int(params[param])

        rf = RandomForestRegressor(**params)
        rf.fit(X_train, y_train)

        # Evaluate model on the validation and test sets
        val_rmse = mean_squared_error(y_val, rf.predict(X_val), squared=False)
        mlflow.log_metric("val_rmse", val_rmse)
        test_rmse = mean_squared_error(y_test, rf.predict(X_test), squared=False)
        mlflow.log_metric("test_rmse", test_rmse)

In [3]:
def run_register_model(data_path: str, top_n: int):

    client = MlflowClient()

    # Retrieve the top_n model runs and log the models
    experiment = client.get_experiment_by_name(HPO_EXPERIMENT_NAME)
    runs = client.search_runs(
        experiment_ids=experiment.experiment_id,
        run_view_type=ViewType.ACTIVE_ONLY,
        max_results=top_n,
        order_by=["metrics.rmse ASC"]
    )
    for run in runs:
        train_and_log_model(data_path=data_path, params=run.data.params)

    # Select the model with the lowest test RMSE
    experiment = client.get_experiment_by_name(EXPERIMENT_NAME)
    # best_run = client.search_runs( ...  )[0]

    # Register the best model
    # mlflow.register_model( ... )

In [4]:
run_register_model("./output", 5)

InvalidParameterError: The 'bootstrap' parameter of RandomForestRegressor must be an instance of 'bool', an instance of 'numpy.bool_' or an instance of 'int'. Got 'True' instead.

In [5]:
client = MlflowClient()

In [6]:
top_n = 5

# Retrieve the top_n model runs and log the models
experiment = client.get_experiment_by_name(HPO_EXPERIMENT_NAME)
runs = client.search_runs(
    experiment_ids=experiment.experiment_id,
    run_view_type=ViewType.ACTIVE_ONLY,
    max_results=top_n,
    order_by=["metrics.rmse ASC"]
)

In [7]:
len(runs)

5

In [8]:
data_path="./output"
params=runs[0].data.params

X_train, y_train = load_pickle(os.path.join(data_path, "train.pkl"))
X_val, y_val = load_pickle(os.path.join(data_path, "val.pkl"))
X_test, y_test = load_pickle(os.path.join(data_path, "test.pkl"))

with mlflow.start_run():

    params = {
        'min_weight_fraction_leaf': 0.0,
        'max_depth': 2,
        'ccp_alpha': 0.0,
        'criterion': 'squared_error',
        'warm_start': False,
        'max_leaf_nodes': None,
        'max_samples': None,
        'verbose': 0,
        'n_estimators': 22,
        'bootstrap': True,
        'oob_score': False,
        'n_jobs': -1,
        'min_impurity_decrease': 0.0,
        'min_samples_split': 8,
        'min_samples_leaf': 2,
        'random_state': 42,
        'max_features': 1.0
    }
    
    for param in RF_PARAMS:
        params[param] = int(params[param])

    rf = RandomForestRegressor(**params)
    rf.fit(X_train, y_train)

    # Evaluate model on the validation and test sets
    val_rmse = mean_squared_error(y_val, rf.predict(X_val), squared=False)
    mlflow.log_metric("val_rmse", val_rmse)
    test_rmse = mean_squared_error(y_test, rf.predict(X_test), squared=False)
    mlflow.log_metric("test_rmse", test_rmse)



In [9]:
experiment = client.get_experiment_by_name(EXPERIMENT_NAME)
experiment

<Experiment: artifact_location='mlflow-artifacts:/749136533729911951', creation_time=1684869623224, experiment_id='749136533729911951', last_update_time=1684869623224, lifecycle_stage='active', name='random-forest-best-models', tags={}>

In [12]:
experiment.experiment_id

'749136533729911951'

In [16]:
best_run = client.search_runs(experiment.experiment_id)[0]

In [37]:
uri = best_run.to_dictionary()["info"]["artifact_uri"]

In [35]:
name = best_run.to_dictionary()["info"]["run_name"]

In [43]:
id = best_run.to_dictionary()["info"]["experiment_id"]

'749136533729911951'

In [49]:
uri = "runs:/{}/sklearn-model".format(best_run.info.run_id)

In [50]:
mlflow.register_model(uri, name)

Registered model 'likeable-roo-385' already exists. Creating a new version of this model...
2023/05/23 21:33:52 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: likeable-roo-385, version 3
Created version '3' of model 'likeable-roo-385'.


<ModelVersion: aliases=[], creation_timestamp=1684877632026, current_stage='None', description='', last_updated_timestamp=1684877632026, name='likeable-roo-385', run_id='38c8f8903989412bb58658296757ecff', run_link='', source='mlflow-artifacts:/749136533729911951/38c8f8903989412bb58658296757ecff/artifacts/sklearn-model', status='READY', status_message='', tags={}, user_id='', version='3'>

In [39]:
best_run

<Run: data=<RunData: metrics={'test_rmse': 2.3279179748247727,
 'training_mean_absolute_error': 1.5485602391260203,
 'training_mean_squared_error': 4.461079191601193,
 'training_r2_score': 0.16400954109008747,
 'training_root_mean_squared_error': 2.112126698756775,
 'training_score': 0.16400954109008747,
 'val_rmse': 2.486106021576535}, params={'bootstrap': 'True',
 'ccp_alpha': '0.0',
 'criterion': 'squared_error',
 'max_depth': '2',
 'max_features': '1.0',
 'max_leaf_nodes': 'None',
 'max_samples': 'None',
 'min_impurity_decrease': '0.0',
 'min_samples_leaf': '2',
 'min_samples_split': '8',
 'min_weight_fraction_leaf': '0.0',
 'n_estimators': '22',
 'n_jobs': '-1',
 'oob_score': 'False',
 'random_state': '42',
 'verbose': '0',
 'warm_start': 'False'}, tags={'estimator_class': 'sklearn.ensemble._forest.RandomForestRegressor',
 'estimator_name': 'RandomForestRegressor',
 'mlflow.log-model.history': '[{"run_id": "38c8f8903989412bb58658296757ecff", '
                             '"artifa