# Random Forest Regressor training
- To reproduce these results, attach this notebook to a cluster with runtime version **14.3.x-cpu-ml-scala2.12**, and rerun it.
- Compare trials in the [MLflow experiment](#mlflow/experiments/1866282307194673).

In [0]:
# Import statements
import math
import pandas as pd
from pyspark.sql import *
from pyspark.sql import functions as F
from pyspark.sql.functions import *
from datetime import timedelta 

In [0]:
import mlflow
import databricks.automl_runtime

# Name of the label column. Later cells remove it from the feature frames
# (X_*) and use it as the regression target (y_*).
target_col = "DropSizePercentage"

## Load Data

In [0]:
import mlflow
import os
import uuid
import shutil
import pandas as pd
from datetime import datetime

# Load data from Hive table into PySpark DataFrame.
# `spark` is not created anywhere in this notebook; presumably it is the
# session object provided by the Databricks runtime — confirm before running
# in another environment.
df_spark = spark.table("hive_metastore.datascience.dropsizetraining_data")

In [0]:
# Target proportions of the three-way split: 60% train, 20% validation, 20% test
train_ratio = 0.6
val_ratio = 0.2
test_ratio = 0.2

# Stage 1: separate the training rows from a combined validation+test remainder
holdout_ratio = 1 - train_ratio
train_df, val_test_df = df_spark.randomSplit([train_ratio, holdout_ratio], seed=42)

# Stage 2: divide the remainder between validation and test, re-normalising
# the two ratios so that they sum to 1
holdout_total = val_ratio + test_ratio
val_df, test_df = val_test_df.randomSplit(
    [val_ratio / holdout_total, test_ratio / holdout_total],
    seed=42,
)

# Tag every row with the name of the split it belongs to
split_col = "_automl_split_col_0000"
train_df = train_df.withColumn(split_col, F.lit("train"))
val_df = val_df.withColumn(split_col, F.lit("val"))
test_df = test_df.withColumn(split_col, F.lit("test"))

# Stack the three tagged frames back into a single DataFrame
df_split = train_df.union(val_df).union(test_df)

In [0]:
# Convert PySpark DataFrame to Pandas DataFrame.
# toPandas() collects all rows onto the driver, so the whole (tagged) table
# has to fit in driver memory.
df_loaded = df_split.toPandas()

In [0]:
# Convert all object dtype columns to str dtype
object_columns = df_loaded.select_dtypes(include=['object']).columns

# The loop variable is deliberately not named `col`: this notebook star-imports
# pyspark.sql.functions, and a module-level loop variable called `col` would
# rebind that function to a plain column-name string for every later cell.
for object_col in object_columns:
    df_loaded[object_col] = df_loaded[object_col].astype(str)

### Select supported columns
Select only the columns that are supported. This allows us to train a model that can predict on a dataset that has extra columns that are not used in training.
No columns are dropped in the pipelines (the list of dropped columns is `[]`). See the Alerts tab of the AutoML Experiment page for details on why columns may be dropped.

In [0]:
from databricks.automl_runtime.sklearn.column_selector import ColumnSelector
# Feature columns the pipeline is trained on; the selector keeps only these,
# so inputs that carry extra columns can still be scored.
supported_cols = [
    "MAIN_ENGINE_HP",
    "LNGTankCapacity_Tonnes",
    "DEADWEIGHT_TONNAGE",
    "actualBunkeringTime",
    "ShipType",
]
col_selector = ColumnSelector(supported_cols)

## Preprocessors

### Numerical columns

Missing values for numerical columns are imputed with mean by default.

In [0]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer, StandardScaler

# A single imputer entry: SimpleImputer with its default settings, applied to
# the four numeric feature columns.
num_imputers = [
    (
        "impute_mean",
        SimpleImputer(),
        ["DEADWEIGHT_TONNAGE", "LNGTankCapacity_Tonnes", "MAIN_ENGINE_HP", "actualBunkeringTime"],
    ),
]

numerical_pipeline = Pipeline(
    steps=[
        # Coerce every column to a numeric dtype; values that cannot be parsed become NaN
        ("converter", FunctionTransformer(lambda frame: frame.apply(pd.to_numeric, errors='coerce'))),
        # Fill the NaNs (original or introduced by the coercion above)
        ("imputers", ColumnTransformer(num_imputers)),
        # Standardise the imputed features
        ("standardizer", StandardScaler()),
    ]
)

# Entry consumed by the top-level preprocessor: (name, transformer, columns)
numerical_transformers = [
    (
        "numerical",
        numerical_pipeline,
        ["MAIN_ENGINE_HP", "actualBunkeringTime", "LNGTankCapacity_Tonnes", "DEADWEIGHT_TONNAGE"],
    ),
]

### Categorical columns

#### Low-cardinality categoricals
Convert each low-cardinality categorical column into multiple binary columns through one-hot encoding.
For each input categorical column (string or numeric), the number of output columns is equal to the number of unique values in the input column.

In [0]:
from databricks.automl_runtime.sklearn import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

# No imputers are configured for the categorical columns; with
# remainder="passthrough" the imputer stage forwards its input unchanged.
one_hot_imputers = []

one_hot_pipeline = Pipeline(
    steps=[
        ("imputers", ColumnTransformer(one_hot_imputers, remainder="passthrough")),
        ("one_hot_encoder", OneHotEncoder(handle_unknown="indicator")),
    ]
)

# Entry consumed by the top-level preprocessor: (name, transformer, columns)
categorical_one_hot_transformers = [
    ("onehot", one_hot_pipeline, ["ShipType"]),
]

In [0]:
from sklearn.compose import ColumnTransformer

# Numeric entries first, then the one-hot entries
transformers = [*numerical_transformers, *categorical_one_hot_transformers]

# sparse_threshold=0 makes the combined output a dense array; columns not
# claimed by any transformer are passed through untouched.
preprocessor = ColumnTransformer(
    transformers,
    remainder="passthrough",
    sparse_threshold=0,
)

## Train - Validation - Test Split
The input data is split into 3 sets (the split itself is performed with `randomSplit` in the "Load Data" section above):
- Train (60% of the dataset used to train the model)
- Validation (20% of the dataset used to tune the hyperparameters of the model)
- Test (20% of the dataset used to report the true performance of the model on an unseen dataset)

`_automl_split_col_0000` contains the information of which set a given row belongs to.
We use this column to split the dataset into the above 3 sets. 
The column should not be used for training so it is dropped after split is done.

In [0]:
# Recover the three splits from the marker column written during the Spark split
split_marker = df_loaded["_automl_split_col_0000"]
split_train_df = df_loaded[split_marker == "train"]
split_val_df = df_loaded[split_marker == "val"]
split_test_df = df_loaded[split_marker == "test"]

# Columns that must never reach the model as features: the label and the split marker
non_feature_cols = [target_col, "_automl_split_col_0000"]

# Features (X_*) and label (y_*) for each split
X_train = split_train_df.drop(columns=non_feature_cols)
y_train = split_train_df[target_col]

X_val = split_val_df.drop(columns=non_feature_cols)
y_val = split_val_df[target_col]

X_test = split_test_df.drop(columns=non_feature_cols)
y_test = split_test_df[target_col]

## Train regression model
- Log relevant metrics to MLflow to track runs
- All the runs are logged under [this MLflow experiment](#mlflow/experiments/1866282307194673)
- Change the model parameters and re-run the training cell to log a different trial to the MLflow experiment
- To view the full list of tunable hyperparameters, uncomment `help(RandomForestRegressor)` in the cell below and check its output

In [0]:
from sklearn.ensemble import RandomForestRegressor

# help(RandomForestRegressor)

### Define the objective function
The objective function used to find optimal hyperparameters. In this notebook `hyperopt.fmin` calls
this function 50 times (`max_evals=50` in the `hyperopt.fmin` invocation), each time with a set of
hyperparameters sampled from `space`, defined below. `hyperopt.fmin` uses this
function's return value to search the space to minimize the loss.

### Configure the hyperparameter search space
Configure the search space of parameters. The parameters below are sampled from ranges and choices,
which can be modified to widen or narrow the search space. For example, when training a decision tree
regressor, to allow the maximum tree depth to be either 2 or 3, set the key of 'max_depth' to
`hp.choice('max_depth', [2, 3])`. Be sure to adjust `max_evals` in the `fmin` call below to match the size of the search space.

See https://docs.databricks.com/applications/machine-learning/automl-hyperparam-tuning/index.html
for more information on hyperparameter tuning as well as
http://hyperopt.github.io/hyperopt/getting-started/search_spaces/ for documentation on supported
search expressions.

For documentation on parameters used by the model in use, please see:
https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html

NOTE: The above URL points to a stable version of the documentation corresponding to the last
released version of the package. The documentation may differ slightly for the package version
used by this notebook.

In [0]:
from hyperopt import hp

# Define the hyperparameter search space.
# NOTE: the "Run trials" cell further down assigns `space` again before it
# calls `fmin`, so this definition only takes effect if that reassignment is
# removed.
space = {
  "bootstrap": hp.choice("bootstrap", [True, False]),
  # "mae" was removed as a criterion name in scikit-learn 1.2;
  # "absolute_error" is the supported spelling of the same criterion.
  "criterion": hp.choice("criterion", ["friedman_mse", "absolute_error"]),
  "max_depth": hp.quniform("max_depth", 2, 10, 1),  # sampled as a float; objective() casts it to int
  "max_features": hp.uniform("max_features", 0.1, 1.0),  # fraction of features considered per split
  "min_samples_leaf": hp.uniform("min_samples_leaf", 0.001, 0.1),  # float => fraction of the training samples
  "min_samples_split": hp.uniform("min_samples_split", 0.001, 0.1),  # float => fraction of the training samples
  "n_estimators": hp.quniform("n_estimators", 5, 50, 1),  # sampled as a float; objective() casts it to int
  "random_state": hp.randint("random_state", 1000000),  # integer seed drawn from [0, 1000000)
}


### Run trials
When widening the search space and training multiple models, switch to `SparkTrials` to parallelize
training on Spark:
```
from hyperopt import SparkTrials
trials = SparkTrials()
```

NOTE: While `Trials` starts an MLflow run for each set of hyperparameters, `SparkTrials` only starts
one top-level run; it will start a subrun for each set of hyperparameters.

See http://hyperopt.github.io/hyperopt/scaleout/spark/ for more info.

In [0]:
import mlflow
import os
import uuid
import shutil
import pandas as pd
from datetime import datetime
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from hyperopt import hp, tpe, fmin, STATUS_OK, Trials
from mlflow.models import Model
from mlflow.pyfunc import PyFuncModel
import mlflow.pyfunc as pyfunc 
import warnings
import logging

# Suppress every Python warning and restrict MLflow's logger to errors —
# presumably to keep the per-trial progress output readable.
warnings.filterwarnings("ignore")
logging.getLogger("mlflow").setLevel(logging.ERROR)

# Set the MLflow experiment.
# The experiment name embeds the current timestamp, so each execution of this
# cell targets a different experiment under `path`.
path = "/Workspace/CVT/DropSizeModel/Experiments"
experiment_name = "DropSize"
date_time = datetime.now().strftime("%Y%m%d_%H%M%S")
full_experiment_name = f"{path}/{experiment_name}_{date_time}"
mlflow.set_experiment(full_experiment_name)

# objective() below fits on X_train / y_train and evaluates on the val and
# test splits. It assumes X_* are DataFrames and y_* are Series; nothing in
# this cell verifies that.

def objective(params):
    """Fit and score one random-forest pipeline for a single hyperopt trial.

    Opens a new MLflow run, fits ``col_selector -> preprocessor ->
    RandomForestRegressor`` on the training split, logs the hyperparameters and
    the fitted pipeline to the run, and evaluates the pipeline on the training,
    validation and test splits with ``mlflow.evaluate`` (metric prefixes
    ``training_``, ``val_`` and ``test_``).

    Relies on the notebook-level names ``col_selector``, ``preprocessor``,
    ``target_col`` and the ``X_*`` / ``y_*`` splits.

    Args:
        params: dict of ``RandomForestRegressor`` keyword arguments sampled by
            hyperopt. ``n_estimators`` and ``max_depth`` may arrive as floats
            (as produced by ``hp.quniform``) and are cast to int. The dict
            itself is not modified.

    Returns:
        dict with the keys hyperopt requires (``loss``: the negated validation
        R2, ``status``) plus ``val_metrics``, ``test_metrics``, the fitted
        ``model``, the MLflow ``run`` object, its ``run_id`` and ``run_name``.
    """
    with mlflow.start_run() as mlflow_run:
        # Work on a copy so the dict owned by hyperopt is left untouched
        rf_params = dict(params)

        # hp.quniform yields floats; scikit-learn requires ints for these two
        rf_params["n_estimators"] = int(rf_params["n_estimators"])
        rf_params["max_depth"] = int(rf_params["max_depth"])

        # Seed the forest unless the search space supplies its own seed, so a
        # given hyperparameter set always produces the same model
        rf_params.setdefault("random_state", 42)

        skrf_regressor = RandomForestRegressor(n_jobs=1, **rf_params)

        model = Pipeline([
            ("column_selector", col_selector),
            ("preprocessor", preprocessor),
            ("regressor", skrf_regressor),
        ])

        # Turn sklearn autologging OFF (disable=True); parameters, model and
        # metrics are logged explicitly below instead
        mlflow.sklearn.autolog(
            log_input_examples=True,
            silent=True,
            disable=True
        )

        # With autologging off nothing else records the hyperparameters, so
        # log them here to keep the trials comparable in the MLflow UI
        mlflow.log_params(rf_params)

        # Fit the model
        model.fit(X_train, y_train)

        # Log the model explicitly
        mlflow.sklearn.log_model(model, "model")

        # Wrap the in-memory pipeline as a pyfunc model so mlflow.evaluate can
        # score it without reloading the logged artifact
        mlflow_model = Model()
        pyfunc.add_to_model(mlflow_model, loader_module="mlflow.sklearn")
        pyfunc_model = PyFuncModel(model_meta=mlflow_model, model_impl=model)

        # Log metrics for the training set (called for its logging side
        # effect; the result object is not needed afterwards)
        mlflow.evaluate(
            model=pyfunc_model,
            data=X_train.assign(**{str(target_col): y_train}),
            targets=str(target_col),
            model_type="regressor",
            evaluator_config={"log_model_explainability": False, "metric_prefix": "training_"}
        )

        # Log metrics for the validation set
        val_eval_result = mlflow.evaluate(
            model=pyfunc_model,
            data=X_val.assign(**{str(target_col): y_val}),
            targets=str(target_col),
            model_type="regressor",
            evaluator_config={"log_model_explainability": False, "metric_prefix": "val_"}
        )
        skrf_val_metrics = val_eval_result.metrics

        # Log metrics for the test set
        test_eval_result = mlflow.evaluate(
            model=pyfunc_model,
            data=X_test.assign(**{str(target_col): y_test}),
            targets=str(target_col),
            model_type="regressor",
            evaluator_config={"log_model_explainability": False, "metric_prefix": "test_"}
        )
        skrf_test_metrics = test_eval_result.metrics

        return {
            "loss": -skrf_val_metrics["val_r2_score"],  # Negative because fmin minimizes the loss
            "status": STATUS_OK,
            "val_metrics": skrf_val_metrics,
            "test_metrics": skrf_test_metrics,
            "model": model,
            "run": mlflow_run,
            "run_id": mlflow_run.info.run_id,
            "run_name": mlflow_run.info.run_name
        }

# Define the hyperparameter space that is actually passed to fmin below.
# This rebinds `space`, replacing the dictionary built in the earlier
# "Configure the hyperparameter search space" cell.
space = {
    'n_estimators': hp.quniform('n_estimators', 50, 200, 1),  # float-valued sample; objective() casts it to int
    'max_depth': hp.quniform('max_depth', 5, 50, 1),  # float-valued sample; objective() casts it to int
    'max_features': hp.uniform('max_features', 0.1, 1.0),  # float => fraction of features considered per split
    'min_samples_split': hp.uniform('min_samples_split', 0.01, 0.5),  # float => fraction of the training samples
    'min_samples_leaf': hp.uniform('min_samples_leaf', 0.01, 0.5),  # float => fraction of the training samples
    'bootstrap': hp.choice('bootstrap', [True, False])
}

trials = Trials()

# Run the search. `rstate` seeds hyperopt's own sampling so that the sequence
# of suggested hyperparameters does not change between executions of this cell
# for identical trial losses (needs a hyperopt version that accepts a
# numpy Generator, i.e. 0.2.7+).
# Despite its name, `best_trial` holds fmin's return value: the best
# hyperparameter assignment, with hp.choice entries reported as list indices.
best_trial = fmin(
    objective,
    space=space,
    algo=tpe.suggest,
    max_evals=50,  # Increase this when widening the hyperparameter search space.
    trials=trials,
    rstate=np.random.default_rng(42),
)

# Pull the winning trial's payload (the dict returned by objective())
best_result = trials.best_trial["result"]
best_model = best_result["model"]
run_id = best_result["run_id"]  # MLflow run that holds the best model's artifacts

print(best_model)


  0%|          | 0/50 [00:00<?, ?trial/s, best loss=?]

Uploading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

  2%|▏         | 1/50 [00:11<09:22, 11.47s/trial, best loss: -0.17696066736211646]

Uploading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

  4%|▍         | 2/50 [00:21<08:30, 10.63s/trial, best loss: -0.17696066736211646]

Uploading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

  6%|▌         | 3/50 [00:31<08:11, 10.45s/trial, best loss: -0.207829151234885]  

Uploading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

  8%|▊         | 4/50 [00:40<07:35,  9.91s/trial, best loss: -0.207829151234885]

Uploading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

 10%|█         | 5/50 [00:49<07:11,  9.60s/trial, best loss: -0.4253767044468856]

Uploading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

 12%|█▏        | 6/50 [00:59<07:02,  9.61s/trial, best loss: -0.4253767044468856]

Uploading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

 14%|█▍        | 7/50 [01:08<06:47,  9.48s/trial, best loss: -0.4253767044468856]

Uploading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

 16%|█▌        | 8/50 [01:18<06:39,  9.51s/trial, best loss: -0.4253767044468856]

Uploading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

 18%|█▊        | 9/50 [01:28<06:34,  9.63s/trial, best loss: -0.4253767044468856]

Uploading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

 20%|██        | 10/50 [01:37<06:27,  9.68s/trial, best loss: -0.4253767044468856]

Uploading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

 22%|██▏       | 11/50 [01:46<06:08,  9.44s/trial, best loss: -0.4253767044468856]

Uploading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

 24%|██▍       | 12/50 [01:56<05:55,  9.36s/trial, best loss: -0.4253767044468856]

Uploading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

 26%|██▌       | 13/50 [02:06<05:53,  9.57s/trial, best loss: -0.4253767044468856]

Uploading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

 28%|██▊       | 14/50 [02:15<05:38,  9.41s/trial, best loss: -0.4253767044468856]

Uploading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

 30%|███       | 15/50 [02:24<05:31,  9.46s/trial, best loss: -0.4253767044468856]

Uploading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

 32%|███▏      | 16/50 [02:34<05:20,  9.43s/trial, best loss: -0.4253767044468856]

Uploading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

 34%|███▍      | 17/50 [02:43<05:12,  9.48s/trial, best loss: -0.4253767044468856]

Uploading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

 36%|███▌      | 18/50 [02:52<04:58,  9.32s/trial, best loss: -0.4253767044468856]

Uploading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

 38%|███▊      | 19/50 [03:02<04:50,  9.37s/trial, best loss: -0.4253767044468856]

Uploading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

 40%|████      | 20/50 [03:11<04:43,  9.44s/trial, best loss: -0.4253767044468856]

Uploading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

 42%|████▏     | 21/50 [03:21<04:33,  9.44s/trial, best loss: -0.4253767044468856]

Uploading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

 44%|████▍     | 22/50 [03:30<04:21,  9.34s/trial, best loss: -0.4499658372351979]

Uploading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

 46%|████▌     | 23/50 [03:39<04:09,  9.26s/trial, best loss: -0.4499658372351979]

Uploading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

 48%|████▊     | 24/50 [03:48<03:57,  9.15s/trial, best loss: -0.4646640459583672]

Uploading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

 50%|█████     | 25/50 [03:57<03:48,  9.12s/trial, best loss: -0.4646640459583672]

Uploading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

 52%|█████▏    | 26/50 [04:07<03:44,  9.37s/trial, best loss: -0.4646640459583672]

Uploading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

 54%|█████▍    | 27/50 [04:16<03:31,  9.20s/trial, best loss: -0.4646640459583672]

Uploading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

 56%|█████▌    | 28/50 [04:25<03:25,  9.34s/trial, best loss: -0.4646640459583672]

Uploading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

 58%|█████▊    | 29/50 [04:34<03:14,  9.26s/trial, best loss: -0.4646640459583672]

Uploading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

 60%|██████    | 30/50 [04:44<03:06,  9.34s/trial, best loss: -0.47877237283057705]

Uploading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

 62%|██████▏   | 31/50 [04:53<02:57,  9.34s/trial, best loss: -0.4903155501803138] 

Uploading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

 64%|██████▍   | 32/50 [05:02<02:47,  9.30s/trial, best loss: -0.4903155501803138]

Uploading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

 66%|██████▌   | 33/50 [05:11<02:37,  9.25s/trial, best loss: -0.4903155501803138]

Uploading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

 68%|██████▊   | 34/50 [05:21<02:28,  9.25s/trial, best loss: -0.4903155501803138]

Uploading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

 70%|███████   | 35/50 [05:30<02:17,  9.16s/trial, best loss: -0.4903155501803138]

Uploading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

 72%|███████▏  | 36/50 [05:39<02:09,  9.28s/trial, best loss: -0.4903155501803138]

Uploading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

 74%|███████▍  | 37/50 [05:49<02:01,  9.38s/trial, best loss: -0.4903155501803138]

Uploading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

 76%|███████▌  | 38/50 [05:58<01:51,  9.32s/trial, best loss: -0.4903155501803138]

Uploading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

 78%|███████▊  | 39/50 [06:08<01:44,  9.50s/trial, best loss: -0.4903155501803138]

Uploading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

 80%|████████  | 40/50 [06:18<01:35,  9.52s/trial, best loss: -0.4903155501803138]

Uploading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

 82%|████████▏ | 41/50 [06:27<01:25,  9.48s/trial, best loss: -0.4903155501803138]

Uploading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

 84%|████████▍ | 42/50 [06:36<01:14,  9.37s/trial, best loss: -0.4903155501803138]

Uploading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

 86%|████████▌ | 43/50 [06:46<01:06,  9.48s/trial, best loss: -0.4903155501803138]

Uploading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

 88%|████████▊ | 44/50 [06:55<00:56,  9.47s/trial, best loss: -0.4903155501803138]

Uploading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

 90%|█████████ | 45/50 [07:05<00:48,  9.61s/trial, best loss: -0.4903155501803138]

Uploading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

 92%|█████████▏| 46/50 [07:15<00:39,  9.82s/trial, best loss: -0.4903155501803138]

Uploading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

 94%|█████████▍| 47/50 [07:28<00:31, 10.51s/trial, best loss: -0.4903155501803138]

Uploading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

 96%|█████████▌| 48/50 [07:37<00:20, 10.20s/trial, best loss: -0.4903155501803138]

Uploading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

 98%|█████████▊| 49/50 [07:46<00:09,  9.87s/trial, best loss: -0.4903155501803138]

Uploading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

100%|██████████| 50/50 [07:55<00:00,  9.69s/trial, best loss: -0.4903155501803138]100%|██████████| 50/50 [07:55<00:00,  9.52s/trial, best loss: -0.4903155501803138]
Pipeline(steps=[('column_selector',
                 ColumnSelector(cols=['MAIN_ENGINE_HP',
                                      'LNGTankCapacity_Tonnes',
                                      'DEADWEIGHT_TONNAGE',
                                      'actualBunkeringTime', 'ShipType'])),
                ('preprocessor',
                 ColumnTransformer(remainder='passthrough', sparse_threshold=0,
                                   transformers=[('numerical',
                                                  Pipeline(steps=[('converter',
                                                                   FunctionTransformer(func=<function <lambda> at 0x7faef8afecb0>)),...
                                                   'DEADWEIGHT_TONNAGE']),
                                                 ('onehot',
              

### Patch pandas version in logged model

Ensures that model serving uses the same version of pandas that was used to train the model.

In [0]:
import mlflow
import os
import shutil
import tempfile
import yaml

run_id = best_result["run_id"]

# Set up a local dir for downloading the artifacts.
tmp_dir = tempfile.mkdtemp()

client = mlflow.tracking.MlflowClient()

# Everything that touches tmp_dir runs inside try/finally so the directory is
# removed even when a download, parse or upload step raises.
try:
  # Fix conda.yaml
  conda_file_path = mlflow.artifacts.download_artifacts(artifact_uri=f"runs:/{run_id}/model/conda.yaml", dst_path=tmp_dir)
  with open(conda_file_path) as f:
    # conda.yaml holds only plain mappings/lists/strings, so the safe loader suffices
    conda_libs = yaml.safe_load(f)
  # The pip requirements are expected in the last `dependencies` entry, as a {"pip": [...]} mapping
  pip_deps = conda_libs["dependencies"][-1]["pip"]
  pandas_lib_exists = any(lib.startswith("pandas==") for lib in pip_deps)
  if not pandas_lib_exists:
    print("Adding pandas dependency to conda.yaml")
    pip_deps.append(f"pandas=={pd.__version__}")

    # Overwrite the downloaded copy itself, so the file that is patched is
    # exactly the file that is uploaded back to the run
    with open(conda_file_path, "w") as f:
      f.write(yaml.dump(conda_libs))
    client.log_artifact(run_id=run_id, local_path=conda_file_path, artifact_path="model")

  # Fix requirements.txt
  venv_file_path = mlflow.artifacts.download_artifacts(artifact_uri=f"runs:/{run_id}/model/requirements.txt", dst_path=tmp_dir)
  with open(venv_file_path) as f:
    venv_libs = f.readlines()
  venv_libs = [lib.strip() for lib in venv_libs]
  pandas_lib_exists = any(lib.startswith("pandas==") for lib in venv_libs)
  if not pandas_lib_exists:
    print("Adding pandas dependency to requirements.txt")
    venv_libs.append(f"pandas=={pd.__version__}")

    with open(venv_file_path, "w") as f:
      f.write("\n".join(venv_libs))
    client.log_artifact(run_id=run_id, local_path=venv_file_path, artifact_path="model")
finally:
  shutil.rmtree(tmp_dir)

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Adding pandas dependency to conda.yaml


Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Adding pandas dependency to requirements.txt


## Inference
[The MLflow Model Registry](https://docs.databricks.com/applications/mlflow/model-registry.html) is a collaborative hub where teams can share ML models, work together from experimentation to online testing and production, integrate with approval and governance workflows, and monitor ML deployments and their performance. The snippets below show how to add the model trained in this notebook to the model registry and to retrieve it later for inference.

> **NOTE:** The `model_uri` for the model already trained in this notebook can be found in the cell below

### Register to Model Registry
```
model_name = "Example"

model_uri = f"runs:/{run_id}/model"
registered_model_version = mlflow.register_model(model_uri, model_name)
```

### Load from Model Registry
```
model_name = "Example"
model_version = registered_model_version.version

model_uri=f"models:/{model_name}/{model_version}"
model = mlflow.pyfunc.load_model(model_uri=model_uri)
model.predict(input_X)
```

### Load model without registering
```
model_uri = f"runs:/{run_id}/model"

model = mlflow.pyfunc.load_model(model_uri=model_uri)
model.predict(input_X)
```

In [0]:
#Test Predictions

import numpy as np
model = best_model

# Score the held-out test rows and keep two decimal places
predictions_test = np.round(model.predict(X_test), 2)

# Put features, ground truth, prediction and rounded residual side by side
results_test = X_test.copy()
results_test["actual"] = y_test
results_test["predicted"] = predictions_test
results_test["predicted-actual"] = np.round(results_test["predicted"] - results_test["actual"], 2)

# Write the table to a scratch CSV in the working directory
csv_path = "results_test.csv"
results_test.to_csv(csv_path, index=False)

# Re-open the best trial's run and attach the CSV under "predictions"
with mlflow.start_run(run_id=run_id):
    mlflow.log_artifact(csv_path, artifact_path="predictions")

# The scratch file is no longer needed once it has been uploaded
os.remove(csv_path)

print(f"Results saved as CSV and logged to MLflow run ID: {run_id}")

Results saved as CSV and logged to MLflow run ID: f7867c575334474f99a684a99d1f655b


In [0]:
#Train Predictions

import numpy as np
# Score the training rows with the same fitted pipeline (`model` is bound in
# the test-prediction cell above) and keep two decimal places
predictions_train = np.round(model.predict(X_train), 2)

# Put features, ground truth, prediction and rounded residual side by side
results_train = X_train.copy()
results_train["actual"] = y_train
results_train["predicted"] = predictions_train
results_train["predicted-actual"] = np.round(results_train["predicted"] - results_train["actual"], 2)


#### Calculating residual percentiles (7.5th and 92.5th) per ShipType in results_train and storing them in Confidence_Percentile_85.csv as an artifact

In [0]:
# Give the existing residual column (predicted - actual) an explicit name
results_train = results_train.rename(columns={'predicted-actual': 'ResidualsPredictedMinusActual'})

# Add the residual with the opposite sign (actual - predicted)
results_train['ResidualsActualMinusPredicted'] = results_train['actual'] - results_train['predicted']

# Per ShipType, take the 7.5th and 92.5th percentiles of the residuals — the
# bounds of the central 85% band — as one row per ShipType with one column
# per percentile
percentiles = (
    results_train
    .groupby('ShipType')['ResidualsActualMinusPredicted']
    .quantile([0.075, 0.925])
    .unstack()
    .reset_index()
)

# Replace the numeric quantile labels with descriptive column names
percentiles.columns = ['ShipType', 'Lower_85Conf', 'Upper_85Conf']

# Keep two decimal places on both bounds
percentiles = percentiles.round({'Lower_85Conf': 2, 'Upper_85Conf': 2})

# Write the percentile table to a scratch CSV in the working directory
csv_path = "Confidence_Percentile_85.csv"
percentiles.to_csv(csv_path, index=False)

# Re-open the best trial's run and attach the CSV under "predictions"
with mlflow.start_run(run_id=run_id):
    mlflow.log_artifact(csv_path, artifact_path="predictions")

# The scratch file is no longer needed once it has been uploaded
os.remove(csv_path)

print(f"Confidence_Percentile saved as CSV and logged to MLflow run ID: {run_id}")


Confidence_Percentile saved as CSV and logged to MLflow run ID: f7867c575334474f99a684a99d1f655b


In [0]:
# Write the results_train DataFrame (training-set features, actuals,
# predictions and residual columns) to a scratch CSV in the working directory
csv_path = "results_train.csv"
results_train.to_csv(csv_path, index=False)

# Re-open the best trial's run and attach the CSV under "predictions"
with mlflow.start_run(run_id=run_id):
    mlflow.log_artifact(csv_path, artifact_path="predictions")

# Remove the CSV file from the local file system
os.remove(csv_path)

print(f"Results saved as CSV and logged to MLflow run ID: {run_id}")

Results saved as CSV and logged to MLflow run ID: f7867c575334474f99a684a99d1f655b


In [0]:
# Write df_loaded to a scratch CSV. Note that df_loaded holds the rows of all
# three splits (train, val and test) together with the split marker column,
# not only the training rows.
csv_path = "training_data.csv"
df_loaded.to_csv(csv_path, index=False)

# Re-open the best trial's run and attach the CSV under "training_data"
with mlflow.start_run(run_id=run_id):
    mlflow.log_artifact(csv_path, artifact_path="training_data")

# Remove the CSV file from the local file system
os.remove(csv_path)

In [0]:
# Construct the model URI based on the run ID.
# "model" is the artifact path the pipeline was logged under in objective().
model_uri = f"runs:/{run_id}/model"

In [0]:
# Register the model under the name "DropSizeModel". As the captured output
# below shows, a name that already exists gets a new version rather than
# being replaced.
registered_model_version = mlflow.register_model(model_uri, "DropSizeModel")

Registered model 'DropSizeModel' already exists. Creating a new version of this model...
Created version '9' of model 'DropSizeModel'.
