# Test MLflow Integration

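This notebook checks the integration end to end:

- create (or reuse) an experiment
- train ElasticNet models on the wine-quality dataset
- log metrics for each run
- log and register the model artifact, then verify it can be loaded back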

In [1]:
!pip install minio mlflow==2.1.1 boto3 tenacity -q

In [2]:
import os
import warnings

import pandas as pd
import mlflow
import numpy as np

from minio import Minio
from minio.error import BucketAlreadyOwnedByYou
from mlflow.models.signature import infer_signature
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import ElasticNet
from tenacity import retry, stop_after_attempt, wait_exponential

# suppress warnings
warnings.filterwarnings("ignore")

In [3]:
# Red-wine quality CSV hosted by the UCI repository; fields are semicolon-separated.
WINE_QUALITY_CSV_URL = "http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv"

data = pd.read_csv(WINE_QUALITY_CSV_URL, sep=";")
data.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [4]:
# Display the dimensions of the loaded dataset as (rows, columns).
data.shape

(1599, 12)

In [5]:
TARGET_COLUMN = "quality"

# Seed the shuffle so the train/test partition is identical on every
# "Restart Kernel -> Run All" (the model below is already seeded with 42).
train, test = train_test_split(data, random_state=42)

# Features are every column except the target; the targets are kept as
# single-column DataFrames.
train_x = train.drop(columns=[TARGET_COLUMN])
test_x = test.drop(columns=[TARGET_COLUMN])
train_y = train[[TARGET_COLUMN]]
test_y = test[[TARGET_COLUMN]]

In [6]:
# The MinIO client takes a bare "host:port" endpoint, so drop the URL scheme.
# Splitting on "://" handles http:// and https:// alike and also tolerates a value
# with no scheme (the previous split on "http://" raised IndexError in those cases).
# A trailing slash is removed because it is not part of the host.
MINIO_HOST = os.environ["MINIO_ENDPOINT_URL"].split("://", 1)[-1].rstrip("/")
MINIO_BUCKET = "mlflow"

In [7]:
# Initialize a MinIO client.
# TLS is enabled only when the configured endpoint URL uses https://; for the usual
# http:// endpoint this evaluates to False, exactly as before.
mc = Minio(
    endpoint=MINIO_HOST,
    access_key=os.environ["AWS_ACCESS_KEY_ID"],
    secret_key=os.environ["AWS_SECRET_ACCESS_KEY"],
    secure=os.environ["MINIO_ENDPOINT_URL"].lower().startswith("https://"),
)

# Make sure the bucket exists; a bucket we already own is not an error, so the cell
# can be re-run safely.
try:
    mc.make_bucket(MINIO_BUCKET)
except BucketAlreadyOwnedByYou:
    print(f"Bucket {MINIO_BUCKET} already exists!")

Bucket mlflow already exists!


In [8]:
wine_experiment_name = "My Wine Experiment"

# Reuse the experiment when it already exists so the notebook can be re-run.
# The lookup result is deliberately not named `experiment`: a later cell defines a
# function with that name, and sharing it would make re-running this cell replace
# the function with an Experiment object.
existing_experiment = mlflow.get_experiment_by_name(wine_experiment_name)
if existing_experiment is None:
    experiment_id = mlflow.create_experiment(name=wine_experiment_name)
else:
    experiment_id = existing_experiment.experiment_id

In [9]:
# Sanity check: looking the experiment up by id must return the expected name.
stored_experiment = mlflow.get_experiment(experiment_id)
assert stored_experiment.name == wine_experiment_name, (
    f"Failed to create experiment {wine_experiment_name}!"
)

In [10]:
def experiment(alpha, l1_ratio):
    """Train an ElasticNet model inside a new MLflow run and log the results.

    Turns on scikit-learn autologging, fits the model on the notebook-level
    training split, logs RMSE, R2 and MAE computed on the test split, and logs
    the fitted model under the "model" artifact path while registering it as
    "wine-elasticnet".

    Args:
        alpha: Forwarded to ``ElasticNet`` as ``alpha``.
        l1_ratio: Forwarded to ``ElasticNet`` as ``l1_ratio``.

    Returns:
        The run object yielded by ``mlflow.start_run``.
    """
    mlflow.sklearn.autolog()

    with mlflow.start_run(run_name="wine_models", experiment_id=experiment_id) as active_run:
        mlflow.set_tag("author", "kf-testing")

        regressor = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, random_state=42)
        regressor.fit(train_x, train_y)
        predictions = regressor.predict(test_x)

        # Score the predictions on the test split, then log each value by name.
        scores = {
            "rmse": np.sqrt(mean_squared_error(test_y, predictions)),
            "r2": r2_score(test_y, predictions),
            "mae": mean_absolute_error(test_y, predictions),
        }
        for metric_name, metric_value in scores.items():
            mlflow.log_metric(metric_name, metric_value)

        # The signature records the input schema and the prediction output schema.
        model_signature = infer_signature(test_x, predictions)
        mlflow.sklearn.log_model(
            regressor,
            "model",
            signature=model_signature,
            registered_model_name="wine-elasticnet",
        )

    return active_run

In [11]:
# Launch one tracked run per (alpha, l1_ratio) combination and keep the run handles.
hyperparameters = [(0.5, 0.5), (1, 0), (0, 1)]
runs = [experiment(alpha, l1_ratio) for alpha, l1_ratio in hyperparameters]

Registered model 'wine-elasticnet' already exists. Creating a new version of this model...
2023/07/06 08:39:06 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: wine-elasticnet, version 7
Created version '7' of model 'wine-elasticnet'.
Registered model 'wine-elasticnet' already exists. Creating a new version of this model...
2023/07/06 08:39:14 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: wine-elasticnet, version 8
Created version '8' of model 'wine-elasticnet'.
Registered model 'wine-elasticnet' already exists. Creating a new version of this model...
2023/07/06 08:39:25 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: wine-elasticnet, version 9
Created version '9' of model 'wine-elasticnet'.


In [12]:
@retry(
    stop=stop_after_attempt(30),
    wait=wait_exponential(multiplier=2, min=1, max=10),
    reraise=True,
)
def assert_run_finished(client, run_id):
    """Assert that the given run has reached the FINISHED status.

    The check is retried by tenacity whenever it raises: at most 30 attempts with
    exponential back-off bounded between 1 and 10 seconds. Once the attempts are
    exhausted the last exception is re-raised as-is.

    Args:
        client: Object exposing ``get_run(run_id)``; the notebook passes the
            ``mlflow`` module.
        run_id: Identifier of the run to inspect.
    """
    run_info = client.get_run(run_id).info
    status = run_info.status
    assert status == "FINISHED", f"MLFlow run in {status} state."


def assert_has_metrics(client, run_id, metrics):
    """Check that every expected metric name was logged for the run.

    Args:
        client: Object exposing ``get_run(run_id)``; the notebook passes the
            ``mlflow`` module.
        run_id: Identifier of the run to inspect.
        metrics: Iterable of metric names that must be present.

    Raises:
        AssertionError: On the first metric name missing from the run's metrics.
    """
    logged_metrics = client.get_run(run_id).data.metrics
    for metric_name in metrics:
        assert metric_name in logged_metrics, f"Metric {metric_name} not found in logged data!"


def assert_model(client, run_id):
    """Load the model logged by the run and check that it is an ElasticNet.

    Args:
        client: Object exposing ``sklearn.load_model``; the notebook passes the
            ``mlflow`` module.
        run_id: Identifier of the run whose "model" artifact is loaded.

    Raises:
        AssertionError: If the loaded object is not an ``ElasticNet`` instance.
    """
    model_uri = f"runs:/{run_id}/model"
    loaded_model = client.sklearn.load_model(model_uri)
    assert isinstance(loaded_model, ElasticNet), f"Model {loaded_model} is not of type ElasticNet!"

In [13]:
METRICS = ["rmse", "r2", "mae"]

# Validate each tracked run in turn: it finished, it carries the expected
# metrics, and its logged model can be loaded back.
run_ids = [run.info.run_id for run in runs]
for run_id in run_ids:
    assert_run_finished(mlflow, run_id)
    assert_has_metrics(mlflow, run_id, METRICS)
    assert_model(mlflow, run_id)