In [1]:
%load_ext autoreload
%autoreload 2

# MLflow Classification Recipe Notebook

This notebook runs the MLflow Classification Recipe on Databricks and inspects its results. For more information about the MLflow Classification Recipe, including usage examples, see the [Classification Recipe overview documentation](https://mlflow.org/docs/latest/recipes.html#classification-recipe) and the [Classification Recipe API documentation](https://mlflow.org/docs/latest/python_api/mlflow.recipes.html#module-mlflow.recipes.classification.v1.recipe).

In [2]:
# Build the recipe object used by every cell below, selecting the "local" profile.
from mlflow.recipes import Recipe

r = Recipe(profile="local")


2023/09/30 11:31:25 INFO mlflow.recipes.recipe: Creating MLflow Recipe 'mlflow' with profile: 'local'


In [3]:
# NOTE(review): presumably discards previously cached step outputs so the runs
# below start from scratch — confirm against the MLflow Recipe.clean() docs.
r.clean()

In [4]:
# Inspect the recipe as a whole (no step name is passed).
# NOTE(review): presumably renders an overview of the recipe's steps — confirm in the MLflow docs.
r.inspect()

In [5]:
# Run the "ingest" step; its log reports the dataset CSV being loaded with pandas.read_csv().
r.run("ingest")

2023/09/30 11:31:29 INFO mlflow.recipes.step: Running step ingest...
Loading dataset CSV using `pandas.read_csv()` with default arguments and assumed index column 0 which may not produce the desired schema. If the schema is not correct, you can adjust it by modifying the `load_file_as_dataframe()` function in `steps/ingest.py`
Loading dataset CSV using `pandas.read_csv()` with default arguments and assumed index column 0 which may not produce the desired schema. If the schema is not correct, you can adjust it by modifying the `load_file_as_dataframe()` function in `steps/ingest.py`


name,type
fixed acidity,number
volatile acidity,number
citric acid,number
residual sugar,number
chlorides,number
free sulfur dioxide,number
total sulfur dioxide,number
density,number
pH,number
sulphates,number

fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,is_red
7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,1
7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,1
7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,1
11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6,1
7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,1


In [9]:
# Perform some EDA on the ingested dataset: draw one box plot per column,
# grouped by the `is_red` label. seaborn must be installed in the kernel
# environment for this cell to run.
import math

import matplotlib.pyplot as plt
import seaborn as sns

ingested_data = r.get_artifact("ingested_data")

# Box plots cannot be used on indicator variables, so the label column itself
# is excluded from the set of plotted columns.
plot_cols = [col for col in ingested_data.columns if col != "is_red"]

# Size the grid from the data rather than hard-coding 3x4: a fixed grid raises
# IndexError as soon as the dataset has more plottable columns than panels.
n_grid_cols = 4
n_grid_rows = max(1, math.ceil(len(plot_cols) / n_grid_cols))
dims = (n_grid_rows, n_grid_cols)

# 5 inches of height per row reproduces the original 25x15 figure for three rows.
# squeeze=False keeps `axes` two-dimensional even when there is a single row.
f, axes = plt.subplots(dims[0], dims[1], figsize=(25, 5 * dims[0]), squeeze=False)

# Fill the panels in row-major order, one column per panel.
flat_axes = list(axes.flat)
for col, ax in zip(plot_cols, flat_axes):
    sns.boxplot(x=ingested_data["is_red"], y=ingested_data[col], ax=ax)

# Hide panels left over when the column count is not a multiple of the grid width.
for ax in flat_axes[len(plot_cols):]:
    ax.set_axis_off()

ModuleNotFoundError: No module named 'seaborn'

In [10]:
# Run the "split" step; per the log, the unchanged ingest step is skipped rather than re-run.
r.run("split")

2023/09/30 11:22:36 INFO mlflow.recipes.utils.execution: ingest: No changes. Skipping.


Run MLFlow Recipe step: split

* 'schema_extra' has been renamed to 'json_schema_extra'
2023/09/30 11:22:37 INFO mlflow.recipes.step: Running step split...

* 'schema_extra' has been renamed to 'json_schema_extra'

* 'schema_extra' has been renamed to 'json_schema_extra'

* 'schema_extra' has been renamed to 'json_schema_extra'

* 'schema_extra' has been renamed to 'json_schema_extra'


In [11]:
# Run the "transform" step; per the log, the unchanged ingest and split steps are skipped.
r.run("transform")

2023/09/30 11:23:24 INFO mlflow.recipes.utils.execution: ingest, split: No changes. Skipping.


Run MLFlow Recipe step: transform

* 'schema_extra' has been renamed to 'json_schema_extra'
2023/09/30 11:23:25 INFO mlflow.recipes.step: Running step transform...


Name,Type
fixed acidity,float64
volatile acidity,float64
citric acid,float64
residual sugar,float64
chlorides,float64
free sulfur dioxide,float64
total sulfur dioxide,float64
density,float64
pH,float64
sulphates,float64

Name,Type
fixed acidity,float64
volatile acidity,float64
citric acid,float64
residual sugar,float64
chlorides,float64
free sulfur dioxide,float64
total sulfur dioxide,float64
density,float64
pH,float64
sulphates,float64

fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,is_red
7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6,0
6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6,0
8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6,0
7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6,0
8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6,0


In [12]:
# Run the "train" step; its output reports training vs. validation metrics
# (f1, accuracy, precision, recall, confusion-matrix counts) for the fitted model.
r.run("train")

2023/09/30 11:23:46 INFO mlflow.recipes.utils.execution: ingest, split, transform: No changes. Skipping.


Run MLFlow Recipe step: train

* 'schema_extra' has been renamed to 'json_schema_extra'
2023/09/30 11:23:47 INFO mlflow.recipes.step: Running step train...
2023/09/30 11:23:48 INFO mlflow.store.db.utils: Creating initial MLflow database tables...
2023/09/30 11:23:48 INFO mlflow.store.db.utils: Updating database tables
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
INFO  [alembic.runtime.migration] Running upgrade  -> 451aebb31d03, add metric step
INFO  [alembic.runtime.migration] Running upgrade 451aebb31d03 -> 90e64c465722, migrate user column to tags
INFO  [alembic.runtime.migration] Running upgrade 90e64c465722 -> 181f10493468, allow nulls for metric values
INFO  [alembic.runtime.migration] Running upgrade 181f10493468 -> df50e92ffc5e, Add Experiment Tags Table
INFO  [alembic.runtime.migration] Running upgrade df50e92ffc5e -> 7ac759974ad8, Update run tags with larger limit
INFO  [alembic.runtime.migrati

Metric,training,validation
f1_score,0.903323,0.891089
accuracy_score,0.941014,0.945455
example_count,4340.0,605.0
false_negatives,106.0,11.0
false_positives,150.0,22.0
precision_score,0.888559,0.859873
recall_score,0.918587,0.924658
score,0.941014,0.945455
true_negatives,2888.0,437.0
true_positives,1196.0,135.0

Name,Type
fixed acidity,double
volatile acidity,double
citric acid,double
residual sugar,double
chlorides,double
free sulfur dioxide,double
total sulfur dioxide,double
density,double
pH,double
sulphates,double

Name,Type
-,"Tensor('int64', (-1,))"

absolute_error,prediction,is_red,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
True,1,0,11.8,0.23,0.38,11.1,0.034,15.0,123.0,0.9997,2.93,0.55,9.7,3
True,0,1,7.3,0.45,0.36,5.9,0.074,12.0,87.0,0.9978,3.33,0.83,10.5,5
True,0,1,8.6,0.38,0.36,3.0,0.081,30.0,119.0,0.997,3.2,0.56,9.4,5
True,0,1,6.5,0.39,0.23,8.3,0.051,28.0,91.0,0.9952,3.44,0.55,12.1,6
True,0,1,5.6,0.31,0.37,1.4,0.074,12.0,96.0,0.9954,3.32,0.58,9.2,5
True,0,1,6.6,0.735,0.02,7.9,0.122,68.0,124.0,0.9994,3.47,0.53,9.9,5
True,1,0,7.1,0.32,0.4,1.5,0.034,13.0,84.0,0.9944,3.42,0.6,10.4,5
True,0,1,4.6,0.52,0.15,2.1,0.054,8.0,65.0,0.9934,3.9,0.56,13.1,4
True,0,1,5.1,0.585,0.0,1.7,0.044,14.0,86.0,0.99264,3.56,0.94,12.9,7
True,0,1,7.2,0.34,0.32,2.5,0.09,43.0,113.0,0.9966,3.32,0.79,11.1,5

Unnamed: 0,Latest
Model Rank,> 0
f1_score,0.891089
accuracy_score,0.945455
false_negatives,11
false_positives,22
log_loss,
precision_score,0.859873
recall_score,0.924658
roc_auc,
true_negatives,437


In [13]:
# Run the "evaluate" step; its output compares validation vs. test metrics and
# checks f1/precision/recall against their configured thresholds.
r.run("evaluate")

2023/09/30 11:24:12 INFO mlflow.recipes.utils.execution: ingest, split, transform, train: No changes. Skipping.


Run MLFlow Recipe step: evaluate

* 'schema_extra' has been renamed to 'json_schema_extra'
2023/09/30 11:24:13 INFO mlflow.recipes.step: Running step evaluate...
2023/09/30 11:24:14 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator.
2023/09/30 11:24:14 INFO mlflow.models.evaluation.default_evaluator: The evaluation dataset is inferred as binary dataset, positive label is 1, negative label is 0.
2023/09/30 11:24:14 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator.
2023/09/30 11:24:14 INFO mlflow.models.evaluation.default_evaluator: The evaluation dataset is inferred as binary dataset, positive label is 1, negative label is 0.


Metric,validation,test
f1_score,0.891089,0.866873
accuracy_score,0.945455,0.930421
example_count,605.0,618.0
false_negatives,11.0,11.0
false_positives,22.0,32.0
precision_score,0.859873,0.813953
recall_score,0.924658,0.927152
score,0.945455,0.930421
true_negatives,437.0,435.0
true_positives,135.0,140.0

metric,greater_is_better,value,threshold,validated
f1_score,True,0.866873,0.6,✅
precision_score,True,0.813953,0.6,✅
recall_score,True,0.927152,0.6,✅


In [14]:
# Run the "register" step; the log shows the model being registered as
# 'red_wine_classifier' and a new model version being created.
r.run("register")

2023/09/30 11:24:21 INFO mlflow.recipes.utils.execution: ingest, split, transform, train, evaluate: No changes. Skipping.


Run MLFlow Recipe step: register

* 'schema_extra' has been renamed to 'json_schema_extra'
2023/09/30 11:24:22 INFO mlflow.recipes.step: Running step register...
Successfully registered model 'red_wine_classifier'.
2023/09/30 11:24:22 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: red_wine_classifier, version 1
Created version '1' of model 'red_wine_classifier'.


In [15]:
# Inspect the "train" step: displays the same metric, schema and worst-prediction
# tables that r.run("train") produced.
r.inspect("train")

Metric,training,validation
f1_score,0.903323,0.891089
accuracy_score,0.941014,0.945455
example_count,4340.0,605.0
false_negatives,106.0,11.0
false_positives,150.0,22.0
precision_score,0.888559,0.859873
recall_score,0.918587,0.924658
score,0.941014,0.945455
true_negatives,2888.0,437.0
true_positives,1196.0,135.0

Name,Type
fixed acidity,double
volatile acidity,double
citric acid,double
residual sugar,double
chlorides,double
free sulfur dioxide,double
total sulfur dioxide,double
density,double
pH,double
sulphates,double

Name,Type
-,"Tensor('int64', (-1,))"

absolute_error,prediction,is_red,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
True,1,0,11.8,0.23,0.38,11.1,0.034,15.0,123.0,0.9997,2.93,0.55,9.7,3
True,0,1,7.3,0.45,0.36,5.9,0.074,12.0,87.0,0.9978,3.33,0.83,10.5,5
True,0,1,8.6,0.38,0.36,3.0,0.081,30.0,119.0,0.997,3.2,0.56,9.4,5
True,0,1,6.5,0.39,0.23,8.3,0.051,28.0,91.0,0.9952,3.44,0.55,12.1,6
True,0,1,5.6,0.31,0.37,1.4,0.074,12.0,96.0,0.9954,3.32,0.58,9.2,5
True,0,1,6.6,0.735,0.02,7.9,0.122,68.0,124.0,0.9994,3.47,0.53,9.9,5
True,1,0,7.1,0.32,0.4,1.5,0.034,13.0,84.0,0.9944,3.42,0.6,10.4,5
True,0,1,4.6,0.52,0.15,2.1,0.054,8.0,65.0,0.9934,3.9,0.56,13.1,4
True,0,1,5.1,0.585,0.0,1.7,0.044,14.0,86.0,0.99264,3.56,0.94,12.9,7
True,0,1,7.2,0.34,0.32,2.5,0.09,43.0,113.0,0.9966,3.32,0.79,11.1,5

Unnamed: 0,Latest
Model Rank,> 0
f1_score,0.891089
accuracy_score,0.945455
false_negatives,11
false_positives,22
log_loss,
precision_score,0.859873
recall_score,0.924658
roc_auc,
true_negatives,437


In [None]:
# Fetch the "training_data" artifact and display its summary via describe().
# NOTE(review): presumably a pandas DataFrame, like the ingested data — confirm.
training_data = r.get_artifact("training_data")
training_data.describe()

In [None]:
# Fetch the "model" artifact from the recipe and print its text representation.
trained_model = r.get_artifact("model")
print(trained_model)