In [1]:
# MLflow is used throughout this notebook to load, log and validate models.
import mlflow

# Display the installed MLflow version so the outputs below can be tied to it.
mlflow.__version__

'2.21.3'

In [2]:
from src.utils.folder_operations import get_project_root

# Point MLflow at the HTTP tracking server.
# File-store alternative: (get_project_root() / 'mlflow_new/mlruns').as_uri()
TRACKING_URI = "http://localhost:3000"
mlflow.set_tracking_uri(TRACKING_URI)

import os

# Credentials and endpoint exported to the process environment; presumably read by the
# S3-compatible artifact store client — TODO confirm.
# NOTE(review): these look like local-development values; never commit real secrets here.
os.environ.update({
    'AWS_ACCESS_KEY_ID': 'mlflow',
    'AWS_SECRET_ACCESS_KEY': 'mlflow123',
    'MLFLOW_S3_ENDPOINT_URL': 'http://localhost:9000',
})

# Signature Enforcement

In [3]:
# Run that logged the model, and the artifact path ("rfc") it was stored under.
RFC_RUN_ID = '7db904c984c04e9fb293a84dfa00a8cb'
model_uri = f'runs:/{RFC_RUN_ID}/rfc'

In [4]:
# Load the artifact through the sklearn flavor and report the class of the object returned
sk_model = mlflow.sklearn.load_model(model_uri=model_uri)
print(sk_model.__class__.__name__)

RandomForestClassifier


In [5]:
# Load the same artifact through the generic pyfunc flavor and report the wrapper class
pyfunc_model = mlflow.pyfunc.load_model(model_uri=model_uri)
print(pyfunc_model.__class__.__name__)

PyFuncModel


In [6]:
# Iris dataset, loaded as pandas objects
from pprint import pprint
from sklearn.datasets import load_iris

data = load_iris(as_frame=True)

# split the bunch into the feature frame and the target series
X, y = data.data, data.target

# Data Type Validation

In [7]:
# cast one feature column to integers to make the frame invalid for the logged model
sepal_length_as_int = X['sepal length (cm)'].astype(int)
X_invalid = X.assign(**{'sepal length (cm)': sepal_length_as_int})
X_invalid.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 4 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   sepal length (cm)  150 non-null    int64  
 1   sepal width (cm)   150 non-null    float64
 2   petal length (cm)  150 non-null    float64
 3   petal width (cm)   150 non-null    float64
dtypes: float64(3), int64(1)
memory usage: 4.8 KB


In [8]:
# The natively loaded sklearn model accepts the integer-typed column and returns predictions.
sk_model.predict(X_invalid)  # this will work fine

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [9]:
# Predict with the pyfunc model: the integer column fails MLflow's schema enforcement.
# Only MlflowException (the error pyfunc raises for schema violations) is caught, so an
# unrelated failure still surfaces instead of being printed as if it were expected.
try:
    pyfunc_model.predict(X_invalid)  # this will raise an error
except mlflow.exceptions.MlflowException as e:
    print(e)

Failed to enforce schema of data '     sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)
0                    5               3.5                1.4               0.2
1                    4               3.0                1.4               0.2
2                    4               3.2                1.3               0.2
3                    4               3.1                1.5               0.2
4                    5               3.6                1.4               0.2
..                 ...               ...                ...               ...
145                  6               3.0                5.2               2.3
146                  6               2.5                5.0               1.9
147                  6               3.0                5.2               2.0
148                  6               3.4                5.4               2.3
149                  5               3.0                5.1               1.8

[150 rows x 4 columns]' with 

# Schema Validation

In [10]:
# move one feature to a column with an unknown name to make the frame invalid
sepal_length = X['sepal length (cm)']
X_invalid = (
    X.drop(columns=['sepal length (cm)'])
     .assign(**{'sepal length (cm) invalid': sepal_length})
)
X_invalid.head()

Unnamed: 0,sepal width (cm),petal length (cm),petal width (cm),sepal length (cm) invalid
0,3.5,1.4,0.2,5.1
1,3.0,1.4,0.2,4.9
2,3.2,1.3,0.2,4.7
3,3.1,1.5,0.2,4.6
4,3.6,1.4,0.2,5.0


In [11]:
# Predict with the sklearn model: sklearn's own feature-name check rejects the frame.
# Only ValueError (the type sklearn raises here) is caught, so any other failure is
# not silently printed as if it were the expected outcome.
try:
    sk_model.predict(X_invalid)  # this will not work due to sklearn validation
except ValueError as e:
    print(e)

The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- sepal length (cm) invalid
Feature names seen at fit time, yet now missing:
- sepal length (cm)



In [12]:
# Show that the validation comes from sklearn: print the full traceback, whose frames
# point at the library that raised. The error is caught so that "Restart & Run All"
# does not stop at this cell.
import traceback

try:
    sk_model.predict(X_invalid)
except ValueError:
    traceback.print_exc()

ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- sepal length (cm) invalid
Feature names seen at fit time, yet now missing:
- sepal length (cm)


In [13]:
# Predict with the pyfunc model: the unknown column name fails MLflow's schema enforcement.
# Only MlflowException (the error pyfunc raises for schema violations) is caught, so an
# unrelated failure still surfaces instead of being printed as if it were expected.
try:
    pyfunc_model.predict(X_invalid)  # this will raise an error due to mlflow validation
except mlflow.exceptions.MlflowException as e:
    print(e)

Failed to enforce schema of data '     sepal width (cm)  petal length (cm)  petal width (cm)  \
0                 3.5                1.4               0.2   
1                 3.0                1.4               0.2   
2                 3.2                1.3               0.2   
3                 3.1                1.5               0.2   
4                 3.6                1.4               0.2   
..                ...                ...               ...   
145               3.0                5.2               2.3   
146               2.5                5.0               1.9   
147               3.0                5.2               2.0   
148               3.4                5.4               2.3   
149               3.0                5.1               1.8   

     sepal length (cm) invalid  
0                          5.1  
1                          4.9  
2                          4.7  
3                          4.6  
4                          5.0  
..                     

# Working with Optional Columns

In [14]:
from mlflow.models import ModelSignature
from mlflow.types.schema import Schema
from mlflow.types.schema import ColSpec

In [15]:
# The input schema is built from ColSpec objects: one required double column per iris feature
col_specifications = list(
    ColSpec(type="double", name=feature_name, required=True)
    for feature_name in data.feature_names
)

# plus one extra double column that is marked as not required
optional_column = [ColSpec(type="double", name="optional_column", required=False)]

# schema for the model input: required features followed by the optional column
model_input = Schema(inputs=[*col_specifications, *optional_column])

In [16]:
# Output schema: a single required integer column named "species"
species_spec = ColSpec(type="integer", name="species", required=True)
model_output = Schema(inputs=[species_spec])

In [17]:
# Combine the input and output schemas into a signature and pretty-print its dict form
model_signature = ModelSignature(outputs=model_output, inputs=model_input)
signature_as_dict = model_signature.to_dict()
pprint(signature_as_dict, indent=2)

{ 'inputs': '[{"type": "double", "name": "sepal length (cm)", "required": '
            'true}, {"type": "double", "name": "sepal width (cm)", "required": '
            'true}, {"type": "double", "name": "petal length (cm)", '
            '"required": true}, {"type": "double", "name": "petal width (cm)", '
            '"required": true}, {"type": "double", "name": "optional_column", '
            '"required": false}]',
  'outputs': '[{"type": "integer", "name": "species", "required": true}]',
  'params': None}


In [18]:
class CustomModel(mlflow.pyfunc.PythonModel):
    """Pass-through pyfunc model whose ``predict`` returns its input unchanged."""

    def predict(self, context, model_input):
        """Return ``model_input`` as-is; more logic can be added here."""
        return model_input
    
# Log the custom model together with the hand-built signature
with mlflow.start_run(run_name="custom-model-with-optional-inputs") as run:
    mlflow.pyfunc.log_model(
        python_model=CustomModel(),
        artifact_path="custom_model",
        signature=model_signature,
    )

# Reload the logged model from that run through the pyfunc flavor
custom_model_uri = f"runs:/{run.info.run_id}/custom_model"
custom_model = mlflow.pyfunc.load_model(custom_model_uri)



🏃 View run custom-model-with-optional-inputs at: http://localhost:3000/#/experiments/0/runs/8b4a8b25d6a240ac82310503d33b04a7
🧪 View experiment at: http://localhost:3000/#/experiments/0


In [19]:
# New frame: X plus the optional column, filled with 100 x petal length
X_with_optional_column = X.assign(optional_column=X["petal length (cm)"] * 100)
X_with_optional_column.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),optional_column
0,5.1,3.5,1.4,0.2,140.0
1,4.9,3.0,1.4,0.2,140.0
2,4.7,3.2,1.3,0.2,130.0
3,4.6,3.1,1.5,0.2,150.0
4,5.0,3.6,1.4,0.2,140.0


In [20]:
# The frame holds every required column plus the optional one; the pass-through model
# returns it unchanged, as the output shows.
custom_model.predict(X_with_optional_column)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),optional_column
0,5.1,3.5,1.4,0.2,140.0
1,4.9,3.0,1.4,0.2,140.0
2,4.7,3.2,1.3,0.2,130.0
3,4.6,3.1,1.5,0.2,150.0
4,5.0,3.6,1.4,0.2,140.0
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,520.0
146,6.3,2.5,5.0,1.9,500.0
147,6.5,3.0,5.2,2.0,520.0
148,6.2,3.4,5.4,2.3,540.0


In [21]:
# New frame without one of the columns the signature marks as required
X_with_missing_column = X.drop(columns=["sepal length (cm)"])
X_with_missing_column.head()

Unnamed: 0,sepal width (cm),petal length (cm),petal width (cm)
0,3.5,1.4,0.2
1,3.0,1.4,0.2
2,3.2,1.3,0.2
3,3.1,1.5,0.2
4,3.6,1.4,0.2


In [22]:
# A required column is missing, so MLflow's schema enforcement rejects the input.
# Only MlflowException (the error pyfunc raises for schema violations) is caught, so an
# unrelated failure still surfaces instead of being printed as if it were expected.
try:
    custom_model.predict(X_with_missing_column)  # this will raise an error
except mlflow.exceptions.MlflowException as e:
    print(e)

Failed to enforce schema of data '     sepal width (cm)  petal length (cm)  petal width (cm)
0                 3.5                1.4               0.2
1                 3.0                1.4               0.2
2                 3.2                1.3               0.2
3                 3.1                1.5               0.2
4                 3.6                1.4               0.2
..                ...                ...               ...
145               3.0                5.2               2.3
146               2.5                5.0               1.9
147               3.0                5.2               2.0
148               3.4                5.4               2.3
149               3.0                5.1               1.8

[150 rows x 3 columns]' with schema '['sepal length (cm)': double (required), 'sepal width (cm)': double (required), 'petal length (cm)': double (required), 'petal width (cm)': double (required), 'optional_column': double (optional)]'. Error: Model is missi

# Infer Model Signature

In [23]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import load_iris
from mlflow.models.signature import infer_signature
from pprint import pprint

In [24]:
# Load iris dataset
data = load_iris(as_frame=True)
X = data.data
# Name the target column "species" so the inferred output schema carries that name.
# rename() returns a new Series, so data.target itself is not mutated in place.
y = data.target.rename("species")

# Create a signature for the model from example inputs and outputs
model_signature = infer_signature(model_input=X, model_output=y)

pprint(model_signature.to_dict(), indent=2)

{ 'inputs': '[{"type": "double", "name": "sepal length (cm)", "required": '
            'true}, {"type": "double", "name": "sepal width (cm)", "required": '
            'true}, {"type": "double", "name": "petal length (cm)", '
            '"required": true}, {"type": "double", "name": "petal width (cm)", '
            '"required": true}]',
  'outputs': '[{"type": "long", "name": "species", "required": true}]',
  'params': None}




In [25]:
# Fix the seed so the fitted tree (and therefore the logged model) is reproducible:
# split ties are otherwise broken at random from run to run.
dtc = DecisionTreeClassifier(random_state=42)
dtc.fit(X, y)

# Log the model together with the inferred signature
with mlflow.start_run(run_name="log-decision-tree-classifier") as run:
    mlflow.sklearn.log_model(
        sk_model=dtc,
        artifact_path='model',
        signature=model_signature
    )



🏃 View run log-decision-tree-classifier at: http://localhost:3000/#/experiments/0/runs/f20b9f7711a54d619dfec5e8630a86f4
🧪 View experiment at: http://localhost:3000/#/experiments/0


# Optional fields when working with infer_signature

In [26]:
# Recap of the frame that carries the extra optional_column, used as the example input below.
X_with_optional_column.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),optional_column
0,5.1,3.5,1.4,0.2,140.0
1,4.9,3.0,1.4,0.2,140.0
2,4.7,3.2,1.3,0.2,130.0
3,4.6,3.1,1.5,0.2,150.0
4,5.0,3.6,1.4,0.2,140.0


In [27]:
# Infer a signature from a frame in which optional_column is populated for every row
infer_signature_with_optional_column = infer_signature(
    model_input=X_with_optional_column,
    model_output=y,
)
pprint(infer_signature_with_optional_column.to_dict(), indent=2)

{ 'inputs': '[{"type": "double", "name": "sepal length (cm)", "required": '
            'true}, {"type": "double", "name": "sepal width (cm)", "required": '
            'true}, {"type": "double", "name": "petal length (cm)", '
            '"required": true}, {"type": "double", "name": "petal width (cm)", '
            '"required": true}, {"type": "double", "name": "optional_column", '
            '"required": true}]',
  'outputs': '[{"type": "long", "name": "species", "required": true}]',
  'params': None}




In [28]:
# First row as a plain dict, with the optional column's value replaced by None
first_row = X_with_optional_column.iloc[0].to_dict()
X_with_optional_dict = {**first_row, 'optional_column': None}
print(X_with_optional_dict)

{'sepal length (cm)': 5.1, 'sepal width (cm)': 3.5, 'petal length (cm)': 1.4, 'petal width (cm)': 0.2, 'optional_column': None}


In [29]:
# Re-infer the signature from the single-row dict whose optional_column is None.
# NOTE(review): this rebinds infer_signature_with_optional_column (previously the
# DataFrame-based signature) and pairs a one-row input with the full target series y —
# confirm that is intended.
infer_signature_with_optional_column = infer_signature(model_input=X_with_optional_dict, model_output=y)

In [30]:
# With None as the example value, optional_column is reported as type "any" and not required.
pprint(infer_signature_with_optional_column.to_dict(), indent=2)

{ 'inputs': '[{"type": "double", "name": "sepal length (cm)", "required": '
            'true}, {"type": "double", "name": "sepal width (cm)", "required": '
            'true}, {"type": "double", "name": "petal length (cm)", '
            '"required": true}, {"type": "double", "name": "petal width (cm)", '
            '"required": true}, {"type": "any", "name": "optional_column", '
            '"required": false}]',
  'outputs': '[{"type": "long", "name": "species", "required": true}]',
  'params': None}
