<a href="https://colab.research.google.com/github/alexandergribenchenko/Data_Science_Toolkit/blob/main/mlflow/NB_01_mlflow_complete_sklearn_pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 01. Libraries

In [None]:
import logging
import os
import sys
import warnings
from urllib.parse import urlparse

In [None]:
import numpy as np
import pandas as pd
from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

import mlflow
import mlflow.sklearn
from mlflow.models import infer_signature

In [None]:
import requests
import json

In [None]:
# Silence every warning category for the rest of the session and seed NumPy's
# global RNG so that code drawing from it is repeatable across runs.
warnings.simplefilter("ignore")
np.random.seed(40)

In [None]:
# Root logging is configured at WARNING level; `logger` is this notebook's
# own named logger, displayed as the cell output.
logging.basicConfig(level=logging.WARNING)
logger = logging.getLogger(__name__)
logger



# 02. Credentials

In [None]:
# Workspace host and access token are read from the environment so that no
# secret is stored in the notebook or its version history.
#   DATABRICKS_INSTANCE  workspace hostname without scheme (optional; the
#                        default below is the workspace this notebook targets)
#   DATABRICKS_TOKEN     personal access token sent as the Bearer credential
# NOTE(review): any token literal that was ever committed in this cell should
# be treated as leaked and revoked.
databricks_instance = os.environ.get(
    'DATABRICKS_INSTANCE', 'adb-5365071268183879.19.azuredatabricks.net'
)
databricks_key = os.environ.get('DATABRICKS_TOKEN', '')
if not databricks_key:
    # Only the REST calls (endpoint creation / invocation) use the token, so
    # warn instead of raising and let the training cells run without it.
    print(
        'DATABRICKS_TOKEN is not set; Databricks REST calls will not be authenticated.',
        file=sys.stderr,
    )

# 02. Input dataset

In [None]:
# Red-wine quality CSV taken from the test datasets of the MLflow repository
# (master branch, so the file is not pinned to a specific revision).
csv_url = 'https://raw.githubusercontent.com/mlflow/mlflow/master/tests/datasets/winequality-red.csv'

In [None]:
# The file is read with a semicolon as the field separator.
data = pd.read_csv(csv_url, sep=";")

The dataset contains physicochemical measurements of red wines; the prediction target is the `quality` column.

In [None]:
# Display the loaded frame: eleven feature columns plus the `quality` target.
data

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
1,7.8,0.880,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5
2,7.8,0.760,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5
3,11.2,0.280,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6
4,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
...,...,...,...,...,...,...,...,...,...,...,...,...
1594,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5,5
1595,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2,6
1596,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,6
1597,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2,5


# 03. Customized functions

In [None]:
def eval_metrics(actual, pred):
    """Score regression predictions against the true values.

    Args:
        actual: Ground-truth target values.
        pred: Predicted values, aligned with ``actual``.

    Returns:
        A ``(rmse, mae, r2)`` tuple: root of the mean squared error, mean
        absolute error and R² score, in that order.
    """
    mse = mean_squared_error(actual, pred)
    return (
        np.sqrt(mse),
        mean_absolute_error(actual, pred),
        r2_score(actual, pred),
    )

# 04. Train-Test Split

In [None]:
# Split with scikit-learn's default proportions; no random_state is passed here.
train, test = train_test_split(data)

# "quality" is the column to predict (a score in [3, 9]); every other column
# is a feature. The targets are kept as one-column DataFrames.
train_x, train_y = train.drop(columns=["quality"]), train[["quality"]]
test_x, test_y = test.drop(columns=["quality"]), test[["quality"]]

# 05. Model

## 05.01. Instantiating the model

In [None]:
# Hyperparameters handed to ElasticNet below; the same values are later
# printed with the metrics and logged to MLflow.
alpha = 0.1
l1_ratio = 0.9

In [None]:
# Build the (not yet fitted) estimator from the hyperparameters defined above.
lr = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, random_state=42)

## 05.02. (mlflow: register experiments) Training the model

In [None]:
# Fit on the training features; train_y is the one-column "quality" DataFrame.
lr.fit(train_x, train_y)

## 05.03. Predicting with the model

In [None]:
# Predict quality for the held-out rows; these predictions are scored in the next section.
predicted_qualities = lr.predict(test_x)

## 05.04. Evaluating the model

In [None]:
# Score the hold-out predictions and report the three metrics, one per line.
rmse, mae, r2 = eval_metrics(test_y, predicted_qualities)

print(
    f"Elasticnet model (alpha={alpha:f}, l1_ratio={l1_ratio:f}):",
    f"  RMSE: {rmse}",
    f"  MAE: {mae}",
    f"  R2: {r2}",
    sep="\n",
)

Elasticnet model (alpha=0.100000, l1_ratio=0.900000):
  RMSE: 0.735314956888905
  MAE: 0.566974647785579
  R2: 0.23390870203034675


## 05.05. (mlflow: register params and metrics) Logging params and metrics in mlflow

In [None]:
# Record the hyperparameters, then the hold-out metrics, in MLflow.
for param_name, param_value in (("alpha_mio", alpha), ("l1_ratio_mio", l1_ratio)):
    mlflow.log_param(param_name, param_value)

for metric_name, metric_value in (("rmse_mio", rmse), ("r2_mio", r2), ("mae_mio", mae)):
    mlflow.log_metric(metric_name, metric_value)

## 05.06. Infer model signature to register in mlflow

In [None]:
# The signature (input/output schema) is inferred from the training features
# and the model's predictions on those same rows.
predictions = lr.predict(train_x)
signature = infer_signature(train_x, predictions)

## 05.06. (mlflow: register model) Registering model in mlflow

In [None]:
# Log the fitted estimator as 'model' together with its signature and register
# it under the name "ElasticnetWineModel"; the returned object is displayed.
mlflow.sklearn.log_model(
    lr,
    'model',
    registered_model_name="ElasticnetWineModel",
    signature=signature,
)

Successfully registered model 'ElasticnetWineModel'.
2023/11/15 22:31:31 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: ElasticnetWineModel, version 1
Created version '1' of model 'ElasticnetWineModel'.


<mlflow.models.model.ModelInfo at 0x7f83c9fcbd00>

## 05.07. (mlflow: deploying model) Serving endpoint in mlflow

In [None]:
# Create a Databricks model-serving endpoint for the registered model through
# the workspace REST API.
url = f'https://{databricks_instance}/api/2.0/serving-endpoints/'

headers = {
    'Authorization': f'Bearer {databricks_key}',
    'Content-Type': 'application/json',
}

payload = {
    # The name is spelled exactly as deployed ("Endponint"); the invocation
    # cells later in the notebook address the endpoint by this same string.
    "name": "Endponint_ElasticnetWineModel",
    "config": {
        "served_models": [{
            "model_name": "ElasticnetWineModel",
            # Must refer to an existing version of the registered model.
            "model_version": "2",
            "workload_size": "Small",
            "scale_to_zero_enabled": True
        }]
    }
}

# A timeout keeps the cell from hanging indefinitely if the workspace is unreachable.
response = requests.post(url, headers=headers, json=payload, timeout=60)

# Print the response
print(response.status_code)
try:
    print(response.json())
except ValueError:
    # Error responses are not guaranteed to be JSON; show the raw body rather
    # than hiding the status behind a decode traceback.
    print(response.text)

200
{'name': 'Endponint_ElasticnetWineModel', 'creator': 'alexander.ortega@level60consulting.com', 'creation_timestamp': 1699994437000, 'last_updated_timestamp': 1699994437000, 'state': {'ready': 'NOT_READY', 'config_update': 'IN_PROGRESS'}, 'pending_config': {'start_time': 1699994437000, 'served_models': [{'name': 'ElasticnetWineModel-2', 'model_name': 'ElasticnetWineModel', 'model_version': '2', 'workload_size': 'Small', 'scale_to_zero_enabled': True, 'workload_type': 'CPU', 'state': {'deployment': 'DEPLOYMENT_CREATING', 'deployment_state_message': 'Creating resources for served model.'}, 'creator': 'alexander.ortega@level60consulting.com', 'creation_timestamp': 1699994437000}], 'config_version': 1, 'traffic_config': {'routes': [{'served_model_name': 'ElasticnetWineModel-2', 'traffic_percentage': 100, 'served_entity_name': 'ElasticnetWineModel-2'}]}}, 'id': 'de3babf238d54dffb9daa61b353e08bf', 'permission_level': 'CAN_MANAGE', 'route_optimized': False}


# 06. Testing existing Endpoint

## 06.01. Python alternative

In [None]:
# Three sample wines in the "dataframe_split" layout: the column names once,
# followed by the row-wise values in the same column order.
contenido_json = {
    "dataframe_split": {
        "columns": [
            "fixed acidity", "volatile acidity", "citric acid", "residual sugar",
            "chlorides", "free sulfur dioxide", "total sulfur dioxide", "density",
            "pH", "sulphates", "alcohol",
        ],
        "data": [
            [7.4, 0.7, 0.0, 1.9, 0.076, 11.0, 34.0, 0.9978, 3.51, 0.56, 9.4],
            [6.7, 0.8, 0.12, 2.0, 0.064, 11.0, 22.0, 0.9956, 3.4, 0.63, 10.0],
            [8.0, 0.45, 0.23, 2.2, 0.094, 16.0, 39.0, 0.9972, 3.29, 0.54, 9.5],
        ],
    }
}

# Same samples as a DataFrame, shown as the cell output.
_split = contenido_json["dataframe_split"]
df = pd.DataFrame(data=_split["data"], columns=_split["columns"])
df

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4
1,6.7,0.8,0.12,2.0,0.064,11.0,22.0,0.9956,3.4,0.63,10.0
2,8.0,0.45,0.23,2.2,0.094,16.0,39.0,0.9972,3.29,0.54,9.5


In [None]:
# Name of the serving endpoint to invoke. The spelling "Endponint" is part of
# the deployed endpoint's name, so it must be kept exactly as is.
endpoint_to_query = 'Endponint_ElasticnetWineModel'

In [None]:
# Bearer-token headers and the invocation URL of the selected serving endpoint.
headers = {
    'Authorization': f'Bearer {databricks_key}',
    'Content-Type': 'application/json',
}

url = f'https://{databricks_instance}/serving-endpoints/{endpoint_to_query}/invocations'


def create_tf_serving_json(data):
    """Wrap array-like input in an ``{'inputs': ...}`` request body.

    Args:
        data: Either a dict whose values expose ``.tolist()`` (converted
            value by value, keys preserved) or a single object exposing
            ``.tolist()``.

    Returns:
        A dict with the single key ``'inputs'`` holding plain Python lists.
    """
    if isinstance(data, dict):
        inputs = {key: value.tolist() for key, value in data.items()}
    else:
        inputs = data.tolist()
    return {'inputs': inputs}
# DataFrames are sent in "dataframe_split" form; anything else goes through
# the 'inputs' wrapper. The result is serialized to the JSON request body.
if isinstance(df, pd.DataFrame):
    ds_dict = {'dataframe_split': df.to_dict(orient='split')}
else:
    ds_dict = create_tf_serving_json(df)
data_json = json.dumps(ds_dict, allow_nan=True)

In [None]:
# POST the JSON body to the endpoint. The timeout keeps the cell from hanging
# indefinitely if the endpoint does not answer.
response = requests.request(method='POST', headers=headers, url=url, data=data_json, timeout=60)

In [None]:
# Raw response body returned by the endpoint.
response.text

'{"predictions": [5.3436443838585275, 5.517806023434063, 5.406008491177671]}'

## 06.02. curl alternative

In [None]:
%sh
# Query the serving endpoint with curl, sending three sample wines in
# "dataframe_split" form. Credentials come from the environment instead of
# being embedded in the command:
#   DATABRICKS_TOKEN     (required) access token; the cell aborts if it is unset
#   DATABRICKS_INSTANCE  (optional) workspace hostname, defaults to the host below
curl -X POST "https://${DATABRICKS_INSTANCE:-adb-5365071268183879.19.azuredatabricks.net}/serving-endpoints/Endponint_ElasticnetWineModel/invocations" \
-H "Authorization: Bearer ${DATABRICKS_TOKEN:?DATABRICKS_TOKEN is not set}" \
-H "Content-Type: application/json" \
-d '{
  "dataframe_split": {
    "columns": [
      "fixed acidity",
      "volatile acidity",
      "citric acid",
      "residual sugar",
      "chlorides",
      "free sulfur dioxide",
      "total sulfur dioxide",
      "density",
      "pH",
      "sulphates",
      "alcohol"
    ],
    "data": [
      [
        7.4,
        0.7,
        0.0,
        1.9,
        0.076,
        11.0,
        34.0,
        0.9978,
        3.51,
        0.56,
        9.4
      ],
      [
        6.7,
        0.8,
        0.12,
        2.0,
        0.064,
        11.0,
        22.0,
        0.9956,
        3.4,
        0.63,
        10.0
      ],
      [
        8.0,
        0.45,
        0.23,
        2.2,
        0.094,
        16.0,
        39.0,
        0.9972,
        3.29,
        0.54,
        9.5
      ]
    ]
  }
}'

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0100   891  100    75  100   816    370   4030 --:--:-- --:--:-- --:--:--  4410


{"predictions": [5.3436443838585275, 5.517806023434063, 5.406008491177671]}