# MLFlow Workshop

## 0. Data Preparation

In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

In [3]:
X, y = make_classification(n_samples=1000,
                           n_features=5,
                           n_informative=3,
                           n_classes=2,
                           random_state=42)

In [4]:
X[:5]

array([[-0.06529995, -0.7172141 ,  0.39395171, -0.93447324,  1.68151389],
       [ 0.56701461, -0.04460648,  1.61285062, -1.35017375,  2.48887766],
       [-0.24721549, -0.65056935, -0.74350032, -1.21418979,  0.84110979],
       [ 1.14587027,  0.9742245 ,  1.5625056 , -2.27701042,  2.27652083],
       [ 0.59960461, -0.4275453 ,  2.37447233, -1.50350966,  3.60495891]])

In [5]:
y[:5]

array([0, 0, 0, 1, 0])

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    shuffle=True,
                                                    random_state=42)

## 1. MLFlow Tracking

In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, f1_score

In [8]:
model = LogisticRegression().fit(X_train, y_train)

In [9]:
y_pred = model.predict(X_test)

In [10]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.91      0.94      0.92        94
           1       0.94      0.92      0.93       106

    accuracy                           0.93       200
   macro avg       0.92      0.93      0.92       200
weighted avg       0.93      0.93      0.93       200



### 1.1 Concepts
* <b>Code Version:</b> Git commit hash used.
* <b>Start & End Time:</b> Start & end time of the run.
* <b>Source:</b> Name of the file to launch the run, or the project name.
* <b>Parameters:</b> Key-value input parameters of your choice.
* <b>Metrics:</b> Key-value metrics, where the value is numeric.
* <b>Artifacts:</b> Output files in any format.

In [11]:
import mlflow

In [12]:
mlflow.set_tracking_uri("sqlite:///mlruns.db")
mlflow.set_experiment("classification-project")

2023/05/25 01:29:04 INFO mlflow.tracking.fluent: Experiment with name 'classification-project' does not exist. Creating a new experiment.


<Experiment: artifact_location='/home/alios/Desktop/b2metric/aktifbank-workshop/mlruns/1', creation_time=1684967344602, experiment_id='1', last_update_time=1684967344602, lifecycle_stage='active', name='classification-project', tags={}>

### 1.2 Experiments

In [13]:
# Experiment: 1

mlflow.sklearn.autolog()

with mlflow.start_run():

    # Developer Tag
    mlflow.set_tag("data-scientist", "aliosman")

    model = LogisticRegression().fit(X_train, y_train)
    y_pred = model.predict(X_test)
    score = f1_score(y_test, y_pred)

    # Metric Tag
    mlflow.log_metric("F1", score)

In [14]:
# Experiment: 2

mlflow.sklearn.autolog()

with mlflow.start_run():

    # Developer tag
    mlflow.set_tag("data-scientist", "aliosman")

    solver = "liblinear"
    penalty = "l1"
    C = 20

    mlflow.log_param("solver", solver)
    mlflow.log_param("penalty", penalty)
    mlflow.log_param("C", C)

    model = LogisticRegression(solver=solver, penalty=penalty, C=C).fit(X_train, y_train)
    y_pred = model.predict(X_test)
    score = f1_score(y_test, y_pred)

    # Metric Tag
    mlflow.log_metric("F1", score)

In [15]:
# Experiment: 3

from xgboost import XGBClassifier

mlflow.xgboost.autolog()

with mlflow.start_run():

    # Developer tag
    mlflow.set_tag("data-scientist", "aliosman")

    model = XGBClassifier().fit(X_train, y_train)
    y_pred = model.predict(X_test)
    score = f1_score(y_test, y_pred)

    # Metric Tag
    mlflow.log_metric("F1", score)

## 2. MLFlow Model Registry

### 2.1 Record

<img src="https://mlflow.org/docs/latest/_images/scenario_2.png" width="400" height="500">

### 2.2 Registry

In [16]:
from mlflow.models.signature import infer_signature

In [17]:
# Model: 1

mlflow.sklearn.autolog()

with mlflow.start_run():

    # Developer Tag
    mlflow.set_tag("data-scientist", "aliosman")

    model = LogisticRegression().fit(X_train, y_train)
    y_pred = model.predict(X_test)
    score = f1_score(y_test, y_pred)

    # Metric Log
    mlflow.log_metric("F1", score)

    signature = infer_signature(X_test, y_pred)

    # Model Log
    mlflow.sklearn.log_model(
        sk_model=model,
        artifact_path="sklearn-model",
        signature=signature,
        registered_model_name="sklearn-logistic-base-model",
    )

Successfully registered model 'sklearn-logistic-base-model'.
2023/05/25 01:29:28 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: sklearn-logistic-base-model, version 1
Created version '1' of model 'sklearn-logistic-base-model'.


In [18]:
# Model: 2

mlflow.xgboost.autolog()

with mlflow.start_run():

    # Developer Tag
    mlflow.set_tag("data-scientist", "aliosman")

    model = XGBClassifier().fit(X_train, y_train)
    y_pred = model.predict(X_test)
    score = f1_score(y_test, y_pred)

    # Metric Log
    mlflow.log_metric("F1", score)

    signature = infer_signature(X_test, y_pred)

    # Model Log
    mlflow.xgboost.log_model(
        xgb_model=model,
        artifact_path="xgboost-model",
        signature=signature,
        registered_model_name="xgboost-base-model",
    )

Successfully registered model 'xgboost-base-model'.
2023/05/25 01:29:33 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: xgboost-base-model, version 1
Created version '1' of model 'xgboost-base-model'.


## 3. MLFlow Models

### 3.1 Registered Models

In [19]:
from mlflow.tracking import MlflowClient

In [20]:
client = MlflowClient(tracking_uri="sqlite:///mlruns.db")

In [21]:
registered_model = client.get_registered_model("xgboost-base-model")
registered_model

<RegisteredModel: aliases={}, creation_timestamp=1684967373592, description=None, last_updated_timestamp=1684967373606, latest_versions=[<ModelVersion: aliases=[], creation_timestamp=1684967373606, current_stage='None', description=None, last_updated_timestamp=1684967373606, name='xgboost-base-model', run_id='876c74058428428abc925631a2b889fc', run_link=None, source='/home/alios/Desktop/b2metric/aktifbank-workshop/mlruns/1/876c74058428428abc925631a2b889fc/artifacts/xgboost-model', status='READY', status_message=None, tags={}, user_id=None, version=1>], name='xgboost-base-model', tags={}>

### 3.2 Load Model

In [22]:
from mlflow.xgboost import load_model

In [23]:
src_path = registered_model.latest_versions[0].source

In [24]:
model = load_model(src_path)

In [25]:
y_pred = model.predict(X_test)

In [26]:
f1_score(y_test, y_pred)

0.9537037037037037

In [27]:
from mlflow.sklearn import load_model

In [28]:
registered_model = client.get_registered_model("sklearn-logistic-base-model")
registered_model

<RegisteredModel: aliases={}, creation_timestamp=1684967368389, description=None, last_updated_timestamp=1684967368405, latest_versions=[<ModelVersion: aliases=[], creation_timestamp=1684967368405, current_stage='None', description=None, last_updated_timestamp=1684967368405, name='sklearn-logistic-base-model', run_id='793a666f0ccd4bc0a52bfb63ed802b35', run_link=None, source='/home/alios/Desktop/b2metric/aktifbank-workshop/mlruns/1/793a666f0ccd4bc0a52bfb63ed802b35/artifacts/sklearn-model', status='READY', status_message=None, tags={}, user_id=None, version=1>], name='sklearn-logistic-base-model', tags={}>

In [29]:
src_path = registered_model.latest_versions[0].source

In [30]:
model = load_model(src_path)

In [31]:
y_pred = model.predict(X_test)

In [32]:
f1_score(y_test, y_pred)

0.9282296650717704

## 4. MLFlow Projects

In [33]:
!tree

[01;34m.[00m
├── [01;34martifacts[00m
├── MLFlow Workshop.ipynb
├── [01;34mmlruns[00m
│   └── [01;34m1[00m
│       ├── [01;34m793a666f0ccd4bc0a52bfb63ed802b35[00m
│       │   └── [01;34martifacts[00m
│       │       ├── estimator.html
│       │       ├── [01;34mmodel[00m
│       │       │   ├── conda.yaml
│       │       │   ├── MLmodel
│       │       │   ├── model.pkl
│       │       │   ├── python_env.yaml
│       │       │   └── requirements.txt
│       │       ├── [01;34msklearn-model[00m
│       │       │   ├── conda.yaml
│       │       │   ├── MLmodel
│       │       │   ├── model.pkl
│       │       │   ├── python_env.yaml
│       │       │   └── requirements.txt
│       │       ├── [01;35mtraining_confusion_matrix.png[00m
│       │       ├── [01;35mtraining_precision_recall_curve.png[00m
│       │       └── [01;35mtraining_roc_curve.png[00m
│       ├── [01;34m876c74058428428abc925631a2b889fc[00m
│       │   └── [01;34martifacts[00m
│       │       ├─

In [34]:
src_path

'/home/alios/Desktop/b2metric/aktifbank-workshop/mlruns/1/793a666f0ccd4bc0a52bfb63ed802b35/artifacts/sklearn-model'

In [35]:
!conda env create -f /home/alios/Desktop/b2metric/aktifbank-workshop/mlruns/1/793a666f0ccd4bc0a52bfb63ed802b35/artifacts/sklearn-model/conda.yaml

Collecting package metadata (repodata.json): done
Solving environment: done


  current version: 22.9.0
  latest version: 23.3.1

Please update conda by running

    $ conda update -n base -c defaults conda


Preparing transaction: done
Verifying transaction: done
Executing transaction: done
Installing pip dependencies: - Ran pip subprocess with arguments:
['/home/alios/anaconda3/envs/mlflow-env/bin/python', '-m', 'pip', 'install', '-U', '-r', '/home/alios/Desktop/b2metric/aktifbank-workshop/mlruns/1/793a666f0ccd4bc0a52bfb63ed802b35/artifacts/sklearn-model/condaenv.lu_wairi.requirements.txt']
Pip subprocess output:
Collecting mlflow==2.3 (from -r /home/alios/Desktop/b2metric/aktifbank-workshop/mlruns/1/793a666f0ccd4bc0a52bfb63ed802b35/artifacts/sklearn-model/condaenv.lu_wairi.requirements.txt (line 1))
  Downloading mlflow-2.3.0-py3-none-any.whl (17.7 MB)
[2K     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 17.7/17.7 MB 8.6 MB/s eta 0:00:009
[?25hCollecting cloudpickle==2.2.1 (from -r /h