<a href="https://colab.research.google.com/github/Yingfu46/Yingfu46/blob/main/MLFlow_in_Colab_med_Onyxia_Exempel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1. Background

**Test MLflow on a classification example (Adult census income)**

In [1]:
!pip install mlflow

Collecting mlflow
  Downloading mlflow-2.10.2-py3-none-any.whl (19.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.5/19.5 MB[0m [31m47.2 MB/s[0m eta [36m0:00:00[0m
Collecting gitpython<4,>=2.1.0 (from mlflow)
  Downloading GitPython-3.1.42-py3-none-any.whl (195 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m195.4/195.4 kB[0m [31m22.6 MB/s[0m eta [36m0:00:00[0m
Collecting alembic!=1.10.0,<2 (from mlflow)
  Downloading alembic-1.13.1-py3-none-any.whl (233 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.4/233.4 kB[0m [31m27.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting docker<8,>=4.0.0 (from mlflow)
  Downloading docker-7.0.0-py3-none-any.whl (147 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m147.6/147.6 kB[0m [31m18.2 MB/s[0m eta [36m0:00:00[0m
Collecting querystring-parser<2 (from mlflow)
  Downloading querystring_parser-1.2.4-py2.py3-none-any.whl (7.9 kB)
Collecting gunicorn<

In [2]:
# Sanity check that the install put the `mlflow` CLI on PATH (prints its usage text).
!mlflow

Usage: mlflow [OPTIONS] COMMAND [ARGS]...

Options:
  --version  Show the version and exit.
  --help     Show this message and exit.

Commands:
  artifacts    Upload, list, and download artifacts from an MLflow...
  db           Commands for managing an MLflow tracking database.
  deployments  Deploy MLflow models to custom targets.
  doctor       Prints out useful information for debugging issues with MLflow.
  experiments  Manage experiments.
  gc           Permanently delete runs in the `deleted` lifecycle stage.
  models       Deploy MLflow models locally.
  recipes      Run MLflow Recipes and inspect recipe results.
  run          Run an MLflow project from the given URI.
  runs         Manage runs.
  sagemaker    Serve models on SageMaker.
  server       Run the MLflow tracking server.


In [3]:
# import other packages and set up figures
from datetime import datetime

# Wall-clock start of the run, captured before the heavier imports below.
starttime = datetime.now()

import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import seaborn as sns

from warnings import simplefilter
# Ignore warnings
simplefilter("ignore")

# Set Matplotlib defaults
# Matplotlib 3.6 renamed its bundled seaborn styles to "seaborn-v0_8-*" and
# later releases dropped the old names, so use whichever name this install
# provides (falling back to the default style if neither exists).
_WHITEGRID_STYLES = ("seaborn-v0_8-whitegrid", "seaborn-whitegrid")
plt.style.use(
    next((name for name in _WHITEGRID_STYLES if name in plt.style.available), "default")
)

plt.rc(
    "figure",
    autolayout=True,
    figsize=(11, 4),
    titlesize=18,
    titleweight='bold',
)
plt.rc(
    "axes",
    labelweight="bold",
    labelsize="large",
    titleweight="bold",
    titlesize=16,
    titlepad=10,
)
# Shared keyword arguments for line plots (light grey line, dark markers).
plot_params = dict(
    color="0.75",
    style=".-",
    markeredgecolor="0.25",
    markerfacecolor="0.25",
)

import os
from pprint import pprint
import json

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, LabelEncoder
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import joblib
import mlflow
import mlflow.sklearn
import mlflow.pyfunc
print("setup complete")


setup complete


In [4]:
# Single seed reused for the train/test split and the random forest so results are repeatable.
SEED = 0

In [5]:
# Location of the census CSV under the Colab Google Drive mount point; the same
# value is later logged to MLflow as the `data_url` param.
# NOTE(review): no Drive-mount step is visible in this notebook; presumably Drive
# was mounted manually before running this cell — confirm, otherwise read_csv
# fails on a fresh runtime.
DATA_URL = "/content/drive/MyDrive/Colab Notebooks/adult.csv"
df_census = pd.read_csv(DATA_URL)

In [6]:
# Show the last five rows as a quick check of the loaded columns and values.
df_census.tail()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
32556,22,Private,310152,Some-college,10,Never-married,Protective-serv,Not-in-family,White,Male,0,0,40,United-States,<=50K
32557,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32558,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32559,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32560,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [7]:
# Features: every column except the target.
X = df_census.drop(columns=["income"])

# Target: the "income" labels mapped to integer codes by a fitted LabelEncoder.
le = LabelEncoder()
y = le.fit_transform(df_census["income"].to_numpy())

In [8]:
# The encoded classes: a class's position in this array is the integer it was encoded to.
le.classes_

array(['<=50K', '>50K'], dtype=object)

In [9]:
# Hold out 20% of the rows for the final evaluation; the split is seeded for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=SEED,
)

In [10]:
# Inspect dtypes and non-null counts of the training features; the dtypes decide
# which columns each branch of the preprocessor picks up.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 26048 entries, 15282 to 2732
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             26048 non-null  int64 
 1   workclass       26048 non-null  object
 2   fnlwgt          26048 non-null  int64 
 3   education       26048 non-null  object
 4   education.num   26048 non-null  int64 
 5   marital.status  26048 non-null  object
 6   occupation      26048 non-null  object
 7   relationship    26048 non-null  object
 8   race            26048 non-null  object
 9   sex             26048 non-null  object
 10  capital.gain    26048 non-null  int64 
 11  capital.loss    26048 non-null  int64 
 12  hours.per.week  26048 non-null  int64 
 13  native.country  26048 non-null  object
dtypes: int64(6), object(8)
memory usage: 3.0+ MB


In [11]:
# Column groups are chosen by dtype at fit time rather than by name.
numerical_columns = make_column_selector(dtype_include=np.int64)
categorical_columns = make_column_selector(dtype_include=object)

# Numerical branch: fill NaN with the column median.
median_imputer = SimpleImputer(missing_values=np.nan, strategy="median")

# Categorical branch: fill NaN with the most frequent value, then map each
# category to an integer code; categories unseen during fit are encoded as -1.
mode_imputer = SimpleImputer(missing_values=np.nan, strategy="most_frequent")
ordinal_encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
categorical_transformer = make_pipeline(mode_imputer, ordinal_encoder)

# Columns matched by neither selector are passed through unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        ("numerical", median_imputer, numerical_columns),
        ("categorical", categorical_transformer, categorical_columns),
    ],
    remainder="passthrough",
)

In [12]:
# Smoke-test the preprocessor on the training split; the transformed array is
# only displayed here, not stored.
preprocessor.fit_transform(X_train)

array([[4.10000e+01, 2.08330e+05, 1.00000e+01, ..., 4.00000e+00,
        1.00000e+00, 3.90000e+01],
       [2.50000e+01, 1.91921e+05, 1.30000e+01, ..., 4.00000e+00,
        1.00000e+00, 3.90000e+01],
       [2.50000e+01, 1.80212e+05, 9.00000e+00, ..., 2.00000e+00,
        0.00000e+00, 3.90000e+01],
       ...,
       [2.60000e+01, 5.19610e+04, 8.00000e+00, ..., 2.00000e+00,
        1.00000e+00, 3.90000e+01],
       [4.40000e+01, 1.15323e+05, 1.40000e+01, ..., 4.00000e+00,
        1.00000e+00, 3.90000e+01],
       [3.90000e+01, 2.24531e+05, 9.00000e+00, ..., 4.00000e+00,
        1.00000e+00, 3.90000e+01]])

In [13]:
# Seeded random forest, chained after the preprocessing step so that both are
# fitted together on whatever data the pipeline is given.
rf_clf = RandomForestClassifier(random_state=SEED)

pipe_rf = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("classifier", rf_clf),
    ]
)

In [14]:
# Hyperparameter grid for the "classifier" pipeline step: 3 x 3 = 9 candidates.
param_grid = dict(
    classifier__n_estimators=[50, 100, 200],
    classifier__max_leaf_nodes=[5, 10, 50],
)

# Score every candidate on four metrics with 5-fold cross-validation; the final
# model is refit using the candidate with the best F1.
pipe_gscv = GridSearchCV(
    estimator=pipe_rf,
    param_grid=param_grid,
    scoring=["accuracy", "precision", "recall", "f1"],
    refit="f1",
    cv=5,
    n_jobs=5,
    verbose=1,
)

In [15]:
# Run the grid search on the training split: 9 candidates x 5 folds = 45 fits,
# followed by a refit of the best candidate (selected by F1).
pipe_gscv.fit(X_train, y_train)

Fitting 5 folds for each of 9 candidates, totalling 45 fits


In [16]:
# Tabulate the per-candidate CV results (timings, per-split and aggregate scores, ranks).
gscv_results = pd.DataFrame(pipe_gscv.cv_results_)
gscv_results.tail()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_classifier__max_leaf_nodes,param_classifier__n_estimators,params,split0_test_accuracy,split1_test_accuracy,split2_test_accuracy,...,std_test_recall,rank_test_recall,split0_test_f1,split1_test_f1,split2_test_f1,split3_test_f1,split4_test_f1,mean_test_f1,std_test_f1,rank_test_f1
4,6.217181,0.233845,0.392686,0.04993,10,100,"{'classifier__max_leaf_nodes': 10, 'classifier...",0.844146,0.851248,0.84856,...,0.01866,5,0.60697,0.624333,0.623389,0.586414,0.623188,0.612859,0.014719,5
5,13.377493,0.254174,0.603909,0.064164,10,200,"{'classifier__max_leaf_nodes': 10, 'classifier...",0.843186,0.850288,0.847985,...,0.019307,6,0.600879,0.621359,0.621777,0.583166,0.619951,0.609427,0.015286,6
6,5.521259,0.825141,0.492204,0.033797,50,50,"{'classifier__max_leaf_nodes': 50, 'classifier...",0.84952,0.860461,0.86142,...,0.026698,3,0.626667,0.660756,0.663873,0.612683,0.660739,0.644944,0.021131,3
7,8.493214,0.86941,0.297888,0.065558,50,100,"{'classifier__max_leaf_nodes': 50, 'classifier...",0.850288,0.859885,0.863532,...,0.026958,1,0.627863,0.660149,0.668531,0.615684,0.664486,0.647343,0.021394,1
8,12.663201,0.227063,0.307338,0.134824,50,200,"{'classifier__max_leaf_nodes': 50, 'classifier...",0.85048,0.858541,0.862572,...,0.028118,2,0.628517,0.65673,0.665421,0.613203,0.670084,0.646791,0.022138,2


In [17]:
# Keep a handle on the refit winning pipeline, then report its hyperparameters.
best_model = pipe_gscv.best_estimator_

print(pipe_gscv.best_params_)

{'classifier__max_leaf_nodes': 50, 'classifier__n_estimators': 100}


In [18]:
# Score the refit best pipeline on the held-out split.
y_test_pred = best_model.predict(X_test)
f1_test = f1_score(y_true=y_test, y_pred=y_test_pred)

print(f"Final F1-score on test data : {f1_test}")

Final F1-score on test data : 0.6310452418096724


In [19]:
# Persist the fitted grid-search object (CV results plus the refit best pipeline).
# exist_ok=True makes the directory creation idempotent and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs("models/", exist_ok=True)
joblib.dump(pipe_gscv, 'models/pipeline_train_model_20230118.joblib')

['models/pipeline_train_model_20230118.joblib']

In [20]:
import mlflow
import mlflow.sklearn

In [21]:
# Automatic discovery: MLFLOW_TRACKING_URI is set when MLflow was launched before
# Jupyter/VSCode. Print it if present, otherwise print a hint that the URI must
# be supplied by hand.
print(
    os.environ.get(
        "MLFLOW_TRACKING_URI",
        "MLflow was not automatically discovered, a tracking URI must be provided manually.",
    )
)

MLflow was not automatically discovered, a tracking URI must be provided manually.


In [22]:
def log_gsvc_to_mlflow(gscv, mlflow_experiment_name, data_url=None):
    """Log a scikit-learn trained GridSearchCV object as an MLflow experiment.

    One MLflow run named ``"run <index>"`` is created per hyperparameter
    combination in ``gscv.cv_results_["params"]``. Each run records:

    * the combination's hyperparameters and the training data location
      (param ``data_url``);
    * every ``cv_results_`` entry whose key contains ``mean_test`` or
      ``std_test``, taken at that combination's index, as metrics;
    * the ``gscv`` object itself as a scikit-learn model artifact under
      ``"gscv_model"``.

    Parameters
    ----------
    gscv
        Fitted search object exposing ``cv_results_``.
    mlflow_experiment_name : str
        Experiment to log into; passed to ``mlflow.set_experiment``.
    data_url : str, optional
        Training data location to record. Defaults to the notebook-level
        ``DATA_URL`` so existing two-argument calls behave as before.
    """
    if data_url is None:
        data_url = DATA_URL

    # Set up MLFlow context
    mlflow.set_experiment(experiment_name=mlflow_experiment_name)

    cv_results = gscv.cv_results_
    # The set of score columns is the same for every run, so compute it once.
    score_keys = [key for key in cv_results if "mean_test" in key or "std_test" in key]

    for run_idx, params in enumerate(cv_results["params"]):
        with mlflow.start_run(run_name=f"run {run_idx}"):
            # Batch calls: one request for all params, one for all metrics.
            mlflow.log_params({**params, "data_url": data_url})
            mlflow.log_metrics(
                {key: float(cv_results[key][run_idx]) for key in score_keys}
            )

            # NOTE: the same full search object is attached to every run, as in
            # the original implementation; it is not a per-candidate model.
            mlflow.sklearn.log_model(gscv, "gscv_model")


In [23]:
# Log each grid-search candidate as a run of the "yx-mlflow-1" experiment
# (MLflow creates the experiment if it does not exist yet).
log_gsvc_to_mlflow(gscv=pipe_gscv, mlflow_experiment_name="yx-mlflow-1")

2024/02/21 14:50:00 INFO mlflow.tracking.fluent: Experiment with name 'yx-mlflow-1' does not exist. Creating a new experiment.


In [24]:
!pip install pyngrok

Collecting pyngrok
  Downloading pyngrok-7.1.2-py3-none-any.whl (22 kB)
Installing collected packages: pyngrok
Successfully installed pyngrok-7.1.2


In [25]:
from pyngrok import ngrok

In [26]:
# Stop any ngrok process left over from an earlier run so the tunnel opened
# below starts from a clean state.
ngrok.kill()

In [27]:
from getpass import getpass

# The ngrok auth token is a credential: read it from the NGROK_AUTH_TOKEN
# environment variable, or prompt for it, instead of storing it in the notebook.
# NOTE: the token that used to be hard-coded in this cell was saved with the
# notebook and should be treated as leaked — revoke it in the ngrok dashboard.
ngrok_auth_token = os.environ.get("NGROK_AUTH_TOKEN") or getpass("ngrok auth token: ")
ngrok.set_auth_token(ngrok_auth_token)



In [28]:
# Open a public HTTPS tunnel to local port 5000, the port the MLflow UI started
# in the next cell listens on, and print the public URL to open in a browser.
ngrok_tunnel = ngrok.connect(addr = "5000", proto="http", bind_tls=True)
print(ngrok_tunnel.public_url)


https://8e1c-35-197-23-217.ngrok-free.app


In [None]:
# Start the MLflow UI (its log shows it listening on 127.0.0.1:5000).
# The command stays in the foreground, so this cell keeps running until it is
# interrupted.
!mlflow ui


[2024-02-21 14:50:33 +0000] [1995] [INFO] Starting gunicorn 21.2.0
[2024-02-21 14:50:33 +0000] [1995] [INFO] Listening at: http://127.0.0.1:5000 (1995)
[2024-02-21 14:50:33 +0000] [1995] [INFO] Using worker: sync
[2024-02-21 14:50:33 +0000] [1996] [INFO] Booting worker with pid: 1996
[2024-02-21 14:50:33 +0000] [1997] [INFO] Booting worker with pid: 1997
[2024-02-21 14:50:34 +0000] [1998] [INFO] Booting worker with pid: 1998
[2024-02-21 14:50:34 +0000] [1999] [INFO] Booting worker with pid: 1999
