In [6]:
import os
import warnings
import sys

import pandas as pd
import numpy as np  

from sklearn.metrics import mean_squared_error,mean_absolute_error,r2_score
from sklearn.model_selection import train_test_split

import mlflow
import warnings

warnings.simplefilter("ignore")


In [7]:
data_path = 'data/winequality-red.csv'
df = pd.read_csv(data_path)
print(df.shape)
df.head(3)

(1599, 12)


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5


### Tracking Experiments

MLflow runs can be recorded to local files, to a SQLAlchemy compatible database, or remotely to a tracking server. ML Flow supports 2 types of tracking stores: file store and database store

* Local file path where data is stored locally.
* Database like mysql,sqlite,postgres
* remote tracking server(https://my-server:5000)
* Databricks works space.

Two main components of storage:

* Backend store storing entities(runs, parameters, metrics, tags, notes, metadata, etc).
* Artifact store storing artifacts(files, models, images, in-memory objects, or model summary, etc).ie, basically it store whatever is a file. Some examples of artifact store include amazon s3, azure blob storage,nfs,ftp server etc.




In [8]:
#mlflow server --backend-store-uri mlruns/ --default-artifact-root mlruns/ --host 127.0.0.1 --port 5000
#mlflow server --backend-store-uri sqlite:///mlflow.db --default-artifact-root mlruns/ --host 127.0.0.1 --port 5000
# (if we need model registry we need to save backend store in sql db)


remote_server_uri = "http://127.0.0.1:5000"
backend_store_uri = "sqlite:///mlflow.db"
mlflow.set_registry_uri(remote_server_uri)
mlflow.set_tracking_uri(backend_store_uri)
mlflow.tracking.get_registry_uri(),mlflow.tracking.get_tracking_uri()

('http://127.0.0.1:5000', 'sqlite:///mlflow.db')

What do we track?

* code version:  git commit hash used for the run
* start and end time: Start and end time of the run
* Source: What code run?
* parameters: key value paramteres
* metrics: metrics of model. key-value pairs
* Artifacts: output files in any format


There are two ways to login.
* Manual logging
* Automatic logging

<h4>1. Manual Logging</h4>

Here we will use manual login first. for that we can use following-

* with mlflow.start_run(run_name=run_name) as run ->  start a run
* run.info -> provides details like run id (run.info.run_uuid), experiment id(run.info.experiment_id) etc
* mlflow.log_params({Key1:value1,key2:value2}) -> log parameters
* mlflow.log_metrics({Key1:value1,key2:value2}) -> log metrics
* mlflow.log_artifact(data_path) -> saves data used for training
* mlflow.sklearn.log_model(model, "model") -> saves model


In [9]:
# set experiment name
EXP_NAME = 'Wine_quality_Random_Forest'
mlflow.set_experiment(EXP_NAME)

INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
2022/05/09 16:07:25 INFO mlflow.tracking.fluent: Experiment with name 'Wine_quality_Random_Forest' does not exist. Creating a new experiment.


<Experiment: artifact_location='./mlruns/1', experiment_id='1', lifecycle_stage='active', name='Wine_quality_Random_Forest', tags={}>

In [10]:
from sklearn.ensemble import RandomForestRegressor


def eval_metric(actual,pred):
    #compute relevant metrics
    rmse = np.sqrt(mean_squared_error(actual,pred))
    mae = mean_absolute_error(actual,pred)
    r2 = r2_score(actual,pred)
    return rmse,mae,r2  

def load_data(data_path ='data/winequality-red.csv'):
    data = pd.read_csv(data_path)
    X = data.drop(["quality"], axis=1)
    y = data['quality']
    X_train,X_test,y_train, y_test =  train_test_split(X,y,test_size=0.25,random_state=42)
    return X_train,y_train,X_test,y_test


def train_model(no_estimators,max_depth,run_name="best"):
    np.random.seed(40)
    X_train,y_train,X_test,y_test = load_data()
    with mlflow.start_run(run_name=run_name) as run:
        print(f'Run id: {run.info.run_uuid}')
        print(f'Run name: {run_name}')
        print(f'Exp id: {run.info.experiment_id}')
        model = RandomForestRegressor(n_estimators=no_estimators,max_depth=max_depth)
        model.fit(X_train,y_train)
        y_train_pred = model.predict(X_train)
        y_test_pred = model.predict(X_test)

        train_metrics = eval_metric(y_train,y_train_pred)
        test_metrics = eval_metric(y_test,y_test_pred) 

        mlflow.log_params({"n_estimators":no_estimators,"max_depth":max_depth})
        mlflow.log_metrics({"train_rmse":train_metrics[0],"train_mae": train_metrics[1], "r2": train_metrics[2]})
        mlflow.log_metrics({"test_rmse":test_metrics[0],"test_mae": test_metrics[1], "r2": test_metrics[2]})
        mlflow.log_artifact(data_path)
        print("Save to: {}".format(mlflow.get_artifact_uri()))
        mlflow.sklearn.log_model(model, "model")
        
        # Print out metrics
        print(f"Random Forest model:")  
        print(f"n_estimators:{no_estimators}, max_depth:{max_depth}")
        print("  Train RMSE: %s" % train_metrics[0])
        print("  Train MAE: %s" % train_metrics[1])
        print("  Train R2: %s" % train_metrics[2])
        print('--'*50)
        print("  Test RMSE: %s" % test_metrics[0])
        print("  Test MAE: %s" % test_metrics[1])
        print("  Test R2: %s" % test_metrics[2])


In [11]:
# for i,par in enumerate([(100,5),(200,5),(200,7),(300,5),(300,7),(300,3)]):
for i,par in enumerate([(100,5),(300,7),(300,3)]):
    n_estim,max_depth = par[0],par[1]
    run_name = 'RF_'+str(i)
    train_model(no_estimators=n_estim,max_depth=max_depth,run_name=run_name)
    print('__'*70)

Run id: ed7620729b374017baf95b35a74a3180
Run name: RF_0
Exp id: 1
Save to: ./mlruns/1/ed7620729b374017baf95b35a74a3180/artifacts
Random Forest model:
n_estimators:100, max_depth:5
  Train RMSE: 0.5435386929189447
  Train MAE: 0.4231440290642347
  Train R2: 0.5540704718491849
----------------------------------------------------------------------------------------------------
  Test RMSE: 0.6112396588832881
  Test MAE: 0.48762102125993073
  Test R2: 0.39602700384257294
____________________________________________________________________________________________________________________________________________
Run id: 9f119d19982d473fa77783a8fa84dfa7
Run name: RF_1
Exp id: 1
Save to: ./mlruns/1/9f119d19982d473fa77783a8fa84dfa7/artifacts
Random Forest model:
n_estimators:300, max_depth:7
  Train RMSE: 0.4473604176962957
  Train MAE: 0.34690298225856253
  Train R2: 0.6979210868093256
----------------------------------------------------------------------------------------------------
  Test RM

```
Note:

Always its important to note that we need to excute the command to run mlflow with backend store and artifact root in correct location.

```

<h4>2. Automatic Logging</h4>

Let see how automatic logging works with sklearn.

- Saves Parameters obtained by estimator.get_params and scores obtained by estimator.get_score .(so we have limited use cases and its better to use manual logging)

In [12]:
#let us create a new experiment in a different way using mlflow client api

# Create an experiment with a name that is unique and case sensitive.

from mlflow.tracking import MlflowClient
from sklearn.linear_model import LinearRegression

client = MlflowClient()
EXP_NAME = 'Wine_quality_Decision_Tree'
experiment_id = client.create_experiment(EXP_NAME)
# here we will also set experiment tag
client.set_experiment_tag(experiment_id, "sklearn.framework", "LR")

# Fetch experiment metadata information
experiment = client.get_experiment(experiment_id)
print("Name: {}".format(experiment.name))
print("Experiment_id: {}".format(experiment.experiment_id))
print("Artifact Location: {}".format(experiment.artifact_location))
print("Tags: {}".format(experiment.tags))
print("Lifecycle_stage: {}".format(experiment.lifecycle_stage))


Name: Wine_quality_Decision_Tree
Experiment_id: 2
Artifact Location: ./mlruns/2
Tags: {'sklearn.framework': 'LR'}
Lifecycle_stage: active


In [13]:
from sklearn.tree import DecisionTreeRegressor

def load_data(data_path ='data/winequality-red.csv'):
    data = pd.read_csv(data_path)
    X = data.drop(["quality"], axis=1)
    y = data['quality']
    X_train,X_test,y_train, y_test =  train_test_split(X,y,test_size=0.25,random_state=42)
    return X_train,y_train,X_test,y_test
    

    
def fetch_logged_data(run_id):
    client = mlflow.tracking.MlflowClient()
    data = client.get_run(run_id).data
    # tags = {k: v for k, v in data.tags.items() if not k.startswith("mlflow.")}
    artifacts = [f.path for f in client.list_artifacts(run_id, "model")]
    print("run_id: {}".format(run_id))
    print("artifacts: {}".format(artifacts))
    print("params: {}".format(data.params))
    print("metrics: {}".format(data.metrics))


def train_model(max_depth,run_name="best",exp_name=EXP_NAME):
    np.random.seed(40)
    X_train,y_train,X_test,y_test = load_data()
    mlflow.sklearn.autolog()
    experiment = mlflow.get_experiment_by_name(EXP_NAME)
    with mlflow.start_run(experiment_id=experiment.experiment_id,run_name=run_name) as run:
        print(f'Exp id: {run.info.experiment_id}')
        print(f'Run name: {run_name}')
        run_id = run.info.run_uuid
        model = DecisionTreeRegressor(max_depth=max_depth)
        model.fit(X_train,y_train)
        mlflow.sklearn.eval_and_log_metrics(model, X_test, y_test, prefix="test_")
        print(run_id)
        fetch_logged_data(run_id)
        
  


In [14]:
for i,max_depth in enumerate([3,5,10,12,15]):
    run_name = 'DT_'+str(i)
    train_model(max_depth=max_depth,run_name=run_name)
    print('__'*70)

Exp id: 2
Run name: DT_0
a4cca023adf6435cb9b5e87948f42d22
run_id: a4cca023adf6435cb9b5e87948f42d22
artifacts: ['model/MLmodel', 'model/conda.yaml', 'model/model.pkl', 'model/requirements.txt']
params: {'ccp_alpha': '0.0', 'criterion': 'squared_error', 'max_depth': '3', 'max_features': 'None', 'max_leaf_nodes': 'None', 'min_impurity_decrease': '0.0', 'min_samples_leaf': '1', 'min_samples_split': '2', 'min_weight_fraction_leaf': '0.0', 'random_state': 'None', 'splitter': 'best'}
metrics: {'training_mse': 0.4254849819252294, 'training_mae': 0.5010648728230344, 'training_r2_score': 0.357771557489063, 'training_rmse': 0.6522920986224112, 'training_score': 0.357771557489063, 'test_mse': 0.4592138510123652, 'test_mae': 0.5384902859594534, 'test_r2_score': 0.2576487379441431, 'test_rmse': 0.6776531937594371, 'test_score': 0.2576487379441431}
____________________________________________________________________________________________________________________________________________
Exp id: 2
Run


<h4>More Operations</h4>
<h5>Fetching details regarding experiments</h5>

In [15]:
#list all experiments
from  mlflow.tracking import MlflowClient
client = MlflowClient()
experiments = client.list_experiments()
experiments

[<Experiment: artifact_location='mlruns/0', experiment_id='0', lifecycle_stage='active', name='Default', tags={}>,
 <Experiment: artifact_location='./mlruns/1', experiment_id='1', lifecycle_stage='active', name='Wine_quality_Random_Forest', tags={}>,
 <Experiment: artifact_location='./mlruns/2', experiment_id='2', lifecycle_stage='active', name='Wine_quality_Decision_Tree', tags={'sklearn.framework': 'LR'}>]

In [16]:

# Fetch experiment metadata information
experiment_id = str(1)
experiment = client.get_experiment(experiment_id)
print("Name: {}".format(experiment.name))
print("Experiment_id: {}".format(experiment.experiment_id))
print("Artifact Location: {}".format(experiment.artifact_location))
print("Tags: {}".format(experiment.tags))
print("Lifecycle_stage: {}".format(experiment.lifecycle_stage))

Name: Wine_quality_Random_Forest
Experiment_id: 1
Artifact Location: ./mlruns/1
Tags: {}
Lifecycle_stage: active


<h5>Fetch all run id informations of an experiment</h5>

In [17]:
experiment_name = "Wine_quality_Random_Forest"

# get exp id by name
exp_details = dict(mlflow.get_experiment_by_name(experiment_name))
exp_id = exp_details['experiment_id']
df = mlflow.search_runs([exp_id])
df

Unnamed: 0,run_id,experiment_id,status,artifact_uri,start_time,end_time,metrics.train_rmse,metrics.train_mae,metrics.test_mae,metrics.r2,metrics.test_rmse,params.n_estimators,params.max_depth,tags.mlflow.user,tags.mlflow.log-model.history,tags.mlflow.runName,tags.mlflow.source.name,tags.mlflow.source.type
0,ae913b31a8304ae2800a34f48746097f,1,FINISHED,./mlruns/1/ae913b31a8304ae2800a34f48746097f/ar...,2022-05-09 10:38:23.221000+00:00,2022-05-09 10:38:31.985000+00:00,0.62519,0.486478,0.515765,0.336691,0.640561,300,3,Arun Mohan,"[{""run_id"": ""ae913b31a8304ae2800a34f48746097f""...",RF_2,c:\Users\Arun Mohan\.conda\envs\mlflowenv\lib\...,LOCAL
1,9f119d19982d473fa77783a8fa84dfa7,1,FINISHED,./mlruns/1/9f119d19982d473fa77783a8fa84dfa7/ar...,2022-05-09 10:38:08.469000+00:00,2022-05-09 10:38:23.168000+00:00,0.44736,0.346903,0.465243,0.439982,0.588578,300,7,Arun Mohan,"[{""run_id"": ""9f119d19982d473fa77783a8fa84dfa7""...",RF_1,c:\Users\Arun Mohan\.conda\envs\mlflowenv\lib\...,LOCAL
2,ed7620729b374017baf95b35a74a3180,1,FINISHED,./mlruns/1/ed7620729b374017baf95b35a74a3180/ar...,2022-05-09 10:37:56.759000+00:00,2022-05-09 10:38:08.365000+00:00,0.543539,0.423144,0.487621,0.396027,0.61124,100,5,Arun Mohan,"[{""run_id"": ""ed7620729b374017baf95b35a74a3180""...",RF_0,c:\Users\Arun Mohan\.conda\envs\mlflowenv\lib\...,LOCAL


<h5>Fetch information of a run by run id</h5>

In [18]:
run_id = '663e342cb43745efbab3b00fdb744d20'
client = mlflow.tracking.MlflowClient()
data = client.get_run(run_id).data
data

MlflowException: Run with id=663e342cb43745efbab3b00fdb744d20 not found

<h5>Filter Runs based on metrics</h5>

<img src="../tmp/ui1.PNG">

 

<img src="../tmp/ui2.PNG">