In [5]:
# Start a wall-clock timer for the notebook; the final timing cell subtracts
# this value from its own perf_counter() reading to report the total run time.
# NOTE(review): the reported duration only covers the whole notebook when the
# cells are executed in order (Restart Kernel -> Run All).
import time

start_time = time.perf_counter()

# Step 2: Preprocessing & Classification model
This section loads the predefined dataset settings from the pickles directory, runs the machine learning pipeline with the help of the `pycaret` library, and saves the respective results.

In [3]:
%load_ext jupyternotify
# importing all packages needed in this section
import pandas as pd
import os
import sys 

from sklearn.metrics import classification_report
from pycaret.classification import *
from pycaret.containers.models.classification import get_all_model_containers

# utility functions for the experiment
sys.path.append('../src')

from mlflow_manager import MLFlowManager
from tuning_grids import Grids
from utils import getPicklesFromDir, getExperimentConfig, run_pycaret_setup, translate_model_name

# Get global experiment settings
config = getExperimentConfig()
# directory paths used by the cells below (e.g. 'settings_dir', 'real_dir')
folders = config['folders']
# get a list of all settings for the datasets prepared beforehand;
# each entry is read below through its 'meta' and 'setup_param' keys
dataset_settings = getPicklesFromDir(folders['settings_dir'])  

<IPython.core.display.Javascript object>

In [None]:
%notify

### Testing that the provided hyperparameters work with pycaret and the system
settings = dataset_settings[0]
dataset_path = f"{folders['real_dir']}{settings['meta']['filename']}"
s = run_pycaret_setup(dataset_path, settings['setup_param'])
for ml_model in ['rf', 'gbc', 'mlp']: #config['clf']['ml_models']:
    # create & tune model
    #model = s.create_model(ml_model)
    #Quickfix for efficiency
    all_models = get_all_model_containers(s)
    model = all_models[ml_model].class_def()

    tune_grid = Grids.get_tuning_grid(ml_model)
    tuned_model = s.tune_model(model, 
                               **config['clf']['tuning_param'], 
                               custom_grid=tune_grid
                              )

<IPython.core.display.Javascript object>

Unnamed: 0,Description,Value
0,Session id,8285
1,Target,Outcome
2,Target type,Binary
3,Original data shape,"(768, 9)"
4,Transformed data shape,"(768, 9)"
5,Transformed train set shape,"(614, 9)"
6,Transformed test set shape,"(154, 9)"
7,Numeric features,8
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8065,0.9068,0.6818,0.75,0.7143,0.5684,0.5699
1,0.7097,0.7545,0.5,0.6111,0.55,0.3389,0.3426
2,0.7581,0.8409,0.5455,0.7059,0.6154,0.4431,0.451
3,0.629,0.733,0.2727,0.4615,0.3429,0.1076,0.1149
4,0.8033,0.8452,0.8095,0.68,0.7391,0.5831,0.5889
5,0.8197,0.8571,0.619,0.8125,0.7027,0.5767,0.5876
6,0.8852,0.9536,0.8095,0.85,0.8293,0.7429,0.7434
7,0.6721,0.7143,0.381,0.5333,0.4444,0.2209,0.2272
8,0.7213,0.8155,0.6667,0.5833,0.6222,0.403,0.4052
9,0.7541,0.8333,0.619,0.65,0.6341,0.4491,0.4494


Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits


Unnamed: 0,Description,Value
0,Session id,4425
1,Target,Outcome
2,Target type,Binary
3,Original data shape,"(768, 9)"
4,Transformed data shape,"(768, 9)"
5,Transformed train set shape,"(614, 9)"
6,Transformed test set shape,"(154, 9)"
7,Numeric features,8
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.7581,0.8273,0.7727,0.6296,0.6939,0.4973,0.5044
1,0.7903,0.8364,0.6364,0.7368,0.6829,0.5275,0.5307
2,0.7742,0.8409,0.5,0.7857,0.6111,0.4629,0.4863
3,0.8065,0.9159,0.5909,0.8125,0.6842,0.5496,0.5641
4,0.8197,0.8488,0.6667,0.7778,0.7179,0.5866,0.5903
5,0.7705,0.8536,0.5238,0.7333,0.6111,0.4547,0.4676
6,0.7377,0.8298,0.4286,0.6923,0.5294,0.3613,0.3812
7,0.7213,0.7952,0.7143,0.5769,0.6383,0.4158,0.4221
8,0.8033,0.8726,0.5714,0.8,0.6667,0.5326,0.5477
9,0.7049,0.7952,0.5714,0.5714,0.5714,0.3464,0.3464


Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits


Unnamed: 0,Description,Value
0,Session id,5813
1,Target,Outcome
2,Target type,Binary
3,Original data shape,"(768, 9)"
4,Transformed data shape,"(768, 9)"
5,Transformed train set shape,"(614, 9)"
6,Transformed test set shape,"(154, 9)"
7,Numeric features,8
8,Preprocess,True
9,Imputation type,simple


Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits


Each `dataset_settings` pickle is saved with the following structure (the code in this notebook reads the dataset information through the `"meta"` key):
```
"meta": meta_dataset,       # contains information about the dataset, including path
"setup_param": setup_param, # contains all the setup parameters for pycaret setup() function
"sdg_param": sdg_param,     # contains all sdg parameters for the CTGAN() function

```

In [None]:
# Main experiment loop: for every prepared dataset, set up pycaret, tune each
# configured model, log the results of every model as a nested MLflow run and
# finally copy the details of the most accurate model onto the parent run.
run_dataset = config['run_dataset']

for settings in dataset_settings:

    if run_dataset is not None and settings['meta']['id'] not in run_dataset:
        # run_dataset holds dataset ids: when it is set, the experiment
        # is run only on the specified datasets
        continue

    # get path
    dataset_path = f"{folders['real_dir']}{settings['meta']['filename']}"
    # run setup function
    s = run_pycaret_setup(dataset_path, settings['setup_param'])

    USI = s.get_config('USI')

    # Quickfix for efficiency: estimators are instantiated from their containers
    # instead of s.create_model(). The lookup only depends on `s`, so it is done
    # once per dataset rather than once per model.
    all_models = get_all_model_containers(s)

    # Init experiment logging
    experiment_name = f"{settings['meta']['id']}-{settings['meta']['name']}"
    mlflow = MLFlowManager(experiment_name)

    logg_tags = {
        'USI': USI,
        'Dataset ID': settings['meta']['id'],
        'Dataset Type': 'original'
    }

    mlflow.start_run("Original data models", tags=logg_tags)
    try:
        # for each defined model in the global config
        # create specified model and tune it
        for ml_model in config['clf']['ml_models']:

            model_name = f"{settings['meta']['id']}-{translate_model_name(ml_model)}"
            # per-model copy so the tags of the parent run are not mutated
            model_tags = {**logg_tags, 'model': ml_model}

            mlflow.start_run(model_name, tags=model_tags, nested=True)
            try:
                # create & tune model
                model = all_models[ml_model].class_def()
                tune_grid = Grids.get_tuning_grid(ml_model)
                tuned_model = s.tune_model(model, **config['clf']['tuning_param'], custom_grid=tune_grid)

                # get validation results (score grid produced by tune_model)
                val_df = s.pull()
                val_score = {
                    'Accuracy': val_df['Accuracy']['Mean'],
                    'F1-score': val_df['F1']['Mean'],
                    'AUC':      val_df['AUC']['Mean'],
                    'Kappa':    val_df['Kappa']['Mean'],
                    'MCC':      val_df['MCC']['Mean'],
                }

                # test the model on the holdout-data; predict_model returns the
                # predictions, so the holdout score grid is fetched with pull(),
                # the same way the validation grid is fetched above
                s.predict_model(estimator=tuned_model)
                holdout_score = s.pull()

                # log parameters
                mlflow.log_params(tuned_model.get_params())
                # log performance
                mlflow.log_metrics(val_score)
                mlflow.log_score_report_to_html(val_df, "Validation")
                mlflow.log_score_report_to_html(holdout_score, "Holdout")
                # log model
                mlflow.log_model(model=tuned_model)
            finally:
                # always close the nested run, even when tuning or logging fails,
                # so that the next model does not start inside a stale run
                mlflow.end_run()

        # Save model details on the model with best accuracy under the 'Original data models' run
        best_run = mlflow.get_best_run_by_metric(metric_name='Accuracy')

        mlflow.log_params(best_run.data.params)
        mlflow.log_metrics(best_run.data.metrics)
        mlflow.log_tag('model run name', best_run.data.tags['mlflow.runName'])
        mlflow.log_tag('model', best_run.data.tags['model'])
        mlflow.log_tag('model run id', best_run.info.run_id)
    finally:
        # always close the parent run of this dataset
        mlflow.end_run()

Unnamed: 0,Description,Value
0,Session id,892
1,Target,Outcome
2,Target type,Binary
3,Original data shape,"(768, 9)"
4,Transformed data shape,"(768, 9)"
5,Transformed train set shape,"(614, 9)"
6,Transformed test set shape,"(154, 9)"
7,Numeric features,8
8,Preprocess,True
9,Imputation type,simple


Processing:   0%|          | 0/4 [00:00<?, ?it/s]

In [6]:
# Stop the timer started in the first cell and report the elapsed wall-clock
# time, once in whole seconds and once in minutes with one decimal.
end_time = time.perf_counter()
elapsed_seconds = end_time - start_time

whole_seconds = int(round(elapsed_seconds, 0))
minutes = round(elapsed_seconds / 60, 1)

print(f"Time to run the whole notebook: {whole_seconds} seconds")
print(f"Time to run the whole notebook: {minutes} minutes")

Time to run the whole notebook: 7 seconds
Time to run the whole notebook: 0.1 minutes


In [None]:

# Scratch inspection of the best run left over from the experiment loop.
# A cell only renders its last expression, so the run name is displayed
# explicitly; otherwise its value would be computed and silently dropped.
display(best_run.data.tags['mlflow.runName'])
# NOTE(review): the experiment loop passes metric_name='Accuracy' to this
# method; whether the no-argument call selects the same metric is not visible
# here - confirm against MLFlowManager.
mlflow.get_best_run_by_metric()

In [None]:
# Tag the current run with the identity of the best model run.
# NOTE(review): the experiment loop logs similar information under the keys
# 'model run name' / 'model run id' and ends its run before this cell executes;
# confirm which run receives these tags and which key names are intended.
best_run_name = mlflow.get_run_name(best_run)
mlflow.log_tag('Best model run name', best_run_name)

best_model_tag = mlflow.get_model_tag(best_run)
mlflow.log_tag('model', best_model_tag)

best_run_id = best_run.info.run_id
mlflow.log_tag('Best model run id', best_run_id)

---

### Notice 
The following cells, until the end of the section (i.e. section 3.0), contain experimental code that will not be run.

In [None]:
# Experimental cell: request the leaderboard from the pycaret experiment `s`
# that is left over from the cells above (the last dataset that was set up).
s.get_leaderboard()

In [None]:
### The following shows which models are natively available in the pycaret library
# It is possible to add estimators
# NOTE(review): `models` is presumably provided by the star import from
# pycaret.classification - confirm. This assignment also rebinds `all_models`,
# which earlier cells use for the model-container lookup; consider a distinct name.
all_models = models()
display(all_models)

In [None]:
# Need to define which parameters should be explored and which search method to use, e.g. grid search vs. random search vs. optuna
# default search method: random grid search
# TODO: look up the default search range parameters

# uses the best model to optimize
#tuned = tune_model(clf, optimize='Accuracy', n_iter=10)