In [1]:
# Start a timer so the final cells can report how long the whole notebook took to run.
# `start_time` is read again by the timing cell near the end of the notebook.
import time

start_time = time.perf_counter()

# Step 2: Preprocessing & Classification model
This section loads the previously defined dataset settings from the pickles directory, runs the machine learning pipeline with the help of the `pycaret` library, and saves the resulting data.

In [2]:
# importing all packages needed in this section
import pandas as pd
import os
import sys 

from sklearn.metrics import classification_report
from pycaret.classification import *

# utility functions for the experiment
sys.path.append('../src')

from mlflow_manager import MLFlowManager
from tuning_grids import Grids
from utils import getPicklesFromDir, getExperimentConfig, run_pycaret_setup, translate_model_name

# Get global experiment settings; this notebook reads the 'folders' and 'clf' entries
config = getExperimentConfig()
folders = config['folders']
# Get a list of all settings for the datasets prepared beforehand.
# Each entry is indexed with 'meta' and 'setup_param' by the training loop below.
dataset_settings = getPicklesFromDir(folders['settings_dir'])  

Each `dataset_settings` pickle is saved with the following structure:
```
"meta": meta_dataset,       # contains information about the dataset, including path
"setup_param": setup_param, # contains all the setup parameters for pycaret setup() function
"sdg_param": sdg_param,     # contains all sdg parameters for the CTGAN() function

```

In [3]:
# Train, tune, evaluate and log every configured classifier on each original dataset.
#
# For every dataset settings entry:
#   1. run the pycaret setup on the original dataset,
#   2. open a parent run ("Original data models") in the dataset's experiment,
#   3. create + tune each model listed in config['clf']['ml_models'] inside a nested run,
#      logging parameters, validation metrics, score reports and the model itself,
#   4. copy the details of the best nested run (by 'Accuracy') onto the parent run.
#
# Runs are closed in `finally` blocks so that a failing model (e.g. an invalid tuning
# grid) does not leave the nested or parent run active.
#
# NOTE(review): `s`, `model`, `tune_grid`, `tuned_model`, `mlflow` and `best_run` are
# read by later cells, so their names are intentionally unchanged.
for settings in dataset_settings:
    meta = settings['meta']

    # get path
    dataset_path = f"{folders['real_dir']}{meta['filename']}"
    # run setup function
    s = run_pycaret_setup(dataset_path, settings['setup_param'])

    USI = s.get_config('USI')

    # Init experiment logging
    experiment_name = f"{meta['id']}-{meta['name']}"
    mlflow = MLFlowManager(experiment_name)

    logg_tags = {
        'USI': USI,
        'Dataset ID': meta['id'],
        'Dataset Type': 'original'
    }

    mlflow.start_run("Original data models", tags=logg_tags)
    try:
        # for each defined model in the global config
        # create specified model and tune it
        for ml_model in config['clf']['ml_models']:

            model_name = f"{meta['id']}-{translate_model_name(ml_model)}"
            # per-model copy, so the tags of the parent run are never mutated
            model_tags = {**logg_tags, 'model': ml_model}

            mlflow.start_run(model_name, tags=model_tags, nested=True)
            try:
                # create & tune model
                model = s.create_model(ml_model)

                tune_grid = Grids.get_tuning_grid(ml_model)

                tuned_model = s.tune_model(model, **config['clf']['tuning_param'], custom_grid=tune_grid)

                # get validation results: the score grid of the tuning step, 'Mean' row
                val_df = s.pull()
                val_score = {
                    'Accuracy': val_df.loc['Mean', 'Accuracy'],
                    'F1-score': val_df.loc['Mean', 'F1'],
                    'AUC':      val_df.loc['Mean', 'AUC'],
                    'Kappa':    val_df.loc['Mean', 'Kappa'],
                    'MCC':      val_df.loc['Mean', 'MCC'],
                }

                # test the model on the holdout-data; predict_model returns the
                # predictions, the holdout metrics are the score grid pulled afterwards
                holdout_predictions = s.predict_model(estimator=tuned_model)
                holdout_score = s.pull()

                # log parameters
                mlflow.log_params(tuned_model.get_params())
                # log performance
                mlflow.log_metrics(val_score)
                mlflow.log_score_report_to_html(val_df, "Validation")
                mlflow.log_score_report_to_html(holdout_score, "Holdout")
                # log model
                mlflow.log_model(model=tuned_model)
            finally:
                # always close the nested model run, even when a step above raised
                mlflow.end_run()

        # Save the details of the model with the best accuracy under the 'Original data models' run
        best_run = mlflow.get_best_run_by_metric(metric_name='Accuracy')

        mlflow.log_params(best_run.data.params)
        mlflow.log_metrics(best_run.data.metrics)
        mlflow.log_tag('model run name', best_run.data.tags['mlflow.runName'])
        mlflow.log_tag('model', best_run.data.tags['model'])
        mlflow.log_tag('model run id', best_run.info.run_id)
    finally:
        # always close the parent run of this dataset
        mlflow.end_run()

Unnamed: 0,Description,Value
0,Session id,6636
1,Target,Outcome
2,Target type,Binary
3,Original data shape,"(768, 9)"
4,Transformed data shape,"(768, 9)"
5,Transformed train set shape,"(614, 9)"
6,Transformed test set shape,"(154, 9)"
7,Numeric features,8
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8065,0.0,0.5,0.9167,0.6471,0.5291,0.5753
1,0.6935,0.0,0.5455,0.5714,0.5581,0.3238,0.324
2,0.629,0.0,0.5,0.4783,0.4889,0.198,0.1981
3,0.6452,0.0,0.8182,0.5,0.6207,0.3221,0.357
4,0.7869,0.0,0.381,1.0,0.5517,0.4466,0.5362
5,0.7213,0.0,0.381,0.6667,0.4848,0.3128,0.3358
6,0.6885,0.0,0.7143,0.5357,0.6122,0.3607,0.3712
7,0.7049,0.0,0.5714,0.5714,0.5714,0.3464,0.3464
8,0.7869,0.0,0.4286,0.9,0.5806,0.4609,0.5179
9,0.623,0.0,0.2857,0.4286,0.3429,0.0931,0.0968


Fitting 10 folds for each of 1 candidates, totalling 10 fits


ValueError: Invalid parameter 'actual_estimator' for estimator SGDClassifier(alpha=0.0001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.001, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='hinge',
              max_iter=1000, n_iter_no_change=5, n_jobs=-1, penalty='l2',
              power_t=0.5, random_state=6636, shuffle=True, tol=0.001,
              validation_fraction=0.1, verbose=0, warm_start=False). Valid parameters are: ['alpha', 'average', 'class_weight', 'early_stopping', 'epsilon', 'eta0', 'fit_intercept', 'l1_ratio', 'learning_rate', 'loss', 'max_iter', 'n_iter_no_change', 'n_jobs', 'penalty', 'power_t', 'random_state', 'shuffle', 'tol', 'validation_fraction', 'verbose', 'warm_start'].

In [10]:
# Debugging cell: retry the tuning step for the model that failed in the loop above.
# Relies on `s`, `model`, `tune_grid` and `config` left over from that cell.
import numpy as np  # was commented out, which raises NameError for `np` on a fresh kernel

# NOTE(review): the traceback's list of valid SGDClassifier parameters does not
# include 'C' — confirm that this grid key is applicable to the failing model.
tune_grid['C'] = np.logspace(-3, 2, 20).tolist()
tuned_model = s.tune_model(model, **config['clf']['tuning_param'], custom_grid=tune_grid)



Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 10 folds for each of 1 candidates, totalling 10 fits


ValueError: Invalid parameter 'actual_estimator' for estimator SGDClassifier(alpha=0.0001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.001, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='hinge',
              max_iter=1000, n_iter_no_change=5, n_jobs=-1, penalty='l2',
              power_t=0.5, random_state=6636, shuffle=True, tol=0.001,
              validation_fraction=0.1, verbose=0, warm_start=False). Valid parameters are: ['alpha', 'average', 'class_weight', 'early_stopping', 'epsilon', 'eta0', 'fit_intercept', 'l1_ratio', 'learning_rate', 'loss', 'max_iter', 'n_iter_no_change', 'n_jobs', 'penalty', 'power_t', 'random_state', 'shuffle', 'tol', 'validation_fraction', 'verbose', 'warm_start'].

In [None]:
# Stop the timer started in the first cell and report the elapsed run time
# in whole seconds and in minutes (one decimal).
end_time = time.perf_counter()
elapsed_seconds = end_time - start_time

print("Time to run the whole notebook: {} seconds".format(int(round(elapsed_seconds, 0))))
print("Time to run the whole notebook: {} minutes".format(round(elapsed_seconds / 60, 1)))

In [None]:

# Scratch cell for inspecting the best run; relies on `best_run` and `mlflow`
# left over from the training loop cell.
# NOTE(review): only the last expression of a cell is displayed, so the run name
# on the next line is evaluated but not shown.
best_run.data.tags['mlflow.runName']
# NOTE(review): called without `metric_name`, unlike the call in the training loop —
# presumably the manager has a default metric; confirm.
mlflow.get_best_run_by_metric()

In [None]:
# Scratch cell: tag a run with the best model's run name, model tag and run id,
# using accessor methods on the manager instead of indexing best_run.data.tags.
# NOTE(review): relies on `mlflow` and `best_run` from the training loop cell, whose
# parent run has already been ended there — confirm which run receives these tags.
mlflow.log_tag('Best model run name', mlflow.get_run_name(best_run))
mlflow.log_tag('model', mlflow.get_model_tag(best_run))
mlflow.log_tag('Best model run id', best_run.info.run_id)

---

### Notice 
The following cells, until the end of this section (i.e. section 3.0), contain experimental code that will not be run.

In [None]:
# Experimental: display the leaderboard of experiment `s`.
# NOTE(review): `s` is whatever the last iteration of the training loop left behind.
s.get_leaderboard()

In [None]:
# The following shows which models are natively available in the pycaret library.
# It is possible to add estimators.
# NOTE(review): `models` is presumably provided by the wildcard import
# `from pycaret.classification import *` in the imports cell — confirm.
all_models = models()
display(all_models)

In [None]:
# Need to define param that should be explored, define which method e.g. grid_search vs random vs optuna
# default search method: random grid search
# Todo: lookup default search range parameters

# uses the best model to optimize
#tuned = tune_model(clf, optimize='Accuracy', n_iter=10)