In [1]:
# Record the start time so the last cell can report how long the whole notebook took to run
import time

start_time = time.perf_counter()

# Step 2: Preprocessing & Classification model
This section loads the previously defined settings from the pickles directory, runs the machine-learning pipeline with the help of the `pycaret` library, and saves the resulting data.

In [2]:
# importing all packages needed in this section
import pandas as pd
import os
import sys 

from sklearn.metrics import classification_report
from pycaret.classification import *

# utility functions for the experiment
sys.path.append('../src')

from mlflow_manager import MLFlowManager
from utils import getPicklesFromDir, getExperimentConfig, run_pycaret_setup, translate_model_name

# Get global experiment settings; `folders` is the 'folders' entry of that config
# and is indexed by the cells below to look up the directories they use.
config = getExperimentConfig()
folders = config['folders']
# Get a list of all settings for the datasets prepared beforehand
# (read from the directory configured as folders['settings_dir']).
dataset_settings = getPicklesFromDir(folders['settings_dir'])  

Each settings pickle in `dataset_settings` is structured as follows:
```
"meta": meta_dataset,       # contains information about the dataset, including path
"setup_param": setup_param, # contains all the setup parameters for pycaret setup() function
"sdg_param": sdg_param,     # contains all sdg parameters for the CTGAN() function

```

In [None]:
# MLflow metric name -> column of the pycaret score grid the value is read from.
val_metric_columns = {
    'Accuracy': 'Accuracy',
    'F1-score': 'F1',
    'AUC':      'AUC',
    'Kappa':    'Kappa',
    'MCC':      'MCC',
}

# For every prepared dataset: run the pycaret setup, then create, tune, evaluate
# and log each classifier listed in the global config. One parent MLflow run is
# opened per dataset and one nested run per classifier.
for settings in dataset_settings:
    # get path
    dataset_path = f"{folders['real_dir']}{settings['meta']['filename']}"
    # run setup function
    s = run_pycaret_setup(dataset_path, settings['setup_param'])

    # 'USI' is read from the experiment config and attached to every run as a tag
    # (presumably pycaret's unique session identifier - confirm).
    USI = s.get_config('USI')

    # Init experiment logging
    experiment_name = f"{settings['meta']['id']}-{settings['meta']['name']}"
    mlflow = MLFlowManager(experiment_name)

    logg_tags = {
        'USI': USI,
        'Dataset ID': settings['meta']['id'],
        'Dataset Type': 'original'
    }

    mlflow.start_run("Original data models", tags=logg_tags)
    # try/finally guarantees the parent run is closed even when training fails or
    # the kernel is interrupted; otherwise the run stays active and the next
    # start_run() would attach to / collide with the stale run.
    try:
        # for each defined model in the global config
        # create specified model and tune it
        for ml_model in config['clf']['ml_models']:

            model_name = f"{settings['meta']['id']}-{translate_model_name(ml_model)}"
            mlflow.start_run(model_name, tags=logg_tags, nested=True)
            # Same guarantee for the nested run of this classifier.
            try:
                # create & tune model
                model = s.create_model(ml_model)
                tuned_model = s.tune_model(model, **config['clf']['tuning_param'])

                # get validation results: pull() returns the score grid displayed last,
                # i.e. the cross-validation grid produced by tune_model().
                # NOTE(review): when pycaret reports "Original model was better than the
                # tuned model, hence it will be returned", this grid still describes the
                # tuned candidate, so the logged validation metrics may not belong to the
                # estimator that is logged below - confirm whether that is acceptable.
                val_df = s.pull()
                # read the 'Mean' row of the grid for every metric that is logged
                val_score = {
                    metric_name: val_df.loc['Mean', column]
                    for metric_name, column in val_metric_columns.items()
                }

                # test the model on the holdout-data
                # predict_model() returns the hold-out rows with the prediction columns
                # appended; the hold-out score grid is only displayed, so it has to be
                # fetched with pull() to log the scores rather than the predictions.
                holdout_predictions = s.predict_model(estimator=tuned_model)
                holdout_score = s.pull()

                # log parameters
                mlflow.log_params(tuned_model.get_params())
                mlflow.log_tag('model', ml_model)
                # log performance
                mlflow.log_metrics(val_score)
                mlflow.log_score_report_to_html(val_df, "Validation")
                mlflow.log_score_report_to_html(holdout_score, "Holdout")
                # log model
                mlflow.log_model(model=tuned_model)
            finally:
                mlflow.end_run()

        # Save the details of the model with the best accuracy under the 'Original data models' run
        best_run = mlflow.get_best_run_by_metric(metric_name='Accuracy')

        mlflow.log_params(best_run.data.params)
        mlflow.log_metrics(best_run.data.metrics)
        mlflow.log_tag('Best model run name', mlflow.get_run_name(best_run))
        mlflow.log_tag('model', mlflow.get_model_tag(best_run))
        mlflow.log_tag('Best model run id', best_run.info.run_id)
    finally:
        mlflow.end_run()

Unnamed: 0,Description,Value
0,Session id,7588
1,Target,Outcome
2,Target type,Binary
3,Original data shape,"(768, 9)"
4,Transformed data shape,"(768, 9)"
5,Transformed train set shape,"(614, 9)"
6,Transformed test set shape,"(154, 9)"
7,Numeric features,8
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.6774,0.7864,0.3636,0.5714,0.4444,0.2327,0.2445
1,0.7258,0.7364,0.4091,0.6923,0.5143,0.3404,0.3633
2,0.8387,0.9045,0.7727,0.7727,0.7727,0.6477,0.6477
3,0.7097,0.8295,0.5,0.6111,0.55,0.3389,0.3426
4,0.7213,0.8131,0.5238,0.6111,0.5641,0.3611,0.3634
5,0.7541,0.8214,0.619,0.65,0.6341,0.4491,0.4494
6,0.8033,0.8655,0.5238,0.8462,0.6471,0.5209,0.5497
7,0.7049,0.75,0.4286,0.6,0.5,0.2989,0.3074
8,0.7377,0.8083,0.619,0.619,0.619,0.419,0.419
9,0.8361,0.8333,0.5714,0.9231,0.7059,0.6008,0.634


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.6774,0.7864,0.3636,0.5714,0.4444,0.2327,0.2445
1,0.7258,0.7352,0.4091,0.6923,0.5143,0.3404,0.3633
2,0.8387,0.9045,0.7727,0.7727,0.7727,0.6477,0.6477
3,0.7097,0.8295,0.5,0.6111,0.55,0.3389,0.3426
4,0.7213,0.8131,0.5238,0.6111,0.5641,0.3611,0.3634
5,0.7541,0.8214,0.619,0.65,0.6341,0.4491,0.4494
6,0.7869,0.8631,0.5238,0.7857,0.6286,0.4874,0.5071
7,0.7049,0.7512,0.4286,0.6,0.5,0.2989,0.3074
8,0.7377,0.8095,0.619,0.619,0.619,0.419,0.419
9,0.8361,0.8333,0.5714,0.9231,0.7059,0.6008,0.634


Fitting 10 folds for each of 2 candidates, totalling 20 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Logistic Regression,0.8506,0.8885,0.7037,0.8444,0.7677,0.659,0.6649


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.7419,0.7239,0.5909,0.65,0.619,0.4246,0.4257
1,0.6774,0.6716,0.4545,0.5556,0.5,0.2654,0.2683
2,0.7419,0.7528,0.6364,0.6364,0.6364,0.4364,0.4364
3,0.7419,0.758,0.5455,0.6667,0.6,0.4123,0.4168
4,0.7049,0.753,0.5238,0.5789,0.55,0.3313,0.3322
5,0.6885,0.7244,0.4762,0.5556,0.5128,0.2859,0.2877
6,0.7049,0.7298,0.4286,0.6,0.5,0.2989,0.3074
7,0.7049,0.7185,0.4762,0.5882,0.5263,0.3155,0.3192
8,0.7377,0.8327,0.5238,0.6471,0.5789,0.3915,0.3961
9,0.8361,0.7988,0.619,0.8667,0.7222,0.6105,0.6279


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.7258,0.8205,0.4091,0.6923,0.5143,0.3404,0.3633
1,0.6935,0.6784,0.3182,0.6364,0.4242,0.2458,0.2733
2,0.8065,0.8352,0.6364,0.7778,0.7,0.5592,0.5654
3,0.7742,0.8091,0.5455,0.75,0.6316,0.4746,0.4871
4,0.7705,0.8071,0.4762,0.7692,0.5882,0.4411,0.4655
5,0.7213,0.772,0.381,0.6667,0.4848,0.3128,0.3358
6,0.7705,0.856,0.381,0.8889,0.5333,0.4118,0.4769
7,0.7213,0.75,0.3333,0.7,0.4516,0.295,0.3315
8,0.7705,0.8452,0.4762,0.7692,0.5882,0.4411,0.4655
9,0.8033,0.9119,0.4762,0.9091,0.625,0.5087,0.5576


Fitting 10 folds for each of 2 candidates, totalling 20 fits


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,K Neighbors Classifier,0.7662,0.8407,0.5,0.75,0.6,0.444,0.4623


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.7097,0.8011,0.5909,0.5909,0.5909,0.3659,0.3659
1,0.6774,0.708,0.4091,0.5625,0.4737,0.2494,0.256
2,0.8387,0.9114,0.9091,0.7143,0.8,0.6681,0.6817
3,0.7258,0.8136,0.5455,0.6316,0.5854,0.3822,0.3845
4,0.7213,0.7702,0.4762,0.625,0.5405,0.3457,0.3523
5,0.8033,0.8226,0.7143,0.7143,0.7143,0.5643,0.5643
6,0.7869,0.8714,0.5238,0.7857,0.6286,0.4874,0.5071
7,0.6721,0.7286,0.4762,0.5263,0.5,0.257,0.2577
8,0.7049,0.8083,0.5714,0.5714,0.5714,0.3464,0.3464
9,0.7869,0.806,0.5714,0.75,0.6486,0.4997,0.5092


Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 10 folds for each of 2 candidates, totalling 20 fits


In [None]:
# `s` is the pycaret experiment left over from the last iteration of the loop above,
# so this leaderboard presumably only covers the last dataset that was processed.
s.get_leaderboard()

In [None]:
# Stop the timer started in the first cell and report the total runtime,
# once in whole seconds and once in minutes (one decimal).
end_time = time.perf_counter()
elapsed_seconds = end_time - start_time

print(f"Time to run the whole notebook: {int(round(elapsed_seconds, 0))} seconds")
print(f"Time to run the whole notebook: {round(elapsed_seconds / 60, 1)} minutes")

---

### Notice 
The following cells, until the end of this section (i.e. section 3.0), contain experimental code that will not be run.

In [None]:
### The following shows which models are natively available in the pycaret library
# It is possible to add custom estimators as well
all_models = models()
display(all_models)

In [None]:
# Still need to define which parameters should be explored and which search method to use, e.g. grid search vs. random search vs. optuna
# default search method: random grid search
# TODO: look up the default search range parameters

# uses the best model to optimize
#tuned = tune_model(clf, optimize='Accuracy', n_iter=10)