In [1]:
# Start a wall-clock timer; the timing cell near the end of the notebook reads
# `start_time` to report how long the whole notebook took to run
import time

start_time = time.perf_counter()

# Step 2: Preprocessing & Classification model
This section loads the predefined settings from the pickles directory, runs the machine learning pipeline with the help of the `pycaret` library, and saves the respective results.

In [2]:
# importing all packages needed in this section
import pandas as pd
import os
import sys 

from sklearn.metrics import classification_report
from pycaret.classification import *

# utility functions for the experiment
# Extend the import search path so the local modules imported below can be found
# (presumably mlflow_manager.py and utils.py live in ../src — the path is relative
# to the directory the notebook is run from)
sys.path.append('../src')

from mlflow_manager import MLFlowManager
from utils import getPicklesFromDir, getExperimentConfig, run_pycaret_setup, translate_model_name

# Get global experiment settings
config = getExperimentConfig()
# folder locations; 'settings_dir' is read here and 'real_dir' in the training cell
folders = config['folders']
# get a list of all settings for the datasets prepared beforehand
dataset_settings = getPicklesFromDir(folders['settings_dir'])  

Each `dataset_settings` pickle is saved as a dictionary with the following keys:
```
"meta": meta_dataset,       # contains information about the dataset, including path (read as settings['meta'] in the next cell)
"setup_param": setup_param, # contains all the setup parameters for pycaret setup() function
"sdg_param": sdg_param,     # contains all sdg parameters for the CTGAN() function

```

In [3]:
# Train, tune and log every configured classifier on each prepared dataset.
#
# Per dataset there is one parent run ("Original data models") with one nested
# run per model. After all models are logged, the parameters, metrics and tags
# of the nested run with the best accuracy are copied onto the parent run.
#
# Every start_run() is paired with end_run() in a `finally` block so that a
# failing model does not leave runs open when the cell is re-executed.

# Maps the metric name logged for a run -> column name in pycaret's score grid.
VAL_METRIC_COLUMNS = {
    'Accuracy': 'Accuracy',
    'F1-score': 'F1',
    'AUC': 'AUC',
    'Kappa': 'Kappa',
    'MCC': 'MCC',
}

for settings in dataset_settings:
    meta = settings['meta']

    # get path
    dataset_path = f"{folders['real_dir']}{meta['filename']}"
    # run setup function
    s = run_pycaret_setup(dataset_path, settings['setup_param'])

    USI = s.get_config('USI')

    # Init experiment logging
    experiment_name = f"{meta['id']}-{meta['name']}"
    mlflow = MLFlowManager(experiment_name)

    logg_tags = {
        'USI': USI,
        'Dataset ID': meta['id'],
        'Dataset Type': 'original'
    }

    mlflow.start_run("Original data models", tags=logg_tags)
    try:
        # for each defined model in the global config
        # create specified model and tune it
        for ml_model in config['clf']['ml_models']:

            model_name = f"{meta['id']}-{translate_model_name(ml_model)}"
            # per-model copy: the dict handed to the parent run is left untouched
            model_tags = {**logg_tags, 'model': ml_model}

            mlflow.start_run(model_name, tags=model_tags, nested=True)
            try:
                # create & tune model
                model = s.create_model(ml_model)
                tuned_model = s.tune_model(model, **config['clf']['tuning_param'])

                # get validation results (row 'Mean' of the last displayed score grid)
                # NOTE(review): pycaret prints "Original model was better than the tuned
                # model, hence it will be returned. NOTE: The display metrics are for the
                # tuned model" for some models, so these scores may not belong to the
                # estimator stored in `tuned_model` — confirm this is intended.
                val_df = s.pull()
                val_score = {
                    metric: val_df[column]['Mean']
                    for metric, column in VAL_METRIC_COLUMNS.items()
                }

                # test the model on the holdout-data
                holdout_score = s.predict_model(estimator=tuned_model)

                # log parameters
                mlflow.log_params(tuned_model.get_params())
                # log performance
                mlflow.log_metrics(val_score)
                mlflow.log_score_report_to_html(val_df, "Validation")
                mlflow.log_score_report_to_html(holdout_score, "Holdout")
                # log model
                mlflow.log_model(model=tuned_model)
            finally:
                # always close the nested run, even if training/logging failed
                mlflow.end_run()

        # Save details of the model with the best accuracy under the 'Original data models' run
        best_run = mlflow.get_best_run_by_metric(metric_name='Accuracy')

        mlflow.log_params(best_run.data.params)
        mlflow.log_metrics(best_run.data.metrics)
        mlflow.log_tag('model run name', best_run.data.tags['mlflow.runName'])
        mlflow.log_tag('model', best_run.data.tags['model'])
        mlflow.log_tag('model run id', best_run.info.run_id)
    finally:
        # always close the parent run
        mlflow.end_run()

Unnamed: 0,Description,Value
0,Session id,890
1,Target,Outcome
2,Target type,Binary
3,Original data shape,"(768, 9)"
4,Transformed data shape,"(768, 9)"
5,Transformed train set shape,"(614, 9)"
6,Transformed test set shape,"(154, 9)"
7,Numeric features,8
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.7903,0.8693,0.5909,0.7647,0.6667,0.5174,0.5265
1,0.7903,0.8864,0.6364,0.7368,0.6829,0.5275,0.5307
2,0.7581,0.8625,0.5909,0.6842,0.6341,0.4549,0.4576
3,0.6935,0.7955,0.4091,0.6,0.4865,0.2791,0.2895
4,0.8033,0.8583,0.7619,0.6957,0.7273,0.5739,0.5754
5,0.8197,0.8976,0.619,0.8125,0.7027,0.5767,0.5876
6,0.7705,0.8464,0.4762,0.7692,0.5882,0.4411,0.4655
7,0.7377,0.7583,0.6667,0.6087,0.6364,0.4319,0.433
8,0.7377,0.7881,0.4286,0.6923,0.5294,0.3613,0.3812
9,0.7869,0.8452,0.619,0.7222,0.6667,0.5114,0.5147


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.7742,0.8693,0.6818,0.6818,0.6818,0.5068,0.5068
1,0.7903,0.8886,0.8636,0.6552,0.7451,0.5726,0.5884
2,0.7742,0.8614,0.6818,0.6818,0.6818,0.5068,0.5068
3,0.7097,0.8,0.6818,0.5769,0.625,0.3908,0.3945
4,0.7869,0.8643,0.8095,0.6538,0.7234,0.5532,0.5616
5,0.8197,0.8964,0.7619,0.7273,0.7442,0.6051,0.6055
6,0.7213,0.8381,0.619,0.5909,0.6047,0.3896,0.3899
7,0.6393,0.7607,0.7619,0.4848,0.5926,0.2966,0.3212
8,0.7705,0.7893,0.619,0.6842,0.65,0.4799,0.4812
9,0.7705,0.8452,0.6667,0.6667,0.6667,0.4917,0.4917


Fitting 10 folds for each of 2 candidates, totalling 20 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Logistic Regression,0.7792,0.7793,0.5556,0.75,0.6383,0.4844,0.4957


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.7581,0.808,0.4545,0.7692,0.5714,0.418,0.4461
1,0.7258,0.7693,0.5909,0.619,0.6047,0.3949,0.3952
2,0.7419,0.7636,0.5,0.6875,0.5789,0.3995,0.41
3,0.6613,0.7506,0.3636,0.5333,0.4324,0.2032,0.2108
4,0.7049,0.7708,0.5238,0.5789,0.55,0.3313,0.3322
5,0.7541,0.8071,0.619,0.65,0.6341,0.4491,0.4494
6,0.7705,0.7744,0.619,0.6842,0.65,0.4799,0.4812
7,0.7049,0.7173,0.619,0.5652,0.5909,0.3609,0.3618
8,0.7377,0.7357,0.5238,0.6471,0.5789,0.3915,0.3961
9,0.8197,0.8845,0.619,0.8125,0.7027,0.5767,0.5876


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.7903,0.8284,0.5,0.8462,0.6286,0.4956,0.5289
1,0.7581,0.8682,0.4545,0.7692,0.5714,0.418,0.4461
2,0.7903,0.8261,0.4545,0.9091,0.6061,0.484,0.538
3,0.7581,0.8642,0.4091,0.8182,0.5455,0.4046,0.4497
4,0.7705,0.8738,0.619,0.6842,0.65,0.4799,0.4812
5,0.7541,0.828,0.381,0.8,0.5161,0.378,0.4247
6,0.7705,0.8298,0.381,0.8889,0.5333,0.4118,0.4769
7,0.7377,0.725,0.5238,0.6471,0.5789,0.3915,0.3961
8,0.7541,0.7863,0.3333,0.875,0.4828,0.3615,0.434
9,0.7705,0.8661,0.4286,0.8182,0.5625,0.4268,0.4678


Fitting 10 folds for each of 2 candidates, totalling 20 fits


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,K Neighbors Classifier,0.7273,0.7964,0.3704,0.7143,0.4878,0.3265,0.3592


In [4]:
# Stop the notebook-wide timer and report the elapsed wall-clock time
# in whole seconds and in minutes (one decimal).
end_time = time.perf_counter()
elapsed_seconds = end_time - start_time
elapsed_minutes = elapsed_seconds / 60

print(f"Time to run the whole notebook: {int(round(elapsed_seconds, 0))} seconds")
print(f"Time to run the whole notebook: {round(elapsed_minutes, 1)} minutes")

Time to run the whole notebook: 156 seconds
Time to run the whole notebook: 2.6 minutes


In [5]:

# Ad-hoc inspection of the best run; relies on `best_run` and `mlflow` left over
# from the training cell.
# NOTE(review): only the last expression of a cell is displayed, so the value of
# the next line is discarded — wrap it in display(...) if it should be shown.
best_run.data.tags['mlflow.runName']
# Called without metric_name here (the training cell passes metric_name='Accuracy');
# presumably a default metric is used — TODO confirm in MLFlowManager.
mlflow.get_best_run_by_metric()

<Run: data=<RunData: metrics={'AUC': 0.8337,
 'Accuracy': 0.7735,
 'F1-score': 0.6413,
 'Kappa': 0.4794,
 'MCC': 0.4908}, params={'C': '2.506',
 'class_weight': '{}',
 'dual': 'False',
 'fit_intercept': 'True',
 'intercept_scaling': '1',
 'l1_ratio': 'None',
 'max_iter': '1000',
 'multi_class': 'auto',
 'n_jobs': 'None',
 'penalty': 'l2',
 'random_state': '202',
 'solver': 'lbfgs',
 'tol': '0.0001',
 'verbose': '0',
 'warm_start': 'False'}, tags={'Dataset ID': 'D0',
 'Dataset Type': 'original',
 'Run ID': '68afdec904da49348d9da9de08d43a7c',
 'USI': 'a572',
 'mlflow.runName': 'Original data models',
 'mlflow.source.name': 'C:\\Users\\flore\\miniconda3\\envs\\master\\lib\\site-packages\\ipykernel_launcher.py',
 'mlflow.source.type': 'LOCAL',
 'mlflow.user': 'flore',
 'model': 'lr',
 'model run id': '928f7a4e95c34cbab14442aec087223b',
 'model run name': 'D0-Logistic Regression'}>, info=<RunInfo: artifact_uri='file:///C:/Users/flore/source/repos/master-thesis-vt23/notebooks/mlruns/3/68afde

In [6]:
# Log tags describing the best model run, using the MLFlowManager helper accessors.
# NOTE(review): the training cell has already called mlflow.end_run() on the parent
# run and logged similar tags there under other names ('model run name',
# 'model run id') — confirm which run these tags land on and whether both are wanted.
mlflow.log_tag('Best model run name', mlflow.get_run_name(best_run))
mlflow.log_tag('model', mlflow.get_model_tag(best_run))
mlflow.log_tag('Best model run id', best_run.info.run_id)

---

### Notice 
The following cells, until the end of this section (i.e. up to section 3.0), contain experimental code that will not be run.

In [7]:
# Show the leaderboard of `s`, the pycaret experiment left over from the last
# iteration of the training loop (i.e. only the last dataset processed)
s.get_leaderboard()

Unnamed: 0_level_0,Model Name,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,Logistic Regression,"(TransformerWrapper(exclude=None,\n ...",0.7688,0.8408,0.5799,0.7086,0.6321,0.4675,0.4762
1,Logistic Regression,"(TransformerWrapper(exclude=None,\n ...",0.7557,0.8413,0.7147,0.6403,0.6715,0.4793,0.4848
2,Logistic Regression,"(TransformerWrapper(exclude=None,\n ...",0.7688,0.8408,0.5799,0.7086,0.6321,0.4675,0.4762
3,K Neighbors Classifier,"(TransformerWrapper(exclude=None,\n ...",0.7379,0.7781,0.5433,0.6547,0.5894,0.4005,0.4071
4,K Neighbors Classifier,"(TransformerWrapper(exclude=None,\n ...",0.7654,0.8296,0.4485,0.8056,0.5675,0.4252,0.4644
5,K Neighbors Classifier,"(TransformerWrapper(exclude=None,\n ...",0.7379,0.7781,0.5433,0.6547,0.5894,0.4005,0.4071


In [8]:
### The following shows which models are natively available in the pycaret library
# It is possible to add estimators
# models() is brought into scope by `from pycaret.classification import *`
all_models = models()
display(all_models)

Unnamed: 0_level_0,Name,Reference,Turbo
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
lr,Logistic Regression,sklearn.linear_model._logistic.LogisticRegression,True
knn,K Neighbors Classifier,sklearn.neighbors._classification.KNeighborsCl...,True
nb,Naive Bayes,sklearn.naive_bayes.GaussianNB,True
dt,Decision Tree Classifier,sklearn.tree._classes.DecisionTreeClassifier,True
svm,SVM - Linear Kernel,sklearn.linear_model._stochastic_gradient.SGDC...,True
rbfsvm,SVM - Radial Kernel,sklearn.svm._classes.SVC,False
gpc,Gaussian Process Classifier,sklearn.gaussian_process._gpc.GaussianProcessC...,False
mlp,MLP Classifier,sklearn.neural_network._multilayer_perceptron....,False
ridge,Ridge Classifier,sklearn.linear_model._ridge.RidgeClassifier,True
rf,Random Forest Classifier,sklearn.ensemble._forest.RandomForestClassifier,True


In [9]:
# Need to define param that should be explored, define which method e.g. grid_search vs random vs optuna
# default search method: random grid search
# Todo: lookup default search range parameters

# uses the best model to optimize
#tuned = tune_model(clf, optimize='Accuracy', n_iter=10)