# Step 2: Preprocessing & Classification model
This section loads the prepared dataset settings from the pickles directory, runs the machine learning pipeline (setup, hyperparameter tuning and holdout evaluation) with the `pycaret` library, and logs the resulting parameters, metrics and models with MLflow.

In [1]:
# importing all packages needed in this section
import pandas as pd
import os
import sys 

from sklearn.metrics import (classification_report, 
                             roc_auc_score, 
                             matthews_corrcoef,
                             cohen_kappa_score)

from pycaret.classification import *
from pycaret.containers.models.classification import get_all_model_containers

# utility functions for the experiment
sys.path.append('../src')

from mlflow_manager import MLFlowManager
from tuning_grids import Grids
from utils import getPicklesFromDir, getExperimentConfig, run_pycaret_setup, translate_model_name

# Global experiment configuration; `folders` holds the directory layout it declares
config = getExperimentConfig()
folders = config['folders']
# One settings entry per dataset, read from the pickles written by the preparation step
dataset_settings = getPicklesFromDir(folders['settings_dir'])

Each entry in `dataset_settings` is loaded from a pickle that stores the following keys:
```
"meta": meta_dataset,       # contains information about the dataset (id, name, filename)
"setup_param": setup_param, # contains all the setup parameters for pycaret setup() function
"sdg_param": sdg_param,     # contains all sdg parameters for the CTGAN() function

```

In [2]:
#TODO: use test-holdout data


def _holdout_auc(y_true, predictions):
    """Compute ROC AUC on the holdout set from per-class probability columns.

    Parameters
    ----------
    y_true : pandas.Series
        True holdout labels.
    predictions : pandas.DataFrame
        Output of ``predict_model(..., raw_score=True)``. Per-class probabilities
        are expected in columns named ``prediction_score_<label>``, ordered like
        the estimator's classes.

    Returns
    -------
    float or None
        The AUC, or ``None`` when no per-class score columns are present
        (e.g. the estimator does not expose probabilities).
    """
    score_cols = [c for c in predictions.columns
                  if str(c).startswith('prediction_score_')]
    if not score_cols:
        return None

    class_scores = predictions[score_cols].to_numpy(dtype=float)

    if class_scores.shape[1] == 2:
        # Binary case: roc_auc_score expects the score of the class with the
        # greater label, which is the second probability column.
        return roc_auc_score(y_true=y_true, y_score=class_scores[:, 1])

    # Multiclass case: predict_model rounds the probabilities, so rows may no
    # longer sum to exactly 1 and roc_auc_score would reject them; renormalise.
    class_scores = class_scores / class_scores.sum(axis=1, keepdims=True)
    return roc_auc_score(y_true=y_true, y_score=class_scores, multi_class='ovo')


run_dataset = config['run_dataset']

for settings in dataset_settings:
    dataset_id = settings['meta']['id']

    # If run_dataset lists dataset ids, run the experiment only on those datasets
    if run_dataset is not None and dataset_id not in run_dataset:
        continue

    # Build the dataset path and run the pycaret setup on it
    dataset_path = f"{folders['real_dir']}{settings['meta']['filename']}"
    s = run_pycaret_setup(dataset_path, settings['setup_param'])

    USI = s.get_config('USI')

    # The model containers depend only on the experiment, not on the model
    # being tuned, so build them once per dataset (still avoids create_model).
    all_models = get_all_model_containers(s)

    # Init experiment logging
    experiment_name = f"{dataset_id}-{settings['meta']['name']}"
    mlflow = MLFlowManager(experiment_name)

    logg_tags = {
        'USI': USI,
        'Dataset ID': dataset_id,
        'TrainedOn': 'original'
    }

    mlflow.start_run("Original data models", tags=logg_tags)
    try:
        # For each model defined in the global config: instantiate it, tune it,
        # evaluate it on the holdout data and log everything in a nested run.
        for ml_model in config['clf']['ml_models']:

            model_name = f"{dataset_id}-{translate_model_name(ml_model)}"

            # Per-run copy so the tags handed to the parent run are never mutated
            run_tags = {**logg_tags, 'model': ml_model}

            mlflow.start_run(model_name, tags=run_tags, nested=True)
            try:
                # Quickfix for efficiency: instantiate the untrained estimator
                # directly instead of calling s.create_model(ml_model)
                model = all_models[ml_model].class_def()

                tune_grid = Grids.get_tuning_grid(ml_model)
                tuned_model = s.tune_model(model,
                                           **config['clf']['tuning_param'],
                                           custom_grid=tune_grid)

                # Cross-validation results of the tuned model; the 'Mean' row
                # holds the fold averages. AUC, Kappa and MCC means are also
                # available in val_df but are not logged as metrics.
                val_df = s.pull()
                val_score = {
                    'val_Accuracy': val_df.loc['Mean', 'Accuracy'],
                    'val_F1': val_df.loc['Mean', 'F1'],
                }

                # Performance on the holdout data. raw_score=True requests one
                # probability column per class; the plain 'prediction_score'
                # column is the probability of the *predicted* label and is not
                # a valid input for ROC AUC.
                y_test = s.get_config('y_test')
                pred_model = s.predict_model(estimator=tuned_model, raw_score=True)
                y_pred = pred_model['prediction_label']

                metrics = classification_report(y_true=y_test, y_pred=y_pred,
                                                output_dict=True, digits=4)
                holdout_score = pd.DataFrame.from_dict(metrics).transpose()

                test_metrics = {
                    "Accuracy": metrics['accuracy'],
                    "F1": metrics['macro avg']['f1-score'],
                    "MCC": matthews_corrcoef(y_true=y_test, y_pred=y_pred),
                    "Kappa": cohen_kappa_score(y1=y_test, y2=y_pred)
                }

                # AUC is only logged when the model provides class probabilities
                auc = _holdout_auc(y_test, pred_model)
                if auc is not None:
                    test_metrics['AUC'] = auc

                # log parameters
                mlflow.log_params(tuned_model.get_params())
                # log performance
                mlflow.log_tag('model', ml_model)
                mlflow.log_metrics(test_metrics)
                mlflow.log_metrics(val_score)
                mlflow.log_score_report_to_html(val_df, "Validation")
                mlflow.log_score_report_to_html(holdout_score, "Holdout")
                # log model
                mlflow.log_model(model=tuned_model)
            finally:
                # Always close the nested run, even if tuning or evaluation failed
                mlflow.end_run()

        # Save the details of the model with the best holdout F1 under the
        # 'Original data models' (parent) run.
        # Note: Error with google colab, ends up in infinite run when
        # get_best_run_by_metric is called
        best_run = mlflow.get_best_run_by_metric(metric_name='F1')
        mlflow.log_params(best_run.data.params)
        mlflow.log_metrics(best_run.data.metrics)
        mlflow.log_tag('model run name', best_run.data.tags['mlflow.runName'])
        mlflow.log_tag('model', best_run.data.tags['model'])
        mlflow.log_tag('TrainedOn', 'original')
        mlflow.log_tag('Dataset ID', best_run.data.tags['Dataset ID'])
        mlflow.log_tag('model run id', best_run.info.run_id)
    finally:
        # Always close the parent run so a failure does not leave it active
        mlflow.end_run()

Unnamed: 0,Description,Value
0,Session id,6371
1,Target,Outcome
2,Target type,Binary
3,Original data shape,"(768, 9)"
4,Transformed data shape,"(768, 9)"
5,Transformed train set shape,"(614, 9)"
6,Transformed test set shape,"(154, 9)"
7,Numeric features,8
8,Preprocess,True
9,Imputation type,


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.7742,0.7693,0.5455,0.75,0.6316,0.4746,0.4871
1,0.7258,0.7943,0.5455,0.6316,0.5854,0.3822,0.3845
2,0.7903,0.8625,0.5909,0.7647,0.6667,0.5174,0.5265
3,0.8226,0.892,0.5455,0.9231,0.6857,0.5732,0.6117
4,0.7213,0.7869,0.4762,0.625,0.5405,0.3457,0.3523
5,0.8361,0.8476,0.6667,0.8235,0.7368,0.6197,0.627
6,0.8197,0.8619,0.619,0.8125,0.7027,0.5767,0.5876
7,0.7541,0.7976,0.5238,0.6875,0.5946,0.4227,0.4308
8,0.6885,0.819,0.4286,0.5625,0.4865,0.2688,0.2739
9,0.8033,0.8333,0.5714,0.8,0.6667,0.5326,0.5477


Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Logistic Regression,0.7792,0.8707,0.5556,0.75,0.6383,0.4844,0.4957


In [3]:
#%notify

### Disabled smoke test (uncomment to run): checks on the first dataset that the provided hyperparameter grids work with pycaret and the system
#settings = dataset_settings[0]
#dataset_path = f"{folders['real_dir']}{settings['meta']['filename']}"
#s = run_pycaret_setup(dataset_path, settings['setup_param'])
#for ml_model in ['rf', 'gbc', 'mlp']: #config['clf']['ml_models']:
#    # create & tune model
#    #model = s.create_model(ml_model)
#    #Quickfix for efficiency
#    all_models = get_all_model_containers(s)
#    model = all_models[ml_model].class_def()

#    tune_grid = Grids.get_tuning_grid(ml_model)
#    tuned_model = s.tune_model(model, 
#                               **config['clf']['tuning_param'], 
#                               custom_grid=tune_grid
#                              )