# Step 2: Preprocessing & Classification model
This section loads the dataset settings from the pickles directory, runs the machine learning pipeline with the help of the `pycaret` library, and saves the resulting data.

In [1]:
# importing all packages needed in this section
import pandas as pd
import os
import sys 

from sklearn.metrics import (classification_report, 
                             roc_auc_score, 
                             matthews_corrcoef,
                             cohen_kappa_score)

from pycaret.classification import *
from pycaret.containers.models.classification import get_all_model_containers

# utility functions for the experiment
sys.path.append('../src')

from mlflow_manager import MLFlowManager
from tuning_grids import Grids
from utils import getPicklesFromDir, getExperimentConfig, run_pycaret_setup, translate_model_name

# Get global experiment settings (used below via the keys 'folders',
# 'run_dataset' and 'clf').
config = getExperimentConfig()
# Path settings; this notebook reads the entries 'settings_dir', 'real_dir'
# and 'model_perf_filepath'.
folders = config['folders']
# get a list of all settings for the datasets prepared beforehand
# (one entry per pickle found in the settings directory — presumably one per dataset; verify)
dataset_settings = getPicklesFromDir(folders['settings_dir'])  

Each `dataset_settings` pickle is saved as a dictionary with the following keys:
```
"meta": meta_dataset,       # contains information about the dataset, including its id, name and filename
"setup_param": setup_param, # contains all the setup parameters for the pycaret setup() function
"sdg_param": sdg_param,     # contains all sdg parameters for the CTGAN() function

```

In [2]:
"""
Table that stores the performance of every evaluated model.

MLflow was initially going to be used for this. However, a bug surfaced on
Google Colab where reading the logs from a Colab cell got stuck in an endless
loop, hence this plain DataFrame.

Columns:
    Dataset id: str
        the id of the dataset that the model was evaluated on.
    model: str
        the shortened model name/id (e.g. lr = logistic regression, rf = random forest, etc.)
    F1, Accuracy, AUC, MCC, Kappa: float
        performance metrics from evaluating the model on the hold-out data
        (MCC = Matthews correlation coefficient, Kappa = Cohen's kappa).
    Params: dict
        the hyperparameters of the model.
    Tuned on: str
        whether the hyperparameters come from tuning on original or synthetic data.
    Trained on: str
        the type of data that the model was trained on, "original" or "synthetic".
    USI: str
        Unique Settings Identifier, a unique string generated by pycaret setup on each initialization.
    Quality: str
        if synthetic, the quality id of the generator.
    SDG: str
        the synthetic data generator id.
"""

# Column layout of the performance table; the frame starts out empty.
columns = [
    "Dataset id",
    "model",
    "F1",
    "Accuracy",
    "AUC",
    "MCC",
    "Kappa",
    "Params",
    "Tuned on",
    "Trained on",
    "USI",
    "Quality",
    "SDG",
]
model_performance_df = pd.DataFrame(columns=columns)

# Placeholder for a single result row.
performance_row = dict()

In [None]:
# Run the classification experiment on the original data:
# for every selected dataset, set up pycaret, tune each configured model,
# evaluate it on the hold-out split, log everything to MLflow and finally
# write one performance row per (dataset, model) to a csv file.
run_dataset = config['run_dataset']

# Result rows are collected in a plain list and turned into a DataFrame once,
# after the loop. `DataFrame.append` is deprecated (removed in pandas 2.0) and
# copies the whole frame on every call.
performance_rows = []

for settings in dataset_settings:

    # If run_dataset contains dataset ids, run the experiment only on the
    # specified datasets.
    if run_dataset is not None and settings['meta']['id'] not in run_dataset:
        continue

    # get path
    dataset_path = f"{folders['real_dir']}{settings['meta']['filename']}"
    # run setup function
    s = run_pycaret_setup(dataset_path, settings['setup_param'])

    USI = s.get_config('USI')

    # Both lookups only depend on the experiment `s`, not on the model,
    # so they are done once per dataset instead of once per model.
    all_models = get_all_model_containers(s)
    y_test = s.get_config('y_test')

    # Init experiment logging
    experiment_name = f"{settings['meta']['id']}-{settings['meta']['name']}"
    mlflow = MLFlowManager(experiment_name)

    logg_tags = {
        'Dataset id': settings['meta']['id'],
        'Tuned on': 'original',
        'Trained on': 'original',
        'USI': USI,
    }

    mlflow.start_run(mlflow.run_name_with_original_data, tags=logg_tags)
    try:
        # for each defined model in the global config
        # create specified model and tune it
        for ml_model in config['clf']['ml_models']:

            model_name = f"{settings['meta']['id']}-{translate_model_name(ml_model)}"

            logg_tags['model'] = ml_model

            mlflow.start_run(model_name, tags=logg_tags, nested=True)
            try:
                # create & tune model
                # Quickfix for efficiency: instantiate the estimator directly from
                # its container instead of calling s.create_model(ml_model).
                model = all_models[ml_model].class_def()

                tune_grid = Grids.get_tuning_grid(ml_model)

                tuned_model = s.tune_model(model, **config['clf']['tuning_param'], custom_grid=tune_grid)

                # get validation results (score grid of the tuning step)
                val_df = s.pull()
                val_score = {
                    'val_Accuracy': val_df['Accuracy']['Mean'],
                    'val_F1': val_df['F1']['Mean'],
                    # AUC, Kappa and MCC of the validation folds are currently not logged
                }

                # get the performance on the holdout data
                pred_model = s.predict_model(estimator=tuned_model)
                y_pred = pred_model['prediction_label']

                metrics = classification_report(y_true=y_test, y_pred=y_pred, output_dict=True, digits=4)
                holdout_score = pd.DataFrame.from_dict(metrics).transpose()

                test_metrics = {
                    "Accuracy": metrics['accuracy'],
                    "F1": metrics['macro avg']['f1-score'],
                    "MCC": matthews_corrcoef(y_true=y_test, y_pred=y_pred),
                    "Kappa": cohen_kappa_score(y1=y_test, y2=y_pred)
                }

                # AUC is only computed when predict_model returns a prediction_score
                # column (sometimes it does not).
                # NOTE(review): pycaret's `prediction_score` appears to be the probability
                # of the *predicted* label, not of the positive class, and it is 1-D.
                # If so, the binary AUC below is not a proper ROC AUC and the multiclass
                # call would need an (n_samples, n_classes) score matrix
                # (e.g. via predict_model(..., raw_score=True)) — confirm before relying on AUC.
                if 'prediction_score' in pred_model.columns:
                    y_pred_score = pred_model['prediction_score']
                    # If multiclass classification, use one-vs-one averaging
                    m_class = 'ovo' if y_test.nunique() > 2 else 'raise'
                    test_metrics['AUC'] = roc_auc_score(y_true=y_test, y_score=y_pred_score, multi_class=m_class)

                # log parameters
                mlflow.log_params(tuned_model.get_params())
                # log performance
                mlflow.log_tag('model', ml_model)
                mlflow.log_metrics(test_metrics)
                mlflow.log_metrics(val_score)
                mlflow.log_score_report_to_html(val_df, "Validation")
                mlflow.log_score_report_to_html(holdout_score, "Holdout")
                # log model
                mlflow.log_model(model=tuned_model)
            finally:
                # end run for the model, even if tuning/evaluation failed,
                # so no nested run is left open
                mlflow.end_run()

            # quick fix for colab issue: keep the results in a regular table as well
            performance_row = {**logg_tags, **test_metrics}
            performance_row['Params'] = tuned_model.get_params()
            performance_rows.append(performance_row)

        # Saving the details of the best model under the parent run was removed:
        # on google colab get_best_nested_run_by_metric ends up in an infinite run
        # (it seems unable to read the files holding the run data). The results
        # are saved to a regular file instead. Removed code, kept for reference:
        #   run_id = mlflow.get_active_run_id()
        #   best_run = mlflow.get_best_nested_run_by_metric(parent_run_id=run_id, metric_name="F1")
        #   # save under the "parent" run
        #   mlflow.log_params(best_run.data.params)
        #   mlflow.log_metrics(best_run.data.metrics)
        #   mlflow.log_tag('model run name', best_run.data.tags['mlflow.runName'])
        #   mlflow.log_tag('model', best_run.data.tags['model'])
        #   mlflow.log_tag('TrainedOn', 'original')
        #   mlflow.log_tag('Dataset ID', best_run.data.tags['Dataset ID'])
        #   mlflow.log_tag('model run id', best_run.info.run_id)
    finally:
        # end run for this dataset
        mlflow.end_run()

# Build the performance table in one go, using the column layout defined for
# model_performance_df. Re-running this cell replaces the table instead of
# appending duplicate rows to it.
model_performance_df = pd.DataFrame(performance_rows, columns=model_performance_df.columns)

# Save model performance to csv
model_performance_df.to_csv(folders['model_perf_filepath'], index=False)

Unnamed: 0,Description,Value
0,Session id,592
1,Target,Outcome
2,Target type,Binary
3,Original data shape,"(768, 9)"
4,Transformed data shape,"(768, 9)"
5,Transformed train set shape,"(614, 9)"
6,Transformed test set shape,"(154, 9)"
7,Numeric features,8
8,Preprocess,True
9,Imputation type,


Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 10 folds for each of 1 candidates, totalling 10 fits


In [None]:
#%notify

### Testing that the provided hyperparameters work with pycaret and the system
#settings = dataset_settings[0]
#dataset_path = f"{folders['real_dir']}{settings['meta']['filename']}"
#s = run_pycaret_setup(dataset_path, settings['setup_param'])
#for ml_model in ['rf', 'gbc', 'mlp']: #config['clf']['ml_models']:
#    # create & tune model
#    #model = s.create_model(ml_model)
#    #Quickfix for efficiency
#    all_models = get_all_model_containers(s)
#    model = all_models[ml_model].class_def()

#    tune_grid = Grids.get_tuning_grid(ml_model)
#    tuned_model = s.tune_model(model, 
#                               **config['clf']['tuning_param'], 
#                               custom_grid=tune_grid
#                              )