# Step 2: Preprocessing & Classification model
This section loads the dataset settings from the pickles directory, runs the machine-learning pipeline with the help of the `pycaret` library, and saves the resulting model-performance data.

In [1]:
# importing all packages needed in this section
import pandas as pd
import os
import sys 

from sklearn.metrics import (classification_report, 
                             roc_auc_score, 
                             matthews_corrcoef,
                             cohen_kappa_score)

from pycaret.classification import *
from pycaret.containers.models.classification import get_all_model_containers

# utility functions for the experiment
sys.path.append('../src')

from mlflow_manager import MLFlowManager
from tuning_grids import Grids
from utils import getPicklesFromDir, getExperimentConfig, run_pycaret_setup, translate_model_name
# Load the global experiment settings through the project's `getExperimentConfig` helper
config = getExperimentConfig()
# Mapping of the directory / file locations used throughout this notebook
folders = config['folders']
# Load the settings that were pickled for the datasets prepared beforehand
# (presumably one entry per pickle file in the settings directory -- verify against `getPicklesFromDir`)
dataset_settings = getPicklesFromDir(folders['settings_dir'])  


Each entry in `dataset_settings` is a pickled dictionary saved as follows:
```
"meta": meta_dataset,       # contains information about the dataset, including its filename (read as settings['meta'] below)
"setup_param": setup_param, # contains all the setup parameters for pycaret setup() function
"sdg_param": sdg_param,     # contains all sdg parameters for the CTGAN() function

```

In [2]:
"""
Set up the table in which the model performance is stored.

MLflow was originally meant to hold these results, but a bug surfaced on Google Colab
where reading the logs through a Colab cell got stuck in an endless loop. The results
are therefore kept in a DataFrame that is persisted as a CSV file.

Columns:
    Dataset id: str
        the id of the dataset that the model was evaluated on.
    model: str
        the shortened model name/id (e.g. lr = logistic regression, rf = random forest, etc.)
    F1, Accuracy, AUC, MCC, Kappa: float
        performance metrics from evaluating the model on the hold-out data.
    Params: dict
        the hyperparameters of the model.
    Tuned on: str
        whether the hyperparameters come from tuning on original or synthetic data.
    Trained on: str
        the type of data that the model was trained on, "original" or "synthetic".
    USI: str
        Unique Settings Identifier, a unique string generated by pycaret setup on each initialization.
    Quality: str
        if synthetic, the quality id of the generator.
    SDG:
        the synthetic data generator id.
"""

# Column layout used when no performance file exists yet
columns = [
    "Dataset id", "model",
    "F1", "Accuracy", "AUC", "MCC", "Kappa",
    "Params", "Tuned on", "Trained on", "USI", "Quality", "SDG",
]

# Continue from the stored results when the file is present, otherwise start with an empty table
model_performance_df = (
    pd.read_csv(folders['model_perf_filepath'])
    if os.path.isfile(folders['model_perf_filepath'])
    else pd.DataFrame(columns=columns)
)

# Placeholder for the most recent result row; it is filled in by the experiment loop
performance_row = {}

In [None]:
# Tune and evaluate every configured model on the original data of each selected dataset.
# Every model run is logged through MLFlowManager and additionally collected as a row for
# the model-performance CSV (the CSV is the workaround for the Colab logging issue).
run_dataset = config['run_dataset']

# Rows produced during this run. They are merged into model_performance_df once, after
# the loop: DataFrame.append was removed in pandas 2.0 and copied the whole frame per row.
performance_rows = []

for settings in dataset_settings:

    if run_dataset is not None and settings['meta']['id'] not in run_dataset:
        # run_dataset holds dataset ids: when it is set,
        # the experiment only runs on the specified datasets
        continue

    # get path
    dataset_path = f"{folders['real_dir']}{settings['meta']['filename']}"
    # run setup function
    s = run_pycaret_setup(dataset_path, settings['setup_param'], meta=settings['meta'])

    USI = s.get_config('USI')

    # Init experiment logging
    experiment_name = f"{settings['meta']['id']}-{settings['meta']['name']}"
    mlflow = MLFlowManager(experiment_name)

    logg_tags = {
        'Dataset id': settings['meta']['id'],
        'Tuned on': 'original',
        'Trained on': 'original',
        'USI': USI,
    }

    # parent run for the dataset; each model gets a nested run below
    mlflow.start_run(mlflow.run_name_with_original_data, tags=logg_tags)

    # The model containers only depend on the experiment object,
    # so they are looked up once per dataset instead of once per model
    all_models = get_all_model_containers(s)

    # for each defined model in the global config
    # create specified model and tune it
    for ml_model in config['clf']['ml_models']:

        model_name = f"{settings['meta']['id']}-{translate_model_name(ml_model)}"

        logg_tags['model'] = ml_model

        mlflow.start_run(model_name, tags=logg_tags, nested=True)

        # instantiate the untrained estimator directly from its container
        model = all_models[ml_model].class_def()

        tune_grid = Grids.get_tuning_grid(ml_model)
        print(f"Tune grid: {tune_grid}")
        tuned_model = s.tune_model(model, custom_grid=tune_grid, **config['clf']['tuning_param'])

        # get validation results (score grid of the last pycaret call, i.e. tune_model)
        val_df = s.pull()
        val_score = {}
        val_score['val_Accuracy'] = val_df['Accuracy']['Mean']
        val_score['val_F1'] = val_df['F1']['Mean']

        # get the performance on the holdout data
        y_test = s.get_config('y_test')
        pred_model = s.predict_model(estimator=tuned_model)
        y_pred = pred_model['prediction_label']

        metrics = classification_report(y_true=y_test, y_pred=y_pred, output_dict=True, digits=4)
        holdout_score = pd.DataFrame.from_dict(metrics).transpose()

        test_metrics = {
            "Accuracy": metrics['accuracy'],
            "F1": metrics['macro avg']['f1-score'],
            "MCC": matthews_corrcoef(y_true=y_test, y_pred=y_pred),
            "Kappa": cohen_kappa_score(y1=y_test, y2=y_pred)
        }

        # predict_model only returns a prediction_score when the estimator can
        # produce probabilities (sometimes it can't)
        if 'prediction_score' in pred_model.columns:
            y_pred_score = pred_model['prediction_score']
            # AUC is only computed for binary targets. Requiring exactly two classes also
            # avoids the ValueError roc_auc_score raises when y_true holds a single class.
            if y_test.nunique() == 2:
                # prediction_score is the probability of the *predicted* class, whereas
                # roc_auc_score needs the score of the positive class (the greater label).
                # Rows predicted as the negative class therefore get 1 - score.
                positive_label = sorted(y_test.unique())[-1]
                positive_score = y_pred_score.where(y_pred == positive_label, 1 - y_pred_score)
                test_metrics['AUC'] = roc_auc_score(y_true=y_test, y_score=positive_score.values)

        # log parameters
        mlflow.log_params(tuned_model.get_params())
        # log performance
        mlflow.log_tag('model', ml_model)
        mlflow.log_metrics(test_metrics)
        mlflow.log_metrics(val_score)
        mlflow.log_score_report_to_html(val_df, "Validation")
        mlflow.log_score_report_to_html(holdout_score, "Holdout")
        # log model
        mlflow.log_model(model=tuned_model)
        # end run for the model
        mlflow.end_run()

        # quick fix for colab issue: keep the results in a plain table as well
        performance_row = {**logg_tags, **test_metrics}
        performance_row['Params'] = tuned_model.get_params()
        performance_rows.append(performance_row)

    # end run for this dataset
    mlflow.end_run()

# Merge the new rows into the performance table in a single concat
if performance_rows:
    model_performance_df = pd.concat(
        [model_performance_df, pd.DataFrame(performance_rows)],
        ignore_index=True,
    )

# Save model performance to csv
model_performance_df.to_csv(folders['model_perf_filepath'], index=False)

Unnamed: 0,Description,Value
0,Session id,8601
1,Target,Outcome
2,Target type,Binary
3,Original data shape,"(768, 9)"
4,Transformed data shape,"(768, 9)"
5,Transformed train set shape,"(614, 9)"
6,Transformed test set shape,"(154, 9)"
7,Numeric features,8
8,Preprocess,True
9,Imputation type,


Tune grid: {'C': [0.0001, 0.00026366508987303583, 0.0006951927961775605, 0.0018329807108324356, 0.004832930238571752, 0.012742749857031334, 0.03359818286283781, 0.08858667904100823, 0.23357214690901212, 0.615848211066026, 1.623776739188721, 4.281332398719396, 11.288378916846883, 29.763514416313132, 78.47599703514607, 206.913808111479, 545.5594781168514, 1438.44988828766, 3792.690190732246, 10000.0], 'penalty': ['l2', None]}


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8226,0.8909,0.7273,0.7619,0.7442,0.6085,0.6089
1,0.8065,0.9148,0.5909,0.8125,0.6842,0.5496,0.5641
2,0.7097,0.8148,0.4091,0.6429,0.5,0.3094,0.3251
3,0.8226,0.8557,0.6818,0.7895,0.7317,0.6002,0.6038
4,0.7213,0.7643,0.4762,0.625,0.5405,0.3457,0.3523
5,0.7869,0.7893,0.6667,0.7,0.6829,0.5226,0.5229
6,0.8033,0.9155,0.5238,0.8462,0.6471,0.5209,0.5497
7,0.7541,0.8381,0.5238,0.6875,0.5946,0.4227,0.4308
8,0.8197,0.8107,0.619,0.8125,0.7027,0.5767,0.5876
9,0.7049,0.8179,0.5238,0.5789,0.55,0.3313,0.3322


[32m[I 2023-04-30 16:58:30,287][0m Searching the best hyperparameters using 614 samples...[0m
[32m[I 2023-04-30 17:01:32,151][0m Finished hyperparemeter search![0m


Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).
Tune grid: {'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40], 'weights': ['uniform', 'distance'], 'metric': ['euclidean', 'manhattan', 'minkowski'], 'p': [1, 2]}


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8387,0.8534,0.7727,0.7727,0.7727,0.6477,0.6477
1,0.8226,0.8727,0.6364,0.8235,0.7179,0.5916,0.6021
2,0.6935,0.8153,0.3182,0.6364,0.4242,0.2458,0.2733
3,0.7903,0.8205,0.6364,0.7368,0.6829,0.5275,0.5307
4,0.7377,0.7565,0.5238,0.6471,0.5789,0.3915,0.3961
5,0.7377,0.7887,0.4762,0.6667,0.5556,0.3768,0.3875
6,0.7705,0.9024,0.4286,0.8182,0.5625,0.4268,0.4678
7,0.7377,0.7732,0.5238,0.6471,0.5789,0.3915,0.3961
8,0.7869,0.8089,0.5714,0.75,0.6486,0.4997,0.5092
9,0.7213,0.7857,0.4286,0.6429,0.5143,0.3297,0.343


[32m[I 2023-04-30 17:02:18,379][0m Searching the best hyperparameters using 614 samples...[0m
[32m[I 2023-04-30 17:05:22,278][0m Finished hyperparemeter search![0m


Tune grid: {'C': [0.0001, 0.00026366508987303583, 0.0006951927961775605, 0.0018329807108324356, 0.004832930238571752, 0.012742749857031334, 0.03359818286283781, 0.08858667904100823, 0.23357214690901212, 0.615848211066026, 1.623776739188721, 4.281332398719396, 11.288378916846883, 29.763514416313132, 78.47599703514607, 206.913808111479, 545.5594781168514, 1438.44988828766, 3792.690190732246, 10000.0], 'kernel': ['poly', 'sigmoid', 'rbf'], 'degree': [2, 3, 4, 5, 6], 'gamma': ['scale', 'auto'], 'coef0': [0.0, 0.1, 0.25, 0.5, 0, 75, 1.0], 'shrinking': [True, False], 'max_iter': [100000, 100000]}


Processing:   0%|          | 0/7 [00:00<?, ?it/s]

[32m[I 2023-04-30 17:06:06,759][0m Searching the best hyperparameters using 614 samples...[0m


In [None]:
#%notify

### Testing that the provided hyperparameters work with pycaret and the system
# NOTE: this whole cell is intentionally disabled. Every line must stay commented out:
# a single uncommented, indented line makes the cell fail with an IndentationError.
#settings = dataset_settings[0]
#dataset_path = f"{folders['real_dir']}{settings['meta']['filename']}"
#s = run_pycaret_setup(dataset_path, settings['setup_param'])
#for ml_model in ['rf', 'gbc', 'mlp']: #config['clf']['ml_models']:
#    # create & tune model
#    #model = s.create_model(ml_model)
#    #Quickfix for efficiency
#    all_models = get_all_model_containers(s)
#    model = all_models[ml_model].class_def()

#    tune_grid = Grids.get_tuning_grid(ml_model)
#    tuned_model = s.tune_model(model, 
#                               **config['clf']['tuning_param'], 
#                               custom_grid=tune_grid
#                              )

In [None]:

# NOTE(review): disabled scratch code for inspecting / label-encoding a target column.
# `y_df` is not defined in any visible cell and `type_of_target` is not explicitly imported,
# so this cannot be re-enabled as-is.
#type_of_target(y_df)

#from sklearn.preprocessing import LabelEncoder

#label_encoder = LabelEncoder()
#y = label_encoder.fit_transform(y_df)

#display(type(y))
#display(type_of_target(y))