# Step 4: Create models with synthetic data (SD)

In [2]:
import pandas as pd 
import os 
import sys 
import pickle
import re
import ast

from pycaret.classification import ClassificationExperiment
from pycaret.containers.models.classification import get_all_model_containers

from sklearn.metrics import (classification_report, 
                             roc_auc_score, 
                             matthews_corrcoef,
                             cohen_kappa_score)
from sklearn.model_selection import train_test_split

# Import help methods
sys.path.append('../src')
from utils import (getExperimentConfig, 
                   getPicklesFromDir, 
                   run_pycaret_setup, 
                   translate_model_name,
                   get_synthetic_filepaths_from_original_data_id,
                   convert_and_clean_dict)

from tuning_grids import Grids
from mlflow_manager import MLFlowManager

# Experiment configuration used throughout this notebook
config = getExperimentConfig()
# Folder and file locations are stored under the 'folders' key
folders = config['folders']
# Dataset-specific settings (derived from the real data), read from the
# pickles in the settings directory
dataset_settings = getPicklesFromDir(
    folders['settings_dir']
)

In [2]:
# Train and evaluate models on the synthetic datasets (SD) that belong to each
# original dataset, log every run, and append the test results to the
# performance table that was written in Step 2.


def _evaluate_on_test_data(experiment, estimator, x_test, y_test):
    """Collect the validation scores and score `estimator` on the test data.

    Call this directly after the estimator was created/tuned: the validation
    table is read with `experiment.pull()` before `predict_model` is called.

    Returns a tuple `(val_df, val_score, test_score, test_metrics)`.
    """
    # get validation results
    val_df = experiment.pull()
    val_score = {
        'val_Accuracy': val_df['Accuracy']['Mean'],
        'val_F1': val_df['F1']['Mean'],
    }

    pred_model = experiment.predict_model(estimator=estimator, data=x_test)
    y_pred = pred_model['prediction_label']

    metrics = classification_report(y_true=y_test, y_pred=y_pred, output_dict=True, digits=4)
    test_score = pd.DataFrame.from_dict(metrics).transpose()

    test_metrics = {
        "Accuracy": metrics['accuracy'],
        "F1": metrics['macro avg']['f1-score'],
        "MCC": matthews_corrcoef(y_true=y_test, y_pred=y_pred),
        "Kappa": cohen_kappa_score(y1=y_test, y2=y_pred),
    }

    # predict_model does not always return a prediction_score column
    if 'prediction_score' in pred_model.columns:
        # For multiclass classification use multi_class='ovo' (one-vs-one)
        # NOTE(review): a single 'prediction_score' column is passed as y_score;
        # scikit-learn expects per-class probabilities for multiclass targets and
        # the positive-class score for binary targets -- confirm this is intended.
        m_class = 'ovo' if y_test.nunique() > 2 else 'raise'
        test_metrics['AUC'] = roc_auc_score(y_true=y_test,
                                            y_score=pred_model['prediction_score'],
                                            multi_class=m_class)

    return val_df, val_score, test_score, test_metrics


def _log_model_run(tracker, estimator, val_df, val_score, test_score, test_metrics, model_tag=None):
    """Log parameters, scores and the model to the active run, then end that run."""
    # log parameters
    tracker.log_params(estimator.get_params())
    # log performance
    tracker.log_metric_report(test_metrics)
    tracker.log_metrics(val_score)
    tracker.log_score_report_to_html(val_df, "Validation")
    tracker.log_score_report_to_html(test_score, "Test_score")
    # log model
    if model_tag is not None:
        tracker.log_tag('model', model_tag)
    tracker.log_model(model=estimator)
    # end logging run for the model
    tracker.end_run()


def _with_performance_row(performance_df, tags, test_metrics, estimator):
    """Return `performance_df` with one extra row built from tags, metrics and params."""
    performance_row = {**tags, **test_metrics}
    performance_row['Params'] = estimator.get_params()
    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement
    return pd.concat([performance_df, pd.DataFrame([performance_row])], ignore_index=True)


# read performance data from Step 2
model_performance_df = pd.read_csv(folders['model_perf_filepath'])
# Metric used both for choosing the best model and as the tuning target
sort_by = config['clf']['tuning_param']['optimize']

run_dataset = config['run_dataset']

# The quality label (e.g. "Q10") is parsed from the synthetic dataset id
quality_pattern = re.compile(r'Q\d+')

for settings in dataset_settings:
    meta = settings['meta']

    if run_dataset is not None and meta['id'] not in run_dataset:
        continue

    # update system_log name
    settings['setup_param']['system_log'] = folders['log_dir']+"Step4_SD"
    # disable saving train-test split data (to save space)
    settings['setup_param']['log_data'] = False

    # Get experiment logging
    experiment_name = f"{meta['id']}-{meta['name']}"
    mlflow = MLFlowManager(experiment_name)

    # load original dataset, with explicit column dtypes when the settings provide them
    cols_dtype = meta['cols_dtype'] if 'cols_dtype' in meta else None
    original_data = pd.read_csv(f"{folders['real_dir']}{meta['filename']}", dtype=cols_dtype)

    # Only need the test data, using same stratified split size as in Step 2
    # NOTE(review): no random_state is given, so the selected test rows differ
    # between runs -- confirm whether the split should be reproducible.
    target = meta['target']
    _, x_test, _, y_test = train_test_split(original_data.drop(columns=[target]),  # X (predictors)
                                            original_data[target],                 # y (target label)
                                            train_size=settings['setup_param']['train_size'],
                                            stratify=original_data[target])

    # Parent run that groups all synthetic-data runs of this original dataset
    mlflow.start_run('Synthetic data models', tags={
        'Trained on': 'synthetic',
        'Tuned on': 'original',
    })

    # Filter the DataFrame based on the Dataset id and sort by specified column
    # to get hyperparameters and model name for the "best model"
    filtered_df = model_performance_df[model_performance_df["Dataset id"] == meta['id']]
    sorted_df = filtered_df.sort_values(by=sort_by, ascending=False)

    best_ml_model = sorted_df.iloc[0].model
    best_hyperparameters = ast.literal_eval(sorted_df.iloc[0].Params)

    # bugfix: remove 'priors' from hyperparameters
    best_hyperparameters.pop('priors', None)

    synthetic_datasets = get_synthetic_filepaths_from_original_data_id(meta['id'])

    for sd_filename in synthetic_datasets:

        sd_id = os.path.splitext(sd_filename)[0]
        quality = quality_pattern.findall(sd_id)[0]
        sd_path = folders['sd_dir']+sd_filename

        s = run_pycaret_setup(sd_path, settings['setup_param'], meta=meta)
        USI = s.get_config('USI')

        ########### Test the model with best performance from best original dataset ###########
        model_name = f"Original_{sd_id}{translate_model_name(best_ml_model)}"

        # Add custom tags to the log, defining dataset type and id
        logg_tags = {
            'Trained on': 'synthetic',
            'Dataset id': sd_id,
            'model': best_ml_model,
            'Quality': quality,
            'Tuned on': 'original',
            'SDG': sd_id.split("_")[0],
            'USI': USI
        }
        mlflow.start_run(model_name, tags=logg_tags, nested=True)

        # create the model, then re-create it with the hyperparameters found on the original data
        model = s.create_model(best_ml_model)
        model = model.set_params(**best_hyperparameters)
        tuned_model = s.create_model(model)

        val_df, val_score, test_score, test_metrics = _evaluate_on_test_data(s, tuned_model, x_test, y_test)
        _log_model_run(mlflow, tuned_model, val_df, val_score, test_score, test_metrics,
                       model_tag=best_ml_model)

        # quick fix for colab issue
        model_performance_df = _with_performance_row(model_performance_df, logg_tags, test_metrics, tuned_model)
        ########### End test hyper-param ###########

        # Quick fix for efficiency: instantiate estimators directly from the
        # model containers instead of calling s.create_model(ml_model).
        # The containers only depend on `s`, so build them once per synthetic dataset.
        all_models = get_all_model_containers(s)

        # Start testing all models
        for ml_model in config['clf']['ml_models']:
            # start log run
            logg_tags['model'] = ml_model
            logg_tags['Tuned on'] = 'synthetic'

            model_name = f"{sd_id}-{translate_model_name(ml_model)}"
            mlflow.start_run(model_name, tags=logg_tags, nested=True)

            model = all_models[ml_model].class_def()
            # Tune with the project-defined grid for this model
            tune_grid = Grids.get_tuning_grid(ml_model)
            tuned_model = s.tune_model(model, **config['clf']['tuning_param'], custom_grid=tune_grid)

            val_df, val_score, test_score, test_metrics = _evaluate_on_test_data(s, tuned_model, x_test, y_test)
            _log_model_run(mlflow, tuned_model, val_df, val_score, test_score, test_metrics)

            # quick fix for colab issue
            model_performance_df = _with_performance_row(model_performance_df, logg_tags, test_metrics, tuned_model)

        # Notes: no need to end logging run for SD_id
        # TODO: save best model for each sd_id-models

    # end logging for the synthetic datasets based on original id
    mlflow.end_run()

# Save model performance to csv
model_performance_df.to_csv(folders['model_perf_filepath'], index=False)

Unnamed: 0,Description,Value
0,Session id,414
1,Target,Outcome
2,Target type,Binary
3,Original data shape,"(768, 9)"
4,Transformed data shape,"(768, 9)"
5,Transformed train set shape,"(614, 9)"
6,Transformed test set shape,"(154, 9)"
7,Numeric features,8
8,Preprocess,True
9,Imputation type,


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.5161,0.5151,0.9677,0.5085,0.6667,0.0323,0.0752
1,0.5323,0.5307,0.9677,0.5172,0.6742,0.0645,0.1313
2,0.5323,0.5323,1.0,0.5167,0.6813,0.0645,0.1826
3,0.5161,0.5161,1.0,0.5082,0.6739,0.0323,0.128
4,0.5574,0.5333,1.0,0.5345,0.6966,0.1015,0.2312
5,0.5246,0.5323,1.0,0.5085,0.6742,0.0635,0.1811
6,0.5246,0.5323,1.0,0.5085,0.6742,0.0635,0.1811
7,0.5738,0.5806,1.0,0.5357,0.6977,0.1591,0.2939
8,0.541,0.5484,1.0,0.5172,0.6818,0.0953,0.2237
9,0.5082,0.5161,1.0,0.5,0.6667,0.0317,0.127


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.5161,0.5151,0.9677,0.5085,0.6667,0.0323,0.0752
1,0.5323,0.5307,0.9677,0.5172,0.6742,0.0645,0.1313
2,0.5323,0.5323,1.0,0.5167,0.6813,0.0645,0.1826
3,0.5161,0.5161,1.0,0.5082,0.6739,0.0323,0.128
4,0.5574,0.5333,1.0,0.5345,0.6966,0.1015,0.2312
5,0.5246,0.5323,1.0,0.5085,0.6742,0.0635,0.1811
6,0.5246,0.5323,1.0,0.5085,0.6742,0.0635,0.1811
7,0.5738,0.5806,1.0,0.5357,0.6977,0.1591,0.2939
8,0.541,0.5484,1.0,0.5172,0.6818,0.0953,0.2237
9,0.5082,0.5161,1.0,0.5,0.6667,0.0317,0.127


Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).
