# Step 4: Create models with SD

In [1]:
import pandas as pd 
import os 
import sys 
import pickle
import re
import ast

from pycaret.classification import ClassificationExperiment
from pycaret.containers.models.classification import get_all_model_containers

from sklearn.metrics import (classification_report, 
                             roc_auc_score, 
                             matthews_corrcoef,
                             cohen_kappa_score)
from sklearn.model_selection import train_test_split

# Import help methods
sys.path.append('../src')
from utils import (getExperimentConfig, 
                   getPicklesFromDir, 
                   run_pycaret_setup, 
                   translate_model_name,
                   get_synthetic_filepaths_from_original_data_id,
                   convert_and_clean_dict)

from tuning_grids import Grids
from mlflow_manager import MLFlowManager

# Experiment-wide configuration (global variables for this experiment)
config = getExperimentConfig()
# Folder / file locations taken from the configuration
folders = config["folders"]
# Dataset-specific settings (derived from the real data), loaded from the settings directory
dataset_settings = getPicklesFromDir(folders["settings_dir"])

def print_metrics(test_metrics):
    """Print accuracy and F1 for the 'same' and 'strat' test sets.

    Args:
        test_metrics: mapping that contains the keys ``Accuracy_same``,
            ``Accuracy_strat``, ``F1_same`` and ``F1_strat``; each value is
            printed with two decimals.
    """
    # (label shown to the user, key in test_metrics) in display order
    displayed = (
        ("Accuracy (same)", "Accuracy_same"),
        ("Accuracy (strat)", "Accuracy_strat"),
        ("F1 (same)", "F1_same"),
        ("F1 (strat)", "F1_strat"),
    )
    print("Performance:")
    for label, metric_key in displayed:
        print(f"{label}: {test_metrics[metric_key]:.2f}")
    print("\n")


In [None]:
# Train and evaluate models on every synthetic dataset (SD) derived from each
# original dataset:
#   1. the best model/hyperparameters found on the original data (Step 2) is
#      re-fitted on the synthetic data ("Tuned on: original"), and
#   2. every model in config['clf']['ml_models'] is tuned directly on the
#      synthetic data ("Tuned on: synthetic").
# Each fitted model is scored on two test sets taken from the ORIGINAL data:
#   'same'  - the *_test.csv file stored next to the original dataset
#   'strat' - a fresh stratified split of the original dataset
# Results are logged through MLFlowManager and appended to the performance CSV.


def _already_evaluated(performance_df, tags):
    """Return True if performance_df already holds a row for this dataset/model/tuning combo."""
    return ((performance_df['Dataset id'] == tags['Dataset id']) &
            (performance_df['model'] == tags['model']) &
            (performance_df['Tuned on'] == tags['Tuned on'])).any()


def _score_predictions(y_true, y_pred, suffix):
    """Score one test set.

    Returns a tuple ``(summary, report_df)`` where ``summary`` maps
    ``Accuracy_<suffix>``, ``F1_<suffix>`` (macro average), ``MCC_<suffix>`` and
    ``Kappa_<suffix>`` to their values, and ``report_df`` is the transposed
    scikit-learn classification report as a DataFrame.
    """
    report = classification_report(y_true=y_true, y_pred=y_pred, output_dict=True, digits=4)
    summary = {
        f"Accuracy_{suffix}": report['accuracy'],
        f"F1_{suffix}": report['macro avg']['f1-score'],
        f"MCC_{suffix}": matthews_corrcoef(y_true=y_true, y_pred=y_pred),
        f"Kappa_{suffix}": cohen_kappa_score(y1=y_true, y2=y_pred),
    }
    return summary, pd.DataFrame.from_dict(report).transpose()


def _evaluate_and_log(experiment, estimator, test_sets, tracker, model_id):
    """Evaluate a fitted estimator on every test set and log the results.

    Must be called directly after ``create_model``/``tune_model`` because the
    validation scores are read with ``experiment.pull()``.

    Args:
        experiment: the object returned by ``run_pycaret_setup``.
        estimator: the fitted model to evaluate.
        test_sets: mapping ``suffix -> (X, y)``; each X is passed through
            ``experiment.pipeline`` before predicting.
        tracker: the MLFlowManager instance with an active run.
        model_id: value written to the run's 'model' tag.

    Returns:
        dict with the test metrics of all test sets (keys suffixed per test set).
    """
    # validation (cross-validation) results of the model that was just fitted
    val_df = experiment.pull()
    val_score = {
        'val_Accuracy': val_df['Accuracy']['Mean'],
        'val_F1': val_df['F1']['Mean'],
    }

    train_columns = experiment.X_train_transformed.columns
    test_metrics = {}
    score_reports = {}
    for suffix, (x_test, y_test) in test_sets.items():
        # rearrange the columns into the same order as the training data
        x_test_transformed = experiment.pipeline.transform(x_test)[train_columns]
        # every test set is predicted from its OWN features (the 'strat'
        # predictions used to be computed from the 'same' features)
        y_pred = estimator.predict(x_test_transformed)
        summary, report_df = _score_predictions(y_test, y_pred, suffix)
        test_metrics.update(summary)
        score_reports[suffix] = report_df

    print_metrics(test_metrics)

    # log parameters
    tracker.log_params(estimator.get_params())
    # log performance
    tracker.log_metric_report(test_metrics)
    tracker.log_metrics(val_score)
    tracker.log_score_report_to_html(val_df, "Validation")
    for suffix, report_df in score_reports.items():
        tracker.log_score_report_to_html(report_df, f"Test_{suffix}_score")
    # log model
    tracker.log_tag('model', model_id)
    tracker.log_model(model=estimator)
    return test_metrics


def _append_performance_row(performance_df, tags, test_metrics, params):
    """Return performance_df with one extra row built from tags, metrics and model params.

    Uses pd.concat because DataFrame.append is deprecated/removed in newer pandas.
    """
    performance_row = {**tags, **test_metrics, 'Params': params}
    return pd.concat([performance_df, pd.DataFrame([performance_row])], ignore_index=True)


# read performance data from Step 2
model_performance_df = pd.read_csv(folders['model_perf_filepath'])
# Metric used to pick the best model; it is also the target metric when tuning
sort_by = config['clf']['tuning_param']['optimize']

run_dataset = config['run_dataset']

# matches the quality part of a synthetic dataset id, e.g. "Q1" in "SD0Q1_0"
quality_pattern = re.compile(r'Q\d+')

for settings in dataset_settings:

    if run_dataset is not None and settings['meta']['id'] not in run_dataset:
        continue

    settings['setup_param']['fold'] = config['clf']['cv_folds']
    # update system_log name
    settings['setup_param']['system_log'] = folders['log_dir']+"Step4_SD"
    # disable saving train-test split data (to save space)
    settings['setup_param']['log_data'] = False
    target_label = settings['meta']['target']

    # Get experiment logging
    experiment_name = f"{settings['meta']['id']}-{settings['meta']['name']}"
    mlflow = MLFlowManager(experiment_name)

    # load original dataset; a dtype of None lets pandas infer the column types
    cols_dtype = settings['meta']['cols_dtype']

    original_data = pd.read_csv(f"{folders['real_dir']}{settings['meta']['filename']}", dtype=cols_dtype)
    test_data = pd.read_csv(f"{folders['real_dir']}{experiment_name}_test.csv", dtype=cols_dtype)

    # 'same' test set: the test file stored for the original data
    y_test_same = test_data[target_label]
    x_test_same = test_data.drop(columns=[target_label])
    # 'strat' test set: only the test part is needed, using the same stratified
    # split size as in Step 2.
    # NOTE(review): no random_state is passed, so this split differs between
    # runs - confirm whether it should be seeded to match Step 2.
    _, x_test_strat, _, y_test_strat = train_test_split(
        original_data.drop(columns=[target_label]),    # X (predictors)
        original_data[target_label],                   # y (target label)
        train_size=settings['setup_param']['train_size'],
        stratify=original_data[target_label])

    test_sets = {
        'same': (x_test_same, y_test_same),
        'strat': (x_test_strat, y_test_strat),
    }

    # Filter the DataFrame based on the Dataset id and sort by the specified column
    # to get hyperparameters and model name for the "best model".
    # Done before the MLflow run is started so a missing Step 2 result does not
    # leave an open run behind.
    filtered_df = model_performance_df[model_performance_df["Dataset id"] == settings['meta']['id']]
    if filtered_df.empty:
        raise ValueError(
            f"No Step 2 results for dataset id {settings['meta']['id']!r} in "
            f"{folders['model_perf_filepath']}; cannot select the best model."
        )
    best_row = filtered_df.sort_values(by=sort_by, ascending=False).iloc[0]

    best_ml_model = best_row['model']
    # Params is stored in the CSV as the string form of a dict
    best_hyperparameters = ast.literal_eval(best_row['Params'])

    # bugfix: remove 'priors' from hyperparameters
    best_hyperparameters.pop('priors', None)

    mlflow.start_run('Synthetic data models', tags={
        'Trained on': 'synthetic',
        'Tuned on': 'original',
    })

    synthetic_datasets = get_synthetic_filepaths_from_original_data_id(settings['meta']['id'])

    for sd_filename in synthetic_datasets:

        sd_id = os.path.splitext(sd_filename)[0]
        quality = quality_pattern.findall(sd_id)[0]
        sd_path = folders['sd_dir']+sd_filename
        settings['setup_param']['verbose'] = False

        s = run_pycaret_setup(sd_path, settings['setup_param'], meta=settings['meta'])
        USI = s.get_config('USI')

        ########### Test the model with best performance from best original dataset ###########
        model_name = f"Original_{sd_id}{translate_model_name(best_ml_model)}"
        run_name = model_name

        # Add custom tags to the log, defining dataset type and id
        logg_tags = {
            'Trained on': 'synthetic',
            'Dataset id': sd_id,
            'model': best_ml_model,
            'Quality': quality,
            'Tuned on': 'original',
            'SDG': sd_id.split("_")[0],
            'USI': USI
        }

        # Skip the setup if it has already been evaluated
        if not _already_evaluated(model_performance_df, logg_tags):
            mlflow.start_run(run_name, tags=logg_tags, nested=True)

            # create the model with the hyperparameters tuned on the original data
            all_models = get_all_model_containers(s)
            model = all_models[best_ml_model].class_def()

            print(f"Dataset id: {sd_id}")
            print("Tuned on Original")
            print(f"Model: {model.__class__.__name__}")

            tuned_model = model.set_params(**best_hyperparameters)
            tuned_model = s.create_model(tuned_model)

            test_metrics = _evaluate_and_log(s, tuned_model, test_sets, mlflow, best_ml_model)
            # end logging run for the model
            mlflow.end_run()

            model_performance_df = _append_performance_row(
                model_performance_df, logg_tags, test_metrics, tuned_model.get_params())
        ########### End test hyper-param ###########

        # Start testing all models
        for ml_model in config['clf']['ml_models']:
            # separate tag dict per run, so tags already handed to an earlier
            # run are not modified afterwards
            run_tags = {**logg_tags, 'model': ml_model, 'Tuned on': 'synthetic'}

            # Skip the setup if it has already been evaluated
            if _already_evaluated(model_performance_df, run_tags):
                continue

            model_name = f"{sd_id}-{translate_model_name(ml_model)}"
            mlflow.start_run(model_name, tags=run_tags, nested=True)

            # create & tune model
            all_models = get_all_model_containers(s)
            model = all_models[ml_model].class_def()

            # Get tuning grid
            tune_grid = Grids.get_tuning_grid(ml_model)

            print(f"Dataset id: {sd_id}")
            print(f"Model: {model.__class__.__name__}")

            # NOTE(review): the original comment here read "Is buggy, use default
            # tuning by pycaret", yet the custom grid is still passed - confirm
            # which behaviour is intended.
            tuned_model = s.tune_model(model, custom_grid=tune_grid, **config['clf']['tuning_param'])

            # tag the run with the model that was actually trained in this run
            # (it used to be tagged with best_ml_model)
            test_metrics = _evaluate_and_log(s, tuned_model, test_sets, mlflow, ml_model)
            # end logging run for the model
            mlflow.end_run()

            model_performance_df = _append_performance_row(
                model_performance_df, run_tags, test_metrics, tuned_model.get_params())

        # update model performance to csv after each sd_id
        model_performance_df.to_csv(folders['model_perf_filepath'], index=False)
    # end logging for the synthetic datasets based on original id
    mlflow.end_run()



Unnamed: 0,Description,Value
0,Session id,8709
1,Target,Outcome
2,Target type,Binary
3,Original data shape,"(768, 9)"
4,Transformed data shape,"(768, 9)"
5,Transformed train set shape,"(614, 9)"
6,Transformed test set shape,"(154, 9)"
7,Numeric features,8
8,Preprocess,True
9,Imputation type,


Dataset id: SD0Q1_0
Model: KNeighborsClassifier
Hyperparameters : {'algorithm': 'auto', 'leaf_size': 30, 'metric': 'euclidean', 'metric_params': None, 'n_jobs': None, 'n_neighbors': 32, 'p': 2, 'weights': 'distance'}


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.629,0.6082,0.8235,0.6222,0.7089,0.2242,0.2414
1,0.629,0.5952,0.8571,0.625,0.7229,0.2016,0.2259
2,0.6452,0.6265,0.7714,0.6585,0.7105,0.2595,0.265
3,0.6774,0.6423,0.9143,0.6531,0.7619,0.3026,0.3467
4,0.6393,0.6117,0.8529,0.6304,0.725,0.234,0.2576
5,0.6557,0.6187,0.9412,0.6275,0.7529,0.2539,0.3186
6,0.5738,0.5376,0.8529,0.58,0.6905,0.08,0.0971
7,0.6066,0.5822,0.7941,0.6136,0.6923,0.171,0.1822
8,0.6557,0.6264,0.8824,0.6383,0.7407,0.2659,0.2985
9,0.6393,0.604,0.9118,0.62,0.7381,0.2216,0.2688


Dataset id: SD0Q1_0
Model: LogisticRegression
Hyperparameters : {'C': [0.0001, 0.00026366508987303583, 0.0006951927961775605, 0.0018329807108324356, 0.004832930238571752, 0.012742749857031334, 0.03359818286283781, 0.08858667904100823, 0.23357214690901212, 0.615848211066026, 1.623776739188721, 4.281332398719396, 11.288378916846883, 29.763514416313132, 78.47599703514607, 206.913808111479, 545.5594781168514, 1438.44988828766, 3792.690190732246, 10000.0], 'penalty': ['l2', None]}


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.629,0.6082,0.8235,0.6222,0.7089,0.2242,0.2414
1,0.629,0.5952,0.8571,0.625,0.7229,0.2016,0.2259
2,0.6452,0.6265,0.7714,0.6585,0.7105,0.2595,0.265
3,0.6774,0.6423,0.9143,0.6531,0.7619,0.3026,0.3467
4,0.6393,0.6117,0.8529,0.6304,0.725,0.234,0.2576
5,0.6557,0.6187,0.9412,0.6275,0.7529,0.2539,0.3186
6,0.5738,0.5376,0.8529,0.58,0.6905,0.08,0.0971
7,0.6066,0.5822,0.7941,0.6136,0.6923,0.171,0.1822
8,0.6557,0.6264,0.8824,0.6383,0.7407,0.2659,0.2985
9,0.6393,0.604,0.9118,0.62,0.7381,0.2216,0.2688


[32m[I 2023-04-30 16:04:44,557][0m Searching the best hyperparameters using 614 samples...[0m
[32m[I 2023-04-30 16:07:42,953][0m Finished hyperparemeter search![0m


Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


Dataset id: SD0Q1_0
Model: KNeighborsClassifier
Hyperparameters : {'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40], 'weights': ['uniform', 'distance'], 'metric': ['euclidean', 'manhattan', 'minkowski'], 'p': [1, 2]}


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.629,0.6082,0.8235,0.6222,0.7089,0.2242,0.2414
1,0.629,0.5952,0.8571,0.625,0.7229,0.2016,0.2259
2,0.6452,0.6265,0.7714,0.6585,0.7105,0.2595,0.265
3,0.6774,0.6423,0.9143,0.6531,0.7619,0.3026,0.3467
4,0.6393,0.6117,0.8529,0.6304,0.725,0.234,0.2576
5,0.6557,0.6187,0.9412,0.6275,0.7529,0.2539,0.3186
6,0.5738,0.5376,0.8529,0.58,0.6905,0.08,0.0971
7,0.6066,0.5822,0.7941,0.6136,0.6923,0.171,0.1822
8,0.6557,0.6264,0.8824,0.6383,0.7407,0.2659,0.2985
9,0.6393,0.604,0.9118,0.62,0.7381,0.2216,0.2688


[32m[I 2023-04-30 16:08:26,384][0m Searching the best hyperparameters using 614 samples...[0m
[32m[I 2023-04-30 16:11:25,628][0m Finished hyperparemeter search![0m


Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


Dataset id: SD0Q1_0
Model: SVC
Hyperparameters : {'C': [0.0001, 0.00026366508987303583, 0.0006951927961775605, 0.0018329807108324356, 0.004832930238571752, 0.012742749857031334, 0.03359818286283781, 0.08858667904100823, 0.23357214690901212, 0.615848211066026, 1.623776739188721, 4.281332398719396, 11.288378916846883, 29.763514416313132, 78.47599703514607, 206.913808111479, 545.5594781168514, 1438.44988828766, 3792.690190732246, 10000.0], 'kernel': ['poly', 'sigmoid', 'rbf'], 'degree': [2, 3, 4, 5, 6], 'gamma': ['scale', 'auto'], 'coef0': [0.0, 0.1, 0.25, 0.5, 0, 75, 1.0], 'shrinking': [True, False], 'max_iter': [100000, 100000]}


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.629,0.0,0.8235,0.6222,0.7089,0.2242,0.2414
1,0.629,0.0,0.8571,0.625,0.7229,0.2016,0.2259
2,0.6452,0.0,0.7714,0.6585,0.7105,0.2595,0.265
3,0.6774,0.0,0.9143,0.6531,0.7619,0.3026,0.3467
4,0.6393,0.0,0.8529,0.6304,0.725,0.234,0.2576
5,0.6557,0.0,0.9412,0.6275,0.7529,0.2539,0.3186
6,0.5738,0.0,0.8529,0.58,0.6905,0.08,0.0971
7,0.6066,0.0,0.7941,0.6136,0.6923,0.171,0.1822
8,0.6557,0.0,0.8824,0.6383,0.7407,0.2659,0.2985
9,0.6393,0.0,0.9118,0.62,0.7381,0.2216,0.2688


[32m[I 2023-04-30 16:12:09,902][0m Searching the best hyperparameters using 614 samples...[0m
[32m[I 2023-04-30 16:15:08,695][0m Finished hyperparemeter search![0m


Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


Dataset id: SD0Q1_0
Model: RandomForestClassifier
Hyperparameters : {'criterion': ['gini', 'entropy', 'log_loss'], 'n_estimators': [100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 210, 220, 230, 240, 250, 260, 270, 280, 290, 300, 310, 320, 330, 340, 350, 360, 370, 380, 390, 400, 410, 420, 430, 440, 450, 460, 470, 480, 490], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 210, 220, 230, 240, 250, 260, 270, 280, 290, 300, 310, 320, 330, 340, 350, 360, 370, 380, 390, 400, 410, 420, 430, 440, 450, 460, 470, 480, 490, 500, 510, 520, 530, 540, 550, 560, 570, 580, 590, 600, 610, 620, 630, 640, 650, 660, 670, 680, 690, 700, 710, 720, 730, 740, 750, 760, 770, 780, 790, 800, 810, 820, 830, 840, 850, 860, 870, 880, 890, 900, 910, 920, 930, 940, 950, 960, 970, 980, 990], 'min_samples_split': [0.1, 0.25, 0.5, 0.75, 1.0], 'min_samples_leaf': [0.25, 0.5, 0.75, 0.9], 'max_features': ['auto', 'sqrt', 'log2', None], 'bootstrap': [True, Fals

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.5484,0.5,1.0,0.5484,0.7083,0.0,0.0
1,0.5645,0.5,1.0,0.5645,0.7216,0.0,0.0
2,0.5645,0.5,1.0,0.5645,0.7216,0.0,0.0
3,0.5645,0.5,1.0,0.5645,0.7216,0.0,0.0
4,0.5574,0.5,1.0,0.5574,0.7158,0.0,0.0
5,0.5574,0.5,1.0,0.5574,0.7158,0.0,0.0
6,0.5574,0.5,1.0,0.5574,0.7158,0.0,0.0
7,0.5574,0.5,1.0,0.5574,0.7158,0.0,0.0
8,0.5574,0.5,1.0,0.5574,0.7158,0.0,0.0
9,0.5574,0.5,1.0,0.5574,0.7158,0.0,0.0


[32m[I 2023-04-30 16:15:52,181][0m Searching the best hyperparameters using 614 samples...[0m
[32m[I 2023-04-30 16:20:15,312][0m Finished hyperparemeter search![0m


Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


Unnamed: 0,Description,Value
0,Session id,5899
1,Target,Outcome
2,Target type,Binary
3,Original data shape,"(768, 9)"
4,Transformed data shape,"(768, 9)"
5,Transformed train set shape,"(614, 9)"
6,Transformed test set shape,"(154, 9)"
7,Numeric features,8
8,Preprocess,True
9,Imputation type,


Dataset id: SD0Q1_1
Model: KNeighborsClassifier
Hyperparameters : {'algorithm': 'auto', 'leaf_size': 30, 'metric': 'euclidean', 'metric_params': None, 'n_jobs': None, 'n_neighbors': 32, 'p': 2, 'weights': 'distance'}


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.5968,0.5667,0.8,0.6087,0.6914,0.1398,0.1511
1,0.6129,0.5903,0.8235,0.6087,0.7,0.1878,0.2055
2,0.629,0.6019,0.8824,0.6122,0.7229,0.2139,0.2491
3,0.5968,0.5693,0.8529,0.5918,0.6988,0.1455,0.1695
4,0.6885,0.6596,0.9118,0.6596,0.7654,0.3358,0.377
5,0.541,0.5005,0.8529,0.5577,0.6744,0.0012,0.0015
6,0.5902,0.5637,0.7941,0.6,0.6835,0.133,0.1439
7,0.5574,0.5267,0.7941,0.5745,0.6667,0.0562,0.063
8,0.623,0.5931,0.8529,0.617,0.716,0.196,0.22
9,0.6393,0.6155,0.8235,0.6364,0.7179,0.2401,0.2558


Dataset id: SD0Q1_1
Model: LogisticRegression
Hyperparameters : {'C': [0.0001, 0.00026366508987303583, 0.0006951927961775605, 0.0018329807108324356, 0.004832930238571752, 0.012742749857031334, 0.03359818286283781, 0.08858667904100823, 0.23357214690901212, 0.615848211066026, 1.623776739188721, 4.281332398719396, 11.288378916846883, 29.763514416313132, 78.47599703514607, 206.913808111479, 545.5594781168514, 1438.44988828766, 3792.690190732246, 10000.0], 'penalty': ['l2', None]}


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.5968,0.5667,0.8,0.6087,0.6914,0.1398,0.1511
1,0.6129,0.5903,0.8235,0.6087,0.7,0.1878,0.2055
2,0.629,0.6019,0.8824,0.6122,0.7229,0.2139,0.2491
3,0.5968,0.5693,0.8529,0.5918,0.6988,0.1455,0.1695
4,0.6885,0.6596,0.9118,0.6596,0.7654,0.3358,0.377
5,0.541,0.5005,0.8529,0.5577,0.6744,0.0012,0.0015
6,0.5902,0.5637,0.7941,0.6,0.6835,0.133,0.1439
7,0.5574,0.5267,0.7941,0.5745,0.6667,0.0562,0.063
8,0.623,0.5931,0.8529,0.617,0.716,0.196,0.22
9,0.6393,0.6155,0.8235,0.6364,0.7179,0.2401,0.2558


[32m[I 2023-04-30 16:21:35,301][0m Searching the best hyperparameters using 614 samples...[0m
[32m[I 2023-04-30 16:24:36,274][0m Finished hyperparemeter search![0m


Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


Dataset id: SD0Q1_1
Model: KNeighborsClassifier
Hyperparameters : {'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40], 'weights': ['uniform', 'distance'], 'metric': ['euclidean', 'manhattan', 'minkowski'], 'p': [1, 2]}


Processing:   0%|          | 0/7 [00:00<?, ?it/s]

[32m[I 2023-04-30 16:25:23,645][0m Searching the best hyperparameters using 614 samples...[0m
