In this notebook we will load the CSV datasets that we previously preprocessed and split, apply the scaling techniques, and then measure the performance of several classifiers on these datasets when scaled with the distinct scaling techniques.

# Importing required libs

In [None]:
# f= open("monitor.txt","w+")
# f.write('Started loading libs')
# f.close()

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import copy
from collections import Counter
from datetime import datetime

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import MaxAbsScaler
from sklearn.preprocessing import RobustScaler
#from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import QuantileTransformer


from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from imblearn.metrics import geometric_mean_score
from sklearn.metrics import confusion_matrix
from sklearn.calibration import CalibratedClassifierCV
#from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
#from sklearn.neighbors import KNeighborsClassifier
#!pip install sklearn_lvq
from sklearn_lvq import GlvqModel
from sklearn.naive_bayes import GaussianNB
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
#from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
#from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.linear_model import Perceptron
from sklearn.neural_network import MLPClassifier

#from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
#from xgboost import XGBClassifier
from sklearn.ensemble import BaggingClassifier

from deslib.dcs import OLA
from deslib.dcs import MCB
from deslib.dcs import LCA
from deslib.des import KNORAU
from deslib.des.knora_e import KNORAE
from deslib.des import METADES


In [None]:
# f= open("monitor.txt","w+")
# f.write('Finished Loading libs')
# f.close()

# Loading data

Here we are going to load the data that we previously preprocessed, split into 5 folds and saved in CSV format.

In [None]:
import os


def _show_cwd():
    """Print the current working directory and return it."""
    current = os.getcwd()
    print('Current working directory: ', current)
    return current


# Move into the ST_performances folder unless the current path already
# contains it. NOTE(review): presumably this keeps the relative data/results
# paths used further down valid - confirm against the repository layout.
cwd = _show_cwd()
if not ('ST_performances' in cwd):
    os.chdir(cwd + '/ST_performances')

cwd = _show_cwd()

In [None]:
# f= open("monitor.txt","w+")
# f.write('Started loading data')
# f.close()

In [None]:
# Resulting structure: datasets[<dataset number>][<'train' | 'test'>][<fold index>],
# e.g. train fold 1 of dataset D1 is datasets[1]['train'][0].
print('Loading data ', end='')
data_dir = '../../data/5-fold'


def _load_split(dataset_id, fold_no, split):
    """Read the CSV of one dataset/fold; `split` is 'train' or 'test'."""
    return pd.read_csv(f'{data_dir}/D{dataset_id}-fold{fold_no}-{split}.csv',
                       encoding='utf8', engine='python', sep=',',
                       header=0, on_bad_lines='skip')


datasets = {}
for i in range(1, 301):
    datasets[i] = folds = {'train': [], 'test': []}
    for f in range(1, 6):  # folds are numbered 1..5 in the file names
        for split in ('train', 'test'):
            folds[split].append(_load_split(i, f, split))
    # One progress dot per dataset.
    print('.', end='')


In [None]:
# f= open("monitor.txt","w+")
# f.write('Finished loading data')
# f.close()

# Checking Imbalance ratio

In [None]:
# # Let's count how many instances we have per class and calculate the imbalance ratios:
# cnts = {}
# imb_ratios = {}
# for key in datasets:
#     #First let's create a dataframe containing all data (appending train and test):
#     ds = datasets[key]['train'][0].append(datasets[key]['test'][0], ignore_index=True)
#     class_att = ds.columns[-1]
#     cnt = Counter(ds[class_att])
#     cnts[key] = (cnt[list(cnt)[0]], cnt[list(cnt)[1]])
#     imb_ratios[key] = max(cnts[key])/min(cnts[key])
# #for i in imb_ratios.values(): print('%.2f'%i)

In [None]:
# IRs = pd.Series(imb_ratios)
# IRs.plot.hist(grid=True, bins=25, rwidth=0.9)
# plt.xlabel('Imbalance Ratio')
# plt.grid(axis='x')

Most of the datasets are in the "low imbalance" range (IR<3), but some are highly imbalanced. Some treatment will have to be applied.

In [None]:
# cp = list(Counter(datasets[300]['train'][0]['class']).values())
# IR = max(cp)/min(cp)
# IR

# Scaling

Here, the idea is to create 5 copies of each dataset and, for each copy, apply one of the following scaling techniques to the attributes: Standard Scaler, Min-max Scaler, Maximum Absolute Scaler, Robust Scaler and Quantile Transformer.

In [None]:
# f= open("monitor.txt","w+")
# f.write('Started Scaling')
# f.close()

In [None]:
print('Scaling ', end='')
# One independent deep copy of the whole `datasets` dictionary per scaling
# technique (standard, min-max, max-abs, robust, quantile), in that order.
datasets_ss, datasets_mms, datasets_mas, datasets_rs, datasets_qt = (
    copy.deepcopy(datasets) for _ in range(5)
)

In [None]:
ss = StandardScaler()
mms = MinMaxScaler()
mas = MaxAbsScaler()
rs = RobustScaler()
qt = QuantileTransformer(output_distribution='normal')

import warnings

# Each scaler is paired with the dataset copy that it transforms in place.
_scaler_targets = [(ss, datasets_ss), (mms, datasets_mms), (mas, datasets_mas),
                   (rs, datasets_rs), (qt, datasets_qt)]

# QuantileTransformer warns when the number of samples is lower than 1000.
# The warnings are silenced only inside this `with` block; on exit the
# previous filter configuration is restored exactly, instead of pushing a
# second global filter as filterwarnings("default") would do.
with warnings.catch_warnings():
    warnings.simplefilter("ignore", category=UserWarning)
    for i in datasets:
        for fold in range(len(datasets[i]['train'])):
            for scaler, scaled in _scaler_targets:
                train_df = scaled[i]['train'][fold]
                test_df = scaled[i]['test'][fold]
                # Fit on the training fold only and reuse the fitted
                # parameters on the test fold. The last column is left
                # untouched (it is used as the target further down).
                # NOTE(review): if any feature column is read as an integer
                # dtype, newer pandas versions complain about writing floats
                # through .iloc - confirm the CSV features are floats.
                train_df.iloc[:, :-1] = scaler.fit_transform(train_df.iloc[:, :-1])
                test_df.iloc[:, :-1] = scaler.transform(test_df.iloc[:, :-1])
        print('.', end='')

In [None]:
# datasets[1]['train'][0]['att1'].plot.hist(grid=True, bins=25, rwidth=0.9)
# datasets_ss[1]['train'][0]['att1'].plot.hist(grid=True, bins=25, rwidth=0.9)
# datasets_qt[1]['train'][0]['att1'].plot.hist(grid=True, bins=25, rwidth=0.9)

In [None]:
# f= open("monitor.txt","w+")
# f.write('Finished Scaling')
# f.close()

# Training and validation

### Creating functions to cross-validate models:

In [None]:
def run_model(model, model_name, results_df, dataset_ids=None):
    """Cross-validate `model` on every dataset under every scaling technique.

    For each dataset and each entry of the scaling "superset" (NS is the
    unscaled `datasets`; SS, MMS, MAS, RS and QT are its scaled copies) the
    model is re-fit on each training fold and scored on the matching test
    fold. One result row is produced per (dataset, scaling technique) pair,
    holding the per-fold accuracy, recall, precision, F1 and G-mean together
    with their mean and standard deviation.

    Parameters
    ----------
    model : object with `fit(X, y)` and `predict(X)`
        The same instance is re-fit for every fold.
    model_name : str
        Label written to the 'Model' column and used in progress messages.
    results_df : pandas.DataFrame
        Results collected so far; it is not modified in place.
    dataset_ids : iterable, optional
        Keys of `datasets` to evaluate. Defaults to every dataset that was
        loaded; pass e.g. `[1]` for a quick test run.

    Returns
    -------
    pandas.DataFrame
        `results_df` followed by the newly computed rows.
    """
    superset = {'NS': datasets, 'SS': datasets_ss,
                'MMS': datasets_mms, 'MAS': datasets_mas,
                'RS': datasets_rs,  # 'PT': datasets_pt,
                'QT': datasets_qt}
    if dataset_ids is None:
        dataset_ids = list(datasets)

    # Order matters: it defines the column order of the result rows.
    metric_names = ('acc', 'recall', 'precision', 'f1', 'gmean')

    # Rows are collected here and concatenated once at the end; concatenating
    # inside the loop would copy the whole results table for every new row.
    new_rows = []

    print('Starting '+ model_name +', time: ', datetime.now())
    for name in dataset_ids:  # name is actually a number
        print(f'\nCurrent dataset: {name}', end = '')
        for k, ds in superset.items():
            print(' '+k+' ', end = '')
            folds = ds[name]
            # The target is the last column of the data frame.
            target_att = folds['train'][0].columns.tolist()[-1]
            scores = {metric: [] for metric in metric_names}

            for ds_train, ds_test in zip(folds['train'], folds['test']):
                print('.', end = '')
                X_train = ds_train.drop(labels=target_att, axis=1)
                y_train = ds_train[target_att]
                X_test = ds_test.drop(labels=target_att, axis=1)
                y_test = ds_test[target_att]

                # @TODO Class balancing with SMOTE?

                # ROC-AUC is not computed; it would need the scores returned
                # by decision_function rather than the predicted labels.
                model.fit(X_train, y_train)
                y_pred = model.predict(X_test)

                scores['acc'].append(accuracy_score(y_test, y_pred))
                scores['recall'].append(recall_score(y_test, y_pred, pos_label=1))
                scores['precision'].append(
                    precision_score(y_test, y_pred, pos_label=1, zero_division=0))
                scores['f1'].append(f1_score(y_test, y_pred, pos_label=1, zero_division=0))
                scores['gmean'].append(geometric_mean_score(y_test, y_pred, pos_label=1))

            # Columns per metric: <metric>_fold1..N, <metric>_mean, <metric>_stddev.
            new_row = {'Dataset': name, 'Scaling technique': k, 'Model': model_name}
            for metric in metric_names:
                values = scores[metric]
                for fold_no, value in enumerate(values, start=1):
                    new_row[f'{metric}_fold{fold_no}'] = value
                new_row[f'{metric}_mean'] = np.mean(values)
                new_row[f'{metric}_stddev'] = np.std(values)
            new_rows.append(new_row)

    print('Finishing '+ model_name +', time: ', datetime.now())
    if not new_rows:
        return results_df
    return pd.concat([results_df, pd.DataFrame.from_records(new_rows)], ignore_index=True)

In [None]:
# This version is for ensemble models that need a prefit pool of base classifiers:
def run_model2(model, model_name, pool, results_df, dataset_ids=None):
    """Cross-validate an ensemble `model` that relies on a pre-fitted `pool`.

    Works like `run_model`, except that `pool` is fitted on each training
    fold right before `model` is fitted on that same fold. For each dataset
    and each scaling technique (NS is the unscaled `datasets`; SS, MMS, MAS,
    RS and QT are its scaled copies) one result row is produced with the
    per-fold accuracy, recall, precision, F1 and G-mean plus their mean and
    standard deviation.

    Parameters
    ----------
    model : object with `fit(X, y)` and `predict(X)`
        The ensemble/metamodel to evaluate. It is expected to have been built
        around the same `pool` object; this function does not check that.
    model_name : str
        Label written to the 'Model' column and used in progress messages.
    pool : object with `fit(X, y)`
        Pool of base classifiers, re-fit on every training fold.
    results_df : pandas.DataFrame
        Results collected so far; it is not modified in place.
    dataset_ids : iterable, optional
        Keys of `datasets` to evaluate. Defaults to every dataset that was
        loaded; pass e.g. `[1]` for a quick test run.

    Returns
    -------
    pandas.DataFrame
        `results_df` followed by the newly computed rows.
    """
    superset = {'NS': datasets, 'SS': datasets_ss,
                'MMS': datasets_mms, 'MAS': datasets_mas,
                'RS': datasets_rs,  # 'PT': datasets_pt,
                'QT': datasets_qt}
    if dataset_ids is None:
        dataset_ids = list(datasets)

    # Order matters: it defines the column order of the result rows.
    metric_names = ('acc', 'recall', 'precision', 'f1', 'gmean')

    # Rows are collected here and concatenated once at the end; concatenating
    # inside the loop would copy the whole results table for every new row.
    new_rows = []

    print('Starting '+ model_name +', time: ', datetime.now())
    for name in dataset_ids:  # name is actually a number
        print(f'\nCurrent dataset: {name}', end = '')
        for k, ds in superset.items():
            print(' '+k+' ', end = '')
            folds = ds[name]
            # The target is the last column of the data frame.
            target_att = folds['train'][0].columns.tolist()[-1]
            scores = {metric: [] for metric in metric_names}

            for ds_train, ds_test in zip(folds['train'], folds['test']):
                print('.', end = '')
                X_train = ds_train.drop(labels=target_att, axis=1)
                y_train = ds_train[target_att]
                X_test = ds_test.drop(labels=target_att, axis=1)
                y_test = ds_test[target_att]

                # The base classifiers must be fitted before the metamodel.
                # ROC-AUC is not computed; it would need the scores returned
                # by decision_function rather than the predicted labels.
                pool.fit(X_train, y_train)
                model.fit(X_train, y_train)
                y_pred = model.predict(X_test)

                scores['acc'].append(accuracy_score(y_test, y_pred))
                scores['recall'].append(recall_score(y_test, y_pred, pos_label=1))
                scores['precision'].append(
                    precision_score(y_test, y_pred, pos_label=1, zero_division=0))
                scores['f1'].append(f1_score(y_test, y_pred, pos_label=1, zero_division=0))
                scores['gmean'].append(geometric_mean_score(y_test, y_pred, pos_label=1))

            # Columns per metric: <metric>_fold1..N, <metric>_mean, <metric>_stddev.
            new_row = {'Dataset': name, 'Scaling technique': k, 'Model': model_name}
            for metric in metric_names:
                values = scores[metric]
                for fold_no, value in enumerate(values, start=1):
                    new_row[f'{metric}_fold{fold_no}'] = value
                new_row[f'{metric}_mean'] = np.mean(values)
                new_row[f'{metric}_stddev'] = np.std(values)
            new_rows.append(new_row)

    print('Finishing '+ model_name +', time: ', datetime.now())
    if not new_rows:
        return results_df
    return pd.concat([results_df, pd.DataFrame.from_records(new_rows)], ignore_index=True)

In [None]:
import warnings

# From here on every UserWarning is suppressed process-wide.
warnings.filterwarnings("ignore", category=UserWarning)

### Running monolithic models

In [None]:
# Creating a dataframe to store results:
# three identifying columns followed, for every metric, by the five per-fold
# scores and then their mean and standard deviation.
_result_columns = ['Dataset', 'Scaling technique', 'Model']
for _metric in ('acc', 'recall', 'precision', 'f1', 'gmean'):
    _result_columns.extend(f'{_metric}_fold{_fold}' for _fold in range(1, 6))
    _result_columns.extend([f'{_metric}_mean', f'{_metric}_stddev'])

results_df_mono = pd.DataFrame({_column: [] for _column in _result_columns})

## Instantiating models:
# Monolithic models, keyed by the label that ends up in the 'Model' column.
# Entries are added in the order in which they will be evaluated.
monolithic_models = {}
monolithic_models['SVM_lin'] = SVC(kernel='linear', probability=True)
monolithic_models['SVM_RBF'] = SVC(kernel='rbf', probability=True)
#monolithic_models['KNN'] = KNeighborsClassifier(n_neighbors=5, n_jobs=-1)
#monolithic_models['GNB'] = GaussianNB()
# Generalized Learning Vector Quantization
monolithic_models['GLVQ'] = GlvqModel(random_state=0)
#monolithic_models['LDA'] = LinearDiscriminantAnalysis()
#monolithic_models['QDA'] = QuadraticDiscriminantAnalysis()
monolithic_models['GP'] = GaussianProcessClassifier(1.0 * RBF(1.0), random_state=0, n_jobs=-1)
#monolithic_models['DT'] = DecisionTreeClassifier(random_state=0)
monolithic_models['Percep'] = Perceptron(random_state=0, n_jobs=-1)
monolithic_models['MLP'] = MLPClassifier(
    activation='relu',
    solver='adam',
    alpha=1e-5,
    max_iter=10000,
    hidden_layer_sizes=(5, 2),
    random_state=0,
)

In [None]:
# f= open("monitor.txt","w+")
# f.write('Started running models')
# f.close()

In [None]:
# Running models:

# Evaluate the monolithic models one after another; each call returns the
# results table extended with that model's rows, which feeds the next call.
for name, model in monolithic_models.items():
    results_df_mono = run_model(model, name, results_df_mono)
#results_df_mono.to_csv('../../results/csv_tabs/results_ST_perfs_monolithic.csv', index=False)

### Running Ensemble models

In [None]:
# Creating a dataframe to store results:
# three identifying columns followed, for every metric, by the five per-fold
# scores and then their mean and standard deviation.
_ensemble_columns = ['Dataset', 'Scaling technique', 'Model']
for _metric in ('acc', 'recall', 'precision', 'f1', 'gmean'):
    _ensemble_columns += [f'{_metric}_fold{_fold}' for _fold in range(1, 6)]
    _ensemble_columns += [f'{_metric}_mean', f'{_metric}_stddev']

results_df_ensemble = pd.DataFrame({_column: [] for _column in _ensemble_columns})


#  Ensemble models

# The wrapped estimator is passed positionally on purpose: scikit-learn
# renamed the `base_estimator` keyword of BaggingClassifier and
# CalibratedClassifierCV to `estimator` and later removed the old name, so
# the positional form is the one that works on both older and newer releases.
base_model = Perceptron(random_state=0)
pool_classifiers = BaggingClassifier(base_model, n_estimators=100, random_state=0, bootstrap=True,
                                     bootstrap_features=False, max_features=1.0, n_jobs=-1)

# Same kind of pool, but each Perceptron is wrapped in a calibrator; kept for
# METADES, which needs base estimators that return probabilities.
base_model_calib = CalibratedClassifierCV(Perceptron(random_state=0), cv=5)
pool_classifiers_calib = BaggingClassifier(base_model_calib, n_estimators=100, random_state=0, bootstrap=True,
                                           bootstrap_features=False, max_features=1.0, n_jobs=-1)

# Keyed by the label that ends up in the 'Model' column. 'Bagging' is the pool
# itself; the dynamic-selection models are all built on that same pool object.
ensemble_models = {#'RF': RandomForestClassifier(random_state = 0, n_jobs=-1),
                   #'XGBoost': XGBClassifier(n_jobs=-1, random_state=0),
                   #'AdaBoost': AdaBoostClassifier(n_estimators=100),
                   'Bagging': pool_classifiers,
                   'OLA': OLA(pool_classifiers, random_state=0),
                   'LCA': LCA(pool_classifiers, random_state=0),
                   'MCB': MCB(pool_classifiers, random_state=0),
                   'KNORAE': KNORAE(pool_classifiers, random_state=0),
                   'KNORAU': KNORAU(pool_classifiers, random_state=0),
                   #'METADES': METADES(pool_classifiers_calib, random_state=0)
                  }


In [None]:
# Running models:
# These metamodels need their pool of base classifiers to be fit before the
# metamodel itself, so each of them is mapped to the pool it must be run with.
# METADES additionally needs base estimators that return probabilities, hence
# the calibrated pool. Any model not listed here is cross-validated directly.
_prefit_pools = dict.fromkeys(['OLA', 'LCA', 'MCB', 'KNORAE', 'KNORAU'], pool_classifiers)
_prefit_pools['METADES'] = pool_classifiers_calib

for name, model in ensemble_models.items():
    if name not in _prefit_pools:
        results_df_ensemble = run_model(model, name, results_df_ensemble)
        continue
    results_df_ensemble = run_model2(model, name, _prefit_pools[name], results_df_ensemble)
#results_df_ensemble.to_csv('../../results/csv_tabs/results_ST_perfs_ensemble.csv', index=False)

In [None]:
import os

# Merge the monolithic and ensemble results and write them to a single CSV,
# ordered by model and then by dataset.
results_df = pd.concat([results_df_mono, results_df_ensemble], axis=0)
results_csv = '../../results/csv_tabs/results_ST_perfs.csv'
# Create the output folder first: without it to_csv fails, and that failure
# would only surface after the whole experiment has already been run.
os.makedirs(os.path.dirname(results_csv), exist_ok=True)
results_df.sort_values(by = ['Model', 'Dataset']).to_csv(results_csv, index=False)