### Importing libs

In [2]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import copy
import random
import scikit_posthocs as sp
#import statistics as stats
from pandas.api.types import CategoricalDtype
from collections import Counter
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import MaxAbsScaler
from sklearn.preprocessing import RobustScaler
#from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import LabelEncoder
#from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
#from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from imblearn.metrics import geometric_mean_score
from sklearn.metrics import confusion_matrix
from sklearn.calibration import CalibratedClassifierCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
#!pip install sklearn_lvq
from sklearn_lvq import GlvqModel
from sklearn.naive_bayes import GaussianNB
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.linear_model import Perceptron
from sklearn.neural_network import MLPClassifier

from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import BaggingClassifier

from deslib.dcs import OLA
from deslib.dcs import MCB
from deslib.dcs import LCA
from deslib.des import KNORAU
from deslib.des.knora_e import KNORAE
from deslib.des import METADES

from datetime import datetime
import scipy.stats as stats
from scipy.stats import friedmanchisquare
from scipy.stats import kruskal
import pickle

In [3]:
# This code has been written for the following versions of the most relevant libraries:
# Scikit-learn v0.23.1, sklearn_lvq v1.1.0, xgboost 1.2.1, DESlib v0.3.5 e SciPy v1.5.0.

### Loading data

#### First let's build a function to convert KEEL data to a regular CSV (remove annotations before data):

In [4]:
def keel2csv(file):
    ''' Reads a KEEL .dat file, converts it into a regular CSV data file 
    that contains a header line. The output .csv file is written to the same 
    dir as the original .dat file. This function also returns a dict 
    {'numeric':[], 'nominal':[]} containg two lists, one for the numeric 
    attributes and the other for the nominal atributes.'''
    filename = file.name
    # Let's read the attribute types (useful for preprocessing) and also the 
    # column names from the @annotations, inclunding the target (class) column:
    has_inputs = has_outputs = False
    numeric_atts = []
    nominal_atts = []
    for line in file:
        if '@attribute' in line:
            if (' real' in line) or (' integer' in line):
                numeric_atts.append(line.split(' ')[1])
            elif '{' in line:
                nominal_atts.append(line.split(' ')[1])
        if line.startswith('@inputs'):
            att_names = line[8:-1].replace(' ', '')
            has_inputs = True
        elif line.startswith('@input'):
            att_names = line[7:-1].replace(' ', '')
            has_inputs = True
        elif line.startswith('@outputs') or line.startswith('@output'):
            class_name = line[9:-1]
            has_outputs = True
            break
        elif line.startswith('@output'):
            class_name = line[8:-1]
            has_outputs = True
            break
    if (not has_inputs) or (not has_outputs):
        print('File ', filename, 'missing annotations?' )

    columns = att_names + ',' + class_name

    #Then, lets remove the annotations and save the column names and data into a csv file:
    lines = file.readlines() 
    file.close()
    new_file = open(filename[:-4]+'.csv','w')
    new_file.write(columns+'\n')
    for line in lines:
        if not line.startswith('@'):
            new_file.write(line)
    new_file.close()    
    return {'numeric':numeric_atts, 'nominal':nominal_atts}

#### Now, we need to run through the files and execute the keel2csv function for each KEEL dat file:

In [5]:
rootdir = './KEEL_imb_classification_data_5-fold'
dl_file = open(rootdir+'/dir_list.txt', 'r')
ds_names = dl_file.readline().split(',')

In [6]:
# Converting KEEL .dat files to CSV:
att_types = {} #This dictionary will have each dataset name as key and will hold the attribute types.
for name in ds_names:
    for fold in range(1,6):
        full_path = rootdir+'/'+name+'/'+name[:-4]+str(fold)+'tra.dat' 
        f = open(full_path, 'r')
        att_types[name] = keel2csv(f)
        f.close()
        full_path = rootdir+'/'+name+'/'+name[:-4]+str(fold)+'tst.dat' 
        f = open(full_path, 'r')
        keel2csv(f)
        f.close()

In [7]:
# Removing datasets with no numeric attributes:
to_remove = []
for name in ds_names:
    if len(att_types[name]['numeric']) == 0:
        to_remove.append(name)

for name in to_remove:
    ds_names.remove(name)

#### Ok, now that we finally have all the data in CSV format, lets load them:

In [8]:
# I will create a dict structure such that I can access train fold 1 from 
# dataset wisconsin as datasets['wisconsin']['train'][0]

datasets = {}
for name in ds_names:
    datasets[name] = {}
    datasets[name]['train'] = []
    datasets[name]['test'] = []
    for fold in range(1,6):
        csv_filename = rootdir+'/'+name+'/'+name[:-4]+str(fold)+'tra.csv'
        df_train = pd.read_csv(csv_filename, encoding='utf8', engine='python', sep=',', 
                     header=0, error_bad_lines=False)
        csv_filename = rootdir+'/'+name+'/'+name[:-4]+str(fold)+'tst.csv'
        df_test = pd.read_csv(csv_filename, encoding='utf8', engine='python', sep=',', 
                     header=0, error_bad_lines=False)
        datasets[name]['train'].append(df_train)
        datasets[name]['test'].append(df_test)




  exec(code_obj, self.user_global_ns, self.user_ns)


The att_types dictionary will allow us to do things like selecting only the numeric attributes from a dataset very easily, e.g.:

In [9]:
# # Let's take a look at the second dataset, for example:
# print(ds_names[1])
# datasets[ds_names[1]]['train'][0]

In [10]:
# # Now a subset of it containing only the attributes stated as integer or real:
# datasets[ds_names[1]]['train'][0][att_types[ds_names[1]]['numeric']]

In [11]:
len(datasets)

91

Calculating Imbalance Ratios (IRs):

In [12]:
# Let's count how many instances we have per class and calculate the imbalance ratios:
cnts = {}
imb_ratios = {}
for key in datasets:
    #First let's create a dataframe containing all data (appending train and test):
    ds = datasets[key]['train'][0].append(datasets[key]['test'][0], ignore_index=True)
    class_att = ds.columns[-1]
    cnt = Counter(ds[class_att])
    cnts[key] = (cnt[list(cnt)[0]], cnt[list(cnt)[1]])
    imb_ratios[key] = max(cnts[key])/min(cnts[key])
#for i in imb_ratios.values(): print('%.2f'%i)

Sorting Datasets by IR to help future analysis:

In [13]:
# Let's sort the datasets names in ds_names by their corresponding IR in imb_ratios dict.
d = imb_ratios
#d = { for sorted(imb_ratios.values())
def getkeybyvalue(d,i):
    for k, v in d.items():
        if v == i:
            return (k)

sortvaluelist = sorted(d.values())
sortresult ={}
for i1 in sortvaluelist:   
    key = getkeybyvalue(d,i1)
    sortresult[key] = i1

ds_names = list(sortresult.keys())

In [14]:
len(ds_names)

88

In [15]:
# Removing datasets with too many categorical features:
for name in ds_names:
     if len(att_types[name]['nominal'][:-1]) >= 5:
            print('Removing dataset', name)
            ds_names.remove(name)

Removing dataset lymphography-normal-fibrosis-5-fold
Removing dataset kddcup-guess_passwd_vs_satan-5-fold
Removing dataset kddcup-land_vs_portsweep-5-fold
Removing dataset kddcup-buffer_overflow_vs_back-5-fold
Removing dataset kddcup-rootkit-imap_vs_back-5-fold


In [16]:
# Removing datasets with too many categorical features:
for name in ds_names:
     if len(att_types[name]['nominal'][:-1]) >= 5:
            print('Removing dataset', name)
            ds_names.remove(name)

Removing dataset kddcup-land_vs_satan-5-fold


In [17]:
len(ds_names)

82

In [18]:
imb_ratios

{'abalone-17_vs_7-8-9-10-5-fold': 39.310344827586206,
 'abalone-19_vs_10-11-12-13-5-fold': 49.6875,
 'abalone-20_vs_8-9-10-5-fold': 72.6923076923077,
 'abalone-21_vs_8-5-fold': 40.5,
 'abalone-3_vs_11-5-fold': 32.46666666666667,
 'abalone19-5-fold': 129.4375,
 'abalone9-18-5-fold': 16.404761904761905,
 'cleveland-0_vs_4-5-fold': 12.307692307692308,
 'dermatology-6-5-fold': 16.9,
 'ecoli-0-1-3-7_vs_2-6-5-fold': 39.142857142857146,
 'ecoli-0-1-4-6_vs_5-5-fold': 13.0,
 'ecoli-0-1-4-7_vs_2-3-5-6-5-fold': 10.586206896551724,
 'ecoli-0-1-4-7_vs_5-6-5-fold': 12.28,
 'ecoli-0-1_vs_2-3-5-5-fold': 9.166666666666666,
 'ecoli-0-1_vs_5-5-fold': 11.0,
 'ecoli-0-2-3-4_vs_5-5-fold': 9.1,
 'ecoli-0-2-6-7_vs_3-5-5-fold': 9.181818181818182,
 'ecoli-0-3-4-6_vs_5-5-fold': 9.25,
 'ecoli-0-3-4-7_vs_5-6-5-fold': 9.28,
 'ecoli-0-3-4_vs_5-5-fold': 9.0,
 'ecoli-0-4-6_vs_5-5-fold': 9.15,
 'ecoli-0-6-7_vs_3-5-5-fold': 9.090909090909092,
 'ecoli-0-6-7_vs_5-5-fold': 10.0,
 'ecoli-0_vs_1-5-fold': 1.8571428571428572,


Displaying dataset info as a table:

In [17]:
#print('Dataset name'.ljust(35),'\tClass counts\tIR')
#for k in ds_names: print(str(k).ljust(35),'\t',cnts[k],'\t', imb_ratios[k])

from IPython.display import display, Markdown
#open('table.txt', 'w').close()
tab_file = open('table.txt', 'w')
tab_file
tab_file.write('|#|Dataset name|Numeric atts|Nominal atts|Class counts|IR|\n')
tab_file.write('|-|:-----------|:----------:|:----------:|:----------:|--|\n')
n = 1
for k in ds_names:
    num_att = len(att_types[k]['numeric'])
    nom_att = len(att_types[k]['nominal'][:-1])
    tab_file.write('|'+str(n)+'|'+str(k)[:-7].ljust(35)+'|'+str(num_att)+'|'+str(nom_att)+'|'+str(cnts[k])+'|'+('%.2f'%imb_ratios[k])+'|\n')
    n+=1
tab_file.close()
display(Markdown(filename='./table.txt'))

|#|Dataset name|Numeric atts|Nominal atts|Class counts|IR|
|-|:-----------|:----------:|:----------:|:----------:|--|
|1|glass1                             |9|0|(138, 76)|1.82|
|2|ecoli-0_vs_1                       |7|0|(143, 77)|1.86|
|3|wisconsin                          |9|0|(444, 239)|1.86|
|4|pima                               |8|0|(268, 500)|1.87|
|5|iris0                              |4|0|(50, 100)|2.00|
|6|glass0                             |9|0|(70, 144)|2.06|
|7|yeast1                             |8|0|(1055, 429)|2.46|
|8|haberman                           |3|0|(225, 81)|2.78|
|9|vehicle2                           |18|0|(628, 218)|2.88|
|10|vehicle1                           |18|0|(629, 217)|2.90|
|11|vehicle3                           |18|0|(634, 212)|2.99|
|12|glass-0-1-2-3_vs_4-5-6             |9|0|(163, 51)|3.20|
|13|vehicle0                           |18|0|(199, 647)|3.25|
|14|ecoli1                             |7|0|(259, 77)|3.36|
|15|new-thyroid1                       |5|0|(180, 35)|5.14|
|16|ecoli2                             |7|0|(284, 52)|5.46|
|17|segment0                           |19|0|(1979, 329)|6.02|
|18|glass6                             |9|0|(185, 29)|6.38|
|19|yeast3                             |8|0|(1321, 163)|8.10|
|20|ecoli3                             |7|0|(301, 35)|8.60|
|21|page-blocks0                       |10|0|(4913, 559)|8.79|
|22|ecoli-0-3-4_vs_5                   |7|0|(180, 20)|9.00|
|23|yeast-2_vs_4                       |8|0|(463, 51)|9.08|
|24|ecoli-0-6-7_vs_3-5                 |7|0|(200, 22)|9.09|
|25|ecoli-0-2-3-4_vs_5                 |7|0|(182, 20)|9.10|
|26|glass-0-1-5_vs_2                   |9|0|(155, 17)|9.12|
|27|yeast-0-3-5-9_vs_7-8               |8|0|(456, 50)|9.12|
|28|yeast-0-2-5-6_vs_3-7-8-9           |8|0|(905, 99)|9.14|
|29|ecoli-0-4-6_vs_5                   |6|0|(183, 20)|9.15|
|30|ecoli-0-1_vs_2-3-5                 |7|0|(220, 24)|9.17|
|31|ecoli-0-2-6-7_vs_3-5               |7|0|(202, 22)|9.18|
|32|glass-0-4_vs_5                     |9|0|(83, 9)|9.22|
|33|ecoli-0-3-4-6_vs_5                 |7|0|(185, 20)|9.25|
|34|ecoli-0-3-4-7_vs_5-6               |7|0|(232, 25)|9.28|
|35|yeast-0-5-6-7-9_vs_4               |8|0|(477, 51)|9.35|
|36|vowel0                             |13|0|(90, 898)|9.98|
|37|ecoli-0-6-7_vs_5                   |6|0|(200, 20)|10.00|
|38|glass-0-1-6_vs_2                   |9|0|(175, 17)|10.29|
|39|ecoli-0-1-4-7_vs_2-3-5-6           |7|0|(307, 29)|10.59|
|40|led7digit-0-2-4-5-6-7-8-9_vs_1     |7|0|(406, 37)|10.97|
|41|ecoli-0-1_vs_5                     |6|0|(220, 20)|11.00|
|42|glass-0-1-4-6_vs_2                 |9|0|(188, 17)|11.06|
|43|glass2                             |9|0|(197, 17)|11.59|
|44|ecoli-0-1-4-7_vs_5-6               |6|0|(307, 25)|12.28|
|45|cleveland-0_vs_4                   |13|0|(160, 13)|12.31|
|46|ecoli-0-1-4-6_vs_5                 |6|0|(260, 20)|13.00|
|47|shuttle-c0-vs-c4                   |9|0|(1706, 123)|13.87|
|48|yeast-1_vs_7                       |7|0|(429, 30)|14.30|
|49|glass4                             |9|0|(201, 13)|15.46|
|50|ecoli4                             |7|0|(316, 20)|15.80|
|51|page-blocks-1-3_vs_4               |10|0|(444, 28)|15.86|
|52|abalone9-18                        |7|1|(689, 42)|16.40|
|53|dermatology-6                      |34|0|(338, 20)|16.90|
|54|glass-0-1-6_vs_5                   |9|0|(175, 9)|19.44|
|55|shuttle-c2-vs-c4                   |9|0|(6, 123)|20.50|
|56|shuttle-6_vs_2-3                   |9|0|(220, 10)|22.00|
|57|yeast-1-4-5-8_vs_7                 |8|0|(663, 30)|22.10|
|58|glass5                             |9|0|(205, 9)|22.78|
|59|yeast-2_vs_8                       |8|0|(462, 20)|23.10|
|60|yeast4                             |8|0|(1433, 51)|28.10|
|61|winequality-red-4                  |11|0|(1546, 53)|29.17|
|62|poker-9_vs_7                       |10|0|(236, 8)|29.50|
|63|yeast-1-2-8-9_vs_7                 |8|0|(917, 30)|30.57|
|64|abalone-3_vs_11                    |7|1|(15, 487)|32.47|
|65|winequality-white-9_vs_4           |11|0|(163, 5)|32.60|
|66|yeast5                             |8|0|(1440, 44)|32.73|
|67|winequality-red-8_vs_6             |11|0|(638, 18)|35.44|
|68|ecoli-0-1-3-7_vs_2-6               |7|0|(274, 7)|39.14|
|69|abalone-17_vs_7-8-9-10             |7|1|(2280, 58)|39.31|
|70|abalone-21_vs_8                    |7|1|(14, 567)|40.50|
|71|yeast6                             |8|0|(1449, 35)|41.40|
|72|winequality-white-3_vs_7           |11|0|(880, 20)|44.00|
|73|winequality-red-8_vs_6-7           |11|0|(837, 18)|46.50|
|74|abalone-19_vs_10-11-12-13          |7|1|(32, 1590)|49.69|
|75|winequality-white-3-9_vs_5         |11|0|(1457, 25)|58.28|
|76|poker-8-9_vs_6                     |10|0|(1460, 25)|58.40|
|77|shuttle-2_vs_5                     |9|0|(3267, 49)|66.67|
|78|winequality-red-3_vs_5             |11|0|(681, 10)|68.10|
|79|abalone-20_vs_8-9-10               |7|1|(1890, 26)|72.69|
|80|poker-8-9_vs_5                     |10|0|(2050, 25)|82.00|
|81|poker-8_vs_6                       |10|0|(1460, 17)|85.88|
|82|abalone19                          |7|1|(4142, 32)|129.44|


## Pre-processing

### Cleaning strings

In [18]:
# Cleaning (stripping) strings within dataframe and also changing class labels to 1 and 0.
for name in ds_names:
    for s in ['train', 'test']:
        for fold in range(5):
            df = datasets[name][s][fold]
            df_obj = df.select_dtypes(['object'])
            df[df_obj.columns] = df_obj.apply(lambda x: x.str.strip())
            class_att = df.columns[-1]
            #print(class_att)
            #df[class_att] = df[class_att].replace(['positive', 'negative'],[1,0])
                

### Dealing with missing values

Applying a Simple Imputer to the numeric attributes when there are missing values.

In [19]:
imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
for name in ds_names:
    ds = datasets[name]['train'][0].append(datasets[name]['test'][0], ignore_index=True) #full dataset.
    if df.isnull().values.any(): 
        print('There is(are) missing values on ', name)
        imp_mean.fit(ds[att_types[name]['numeric']]) #Fit on full dataset.
        for s in ['train', 'test']:
            for fold in range(5):
                datasets[name][s][fold][att_types[name]['numeric']] = imp_mean.transform(datasets[name][s][fold][att_types[name]['numeric']])

There were no missing values!

### One-hot encoding

Here we must apply this encoding method to the nominal attributes in order to allow them to be managed by the classification algorithms.

In [20]:
# First, let's join the dataset (train+test), find all unique values for nominal columns, and store them
# in a dict. 
unique_values = {n:None for n in ds_names}
for name in ds_names:
    unique_values[name] = {k:None for k in att_types[name]['nominal'][:-1]}
    ds = datasets[name]['train'][0].append(datasets[name]['test'][0], ignore_index=True)
    for att in att_types[name]['nominal'][:-1]:#For each nominal attribute, except the target one (last one)
        unique_values[name][att] = ds[att].unique()
        
# Before applying the encoding, we will cast each nominal attribute to CategoricalDtype in order to explicitly 
# set all their possible values. This will avoid different encodings (thus, different dimensions) in train/test sets.
for name in ds_names:    
    for s in ['train', 'test']:
        for fold in range(5):
            for att in att_types[name]['nominal'][:-1]: #For each nominal attribute, except the target one (last one)
                datasets[name][s][fold][att] = datasets[name][s][fold][att].astype(CategoricalDtype(unique_values[name][att]))
                att_encoded = pd.get_dummies(datasets[name][s][fold][att], prefix = att)
                datasets[name][s][fold] = datasets[name][s][fold].drop([att], axis = 1)
                datasets[name][s][fold] = pd.concat([att_encoded, datasets[name][s][fold]], axis = 1)
                


In [21]:
# #  Let's take a look at the second dataset, for example:
# print(ds_names[1])
# datasets[ds_names[1]]['train'][0]

### Scaling

Here, the idea is to create 5 copies of each dataset, for each copy we are going to apply one of the following scaling techniques to the numeric attributes: Standard Scaler, Min-max Scaler, Maximum Absolute Scaler, Robust Scaler and Power Transformer.

UPDATE 1: There is [a bug](https://github.com/scikit-learn/scikit-learn/issues/14959) in PowerTransformer that makes it fail for some of our datasets, I'll skip it for now. 

UPDATE 2: Instead of PowerTransformer, we will include QuantileTransformer which also outputs a distribution with a gaussian-like shape:
"This method transforms the features to follow a uniform or a normal distribution.  Therefore, for a given feature, this transformation tends to spread out the most frequent values. It also reduces the impact of (marginal) outliers: this is therefore a robust preprocessing scheme."

In [22]:
ss = StandardScaler()
mms = MinMaxScaler() 
mas = MaxAbsScaler() 
rs = RobustScaler()
#pt = PowerTransformer()
qt = QuantileTransformer(output_distribution='normal')

datasets_ss = copy.deepcopy(datasets)
datasets_mms = copy.deepcopy(datasets)
datasets_mas = copy.deepcopy(datasets)
datasets_rs = copy.deepcopy(datasets)
#datasets_pt = copy.deepcopy(datasets)
datasets_qt = copy.deepcopy(datasets)

In [23]:
import warnings
# Ignoring warnings from QuantileTransformer when number of samples is lower then 1000:
warnings.filterwarnings(action = "ignore", category=UserWarning) 

for name in ds_names:
    for fold in range(5):
        #print(f'Dataset: {name}, fold {fold}.', end = '')
        datasets_ss[name]['train'][fold][att_types[name]['numeric']] = ss.fit_transform(datasets_ss[name]['train'][fold][att_types[name]['numeric']])
        datasets_ss[name]['test'][fold][att_types[name]['numeric']] = ss.transform(datasets_ss[name]['test'][fold][att_types[name]['numeric']])
        datasets_mms[name]['train'][fold][att_types[name]['numeric']] = mms.fit_transform(datasets_mms[name]['train'][fold][att_types[name]['numeric']])
        datasets_mms[name]['test'][fold][att_types[name]['numeric']] = mms.transform(datasets_mms[name]['test'][fold][att_types[name]['numeric']])
        datasets_mas[name]['train'][fold][att_types[name]['numeric']] = mas.fit_transform(datasets_mas[name]['train'][fold][att_types[name]['numeric']])
        datasets_mas[name]['test'][fold][att_types[name]['numeric']] = mas.transform(datasets_mas[name]['test'][fold][att_types[name]['numeric']])
        datasets_rs[name]['train'][fold][att_types[name]['numeric']] = rs.fit_transform(datasets_rs[name]['train'][fold][att_types[name]['numeric']])
        datasets_rs[name]['test'][fold][att_types[name]['numeric']] = rs.transform(datasets_rs[name]['test'][fold][att_types[name]['numeric']])
        datasets_qt[name]['train'][fold][att_types[name]['numeric']] = qt.fit_transform(datasets_qt[name]['train'][fold][att_types[name]['numeric']])
        datasets_qt[name]['test'][fold][att_types[name]['numeric']] = qt.transform(datasets_qt[name]['test'][fold][att_types[name]['numeric']])

# Restablishing warnings:
warnings.filterwarnings(action = "default", category=UserWarning)

Now, let's take a look at the effect of the different scaling methods in a certain variable:

In [None]:
#a_name = 'ecoli2-5-fold'
a_name = 'winequality-white-9_vs_4-5-fold'
#a_name ='glass1-5-fold'

In [None]:
def histograma(var: np.ndarray, title: str):
    hist, bin_edges = np.histogram(var)
    plt.figure()
    plt.bar(bin_edges[:-1], hist, width=5)
    #plt.xlim(min(bin_edges)-5, max(bin_edges)+5)
    if min(bin_edges) > -8 and max(bin_edges) < +8: #if histogram is in the interval (-8, 8)
        plt.xlim(-8, +8)
    plt.grid(axis='y', alpha=0.75)
    plt.xlabel('Value',fontsize=15)
    plt.ylabel('Frequency',fontsize=15)
    plt.xticks(fontsize=15)
    plt.yticks(fontsize=15)
    plt.title(title,fontsize=15)
    plt.show()

v = att_types[a_name]['numeric'][4]
print('Dataset:', a_name, 'attribute', v)
    
v1 = datasets[a_name]['train'][0][v]
histograma(v1, 'Original (no scaling)')
v2 = datasets_ss[a_name]['train'][0][v]
histograma(v2, 'Standard Scaler')
v3 = datasets_mms[a_name]['train'][0][v]
histograma(v3, 'Min-Max Scaler')
v4 = datasets_mas[a_name]['train'][0][v]
histograma(v4, 'Max Abs Scaler')
v5 = datasets_rs[a_name]['train'][0][v]
histograma(v5, 'Robust Scaler')
#v6 = datasets_pt[a_name]['train'][0][v]
#histograma(v6, 'Power Transformer')
v7 = datasets_qt[a_name]['train'][0][v]
histograma(v7, 'Quantile Transformer')

In [None]:
tuples = [(v1, 'Original (no scaling)'), (v2, 'Standard Scaler'), (v3, 'Min-Max Scaler'),
          (v4, 'Max Abs Scaler'), (v5, 'Robust Scaler'), #(v6, 'Power Transformer'), 
          (v7, 'Quantile Transformer')
         ]
print('*** Distribution is not normal if p-value < 0.05 ***')
teste = []
for v, st in tuples:
    teste.append(v)
    print(st+': Statistic = %.2f, p-value = %.4f'%stats.normaltest(v))

### Creating functions to cross-validate models:

In [24]:
def run_model(model, model_name, results_df):
    superset = {'No scaling': datasets, 'Standard Scaler': datasets_ss,
            'Min-Max Scaler': datasets_mms,'Max Abs Scaler':datasets_mas,
            'Robust Scaler':datasets_rs, #'Power Transformer': datasets_pt, 
            'Quantile Transformer': datasets_qt}
    
    print('Starting '+ model_name +', time: ', datetime.now())
    for name in ds_names:
    #for name in ds_names[:5]:
    #for name in [ds_names[10]]: #Testing with just one dataset
        print('\nCurrent dataset: '+name, end = '')
        for k in superset:
            print(' '+k+' ', end = '')
            acc_folds = []
            recall_folds = []
            precision_folds = []
            f1_folds = []
            #roc_auc_folds = []
            gmean_folds = []
            
            ds = superset[k]
            target_att = ds[name]['train'][0].columns.tolist()[-1]
            for fold in range(5):
                print('.', end = '')
                #Gather training data:
                ds_train = ds[name]['train'][fold]
                X_train = ds_train.drop(labels=target_att, axis = 1)
                y_train = ds_train[target_att]
            
                # Gather test data:
                ds_test = ds[name]['test'][fold]
                X_test = ds_test.drop(labels=target_att, axis = 1)
                y_test = ds_test[target_att]
                
                # Train model with the training data, 
                # If we need y_score for calculating ROC-AUC we do:
                #y_score = model.fit(X_train, y_train).decision_function(X_test)
                
                # If we won't calculate ROC-AUC, we can just fit the model.
                model.fit(X_train, y_train)
                
                # Test model:
                y_pred = model.predict(X_test)
                acc = accuracy_score(y_test, y_pred)
                recall = recall_score(y_test, y_pred, pos_label='positive')
                precision = precision_score(y_test, y_pred, pos_label='positive', zero_division=0)
                f1 = f1_score(y_test, y_pred, pos_label='positive', zero_division=0)
                gmean = geometric_mean_score(y_test, y_pred, pos_label='positive')
                #roc_auc = roc_auc_score(y_test, y_score)

                # Store metrics for this fold
                acc_folds.append(acc)
                recall_folds.append(recall)
                precision_folds.append(precision)
                f1_folds.append(f1)
                #roc_auc_folds.append(roc_auc)
                gmean_folds.append(gmean)
            
            new_row = {'Dataset' : name, 'Scaling technique' : k, 'Model' : model_name,
                       'acc_fold1' : acc_folds[0], 'acc_fold2' : acc_folds[1], 'acc_fold3' : acc_folds[2], 
                       'acc_fold4' : acc_folds[3], 'acc_fold5' : acc_folds[4], 
                       'acc_mean': np.mean(acc_folds), 'acc_stddev': np.std(acc_folds),
                       'recall_fold1' : recall_folds[0], 'recall_fold2' : recall_folds[1], 'recall_fold3' : recall_folds[2],
                       'recall_fold4' : recall_folds[3], 'recall_fold5' : recall_folds[4], 
                       'recall_mean': np.mean(recall_folds), 'recall_stddev':np.std(recall_folds),
                       'precision_fold1' : precision_folds[0], 'precision_fold2' : precision_folds[1] , 'precision_fold3' : precision_folds[2],
                       'precision_fold4' : precision_folds[3], 'precision_fold5' : precision_folds[4],
                       'precision_mean': np.mean(precision_folds), 'precision_stddev': np.std(precision_folds),
                       'f1_fold1' : f1_folds[0], 'f1_fold2' : f1_folds[1], 'f1_fold3' : f1_folds[2], 
                       'f1_fold4' : f1_folds[3], 'f1_fold5' : f1_folds[4], 
                       'f1_mean': np.mean(f1_folds), 'f1_stddev': np.std(f1_folds),
                       'gmean_fold1' : gmean_folds[0], 'gmean_fold2' : gmean_folds[1], 'gmean_fold3' : gmean_folds[2], 
                       'gmean_fold4' : gmean_folds[3], 'gmean_fold5' : gmean_folds[4], 
                       'gmean_mean': np.mean(gmean_folds), 'gmean_stddev' : np.std(gmean_folds)
                      }

            results_df = results_df.append(new_row, ignore_index=True)

    print('\nFinishing '+ model_name +', time: ', datetime.now(),'\n')   
    return results_df

In [25]:
# This version is for ensemble models that need a prefit pool of base classifiers:
def run_model2(model, model_name, pool, results_df):
    superset = {'No scaling': datasets, 'Standard Scaler': datasets_ss,
            'Min-Max Scaler': datasets_mms,'Max Abs Scaler':datasets_mas,
            'Robust Scaler':datasets_rs, #'Power Transformer': datasets_pt, 
            'Quantile Transformer': datasets_qt}

    print('Starting '+ model_name +', time: ', datetime.now())
    for name in ds_names:
    #for name in ds_names[:5]:
        print('\nCurrent dataset: '+name, end = '')
        for k in superset:
            print(' '+k+' ', end = '')
            acc_folds = []
            recall_folds = []
            precision_folds = []
            f1_folds = []
            #roc_auc_folds = []
            gmean_folds = []
            
            ds = superset[k]
            target_att = ds[name]['train'][0].columns.tolist()[-1]
            for fold in range(5):
                print('.', end = '')
                #Gather training data:
                ds_train = ds[name]['train'][fold]
                X_train = ds_train.drop(labels=target_att, axis = 1)
                y_train = ds_train[target_att]
            
                # Gather test data:
                ds_test = ds[name]['test'][fold]
                X_test = ds_test.drop(labels=target_att, axis = 1)
                y_test = ds_test[target_att]
                
                # Train model with the training data, 
                # If we need y_score for calculating ROC-AUC we do:
                #y_score = model.fit(X_train, y_train).decision_function(X_test)
                
                # If we won't calculate ROC-AUC, we can just fit the model.
                # If it is an ensemble model that needs prefit base models, we fit them first:
                pool.fit(X_train, y_train)
                model.fit(X_train, y_train)
                
                # Test model:
                y_pred = model.predict(X_test)
                acc = accuracy_score(y_test, y_pred)
                recall = recall_score(y_test, y_pred, pos_label='positive')
                precision = precision_score(y_test, y_pred, pos_label='positive', zero_division=0)
                f1 = f1_score(y_test, y_pred, pos_label='positive', zero_division=0)
                gmean = geometric_mean_score(y_test, y_pred, pos_label='positive')
                #roc_auc = roc_auc_score(y_test, y_score)

                # Store metrics for this fold
                acc_folds.append(acc)
                recall_folds.append(recall)
                precision_folds.append(precision)
                f1_folds.append(f1)
                #roc_auc_folds.append(roc_auc)
                gmean_folds.append(gmean)
            
            new_row = {'Dataset' : name, 'Scaling technique' : k, 'Model' : model_name,
                       'acc_fold1' : acc_folds[0], 'acc_fold2' : acc_folds[1], 'acc_fold3' : acc_folds[2], 
                       'acc_fold4' : acc_folds[3], 'acc_fold5' : acc_folds[4], 
                       'acc_mean': np.mean(acc_folds), 'acc_stddev': np.std(acc_folds),
                       'recall_fold1' : recall_folds[0], 'recall_fold2' : recall_folds[1], 'recall_fold3' : recall_folds[2],
                       'recall_fold4' : recall_folds[3], 'recall_fold5' : recall_folds[4], 
                       'recall_mean': np.mean(recall_folds), 'recall_stddev':np.std(recall_folds),
                       'precision_fold1' : precision_folds[0], 'precision_fold2' : precision_folds[1] , 'precision_fold3' : precision_folds[2],
                       'precision_fold4' : precision_folds[3], 'precision_fold5' : precision_folds[4],
                       'precision_mean': np.mean(precision_folds), 'precision_stddev': np.std(precision_folds),
                       'f1_fold1' : f1_folds[0], 'f1_fold2' : f1_folds[1], 'f1_fold3' : f1_folds[2], 
                       'f1_fold4' : f1_folds[3], 'f1_fold5' : f1_folds[4], 
                       'f1_mean': np.mean(f1_folds), 'f1_stddev': np.std(f1_folds),
                       'gmean_fold1' : gmean_folds[0], 'gmean_fold2' : gmean_folds[1], 'gmean_fold3' : gmean_folds[2], 
                       'gmean_fold4' : gmean_folds[3], 'gmean_fold5' : gmean_folds[4], 
                       'gmean_mean': np.mean(gmean_folds), 'gmean_stddev' : np.std(gmean_folds)
                      }

            results_df = results_df.append(new_row, ignore_index=True)

    print('\nFinishing '+ model_name +', time: ', datetime.now(),'\n')   
    return results_df

### Running monolithic models

In [26]:
# Creating a dataframe to store results:
results_df = pd.DataFrame({'Dataset' : [], 'Scaling technique' : [], 'Model' : [],
                           'acc_fold1' : [], 'acc_fold2' : [], 'acc_fold3' : [], 'acc_fold4' : [], 'acc_fold5' : [], 
                           'acc_mean':[], 'acc_stddev':[],
                           'recall_fold1' : [], 'recall_fold2' : [], 'recall_fold3' : [], 'recall_fold4' : [], 'recall_fold5' : [], 
                           'recall_mean':[], 'recall_stddev':[],
                           'precision_fold1' : [], 'precision_fold2' : [], 'precision_fold3' : [], 'precision_fold4' : [], 
                           'precision_fold5' : [], 'precision_mean':[], 'precision_stddev': [],
                           'f1_fold1' : [], 'f1_fold2' : [], 'f1_fold3' : [], 'f1_fold4' : [], 'f1_fold5' : [], 
                           'f1_mean': [], 'f1_stddev': [],
                           'gmean_fold1' : [], 'gmean_fold2' : [], 'gmean_fold3' : [], 'gmean_fold4' : [], 'gmean_fold5' : [], 
                           'gmean_mean':[], 'gmean_stddev' : []
                           })

## Instantiating models:
# Monolithic models
monolithic_models = {'SVM_lin': SVC(kernel='linear', probability=True),
                     'SVM_RBF': SVC(kernel='rbf', probability=True),
                     'KNN': KNeighborsClassifier(n_neighbors=5, n_jobs=-1),
                     'GNB': GaussianNB(),
                     'GLVQ': GlvqModel(random_state=0), #Generalized Learning Vector Quantization
                     'LDA': LinearDiscriminantAnalysis(),
                     'QDA': QuadraticDiscriminantAnalysis(),
                     'GP': GaussianProcessClassifier(1.0 * RBF(1.0), random_state=0, n_jobs=-1),
                     'DT': DecisionTreeClassifier(random_state=0),
                     'Percep': Perceptron(random_state=0, n_jobs=-1),
                     'MLP': MLPClassifier(activation='relu', solver='adam', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=0)
                    }

In [None]:
# Running models:
for name,model in monolithic_models.items():
        results_df = run_model(model, name, results_df)
results_df.to_csv('csv_tabs/results_monolithic.csv', index=False)

### Running Ensemble models

In [27]:
# Creating a dataframe to store results:
results_df = pd.DataFrame({'Dataset' : [], 'Scaling technique' : [], 'Model' : [],
                           'acc_fold1' : [], 'acc_fold2' : [], 'acc_fold3' : [], 'acc_fold4' : [], 'acc_fold5' : [], 
                           'acc_mean':[], 'acc_stddev':[],
                           'recall_fold1' : [], 'recall_fold2' : [], 'recall_fold3' : [], 'recall_fold4' : [], 'recall_fold5' : [], 
                           'recall_mean':[], 'recall_stddev':[],
                           'precision_fold1' : [], 'precision_fold2' : [], 'precision_fold3' : [], 'precision_fold4' : [], 
                           'precision_fold5' : [], 'precision_mean':[], 'precision_stddev': [],
                           'f1_fold1' : [], 'f1_fold2' : [], 'f1_fold3' : [], 'f1_fold4' : [], 'f1_fold5' : [], 
                           'f1_mean': [], 'f1_stddev': [],
                           'gmean_fold1' : [], 'gmean_fold2' : [], 'gmean_fold3' : [], 'gmean_fold4' : [], 'gmean_fold5' : [], 
                           'gmean_mean':[], 'gmean_stddev' : []
                           })


#  Ensemble models

base_model = Perceptron(random_state=0)
pool_classifiers = BaggingClassifier(base_estimator=base_model, n_estimators=100, random_state=0, bootstrap=True,
                                bootstrap_features=False, max_features=1.0, n_jobs=-1)

base_model_calib = CalibratedClassifierCV(base_estimator = Perceptron(random_state=0), cv=5) 
pool_classifiers_calib = BaggingClassifier(base_estimator=base_model_calib, n_estimators=100, random_state=0, bootstrap=True,
                                bootstrap_features=False, max_features=1.0, n_jobs=-1) 

ensemble_models = {'RF': RandomForestClassifier(random_state = 0, n_jobs=-1),
                   'XGBoost': XGBClassifier(n_jobs=-1, random_state=0),
                   'AdaBoost': AdaBoostClassifier(n_estimators=100),
                   'Bagging': pool_classifiers,
                   'OLA': OLA(pool_classifiers, random_state=0),
                   'LCA': LCA(pool_classifiers, random_state=0),
                   'MCB': MCB(pool_classifiers, random_state=0),
                   'KNORAE': KNORAE(pool_classifiers, random_state=0),
                   'KNORAU': KNORAU(pool_classifiers, random_state=0),
                   #'METADES': METADES(pool_classifiers_calib, random_state=0)
                  }


In [None]:
# Running models:
for name,model in ensemble_models.items():
    if name in ['OLA','LCA','MCB', 'KNORAE', 'KNORAU']: # these metamodels need pool_classifiers to be fit before applying fit to the metamodel.
        results_df = run_model2(model, name, pool_classifiers, results_df)
    elif name in ['METADES']: #This also needs a prefit pool_classifiers but needs base_estimators to return probabilities too.
        results_df = run_model2(model, name, pool_classifiers_calib, results_df)
    else: 
        results_df = run_model(model, name, results_df)
results_df.to_csv('csv_tabs/results_ensemble.csv', index=False)