# Evaluation of oversamplers with a set of classifiers on a set of datasets

In this notebook, we give an example of evaluating multiple oversamplers on multiple datasets with multiple classifiers. 

In [1]:
import os.path

from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

import smote_variants as sv

import imbalanced_databases as imbd

In [2]:
# Setting the cache_path which is used for caching during the evaluation.
# The directory lives directly under the current user's home directory.

cache_path= os.path.join(os.path.expanduser('~'), 'smote_test')

# exist_ok=True creates the directory only when it is missing, in one call.
# This avoids the separate "exists?" check, which could race with another
# process creating the same directory, and it raises instead of silently
# continuing when a regular file occupies the path.
os.makedirs(cache_path, exist_ok=True)

In [3]:
# The datasets are given as loader functions (not as loaded data);
# two loaders from the imbalanced_databases package are selected here.

datasets = [
    imbd.load_glass0,
    imbd.load_yeast1,
]

In [4]:
# Specifying the classifiers used for evaluation

knn_classifier= KNeighborsClassifier()

# Decision tree induction is randomized (scikit-learn permutes the candidate
# features at every split), so the seed is fixed to make the scores obtained
# with this classifier reproducible from run to run.
dt_classifier= DecisionTreeClassifier(random_state= 5)

In [5]:
# Run the evaluation with 5 parallel jobs. The samplers come from
# sv.get_n_quickest_oversamplers(5), and at most 35 different random but
# meaningful parameter combinations are tried with the oversamplers.

quickest_samplers = sv.get_n_quickest_oversamplers(5)
classifier_list = [knn_classifier, dt_classifier]

results = sv.evaluate_oversamplers(
    datasets=datasets,
    samplers=quickest_samplers,
    classifiers=classifier_list,
    cache_path=cache_path,
    n_jobs=5,
    max_samp_par_comb=35,
)

2019-06-11 18:18:29,340:INFO:dataset: glass0, samplings_available: True, evaluations_available: True
2019-06-11 18:18:29,341:INFO:doing the folding
2019-06-11 18:18:29,341:INFO:Folding reading from file folding_glass0.pickle
2019-06-11 18:18:29,344:INFO:do the samplings
2019-06-11 18:18:29,345:INFO:create sampling objects
2019-06-11 18:18:29,346:INFO:executing 72 sampling in parallel
2019-06-11 18:18:31,552:INFO:do the evaluations
2019-06-11 18:18:31,552:INFO:create classifier jobs
2019-06-11 18:18:31,593:INFO:executing 72 evaluation jobs in parallel
2019-06-11 18:18:32,193:INFO:concatenating the results
2019-06-11 18:18:32,423:INFO:aggregating the results
2019-06-11 18:18:32,773:INFO:dataset: yeast1, samplings_available: True, evaluations_available: True
2019-06-11 18:18:32,773:INFO:doing the folding
2019-06-11 18:18:32,793:INFO:Folding reading from file folding_yeast1.pickle
2019-06-11 18:18:33,028:INFO:do the samplings
2019-06-11 18:18:33,029:INFO:create sampling objects
2019-06-11 

In [6]:
# The results are arranged in a pandas DataFrame with the following columns:
# db_name - name of the database
# classifier - name of the classifier
# sampler - name of the oversampling technique
# auc - highest auc score with the classifier and oversampler (aggregated over
#       all classifier and oversampler parameter combinations)
# brier - best brier score with the classifier and oversampler (aggregated similarly)
#         NOTE(review): a lower brier score is better, and the printed results
#         look consistent with the lowest value being reported - confirm
# acc - the highest accuracy score with the classifier and oversampler (aggregated similarly)
# f1 - the highest f1 score with the classifier and oversampler (aggregated similarly)
# p_top20 - the highest p_top20 score with the classifier and oversampler (aggregated similarly)
# gacc - the highest GACC score with the classifier and oversampler (aggregated similarly)
# runtime - average runtime in seconds
# db_size - size of the dataset
# db_n_attr - number of attributes in the dataset
# imbalanced_ratio - the ratio of majority/minority class sizes
# sampler_categories - the categories assigned to the oversampler
# classifier_parameters_auc - the classifier parameters reaching the highest auc score
# classifier_parameters_acc - the classifier parameters reaching the highest acc score
# classifier_parameters_gacc - the classifier parameters reaching the highest gacc score
# classifier_parameters_f1 - the classifier parameters reaching the highest f1 score
# classifier_parameters_p_top20 - the classifier parameters reaching the highest p_top20 score
# classifier_parameters_brier - the classifier parameters reaching the best (presumably lowest) brier score
# sampler_parameters_auc - the oversampler parameters reaching the highest auc score
# sampler_parameters_acc - the oversampler parameters reaching the highest acc score
# sampler_parameters_gacc - the oversampler parameters reaching the highest gacc score
# sampler_parameters_f1 - the oversampler parameters reaching the highest f1 score
# sampler_parameters_p_top20 - the oversampler parameters reaching the highest p_top20 score
# sampler_parameters_brier - the oversampler parameters reaching the best (presumably lowest) brier score

# Print the column index of the results DataFrame to show the columns listed above.
print(results.columns)

Index(['db_name', 'classifier', 'sampler', 'auc', 'brier', 'acc', 'f1',
       'p_top20', 'gacc', 'runtime', 'db_size', 'db_n_attr',
       'imbalanced_ratio', 'sampler_categories', 'classifier_parameters_auc',
       'classifier_parameters_acc', 'classifier_parameters_gacc',
       'classifier_parameters_f1', 'classifier_parameters_p_top20',
       'classifier_parameters_brier', 'sampler_parameters_auc',
       'sampler_parameters_acc', 'sampler_parameters_gacc',
       'sampler_parameters_f1', 'sampler_parameters_p_top20',
       'sampler_parameters_brier'],
      dtype='object')


In [7]:
# The results can be processed according to the requirements of the analysis.
# Here the aggregated DataFrame is simply printed in full; each printed row
# holds the scores of one (db_name, classifier, sampler) combination.

print(results)

   db_name              classifier   sampler       auc     brier       acc  \
0   glass0  DecisionTreeClassifier    Gazzah  0.497751  0.331776  0.668224   
1   glass0  DecisionTreeClassifier  NT_SMOTE  0.789187  0.186916  0.813084   
2   glass0  DecisionTreeClassifier      OUPS  0.795106  0.186916  0.813084   
3   glass0  DecisionTreeClassifier   SMOTE_D  0.815377  0.176012  0.823988   
4   glass0  DecisionTreeClassifier       SPY  0.786012  0.213396  0.786604   
5   glass0    KNeighborsClassifier    Gazzah  0.864253  0.149097  0.805296   
6   glass0    KNeighborsClassifier  NT_SMOTE  0.865653  0.156075  0.788162   
7   glass0    KNeighborsClassifier      OUPS  0.870662  0.154019  0.795950   
8   glass0    KNeighborsClassifier   SMOTE_D  0.877403  0.155389  0.789720   
9   glass0    KNeighborsClassifier       SPY  0.867907  0.152960  0.788162   
10  yeast1  DecisionTreeClassifier    Gazzah  0.508739  0.286164  0.713836   
11  yeast1  DecisionTreeClassifier  NT_SMOTE  0.666199  0.285490