In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import smote_variants as sv
import imbalanced_databases as imbd

from sklearn.datasets import make_classification
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

import logging
logging.getLogger('smote_variants').setLevel(logging.CRITICAL)

import warnings
warnings.filterwarnings("ignore")

import mldb.binary_classification as binclas

In [2]:
datasets= binclas.get_filtered_data_loaders(n_attr_encoded_bounds=[1, 20], n_bounds=[1, 800], n_minority_bounds=[10, 1000])

In [3]:
datasets= [binclas.load_cleveland_0_vs_4,
binclas.load_ecoli_0_1_4_6_vs_5,
binclas.load_ecoli4,
binclas.load_glass_0_1_4_6_vs_2,
binclas.load_glass2,
binclas.load_shuttle_6_vs_2_3,
binclas.load_habarman,
binclas.load_iris0,
binclas.load_new_thyroid1,
binclas.load_bupa,
binclas.load_appendicitis,
binclas.load_led7digit_0_2_4_5_6_7_8_9_vs_1,
binclas.load_page_blocks_1_3_vs_4,
binclas.load_winequality_red_3_vs_5,
binclas.load_yeast_1_vs_7,
binclas.load_pima,
binclas.load_wisconsin,
binclas.load_monk_2,
binclas.load_saheart,
binclas.load_australian,
binclas.load_abalone_3_vs_11]

#datasets = [binclas.load_bupa]

In [4]:
len(datasets)

21

In [5]:
oversamplers_metric= set(sv.get_metric_learning_oversamplers())
oversamplers_ml = {sv.SPY, sv.OUPS, sv.SMOTE_D, sv.NT_SMOTE,
                sv.Gazzah, sv.ROSE, sv.NDO_sampling,
                sv.Borderline_SMOTE1, sv.SMOTE,
                sv.Borderline_SMOTE2, sv.ISMOTE, sv.SMMO,
                sv.SMOTE_OUT, sv.SN_SMOTE, sv.Selected_SMOTE,
                sv.distance_SMOTE, sv.Gaussian_SMOTE, sv.MCT,
                sv.Random_SMOTE, sv.ADASYN, sv.SL_graph_SMOTE,
                sv.CURE_SMOTE, sv.ANS, sv.MSMOTE,
                sv.Safe_Level_SMOTE, sv.SMOBD, sv.CBSO,
                sv.Assembled_SMOTE, sv.SDSMOTE,
                sv.SMOTE_TomekLinks, sv.Edge_Det_SMOTE,
                sv.ProWSyn, sv.Stefanowski, sv.NRAS,
                sv.AND_SMOTE, sv.DBSMOTE, sv.polynom_fit_SMOTE,
                sv.ASMOBD, sv.MDO}
#oversamplers = [sv.SMOTE, sv.SMOTE_TomekLinks, sv.SMOTE_ENN, sv.E_SMOTE]
oversamplers = [sv.SMOTE]

In [6]:
validator = RepeatedStratifiedKFold(n_repeats=1000, n_splits=5, random_state=5)

classifiers= [(DecisionTreeClassifier, {'random_state': 5}), 
                #(RandomForestClassifier, {'random_state': 5}),
                #(RandomForestClassifier, {'random_state': 5, 'min_samples_leaf': 15}),
                (SVC, {'probability': True, 'random_state': 5, 'C': 1.0}),
                #(KNeighborsClassifier, {'algorithm': 'brute', 'weights': 'distance'})
                (KNeighborsClassifier, {'algorithm': 'brute', 'weights': 'distance', 'n_neighbors': 5})
                ]



In [7]:
results= []

for j, dataset in enumerate(datasets):
    """
    X, y= make_classification(n_samples=100, 
                                n_features=4,
                                n_informative=2,
                                n_redundant=1,
                                n_repeated=0,
                                n_clusters_per_class=2,
                                weights=np.array([0.8, 0.2]),
                                random_state=j)
    ds= {'name': str(j)}
    """

    results_dataset= []

    ds= dataset()
    X, y= ds['data'], ds['target']

    print(j, ds['name'], len(X), len(np.unique(X, axis=0)), len(X[y == 1]))

    for i, (train, test) in enumerate(validator.split(X, y)):
        X_train, y_train = X[train], y[train]
        X_test, y_test = X[test], y[test]

        nn_params_0= {'metric': 'precomputed',
                        'metric_learning_method': 'id',
                        'algorithm': 'brute'}
        nn_params_1= {'metric': 'precomputed',
                    'metric_learning_method': 'gmean',
                    'algorithm': 'brute'}

        #metric_tensor= sv.MetricTensor(**nn_params_1).tensor(X_train, y_train)
        #nn_params_1['metric_tensor']= metric_tensor

        oversampler_params_0= {'random_state': 5,
                                'nn_params': nn_params_0,
                                'sampling_params': {'simplex_sampling': 'uniform',
                                                    'within_simplex_sampling': 'random',
                                                    'n_dim': 2},
                                'n_neighbors': 5,
                                'proportion': 1.0}
        oversampler_params_1= {'random_state': 5,
                                'nn_params': nn_params_0,
                                'n_neighbors': 5,
                                'sampling_params': {'simplex_sampling': 'uniform',
                                                    'within_simplex_sampling': 'deterministic',
                                                    'n_dim': 2},
                                'proportion': 1.0}

        for o in oversamplers:
            if not o.__name__.endswith('NoSMOTE'):
                oversampler= o(**oversampler_params_0)
            else:
                oversampler= o()
            X_os_0, y_os_0 = oversampler.sample(X_train, y_train)

            for c, cp in classifiers:
                classifier = c(**cp).fit(X_os_0, y_os_0)
                y_proba_0 = classifier.predict_proba(oversampler.preprocessing_transform(X_test))[:,1]
                results_dataset.append([i, o.__name__, 'euc', roc_auc_score(y_test, y_proba_0), ds['name'], c.__name__, cp])
                #print(results_dataset[-1])

            if not o.__name__.endswith('NoSMOTE'):
                oversampler= o(**oversampler_params_1)
            else:
                oversampler= o()
            X_os_1, y_os_1 = oversampler.sample(X_train, y_train)

            for c, cp in classifiers:
                classifier = c(**cp).fit(X_os_1, y_os_1)
                y_proba_1 = classifier.predict_proba(oversampler.preprocessing_transform(X_test))[:,1]
                results_dataset.append([i, o.__name__, 'euc_pref', roc_auc_score(y_test, y_proba_1), ds['name'], c.__name__, cp])

    data= pd.DataFrame(results_dataset, columns=['fold', 'oversampler', 'metric', 'auc', 'dataset', 'classifier', 'params'])
    print(data.groupby(['classifier', 'oversampler', 'metric']).agg({'auc': np.mean}))

    results.extend(results_dataset)
    
    data= pd.DataFrame(results, columns=['fold', 'oversampler', 'metric', 'auc', 'dataset', 'classifier', 'params'])
    print(data.groupby(['classifier', 'oversampler', 'metric']).agg({'auc': np.mean}))

0 cleveland-0_vs_4 177 177 13
                                                  auc
classifier             oversampler metric            
DecisionTreeClassifier SMOTE       euc       0.781554
                                   euc_pref  0.786560
KNeighborsClassifier   SMOTE       euc       0.970327
                                   euc_pref  0.969877
SVC                    SMOTE       euc       0.978641
                                   euc_pref  0.978311
                                                  auc
classifier             oversampler metric            
DecisionTreeClassifier SMOTE       euc       0.781554
                                   euc_pref  0.786560
KNeighborsClassifier   SMOTE       euc       0.970327
                                   euc_pref  0.969877
SVC                    SMOTE       euc       0.978641
                                   euc_pref  0.978311
1 ecoli-0-1-4-6_vs_5 280 280 20
                                                  auc
classifier          

In [8]:
data= pd.DataFrame(results, columns=['fold', 'oversampler', 'metric', 'auc', 'dataset'])

ValueError: 5 columns passed, passed data had 7 columns

In [None]:
tmp= np.hstack([X_train[:,2:], X_train[:,:2]]).T

In [None]:
mt= sv.MetricTensor(metric='precomputed', metric_learning_method='gmean').tensor(X_train, y_train)
eigv, eigw= np.linalg.eig(mt)
print(eigv)

[0.41862514 0.31411729 0.28807286 0.25696997 0.20078851 0.16538174
 0.1450392  0.05703713 0.11517736 0.07665949 0.08368331 0.09854955
 0.09670926]


In [None]:
mt= sv.MetricTensor(metric='precomputed', metric_learning_method='gmean').tensor(tmp, y_train)
eigv, eigw= np.linalg.eig(mt)
print(eigv)

ValueError: not enough values to unpack (expected 2, got 0)

In [None]:
data.groupby(['oversampler', 'metric']).agg({'auc': np.mean})

Unnamed: 0_level_0,Unnamed: 1_level_0,auc
oversampler,metric,Unnamed: 2_level_1
ADASYN,ITML_mi,0.847245
ADASYN,euc,0.846558
Borderline_SMOTE1,ITML_mi,0.841936
Borderline_SMOTE1,euc,0.839512
Borderline_SMOTE2,ITML_mi,0.840883
Borderline_SMOTE2,euc,0.837899
SMOTE,ITML_mi,0.851753
SMOTE,euc,0.849281
SMOTE_TomekLinks,ITML_mi,0.852887
SMOTE_TomekLinks,euc,0.848034


In [None]:
data.groupby(['dataset', 'oversampler', 'metric']).agg({'auc': np.mean})

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,auc
dataset,oversampler,metric,Unnamed: 3_level_1
abalone-3_vs_11,ADASYN,ITML_mi,0.999794
abalone-3_vs_11,ADASYN,euc,1.000000
abalone-3_vs_11,Borderline_SMOTE1,ITML_mi,0.999794
abalone-3_vs_11,Borderline_SMOTE1,euc,0.999863
abalone-3_vs_11,Borderline_SMOTE2,ITML_mi,0.999794
...,...,...,...
yeast-1_vs_7,Borderline_SMOTE2,euc,0.776872
yeast-1_vs_7,SMOTE,ITML_mi,0.756919
yeast-1_vs_7,SMOTE,euc,0.756545
yeast-1_vs_7,SMOTE_TomekLinks,ITML_mi,0.761913
