In [1]:
import numpy as np

"""
from .._metric_tensor import NearestNeighborsWithMetricTensor, MetricTensor
from ._OverSampling import OverSampling
from .._base import mode

from .._logger import logger
_logger = logger

__all__= ['EnsembleSMOTE']
"""

import smote_variants as sv
from smote_variants import *

from smote_variants import NearestNeighborsWithMetricTensor, MetricTensor
from smote_variants import OverSampling

import logging
_logger= logging.getLogger('smote_variants')


class EnsembleSMOTE(OverSampling):
    """
    References:
        * BibTex::

            @InProceedings{borderlineSMOTE,
                            author="Han, Hui
                            and Wang, Wen-Yuan
                            and Mao, Bing-Huan",
                            editor="Huang, De-Shuang
                            and Zhang, Xiao-Ping
                            and Huang, Guang-Bin",
                            title="Borderline-SMOTE: A New Over-Sampling Method
                                     in Imbalanced Data Sets Learning",
                            booktitle="Advances in Intelligent Computing",
                            year="2005",
                            publisher="Springer Berlin Heidelberg",
                            address="Berlin, Heidelberg",
                            pages="878--887",
                            isbn="978-3-540-31902-3"
                            }
    """

    categories = [OverSampling.cat_extensive,
                  OverSampling.cat_borderline,
                  OverSampling.cat_metric_learning]

    def __init__(self,
                 proportion=1.0,
                 smotes= [],
                 *,
                 nn_params={},
                 n_jobs=1,
                 random_state=None):
        """
        Constructor of the sampling object

        Args:
            proportion (float): proportion of the difference of n_maj and n_min
                                    to sample e.g. 1.0 means that after
                                    sampling the number of minority samples
                                    will be equal to the number of majority
                                    samples
            smotes (list): list of smote variants to use in the ensemble
            nn_params (dict): additional parameters for nearest neighbor calculations, any 
                                parameter NearestNeighbors accepts, and additionally use
                                {'metric': 'precomputed', 'metric_learning': '<method>', ...}
                                with <method> in 'ITML', 'LSML' to enable the learning of
                                the metric to be used for neighborhood calculations
            n_jobs (int): number of parallel jobs
            random_state (int/RandomState/None): initializer of random_state,
                                                    like in sklearn
        """
        super().__init__(random_state=random_state)
        self.check_greater_or_equal(proportion, 'proportion', 0)
        self.check_n_jobs(n_jobs, 'n_jobs')

        self.proportion = proportion
        self.smotes = smotes
        self.nn_params = nn_params
        self.n_jobs = n_jobs

        self.set_random_state(random_state)

    @ classmethod
    def parameter_combinations(cls, raw=False):
        """
        Generates reasonable parameter combinations.

        Returns:
            list(dict): a list of meaningful parameter combinations
        """
        parameter_combinations = {'proportion': [0.1, 0.25, 0.5, 0.75,
                                                 1.0, 1.5, 2.0],
                                  'smotes': [[('SMOTE', {'random_state': 5}),
                                                ('Borderline_SMOTE1', {'random_state': 5}),
                                                ('Borderline_SMOTE2', {'random_state': 5}),
                                                ('ADASYN', {'random_state': 5})]]}

        return cls.generate_parameter_combinations(parameter_combinations, raw)

    def sample(self, X, y):
        """
        Does the sample generation according to the class parameters.

        Args:
            X (np.ndarray): training set
            y (np.array): target labels

        Returns:
            (np.ndarray, np.array): the extended training set and target labels
        """
        _logger.info(self.__class__.__name__ + ": " +
                     "Running sampling via %s" % self.descriptor())

        self.class_label_statistics(y)

        if not self.check_enough_min_samples_for_sampling():
            return X.copy(), y.copy()

        # determining number of samples to be generated
        n_to_sample = self.det_n_to_sample(self.proportion,
                                           self.class_stats[self.maj_label],
                                           self.class_stats[self.min_label])

        if n_to_sample == 0:
            _logger.warning(self.__class__.__name__ +
                            ": " + "Sampling is not needed")
            return X.copy(), y.copy()

        oversamplers = [eval(name)(**params, random_state=5) for name, params in self.smotes]
        oversampler_results = [o.sample(X, y) for o in oversamplers]
        new_samples = np.vstack([r[0][len(X):] for r in oversampler_results])

        nn_params= {**self.nn_params}
        nn_params['metric_tensor']= self.metric_tensor_from_nn_params(nn_params, X, y)

        n_neighbors_dens = 11

        nn= NearestNeighborsWithMetricTensor(n_neighbors=n_neighbors_dens, 
                                                n_jobs=self.n_jobs, 
                                                **(nn_params))
        nn.fit(new_samples)

        seed_indices = self.random_state.choice(range(len(new_samples)), size=n_to_sample, replace=False)
        ind, dist= nn.kneighbors(new_samples[seed_indices], return_distance=True)
        densities = 1.0/dist[:,-1]

        for _ in range(1000):
            ind, dist= nn.kneighbors(new_samples[seed_indices], return_distance=True)
            new_seed_indices= np.random.choice(range(ind.shape[1]), size=n_to_sample, replace=True)
            ind_new, dist_new= nn.kneighbors(new_samples[new_seed_indices], return_distance=True)
            new_densities= 1.0/dist_new[:,-1]
            weights= np.nan_to_num(new_densities/densities, 1.0)
            seed_indices= np.where(weights < self.random_state.random(len(weights)), new_seed_indices, seed_indices)

        samples = new_samples[seed_indices]

        """
        samples = []
        while len(samples) < n_to_sample:
            seed_idx = self.random_state.randint(len(new_samples))
            for _ in range(100):
                indices = nn.kneighbors(new_samples[seed_idx].reshape(1, -1), return_distance=False)[0]
                seed_idx = self.random_state.choice(indices)
                #print(seed_idx, len(new_samples))
            samples.append(new_samples[seed_idx])

        samples = np.vstack(samples)
        """

        #samples = new_samples[self.random_state.choice(new_samples.shape[0], n_to_sample, replace=False), :]
        
        return (np.vstack([X, samples]),
                np.hstack([y, np.hstack([self.min_label]*n_to_sample)]))

    def get_params(self, deep=False):
        """
        Returns:
            dict: the parameters of the current sampling object
        """
        return {'proportion': self.proportion,
                'smotes': self.smotes,
                'nn_params': self.nn_params,
                'n_jobs': self.n_jobs,
                'random_state': self._random_state_init}


ModuleNotFoundError: No module named 'smote_variants.oversampling._ADASYN'

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import smote_variants as sv
import imbalanced_databases as imbd

from sklearn.datasets import make_classification
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

import logging
logging.getLogger('smote_variants').setLevel(logging.CRITICAL)

import warnings
warnings.filterwarnings("ignore")

import mldb.binary_classification as binclas

In [None]:
datasets= binclas.get_filtered_data_loaders(n_attr_encoded_bounds=[1, 20], n_bounds=[1, 800], n_minority_bounds=[10, 1000])

In [None]:
datasets= [binclas.load_cleveland_0_vs_4,
binclas.load_ecoli_0_1_4_6_vs_5,
binclas.load_ecoli4,
binclas.load_glass_0_1_4_6_vs_2,
binclas.load_glass2,
binclas.load_shuttle_6_vs_2_3,
binclas.load_habarman,
binclas.load_iris0,
binclas.load_new_thyroid1,
binclas.load_bupa,
binclas.load_appendicitis,
binclas.load_led7digit_0_2_4_5_6_7_8_9_vs_1,
binclas.load_page_blocks_1_3_vs_4,
binclas.load_winequality_red_3_vs_5,
binclas.load_yeast_1_vs_7,
binclas.load_pima,
binclas.load_wisconsin,
binclas.load_monk_2,
binclas.load_saheart,
binclas.load_australian,
binclas.load_abalone_3_vs_11]

In [None]:
validator = RepeatedStratifiedKFold(n_repeats=10, n_splits=5, random_state=5)

results= []

classifiers= [(DecisionTreeClassifier, {'random_state': 5}), 
                (SVC, {'probability': True, 'random_state': 5}),
                (KNeighborsClassifier, {'n_neighbors': 5})]

oversamplers_base = [sv.SMOTE, sv.Borderline_SMOTE1, sv.Borderline_SMOTE2, sv.ADASYN]

#nn_params_0= {}
#nn_params_1= {}

for j, dataset in enumerate(datasets):
    """
    X, y= make_classification(n_samples=100, 
                                n_features=4,
                                n_informative=2,
                                n_redundant=1,
                                n_repeated=0,
                                n_clusters_per_class=2,
                                weights=np.array([0.8, 0.2]),
                                random_state=j)
    ds= {'name': str(j)}
    """
    

    results_dataset= []

    ds= dataset()
    X, y= ds['data'], ds['target']

    print(j, ds['name'], len(X), len(np.unique(X, axis=0)))

    for i, (train, test) in enumerate(validator.split(X, y)):
        X_train, y_train = X[train], y[train]
        X_test, y_test = X[test], y[test]

        nn_params_0= {'metric': 'precomputed',
                        'metric_learning_method': 'id'}

        for o in oversamplers_base:
            oversampler= o(random_state=5, nn_params=nn_params_0, preferential=False)
            X_os_0, y_os_0 = oversampler.sample(X_train, y_train)

            for c, cp in classifiers:
                classifier = c(**cp).fit(X_os_0, y_os_0)
                y_proba_0 = classifier.predict_proba(oversampler.preprocessing_transform(X_test))[:,1]
                results_dataset.append([i, o.__name__, 'euc', roc_auc_score(y_test, y_proba_0), ds['name'], c.__name__, cp])
                #print(results_dataset[-1])

        o = EnsembleSMOTE
        oversampler= EnsembleSMOTE(smotes=[('SMOTE', {'proportion': 2.0}), 
                                            ('Borderline_SMOTE1', {'proportion': 2.0}), 
                                            ('Borderline_SMOTE2', {'proportion': 2.0}), 
                                            ('ADASYN', {'proportion': 2.0})])
        X_os_1, y_os_1 = oversampler.sample(X_train, y_train)
        for c, cp in classifiers:
            classifier = c(**cp).fit(X_os_1, y_os_1)
            y_proba_1 = classifier.predict_proba(oversampler.preprocessing_transform(X_test))[:,1]
            results_dataset.append([i, o.__name__, 'euc', roc_auc_score(y_test, y_proba_1), ds['name'], c.__name__, cp])

    data= pd.DataFrame(results_dataset, columns=['fold', 'oversampler', 'metric', 'auc', 'dataset', 'classifier', 'params'])
    print(data.groupby(['classifier', 'oversampler', 'metric']).agg({'auc': np.mean}))

    results.extend(results_dataset)
    
    data= pd.DataFrame(results, columns=['fold', 'oversampler', 'metric', 'auc', 'dataset', 'classifier', 'params'])
    print(data.groupby(['classifier', 'oversampler', 'metric']).agg({'auc': np.mean}))

0 cleveland-0_vs_4 177 177
                                                      auc
classifier             oversampler       metric          
DecisionTreeClassifier ADASYN            euc     0.769716
                       Borderline_SMOTE1 euc     0.703381
                       Borderline_SMOTE2 euc     0.681544
                       EnsembleSMOTE     euc     0.743532
                       SMOTE             euc     0.762576
KNeighborsClassifier   ADASYN            euc     0.977449
                       Borderline_SMOTE1 euc     0.976597
                       Borderline_SMOTE2 euc     0.967926
                       EnsembleSMOTE     euc     0.971815
                       SMOTE             euc     0.978671
SVC                    ADASYN            euc     0.982614
                       Borderline_SMOTE1 euc     0.980581
                       Borderline_SMOTE2 euc     0.972563
                       EnsembleSMOTE     euc     0.982809
                       SMOTE             euc 

KeyboardInterrupt: 