In [None]:
!pip install imblearn

Collecting imblearn
  Downloading imblearn-0.0-py2.py3-none-any.whl (1.9 kB)
Collecting imbalanced-learn
  Downloading imbalanced_learn-0.9.1-py3-none-any.whl (199 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.3/199.3 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: imbalanced-learn, imblearn
Successfully installed imbalanced-learn-0.9.1 imblearn-0.0


# Import Libraries

In [None]:
import numpy as np
from numpy import mean
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from imblearn.pipeline import Pipeline as imb_pipeline
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import SVMSMOTE
from imblearn.over_sampling import ADASYN

# Create Dataset

In [None]:
# Synthetic binary classification data: 10,000 samples, 2 features (none
# redundant), one cluster per class, no label noise, and a 0.99 weight on the
# first class, which yields the heavy imbalance this notebook studies.
X, y = make_classification(
    n_samples=10000,
    n_features=2,
    n_redundant=0,
    n_clusters_per_class=1,
    weights=[0.99],
    flip_y=0,
    random_state=1,
)

In [None]:
# Tabulate each class label next to its sample count, one row per label.
unique, counts = np.unique(y, return_counts=True)
frequencies = np.column_stack((unique, counts))
print(frequencies)

[[   0 9900]
 [   1  100]]


In [None]:
from collections import Counter

# Class tally of the labels using the standard-library Counter.
counter = Counter(y)
print(counter)

Counter({0: 9900, 1: 100})


# Use Sampling Methods in Pipeline

In [None]:
# define pipeline
# Every stochastic component is seeded so the scores computed from these
# objects are reproducible from run to run. Previously only the CV splitter
# was seeded; the decision trees and the samplers were not.
dtree = DecisionTreeClassifier(random_state=1)
pipe_rand_over = imb_pipeline([
    ('over', RandomOverSampler(random_state=1)),
    ('model', DecisionTreeClassifier(random_state=1)),
])
pipe_rand_smote = imb_pipeline([
    ('smote', SMOTE(random_state=1)),
    ('model', DecisionTreeClassifier(random_state=1)),
])
# evaluate pipeline
# 10-fold stratified CV repeated 3 times, shared by the scoring cells below.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

In [None]:
# Baseline: the plain decision tree without any resampling.
scores = cross_val_score(dtree, X, y, scoring='roc_auc', cv=cv, n_jobs=-1)
score = mean(scores)
# The scorer requested above is ROC AUC, so the value is labelled as such
# (it was previously printed under an "F-measure" label).
print('ROC AUC: %.3f' % score)

F-measure: 0.766


In [None]:
# Random oversampling followed by a decision tree.
scores = cross_val_score(pipe_rand_over, X, y, scoring='roc_auc', cv=cv, n_jobs=-1)
score = mean(scores)
# The scorer requested above is ROC AUC, so the value is labelled as such
# (it was previously printed under an "F-measure" label).
print('ROC AUC: %.3f' % score)

F-measure: 0.753


In [None]:
# SMOTE oversampling followed by a decision tree.
scores = cross_val_score(pipe_rand_smote, X, y, scoring='roc_auc', cv=cv, n_jobs=-1)
score = mean(scores)
# The scorer requested above is ROC AUC, so the value is labelled as such
# (it was previously printed under an "F-measure" label).
print('ROC AUC: %.3f' % score)

F-measure: 0.820


# Use Sampling Methods with GridSearchCV

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# GridSearch with oversampling (SMOTE)
# Sampler and tree are seeded so that the selected parameters and the reported
# score are reproducible across runs.
pipe_rand_smote = imb_pipeline([
    ('smote', SMOTE(random_state=1)),
    ('model', DecisionTreeClassifier(random_state=1)),
])
param_grid = {
    # sampler hyper-parameter
    'smote__k_neighbors': [1, 2, 3, 4, 5],
    # classifier hyper-parameter
    'model__max_depth': [2, 3, 4, 5, 6],
}

# apply grid search, selecting by cross-validated ROC AUC
grid_smote = GridSearchCV(pipe_rand_smote, param_grid, cv=5, n_jobs=2, scoring='roc_auc')
grid_smote.fit(X, y)

print("Best parameters: {}".format(grid_smote.best_params_))
print("Best Mean cross-validation score: {:.2f}".format(grid_smote.best_score_))

Best parameters: {'model__max_depth': 3, 'smote__k_neighbors': 5}
Best Mean cross-validation score: 0.94


In [None]:
# GridSearch with oversampling (SVMSMOTE)
# Sampler and tree are seeded so that the selected parameters and the reported
# score are reproducible across runs.
pipe_rand_svmsmote = imb_pipeline([
    ('svmsmote', SVMSMOTE(random_state=1)),
    ('model', DecisionTreeClassifier(random_state=1)),
])
param_grid = {
    # sampler hyper-parameter
    'svmsmote__k_neighbors': [1, 2, 3, 4, 5],
    # classifier hyper-parameter
    'model__max_depth': [2, 3, 4, 5, 6],
}

# apply grid search, selecting by cross-validated ROC AUC
grid_svmsmote = GridSearchCV(pipe_rand_svmsmote, param_grid, cv=5, n_jobs=2, scoring='roc_auc')
grid_svmsmote.fit(X, y)

print("Best parameters: {}".format(grid_svmsmote.best_params_))
print("Best Mean cross-validation score: {:.2f}".format(grid_svmsmote.best_score_))

Best parameters: {'model__max_depth': 4, 'svmsmote__k_neighbors': 2}
Best Mean cross-validation score: 0.91


In [None]:
# GridSearch with oversampling (ADASYN)
# Sampler and tree are seeded so that the selected parameters and the reported
# score are reproducible across runs.
pipe_rand_adasyn = imb_pipeline([
    ('adasyn', ADASYN(random_state=1)),
    ('model', DecisionTreeClassifier(random_state=1)),
])
param_grid = {
    # sampler hyper-parameter
    'adasyn__n_neighbors': [1, 2, 3, 4, 5],
    # classifier hyper-parameter
    'model__max_depth': [2, 3, 4, 5, 6],
}

# apply grid search, selecting by cross-validated ROC AUC
grid_adasyn = GridSearchCV(pipe_rand_adasyn, param_grid, cv=5, n_jobs=2, scoring='roc_auc')
grid_adasyn.fit(X, y)

print("Best parameters: {}".format(grid_adasyn.best_params_))
print("Best Mean cross-validation score: {:.2f}".format(grid_adasyn.best_score_))

Best parameters: {'adasyn__n_neighbors': 5, 'model__max_depth': 3}
Best Mean cross-validation score: 0.94


# Different Sampling Methods

<span class="mark">Approach inspired by https://machinelearningmastery.com/imbalanced-classification-of-good-and-bad-credit/</span>

In [None]:
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import BorderlineSMOTE
from imblearn.over_sampling import SVMSMOTE
from imblearn.over_sampling import ADASYN
from imblearn.under_sampling import TomekLinks
from imblearn.under_sampling import EditedNearestNeighbours
from imblearn.under_sampling import RepeatedEditedNearestNeighbours
from imblearn.under_sampling import NeighbourhoodCleaningRule
from imblearn.under_sampling import OneSidedSelection
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier

In [None]:
# define the resampling methods (under- and over-samplers) to test
def get_sampling(random_state=None):
    """Build the resamplers to compare, with a short display name for each.

    Args:
        random_state: Optional seed forwarded to the samplers that take a
            ``random_state`` argument (OSS and the five oversamplers). The
            default ``None`` leaves them unseeded, which is what calling
            ``get_sampling()`` did before this parameter existed.

    Returns:
        A ``(sampling, names)`` tuple of two parallel lists: the sampler
        instances and their labels, undersamplers first, then oversamplers.
    """
    labelled_samplers = [
        # undersampling methods (imblearn.under_sampling)
        ('TL', TomekLinks()),
        ('ENN', EditedNearestNeighbours()),
        ('RENN', RepeatedEditedNearestNeighbours()),
        ('OSS', OneSidedSelection(random_state=random_state)),
        ('NCR', NeighbourhoodCleaningRule()),
        # oversampling methods (imblearn.over_sampling)
        ('ROS', RandomOverSampler(random_state=random_state)),
        ('SMOTE', SMOTE(random_state=random_state)),
        ('BLSMOTE', BorderlineSMOTE(random_state=random_state)),
        ('SVMSMOTE', SVMSMOTE(random_state=random_state)),
        ('ADASYN', ADASYN(random_state=random_state)),
    ]
    names = [label for label, _ in labelled_samplers]
    sampling = [sampler for _, sampler in labelled_samplers]
    return sampling, names

In [None]:
from sklearn.metrics import make_scorer
from sklearn.metrics import fbeta_score

In [None]:
def f2_measure(y_true, y_pred):
    """Return the F-beta score of ``y_pred`` against ``y_true`` with ``beta=2``."""
    f2 = fbeta_score(y_true, y_pred, beta=2)
    return f2

In [None]:
# Scorer object built from f2_measure; used as the `scoring` argument of the
# cross-validation and grid-search calls further down.
metric = make_scorer(score_func=f2_measure)

In [None]:
# evaluate a model
def evaluate_model(X, y, model):
    """Cross-validate ``model`` on ``(X, y)`` and return the per-fold scores.

    The evaluation uses one repeat of seeded 5-fold stratified
    cross-validation, scores with the module-level ``metric`` scorer, and
    runs the folds with ``n_jobs=-1``.
    """
    splitter = RepeatedStratifiedKFold(n_splits=5, n_repeats=1, random_state=1)
    return cross_val_score(model, X, y, scoring=metric, cv=splitter, n_jobs=-1)

In [None]:
sampling, names = get_sampling()
results = []
# Pair every sampler with a fresh, seeded random forest, cross-validate the
# resulting pipeline, and keep the fold scores for later comparison.
for name, sampler in zip(names, sampling):
    model_rf = RandomForestClassifier(random_state=42)
    pipe_sample = imb_pipeline(steps=[('s', sampler), ('m', model_rf)])
    scores_rf = evaluate_model(X, y, pipe_sample)
    results.append(scores_rf)
    # one line per sampler: label, mean score, (standard deviation)
    print('>%s %.3f (%.3f)' % (name, np.mean(scores_rf), np.std(scores_rf)))
    

>TL 0.591 (0.062)
>ENN 0.629 (0.056)
>RENN 0.631 (0.067)
>OSS 0.550 (0.136)
>NCR 0.641 (0.021)
>ROS 0.539 (0.063)
>SMOTE 0.405 (0.041)
>BLSMOTE 0.589 (0.070)
>SVMSMOTE 0.622 (0.051)
>ADASYN 0.329 (0.023)


In [None]:
from imblearn.under_sampling import NeighbourhoodCleaningRule as NCR

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Grid-search a seeded random forest placed behind an NCR undersampling step,
# selecting candidates with the `metric` scorer and keeping the training-fold
# scores alongside the validation ones.
rf = RandomForestClassifier(random_state=42)
rf_us_2 = imb_pipeline([('ncr', NCR()), ('model', rf)])
rf_us_2_param = {
    'model__max_depth': [2, 4, 6],
    'ncr__n_neighbors': [4, 12],
}
rf_us_2_grid = GridSearchCV(
    rf_us_2,
    rf_us_2_param,
    cv=5,
    return_train_score=True,
    scoring=metric,
)
rf_us_2_grid.fit(X, y)

In [None]:
# Cross-validated score of the best candidate and the settings that produced it.
print(f'Best Mean Cross Validation Score is {rf_us_2_grid.best_score_}')
print(f'Best Mean Cross Validation Param is {rf_us_2_grid.best_params_}')
# Mean score on the training folds for that same candidate; available because
# the search was created with return_train_score=True.
best_mean_train_score = rf_us_2_grid.cv_results_['mean_train_score'][rf_us_2_grid.best_index_]
print(f'Mean CV train score is {best_mean_train_score}')
# X, y is the only data in this notebook (no train/test/validation split is
# made), so scoring on it gives one resubstitution figure. It used to be
# printed three times under "Train", "Test" and "Val" labels, which suggested
# three different evaluations that never took place.
print(f'Score on full data (refit model, not held out) is {rf_us_2_grid.score(X, y)}')

Best Mean Cross Validation Score is 0.5603193702461997
Best Mean Cross Validation Param is {'model__max_depth': 6, 'ncr__n_neighbors': 12}
Train score is 0.6692913385826772
Test score is 0.6692913385826772
Val score is 0.6692913385826772
