<a href="https://colab.research.google.com/github/cagBRT/Data/blob/main/CombinedOverUnderSampling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# The dataset

In [None]:
from numpy import mean
from collections import Counter
from sklearn.datasets import make_classification
from matplotlib import pyplot
from numpy import where
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTETomek
from imblearn.under_sampling import TomekLinks
from imblearn.combine import SMOTEENN

In [None]:

# Build a synthetic two-feature binary classification problem in which
# roughly 99% of the examples belong to a single class.
X, y = make_classification(
    n_samples=10000,
    n_features=2,
    n_redundant=0,
    n_clusters_per_class=1,
    weights=[0.99],
    flip_y=0,
    random_state=1,
)
# Count the examples per class label and report the tally.
counter = Counter(y)
print(counter)

In [None]:

# Scatter plot of the two features with one series per class label, so the
# imbalance between the classes can be seen directly.
for label in counter:
    # row positions of every example carrying this label
    row_ix = where(y == label)[0]
    pyplot.scatter(X[row_ix, 0], X[row_ix, 1], label=str(label))
# Title and axis labels let the figure stand alone when the notebook is skimmed.
pyplot.title('Synthetic imbalanced dataset by class')
pyplot.xlabel('Feature 0')
pyplot.ylabel('Feature 1')
pyplot.legend()
pyplot.show()

# The model

In [None]:
# Baseline: a decision tree scored on the original, un-resampled data.
# random_state is pinned so the tree is built identically on every run,
# matching the seeded dataset and the seeded CV splitter below.
model = DecisionTreeClassifier(random_state=1)
# define evaluation procedure: 10 stratified folds, repeated 3 times
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# evaluate model (one ROC AUC score per fold/repeat)
scores = cross_val_score(model, X, y, scoring='roc_auc', cv=cv, n_jobs=-1)
# summarize performance
print('Mean ROC AUC: %.3f' % mean(scores))

# Manually Combine Over-Undersampling Methods

### Using the same model and dataset as the original problem

A good starting point for combining resampling techniques is to start with random or naive methods.<br>

Although simple, and often ineffective when applied in isolation, they can be effective when combined.<br>

**Random oversampling involves randomly duplicating examples in the minority class**, while<br>

**random undersampling involves randomly deleting examples from the majority class.**<br>

As these two transforms are performed on separate classes, the order in which they are applied to the training dataset does not matter.



In [None]:
# define resampling
# Both samplers are seeded so the reported class counts and the ROC AUC are
# reproducible from a fresh kernel.
over = RandomOverSampler(sampling_strategy=0.1, random_state=1)
under = RandomUnderSampler(sampling_strategy=0.5, random_state=1)
# Resample the full dataset once, only to report the resulting class counts;
# the cross-validated score below comes from the pipeline, not these arrays.
X_over, y_over = over.fit_resample(X, y)
X_under, y_under = under.fit_resample(X_over, y_over)
# define pipeline: oversample, then undersample, then fit the model
pipeline = Pipeline(steps=[('o', over), ('u', under), ('m', model)])
# define evaluation procedure
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# evaluate model
scores = cross_val_score(pipeline, X, y, scoring='roc_auc', cv=cv, n_jobs=-1)
print('over: Resampled dataset shape %s' % Counter(y_over))
print('over&under: Resampled dataset shape %s' % Counter(y_under))
# summarize performance
print('Mean ROC AUC: %.3f' % mean(scores))

# Manually Combine SMOTE and Random Undersampling

One of the most popular oversampling methods is the Synthetic Minority Oversampling Technique, or SMOTE for short.

SMOTE works by selecting examples that are close in the feature space, drawing a line between the examples in the feature space and drawing a new sample as a point along that line.



### Using the same model and dataset as the original problem

In [None]:
# define resampling: SMOTE oversampling followed by random undersampling.
# Both samplers are seeded so the counts and the score are reproducible.
over = SMOTE(sampling_strategy=0.1, random_state=1)
under = RandomUnderSampler(sampling_strategy=0.5, random_state=1)
# define pipeline
steps = [('o', over), ('u', under), ('m', model)]
pipeline = Pipeline(steps=steps)
# Resample the full dataset once, only to report the resulting class counts.
# The undersampler must consume the SMOTE output produced here
# (X_over3 / y_over3), not the RandomOverSampler arrays (X_over / y_over)
# left in the kernel by an earlier cell.
X_over3, y_over3 = over.fit_resample(X, y)
X_under3, y_under3 = under.fit_resample(X_over3, y_over3)
# define evaluation procedure
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# evaluate model
scores = cross_val_score(pipeline, X, y, scoring='roc_auc', cv=cv, n_jobs=-1)
print('over: Resampled dataset shape %s' % Counter(y_over3))
print('over&under: Resampled dataset shape %s' % Counter(y_under3))
# summarize performance
print('Mean ROC AUC: %.3f' % mean(scores))

# Combination of SMOTE and Tomek Links Undersampling


In [None]:
# define resampling: SMOTE oversampling combined with Tomek-links cleaning
# restricted to the majority class. The combined sampler is seeded so the
# reported counts and the score are reproducible.
resample = SMOTETomek(tomek=TomekLinks(sampling_strategy='majority'), random_state=1)
# define pipeline
pipeline = Pipeline(steps=[('r', resample), ('m', model)])
# Resample the full dataset once, only to report the resulting class counts.
X_resample, y_resample = resample.fit_resample(X, y)
# define evaluation procedure
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# evaluate model
scores = cross_val_score(pipeline, X, y, scoring='roc_auc', cv=cv, n_jobs=-1)
# summarize performance
print('Resampled dataset shape %s' % Counter(y_resample))
print('Mean ROC AUC: %.3f' % mean(scores))

# Combination of SMOTE and Edited Nearest Neighbors Undersampling

In [None]:
# define resampling: SMOTE oversampling combined with Edited Nearest
# Neighbours cleaning. Seeded so the reported counts and the score are
# reproducible.
resample = SMOTEENN(random_state=1)
# define pipeline
pipeline = Pipeline(steps=[('r', resample), ('m', model)])
# Resample the full dataset once, only to report the resulting class counts.
X_resample, y_resample = resample.fit_resample(X, y)
# define evaluation procedure
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# evaluate model
scores = cross_val_score(pipeline, X, y, scoring='roc_auc', cv=cv, n_jobs=-1)
# summarize performance
print('Resampled dataset shape %s' % Counter(y_resample))
print('Mean ROC AUC: %.3f' % mean(scores))