<a href="https://colab.research.google.com/github/cagBRT/Data/blob/main/Data_Sampling_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Synthetic Minority Oversampling Technique (SMOTE)**

As we saw in Data Sampling 1, oversampling duplicates data from the minority class until the classes are of similar size.

Instead of duplicating rows, SMOTE synthesizes new minority-class examples from existing ones.


**Import libraries**

In [None]:
# Generate and plot a synthetic imbalanced classification dataset
from collections import Counter
from sklearn.datasets import make_classification
from matplotlib import pyplot
from numpy import where

Create an imbalanced dataset

In [None]:
# build a 1000-row, two-feature binary dataset; weights=[0.99] puts roughly
# 99% of the rows in class 0, leaving class 1 as the rare minority
X, y = make_classification(
    n_samples=1000,
    n_features=2,
    n_redundant=0,
    n_clusters_per_class=1,
    weights=[0.99],
    flip_y=0,
    random_state=1,
)
# count how many rows carry each label and show the tally
counter = Counter(y)
print(counter)

In [None]:
# scatter plot of examples by class label: one scatter series per class,
# drawn from the labels recorded in `counter`
for label in counter:
  # row indices of the examples that belong to this class
  row_ix = where(y == label)[0]
  pyplot.scatter(X[row_ix, 0], X[row_ix, 1], label=str(label))
# build the legend once, after every class has been drawn, instead of
# rebuilding it on each loop iteration
pyplot.legend()
pyplot.show()

**Use SMOTE to augment the minority class**

SMOTE works by:<br>
1. Select a random example from the minority class
2. Find the k nearest neighbors of the selected example (the minority-class examples closest to it in the feature space)
3. Draw a line between the example and one of those neighbors, chosen at random
4. Draw a new sample at a point along that line.


**Import the SMOTE library**

In [None]:
# Oversample and plot imbalanced dataset with SMOTE
from imblearn.over_sampling import SMOTE

**Transform the dataset using SMOTE**

In [None]:
# use SMOTE to transform the dataset
# SMOTE() is created with its default settings. fit_resample returns the
# resampled features and labels, which overwrite X and y, so the original
# imbalanced arrays are no longer available after this cell runs.
oversample = SMOTE()
X, y = oversample.fit_resample(X, y)

In [None]:
# summarize the new class distribution
# recount the labels on the resampled y; this replaces the earlier `counter`
counter = Counter(y)
print(counter)

In [None]:
# scatter plot of examples by class label: one scatter series per class,
# drawn from the labels recorded in `counter`
for label in counter:
  # row indices of the examples that belong to this class
  row_ix = where(y == label)[0]
  pyplot.scatter(X[row_ix, 0], X[row_ix, 1], label=str(label))
# build the legend once, after every class has been drawn, instead of
# rebuilding it on each loop iteration
pyplot.legend()
pyplot.show()

**A downside of SMOTE** is that synthetic examples are created without considering the majority class, which can result in ambiguous examples when there is a strong overlap between the classes.

**Assignment #1**<br>
Modify the number of datapoints in the dataset. 
What happens when the dataset has fewer than 1000 examples?



---



In [None]:
# baseline: decision tree cross-validated on the raw imbalanced dataset
from numpy import mean
from sklearn.datasets import make_classification
from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score
from sklearn.tree import DecisionTreeClassifier

# 10,000 two-feature samples; weights=[0.99] puts roughly 99% in class 0
X, y = make_classification(
    n_samples=10000,
    n_features=2,
    n_redundant=0,
    n_clusters_per_class=1,
    weights=[0.99],
    flip_y=0,
    random_state=1,
)

# decision tree with default hyperparameters (no resampling step here)
model = DecisionTreeClassifier()

# evaluate the model: 10 stratified folds repeated 3 times, scored by ROC AUC
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
scores = cross_val_score(model, X, y, scoring='roc_auc', cv=cv, n_jobs=-1)
print('Mean ROC AUC: %.3f' % mean(scores))

In [None]:
# decision tree cross-validated on the imbalanced dataset, with SMOTE
# oversampling as the first step of the pipeline
from numpy import mean
from sklearn.datasets import make_classification
from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE

# 10,000 two-feature samples; weights=[0.99] puts roughly 99% in class 0
X, y = make_classification(
    n_samples=10000,
    n_features=2,
    n_redundant=0,
    n_clusters_per_class=1,
    weights=[0.99],
    flip_y=0,
    random_state=1,
)

# pipeline: SMOTE oversampling ('over') followed by the classifier ('model')
steps = [
    ('over', SMOTE()),
    ('model', DecisionTreeClassifier()),
]
pipeline = Pipeline(steps=steps)

# evaluate the pipeline: 10 stratified folds repeated 3 times, scored by ROC AUC
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
scores = cross_val_score(pipeline, X, y, scoring='roc_auc', cv=cv, n_jobs=-1)
print('Mean ROC AUC: %.3f' % mean(scores))

In [None]:
# grid search k value for SMOTE oversampling for imbalanced classification
from numpy import mean
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
# define dataset
X, y = make_classification(n_samples=10000, n_features=2, n_redundant=0,
n_clusters_per_class=1, weights=[0.99], flip_y=0, random_state=1)
# the splitter is seeded and does not depend on k, so build it once and
# reuse it for every candidate instead of recreating it per iteration
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# values to evaluate
k_values = [1, 2, 3, 4, 5, 6, 7]
for k in k_values:
  # define pipeline: SMOTE with k neighbors, then a fresh decision tree
  model = DecisionTreeClassifier()
  over = SMOTE(sampling_strategy=0.1, k_neighbors=k)
  pipeline = Pipeline(steps=[('over', over), ('model', model)])
  # evaluate pipeline and report the mean ROC AUC for this k
  scores = cross_val_score(pipeline, X, y, scoring='roc_auc', cv=cv, n_jobs=-1)
  score = mean(scores)
  print('> k=%d, Mean ROC AUC: %.3f' % (k, score))

In [None]:
# borderline-SMOTE for imbalanced dataset
from collections import Counter
from sklearn.datasets import make_classification
from imblearn.over_sampling import BorderlineSMOTE
from matplotlib import pyplot
from numpy import where
# define dataset
X, y = make_classification(n_samples=10000, n_features=2, n_redundant=0,
n_clusters_per_class=1, weights=[0.99], flip_y=0, random_state=1)
# summarize class distribution
counter = Counter(y)
print(counter)
# transform the dataset
oversample = BorderlineSMOTE()
X, y = oversample.fit_resample(X, y)
# summarize the new class distribution
# (the recount is required: without it the print below would repeat the
# pre-resampling tally)
counter = Counter(y)
print(counter)
# scatter plot of examples by class label
for label in counter:
  # row indices of the examples that belong to this class
  row_ix = where(y == label)[0]
  pyplot.scatter(X[row_ix, 0], X[row_ix, 1], label=str(label))
# build the legend once, after every class has been drawn
pyplot.legend()
pyplot.show()

In [None]:
# borderline-SMOTE with SVM for imbalanced dataset
from collections import Counter
from sklearn.datasets import make_classification
from imblearn.over_sampling import SVMSMOTE
from matplotlib import pyplot
from numpy import where
# define dataset
X, y = make_classification(n_samples=10000, n_features=2, n_redundant=0,
n_clusters_per_class=1, weights=[0.99], flip_y=0, random_state=1)
# summarize class distribution
counter = Counter(y)
print(counter)
# transform the dataset
oversample = SVMSMOTE()
X, y = oversample.fit_resample(X, y)
# summarize the new class distribution
# (the recount is required: without it the print below would repeat the
# pre-resampling tally)
counter = Counter(y)
print(counter)
# scatter plot of examples by class label
for label in counter:
  # row indices of the examples that belong to this class
  row_ix = where(y == label)[0]
  pyplot.scatter(X[row_ix, 0], X[row_ix, 1], label=str(label))
# build the legend once, after every class has been drawn
pyplot.legend()
pyplot.show()

In [None]:
# Oversample and plot imbalanced dataset with ADASYN
from collections import Counter
from sklearn.datasets import make_classification
from imblearn.over_sampling import ADASYN
from matplotlib import pyplot
from numpy import where
# define dataset
X, y = make_classification(n_samples=10000, n_features=2, n_redundant=0,
n_clusters_per_class=1, weights=[0.99], flip_y=0, random_state=1)
# summarize class distribution
counter = Counter(y)
print(counter)
# transform the dataset
oversample = ADASYN()
X, y = oversample.fit_resample(X, y)
# summarize the new class distribution
# (the recount is required: without it the print below would repeat the
# pre-resampling tally)
counter = Counter(y)
print(counter)
# scatter plot of examples by class label
for label in counter:
  # row indices of the examples that belong to this class
  row_ix = where(y == label)[0]
  pyplot.scatter(X[row_ix, 0], X[row_ix, 1], label=str(label))
# build the legend once, after every class has been drawn
pyplot.legend()
pyplot.show()