
# SMOTE: Improving Minority-Class Performance

This notebook shows how to use **SMOTE (Synthetic Minority Over-Sampling Technique)** on an imbalanced binary classification problem.

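Note: this notebook requires the `imbalanced-learn` package (imported as `imblearn`). If you do not have it, install it first, e.g. with `pip install imbalanced-learn`.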


In [None]:
import matplotlib.pyplot as plt
import numpy as np
import warnings; warnings.filterwarnings("ignore")
from collections import Counter

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score, precision_score, recall_score
from sklearn.model_selection import GridSearchCV

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline

In [None]:
# Create a small, heavily imbalanced binary classification dataset:
# the class weights request roughly 98% class 0 and 2% class 1.
x, y = make_classification(
    n_samples=1000,
    n_features=2,
    n_redundant=0,
    n_clusters_per_class=1,
    weights=[0.98, 0.02],
    random_state=125,
)

# another to try afterwards
# x, y = make_classification(
#     n_samples=3000,
#     n_features=5,
#     n_informative=5,
#     n_redundant=0,
#     n_repeated=0,
#     n_classes=2,
#     n_clusters_per_class=1,
#     weights=[0.95, 0.05],
#     class_sep=0.5,
#     flip_y=0.0,
#     random_state=42
# )

# Class counts of the raw data. No resampling has happened yet, so the label
# says "before" (the previous "after oversampling" text was a copy-paste slip).
labels = Counter(y)
print("y labels before oversampling")
print(labels)

In [None]:
# Scatter plot of the two features of the raw dataset, coloured by class label.
# Title and axis labels are set so the figure can be told apart from the
# post-SMOTE plot when the notebook is skimmed.
fig, ax = plt.subplots()
ax.scatter(x[:, 0], x[:, 1], marker="o", c=y, edgecolor="k")
ax.set(title="Original data (before oversampling)", xlabel="Feature 0", ylabel="Feature 1");

In [None]:
# Hold out 20% of the samples as a fixed test set. Stratifying on y keeps the
# class proportions the same in the training and test splits.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Baseline: decision tree trained on the original (imbalanced) training data.
# random_state is fixed because the tree permutes the features at every split,
# so an unseeded fit can produce different trees (and metrics) on each run.
model = DecisionTreeClassifier(random_state=42)
model.fit(x_train, y_train)

# Run inference once and reuse the results for every metric below.
y_pred = model.predict(x_test)
y_score = model.predict_proba(x_test)[:, 1]  # probability column for class 1

# zero_division=0 makes the "class 1 is never predicted" case explicit:
# precision is reported as 0.0 instead of depending on suppressed warnings.
precision_score_tree_nosmote = precision_score(y_test, y_pred, pos_label=1, zero_division=0)
recall_score_tree_nosmote = recall_score(y_test, y_pred, pos_label=1)
auc_bin = roc_auc_score(y_test, y_score)
print(f"No-SMOTE model, Precision (1) = {precision_score_tree_nosmote:.3f}")
print(f"No-SMOTE model, Recall (1) = {recall_score_tree_nosmote:.3f}")
print(f"No-SMOTE model, ROC AUC = {auc_bin:.3f}")

print("Classification report:")
print(classification_report(y_test, y_pred, zero_division=0))

In [None]:
# Baseline: logistic regression trained on the original (imbalanced) training data.
model = LogisticRegression()
model.fit(x_train, y_train)

# Run inference once and reuse the results for every metric below.
y_pred = model.predict(x_test)
y_score = model.predict_proba(x_test)[:, 1]  # probability column for class 1

# zero_division=0 makes the "class 1 is never predicted" case explicit:
# precision is reported as 0.0 instead of depending on suppressed warnings.
precision_score_lr_nosmote = precision_score(y_test, y_pred, pos_label=1, zero_division=0)
recall_score_lr_nosmote = recall_score(y_test, y_pred, pos_label=1)
auc_bin = roc_auc_score(y_test, y_score)
print(f"No-SMOTE model, Precision (1) = {precision_score_lr_nosmote:.3f}")
print(f"No-SMOTE model, Recall (1) = {recall_score_lr_nosmote:.3f}")
print(f"No-SMOTE model, ROC AUC = {auc_bin:.3f}")

print("Classification report:")
print(classification_report(y_test, y_pred, zero_division=0))

In [None]:
# Oversample the minority class of the TRAINING data only; x_test / y_test are
# left untouched. random_state fixes which synthetic samples are generated, so
# the metrics computed from the resampled data are repeatable across runs.
over = SMOTE(random_state=42)

x_train_smote, y_train_smote = over.fit_resample(x_train, y_train)

# Class counts of the resampled training labels.
labels = Counter(y_train_smote)
print("y labels after oversampling")
print(labels)

In [None]:
# Scatter plot of the resampled training data, coloured by class label.
# Title and axis labels are set so the figure can be told apart from the
# plot of the original data when the notebook is skimmed.
fig, ax = plt.subplots()
ax.scatter(x_train_smote[:, 0], x_train_smote[:, 1], marker="o", c=y_train_smote, edgecolor="k")
ax.set(title="Training data after SMOTE", xlabel="Feature 0", ylabel="Feature 1");

In [None]:
# Decision tree trained on the SMOTE-resampled training data and evaluated on
# the untouched test set. random_state is fixed because the tree permutes the
# features at every split, so an unseeded fit can differ from run to run.
model = DecisionTreeClassifier(random_state=42)
model.fit(x_train_smote, y_train_smote)

# Run inference once and reuse the results for every metric below.
y_pred = model.predict(x_test)
y_score = model.predict_proba(x_test)[:, 1]  # probability column for class 1

# zero_division=0 makes the "class 1 is never predicted" case explicit:
# precision is reported as 0.0 instead of depending on suppressed warnings.
precision_score_tree_smote = precision_score(y_test, y_pred, pos_label=1, zero_division=0)
recall_score_tree_smote = recall_score(y_test, y_pred, pos_label=1)
auc_bin = roc_auc_score(y_test, y_score)
print(f"SMOTE model, Precision (1) = {precision_score_tree_smote:.3f}")
print(f"SMOTE model, Recall (1) = {recall_score_tree_smote:.3f}")
print(f"SMOTE model, ROC AUC = {auc_bin:.3f}")

print("Classification report:")
print(classification_report(y_test, y_pred, zero_division=0))

In [None]:
# Logistic regression trained on the SMOTE-resampled training data and
# evaluated on the untouched test set.
model = LogisticRegression()
model.fit(x_train_smote, y_train_smote)

# Run inference once and reuse the results for every metric below.
y_pred = model.predict(x_test)
y_score = model.predict_proba(x_test)[:, 1]  # probability column for class 1

# zero_division=0 makes the "class 1 is never predicted" case explicit:
# precision is reported as 0.0 instead of depending on suppressed warnings.
precision_score_lr_smote = precision_score(y_test, y_pred, pos_label=1, zero_division=0)
recall_score_lr_smote = recall_score(y_test, y_pred, pos_label=1)
auc_bin = roc_auc_score(y_test, y_score)
print(f"SMOTE model, Precision (1) = {precision_score_lr_smote:.3f}")
print(f"SMOTE model, Recall (1) = {recall_score_lr_smote:.3f}")
print(f"SMOTE model, ROC AUC = {auc_bin:.3f}")

print("Classification report:")
print(classification_report(y_test, y_pred, zero_division=0))

| Model  | SMOTE | Precision | Recall | ROC AUC |
| -----  | ----- | ----- | ----- | ----- |
| Tree   | No    | 0.000      | 0.000   | 0.474    |
|        | Yes   | 0.032      | 0.167   | 0.506    |
| LogReg | No    | 0.000      | 0.000   | 0.821   |
|        | Yes   | 0.102      | 0.833   | 0.878    |

In [None]:
# MUST BE CAREFUL
# Use imblearn.pipeline.Pipeline (not sklearn.pipeline.Pipeline): it accepts a
# sampler as a step and applies it only while fitting. During cross-validation
# each training fold is therefore oversampled, while predict / scoring run on
# the original, un-resampled validation data.

model = Pipeline([
    # Seeded so the synthetic samples, and therefore the CV scores, are repeatable.
    ('sampling', SMOTE(random_state=42)),
    ('classification', LogisticRegression()),
])

# Candidate values for the regularisation parameter C of the classifier step.
params = {
    'classification__C': [0.1, 1, 10]
}

grid = GridSearchCV(
    model,
    param_grid=params,
    scoring='roc_auc',
    cv=5,
)

grid.fit(x_train, y_train)

# Mean cross-validated score of the best parameter setting.
cv_results = grid.cv_results_
best_index = grid.best_index_
mean_test_score = cv_results['mean_test_score'][best_index]

# Report the search result; previously it was computed but never displayed.
print(f"Best parameters: {grid.best_params_}")
print(f"Mean ROC AUC of best model: {mean_test_score:.3f}")

In [None]:
# Evaluate the best pipeline found by the grid search on the held-out test set.
# Results go into *_lr_grid names so they do not overwrite the metrics of the
# plain SMOTE + LogisticRegression model (precision_score_lr_smote, ...).

# Run inference once and reuse the results for every metric below.
y_pred = grid.predict(x_test)
y_score = grid.predict_proba(x_test)[:, 1]  # probability column for class 1

# zero_division=0 makes the "class 1 is never predicted" case explicit:
# precision is reported as 0.0 instead of depending on suppressed warnings.
precision_score_lr_grid = precision_score(y_test, y_pred, pos_label=1, zero_division=0)
recall_score_lr_grid = recall_score(y_test, y_pred, pos_label=1)
auc_bin = roc_auc_score(y_test, y_score)
print(f"SMOTE grid, Precision (1) = {precision_score_lr_grid:.3f}")
print(f"SMOTE grid, Recall (1) = {recall_score_lr_grid:.3f}")
print(f"SMOTE grid, ROC AUC = {auc_bin:.3f}")

print("Classification report:")
print(classification_report(y_test, y_pred, zero_division=0))

| Model  | SMOTE | Precision | Recall | ROC AUC |
| -----  | ----- | ----- | ----- | ----- |
| Tree   | No    | 0.000      | 0.000   | 0.474    |
|        | Yes   | 0.032      | 0.167   | 0.506    |
| LogReg | No    | 0.000      | 0.000   | 0.821   |
|        | Yes   | 0.102      | 0.833   | 0.878    |
| LRGrid | Yes   | 0.102      | 0.833   | 0.881    |


## Notes

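- SMOTE synthesizes new minority-class samples by interpolating between existing minority samples and their nearest minority-class neighbors. Apply it only to the training set, so that the model sees a better-populated minority region when learning the decision boundary.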
- You should never apply SMOTE (or any resampling) to the test set.
- Expect recall for the minority class to improve the most. Precision may go up or down depending on the problem.
- Always compare models on the same fixed test set, and use metrics that stay informative under class imbalance (minority-class precision and recall, ROC-AUC, or others suited to your domain) rather than plain accuracy.
