In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, f1_score
from imblearn.over_sampling import SMOTE, BorderlineSMOTE, ADASYN
from imblearn.under_sampling import RandomUnderSampler, NearMiss, TomekLinks
from imblearn.combine import SMOTETomek, SMOTEENN
from collections import Counter

## Data Loading

In [11]:
# Read the diabetes dataset from the notebook's working directory.
data = pd.read_csv(filepath_or_buffer='diabetes.csv')

# Preview the first five rows as this cell's displayed output.
data.head(n=5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [12]:
# Number of rows per target class; shows how imbalanced the dataset is.
data['Outcome'].value_counts()

Outcome
0    500
1    268
Name: count, dtype: int64

We use the diabetes dataset, which is imbalanced: class 0 has 500 samples and class 1 has 268.

## Data Preparation

In [16]:
# Separate the feature columns from the target column.
X = data.drop(columns='Outcome')
y = data['Outcome']

# Hold out 30% of the rows for testing. Stratifying on y keeps the class
# proportions of the full dataset in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=1,
)

print('Original class distribution:', Counter(y_train))

# Macro-F1 score of each resampling strategy, keyed by strategy name.
results = dict()

Original class distribution: Counter({0: 350, 1: 187})


When performing any feature engineering, including resampling, we always split the data into train and test sets first. Feature engineering and resampling are applied only to the training data; the test data must remain untouched because it stands in for unseen / new data.

For this practice, we pass `stratify=y` to the train/test split so that both splits follow the class distribution of `y` in the original dataset.

## Resampling
### Baseline (no sampling)

In [15]:
# Reference model: a random forest fitted on the untouched training split.
# `fit` returns the estimator itself, so construction and fitting can be chained.
clf = RandomForestClassifier(random_state=1).fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Record the macro-averaged F1 on the held-out test set for the final comparison.
results.update({'No Sampling': f1_score(y_true=y_test, y_pred=y_pred, average='macro')})

print('\n=== No Sampling ===')
print(classification_report(y_true=y_test, y_pred=y_pred))


=== No Sampling ===
              precision    recall  f1-score   support

           0       0.77      0.85      0.80       150
           1       0.65      0.52      0.58        81

    accuracy                           0.73       231
   macro avg       0.71      0.68      0.69       231
weighted avg       0.72      0.73      0.72       231



### SMOTE Oversampling

In [20]:
# Oversample the training split with SMOTE; the test split is left untouched.
smote = SMOTE(random_state=1)
X_smote, y_smote = smote.fit_resample(X_train, y_train)

print("After SMOTE:", Counter(y_smote))

# Refit the shared classifier on the resampled data and score it on the
# original test split.
y_pred = clf.fit(X_smote, y_smote).predict(X_test)

# Record the macro-averaged F1 for the final comparison.
results.update({'SMOTE Sampling': f1_score(y_true=y_test, y_pred=y_pred, average='macro')})

print('\n=== SMOTE Sampling ===')
print(classification_report(y_true=y_test, y_pred=y_pred))

After SMOTE: Counter({1: 350, 0: 350})

=== SMOTE Sampling ===
              precision    recall  f1-score   support

           0       0.80      0.79      0.79       150
           1       0.61      0.63      0.62        81

    accuracy                           0.73       231
   macro avg       0.71      0.71      0.71       231
weighted avg       0.73      0.73      0.73       231



### Borderline SMOTE Oversampling

In [22]:
# Oversample the training split with the Borderline variant of SMOTE.
bsmote = BorderlineSMOTE(random_state=1)
X_bsmote, y_bsmote = bsmote.fit_resample(X_train, y_train)

print("After BSMOTE:", Counter(y_bsmote))

# Refit the shared classifier on the resampled data and score it on the
# original test split.
y_pred = clf.fit(X_bsmote, y_bsmote).predict(X_test)

# Record the macro-averaged F1 for the final comparison.
results.update({'BSMOTE Sampling': f1_score(y_true=y_test, y_pred=y_pred, average='macro')})

print('\n=== BSMOTE Sampling ===')
print(classification_report(y_true=y_test, y_pred=y_pred))

After BSMOTE: Counter({1: 350, 0: 350})

=== BSMOTE Sampling ===
              precision    recall  f1-score   support

           0       0.80      0.77      0.78       150
           1       0.60      0.64      0.62        81

    accuracy                           0.72       231
   macro avg       0.70      0.70      0.70       231
weighted avg       0.73      0.72      0.73       231



### Random Undersampling

In [23]:
# Undersample the training split by randomly discarding samples.
rus = RandomUnderSampler(random_state=1)
X_rus, y_rus = rus.fit_resample(X_train, y_train)

print("After RUS:", Counter(y_rus))

# Refit the shared classifier on the reduced data and score it on the
# original test split.
y_pred = clf.fit(X_rus, y_rus).predict(X_test)

# Record the macro-averaged F1 for the final comparison.
results.update({'Random Under Sampling': f1_score(y_true=y_test, y_pred=y_pred, average='macro')})

print('\n=== Random Under Sampling ===')
print(classification_report(y_true=y_test, y_pred=y_pred))

After RUS: Counter({0: 187, 1: 187})

=== Random Under Sampling ===
              precision    recall  f1-score   support

           0       0.82      0.72      0.77       150
           1       0.58      0.72      0.64        81

    accuracy                           0.72       231
   macro avg       0.70      0.72      0.70       231
weighted avg       0.74      0.72      0.72       231



### Near Miss Undersampling

In [24]:
# Undersample the training split with NearMiss (version 1). No random_state
# is passed to this sampler.
nm = NearMiss(version=1)
X_nm, y_nm = nm.fit_resample(X_train, y_train)

print("After NearMiss:", Counter(y_nm))

# Refit the shared classifier on the reduced data and score it on the
# original test split.
y_pred = clf.fit(X_nm, y_nm).predict(X_test)

# Record the macro-averaged F1 for the final comparison.
results.update({'NearMiss Sampling': f1_score(y_true=y_test, y_pred=y_pred, average='macro')})

print('\n=== NearMiss Sampling ===')
print(classification_report(y_true=y_test, y_pred=y_pred))

After NearMiss: Counter({0: 187, 1: 187})

=== NearMiss Sampling ===
              precision    recall  f1-score   support

           0       0.82      0.76      0.79       150
           1       0.61      0.69      0.65        81

    accuracy                           0.74       231
   macro avg       0.71      0.73      0.72       231
weighted avg       0.75      0.74      0.74       231



### SMOTE + Tomek

In [31]:
# Combined resampling of the training split: SMOTE oversampling plus
# Tomek-link cleaning.
smote_tomek = SMOTETomek(random_state=1)
X_st, y_st = smote_tomek.fit_resample(X_train, y_train)

print("After SMOTETomek:", Counter(y_st))

# Refit the shared classifier on the resampled data and score it on the
# original test split.
y_pred = clf.fit(X_st, y_st).predict(X_test)

# Record the macro-averaged F1 for the final comparison.
results.update({'SMOTETomek Sampling': f1_score(y_true=y_test, y_pred=y_pred, average='macro')})

print('\n=== SMOTETomek Sampling ===')
print(classification_report(y_true=y_test, y_pred=y_pred))

After SMOTETomek: Counter({1: 326, 0: 326})

=== SMOTETomek Sampling ===
              precision    recall  f1-score   support

           0       0.80      0.79      0.79       150
           1       0.61      0.63      0.62        81

    accuracy                           0.73       231
   macro avg       0.71      0.71      0.71       231
weighted avg       0.73      0.73      0.73       231



### SMOTE + ENN

In [30]:
# Combined resampling of the training split: SMOTE oversampling plus
# Edited Nearest Neighbours (ENN) cleaning.
smote_enn = SMOTEENN(random_state=1)
X_se, y_se = smote_enn.fit_resample(X_train, y_train)

print("After SMOTEENN:", Counter(y_se))

# Refit the shared classifier on the resampled data and score it on the
# original test split.
y_pred = clf.fit(X_se, y_se).predict(X_test)

# Record the macro-averaged F1 for the final comparison.
results.update({'SMOTEENN Sampling': f1_score(y_true=y_test, y_pred=y_pred, average='macro')})

print('\n=== SMOTEENN Sampling ===')
print(classification_report(y_true=y_test, y_pred=y_pred))

After SMOTEENN: Counter({1: 217, 0: 178})

=== SMOTEENN Sampling ===
              precision    recall  f1-score   support

           0       0.83      0.74      0.78       150
           1       0.60      0.73      0.66        81

    accuracy                           0.74       231
   macro avg       0.72      0.73      0.72       231
weighted avg       0.75      0.74      0.74       231



In [29]:
# Rank every recorded strategy by its macro-F1 score, best first.
print("\n=== COMPARISON ===")
for method in sorted(results, key=results.get, reverse=True):
    score = results[method]
    print(f"{method}: {score:.4f}")


=== COMPARISON ===
SMOTEENN Sampling: 0.7218
NearMiss Sampling: 0.7182
SMOTE Sampling: 0.7069
SMOTETomek Sampling: 0.7069
Random Under Sampling: 0.7048
BSMOTE Sampling: 0.7007


A common misconception: combining SMOTE with Tomek links or ENN is not just oversampling. There is also a reduction (cleaning) step, so the resulting number of samples is lower than with SMOTE alone:
1. SMOTE generates synthetic samples along the lines between minority samples; some of them land very close to majority samples and create overlap at the class boundary.
2. These synthetic samples form Tomek links with majority samples.
3. The Tomek / ENN step removes majority samples that are in Tomek links, and ENN also removes samples that are in the wrong neighbourhood (i.e. most of their nearest neighbours belong to the other class).

One thing to note: in this specific case, Tomek and ENN can also remove minority samples, which is why the counts of both classes drop after SMOTETomek and SMOTEENN.