In [1]:
pip install -U scikit-learn imbalanced-learn

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
from imblearn.over_sampling import SMOTE

# Sample dataset: 13 rows with an imbalanced label (4 'Minority' vs 9 'Majority').
df = pd.DataFrame({
    'Age': [22, 25, 27, 28, 30, 35, 40, 45, 50, 55, 60, 65, 70],
    'Income': [2000, 2500, 2700, 3200, 3500, 3800, 4000, 4200, 4300, 4500, 5000, 5500, 6000],
    'Class': ['Minority', 'Majority', 'Majority', 'Majority', 'Majority', 'Minority', 'Minority', 'Minority', 'Majority', 'Majority', 'Majority', 'Majority', 'Majority']
})

# Tunables for the resampling step.
SEED = 42        # fixed so the synthetic rows are reproducible across runs
K_NEIGHBORS = 3  # reduced from SMOTE's default of 5; see the guard in Step 3

# Step 1: Convert categorical labels to numerical values.
# The decode mapping is derived from the encode mapping so the two cannot drift apart.
label_to_code = {'Majority': 0, 'Minority': 1}
code_to_label = {code: label for label, code in label_to_code.items()}
df['Class'] = df['Class'].map(label_to_code)
# Series.map turns any label missing from the mapping into NaN; fail loudly instead
# of passing NaN targets on to the resampler.
assert df['Class'].notna().all(), "Unmapped label found in 'Class'; extend label_to_code"

# Step 2: Split features (X) and target variable (y)
X = df[['Age', 'Income']]
y = df['Class']

# Step 3: Apply SMOTE with a reduced neighbourhood size.
# SMOTE interpolates between a sample and its k nearest same-class neighbours, so the
# smallest class must contain more than K_NEIGHBORS rows (here: 4 rows > 3 neighbours).
smallest_class_size = y.value_counts().min()
assert smallest_class_size > K_NEIGHBORS, (
    f"Smallest class has {smallest_class_size} rows; "
    f"SMOTE needs more than k_neighbors={K_NEIGHBORS}"
)
smote = SMOTE(sampling_strategy='auto', random_state=SEED, k_neighbors=K_NEIGHBORS)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Step 4: Convert numeric labels back to categorical
y_resampled = y_resampled.map(code_to_label)

# Step 5: Combine the resampled dataset.
# fit_resample returns pandas objects for pandas input, so they can be concatenated
# directly; the explicit rename fixes the label column name regardless of the
# Series' current name.
df_balanced = pd.concat([X_resampled, y_resampled.rename('Class')], axis=1)

# Sanity check: with sampling_strategy='auto' every class should end up the same size.
class_counts = df_balanced['Class'].value_counts()
assert class_counts.nunique() == 1, f"Classes are still imbalanced:\n{class_counts}"
print(class_counts)
# Display the balanced dataset
df_balanced


Class
Minority    9
Majority    9
Name: count, dtype: int64


Unnamed: 0,Age,Income,Class
0,22,2000,Minority
1,25,2500,Majority
2,27,2700,Majority
3,28,3200,Majority
4,30,3500,Majority
5,35,3800,Minority
6,40,4000,Minority
7,45,4200,Minority
8,50,4300,Majority
9,55,4500,Majority
