In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns',None)
pd.set_option('display.max_rows',None)

In [3]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTEENN
from imblearn.under_sampling import TomekLinks
from sklearn.utils.class_weight import compute_class_weight

In [4]:
df =  pd.read_csv('simple_imbalanced_dataset.csv')
df.head()
df.shape

(1000, 4)

In [5]:
df.isna().sum()

Unnamed: 0,0
age,0
income,0
years_of_experience,0
label,0


In [6]:
df.duplicated().sum()

0

In [7]:
df = df[df['age'] - df['years_of_experience'] >= 18]
df.shape

(554, 4)

In [8]:
df.info()
df['label'].value_counts()

<class 'pandas.core.frame.DataFrame'>
Index: 554 entries, 4 to 998
Data columns (total 4 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   age                  554 non-null    int64  
 1   income               554 non-null    int64  
 2   years_of_experience  554 non-null    int64  
 3   label                554 non-null    float64
dtypes: float64(1), int64(3)
memory usage: 21.6 KB


Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
0.0,389
1.0,165


In [9]:
X = df.drop("label" , axis =1)
y =df['label']

X_train , X_test , y_train , y_test = train_test_split(X , y , test_size = .25 , random_state =23)


In [10]:
from collections import Counter

In [11]:
# Random Under Sampling

print("Original class distribution:", Counter(y_train))
rus = RandomUnderSampler(random_state=23)
X_rus, y_rus = rus.fit_resample(X_train, y_train)
print("Original class distribution:", Counter(y_rus))

Original class distribution: Counter({0.0: 293, 1.0: 122})
Original class distribution: Counter({0.0: 122, 1.0: 122})


In [12]:
# Random Over Sampling
from imblearn.over_sampling import RandomOverSampler
print("Original class distribution:", Counter(y_train))
ros = RandomOverSampler(random_state=23)
X_ros, y_ros = ros.fit_resample(X_train, y_train)
print("Original class distribution:", Counter(y_ros))

Original class distribution: Counter({0.0: 293, 1.0: 122})
Original class distribution: Counter({1.0: 293, 0.0: 293})


In [13]:
# SMOTE(Synthetic Minority Oversampling Techniqu)
print("Original class distribution:", Counter(y_train))
smote = SMOTE(random_state=42)
X_smote, y_smote = smote.fit_resample(X_train, y_train)
print("Original class distribution:", Counter(y_smote))

Original class distribution: Counter({0.0: 293, 1.0: 122})
Original class distribution: Counter({1.0: 293, 0.0: 293})


In [14]:
# Tomek Links
tl = TomekLinks()
X_tl, y_tl = tl.fit_resample(X_train, y_train)
print("Original class distribution:", Counter(y_tl))

Original class distribution: Counter({0.0: 224, 1.0: 122})


In [15]:
# Class Weights
class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(y_train), y=y_train)
class_weights_dict = dict(enumerate(class_weights))
class_weights

array([0.70819113, 1.70081967])

In [16]:
# Function to evaluate model performance
def evaluate_model(X_train, y_train, X_test, y_test, class_weights=None):
    model = RandomForestClassifier(class_weight=class_weights, random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:, 1]

    print(classification_report(y_test, y_pred))
    print("AUC:", roc_auc_score(y_test, y_prob))
    return model

evaluate_model(X_train, y_train, X_test, y_test, class_weights=None)

              precision    recall  f1-score   support

         0.0       0.68      0.83      0.75        96
         1.0       0.27      0.14      0.18        43

    accuracy                           0.62       139
   macro avg       0.48      0.49      0.47       139
weighted avg       0.56      0.62      0.58       139

AUC: 0.49878875968992253


In [17]:
# Evaluate each sampling technique
print("Random Undersampling:")
evaluate_model(X_rus, y_rus, X_test, y_test)

print("\nRandom Oversampling:")
evaluate_model(X_ros, y_ros, X_test, y_test)

print("\nSMOTE:")
evaluate_model(X_smote, y_smote, X_test, y_test)

print("\nTomek Links:")
evaluate_model(X_tl, y_tl, X_test, y_test)

print("\nClass Weights:")
evaluate_model(X_train, y_train, X_test, y_test, class_weights_dict)

Random Undersampling:
              precision    recall  f1-score   support

         0.0       0.70      0.50      0.58        96
         1.0       0.31      0.51      0.39        43

    accuracy                           0.50       139
   macro avg       0.50      0.51      0.49       139
weighted avg       0.58      0.50      0.52       139

AUC: 0.5078730620155039

Random Oversampling:
              precision    recall  f1-score   support

         0.0       0.69      0.71      0.70        96
         1.0       0.30      0.28      0.29        43

    accuracy                           0.58       139
   macro avg       0.49      0.49      0.49       139
weighted avg       0.57      0.58      0.57       139

AUC: 0.5205910852713178

SMOTE:
              precision    recall  f1-score   support

         0.0       0.72      0.60      0.66        96
         1.0       0.34      0.47      0.40        43

    accuracy                           0.56       139
   macro avg       0.53     