## Imbalanced Data - Oversampling and Undersampling

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from sklearn.model_selection import train_test_split

def income_model_data_prep(data):
    data = pd.get_dummies(
        income.assign(
                target = np.where(data["SalStat"]==" less than or equal to 50,000", 0,1),
                nativecountry = data["nativecountry"].str.replace(" Holand-Netherlands",
                                                         " Germany"),
                JobType = data["JobType"].replace({" Never-worked":" Without-pay"}),
                occupation = data["occupation"].str.replace(" Armed-Forces"," ?")
                ).drop("SalStat",axis=1),
        drop_first=True
    )
    X = data.drop(columns=["target"],axis=1)
    y = data["target"]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=4)

    return X_train, X_test, y_train, y_test

In [3]:
## Split train and test datasets

income = pd.read_csv("../Data/income.csv")

X_train, X_test, y_train, y_test = income_model_data_prep(income)

In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, accuracy_score

logreg = LogisticRegression(max_iter=1000)
lr = logreg.fit(X_train, y_train)

pred_orig = lr.predict(X_train)

print(f'Accuracy: {accuracy_score(y_test, lr.predict(X_test))}')
print(f'F1 Score: {f1_score(y_test, lr.predict(X_test))}')

Accuracy: 0.8541275797373359
F1 Score: 0.6652314316469322


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


### Sampling Methods
1. Umdersample the data to a 2:1 ratio of teh 0s and 1s. Generate a confusion matrix,
   and calculate common evaluations metrics.
2. Oversample the data using Random Oversampling by creating 4x of the number of 1s
   and fit a logistic regression.
3. Use SMOTE to oversample the data, creating 4x of the current number of 1s.

In [5]:
from imblearn import under_sampling as US

## Randomly undersample negative samples
RUS = US.RandomUnderSampler(sampling_strategy= 1/2, random_state=1024)

X_train_us, y_train_us = RUS.fit_resample(X_train, y_train)

lr_us = logreg.fit(X_train_us, y_train_us)

pred_us = lr_us.predict(X_train_us)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [6]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, accuracy_score, f1_score

confusion_matrix(y_train_us, pred_us)

array([[10910,  1392],
       [ 1850,  4301]])

In [7]:
print(f'Original accuracy: {accuracy_score(y_train, pred_orig)} ')
print(f'Undersampled accuracy: {accuracy_score(y_train_us, pred_us)} ')
print(f'Original precision: {precision_score(y_train, pred_orig)} ')
print(f'Undersampled precision: {precision_score(y_train_us, pred_us)} ')
print(f'Original recall: {recall_score(y_train, pred_orig)} ')
print(f'Undersampled recall: {recall_score(y_train_us, pred_us)} ')
print(f'Original F1 score: {f1_score(y_train, pred_orig)} ')
print(f'Undersampled F1 score: {f1_score(y_train_us, pred_us)} ')

Original accuracy: 0.8498162770698147 
Undersampled accuracy: 0.8243104102313987 
Original precision: 0.7314091000200441 
Undersampled precision: 0.7554891972597927 
Original recall: 0.5932368720533246 
Undersampled recall: 0.6992358966021786 
Original F1 score: 0.655116696588869 
Undersampled F1 score: 0.726274907125971 


In [8]:
import imblearn.over_sampling as OS

## Oversample by creating 4x positive samples

n_pos = y_train.sum()
n_neg = len(y_train) - n_pos
ratio = {1:n_pos * 4, 0:n_neg}

## Fit the model with oversampled training data
ROS = OS.RandomOverSampler(sampling_strategy= ratio, random_state=1024)
X_train_os, y_train_os = ROS.fit_resample(X_train, y_train)

lr_os = logreg.fit(X_train_os, y_train_os)

pred_os = lr_os.predict(X_train_os)


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [9]:
print(f'Original accuracy: {accuracy_score(y_train, pred_orig)} ')
print(f'Overasampled accuracy: {accuracy_score(y_train_os, pred_os)} ')
print(f'Original precision: {precision_score(y_train, pred_orig)} ')
print(f'Oversampled precision: {precision_score(y_train_os, pred_os)} ')
print(f'Original recall: {recall_score(y_train, pred_orig)} ')
print(f'Oversampled recall: {recall_score(y_train_os, pred_os)} ')
print(f'Original F1 score: {f1_score(y_train, pred_orig)} ')
print(f'Oversampled F1 score: {f1_score(y_train_os, pred_os)} ')

Original accuracy: 0.8498162770698147 
Overasampled accuracy: 0.8246394913137277 
Original precision: 0.7314091000200441 
Oversampled precision: 0.8208285822881034 
Original recall: 0.5932368720533246 
Oversampled recall: 0.8777434563485612 
Original F1 score: 0.655116696588869 
Oversampled F1 score: 0.8483324822249283 


In [10]:
import imblearn.over_sampling as OS

## Oversample by creating 4x positive samples

n_pos = y_train.sum()
n_neg = len(y_train) - n_pos
ratio = {1:n_pos * 4, 0:n_neg}

## SMOTE oversample for positives
## Fit the model with oversampled training data
SMT = OS.SMOTE(sampling_strategy= ratio, random_state=1024)
X_train_smt, y_train_smt = SMT.fit_resample(X_train, y_train)

lr_smt = logreg.fit(X_train_smt, y_train_smt)

pred_smt = lr_smt.predict(X_train_smt)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [11]:
print(f'Original accuracy: {accuracy_score(y_train, pred_orig)} ')
print(f'SMOTE Overasampled accuracy: {accuracy_score(y_train_smt, pred_smt)} ')
print(f'Original precision: {precision_score(y_train, pred_orig)} ')
print(f'SMOTE Oversampled precision: {precision_score(y_train_smt, pred_smt)} ')
print(f'Original recall: {recall_score(y_train, pred_orig)} ')
print(f'SMOTE Oversampled recall: {recall_score(y_train_smt, pred_smt)} ')
print(f'Original F1 score: {f1_score(y_train, pred_orig)} ')
print(f'SMOTE Oversampled F1 score: {f1_score(y_train_smt, pred_smt)} ')

Original accuracy: 0.8498162770698147 
SMOTE Overasampled accuracy: 0.8774838196888839 
Original precision: 0.7314091000200441 
SMOTE Oversampled precision: 0.8777581120943952 
Original recall: 0.5932368720533246 
SMOTE Oversampled recall: 0.9070476345309706 
Original F1 score: 0.655116696588869 
SMOTE Oversampled F1 score: 0.8921625457234804 


In [12]:
## testing the model with SMOTE oversampled model

print('--------------Test Results on SMOTE model-----------')
print(f'Original F1 Score: {f1_score(y_test, lr.predict(X_test))}')
print(f'SMOTE F1 Score: {f1_score(y_test, lr_smt.predict(X_test))}')

--------------Test Results on SMOTE model-----------
Original F1 Score: 0.6629866356769321
SMOTE F1 Score: 0.6629866356769321


In [13]:
income["SalStat"].value_counts(normalize=True)

SalStat
less than or equal to 50,000    0.759366
greater than 50,000             0.240634
Name: proportion, dtype: float64

### Tuning the class weights (hyperparameter)

In [14]:
logreg_balanced = LogisticRegression(class_weight="balanced")

lr_balanced = logreg_balanced.fit(X_train, y_train)

pred_balanced = lr_balanced.predict(X_test)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [15]:
print("------Evaluating the model------")
print('Accuracy: ', accuracy_score(y_test, pred_balanced))
print('Precision: ', precision_score(y_test, pred_balanced))
print('Recall: ', recall_score(y_test, pred_balanced))
print('F1 Score: ', f1_score(y_test, pred_balanced))

------Evaluating the model------
Accuracy:  0.7823639774859287
Precision:  0.5305711987127917
Recall:  0.8542746113989638
F1 Score:  0.654590570719603


In [16]:
logreg_4x = LogisticRegression(class_weight={1:4, 0:1})

lr_4x = logreg_4x.fit(X_train, y_train)

pred_4x = lr_4x.predict(X_test)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [17]:
print("------Evaluating the model------")
print('Accuracy: ', accuracy_score(y_test, pred_4x))
print('Precision: ', precision_score(y_test, pred_4x))
print('Recall: ', recall_score(y_test, pred_4x))
print('F1 Score: ', f1_score(y_test, pred_4x))

------Evaluating the model------
Accuracy:  0.7614133833646028
Precision:  0.5034856700232379
Recall:  0.8419689119170984
F1 Score:  0.6301502666020359


In [18]:
from sklearn.metrics import roc_curve, auc

y_probs = lr.predict_proba(X_test)[:,1]
fpr, tpr, thresholds = roc_curve(y_test, y_probs)
auc_score = auc(fpr, tpr)

y_balanced_probs = lr_balanced.predict_proba(X_test)[:,1]
fpr_balanced, tpr_balanced, thresholds_balanced = roc_curve(y_test, y_balanced_probs)
auc_score_balanced = auc(fpr_balanced, tpr_balanced)

y_4x_probs = lr_4x.predict_proba(X_test)[:,1]
fpr_4x, tpr_4x, thresholds_4x = roc_curve(y_test, y_4x_probs)
auc_score_4x = auc(fpr_4x, tpr_4x)

print('original auc:', auc_score)
print('balanced auc', auc_score_balanced)
print('4x auc', auc_score_4x)

original auc: 0.8867558087258499
balanced auc 0.8976575147687615
4x auc 0.8909324155628361
