In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [2]:
## Handling imbalanced datasets
## Upsampling and Downsampling
## 1. Upsampling - Increasing the number of samples in the minority class by replicating them.
## 2. Downsampling - Decreasing the number of samples in the majority class by removing them.
## 3. Combination of both - Using both upsampling and downsampling to balance the dataset.

In [3]:
# Seed NumPy's global RNG so every random draw below is repeatable.
np.random.seed(123)
# Total number of rows and the share of them that belong to the majority class (0).
n_samples = 1000
class_0_ratio = 0.9
# round() rather than int(): a float product can land a hair under the intended
# whole number (e.g. 100 * 0.29 -> 28.999...), and int() would truncate that to
# one row too few.
n_class_0 = round(n_samples * class_0_ratio)
# The minority class (1) takes the remaining rows, so the two sizes always sum
# to n_samples.
n_class_1 = n_samples - n_class_0

In [4]:
# Display both class sizes as a tuple to confirm the intended imbalance.
n_class_0, n_class_1

(900, 100)

In [5]:
## Create a dataframe with imbalanced classes
# Each class gets its own frame: two normally distributed features plus a
# constant label column. Class 0 is centred at 0 and class 1 at 2, both with
# unit standard deviation. The draws come from NumPy's global RNG, so the order
# of the calls below decides which values each column receives.
class_0 = pd.DataFrame({
    'feature_1': np.random.normal(0, 1, n_class_0),
    'feature_2': np.random.normal(0, 1, n_class_0),
}).assign(target=0)
class_1 = pd.DataFrame({
    'feature_1': np.random.normal(2, 1, n_class_1),
    'feature_2': np.random.normal(2, 1, n_class_1),
}).assign(target=1)

In [6]:
# Preview the combined dataset: both class frames stacked, with the index
# rebuilt from 0. The result is only displayed here, not stored.
pd.concat([class_0, class_1]).reset_index(drop=True)

Unnamed: 0,feature_1,feature_2,target
0,-1.085631,0.551302,0
1,0.997345,0.419589,0
2,0.282978,1.815652,0
3,-1.506295,-0.252750,0
4,-0.578600,-0.292004,0
...,...,...,...
995,1.376371,2.845701,1
996,2.239810,0.880077,1
997,1.131760,1.640703,1
998,2.902006,0.390305,1


In [7]:
## Upsampling the minority class
# Stack both class frames into a single dataset with a fresh 0..n-1 index,
# then split it by label: target == 0 is the majority class and target == 1
# the minority class.
df = pd.concat([class_0, class_1], ignore_index=True)
df_majority = df.loc[df['target'] == 0]
df_minority = df.loc[df['target'] == 1]

In [8]:
from sklearn.utils import resample

# Draw minority rows with replacement until the minority class has as many
# rows as the majority class. The fixed random_state makes the draw repeatable.
df_minority_upsampled = resample(
    df_minority,
    replace=True,
    n_samples=len(df_majority),
    random_state=123,
)

In [9]:
# Inspect the upsampled minority rows; an index label that appears more than
# once is a row that was drawn repeatedly by the with-replacement sampling.
df_minority_upsampled

Unnamed: 0,feature_1,feature_2,target
966,0.671933,1.370988,1
992,2.196570,1.397425,1
998,2.902006,0.390305,1
917,2.197269,4.216788,1
983,2.844335,2.015572,1
...,...,...,...
912,2.834499,2.773754,1
983,2.844335,2.015572,1
917,2.197269,4.216788,1
950,2.399896,-0.840847,1


In [10]:
# Recombine the untouched majority rows with the upsampled minority rows.
# ignore_index replaces the original (and now repeated) row labels with a
# clean 0..n-1 index.
df_upsampled = pd.concat([df_majority, df_minority_upsampled], ignore_index=True)

In [11]:
## Downsampling the majority class
# Derive the class splits again from the full dataset so this cell only needs
# `df`, not the variables created for the upsampling step.
df_majority = df.loc[df['target'] == 0]
df_minority = df.loc[df['target'] == 1]
# Keep a random subset of majority rows, drawn without replacement and sized to
# match the minority class. The fixed random_state makes the subset repeatable.
df_majority_downsampled = resample(
    df_majority,
    replace=False,
    n_samples=len(df_minority),
    random_state=123,
)
# Stack the reduced majority on top of the full minority and rebuild the index.
df_downsampled = pd.concat([df_majority_downsampled, df_minority], ignore_index=True)

In [12]:
# Inspect the majority rows kept by the downsampling step; they were drawn
# without replacement, so they still carry their original index labels.
df_majority_downsampled

Unnamed: 0,feature_1,feature_2,target
613,-1.147525,0.284447,0
524,-0.448392,0.777222,0
690,-1.576233,0.187258,0
457,1.601908,-1.080462,0
85,-1.085902,-1.775243,0
...,...,...,...
299,0.819761,-1.425211,0
266,0.635943,1.803586,0
727,0.747393,-0.466574,0
164,0.665653,0.434198,0
