# Handling imbalanced dataset
## Imbalanced datasets must be handled so that the ML model does not become biased towards the majority class
### Two methods
- Upsampling the minority class
- Downsampling the majority class

#### Creating the dataset

In [1]:
import numpy as np
import pandas as pd

# Seed NumPy's global RNG so the randomly drawn feature values are reproducible.
np.random.seed(123)

# Row counts for the synthetic two-class dataset.
n_samples = 1000
class_0_ratio = 0.9  # fraction of all rows that belong to class 0
# round() rather than int(): int() truncates, so a product that lands just
# below a whole number because of float error (e.g. 100 * 0.57 evaluates to
# 56.99999999999999) would silently lose one row.
n_class_0 = round(n_samples * class_0_ratio)
n_class_1 = n_samples - n_class_0  # the remaining rows belong to class 1

In [2]:
# Display the two class sizes as a (class 0, class 1) tuple.
n_class_0, n_class_1

(900, 100)

In [6]:
# Build one frame per class. The draws are kept in the order
# class 0 feature1 -> class 0 feature2 -> class 1 feature1 -> class 1 feature2
# so the seeded random stream is consumed exactly as before.

# Class 0: n_class_0 rows, both features drawn from a normal with mean 0, std 1.
class_0 = pd.DataFrame(dict(
    feature1=np.random.normal(0, 1, n_class_0),
    feature2=np.random.normal(0, 1, n_class_0),
    target=[0] * n_class_0,
))

# Class 1: n_class_1 rows, both features drawn from a normal with mean 2, std 1.
class_1 = pd.DataFrame(dict(
    feature1=np.random.normal(2, 1, n_class_1),
    feature2=np.random.normal(2, 1, n_class_1),
    target=[1] * n_class_1,
))

In [16]:
# Stack the class 0 rows on top of the class 1 rows and renumber the index from 0.
df = pd.concat(
    objs=[class_0, class_1],
    axis=0,
    ignore_index=True,
)
df

Unnamed: 0,feature1,feature2,target
0,-1.774224,0.285744,0
1,-1.201377,0.333279,0
2,1.096257,0.531807,0
3,0.861037,-0.354766,0
4,-1.520367,-1.120815,0
...,...,...,...
995,2.677156,1.092048,1
996,2.963404,0.181955,1
997,1.621476,1.877267,1
998,3.429559,3.794486,1


In [17]:
# Count the rows per label in the combined frame.
df['target'].value_counts()

target
0    900
1    100
Name: count, dtype: int64

In [18]:
# Split the combined frame by label: target == 1 rows form the minority frame,
# target == 0 rows form the majority frame.
minority_df = df.loc[df['target'].eq(1)]
majority_df = df.loc[df['target'].eq(0)]

## Upsampling using sklearn

In [20]:
from sklearn.utils import resample

# Upsample: draw minority rows with replacement until there are as many of
# them as there are majority rows. random_state fixes which rows are drawn.
minority_upsampled_df = resample(
    minority_df,
    n_samples=majority_df.shape[0],
    replace=True,
    random_state=43,
)

In [22]:
# Show (rows, columns) of the upsampled minority frame.
minority_upsampled_df.shape

(900, 3)

In [23]:
# Majority rows followed by the upsampled minority rows, with a fresh index from 0.
upsampled_df = pd.concat(
    objs=[majority_df, minority_upsampled_df],
    axis=0,
    ignore_index=True,
)
upsampled_df

Unnamed: 0,feature1,feature2,target
0,-1.774224,0.285744,0
1,-1.201377,0.333279,0
2,1.096257,0.531807,0
3,0.861037,-0.354766,0
4,-1.520367,-1.120815,0
...,...,...,...
1795,1.607934,1.420219,1
1796,0.781147,1.559140,1
1797,2.192494,1.422683,1
1798,2.797490,1.839661,1


In [24]:
# Count the rows per label after upsampling.
upsampled_df['target'].value_counts()

target
0    900
1    900
Name: count, dtype: int64

## Downsampling the majority class using sklearn

In [25]:
# Downsample: draw majority rows without replacement (so no row is picked
# twice) until there are as many of them as there are minority rows.
majority_downsampled_df = resample(
    majority_df,
    n_samples=minority_df.shape[0],
    replace=False,
    random_state=43,
)

In [26]:
# Show (rows, columns) of the downsampled majority frame.
majority_downsampled_df.shape

(100, 3)

In [27]:
# Downsampled majority rows followed by all minority rows, with a fresh index from 0.
downsampled_df = pd.concat(
    objs=[majority_downsampled_df, minority_df],
    axis=0,
    ignore_index=True,
)
downsampled_df

Unnamed: 0,feature1,feature2,target
0,-0.480315,-0.774853,0
1,2.184314,-0.322156,0
2,-0.520203,0.724457,0
3,0.551232,1.071139,0
4,-0.896718,2.540514,0
...,...,...,...
195,2.677156,1.092048,1
196,2.963404,0.181955,1
197,1.621476,1.877267,1
198,3.429559,3.794486,1


In [28]:
# Count the rows per label after downsampling.
downsampled_df['target'].value_counts()

target
0    100
1    100
Name: count, dtype: int64