# Handling Imbalanced Datasets

In [153]:
import numpy as np
import pandas as pd

In [154]:
np.random.seed(123)
## Seeding NumPy's global random number generator makes the np.random.* draws below repeat exactly on every run, as long as the seed value and the order in which the cells are executed stay the same.
# This matters for debugging and for sharing the notebook: anyone who runs it from the top gets the same generated data.
# Why 123?: The value is arbitrary; any fixed integer works. Only the fact that it is fixed makes the experiment reproducible.

In [155]:
## sizes of the two classes in the imbalanced dataset...
num_samples = 1000      # total number of rows to generate
class_0_ratio = 0.9     # share of the rows that belong to the majority class (0)
# round() rather than int(): int() truncates, so a product that lands just below a
# whole number because of floating-point error (e.g. 0.29 * 100 == 28.999999999999996)
# would silently lose one sample.
num_class_0 = round(class_0_ratio * num_samples)
num_class_1 = num_samples - num_class_0   # the remainder forms the minority class (1)

In [156]:
# Show the (majority, minority) sample counts computed above.
num_class_0,num_class_1

(900, 100)

In [157]:
## majority class (target 0) of the imbalanced dataset...
# Each feature is an independent draw of num_class_0 values from a normal distribution:
#   loc=0   -> the mean (center) of the distribution is 0
#   scale=1 -> the standard deviation (spread) of the distribution is 1
#   size    -> how many values to draw (one per majority-class row)
# The target column is a list of num_class_0 zeros, marking every row as class 0.
class_0 = pd.DataFrame(dict(
    feature_1=np.random.normal(loc=0, scale=1, size=num_class_0),
    feature_2=np.random.normal(loc=0, scale=1, size=num_class_0),
    target=[0] * num_class_0,
))

In [158]:
## minority class (target 1): same recipe, but both features are centered on 2...
class_1 = pd.DataFrame(dict(
    feature_1=np.random.normal(loc=2, scale=1, size=num_class_1),
    feature_2=np.random.normal(loc=2, scale=1, size=num_class_1),
    target=[1] * num_class_1,
))

In [159]:
# Stack the two classes and renumber the rows 0..n-1 in a single step.
df = pd.concat([class_0, class_1], ignore_index=True)

In [160]:
# Display the combined frame (pandas shows only the first and last rows).
df

Unnamed: 0,feature_1,feature_2,target
0,-1.085631,0.551302,0
1,0.997345,0.419589,0
2,0.282978,1.815652,0
3,-1.506295,-0.252750,0
4,-0.578600,-0.292004,0
...,...,...,...
995,1.376371,2.845701,1
996,2.239810,0.880077,1
997,1.131760,1.640703,1
998,2.902006,0.390305,1


In [161]:
# Count the rows per class to confirm the imbalance.
df['target'].value_counts()

target
0    900
1    100
Name: count, dtype: int64

### Upsampling

In classification problems, when the dataset is imbalanced (i.e., one class has many more samples than the other), most machine learning algorithms tend to be biased toward the majority class. Upsampling balances the class distribution by randomly re-drawing minority-class samples (with replacement) until the minority class is as large as the majority class, which lets the model learn more from the minority class.

In [162]:
# Split the frame by class label so each class can be handled on its own.
df_minority = df.loc[df['target'].eq(1)]
df_majority = df.loc[df['target'].eq(0)]

In [163]:
from sklearn.utils import resample

In [164]:
## upsampling the minority class....

# resample() arguments used here:
# - first positional argument: the data to resample (the minority rows).
# - replace=True: sample with replacement, so the same row can be drawn more than once.
# - n_samples=len(df_majority): draw as many rows as the majority class contains.
# - random_state=42: fixes the random draw so the result is reproducible.
## The result comes back as the same type as the input, i.e. a DataFrame....
df_minority_upsampled = resample(
    df_minority,
    replace=True,
    n_samples=len(df_majority),
    random_state=42,
)
df_minority_upsampled

Unnamed: 0,feature_1,feature_2,target
951,1.125854,1.843917,1
992,2.196570,1.397425,1
914,1.932170,2.998053,1
971,2.272825,3.034197,1
960,2.870056,1.550485,1
...,...,...,...
952,1.188902,2.189189,1
965,3.919526,1.980541,1
976,2.810326,3.604614,1
942,3.621531,2.168229,1


In [165]:
# Check the (rows, columns) of the upsampled minority frame.
df_minority_upsampled.shape

(900, 3)

In [166]:
## now concatenating the majority and the upsampled minority classes...
# Row labels are kept as they are, so minority rows that were drawn more than once
# share the same index label in the result.
df_upsampled = pd.concat(
    [df_majority, df_minority_upsampled],
)

In [167]:
df_upsampled['target'].value_counts() ## both classes ('0' and '1') now have the same number of rows...

target
0    900
1    900
Name: count, dtype: int64

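### Downsampling

Downsampling balances the classes the other way round: majority-class samples are randomly drawn without replacement until the majority class is as small as the minority class. No rows are duplicated, but the majority-class rows that are not drawn are discarded.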

In [168]:
## create a dataset...
# NOTE: the NumPy random stream is not re-seeded here, so these draws continue from
# where the earlier cells left off; the generated values therefore differ from the
# frame that was built for the upsampling example.

def make_class_frame(center, n_rows, label):
    """Return a DataFrame of ``n_rows`` samples that all carry ``label``.

    Parameters
    ----------
    center : float
        Mean of the normal distribution both features are drawn from
        (the standard deviation is fixed at 1).
    n_rows : int
        Number of rows to generate.
    label : int
        Value written to the ``target`` column of every row.

    Returns
    -------
    pandas.DataFrame
        Columns ``feature_1``, ``feature_2`` and ``target``.
    """
    return pd.DataFrame({
        # feature_1 is drawn before feature_2; keeping this order keeps the
        # generated values identical for a given random state.
        'feature_1' : np.random.normal(loc=center,scale=1,size=n_rows),
        'feature_2' : np.random.normal(loc=center,scale=1,size=n_rows),
        'target' : [label] * n_rows
    })


num_samples = 1000
class_0_ratio = 0.9
# round() rather than int(): int() truncates, so a product that lands just below a
# whole number because of floating-point error would silently lose one sample.
num_class_0 = round(class_0_ratio * num_samples)
num_class_1 = num_samples - num_class_0

# Majority class is centered on 0, minority class on 2 (drawn in that order).
class_0 = make_class_frame(center=0, n_rows=num_class_0, label=0)
class_1 = make_class_frame(center=2, n_rows=num_class_1, label=1)

df = pd.concat([class_0,class_1]).reset_index(drop=True)


In [169]:
# Separate the rare class (target 1) from the common class (target 0).
df_minority = df.loc[df['target'].eq(1)]
df_majority = df.loc[df['target'].eq(0)]

In [170]:
from sklearn.utils import resample

In [171]:
## downsampling the majority class to the size of the minority class...
# replace=False: we are shrinking the class, so each selected row is drawn at most once.
df_majority_downsampled = resample(
    df_majority,
    replace=False,
    n_samples=len(df_minority),
    random_state=42,
)

df_majority_downsampled

Unnamed: 0,feature_1,feature_2,target
70,0.786057,0.830076,0
827,-0.859775,0.174732,0
231,-0.012870,-0.735348,0
588,-0.297331,-0.351224,0
39,0.528702,1.508257,0
...,...,...,...
398,1.013666,0.083496,0
76,0.164480,-1.808980,0
196,-1.576624,0.525893,0
631,-0.540147,0.610282,0


In [172]:
# Check the (rows, columns) of the downsampled majority frame.
df_majority_downsampled.shape

(100, 3)

In [173]:
## now concatenating the downsampled majority and the (unchanged) minority classes...
df_downsampled = pd.concat(
    [df_majority_downsampled, df_minority],
)

df_downsampled

Unnamed: 0,feature_1,feature_2,target
70,0.786057,0.830076,0
827,-0.859775,0.174732,0
231,-0.012870,-0.735348,0
588,-0.297331,-0.351224,0
39,0.528702,1.508257,0
...,...,...,...
995,2.677156,1.092048,1
996,2.963404,0.181955,1
997,1.621476,1.877267,1
998,3.429559,3.794486,1


In [174]:
# Bracket access, matching the other value_counts() cells in this notebook;
# attribute-style access (df.target) stops working as soon as a column name
# collides with a DataFrame attribute or method.
df_downsampled['target'].value_counts()

target
0    100
1    100
Name: count, dtype: int64