**Handling an Imbalanced Dataset**
1) Up-sampling
2) Down-sampling

Creating imbalanced data

In [27]:
import pandas as pd
import numpy as np

# Number of samples for each class
num_class_1 = 900
num_class_0 = 100

# Generate feature data
np.random.seed(0)
feature_1 = np.random.randn(num_class_1 + num_class_0)
feature_2 = np.random.randn(num_class_1 + num_class_0)

# Generate target data with class imbalance
target = np.concatenate([np.ones(num_class_1), np.zeros(num_class_0)])

# Create DataFrame
data = pd.DataFrame({
    'Feature 1': feature_1,
    'Feature 2': feature_2,
    'Target': target.astype(int)  # Convert to integers (0 or 1)
})




In [28]:
# Preview the first five rows of the synthetic dataset
data.head()


Unnamed: 0,Feature 1,Feature 2,Target
0,1.764052,0.555963,1
1,0.400157,0.892474,1
2,0.978738,-0.422315,1
3,2.240893,0.104714,1
4,1.867558,0.228053,1


In [29]:
# Count the rows per class to confirm the imbalance in the generated data
data["Target"].value_counts()

Target
1    900
0    100
Name: count, dtype: int64

Up-sampling
    We'll increase the number of samples in the minority class by duplicating randomly chosen minority-class rows (sampling with replacement) until it is as large as the majority class.

In [30]:
# Split the rows by label using boolean masks: 0 is the rare class, 1 the common one
is_minority = data["Target"].eq(0)
is_majority = data["Target"].eq(1)
df_minority = data.loc[is_minority]
df_majority = data.loc[is_majority]

In [31]:
from sklearn.utils import resample

# Up-sample the minority class until it is as large as the majority class.
# replace=True draws with replacement, so a single minority row can appear
# several times in the result. The target size is taken from df_majority
# instead of a hard-coded 900, so the classes stay balanced even if the
# class sizes used to generate the data are changed.
df_minority_upsampled = resample(
    df_minority,
    replace=True,
    n_samples=len(df_majority),
    random_state=42,
)


In [32]:
# Expect one row per requested sample and the same three columns as `data`
df_minority_upsampled.shape

(900, 3)

In [33]:
# Preview the resampled rows; the index still shows each row's original label in `data`
df_minority_upsampled.head()

Unnamed: 0,Feature 1,Feature 2,Target
951,0.244443,-0.58958,0
992,-0.799422,1.315138,0
914,-1.226622,1.455808,0
971,0.968883,-0.709854,0
960,-0.365551,-0.208285,0


In [34]:
# Stack the up-sampled minority rows on top of the untouched majority rows.
# Index labels are carried over as-is, so repeated minority rows share a label.
df_upsampled = pd.concat(
    [df_minority_upsampled, df_majority],
    axis=0,
)

In [35]:
# Both classes should now have the same number of rows
df_upsampled["Target"].value_counts()

Target
0    900
1    900
Name: count, dtype: int64

Down-sampling: less preferable, because we discard data from the majority class.

In [36]:
# Down-sample the majority class to the size of the minority class.
# replace=False draws without replacement: there are more majority rows than
# we need, so no row has to be picked twice. The target size is taken from
# df_minority instead of a hard-coded 100, so the classes stay balanced even
# if the class sizes used to generate the data are changed.
df_majority_downsampled = resample(
    df_majority,
    replace=False,
    n_samples=len(df_minority),
    random_state=42,
)


In [37]:
# Expect as many rows as the minority class, with the same three columns as `data`
df_majority_downsampled.shape

(100, 3)

In [38]:
# Stack the reduced majority sample on top of the full minority class
df_downsampled = pd.concat(
    [df_majority_downsampled, df_minority],
    axis=0,
)

In [39]:
# Both classes should now have the same number of rows
df_downsampled['Target'].value_counts()

Target
1    100
0    100
Name: count, dtype: int64