#### Handling Imbalanced datasets

In [1]:
import pandas as pd 
import numpy as np 

In [10]:
np.random.seed(123)
n_sample = 1000
class_0_ratio = 0.9
n_class_0 = int(n_sample * class_0_ratio)
n_class_1 = int(n_sample / 10)
n_class_0  , n_class_1

(900, 100)

#### Creating a dataFrame with imbalanced dataset

In [18]:
class_0 = pd.DataFrame(
    {
        'feature_1': np.random.normal(loc=0,scale=1,size=n_class_0),
         'feature_2': np.random.normal(loc=0,scale=1,size=n_class_0),
         'target' : [0]*n_class_0
    }
)
class_1 = pd.DataFrame(
    {
        'feature_1': np.random.normal(loc=2,scale=1,size=n_class_1),
         'feature_2': np.random.normal(loc=2,scale=1,size=n_class_1),
         'target' : [1]*n_class_1
    }
)

In [19]:
class_1.head(),class_0.head()

(   feature_1  feature_2  target
 0  -0.643425   2.571923       1
 1   1.551009   1.782767       1
 2   1.641093   2.054318       1
 3   2.133194   2.155998       1
 4   1.355758   2.467810       1,
    feature_1  feature_2  target
 0  -1.774224   0.285744       0
 1  -1.201377   0.333279       0
 2   1.096257   0.531807       0
 3   0.861037  -0.354766       0
 4  -1.520367  -1.120815       0)

In [20]:
df = pd.concat([class_0,class_1]).reset_index(drop=True)

In [21]:
df.head()

Unnamed: 0,feature_1,feature_2,target
0,-1.774224,0.285744,0
1,-1.201377,0.333279,0
2,1.096257,0.531807,0
3,0.861037,-0.354766,0
4,-1.520367,-1.120815,0


In [35]:
df['feature_1'].median()

0.1434918080329678

In [22]:
df['target'].value_counts()

target
0    900
1    100
Name: count, dtype: int64

#### We can handles data_unbalancing by sampling

In [24]:
df_minority = df[df['target'] == 1]
df_mejority = df[df['target'] == 0]

df_minority,df_mejority

(     feature_1  feature_2  target
 900  -0.643425   2.571923       1
 901   1.551009   1.782767       1
 902   1.641093   2.054318       1
 903   2.133194   2.155998       1
 904   1.355758   2.467810       1
 ..         ...        ...     ...
 995   2.677156   1.092048       1
 996   2.963404   0.181955       1
 997   1.621476   1.877267       1
 998   3.429559   3.794486       1
 999   3.532273   1.679490       1
 
 [100 rows x 3 columns],
      feature_1  feature_2  target
 0    -1.774224   0.285744       0
 1    -1.201377   0.333279       0
 2     1.096257   0.531807       0
 3     0.861037  -0.354766       0
 4    -1.520367  -1.120815       0
 ..         ...        ...     ...
 895  -0.896718   2.540514       0
 896   0.812571  -0.082706       0
 897   0.637361   0.444621       0
 898   0.332712   1.896404       0
 899   0.258237   0.237581       0
 
 [900 rows x 3 columns])

In [26]:
from sklearn.utils import resample

updated_minority_data = resample(df_minority,n_samples=len(df_mejority) , random_state=42,replace=True)

In [28]:
updated_minority_data.shape, df_mejority.shape


((900, 3), (900, 3))

In [32]:
df_upsampled = pd.concat([df_mejority,updated_minority_data])
df_upsampled.head()
df_upsampled.shape

(1800, 3)

In [37]:
df['feature_1'].median(),df_upsampled['feature_1'].median()
df['feature_2'].median(),df_upsampled['feature_2'].median()

(0.13212647208803258, 1.0734584057633259)

In [39]:
df_upsampled['target'].value_counts()

target
0    900
1    900
Name: count, dtype: int64

In [41]:
df_minority = df[df['target'] == 1]
df_mejority = df[df['target'] == 0]
df_mejority.shape,df_minority.shape

((900, 3), (100, 3))

In [43]:
df_down_sample_mejority_updated = resample(df_mejority,replace=False,random_state=42,n_samples=len(df_minority))
df_down_sample_mejority_updated.shape

(100, 3)

In [44]:
df_down_sample_mejority_updated['target'].value_counts()

target
0    100
Name: count, dtype: int64

In [45]:
df_down_updated_sample = pd.concat([df_down_sample_mejority_updated,df_minority])

In [47]:
df_down_updated_sample['target'].value_counts()

target
0    100
1    100
Name: count, dtype: int64