# Handling Imbalanced data

### Banking data set

In [1]:
#### UNDERSAMPLING or DOWNSAMPLING


import os
import numpy as np
import pandas as pd
import sklearn


############################################

# pip install -U imbalanced-learn (in ANACONDA PROMPT)


from imblearn.over_sampling import RandomOverSampler  # Oversampling or Upsampling
from imblearn.under_sampling import RandomUnderSampler # Undersampling or Downsampling

In [2]:
# Importing the data

# Folder that holds Banking_data.csv. The default is the original location;
# set the BANKING_DATA_DIR environment variable to run the notebook on a
# machine where the file lives somewhere else.
DATA_DIR = os.environ.get('BANKING_DATA_DIR', 'D:/MLP_Session_26_JULY/MLP_12_16_08_21')

# The working directory is switched (instead of joining paths) because a later
# cell reads 'Banking_data.csv' again by its bare file name.
os.chdir(DATA_DIR)
data = pd.read_csv('Banking_data.csv')

In [3]:
# Preview the first five rows of the loaded frame
data.head(n=5)

Unnamed: 0,job,marital,default,housing,loan,poutcome,y
0,blue-collar,married,unknown,yes,no,nonexistent,0
1,technician,married,no,no,no,nonexistent,0
2,management,single,no,yes,no,success,1
3,services,married,no,no,no,nonexistent,0
4,retired,married,no,yes,no,success,1


In [4]:
# How many rows fall into each class (1 / 0) of the target column 'y'

data['y'].value_counts()

0    36548
1     4640
Name: y, dtype: int64

In [5]:
# The target is heavily skewed towards class 0, i.e. an imbalanced data set

# Same frequency check, done by splitting the frame into one subset per class

ones = data.loc[data['y'].eq(1)]
zeros = data.loc[data['y'].eq(0)]

print(ones.shape, zeros.shape)

(4640, 7) (36548, 7)


In [6]:
# Independent variables: every column except the last one.
# Dependent variable: the last column.

X, Y = data.iloc[:, :-1], data.iloc[:, -1]

In [7]:
# First five rows of the feature set
X.head(n=5)

Unnamed: 0,job,marital,default,housing,loan,poutcome
0,blue-collar,married,unknown,yes,no,nonexistent
1,technician,married,no,no,no,nonexistent
2,management,single,no,yes,no,success
3,services,married,no,no,no,nonexistent
4,retired,married,no,yes,no,success


In [8]:
# First five values of the target
Y.head(n=5)

0    0
1    0
2    1
3    0
4    1
Name: y, dtype: int64

In [9]:
# Number of missing entries in each column

data.isna().sum()

job         0
marital     0
default     0
housing     0
loan        0
poutcome    0
y           0
dtype: int64

In [10]:
# Performing Undersampling using RandomUnderSampler

# random_state fixes which majority-class rows are kept, so a
# Restart & Run All yields the same undersampled data every time.
# Without it, every run keeps a different random subset.
under_sampler = RandomUnderSampler(random_state=42)
X_us, y_us = under_sampler.fit_resample(X, Y)

In [11]:
# Dimensions of the features and target once undersampling is done

(X_us.shape, y_us.shape)

((9280, 6), (9280,))

In [12]:
# Dimensions of the features and target before any resampling

(X.shape, Y.shape)

((41188, 6), (41188,))

In [13]:
from collections import Counter

# Tally the rows per class in the original target
original_counts = Counter(Y)
print('Original dataset shape {}'.format(original_counts))

Original dataset shape Counter({0: 36548, 1: 4640})


In [15]:
# Tally the rows per class in the undersampled target
undersampled_counts = Counter(y_us)
print('Resampled dataset shape {}'.format(undersampled_counts))

Resampled dataset shape Counter({0: 4640, 1: 4640})


In [16]:
#### OVERSAMPLING

# Read the CSV again so this section begins from a freshly loaded frame,
# then preview its first five rows
data = pd.read_csv(filepath_or_buffer='Banking_data.csv')
data.head(n=5)


Unnamed: 0,job,marital,default,housing,loan,poutcome,y
0,blue-collar,married,unknown,yes,no,nonexistent,0
1,technician,married,no,no,no,nonexistent,0
2,management,single,no,yes,no,success,1
3,services,married,no,no,no,nonexistent,0
4,retired,married,no,yes,no,success,1


In [17]:
# (rows, columns) of the reloaded frame
n_rows, n_cols = data.shape
(n_rows, n_cols)

(41188, 7)

In [18]:
# Rows per class in the target column 'y'
data['y'].value_counts()

0    36548
1     4640
Name: y, dtype: int64

In [19]:
# Alternative check: one subset of the frame per class

ones = data.loc[data['y'].eq(1)]
zeros = data.loc[data['y'].eq(0)]

print(ones.shape, zeros.shape)

(4640, 7) (36548, 7)


In [20]:
# Build X and Y from the freshly loaded frame before inspecting them.
# Otherwise this cell reports whatever X/Y were left over from the
# undersampling section rather than this section's data.
X = data.iloc[:, :-1]
Y = data.iloc[:, -1]

print(X.shape)
print(Y.shape)

(41188, 6)
(41188,)


In [21]:
# Features: all columns but the last. Target: the last column.
X, Y = data.iloc[:, :-1], data.iloc[:, -1]

In [22]:
# First five rows of the feature set
X.head(n=5)

Unnamed: 0,job,marital,default,housing,loan,poutcome
0,blue-collar,married,unknown,yes,no,nonexistent
1,technician,married,no,no,no,nonexistent
2,management,single,no,yes,no,success
3,services,married,no,no,no,nonexistent
4,retired,married,no,yes,no,success


In [23]:
# First five values of the target
Y.head(n=5)

0    0
1    0
2    1
3    0
4    1
Name: y, dtype: int64

In [24]:
# Number of missing entries in each column
data.isna().sum()

job         0
marital     0
default     0
housing     0
loan        0
poutcome    0
y           0
dtype: int64

In [25]:
from imblearn.over_sampling import RandomOverSampler

# random_state fixes which minority-class rows are drawn as duplicates, so a
# Restart & Run All yields the same oversampled data every time.
# Without it, every run duplicates a different random selection.
over_sampler = RandomOverSampler(random_state=42)

X_os, y_os = over_sampler.fit_resample(X, Y)

In [27]:
# Dimensions of the features and target once oversampling is done
(X_os.shape, y_os.shape)

((73096, 6), (73096,))

In [28]:
# Dimensions of the features and target before oversampling
(X.shape, Y.shape)

((41188, 6), (41188,))

In [29]:
from collections import Counter

# Tally the rows per class in the original target
original_counts = Counter(Y)
print('Original dataset shape {}'.format(original_counts))

Original dataset shape Counter({0: 36548, 1: 4640})


In [30]:
# Tally the rows per class in the oversampled target
oversampled_counts = Counter(y_os)
print('Resampled dataset shape {}'.format(oversampled_counts))

Resampled dataset shape Counter({0: 36548, 1: 36548})
