# Handling Imbalanced Dataset

* Undersampling
* Oversampling
* SMOT
* Ensemble

To start working with imbalanced dataset, it is required to define the imbalanced tune-up model, then model can be improved by handling imbalanced dataset

In [1]:
%pylab inline
plt.style.use("bmh")
plt.rcParams["figure.figsize"] = (6,6)

Populating the interactive namespace from numpy and matplotlib


In [46]:
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import KFold, GridSearchCV

# sampling
from imblearn.under_sampling import NearMiss
from imblearn.over_sampling import RandomOverSampler
from imblearn.combine import SMOTETomek
from imblearn.ensemble import EasyEnsembleClassifier
from collections import Counter

import os
import sys

In [2]:
%load_ext autoreload
%autoreload 2

In [5]:
# When the decimal is a lot, to make the values readable it is better to modify the float format
pd.set_option('display.float_format', lambda x: '%.5f' % x)

In [6]:
import warnings
warnings.filterwarnings('ignore')

In [7]:
data = pd.read_csv('../../creditcard.csv')
data.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.35981,-0.07278,2.53635,1.37816,-0.33832,0.46239,0.2396,0.0987,0.36379,...,-0.01831,0.27784,-0.11047,0.06693,0.12854,-0.18911,0.13356,-0.02105,149.62,0
1,0.0,1.19186,0.26615,0.16648,0.44815,0.06002,-0.08236,-0.0788,0.0851,-0.25543,...,-0.22578,-0.63867,0.10129,-0.33985,0.16717,0.12589,-0.00898,0.01472,2.69,0
2,1.0,-1.35835,-1.34016,1.77321,0.37978,-0.5032,1.8005,0.79146,0.24768,-1.51465,...,0.248,0.77168,0.90941,-0.68928,-0.32764,-0.1391,-0.05535,-0.05975,378.66,0
3,1.0,-0.96627,-0.18523,1.79299,-0.86329,-0.01031,1.2472,0.23761,0.37744,-1.38702,...,-0.1083,0.00527,-0.19032,-1.17558,0.64738,-0.22193,0.06272,0.06146,123.5,0
4,2.0,-1.15823,0.87774,1.54872,0.40303,-0.40719,0.09592,0.59294,-0.27053,0.81774,...,-0.00943,0.79828,-0.13746,0.14127,-0.20601,0.50229,0.21942,0.21515,69.99,0


In [8]:
data.shape

(284807, 31)

In [9]:
data.isnull().sum()

Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
Class     0
dtype: int64

In [10]:
data.Class.value_counts()

0    284315
1       492
Name: Class, dtype: int64

In [11]:
# independant and dependant features
X = data.drop('Class',axis=1)
y = data.Class

## Unbalanced model

### Logistic Regression

In [12]:
X_train,X_test,y_train,y_test = train_test_split(X,y,train_size=0.7)

In [13]:
10.0**np.arange(-2,3)

array([1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02])

In [14]:
log_class = LogisticRegression()
grid={'C':10.0**np.arange(-2,3), 'penalty':['l1','l2']}
cv = KFold(n_splits=5, shuffle=False,random_state=None)

In [15]:
clf = GridSearchCV(log_class, grid,cv=cv,n_jobs=-1,scoring='f1_macro')
clf.fit(X_train,y_train)

GridSearchCV(cv=KFold(n_splits=5, random_state=None, shuffle=False),
             estimator=LogisticRegression(), n_jobs=-1,
             param_grid={'C': array([1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02]),
                         'penalty': ['l1', 'l2']},
             scoring='f1_macro')

In [16]:
y_pred = clf.predict(X_test)
print(confusion_matrix(y_test,y_pred))
print(accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred))

[[85255    34]
 [   43   111]]
0.9990988144142879
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85289
           1       0.77      0.72      0.74       154

    accuracy                           1.00     85443
   macro avg       0.88      0.86      0.87     85443
weighted avg       1.00      1.00      1.00     85443



### Random Forest
#### No class weights

In [17]:
classifier = RandomForestClassifier()
classifier.fit(X_train,y_train)

RandomForestClassifier()

In [18]:
y_pred = classifier.predict(X_test)
print(confusion_matrix(y_test,y_pred))
print(accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred))

[[85283     6]
 [   35   119]]
0.9995201479348805
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85289
           1       0.95      0.77      0.85       154

    accuracy                           1.00     85443
   macro avg       0.98      0.89      0.93     85443
weighted avg       1.00      1.00      1.00     85443



#### Adding class weights
you can use class_weight="balanced", which is easier to understand: it basically means replicating the smaller class until you have as many samples as in the larger one, but in an implicit way.

https://www.youtube.com/watch?v=Kp31wfHpG2c
##### Balanced
`n_samples / (n_classes * np.bincount(y))`

In [19]:
classifier_balanced = RandomForestClassifier(class_weight='balanced',n_jobs=-1)
classifier_balanced.fit(X_train,y_train)

RandomForestClassifier(class_weight='balanced', n_jobs=-1)

In [20]:
y_pred = classifier_balanced.predict(X_test)
print(confusion_matrix(y_test,y_pred))
print(accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred))

[[85283     6]
 [   38   116]]
0.9994850368081645
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85289
           1       0.95      0.75      0.84       154

    accuracy                           1.00     85443
   macro avg       0.98      0.88      0.92     85443
weighted avg       1.00      1.00      1.00     85443



##### Balanced_subsample

In [21]:
classifier_balanced_subsample = RandomForestClassifier(class_weight='balanced_subsample',n_jobs=-1)
classifier_balanced_subsample.fit(X_train,y_train)

RandomForestClassifier(class_weight='balanced_subsample', n_jobs=-1)

In [22]:
y_pred = classifier_balanced_subsample.predict(X_test)
print(confusion_matrix(y_test,y_pred))
print(accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred))

[[85282     7]
 [   35   119]]
0.9995084442259752
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85289
           1       0.94      0.77      0.85       154

    accuracy                           1.00     85443
   macro avg       0.97      0.89      0.92     85443
weighted avg       1.00      1.00      1.00     85443



## Under-sampling

Reduce the points of the maximun labels. **Do not use under-sampling unless the dataset is very very less**

* NearMiss Under Sampler
* Random Under Sampler

### NearMiss Under Sampler

NearMiss is an under-sampling technique. It aims to balance class distribution by randomly eliminating majority class examples. When instances of two different classes are very close to each other, we remove the instances of the majority class to increase the spaces between the two classes.

In [27]:
nm = NearMiss(n_jobs=-1)
X_train_nm, y_train_nm = nm.fit_resample(X_train,y_train)
print('The number of classes before fit {}'.format(Counter(y_train)))
print('The number of classes before fit {}'.format(Counter(y_train_nm)))

The number of classes before fit Counter({0: 199026, 1: 338})
The number of classes before fit Counter({0: 338, 1: 338})


In [28]:
classifier_nm = RandomForestClassifier(n_jobs=-1)
classifier_nm.fit(X_train_nm,y_train_nm)

RandomForestClassifier(n_jobs=-1)

In [29]:
y_pred = classifier_nm.predict(X_test)
print(confusion_matrix(y_test,y_pred))
print(accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred))

[[58499 26790]
 [    9   145]]
0.686352305045469
              precision    recall  f1-score   support

           0       1.00      0.69      0.81     85289
           1       0.01      0.94      0.01       154

    accuracy                           0.69     85443
   macro avg       0.50      0.81      0.41     85443
weighted avg       1.00      0.69      0.81     85443



## Over-sampling

In [39]:
os = RandomOverSampler()
X_train_os, y_train_os = os.fit_resample(X_train,y_train)
print('The number of classes before fit {}'.format(Counter(y_train)))
print('The number of classes before fit {}'.format(Counter(y_train_os)))

The number of classes before fit Counter({0: 199026, 1: 338})
The number of classes before fit Counter({0: 199026, 1: 199026})


In [40]:
classifier_os = RandomForestClassifier(n_jobs=-1)
classifier_os.fit(X_train_os,y_train_os)

RandomForestClassifier(n_jobs=-1)

In [41]:
y_pred = classifier_os.predict(X_test)
print(confusion_matrix(y_test,y_pred))
print(accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred))

[[85282     7]
 [   34   120]]
0.9995201479348805
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85289
           1       0.94      0.78      0.85       154

    accuracy                           1.00     85443
   macro avg       0.97      0.89      0.93     85443
weighted avg       1.00      1.00      1.00     85443



In [123]:
# Manually oversampling based on the all the data
print(data.Class.value_counts())
mbalanced_ds = pd.concat([data[data.loc[:,'Class']==0], data[data.loc[:,'Class']==1].sample(284315, replace=True)])
mbalanced_ds.Class.value_counts()

# In this case, for validation the following step should be done:
X_test_unique = X_test[~X_test.duplicated()]
y_test_unique = y_test[~X_test.duplicated()]

0    284315
1       492
Name: Class, dtype: int64


1    284315
0    284315
Name: Class, dtype: int64

## SMOT
SMOTE is one of the most popular oversampling techniques that is developed by Chawla et al. (2002). Unlike random oversampling that only duplicates some random examples from the minority class, SMOTE generates examples based on the distance of each data (usually using Euclidean distance) and the minority class nearest neighbors, so the generated examples are different from the original minority class.
To define the best ratio, it is better to watch https://www.youtube.com/watch?v=U3X98xZ4_no&t=36s

In [43]:
sm = SMOTETomek(0.5)
X_train_os, y_train_os = sm.fit_resample(X_train,y_train)
print('The number of classes before fit {}'.format(Counter(y_train)))
print('The number of classes before fit {}'.format(Counter(y_train_os)))

The number of classes before fit Counter({0: 199026, 1: 338})
The number of classes before fit Counter({0: 197900, 1: 98387})


In [44]:
classifier_os = RandomForestClassifier(n_jobs=-1)
classifier_os.fit(X_train_os,y_train_os)

RandomForestClassifier(n_jobs=-1)

In [45]:
y_pred = classifier_os.predict(X_test)
print(confusion_matrix(y_test,y_pred))
print(accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred))

[[85275    14]
 [   19   135]]
0.9996137776061234
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85289
           1       0.91      0.88      0.89       154

    accuracy                           1.00     85443
   macro avg       0.95      0.94      0.95     85443
weighted avg       1.00      1.00      1.00     85443



# Ensemble

In [47]:
easy = EasyEnsembleClassifier(n_jobs=-1)
easy.fit(X_train,y_train)

EasyEnsembleClassifier(n_jobs=-1)

In [48]:
y_pred = easy.predict(X_test)
print(confusion_matrix(y_test,y_pred))
print(accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred))

[[82252  3037]
 [   15   139]]
0.9642802804208653
              precision    recall  f1-score   support

           0       1.00      0.96      0.98     85289
           1       0.04      0.90      0.08       154

    accuracy                           0.96     85443
   macro avg       0.52      0.93      0.53     85443
weighted avg       1.00      0.96      0.98     85443

