# Handling Imbalanced Dataset

* Undersampling
* Oversampling
* SMOT
* Ensemble

To start working with imbalanced dataset, it is required to define the imbalanced tune-up model, then model can be improved by handling imbalanced dataset

In [19]:
%pylab inline
plt.style.use("bmh")
plt.rcParams["figure.figsize"] = (6,6)

Populating the interactive namespace from numpy and matplotlib


In [20]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [21]:
pd.set_option('display.float_format', lambda x: '%.5f' % x)

In [36]:
import warnings
warnings.filterwarnings('ignore')

In [126]:
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import KFold, GridSearchCV

# sampling
from imblearn.under_sampling import NearMiss
from imblearn.over_sampling import RandomOverSampler
from imblearn.combine import SMOTETomek
from collections import Counter

import os
import sys

In [6]:
data = pd.read_csv('../../creditcard.csv')
data.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [7]:
data.shape

(284807, 31)

In [8]:
data.isnull().sum()

Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
Class     0
dtype: int64

In [9]:
data.Class.value_counts()

0    284315
1       492
Name: Class, dtype: int64

In [10]:
# independant and dependant features
X = data.drop('Class',axis=1)
y = data.Class

## Unbalanced model

### Logistic Regression

In [24]:
X_train,X_test,y_train,y_test = train_test_split(X,y,train_size=0.7)

In [30]:
10.0**np.arange(-2,3)

array([1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02])

In [34]:
log_class = LogisticRegression()
grid={'C':10.0**np.arange(-2,3), 'penalty':['l1','l2']}
cv = KFold(n_splits=5, shuffle=False,random_state=None)

In [35]:
clf = GridSearchCV(log_class, grid,cv=cv,n_jobs=-1,scoring='f1_macro')
clf.fit(X_train,y_train)

        nan 0.84060328        nan 0.84577282]
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


GridSearchCV(cv=KFold(n_splits=5, random_state=None, shuffle=False),
             estimator=LogisticRegression(), n_jobs=-1,
             param_grid={'C': array([1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02]),
                         'penalty': ['l1', 'l2']},
             scoring='f1_macro')

In [37]:
y_pred = clf.predict(X_test)
print(confusion_matrix(y_test,y_pred))
print(accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred))

[[85269    31]
 [   46    97]]
0.9990988144142879
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85300
           1       0.76      0.68      0.72       143

    accuracy                           1.00     85443
   macro avg       0.88      0.84      0.86     85443
weighted avg       1.00      1.00      1.00     85443



### Random Forest
#### No class weights

In [41]:
classifier = RandomForestClassifier()
classifier.fit(X_train,y_train)

RandomForestClassifier()

In [42]:
y_pred = classifier.predict(X_test)
print(confusion_matrix(y_test,y_pred))
print(accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred))

[[85295     5]
 [   36   107]]
0.9995201479348805
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85300
           1       0.96      0.75      0.84       143

    accuracy                           1.00     85443
   macro avg       0.98      0.87      0.92     85443
weighted avg       1.00      1.00      1.00     85443



#### Adding class weights
##### Balanced
`n_samples / (n_classes * np.bincount(y))`

In [43]:
classifier_balanced = RandomForestClassifier(class_weight='balanced',n_jobs=-1)
classifier_balanced.fit(X_train,y_train)

RandomForestClassifier(class_weight='balanced')

In [44]:
y_pred = classifier_balanced.predict(X_test)
print(confusion_matrix(y_test,y_pred))
print(accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred))

[[85297     3]
 [   33   110]]
0.9995786664794073
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85300
           1       0.97      0.77      0.86       143

    accuracy                           1.00     85443
   macro avg       0.99      0.88      0.93     85443
weighted avg       1.00      1.00      1.00     85443



##### Balanced_subsample

In [45]:
classifier_balanced_subsample = RandomForestClassifier(class_weight='balanced_subsample',n_jobs=-1)
classifier_balanced_subsample.fit(X_train,y_train)

RandomForestClassifier(class_weight='balanced_subsample', n_jobs=-1)

In [46]:
y_pred = classifier_balanced_subsample.predict(X_test)
print(confusion_matrix(y_test,y_pred))
print(accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred))

[[85296     4]
 [   35   108]]
0.9995435553526912
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85300
           1       0.96      0.76      0.85       143

    accuracy                           1.00     85443
   macro avg       0.98      0.88      0.92     85443
weighted avg       1.00      1.00      1.00     85443



## Under-sampling

Reduce the points of the maximun labels. **Do not use under-sampling unless the dataset is very very less**

* NearMiss Under Sampler
* Random Under Sampler

### NearMiss Under Sampler

NearMiss is an under-sampling technique. It aims to balance class distribution by randomly eliminating majority class examples. When instances of two different classes are very close to each other, we remove the instances of the majority class to increase the spaces between the two classes.

In [68]:
nm = NearMiss(n_jobs=-1)
X_train_nm, y_train_nm = nm.fit_resample(X_train,y_train)
print('The number of classes before fit {}'.format(Counter(y_train)))
print('The number of classes before fit {}'.format(Counter(y_train_nm)))

The number of classes before fit Counter({0: 199015, 1: 349})
The number of classes before fit Counter({0: 349, 1: 349})


In [69]:
classifier_nm = RandomForestClassifier(n_jobs=-1)
classifier_nm.fit(X_train_nm,y_train_nm)

RandomForestClassifier(n_jobs=-1)

In [70]:
y_pred = classifier_nm.predict(X_test)
print(confusion_matrix(y_test,y_pred))
print(accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred))

[[59756 25544]
 [    8   135]]
0.700946830050443
              precision    recall  f1-score   support

           0       1.00      0.70      0.82     85300
           1       0.01      0.94      0.01       143

    accuracy                           0.70     85443
   macro avg       0.50      0.82      0.42     85443
weighted avg       1.00      0.70      0.82     85443



## Over-sampling

In [116]:
os = RandomOverSampler()
X_train_os, y_train_os = os.fit_resample(X_train,y_train)
print('The number of classes before fit {}'.format(Counter(y_train)))
print('The number of classes before fit {}'.format(Counter(y_train_os)))

The number of classes before fit Counter({0: 199015, 1: 349})
The number of classes before fit Counter({0: 199015, 1: 199015})


In [124]:
classifier_os = RandomForestClassifier(n_jobs=-1)
classifier_os.fit(X_train_os,y_train_os)

RandomForestClassifier(n_jobs=-1)

In [125]:
y_pred = classifier_os.predict(X_test)
print(confusion_matrix(y_test,y_pred))
print(accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred))

[[85295     5]
 [   32   111]]
0.9995669627705019
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85300
           1       0.96      0.78      0.86       143

    accuracy                           1.00     85443
   macro avg       0.98      0.89      0.93     85443
weighted avg       1.00      1.00      1.00     85443



In [123]:
# Manually oversampling based on the all the data
print(data.Class.value_counts())
mbalanced_ds = pd.concat([data[data.loc[:,'Class']==0], data[data.loc[:,'Class']==1].sample(284315, replace=True)])
mbalanced_ds.Class.value_counts()

# In this case, for validation the following step should be done:
X_test_unique = X_test[~X_test.duplicated()]
y_test_unique = y_test[~X_test.duplicated()]

0    284315
1       492
Name: Class, dtype: int64


1    284315
0    284315
Name: Class, dtype: int64

## SMOT

In [None]:
sm = SMOTETomek()
X_train_os, y_train_os = sm.fit_resample(X_train,y_train)
print('The number of classes before fit {}'.format(Counter(y_train)))
print('The number of classes before fit {}'.format(Counter(y_train_os)))