In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [2]:
df = pd.read_csv('creditcard.csv')
df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [3]:
df.shape

(284807, 31)

In [4]:
df.isnull().sum()

Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
Class     0
dtype: int64

In [6]:
df.Class.value_counts()

0    284315
1       492
Name: Class, dtype: int64

As we can see from above value counts, this data is imbalanced.

In [7]:
# dependent and independent features 
X = df.drop(['Class'], axis=1)
y = df['Class']

In [18]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=None, test_size=0.3)

#### Now we will check the performance of our model with imbalanced data

In [19]:
# checking Logistic Regression performance

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import KFold, GridSearchCV

In [13]:
logreg = LogisticRegression()

grid = {
    'C': 10.0 ** np.arange(-2,3),
    'penalty': ['l1', 'l2']
}

cv = KFold(n_splits=5, shuffle=False, random_state=None)

In [14]:
clf = GridSearchCV(estimator=logreg, param_grid=grid, cv=cv, n_jobs=-1, verbose=2,
                   scoring='f1_macro')
clf.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   12.9s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   32.8s finished
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


GridSearchCV(cv=KFold(n_splits=5, random_state=None, shuffle=False),
             estimator=LogisticRegression(), n_jobs=-1,
             param_grid={'C': array([1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02]),
                         'penalty': ['l1', 'l2']},
             scoring='f1_macro', verbose=2)

In [16]:
y_pred = clf.predict(X_test)

In [20]:
print(f'Confusion matrix: \n{confusion_matrix(y_test, y_pred)}')
print(f'Classification report: \n{classification_report(y_test, y_pred)}')
print(f'Accuracy score: \n{accuracy_score(y_test, y_pred)}')

Confusion matrix: 
[[85196    98]
 [  148     1]]
Classification report: 
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85294
           1       0.01      0.01      0.01       149

    accuracy                           1.00     85443
   macro avg       0.50      0.50      0.50     85443
weighted avg       1.00      1.00      1.00     85443

Accuracy score: 
0.9971208876092834


In [21]:
# Now checking RandomForestClassifier performance 

from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()
rf.fit(X_train, y_train)

RandomForestClassifier()

In [22]:
y_pred = rf.predict(X_test)

In [23]:
print(f'Confusion matrix: \n{confusion_matrix(y_test, y_pred)}')
print(f'Classification report: \n{classification_report(y_test, y_pred)}')
print(f'Accuracy score: \n{accuracy_score(y_test, y_pred)}')

Confusion matrix: 
[[85291     3]
 [   29   120]]
Classification report: 
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85294
           1       0.98      0.81      0.88       149

    accuracy                           1.00     85443
   macro avg       0.99      0.90      0.94     85443
weighted avg       1.00      1.00      1.00     85443

Accuracy score: 
0.9996254813150287


## UnderSampling 
We reduce the number of points which have maximum labels.

This has a very huge disadvantage, this reduces the amount of data and therefore should be used only when the dataset is not very large

In [2]:
df = pd.read_csv('creditcard.csv')
df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [3]:
df.Class.value_counts()

0    284315
1       492
Name: Class, dtype: int64

In [4]:
X = df.drop(['Class'], axis=1)
y = df['Class']

In [5]:
X.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,0.251412,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.069083,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.52498,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.208038,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,0.408542,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99


In [9]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=0)

In [10]:
y_train.value_counts()

0    199019
1       345
Name: Class, dtype: int64

Here we will try to reduce the number of samples of class label 0.

In [11]:
from imblearn.under_sampling import NearMiss
from collections import Counter

ns = NearMiss(sampling_strategy=0.8)
# 0.8 is the ration of number of points in minority class over those in majority class.

X_train_ns, y_train_ns = ns.fit_sample(X_train, y_train)
print(f"The number of class before the fit : {Counter(y_train)}")
print(f"The number of class after the fit : {Counter(y_train_ns)}")

The number of class before the fit : Counter({0: 199019, 1: 345})
The number of class after the fit : Counter({0: 431, 1: 345})


In [13]:
# Testing the results
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()
rf.fit(X_train_ns, y_train_ns)

RandomForestClassifier()

In [14]:
y_pred = rf.predict(X_test)

In [16]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

print(f'Confusion matrix: \n{confusion_matrix(y_test, y_pred)}')
print(f'Classification report: \n{classification_report(y_test, y_pred)}')
print(f'Accuracy score: \n{accuracy_score(y_test, y_pred)}')

Confusion matrix: 
[[61492 23804]
 [   11   136]]
Classification report: 
              precision    recall  f1-score   support

           0       1.00      0.72      0.84     85296
           1       0.01      0.93      0.01       147

    accuracy                           0.72     85443
   macro avg       0.50      0.82      0.42     85443
weighted avg       1.00      0.72      0.84     85443

Accuracy score: 
0.7212761724190396


As we can see, our precision value has reduced, andso has our accuracy, this is becuase we have reduced the data and therefore it is recommended to use under sampling only when we have a smaller dataset.

## Over Sampling
Here we increase the number of samples of minority class label.

In [17]:
from imblearn.over_sampling import RandomOverSampler

In [24]:
ros = RandomOverSampler(sampling_strategy=0.75)
X_train_os, y_train_os = ros.fit_sample(X_train, y_train)

In [25]:
print(f"The number of class before the fit : {Counter(y_train)}")
print(f"The number of class after the fit : {Counter(y_train_os)}")

The number of class before the fit : Counter({0: 199019, 1: 345})
The number of class after the fit : Counter({0: 199019, 1: 149264})


In [26]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestClassifier()
rf.fit(X_train_os, y_train_os)

RandomForestClassifier()

In [27]:
y_pred = rf.predict(X_test)

In [28]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

print(f'Confusion matrix: \n{confusion_matrix(y_test, y_pred)}')
print(f'Classification report: \n{classification_report(y_test, y_pred)}')
print(f'Accuracy score: \n{accuracy_score(y_test, y_pred)}')

Confusion matrix: 
[[85290     6]
 [   32   115]]
Classification report: 
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85296
           1       0.95      0.78      0.86       147

    accuracy                           1.00     85443
   macro avg       0.98      0.89      0.93     85443
weighted avg       1.00      1.00      1.00     85443

Accuracy score: 
0.9995552590615966


## SMOTE Tomek

In [29]:
from imblearn.combine import SMOTETomek

In [32]:
sm = SMOTETomek(sampling_strategy=0.75)
X_train_sm, y_train_sm = sm.fit_sample(X_train, y_train)

In [33]:
print(f"The number of class before the fit : {Counter(y_train)}")
print(f"The number of class after the fit : {Counter(y_train_sm)}")

The number of class before the fit : Counter({0: 199019, 1: 345})
The number of class after the fit : Counter({0: 198204, 1: 148449})


In [34]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()
rf.fit(X_train_sm, y_train_sm)

RandomForestClassifier()

In [35]:
y_pred = rf.predict(X_test)

In [36]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

print(f'Confusion matrix: \n{confusion_matrix(y_test, y_pred)}')
print(f'Classification report: \n{classification_report(y_test, y_pred)}')
print(f'Accuracy score: \n{accuracy_score(y_test, y_pred)}')

Confusion matrix: 
[[85284    12]
 [   26   121]]
Classification report: 
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85296
           1       0.91      0.82      0.86       147

    accuracy                           1.00     85443
   macro avg       0.95      0.91      0.93     85443
weighted avg       1.00      1.00      1.00     85443

Accuracy score: 
0.9995552590615966


## Ensemble Techniques

In [37]:
from imblearn.ensemble import EasyEnsembleClassifier

In [41]:
eclf = EasyEnsembleClassifier(sampling_strategy=0.75)
eclf.fit(X_train, y_train)

EasyEnsembleClassifier(sampling_strategy=0.75)

In [42]:
y_pred = eclf.predict(X_test)

In [43]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

print(f'Confusion matrix: \n{confusion_matrix(y_test, y_pred)}')
print(f'Classification report: \n{classification_report(y_test, y_pred)}')
print(f'Accuracy score: \n{accuracy_score(y_test, y_pred)}')

Confusion matrix: 
[[83468  1828]
 [   17   130]]
Classification report: 
              precision    recall  f1-score   support

           0       1.00      0.98      0.99     85296
           1       0.07      0.88      0.12       147

    accuracy                           0.98     85443
   macro avg       0.53      0.93      0.56     85443
weighted avg       1.00      0.98      0.99     85443

Accuracy score: 
0.9784066570696254


##### To get a better score, we can play with different parameters of the EasyEnsembleClassifier