## Handling Imbalanced Dataset

In [None]:
import pandas as pd

In [None]:
df = pd.read_csv('creditcard.csv')
df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [None]:
df.shape

(284807, 31)

In [None]:
df.isnull().sum()

Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
Class     0
dtype: int64

In [None]:
df.isnull().count()

Time      284807
V1        284807
V2        284807
V3        284807
V4        284807
V5        284807
V6        284807
V7        284807
V8        284807
V9        284807
V10       284807
V11       284807
V12       284807
V13       284807
V14       284807
V15       284807
V16       284807
V17       284807
V18       284807
V19       284807
V20       284807
V21       284807
V22       284807
V23       284807
V24       284807
V25       284807
V26       284807
V27       284807
V28       284807
Amount    284807
Class     284807
dtype: int64

In [None]:
df['Class']

0         0
1         0
2         0
3         0
4         0
         ..
284802    0
284803    0
284804    0
284805    0
284806    0
Name: Class, Length: 284807, dtype: int64

In [None]:
df['Class'].value_counts()

0    284315
1       492
Name: Class, dtype: int64

### Independent and Dependent Features

In [None]:
X = df.drop("Class", axis=1)
y = df.Class

### Cross Validation llike KFold and Hyperparameter tuning

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
import numpy as np

In [None]:
10.0**np.arange(-2, 3)

array([1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02])

### Logistic Regression with HyperParameter Tuning and KFold

In [None]:
log_class = LogisticRegression()
grid = {'C': 10.0**np.arange(-2, 3), 'penalty': ['l1', 'l2']}
cv = KFold(n_splits=5, shuffle=False, random_state=None)

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7)

In [None]:
clf = GridSearchCV(log_class, grid, cv=cv, n_jobs=-1, scoring='f1_macro')
clf.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


GridSearchCV(cv=KFold(n_splits=5, random_state=None, shuffle=False),
             estimator=LogisticRegression(), n_jobs=-1,
             param_grid={'C': array([1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02]),
                         'penalty': ['l1', 'l2']},
             scoring='f1_macro')

In [None]:
y_pred = clf.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[85227    57]
 [   49   110]]
0.9987594068560327
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85284
           1       0.66      0.69      0.67       159

    accuracy                           1.00     85443
   macro avg       0.83      0.85      0.84     85443
weighted avg       1.00      1.00      1.00     85443



* If for an Imbalanced Dataset, you get a high accuray it does not mean that your model is good.

* Look at various performance metrics when working on an imbalanced dataset

* Precision and Recall score should be increased for the Class 1 (Fradulent Activity).

* Model is doing a decent job

### Random Forest Classification

In [None]:
X_train.shape

(199364, 30)

In [None]:
y_train.value_counts()

0    199031
1       333
Name: Class, dtype: int64

In [None]:
class_wieght = dict({0:1, 1:100})

In [None]:
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(class_weight=class_wieght)
classifier.fit(X_train, y_train)

RandomForestClassifier(class_weight={0: 1, 1: 100})

#### Results without Class Weights

In [None]:
y_pred = classifier.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[85275     9]
 [   37   122]]
0.9994616293903538
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85284
           1       0.93      0.77      0.84       159

    accuracy                           1.00     85443
   macro avg       0.97      0.88      0.92     85443
weighted avg       1.00      1.00      1.00     85443



#### Results with Class Weights

In [None]:
y_pred = classifier.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[85279     5]
 [   39   120]]
0.9994850368081645
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85284
           1       0.96      0.75      0.85       159

    accuracy                           1.00     85443
   macro avg       0.98      0.88      0.92     85443
weighted avg       1.00      1.00      1.00     85443



### 1. Under Sampling Technique

* Reduces the points of the maximum labels in our case it is (0 label)

* Not use this because there is loss of data

* Should use it when the dataset is not large enough

In [None]:
y_train.value_counts()

0    199031
1       333
Name: Class, dtype: int64

In [None]:
from collections import Counter
from imblearn.under_sampling import NearMiss

ns = NearMiss(sampling_strategy=0.8)
X_train_ns, y_train_ns = ns.fit_sample(X_train, y_train)
print("The number of class before Near Miss {}".format(Counter(y_train)))
print("The number of class after Near Miss {}".format(Counter(y_train_ns)))

The number of class before Near Miss Counter({0: 199031, 1: 333})
The number of class after Near Miss Counter({0: 416, 1: 333})


### Random Forest Classifer After Near Miss

In [None]:
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier()
classifier.fit(X_train_ns, y_train_ns)

RandomForestClassifier()

In [None]:
y_pred = classifier.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[68263 17021]
 [   16   143]]
0.8006039113795161
              precision    recall  f1-score   support

           0       1.00      0.80      0.89     85284
           1       0.01      0.90      0.02       159

    accuracy                           0.80     85443
   macro avg       0.50      0.85      0.45     85443
weighted avg       1.00      0.80      0.89     85443



* The accuracy was no where near the previous one so it is advisable to avoid Under Sampling if possible

### 2. Over Sampling Technique

* Increases the points of the minimum labels in our case it is (1 label)

* The points are created over the same points in the '1' label

In [None]:
from imblearn.over_sampling import RandomOverSampler

In [None]:
from collections import Counter

os = RandomOverSampler(sampling_strategy=0.75)
X_train_os, y_train_os = os.fit_sample(X_train, y_train)
print("The number of class before Over Sampler {}".format(Counter(y_train)))
print("The number of class after Over Sampler {}".format(Counter(y_train_os)))

The number of class before Over Sampler Counter({0: 199031, 1: 333})
The number of class after Over Sampler Counter({0: 199031, 1: 149273})


#### Random Forest Classifer after Over Sampling

In [None]:
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier()
classifier.fit(X_train_os, y_train_os)

RandomForestClassifier()

In [None]:
y_pred = classifier.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[85274    10]
 [   36   123]]
0.9994616293903538
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85284
           1       0.92      0.77      0.84       159

    accuracy                           1.00     85443
   macro avg       0.96      0.89      0.92     85443
weighted avg       1.00      1.00      1.00     85443



### 3. SMOTETomek Technique

* SMOTETomek technique creates new points of the minimum label ('1' label)

* Based on the nearest point of the label more points will **created** around it. Hence it takes more time

In [None]:
from imblearn.combine import SMOTETomek

In [None]:
st = SMOTETomek(sampling_strategy=0.75)
X_train_st, y_train_st = st.fit_sample(X_train, y_train)
print("The number of class before SmoteTomek {}".format(Counter(y_train)))
print("The number of class after SmoteTomek {}".format(Counter(y_train_st)))

The number of class before SmoteTomek Counter({0: 199031, 1: 333})
The number of class after SmoteTomek Counter({0: 198423, 1: 148665})


In [None]:
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier()
classifier.fit(X_train_st, y_train_st)

RandomForestClassifier()

In [None]:
y_pred = classifier.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[85266    18]
 [   33   126]]
0.999403110845827
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85284
           1       0.88      0.79      0.83       159

    accuracy                           1.00     85443
   macro avg       0.94      0.90      0.92     85443
weighted avg       1.00      1.00      1.00     85443



### Ensemble Techniques (Imbalanced Learn)

In [None]:
from imblearn.ensemble import EasyEnsembleClassifier

In [None]:
easy = EasyEnsembleClassifier()
easy.fit(X_train, y_train)

EasyEnsembleClassifier()

In [None]:
y_pred = easy.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[82875  2409]
 [   21   138]]
0.9715599873599944
              precision    recall  f1-score   support

           0       1.00      0.97      0.99     85284
           1       0.05      0.87      0.10       159

    accuracy                           0.97     85443
   macro avg       0.53      0.92      0.54     85443
weighted avg       1.00      0.97      0.98     85443

