This notebook gives an insight into handling imbalanced dataset. Ensemble techniques have been experimented with both imbalanced and balanced datasets to look into there performance in different scenarios. Observations have been made on oversampling techniques such as random oversampling and SMOTE.

In [1]:
import pandas as pd
df=pd.read_csv('creditcard.csv')
df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [2]:
df.shape

(284807, 31)

In [3]:
df['Class'].value_counts()

Class
0    284315
1       492
Name: count, dtype: int64

In [4]:
# Independent and Dependent Features
X=df.drop("Class",axis=1)
y=df.Class

In [None]:
# Cross Validation Like KFold and Hyperparameter Tuning

In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import KFold
import numpy as np
from sklearn.model_selection import GridSearchCV

In [60]:
log_class = LogisticRegression(solver='liblinear', max_iter=1000)
grid = {'C':10.0 **np.arange(-2,3),'penalty':['l1','l2']}
cv=KFold(n_splits=5,random_state=None,shuffle=False)

In [61]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,train_size=0.7)

In [62]:
clf = GridSearchCV(log_class, grid, cv=cv, n_jobs=-1, scoring='f1_macro')
clf.fit(X_train, y_train)


In [63]:
y_pred=clf.predict(X_test)
print(confusion_matrix(y_test,y_pred))
print(accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred))

[[85275    16]
 [   53    99]]
0.9991924440855307
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85291
           1       0.86      0.65      0.74       152

    accuracy                           1.00     85443
   macro avg       0.93      0.83      0.87     85443
weighted avg       1.00      1.00      1.00     85443



In [18]:
class_weight=dict({0:1,1:100})

In [21]:
from sklearn.ensemble import RandomForestClassifier
classifier=RandomForestClassifier(n_estimators=50, max_depth=10, n_jobs=-1, random_state=42)
classifier.fit(X_train,y_train)

In [17]:
y_pred=classifier.predict(X_test)
print(confusion_matrix(y_test,y_pred))
print(accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred))

[[85285     6]
 [   36   116]]
0.9995084442259752
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85291
           1       0.95      0.76      0.85       152

    accuracy                           1.00     85443
   macro avg       0.98      0.88      0.92     85443
weighted avg       1.00      1.00      1.00     85443



In [9]:
#Under sampling 

In [22]:
y_train.value_counts()

Class
0    199024
1       340
Name: count, dtype: int64

In [40]:
from collections import Counter
from imblearn.under_sampling import NearMiss

# Drops majority samples based on distance
# Minority class is now 80% of the majority
ns = NearMiss(sampling_strategy=0.8)

X_train_ns, y_train_ns = ns.fit_resample(X_train, y_train)

print("The number of classes before fit: {}".format(Counter(y_train)))
print("The number of classes after fit: {}".format(Counter(y_train_ns)))


The number of classes before fit: Counter({0: 199024, 1: 340})
The number of classes after fit: Counter({0: 425, 1: 340})


In [33]:
(80/100)*433

346.40000000000003

In [35]:
classifier=RandomForestClassifier(n_estimators=50, max_depth=10, n_jobs=-1, random_state=42)
classifier.fit(X_train_ns,y_train_ns)

In [36]:
y_pred=classifier.predict(X_test)
print(confusion_matrix(y_test,y_pred))
print(accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred))

[[73438 11853]
 [   12   140]]
0.8611354938379973
              precision    recall  f1-score   support

           0       1.00      0.86      0.93     85291
           1       0.01      0.92      0.02       152

    accuracy                           0.86     85443
   macro avg       0.51      0.89      0.47     85443
weighted avg       1.00      0.86      0.92     85443



In [None]:
# Precision for fraud has gone down. Never do undersampling unless and untill your dataset is very very less.

In [None]:
# Over Sampling

In [68]:
from imblearn.over_sampling import RandomOverSampler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from collections import Counter

In [72]:
ros = RandomOverSampler(random_state=42)
X_train_os, y_train_os = ros.fit_resample(X_train, y_train)

print("Before oversampling:", Counter(y_train))
print("After oversampling:", Counter(y_train_os))

Before oversampling: Counter({0: 199024, 1: 340})
After oversampling: Counter({0: 199024, 1: 199024})


In [70]:
classifier = RandomForestClassifier(n_estimators=50, max_depth=10, n_jobs=-1, random_state=42)
classifier.fit(X_train_os, y_train_os)

y_pred = classifier.predict(X_test)

In [71]:
y_pred = classifier.predict(X_test)


[[85268    23]
 [   34   118]]
0.9993328885923949
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85291
           1       0.84      0.78      0.81       152

    accuracy                           1.00     85443
   macro avg       0.92      0.89      0.90     85443
weighted avg       1.00      1.00      1.00     85443



In [None]:
# SMOTETomek (uses both undersampling and oversampling techniques)

In [75]:
from imblearn.combine import SMOTETomek
from collections import Counter

In [76]:
os = SMOTETomek(sampling_strategy=0.75, random_state=42)
X_train_ns, y_train_ns = os.fit_resample(X_train, y_train)

print("The number of classes before fit: {}".format(Counter(y_train)))
print("The number of classes after fit: {}".format(Counter(y_train_ns)))


The number of classes before fit: Counter({0: 199024, 1: 340})
The number of classes after fit: Counter({0: 198326, 1: 148570})


In [77]:
from sklearn.ensemble import RandomForestClassifier
classifier=RandomForestClassifier()
classifier.fit(X_train_ns,y_train_ns)

In [78]:
y_pred=classifier.predict(X_test)
print(confusion_matrix(y_test,y_pred))
print(accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred))

[[85276    15]
 [   30   122]]
0.9994733330992591
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85291
           1       0.89      0.80      0.84       152

    accuracy                           1.00     85443
   macro avg       0.95      0.90      0.92     85443
weighted avg       1.00      1.00      1.00     85443

