In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
#import lightgbm as lgb

from sklearn.utils import resample
from imblearn.over_sampling import SMOTE
from imblearn.ensemble import BalancedBaggingClassifier

from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, recall_score, roc_auc_score, precision_score

import warnings
warnings.filterwarnings('ignore')
pd.options.mode.chained_assignment = None
pd.options.display.max_columns = 999
import os
os.environ['KMP_DUPLICATE_LIB_OK']='True'

import xgboost as xgb
import lightgbm as lgb

In [2]:
def score_plot(actual, predicted):
    y_actual = actual
    y_predicted = predicted
    sub_df = pd.DataFrame({"y_actual": y_actual, "y_predicted": y_predicted})

    f, ax = plt.subplots(nrows=1, ncols=1, sharex=False, sharey=False, squeeze=True, figsize=(16, 6))
    sns.distplot(1-sub_df[sub_df['y_actual']==1].y_predicted.values, hist=True, kde=True, rug=False, label="Bad", ax=ax)
    sns.distplot(1-sub_df[sub_df['y_actual']==0].y_predicted.values, hist=True, kde=True, rug=False, label="Good", ax=ax)
    plt.xlabel('Predicted score')
    plt.ylabel('Count')
    plt.title('Distribution of score')
    plt.legend(loc="upper right")
    plt.show()

In [3]:
## Creating Evalution Metrics Function which we will be using
def eval_metrics(actual, predicted, predicted_probability):
    print("Confusion Matrix")
    print(pd.DataFrame(confusion_matrix(actual, predicted)))
    print("")
    print("For Class 1")
    print("f1 Score :", f1_score(actual, predicted))
    print("Precision Score :",precision_score(actual, predicted))
    print("Recall Score :",recall_score(actual, predicted))
    print("")
    print("For Class 0")
    print("f1 Score :", f1_score(1-np.array(actual), 1-np.array(predicted)))
    print("Precision Score :",precision_score(1-np.array(actual), 1-np.array(predicted)))
    print("Recall Score :",recall_score(1-np.array(actual), 1-np.array(predicted)))
    print("")
    print("AUROC :",  roc_auc_score(actual, predicted_probability))
    #score_plot(actual, predicted)

In [4]:
## Data Source https://www.kaggle.com/mlg-ulb/creditcardfraud
df = pd.read_csv('SampleData.csv')
print(df.shape)
df.head()

(284807, 31)


Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,0.090794,-0.5516,-0.617801,-0.99139,-0.311169,1.468177,-0.470401,0.207971,0.025791,0.403993,0.251412,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,-0.166974,1.612727,1.065235,0.489095,-0.143772,0.635558,0.463917,-0.114805,-0.183361,-0.145783,-0.069083,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,0.207643,0.624501,0.066084,0.717293,-0.165946,2.345865,-2.890083,1.109969,-0.121359,-2.261857,0.52498,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,-0.054952,-0.226487,0.178228,0.507757,-0.287924,-0.631418,-1.059647,-0.684093,1.965775,-1.232622,-0.208038,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,0.753074,-0.822843,0.538196,1.345852,-1.11967,0.175121,-0.451449,-0.237033,-0.038195,0.803487,0.408542,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [5]:
## Class Distribution of the Data
df.Class.value_counts()

0    284315
1       492
Name: Class, dtype: int64

## About Data
Time, V1 to V25 and Amount are Vaariables (X)
<br>

Class is the label (Y)
- Class = 0 means not fraud
- Class = 1 means fraud
<br>

Problem Statement : Company wants to reduce their frauds, so create a model which can seperate Frauds from Non-Frauds

## Basic Model 

In [6]:
## Splitting Data in train test, which we will be using through out the exercise
Xtrain, Xtest, ytrain, ytest = train_test_split(df.drop(columns = 'Class'), df.Class, test_size=0.20, random_state=27)
print("Train Shape", Xtrain.shape)
print("Train Class Distribution")
print(ytrain.value_counts())
print("")
print("Test Shape", Xtest.shape)
print("Test Class Distribution")
print(ytest.value_counts())


Train Shape (227845, 30)
Train Class Distribution
0    227457
1       388
Name: Class, dtype: int64

Test Shape (56962, 30)
Test Class Distribution
0    56858
1      104
Name: Class, dtype: int64


In [7]:
# Lets Predict All 0 (Most Frequent)
pred_0 = [0]*len(ytest)

# Accuracy and Other Evalution Parameters
print('Test score: ', accuracy_score(ytest, pred_0))
print("Confusion Matrix")
print(pd.DataFrame(confusion_matrix(ytest, pred_0)))

Test score:  0.9981742214107651
Confusion Matrix
       0  1
0  56858  0
1    104  0


So By Predicting 0 we got a good accuracy but this does not serve the purpose of the problem
<br>
Our main aim is to seperate fraud and non-frauds
<br>
Hence Accuracy is not the right measure for handling this type of (imbalanced) data

### Now We will check following Evalution Metrics
- Precision Recall
- F1 Score (Harmonic Mean of Precision and Recall)
- AUROC (Degree or measure of separability)

In [8]:
eval_metrics(ytest, pred_0, pred_0)

Confusion Matrix
       0  1
0  56858  0
1    104  0

For Class 1
f1 Score : 0.0
Precision Score : 0.0
Recall Score : 0.0

For Class 0
f1 Score : 0.9990862765770515
Precision Score : 0.9981742214107651
Recall Score : 1.0

AUROC : 0.5


Precision, Recall, F1 Score and AUROC came out very bad
<br>
This shows accuracy is not the right measure for Imbalanced Data

# Logistic Regression

In [9]:
lrclf = LogisticRegression().fit(Xtrain, ytrain)
pred_test_lr_class = lrclf.predict(Xtest)
pred_test_lr = lrclf.predict_proba(Xtest)

In [10]:
# Checking accuracy
print("Accuracy :", accuracy_score(ytest, pred_test_lr_class))
eval_metrics(ytest, pred_test_lr_class, pred_test_lr[:,1])

Accuracy : 0.9991046662687406
Confusion Matrix
       0   1
0  56846  12
1     39  65

For Class 1
f1 Score : 0.7182320441988951
Precision Score : 0.8441558441558441
Recall Score : 0.625

For Class 0
f1 Score : 0.9995516207590797
Precision Score : 0.9993144062582403
Recall Score : 0.9997889479053079

AUROC : 0.9127790013988966


## Findings
- Accuracy as well as Precision, Recall, F1 Score and AUROC Improved
- Further Improvement will be checked

# Resampling Techniques 

## Undersampling Majority Class 

In [11]:
X = pd.concat([Xtrain, ytrain], axis=1)
# Seperating Data according to class
zero = X[X.Class == 0]
one = X[X.Class == 1]
print(len(one)/len(zero))

0.001705816923638314


In [12]:
# Undersampling Zeros (Majority Class)
# Zeros are around 580 time of Ones
zero_under1 = resample(zero, replace = False, n_samples = int(round(len(one)*1,0)), random_state = 111)
zero_under2 = resample(zero, replace = False, n_samples = int(round(len(one)*2,0)), random_state = 111)
zero_under3 = resample(zero, replace = False, n_samples = int(round(len(one)*5,0)), random_state = 111)
zero_under4 = resample(zero, replace = False, n_samples = int(round(len(one)*50,0)), random_state = 111)
zero_under5 = resample(zero, replace = False, n_samples = int(round(len(one)*100,0)), random_state = 111)
zero_under6 = resample(zero, replace = False, n_samples = int(round(len(one)*200,0)), random_state = 111)
zero_under7 = resample(zero, replace = False, n_samples = int(round(len(one)*500,0)), random_state = 111)

# Final Undersampled Data
undersampled1 = pd.concat([zero_under1, one]).reset_index(drop = True)
undersampled2 = pd.concat([zero_under2, one]).reset_index(drop = True)
undersampled3 = pd.concat([zero_under3, one]).reset_index(drop = True)
undersampled4 = pd.concat([zero_under4, one]).reset_index(drop = True)
undersampled5 = pd.concat([zero_under5, one]).reset_index(drop = True)
undersampled6 = pd.concat([zero_under6, one]).reset_index(drop = True)
undersampled7 = pd.concat([zero_under7, one]).reset_index(drop = True)

print("Sample 1")
print(undersampled1.Class.value_counts())
print("Sample 2")
print(undersampled2.Class.value_counts())
print("Sample 3")
print(undersampled3.Class.value_counts())
print("Sample 4")
print(undersampled4.Class.value_counts())
print("Sample 5")
print(undersampled5.Class.value_counts())
print("Sample 6")
print(undersampled6.Class.value_counts())
print("Sample 7")
print(undersampled7.Class.value_counts())

Sample 1
1    388
0    388
Name: Class, dtype: int64
Sample 2
0    776
1    388
Name: Class, dtype: int64
Sample 3
0    1940
1     388
Name: Class, dtype: int64
Sample 4
0    19400
1      388
Name: Class, dtype: int64
Sample 5
0    38800
1      388
Name: Class, dtype: int64
Sample 6
0    77600
1      388
Name: Class, dtype: int64
Sample 7
0    194000
1       388
Name: Class, dtype: int64


In [13]:
# Model Fitting on Undersampled Data
us_lrclf1 = LogisticRegression().fit(undersampled1.drop(columns = 'Class'), undersampled1.Class)
us_lrclf2 = LogisticRegression().fit(undersampled2.drop(columns = 'Class'), undersampled2.Class)
us_lrclf3 = LogisticRegression().fit(undersampled3.drop(columns = 'Class'), undersampled3.Class)
us_lrclf4 = LogisticRegression().fit(undersampled4.drop(columns = 'Class'), undersampled4.Class)
us_lrclf5 = LogisticRegression().fit(undersampled5.drop(columns = 'Class'), undersampled5.Class)
us_lrclf6 = LogisticRegression().fit(undersampled6.drop(columns = 'Class'), undersampled6.Class)
us_lrclf7 = LogisticRegression().fit(undersampled7.drop(columns = 'Class'), undersampled7.Class)

# Predicting on Test
pred_test_lr_us_class1 = us_lrclf1.predict(Xtest)
pred_test_lr_us_class2 = us_lrclf2.predict(Xtest)
pred_test_lr_us_class3 = us_lrclf3.predict(Xtest)
pred_test_lr_us_class4 = us_lrclf4.predict(Xtest)
pred_test_lr_us_class5 = us_lrclf5.predict(Xtest)
pred_test_lr_us_class6 = us_lrclf6.predict(Xtest)
pred_test_lr_us_class7 = us_lrclf7.predict(Xtest)

pred_test_lr_us1 = us_lrclf1.predict_proba(Xtest)
pred_test_lr_us2 = us_lrclf2.predict_proba(Xtest)
pred_test_lr_us3 = us_lrclf3.predict_proba(Xtest)
pred_test_lr_us4 = us_lrclf4.predict_proba(Xtest)
pred_test_lr_us5 = us_lrclf5.predict_proba(Xtest)
pred_test_lr_us6 = us_lrclf6.predict_proba(Xtest)
pred_test_lr_us7 = us_lrclf7.predict_proba(Xtest)


In [14]:
# Checking accuracy
print("Sample 1")
print("Accuracy : ",accuracy_score(ytest, pred_test_lr_us_class1))
eval_metrics(ytest, pred_test_lr_us_class1, pred_test_lr_us1[:,1])
print("")
print("Sample 2")
print("Accuracy : ",accuracy_score(ytest, pred_test_lr_us_class2))
eval_metrics(ytest, pred_test_lr_us_class2, pred_test_lr_us2[:,1])
print("")
print("Sample 3")
print("Accuracy : ",accuracy_score(ytest, pred_test_lr_us_class3))
eval_metrics(ytest, pred_test_lr_us_class3, pred_test_lr_us3[:,1])
print("")
print("Sample 4")
print("Accuracy : ",accuracy_score(ytest, pred_test_lr_us_class4))
eval_metrics(ytest, pred_test_lr_us_class4, pred_test_lr_us4[:,1])
print("")
print("Sample 5")
print("Accuracy : ",accuracy_score(ytest, pred_test_lr_us_class5))
eval_metrics(ytest, pred_test_lr_us_class5, pred_test_lr_us5[:,1])
print("")
print("Sample 6")
print("Accuracy : ",accuracy_score(ytest, pred_test_lr_us_class6))
eval_metrics(ytest, pred_test_lr_us_class6, pred_test_lr_us6[:,1])
print("")
print("Sample 7")
print("Accuracy : ",accuracy_score(ytest, pred_test_lr_us_class7))
eval_metrics(ytest, pred_test_lr_us_class7, pred_test_lr_us7[:,1])

Sample 1
Accuracy :  0.9724026544011798
Confusion Matrix
       0     1
0  55299  1559
1     13    91

For Class 1
f1 Score : 0.10376282782212086
Precision Score : 0.05515151515151515
Recall Score : 0.875

For Class 0
f1 Score : 0.9859855576357315
Precision Score : 0.9997649696268441
Recall Score : 0.9725808153645925

AUROC : 0.9627969949428671

Sample 2
Accuracy :  0.9925037744461219
Confusion Matrix
       0    1
0  56451  407
1     20   84

For Class 1
f1 Score : 0.2823529411764706
Precision Score : 0.1710794297352342
Recall Score : 0.8076923076923077

For Class 0
f1 Score : 0.9962322088785749
Precision Score : 0.9996458359157798
Recall Score : 0.9928418164550283

AUROC : 0.937597070434578

Sample 3
Accuracy :  0.9960675538078017
Confusion Matrix
       0    1
0  56662  196
1     28   76

For Class 1
f1 Score : 0.4042553191489362
Precision Score : 0.27941176470588236
Recall Score : 0.7307692307692307

For Class 0
f1 Score : 0.9980272660020432
Precision Score : 0.9995060857294056
Rec

## Findings
### Simple Logistic Regression Result
- Accuracy : 0.999
- f1 Score : 0.718
- Precision Score : 0.844
- Recall Score : 0.625
- AUROC : 0.913
### Best Result after Undersampling
- Accuracy ~ 0.999
- f1 Score - 0.702
- Precision Score - 0.786
- Recall Score - 0.634
- AUROC - 0.917

Recall and AUROC has improved while Precision and F1 Score does not
<br>
Reason could be we are missing some info by eliminating some values and model is predicting some 0 Class as 1 Class this results in decrease in precision and F1 Score

## Oversampling Minority Class 

In [15]:
# Over Sampling Class 1
one_over1 = resample(one, replace = True, n_samples = int(round(len(zero)*1,0)), random_state = 111)
one_over2 = resample(one, replace = True, n_samples = int(round(len(zero)*0.75,0)), random_state = 111)
one_over3 = resample(one, replace = True, n_samples = int(round(len(zero)*0.5,0)), random_state = 111)
one_over4 = resample(one, replace = True, n_samples = int(round(len(zero)*0.25,0)), random_state = 111)
one_over5 = resample(one, replace = True, n_samples = int(round(len(zero)*0.1,0)), random_state = 111)
one_over6 = resample(one, replace = True, n_samples = int(round(len(zero)*0.02,0)), random_state = 111)
one_over7 = resample(one, replace = True, n_samples = int(round(len(zero)*0.005,0)), random_state = 111)
# Oversampled Data
oversampled1 = pd.concat([zero, one_over1]).reset_index(drop = True)
oversampled2 = pd.concat([zero, one_over2]).reset_index(drop = True)
oversampled3 = pd.concat([zero, one_over3]).reset_index(drop = True)
oversampled4 = pd.concat([zero, one_over4]).reset_index(drop = True)
oversampled5 = pd.concat([zero, one_over5]).reset_index(drop = True)
oversampled6 = pd.concat([zero, one_over6]).reset_index(drop = True)
oversampled7 = pd.concat([zero, one_over7]).reset_index(drop = True)
print("Sample 1")
print(oversampled1.Class.value_counts())
print("Sample 2")
print(oversampled2.Class.value_counts())
print("Sample 3")
print(oversampled3.Class.value_counts())
print("Sample 4")
print(oversampled4.Class.value_counts())
print("Sample 5")
print(oversampled5.Class.value_counts())
print("Sample 6")
print(oversampled6.Class.value_counts())
print("Sample 7")
print(oversampled7.Class.value_counts())

Sample 1
1    227457
0    227457
Name: Class, dtype: int64
Sample 2
0    227457
1    170593
Name: Class, dtype: int64
Sample 3
0    227457
1    113728
Name: Class, dtype: int64
Sample 4
0    227457
1     56864
Name: Class, dtype: int64
Sample 5
0    227457
1     22746
Name: Class, dtype: int64
Sample 6
0    227457
1      4549
Name: Class, dtype: int64
Sample 7
0    227457
1      1137
Name: Class, dtype: int64


In [16]:
os_lrclf1 = LogisticRegression().fit(oversampled1.drop(columns = 'Class'), oversampled1.Class)
os_lrclf2 = LogisticRegression().fit(oversampled2.drop(columns = 'Class'), oversampled2.Class)
os_lrclf3 = LogisticRegression().fit(oversampled3.drop(columns = 'Class'), oversampled3.Class)
os_lrclf4 = LogisticRegression().fit(oversampled4.drop(columns = 'Class'), oversampled4.Class)
os_lrclf5 = LogisticRegression().fit(oversampled5.drop(columns = 'Class'), oversampled5.Class)
os_lrclf6 = LogisticRegression().fit(oversampled6.drop(columns = 'Class'), oversampled6.Class)
os_lrclf7 = LogisticRegression().fit(oversampled7.drop(columns = 'Class'), oversampled7.Class)

pred_test_lr_os1 = os_lrclf1.predict_proba(Xtest)
pred_test_lr_os2 = os_lrclf2.predict_proba(Xtest)
pred_test_lr_os3 = os_lrclf3.predict_proba(Xtest)
pred_test_lr_os4 = os_lrclf4.predict_proba(Xtest)
pred_test_lr_os5 = os_lrclf5.predict_proba(Xtest)
pred_test_lr_os6 = os_lrclf6.predict_proba(Xtest)
pred_test_lr_os7 = os_lrclf7.predict_proba(Xtest)

pred_test_lr_os_class1 = os_lrclf1.predict(Xtest)
pred_test_lr_os_class2 = os_lrclf2.predict(Xtest)
pred_test_lr_os_class3 = os_lrclf3.predict(Xtest)
pred_test_lr_os_class4 = os_lrclf4.predict(Xtest)
pred_test_lr_os_class5 = os_lrclf5.predict(Xtest)
pred_test_lr_os_class6 = os_lrclf6.predict(Xtest)
pred_test_lr_os_class7 = os_lrclf7.predict(Xtest)

In [17]:
print("Sample 1")
print("Accuracy :", accuracy_score(ytest, pred_test_lr_os_class1))
eval_metrics(ytest, pred_test_lr_os_class1, pred_test_lr_os1[:,1])
print("")
print("Sample 2")
print("Accuracy :", accuracy_score(ytest, pred_test_lr_os_class2))
eval_metrics(ytest, pred_test_lr_os_class2, pred_test_lr_os2[:,1])
print("")
print("Sample 3")
print("Accuracy :", accuracy_score(ytest, pred_test_lr_os_class3))
eval_metrics(ytest, pred_test_lr_os_class3, pred_test_lr_os3[:,1])
print("")
print("Sample 4")
print("Accuracy :", accuracy_score(ytest, pred_test_lr_os_class4))
eval_metrics(ytest, pred_test_lr_os_class4, pred_test_lr_os4[:,1])
print("")
print("Sample 5")
print("Accuracy :", accuracy_score(ytest, pred_test_lr_os_class5))
eval_metrics(ytest, pred_test_lr_os_class5, pred_test_lr_os5[:,1])
print("")
print("Sample 6")
print("Accuracy :", accuracy_score(ytest, pred_test_lr_os_class6))
eval_metrics(ytest, pred_test_lr_os_class6, pred_test_lr_os6[:,1])
print("")
print("Sample 7")
print("Accuracy :", accuracy_score(ytest, pred_test_lr_os_class7))
eval_metrics(ytest, pred_test_lr_os_class7, pred_test_lr_os7[:,1])

Sample 1
Accuracy : 0.9808819915031073
Confusion Matrix
       0     1
0  55783  1075
1     14    90

For Class 1
f1 Score : 0.14184397163120568
Precision Score : 0.07725321888412018
Recall Score : 0.8653846153846154

For Class 0
f1 Score : 0.9903333185389019
Precision Score : 0.9997490904528917
Recall Score : 0.9810932498505047

AUROC : 0.9569983048187523

Sample 2
Accuracy : 0.9897826621256276
Confusion Matrix
       0    1
0  56296  562
1     20   84

For Class 1
f1 Score : 0.22400000000000003
Precision Score : 0.13003095975232198
Recall Score : 0.8076923076923077

For Class 0
f1 Score : 0.9948574760987507
Precision Score : 0.999644861140706
Recall Score : 0.9901157268985895

AUROC : 0.9405786886088692

Sample 3
Accuracy : 0.993750219444542
Confusion Matrix
       0    1
0  56524  334
1     22   82

For Class 1
f1 Score : 0.3153846153846154
Precision Score : 0.1971153846153846
Recall Score : 0.7884615384615384

For Class 0
f1 Score : 0.9968607809248351
Precision Score : 0.9996109362

In [None]:
pred_train_lr_os7 = os_lrclf7.predict_proba(Xtrain)
pred_train_lr_os_class7 = os_lrclf7.predict(Xtrain)
print("Accuracy :", accuracy_score(ytrain, pred_train_lr_os_class7))
eval_metrics(ytrain, pred_train_lr_os_class7, pred_train_lr_os7[:,1])

## Findings
### Simple Logistic Regression Result
- Accuracy : 0.999
- f1 Score : 0.718
- Precision Score : 0.844
- Recall Score : 0.625
- AUROC : 0.913

### Best Result after oversampling
- Accuracy : 0.999
- f1 Score : 0.733
- Precision Score : 0.755
- Recall Score : 0.712
- AUROC - 0.921

So Oversampling gave better results than Logistic regression, but we have to tune Oversampling Parameter

## Generate Synthetic Samples

In [18]:
Xtrain_sm1, ytrain_sm1 = SMOTE(random_state=27, ratio=1.0).fit_sample(Xtrain, ytrain)
Xtrain_sm2, ytrain_sm2 = SMOTE(random_state=27, ratio=0.8).fit_sample(Xtrain, ytrain)
Xtrain_sm3, ytrain_sm3 = SMOTE(random_state=27, ratio=0.5).fit_sample(Xtrain, ytrain)
Xtrain_sm4, ytrain_sm4 = SMOTE(random_state=27, ratio=0.1).fit_sample(Xtrain, ytrain)
Xtrain_sm5, ytrain_sm5 = SMOTE(random_state=27, ratio=0.05).fit_sample(Xtrain, ytrain)
Xtrain_sm6, ytrain_sm6 = SMOTE(random_state=27, ratio=0.01).fit_sample(Xtrain, ytrain)
Xtrain_sm7, ytrain_sm7 = SMOTE(random_state=27, ratio=0.005).fit_sample(Xtrain, ytrain)

In [19]:
print("Normal Data")
print(ytrain.value_counts())
print("Sample 1")
print(pd.DataFrame(ytrain_sm1)[0].value_counts())
print("Sample 2")
print(pd.DataFrame(ytrain_sm2)[0].value_counts())
print("Sample 3")
print(pd.DataFrame(ytrain_sm3)[0].value_counts())
print("Sample 4")
print(pd.DataFrame(ytrain_sm4)[0].value_counts())
print("Sample 5")
print(pd.DataFrame(ytrain_sm5)[0].value_counts())
print("Sample 6")
print(pd.DataFrame(ytrain_sm6)[0].value_counts())
print("Sample 7")
print(pd.DataFrame(ytrain_sm7)[0].value_counts())

Normal Data
0    227457
1       388
Name: Class, dtype: int64
Sample 1
1    227457
0    227457
Name: 0, dtype: int64
Sample 2
0    227457
1    181965
Name: 0, dtype: int64
Sample 3
0    227457
1    113728
Name: 0, dtype: int64
Sample 4
0    227457
1     22745
Name: 0, dtype: int64
Sample 5
0    227457
1     11372
Name: 0, dtype: int64
Sample 6
0    227457
1      2274
Name: 0, dtype: int64
Sample 7
0    227457
1      1137
Name: 0, dtype: int64


In [20]:
sm_lrclf1 = LogisticRegression().fit(Xtrain_sm1, ytrain_sm1)
sm_lrclf2 = LogisticRegression().fit(Xtrain_sm2, ytrain_sm2)
sm_lrclf3 = LogisticRegression().fit(Xtrain_sm3, ytrain_sm3)
sm_lrclf4 = LogisticRegression().fit(Xtrain_sm4, ytrain_sm4)
sm_lrclf5 = LogisticRegression().fit(Xtrain_sm5, ytrain_sm5)
sm_lrclf6 = LogisticRegression().fit(Xtrain_sm6, ytrain_sm6)
sm_lrclf7 = LogisticRegression().fit(Xtrain_sm7, ytrain_sm7)

pred_test_lr_sm1 = sm_lrclf1.predict_proba(Xtest)
pred_test_lr_sm2 = sm_lrclf2.predict_proba(Xtest)
pred_test_lr_sm3 = sm_lrclf3.predict_proba(Xtest)
pred_test_lr_sm4 = sm_lrclf4.predict_proba(Xtest)
pred_test_lr_sm5 = sm_lrclf5.predict_proba(Xtest)
pred_test_lr_sm6 = sm_lrclf6.predict_proba(Xtest)
pred_test_lr_sm7 = sm_lrclf7.predict_proba(Xtest)

pred_test_lr_sm_class1 = sm_lrclf1.predict(Xtest)
pred_test_lr_sm_class2 = sm_lrclf2.predict(Xtest)
pred_test_lr_sm_class3 = sm_lrclf3.predict(Xtest)
pred_test_lr_sm_class4 = sm_lrclf4.predict(Xtest)
pred_test_lr_sm_class5 = sm_lrclf5.predict(Xtest)
pred_test_lr_sm_class6 = sm_lrclf6.predict(Xtest)
pred_test_lr_sm_class7 = sm_lrclf7.predict(Xtest)

In [21]:
print("Sample 1")
print("Accuracy : ", accuracy_score(ytest, pred_test_lr_sm_class1))
eval_metrics(ytest, pred_test_lr_sm_class1, pred_test_lr_sm1[:,1])
print("")
print("Sample 2")
print("Accuracy : ", accuracy_score(ytest, pred_test_lr_sm_class2))
eval_metrics(ytest, pred_test_lr_sm_class2, pred_test_lr_sm2[:,1])
print("")
print("Sample 3")
print("Accuracy : ", accuracy_score(ytest, pred_test_lr_sm_class3))
eval_metrics(ytest, pred_test_lr_sm_class3, pred_test_lr_sm3[:,1])
print("")
print("Sample 4")
print("Accuracy : ", accuracy_score(ytest, pred_test_lr_sm_class4))
eval_metrics(ytest, pred_test_lr_sm_class4, pred_test_lr_sm4[:,1])
print("")
print("Sample 5")
print("Accuracy : ", accuracy_score(ytest, pred_test_lr_sm_class5))
eval_metrics(ytest, pred_test_lr_sm_class5, pred_test_lr_sm5[:,1])
print("")
print("Sample 6")
print("Accuracy : ", accuracy_score(ytest, pred_test_lr_sm_class6))
eval_metrics(ytest, pred_test_lr_sm_class6, pred_test_lr_sm6[:,1])
print("")
print("Sample 7")
print("Accuracy : ", accuracy_score(ytest, pred_test_lr_sm_class7))
eval_metrics(ytest, pred_test_lr_sm_class7, pred_test_lr_sm7[:,1])

Sample 1
Accuracy :  0.9834977704434535
Confusion Matrix
       0    1
0  55931  927
1     13   91

For Class 1
f1 Score : 0.1622103386809269
Precision Score : 0.0893909626719057
Recall Score : 0.875

For Class 0
f1 Score : 0.991666814418184
Precision Score : 0.9997676247676248
Recall Score : 0.9836962256850399

AUROC : 0.9580973653663513

Sample 2
Accuracy :  0.9910115515606895
Confusion Matrix
       0    1
0  56363  495
1     17   87

For Class 1
f1 Score : 0.2536443148688047
Precision Score : 0.14948453608247422
Recall Score : 0.8365384615384616

For Class 0
f1 Score : 0.99547854960349
Precision Score : 0.9996984746363959
Recall Score : 0.9912941010939533

AUROC : 0.942209945424093

Sample 3
Accuracy :  0.9930479969102208
Confusion Matrix
       0    1
0  56479  379
1     17   87

For Class 1
f1 Score : 0.30526315789473685
Precision Score : 0.18669527896995708
Recall Score : 0.8365384615384616

For Class 0
f1 Score : 0.9965065193994036
Precision Score : 0.9996990937411498
Recall Sc

## Findings
### Simple Logistic Regression Result
- Accuracy : 0.999
- f1 Score : 0.718
- Precision Score : 0.844
- Recall Score : 0.625
- AUROC : 0.913

### Best Result after oversampling
- Accuracy : 0.999
- f1 Score : 0.744
- Precision Score : 0.747
- Recall Score : 0.740
- AUROC - 0.924

A little bit of Precision went down but other Metrices improved as compared to Simple Logistic regression

## Cluster Centroids UnderSampling

In [None]:
from imblearn.under_sampling import ClusterCentroids

cc = ClusterCentroids(ratio={0: 10})
Xtrain_cc, ytrain_cc = cc.fit_sample(Xtrain, ytrain)

In [None]:
cc_lrclf = LogisticRegression().fit(Xtrain_cc, ytrain_cc)
pred_test_lr_cc = cc_lrclf.predict_proba(Xtest)
pred_test_lr_cc_class = cc_lrclf.predict(Xtest)

In [None]:
print("Accuracy : ", accuracy_score(ytest, pred_test_lr_cc_class))
eval_metrics(ytest, pred_test_lr_cc_class, pred_test_lr_cc[:,1])

# Change in algorithms 

## Random Forest 

In [22]:
# Random Forest
rfclf1 = RandomForestClassifier(n_estimators=120).fit(Xtrain, ytrain)
rfclf2 = RandomForestClassifier(n_estimators=120, class_weight = 'balanced_subsample').fit(Xtrain, ytrain)
# predict on test set
pred_test_rf1 = rfclf1.predict_proba(Xtest)
pred_test_rf2 = rfclf2.predict_proba(Xtest)

pred_test_rf_class1 = rfclf1.predict(Xtest)
pred_test_rf_class2 = rfclf2.predict(Xtest)

In [23]:
print("Model 1")
print("Accuracy : ", accuracy_score(ytest, pred_test_rf_class1))
eval_metrics(ytest, pred_test_rf_class1, pred_test_rf1[:,1])
print("")
print("Model 2")
print("Accuracy : ", accuracy_score(ytest, pred_test_rf_class2))
eval_metrics(ytest, pred_test_rf_class2, pred_test_rf2[:,1])

Model 1
Accuracy :  0.9996137776061234
Confusion Matrix
       0   1
0  56856   2
1     20  84

For Class 1
f1 Score : 0.8842105263157894
Precision Score : 0.9767441860465116
Recall Score : 0.8076923076923077

For Class 0
f1 Score : 0.9998065661983223
Precision Score : 0.9996483578310711
Recall Score : 0.9999648246508847

AUROC : 0.9356368902826745

Model 2
Accuracy :  0.9995435553526912
Confusion Matrix
       0   1
0  56855   3
1     23  81

For Class 1
f1 Score : 0.8617021276595744
Precision Score : 0.9642857142857143
Recall Score : 0.7788461538461539

For Class 0
f1 Score : 0.9997714004360977
Precision Score : 0.9995956257252365
Recall Score : 0.999947236976327

AUROC : 0.9455210788279574


## Findings
### Simple Logistic Regression Result
- Accuracy : 0.999
- f1 Score : 0.718
- Precision Score : 0.844
- Recall Score : 0.625
- AUROC : 0.913

### Best Result after oversampling
- Accuracy : 0.999
- f1 Score : 0.744
- Precision Score : 0.747
- Recall Score : 0.740
- AUROC - 0.924

### Result of Random Forest
- Accuracy : 0.9995
- f1 Score : 0.862
- Precision Score : 0.964
- Recall Score : 0.779
- AUROC : 0.946

Random Forest has improved results in all fronts as compared to Oversampling
But using class_weight = 'balanced_subsample' improved result more

## XGBoost 

In [24]:
import xgboost as xgb
xgbclf1 = xgb.XGBClassifier(max_depth=3, learning_rate=0.05, n_estimators=200, verbosity=0, silent=None, objective='binary:logistic', 
booster='gbtree', n_jobs=1, nthread=None, gamma=0, min_child_weight=1, max_delta_step=0, subsample=0.5, 
colsample_bytree=0.33, scale_pos_weight=1, random_state=0, seed=None, missing=None).fit(Xtrain, ytrain)

pred_test_xgb1_class = xgbclf1.predict(Xtest)
pred_test_xgb1 = xgbclf1.predict_proba(Xtest)


In [25]:
xgbclf2 = xgb.XGBClassifier(max_depth=3, learning_rate=0.05, n_estimators=200, verbosity=0, silent=None, objective='binary:logistic', 
booster='gbtree', n_jobs=1, nthread=None, gamma=0, min_child_weight=1, max_delta_step=0, subsample=0.5, 
colsample_bytree=0.33, scale_pos_weight=25, random_state=0, seed=None, missing=None).fit(Xtrain, ytrain)

pred_test_xgb2_class = xgbclf2.predict(Xtest)
pred_test_xgb2 = xgbclf2.predict_proba(Xtest)


In [26]:
print("Model 1")
print("Accuracy : ", accuracy_score(ytest, pred_test_xgb1_class))
eval_metrics(ytest, pred_test_xgb1_class, pred_test_xgb1[:,1])
print("Model 2")
print("Accuracy : ", accuracy_score(ytest, pred_test_xgb2_class))
eval_metrics(ytest, pred_test_xgb2_class, pred_test_xgb2[:,1])

Model 1
Accuracy :  0.9996137776061234
Confusion Matrix
       0   1
0  56856   2
1     20  84

For Class 1
f1 Score : 0.8842105263157894
Precision Score : 0.9767441860465116
Recall Score : 0.8076923076923077

For Class 0
f1 Score : 0.9998065661983223
Precision Score : 0.9996483578310711
Recall Score : 0.9999648246508847

AUROC : 0.9791312094637924
Model 2
Accuracy :  0.9992626663389628
Confusion Matrix
       0   1
0  56835  23
1     19  85

For Class 1
f1 Score : 0.8018867924528303
Precision Score : 0.7870370370370371
Recall Score : 0.8173076923076923

For Class 0
f1 Score : 0.9996306458421275
Precision Score : 0.9996658106729518
Recall Score : 0.9995954834851736

AUROC : 0.9820651379820714


## Findings
### Simple Logistic Regression Result
- Accuracy : 0.999
- f1 Score : 0.718
- Precision Score : 0.844
- Recall Score : 0.625
- AUROC : 0.913

### Best Result after oversampling
- Accuracy : 0.999
- f1 Score : 0.744
- Precision Score : 0.747
- Recall Score : 0.740
- AUROC - 0.924

### Random Forest Results
- Accuracy : 0.9995
- f1 Score : 0.884
- Precision Score : 0.977
- Recall Score : 0.808
- AUROC : 0.946

### Xgboost Results (w/o scale_pos_weight)
- Accuracy :  0.9996
- f1 Score : 0.884
- Precision Score : 0.977
- Recall Score : 0.808
- AUROC : 0.979

### Xgboost Results (with scale_pos_weight)
- Accuracy : 0.999
- f1 Score : 0.802
- Precision Score : 0.787
- Recall Score : 0.817
- AUROC : 0.982

#### For Normal XGBoost
It has improved over RF in terms of AUROC and have Precision, Recall at same
#### For XGBoost with scale_pos_weight
- AUROC has further improved over Normal XGBoost
- Recall has improved but Precision and F1 Score went bit down

## LightGBM 

In [27]:
lgbclf1 = lgb.LGBMClassifier(boosting_type='gbdt', num_leaves=7, max_depth=3, learning_rate=0.05, n_estimators=100, 
                        objective=None, min_child_weight=0.001, min_child_samples=20, subsample=0.5, 
                        subsample_freq=5, colsample_bytree=0.25, random_state=2019, n_jobs=-1, reg_alpha=0.0, reg_lambda=0.0,
                             silent=True, importance_type='gain', eval_metric = 'auc').fit(Xtrain, ytrain)
pred_test_lgb1_class = lgbclf1.predict(Xtest)
pred_test_lgb1 = lgbclf1.predict_proba(Xtest)


In [28]:
lgbclf2 = lgb.LGBMClassifier(boosting_type='gbdt', num_leaves=4, max_depth=-1, learning_rate=0.05, n_estimators=200, 
                        objective=None, min_child_weight=0.001, min_child_samples=20, 
                        subsample_freq=5, colsample_bytree=0.25, reg_alpha=0.0, reg_lambda=0.0, 
                        random_state=2019, n_jobs=-1, silent=True, importance_type='gain', eval_metric = 'auc', 
                             pos_bagging_fraction = 1, neg_bagging_fraction = 0.45).fit(Xtrain, ytrain)
pred_test_lgb2_class = lgbclf2.predict(Xtest)
pred_test_lgb2 = lgbclf2.predict_proba(Xtest)


In [30]:
print("Model 1")
print("Accuracy : ", accuracy_score(ytest, pred_test_lgb1_class))
eval_metrics(ytest, pred_test_lgb1_class, pred_test_lgb1[:,1])
print("")
print("Model 2")
print("Accuracy : ", accuracy_score(ytest, pred_test_lgb2_class))
eval_metrics(ytest, pred_test_lgb2_class, pred_test_lgb2[:,1])

Model 1
Accuracy :  0.9990871107053826
Confusion Matrix
       0   1
0  56837  21
1     31  73

For Class 1
f1 Score : 0.7373737373737373
Precision Score : 0.776595744680851
Recall Score : 0.7019230769230769

For Class 0
f1 Score : 0.9995427606703832
Precision Score : 0.999454877963002
Recall Score : 0.9996306588342889

AUROC : 0.9712162485760748

Model 2
Accuracy :  0.9994557775359011
Confusion Matrix
       0   1
0  56849   9
1     22  82

For Class 1
f1 Score : 0.841025641025641
Precision Score : 0.9010989010989011
Recall Score : 0.7884615384615384

For Class 0
f1 Score : 0.9997274222054181
Precision Score : 0.9996131596068295
Recall Score : 0.999841710928981

AUROC : 0.9784663446318359


## Findings
### Simple Logistic Regression Result
- Accuracy : 0.999
- f1 Score : 0.718
- Precision Score : 0.844
- Recall Score : 0.625
- AUROC : 0.913

### Best Result after oversampling
- Accuracy : 0.999
- f1 Score : 0.744
- Precision Score : 0.747
- Recall Score : 0.740
- AUROC - 0.924

### Random Forest Results
- Accuracy : 0.9995
- f1 Score : 0.884
- Precision Score : 0.977
- Recall Score : 0.808
- AUROC : 0.946

### Xgboost Results (with scale_pos_weight)
- AUROC : 0.979
- f1 Score : 0.802
- Precision Score : 0.787
- Recall Score : 0.817
- AUROC : 0.982

### LGBM
- f1 Score : 0.737
- Precision Score : 0.777
- Recall Score : 0.702
- AUROC : 0.971

### LGBM with sampling
- f1 Score : 0.841
- Precision Score : 0.901
- Recall Score : 0.788
- AUROC : 0.978

Normal LGBM has relatively poor precision, recall and relatively less AUROC
<br>
Where as LGBM with sampling has better AUC, Precision Recall and F1 Score