In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score

from sklearn.preprocessing import StandardScaler

import matplotlib.pyplot as plt

seed=331

In [2]:
def fnr_calc(y_test, y_pred):
    tp, fn, fp, tn = confusion_matrix(y_test, y_pred, labels=[1,0]).ravel()
    return fn / (fn + tp)

def fpr_calc(y_test, y_pred):
    tp, fn, fp, tn = confusion_matrix(y_test, y_pred, labels=[1,0]).ravel()
    return fp / (fp + tn)



def model_metrics(y_test, y_pred):
    tp, fn, fp, tn = confusion_matrix(y_test, y_pred, labels=[1,0]).ravel()
    
    fnr = fn / (fn + tp)
    fpr = fp / (fp + tn)
    
    roc_auc = roc_auc_score(y_test, y_pred)
    
    sensitivity = tp / (tp + fn)
    specificity = tn / (tp + tn)
    
    return fnr, fpr, roc_auc, sensitivity, specificity

# Data preparation

In [3]:
dataframe = pd.read_csv("./data/creditcard.csv")

columns = dataframe.columns.drop('Class')

dataframe.head(5)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [4]:
scaled_dataframe = dataframe.copy().drop(['Class'],axis=1)

transformer = StandardScaler()

transformer.fit(scaled_dataframe)

scaled_dataframe = transformer.transform(scaled_dataframe)
scaled_dataframe = pd.DataFrame(scaled_dataframe, columns=columns)

In [53]:
X = scaled_dataframe
y = dataframe.Class

X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=seed, test_size=0.33)

print("train rows: {}, test rows: {}".format(X_train.shape[0], X_test.shape[0]))

train rows: 190820, test rows: 93987


# RFE

In [6]:
rfe = RFE(estimator=RandomForestClassifier(), n_features_to_select=5, step=3)

X_rfe = pd.DataFrame(rfe.fit_transform(X, y))
X_rfe.columns = rfe.get_feature_names_out()

X_train, X_test, y_train, y_test = train_test_split(X_rfe, y, random_state=seed, test_size=0.33)

print("train rows: {}, test rows: {}".format(X_train.shape[0], X_test.shape[0]))

X_rfe.columns

train rows: 190820, test rows: 93987


Index(['V10', 'V11', 'V12', 'V14', 'V17'], dtype='object')

### > MLP

In [7]:
from sklearn.neural_network import MLPClassifier

mlp = MLPClassifier(solver='adam', hidden_layer_sizes=[56, 62,], max_iter=250, batch_size=50, random_state = seed)
mlp.fit(X_train,y_train)

y_pred = mlp.predict(X_test)
mlp.score(X_test, y_test)

0.9995956887654676

In [8]:
fnr, fpr, roc_auc, sensitivity, specificity = model_metrics(y_test, y_pred)

print("FNR = {:.4f}%, FPR = {:.4f}%".format(fnr * 100, fpr * 100))
print("ROC AUC = {:.4f}".format(roc_auc))
print("sensitivity = {:.4f}%, specificity = {:.4f}%".format(sensitivity * 100, specificity * 100))

FNR = 21.0191%, FPR = 0.0053%
ROC AUC = 0.8949
sensitivity = 78.9809%, specificity = 99.8680%


In [9]:
cm = confusion_matrix(y_test, y_pred, labels=[1,0])
cm

array([[  124,    33],
       [    5, 93825]], dtype=int64)

### > QDA

In [10]:
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

qda = QuadraticDiscriminantAnalysis(reg_param=0.97, store_covariance=False)
qda.fit(X_train,y_train)

y_pred_qda = qda.predict(X_test)
qda.score(X_test, y_test)

0.9993403343015523

In [11]:
fnr_qda, fpr_qda, roc_auc_qda, sensitivity_qda, specificity_qda = model_metrics(y_test, y_pred_qda)

print("FNR = {:.4f}%, FPR = {:.4f}%".format(fnr_qda * 100, fpr_qda * 100))
print("ROC AUC = {:.4f}".format(roc_auc_qda))
print("sensitivity = {:.4f}%, specificity = {:.4f}%".format(sensitivity_qda * 100, specificity_qda * 100))

FNR = 15.9236%, FPR = 0.0394%
ROC AUC = 0.9202
sensitivity = 84.0764%, specificity = 99.8595%


In [12]:
cm_qda = confusion_matrix(y_test, y_pred_qda, labels=[1,0])
cm_qda

array([[  132,    25],
       [   37, 93793]], dtype=int64)

### > Random Forest

In [13]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=50, max_depth=14, random_state=seed)
rf.fit(X_train,y_train)

y_pred_rf = rf.predict(X_test)
rf.score(X_test, y_test)

0.9996595273814464

In [14]:
fnr_rf, fpr_rf, roc_auc_rf, sensitivity_rf, specificity_rf = model_metrics(y_test, y_pred_rf)

print("FNR = {:.4f}%, FPR = {:.4f}%".format(fnr_rf * 100, fpr_rf * 100))
print("ROC AUC = {:.4f}".format(roc_auc_rf))
print("sensitivity = {:.4f}%, specificity = {:.4f}%".format(sensitivity_rf * 100, specificity_rf * 100))

FNR = 17.1975%, FPR = 0.0053%
ROC AUC = 0.9140
sensitivity = 82.8025%, specificity = 99.8616%


In [15]:
cm_rf = confusion_matrix(y_test, y_pred_rf, labels=[1,0])
cm_rf

array([[  130,    27],
       [    5, 93825]], dtype=int64)

# Forward Linear Regression

In [20]:
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import LogisticRegression

flr = SequentialFeatureSelector(LogisticRegression(random_state=seed), direction='forward', n_features_to_select=5)
X_flr = pd.DataFrame(flr.fit_transform(X, y))

X_flr.columns = rfe.get_feature_names_out()

X_train_flr, X_test_flr, y_train_flr, y_test_flr = train_test_split(X_flr, y, random_state=seed, test_size=0.33)

print("train rows: {}, test rows: {}".format(X_train_flr.shape[0], X_test_flr.shape[0]))

X_flr.columns

train rows: 190820, test rows: 93987


Index(['V10', 'V11', 'V12', 'V14', 'V17'], dtype='object')

### > MLP

In [21]:
mlp_flr = MLPClassifier(solver='adam', hidden_layer_sizes=[56, 62,], max_iter=250, batch_size=50, random_state = seed)
mlp_flr.fit(X_train_flr,y_train_flr)

y_pred_mlp_flr = mlp_flr.predict(X_test_flr)
mlp_flr.score(X_test_flr, y_test_flr)

0.9995318501494888

In [22]:
fnr_mlp_flr, fpr_mlp_flr, roc_auc_mlp_flr, sensitivity_mlp_flr, specificity_mlp_flr = model_metrics(y_test_flr, y_pred_mlp_flr)

print("FNR = {:.4f}%, FPR = {:.4f}%".format(fnr_mlp_flr * 100, fpr_mlp_flr * 100))
print("ROC AUC = {:.4f}".format(roc_auc_mlp_flr))
print("sensitivity = {:.4f}%, specificity = {:.4f}%".format(sensitivity_mlp_flr * 100, specificity_mlp_flr * 100))

FNR = 24.8408%, FPR = 0.0053%
ROC AUC = 0.8758
sensitivity = 75.1592%, specificity = 99.8744%


In [23]:
cm_mlp_flr = confusion_matrix(y_test_flr, y_pred_mlp_flr, labels=[1,0])
cm_mlp_flr

array([[  118,    39],
       [    5, 93825]], dtype=int64)

### > QDA

In [24]:
qda_flr = QuadraticDiscriminantAnalysis(reg_param=0.97, store_covariance=False)
qda_flr.fit(X_train_flr,y_train_flr)

y_pred_qda_flr = qda_flr.predict(X_test_flr)
qda_flr.score(X_test_flr, y_test_flr)

0.9984572334471788

In [25]:
fnr_qda_flr, fpr_qda_flr, roc_auc_qda_flr, sensitivity_qda_flr, specificity_qda_flr = model_metrics(y_test_flr, y_pred_qda_flr)

print("FNR = {:.4f}%, FPR = {:.4f}%".format(fnr_qda_flr * 100, fpr_qda_flr * 100))
print("ROC AUC = {:.4f}".format(roc_auc_qda_flr))
print("sensitivity = {:.4f}%, specificity = {:.4f}%".format(sensitivity_qda_flr * 100, specificity_qda_flr * 100))

FNR = 15.9236%, FPR = 0.1279%
ROC AUC = 0.9197
sensitivity = 84.0764%, specificity = 99.8593%


In [26]:
cm_qda_flr = confusion_matrix(y_test_flr, y_pred_qda_flr, labels=[1,0])
cm_qda_flr

array([[  132,    25],
       [  120, 93710]], dtype=int64)

### > Random Forest

In [27]:
rf_flr = RandomForestClassifier(n_estimators=50, max_depth=14, random_state=seed)
rf_flr.fit(X_train_flr,y_train_flr)

y_pred_rf_flr = rf_flr.predict(X_test_flr)
rf_flr.score(X_test_flr, y_test_flr)

0.9996701671507762

In [28]:
fnr_rf_flr, fpr_rf_flr, roc_auc_rf_flr, sensitivity_rf_flr, specificity_rf_flr = model_metrics(y_test_flr, y_pred_rf_flr)

print("FNR = {:.4f}%, FPR = {:.4f}%".format(fnr_rf_flr * 100, fpr_rf_flr * 100))
print("ROC AUC = {:.4f}".format(roc_auc_rf_flr))
print("sensitivity = {:.4f}%, specificity = {:.4f}%".format(sensitivity_rf_flr * 100, specificity_rf_flr * 100))

FNR = 15.9236%, FPR = 0.0064%
ROC AUC = 0.9204
sensitivity = 84.0764%, specificity = 99.8595%


In [29]:
cm_rf_flr = confusion_matrix(y_test_flr, y_pred_rf_flr, labels=[1,0])
cm_rf_flr

array([[  132,    25],
       [    6, 93824]], dtype=int64)

# PCA

In [75]:
from sklearn.decomposition import PCA

pca = PCA()
pca.fit(X, y)
X_pca = pca.transform(X)

X_train_pca, X_test_pca, y_train_pca, y_test_pca = train_test_split(X_pca, y, random_state=seed, test_size=0.33)

print("train rows: {}, test rows: {}".format(X_train_pca.shape[0], X_test_pca.shape[0]))
print("New features num: {}".format(len(X_train_pca[0])))

train rows: 190820, test rows: 93987
New features num: 30


### > MLP

In [55]:
mlp_pca = MLPClassifier(solver='adam', hidden_layer_sizes=[56, 62,], max_iter=250, batch_size=50, random_state = seed)
mlp_pca.fit(X_train_pca,y_train_pca)

y_pred_mlp_pca = mlp_pca.predict(X_test_pca)
mlp_pca.score(X_test_pca, y_test_pca)

0.9995105706108292

In [56]:
fnr_mlp_pca, fpr_mlp_pca, roc_auc_mlp_pca, sensitivity_mlp_pca, specificity_mlp_pca = model_metrics(y_test_pca, y_pred_mlp_pca)

print("FNR = {:.4f}%, FPR = {:.4f}%".format(fnr_mlp_pca * 100, fpr_mlp_pca * 100))
print("ROC AUC = {:.4f}".format(roc_auc_mlp_pca))
print("sensitivity = {:.4f}%, specificity = {:.4f}%".format(sensitivity_mlp_pca * 100, specificity_mlp_pca * 100))

FNR = 21.0191%, FPR = 0.0139%
ROC AUC = 0.8948
sensitivity = 78.9809%, specificity = 99.8680%


In [57]:
cm_mlp_pca = confusion_matrix(y_test_pca, y_pred_mlp_pca, labels=[1,0])
cm_mlp_pca

array([[  124,    33],
       [   13, 93817]], dtype=int64)

### > QDA

In [71]:
qda_pca = QuadraticDiscriminantAnalysis(reg_param=0.97, store_covariance=False)
qda_pca.fit(X_train_pca,y_train_pca)

y_pred_qda_pca = qda_pca.predict(X_test_pca)
qda_pca.score(X_test_pca, y_test_pca)

0.9948929107216955

In [59]:
fnr_qda_pca, fpr_qda_pca, roc_auc_qda_pca, sensitivity_qda_pca, specificity_qda_pca = model_metrics(y_test_pca, y_pred_qda_pca)

print("FNR = {:.4f}%, FPR = {:.4f}%".format(fnr_qda_pca * 100, fpr_qda_pca * 100))
print("ROC AUC = {:.4f}".format(roc_auc_qda_pca))
print("sensitivity = {:.4f}%, specificity = {:.4f}%".format(sensitivity_qda_pca * 100, specificity_qda_pca * 100))

FNR = 14.0127%, FPR = 0.4881%
ROC AUC = 0.9275
sensitivity = 85.9873%, specificity = 99.8556%


In [60]:
cm_qda_pca = confusion_matrix(y_test_pca, y_pred_qda_pca, labels=[1,0])
cm_qda_pca

array([[  135,    22],
       [  458, 93372]], dtype=int64)

### > Random Forest

In [61]:
rf_pca = RandomForestClassifier(n_estimators=50, max_depth=14, random_state=seed)
rf_pca.fit(X_train_pca,y_train_pca)

y_pred_rf_pca = rf_pca.predict(X_test_pca)
rf_pca.score(X_test_pca, y_test_pca)

0.9996382478427868

In [62]:
fnr_rf_pca, fpr_rf_pca, roc_auc_rf_pca, sensitivity_rf_pca, specificity_rf_pca = model_metrics(y_test_pca, y_pred_rf_pca)

print("FNR = {:.4f}%, FPR = {:.4f}%".format(fnr_rf_pca * 100, fpr_rf_pca * 100))
print("ROC AUC = {:.4f}".format(roc_auc_rf_pca))
print("sensitivity = {:.4f}%, specificity = {:.4f}%".format(sensitivity_rf_pca * 100, specificity_rf_pca * 100))

FNR = 19.7452%, FPR = 0.0032%
ROC AUC = 0.9013
sensitivity = 80.2548%, specificity = 99.8659%


In [63]:
cm_rf_pca = confusion_matrix(y_test_pca, y_pred_rf_pca, labels=[1,0])
cm_rf_pca

array([[  126,    31],
       [    3, 93827]], dtype=int64)