In [1]:
# Import libraries
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, cohen_kappa_score
from sklearn.metrics import f1_score, recall_score
from sklearn.model_selection import train_test_split

In [2]:
# Helper function
def ShowStats(cmat, y_test, pred):
   """Print the confusion matrix and summary metrics for a binary classifier.

   Parameters
   ----------
   cmat : 2x2 confusion matrix indexable as ``cmat[i][j]``, laid out the way
      ``sklearn.metrics.confusion_matrix`` returns it: rows are the true
      labels, columns are the predicted labels, negative class (0) first.
   y_test : true labels, forwarded to the scikit-learn metric functions.
   pred : predicted labels, forwarded to the scikit-learn metric functions.

   Prints the matrix, the accuracy (percent, 2 decimals), Cohen's kappa
   (3 decimals), and recall and F1 score (2 decimals each). Returns None.
   """
   # Separate out the confusion matrix components. Row = actual class,
   # column = predicted class, so correct predictions sit on the diagonal.
   tneg = cmat[0][0]
   fpos = cmat[0][1]
   fneg = cmat[1][0]
   tpos = cmat[1][1]
   correct = tpos + tneg
   total = tpos + tneg + fpos + fneg
   # calculate F1, Recall scores
   f1Score = round(f1_score(y_test, pred), 2)
   recallScore = round(recall_score(y_test, pred), 2)
   # calculate and display metrics
   print(cmat)
   print( 'Accuracy: '+ str(np.round(100*float(correct)/float(total),2))+'%')
   print( 'Cohen Kappa: '+ str(np.round(cohen_kappa_score(y_test, pred),3)))
   print("Sensitivity/Recall for Model : {recall_score}".format(recall_score = recallScore))
   print("F1 Score for Model : {f1_score}".format(f1_score = f1Score))

In [3]:
def RunModel(model, X_train, y_train, X_test, y_test):
   """Fit ``model`` on the training data and evaluate it on the test data.

   Parameters
   ----------
   model : estimator exposing ``fit(X, y)`` and ``predict(X)``.
   X_train, X_test : feature data handed to ``fit`` / ``predict`` unchanged.
   y_train : training labels; a pandas DataFrame/Series, a NumPy array or
      another array-like. Flattened to 1-D before fitting.
   y_test : true test labels, compared against the predictions.

   Returns
   -------
   (matrix, pred) : the confusion matrix of ``y_test`` versus the
      predictions, and the predictions themselves.
   """
   # np.asarray(...).ravel() flattens a single-column DataFrame just like
   # ``.values.ravel()`` did, but also accepts inputs that have no ``.values``.
   model.fit(X_train, np.asarray(y_train).ravel())
   pred = model.predict(X_test)
   matrix = confusion_matrix(y_test, pred)
   return matrix, pred

In [5]:
# Load the transactions and print how many rows fall into each class.
df = pd.read_csv('data/creditcard.csv')
class_names = {0: 'Not Fraud', 1: 'Fraud'}
print(
    df['Class']
    .value_counts()
    .rename(index=class_names)
)

Not Fraud    284315
Fraud           492
Name: Class, dtype: int64


In [6]:
# Feature columns are picked by position: columns 1 through 29 of `df`.
feature_names = df.columns[1:30]

In [7]:
# Display the selected feature column names.
feature_names

Index(['V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11',
       'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21',
       'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount'],
      dtype='object')

In [8]:
# Target column(s) are picked by position: everything from column 30 onward.
target = df.columns[30:]

In [9]:
# Display the selected target column name(s).
target

Index(['Class'], dtype='object')

In [10]:
# Keep all rows, restricted to the feature columns.
data_features = df.loc[:, feature_names]

In [11]:
# Keep all rows, restricted to the target column(s); the result stays a DataFrame.
data_target = df.loc[:, target]

In [12]:
# Split the data into training (70%) and test (30%) sets.
# `train_test_split` is imported in the notebook's top import cell.
# NOTE(review): the global NumPy seed is kept on purpose -- presumably it makes
# later steps that draw from NumPy's global RNG repeatable; confirm before removing.
np.random.seed(123)
X_train, X_test, y_train, y_test = train_test_split(
    data_features,
    data_target,
    train_size=0.70,
    test_size=0.30,
    random_state=1,
)

In [14]:
# First model: logistic regression with its default settings.
# `LogisticRegression` is imported in the notebook's top import cell.
lr = LogisticRegression()
cmat, pred = RunModel(lr, X_train, y_train, X_test, y_test)
ShowStats(cmat, y_test, pred)



[[85293    15]
 [   57    78]]
Accuracy: 99.92%
Cohen Kappa: 0.684
Sensitivity/Recall for Model : 0.58
F1 Score for Model : 0.68


While the accuracy was great, the algorithm misclassified more than 4 in 10 fraudulent transactions.
Accuracy is therefore not a reliable measure of our model's effectiveness on such an imbalanced dataset. Instead, we look at other measures such as Cohen's kappa, recall, and the F1 score, each of which should be as close to 1 as possible.

In [15]:
# Let's try another model: a random forest classifier with 100 trees,
# fitted using 4 parallel jobs.
# `RandomForestClassifier` is imported in the notebook's top import cell.
rf = RandomForestClassifier(n_estimators=100, n_jobs=4)
cmat, pred = RunModel(rf, X_train, y_train, X_test, y_test)
ShowStats(cmat, y_test, pred)

[[85296    12]
 [   30   105]]
Accuracy: 99.95%
Cohen Kappa: 0.833
Sensitivity/Recall for Model : 0.78
F1 Score for Model : 0.83


That’s quite a bit better. Note that the accuracy went up only slightly, while the other scores showed significant improvements. Therefore, the Random Forest model performed better overall than the Logistic Regression model.