In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import time
import sklearn

In [2]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Binarizer
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate

In [3]:
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn import svm
from sklearn import tree

In [4]:
def model_acc_time(X_train, X_test, y_train, y_test, model):
    # creates a model based on training data and returns the accuracy on the held out test set and the time it took to train the model
    # data should be pre processed already
    
    # records the time and accuracy for each model, as well as saving the model itself
    total_time = []
    accuracy = []
    models = []

    # manual 5 fold on test data to select the best model
    for i in range(5):
        # splitting the training data to train and evaluate the model
        kfX_train, kfX_test, kfy_train, kfy_test = train_test_split(
    X_train, y_train, test_size=0.2,shuffle=True)

        # creates a new unfitted model with the inputted parameters
        curModel = sklearn.base.clone(model)


        # times the training of the model and calcualtes the accuracy on the validation set
        t1 = time.time()
        curModel.fit(kfX_train,kfy_train)
        accuracy.append((curModel.predict(kfX_test)==np.array(kfy_test)[0]).mean())
        t2 = time.time()
        models.append(curModel)
        total_time.append(t2-t1)

    # selects the model with the highest accuracy on the validation set and calcuates the accuracy on the held out set
    best_index = accuracy.index(max(accuracy))
    t = total_time[best_index]
    most_acc_model = models[best_index]
    acc = (most_acc_model.predict(X_test)==np.array(y_test)[0]).mean()*100

    return "Accuracy: {0:.2f}% || Time to Train: {1:.3f} seconds".format(acc, t)

In [6]:
creditcards = pd.read_csv("data/creditcard.csv")
creditcards

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.166480,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.167170,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.379780,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.108300,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.50,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.206010,0.502292,0.219422,0.215153,69.99,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
284802,172786.0,-11.881118,10.071785,-9.834783,-2.066656,-5.364473,-2.606837,-4.918215,7.305334,1.914428,...,0.213454,0.111864,1.014480,-0.509348,1.436807,0.250034,0.943651,0.823731,0.77,0
284803,172787.0,-0.732789,-0.055080,2.035030,-0.738589,0.868229,1.058415,0.024330,0.294869,0.584800,...,0.214205,0.924384,0.012463,-1.016226,-0.606624,-0.395255,0.068472,-0.053527,24.79,0
284804,172788.0,1.919565,-0.301254,-3.249640,-0.557828,2.630515,3.031260,-0.296827,0.708417,0.432454,...,0.232045,0.578229,-0.037501,0.640134,0.265745,-0.087371,0.004455,-0.026561,67.88,0
284805,172788.0,-0.240440,0.530483,0.702510,0.689799,-0.377961,0.623708,-0.686180,0.679145,0.392087,...,0.265245,0.800049,-0.163298,0.123205,-0.569159,0.546668,0.108821,0.104533,10.00,0


In [7]:
creditcards.Class.value_counts(normalize=True)

0    0.998273
1    0.001727
Name: Class, dtype: float64

In [8]:
creditcards.corr()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
Time,1.0,0.1173963,-0.01059333,-0.4196182,-0.1052602,0.1730721,-0.06301647,0.08471437,-0.03694943,-0.008660434,...,0.04473573,0.1440591,0.05114236,-0.01618187,-0.2330828,-0.0414071,-0.005134591,-0.009412688,-0.010596,-0.012323
V1,0.117396,1.0,4.6973500000000007e-17,-1.42439e-15,1.755316e-17,6.391162e-17,2.398071e-16,1.99155e-15,-9.490675e-17,2.169581e-16,...,-1.755072e-16,7.477367000000001e-17,9.808705e-16,7.354269e-17,-9.805358e-16,-8.621897000000001e-17,3.208233e-17,9.820892e-16,-0.227709,-0.101347
V2,-0.010593,4.6973500000000007e-17,1.0,2.512175e-16,-1.126388e-16,-2.039868e-16,5.02468e-16,3.966486e-16,-4.4139840000000005e-17,-5.728718e-17,...,8.444409000000001e-17,2.50083e-16,1.059562e-16,-8.142354e-18,-4.261894e-17,2.601622e-16,-4.478472e-16,-3.676415e-16,-0.531409,0.091289
V3,-0.419618,-1.42439e-15,2.512175e-16,1.0,-3.41691e-16,-1.436514e-15,1.431581e-15,2.168574e-15,3.433113e-16,-4.23377e-16,...,-2.9719690000000006e-17,4.648259e-16,2.115206e-17,-9.351637e-17,4.771164e-16,6.521501e-16,6.239832e-16,7.726948e-16,-0.21088,-0.192961
V4,-0.10526,1.755316e-17,-1.126388e-16,-3.41691e-16,1.0,-1.940929e-15,-2.712659e-16,1.55633e-16,5.195643e-16,3.859585e-16,...,-9.976950000000001e-17,2.099922e-16,6.002528000000001e-17,2.229738e-16,5.394585e-16,-6.179751e-16,-6.403423e-17,-5.863664e-17,0.098732,0.133447
V5,0.173072,6.391162e-17,-2.039868e-16,-1.436514e-15,-1.940929e-15,1.0,7.926364e-16,-4.209851e-16,7.589187e-16,4.205206e-16,...,-1.368701e-16,5.060029e-16,1.637596e-16,-9.286095e-16,5.625102e-16,9.14469e-16,4.46596e-16,-3.299167e-16,-0.386356,-0.094974
V6,-0.063016,2.398071e-16,5.02468e-16,1.431581e-15,-2.712659e-16,7.926364e-16,1.0,1.429426e-16,-1.707421e-16,1.114447e-16,...,-1.575903e-16,-3.362902e-16,-7.232186000000001e-17,-1.261867e-15,1.081933e-15,-2.378414e-16,-2.623818e-16,4.813155e-16,0.215981,-0.043643
V7,0.084714,1.99155e-15,3.966486e-16,2.168574e-15,1.55633e-16,-4.209851e-16,1.429426e-16,1.0,-8.691834e-17,7.933251e-16,...,1.938604e-16,-1.058131e-15,2.327911e-16,-2.589727e-17,1.174169e-15,-7.334507e-16,-5.886825e-16,-6.836764000000001e-17,0.397311,-0.187257
V8,-0.036949,-9.490675e-17,-4.4139840000000005e-17,3.433113e-16,5.195643e-16,7.589187e-16,-1.707421e-16,-8.691834e-17,1.0,2.900829e-16,...,-2.412439e-16,5.475559e-16,3.897104e-16,-1.802967e-16,-1.390791e-16,-1.209975e-16,1.733633e-16,-4.484325e-16,-0.103079,0.019875
V9,-0.00866,2.169581e-16,-5.728718e-17,-4.23377e-16,3.859585e-16,4.205206e-16,1.114447e-16,7.933251e-16,2.900829e-16,1.0,...,4.5783890000000004e-17,2.8718550000000003e-17,5.929286e-16,-2.346385e-16,1.099645e-15,-1.388725e-15,-2.287414e-16,9.146779e-16,-0.044246,-0.097733


In [9]:
#Split Data into training and testing
x = creditcards.drop("Class", axis=1)
y = creditcards[["Class"]]
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

In [10]:
# DT classifier, using entropy and a max depth of 4
DT = tree.DecisionTreeClassifier(criterion="entropy",max_depth=4)

# using the function above, we can determine the accuracy and training time of a deciosion tree model
model_acc_time(X_train, X_test, y_train, y_test, DT)

'Accuracy: 99.86% || Time to Train: 3.440 seconds'

In [11]:
# MLP Classifier 
MLP = MLPClassifier(max_iter=300, hidden_layer_sizes=(7,7), solver="sgd")

# calculating the accuracy and training time of an MLP Classifier
model_acc_time(X_train, X_test, y_train, y_test, MLP)

'Accuracy: 100.00% || Time to Train: 4.419 seconds'

In [12]:
# Random Forest
RF = RandomForestClassifier(n_estimators=200, max_depth=4,criterion="entropy")

# calculating the accuracy and training time of the random forest Classifier
model_acc_time(X_train, X_test, y_train, y_test, RF)

'Accuracy: 99.85% || Time to Train: 77.954 seconds'

In [13]:
# Extra Trees Classifier
ET = ExtraTreesClassifier(n_estimators=100, max_depth=3,criterion="entropy")

# calculating the accuracy and training time of the extra trees Classifier
model_acc_time(X_train, X_test, y_train, y_test, ET)

'Accuracy: 99.96% || Time to Train: 2.232 seconds'

In [14]:
# SGD Classifier
SGD = SGDClassifier(loss="hinge", max_iter=500, )

# calculating the accuracy and training time of the SGD Classifier
model_acc_time(X_train, X_test, y_train, y_test, SGD)

'Accuracy: 99.99% || Time to Train: 3.740 seconds'

In [15]:
# Gaussian Naive Bayes
GNB = GaussianNB()
# calculating the accuracy and training time of the Gaussian Naive Bayes
model_acc_time(X_train, X_test, y_train, y_test, GNB)

'Accuracy: 99.14% || Time to Train: 0.095 seconds'

In [16]:
# SVM
SVM = svm.SVC(kernel="sigmoid")

# calculating the accuracy and training time of the SVM
model_acc_time(X_train, X_test, y_train, y_test, SVM)


'Accuracy: 100.00% || Time to Train: 5.917 seconds'