In [68]:
%matplotlib notebook
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import fbeta_score, precision_score, recall_score, confusion_matrix
from sklearn.cluster import KMeans
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from sklearn.mixture import GaussianMixture

In [4]:
# Load the transactions, then rewrite 'Time' so that every row after the
# first holds the difference to the previous row's 'Time' value. Row 0
# keeps whatever value was read from the file.
data = pd.read_csv('../data/creditcard.csv')
time_deltas = np.diff(data['Time'].to_numpy())
data.loc[1:, 'Time'] = time_deltas

# Partition the frame by label value: Class == 0 vs. Class == 1.
normal = data.loc[data['Class'] == 0]
anomaly = data.loc[data['Class'] == 1]

In [5]:
# Split each class separately: 60% goes to train, and the remaining 40%
# is halved into validation and test. The same seed is used for every
# split.
train_normal, rest_normal = train_test_split(normal, test_size=0.4, random_state=42)
valid_normal, test_normal = train_test_split(rest_normal, test_size=0.5, random_state=42)

train_anomaly, rest_anomaly = train_test_split(anomaly, test_size=0.4, random_state=42)
valid_anomaly, test_anomaly = train_test_split(rest_anomaly, test_size=0.5, random_state=42)

# Discard the row labels inherited from `data`; every subset is
# renumbered from 0.
(train_normal, valid_normal, test_normal,
 train_anomaly, valid_anomaly, test_anomaly) = (
    subset.reset_index(drop=True)
    for subset in (train_normal, valid_normal, test_normal,
                   train_anomaly, valid_anomaly, test_anomaly)
)

print(
    'Normal Train:', train_normal.shape,
    'Normal Valid:', valid_normal.shape,
    'Normal Test:', test_normal.shape,
)
print(
    'Anomaly Train:', train_anomaly.shape,
    'Anomaly Valid:', valid_anomaly.shape,
    'Anomaly Test:', test_anomaly.shape,
)

Normal Train: (170589, 31) Normal Valid: (56863, 31) Normal Test: (56863, 31)
Anomaly Train: (295, 31) Anomaly Valid: (98, 31) Anomaly Test: (99, 31)


In [6]:
# Stack the normal and anomaly subsets of each split, shuffle the rows
# with a fixed seed, and renumber the index from 0.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4
# and removed in pandas 2.0; the resulting frames are the same.
train = (
    pd.concat([train_normal, train_anomaly])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)
valid = (
    pd.concat([valid_normal, valid_anomaly])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)
test = (
    pd.concat([test_normal, test_anomaly])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

In [6]:
# Logistic regression (newton-cg solver, multinomial formulation) fitted
# on every column of `train` except 'Class', then scored on `test`.
logistic = LogisticRegression(
    solver='newton-cg',
    multi_class='multinomial',
    random_state=0,
)
X_train, y_train = train.drop(columns=['Class']), train['Class']
logistic.fit(X_train, y_train)

X_test, y_test = test.drop(columns=['Class']), test['Class'].values
predict_log = logistic.predict(X_test)

# Recall, precision and F-beta (beta=2) of the test predictions.
recall_log = recall_score(y_true=y_test, y_pred=predict_log)
precision_log = precision_score(y_true=y_test, y_pred=predict_log)
fbeta_log = fbeta_score(y_true=y_test, y_pred=predict_log, beta=2)

print(
    'Recall:', recall_log,
    '\nPrecision:', precision_log,
    '\nF-score:', fbeta_log,
)

# Final expression so the notebook displays the confusion matrix.
cnf_matrix_log = confusion_matrix(y_true=y_test, y_pred=predict_log)
cnf_matrix_log

Recall: 0.5959595959595959 
Precision: 0.8676470588235294 
F-score: 0.6357758620689654


array([[56854,     9],
       [   40,    59]])

In [7]:
# Gaussian naive Bayes with default settings, fitted on `train` and
# scored on `test`.
X_train, y_train = train.drop(columns=['Class']), train['Class']
gnb = GaussianNB()
gnb.fit(X_train, y_train)

X_test, y_test = test.drop(columns=['Class']), test['Class'].values
predict_gnb = gnb.predict(X_test)

# Recall, precision and F-beta (beta=2) of the test predictions.
recall_gnb = recall_score(y_true=y_test, y_pred=predict_gnb)
precision_gnb = precision_score(y_true=y_test, y_pred=predict_gnb)
fbeta_gnb = fbeta_score(y_true=y_test, y_pred=predict_gnb, beta=2)

print(
    'Recall:', recall_gnb,
    '\nPrecision:', precision_gnb,
    '\nF-score:', fbeta_gnb,
)

# Final expression so the notebook displays the confusion matrix.
cnf_matrix_gnb = confusion_matrix(y_true=y_test, y_pred=predict_gnb)
cnf_matrix_gnb

Recall: 0.8282828282828283 
Precision: 0.05758426966292135 
F-score: 0.22527472527472528


array([[55521,  1342],
       [   17,    82]])

In [219]:
# Entropy-criterion decision tree limited to depth 5, fitted on `train`
# and scored on `test`.
# random_state is pinned so that ties between equally good splits are
# broken the same way on every run; the splits and the other seeded
# models in this notebook are already reproducible.
tree = DecisionTreeClassifier(criterion='entropy', max_depth=5, random_state=42)
tree.fit(train.drop(columns=['Class']), train['Class'])

X_test, y_test = test.drop(columns=['Class']), test['Class'].values
predict_tree = tree.predict(X_test)

recall_tree = recall_score(y_true=y_test, y_pred=predict_tree)
precision_tree = precision_score(y_true=y_test, y_pred=predict_tree)
# NOTE(review): this cell scores with beta=1 while most other model cells
# in the notebook use beta=2 -- confirm which weighting is intended before
# comparing the printed 'F-score' values across models.
fbeta_tree = fbeta_score(y_true=y_test, y_pred=predict_tree, beta=1)

print(
    'Recall:', recall_tree,
    '\nPrecision:', precision_tree,
    '\nF-score:', fbeta_tree,
)

# Final expression so the notebook displays the confusion matrix.
cnf_matrix_tree = confusion_matrix(y_true=y_test, y_pred=predict_tree)
cnf_matrix_tree

Recall: 0.8080808080808081 
Precision: 0.8695652173913043 
F-score: 0.837696335078534


array([[56851,    12],
       [   19,    80]])

In [217]:
# Score the already-fitted `tree` on the validation frame. This
# overwrites the *_tree metric variables with the validation results.
X_valid, y_valid = valid.drop(columns=['Class']), valid['Class'].values
predict_tree = tree.predict(X_valid)

recall_tree = recall_score(y_true=y_valid, y_pred=predict_tree)
precision_tree = precision_score(y_true=y_valid, y_pred=predict_tree)
fbeta_tree = fbeta_score(y_true=y_valid, y_pred=predict_tree, beta=2)

print(
    'Recall:', recall_tree,
    '\nPrecision:', precision_tree,
    '\nF-score:', fbeta_tree,
)

# Final expression so the notebook displays the confusion matrix.
cnf_matrix_tree = confusion_matrix(y_true=y_valid, y_pred=predict_tree)
cnf_matrix_tree

Recall: 0.826530612244898 
Precision: 0.9101123595505618 
F-score: 0.8419958419958419


array([[56855,     8],
       [   17,    81]])

In [13]:
# Linear-kernel SVC (C=0.4). Only the first 20000 rows of the shuffled
# `train` frame are used for fitting; scoring is on the full `test` frame.
svm_train = train.iloc[:20000]
SVM = SVC(C=0.4, kernel='linear')
SVM.fit(svm_train.drop(columns=['Class']), svm_train['Class'])

X_test, y_test = test.drop(columns=['Class']), test['Class'].values
predict = SVM.predict(X_test)

# Recall, precision and F-beta (beta=2) of the test predictions.
recall = recall_score(y_true=y_test, y_pred=predict)
precision = precision_score(y_true=y_test, y_pred=predict)
fbeta = fbeta_score(y_true=y_test, y_pred=predict, beta=2)

print(
    'Recall:', recall,
    '\nPrecision:', precision,
    '\nF-score:', fbeta,
)

# Final expression so the notebook displays the confusion matrix.
cnf_matrix = confusion_matrix(y_true=y_test, y_pred=predict)
cnf_matrix

Recall: 0.5050505050505051 
Precision: 0.9259259259259259 
F-score: 0.5555555555555557


array([[56859,     4],
       [   49,    50]])

In [9]:
# Linear discriminant analysis with default settings, fitted on `train`
# and scored on `test`.
X_train, y_train = train.drop(columns=['Class']), train['Class']
lda = LinearDiscriminantAnalysis()
lda.fit(X_train, y_train)

X_test, y_test = test.drop(columns=['Class']), test['Class'].values
predict = lda.predict(X_test)

# Recall, precision and F-beta (beta=2) of the test predictions.
recall = recall_score(y_true=y_test, y_pred=predict)
precision = precision_score(y_true=y_test, y_pred=predict)
fbeta = fbeta_score(y_true=y_test, y_pred=predict, beta=2)

print(
    'Recall:', recall,
    '\nPrecision:', precision,
    '\nF-score:', fbeta,
)

# Final expression so the notebook displays the confusion matrix.
cnf_matrix = confusion_matrix(y_true=y_test, y_pred=predict)
cnf_matrix

Recall: 0.7777777777777778 
Precision: 0.875 
F-score: 0.7954545454545454


array([[56852,    11],
       [   22,    77]])

In [10]:
# Quadratic discriminant analysis with default settings, fitted on
# `train` and scored on `test`.
X_train, y_train = train.drop(columns=['Class']), train['Class']
qda = QuadraticDiscriminantAnalysis()
qda.fit(X_train, y_train)

X_test, y_test = test.drop(columns=['Class']), test['Class'].values
predict = qda.predict(X_test)

# Recall, precision and F-beta (beta=2) of the test predictions.
recall = recall_score(y_true=y_test, y_pred=predict)
precision = precision_score(y_true=y_test, y_pred=predict)
fbeta = fbeta_score(y_true=y_test, y_pred=predict, beta=2)

print(
    'Recall:', recall,
    '\nPrecision:', precision,
    '\nF-score:', fbeta,
)

# Final expression so the notebook displays the confusion matrix.
cnf_matrix = confusion_matrix(y_true=y_test, y_pred=predict)
cnf_matrix

Recall: 0.8484848484848485 
Precision: 0.05286343612334802 
F-score: 0.2115869017632242


array([[55358,  1505],
       [   15,    84]])

In [11]:
# Random forest with default hyper-parameters, fitted on `train` and
# scored on `test`.
# random_state is pinned because the forest's bootstrap sampling and
# per-split feature sub-sampling are random; without a seed the metrics
# below change on every Restart & Run All. 42 matches the seed used for
# the data splits.
rfc = RandomForestClassifier(random_state=42)
rfc.fit(train.drop(columns=['Class']), train['Class'])

X_test, y_test = test.drop(columns=['Class']), test['Class'].values
predict = rfc.predict(X_test)

# Recall, precision and F-beta (beta=2) of the test predictions.
recall = recall_score(y_true=y_test, y_pred=predict)
precision = precision_score(y_true=y_test, y_pred=predict)
fbeta = fbeta_score(y_true=y_test, y_pred=predict, beta=2)

print(
    'Recall:', recall,
    '\nPrecision:', precision,
    '\nF-score:', fbeta,
)

# Final expression so the notebook displays the confusion matrix.
cnf_matrix = confusion_matrix(y_true=y_test, y_pred=predict)
cnf_matrix



Recall: 0.7474747474747475 
Precision: 0.925 
F-score: 0.7773109243697479


array([[56857,     6],
       [   25,    74]])

In [12]:
# AdaBoost ensemble of 50 estimators with Gaussian naive Bayes as the
# base learner, fitted on `train` and scored on `test`.
X_train, y_train = train.drop(columns=['Class']), train['Class']
adb = AdaBoostClassifier(GaussianNB(), n_estimators=50)
adb.fit(X_train, y_train)

X_test, y_test = test.drop(columns=['Class']), test['Class'].values
predict = adb.predict(X_test)

# Recall, precision and F-beta (beta=2) of the test predictions.
recall = recall_score(y_true=y_test, y_pred=predict)
precision = precision_score(y_true=y_test, y_pred=predict)
fbeta = fbeta_score(y_true=y_test, y_pred=predict, beta=2)

print(
    'Recall:', recall,
    '\nPrecision:', precision,
    '\nF-score:', fbeta,
)

# Final expression so the notebook displays the confusion matrix.
cnf_matrix = confusion_matrix(y_true=y_test, y_pred=predict)
cnf_matrix

Recall: 0.7272727272727273 
Precision: 0.029715229054890633 
F-score: 0.12770485987938987


array([[54512,  2351],
       [   27,    72]])

In [65]:
# Multi-layer perceptron with four hidden layers (5, 4, 3, 3 units),
# trained with the lbfgs solver on `train` and scored on `test`.
nn = MLPClassifier(
    hidden_layer_sizes=(5, 4, 3, 3),
    solver='lbfgs',
    alpha=1e-4,
    random_state=1,
)
X_train, y_train = train.drop(columns=['Class']), train['Class']
nn.fit(X_train, y_train)

X_test, y_test = test.drop(columns=['Class']), test['Class'].values
predict = nn.predict(X_test)

# Recall, precision and F-beta (beta=2) of the test predictions.
recall = recall_score(y_true=y_test, y_pred=predict)
precision = precision_score(y_true=y_test, y_pred=predict)
fbeta = fbeta_score(y_true=y_test, y_pred=predict, beta=2)

print(
    'Recall:', recall,
    '\nPrecision:', precision,
    '\nF-score:', fbeta,
)

# Final expression so the notebook displays the confusion matrix.
cnf_matrix = confusion_matrix(y_true=y_test, y_pred=predict)
cnf_matrix

Recall: 0.7676767676767676 
Precision: 0.8085106382978723 
F-score: 0.7755102040816325


array([[56845,    18],
       [   23,    76]])

In [66]:
# Score the already-fitted `nn` on the validation frame.
X_valid, y_valid = valid.drop(columns=['Class']), valid['Class'].values
predict = nn.predict(X_valid)

# Recall, precision and F-beta (beta=2) of the validation predictions.
recall = recall_score(y_true=y_valid, y_pred=predict)
precision = precision_score(y_true=y_valid, y_pred=predict)
fbeta = fbeta_score(y_true=y_valid, y_pred=predict, beta=2)

print(
    'Recall:', recall,
    '\nPrecision:', precision,
    '\nF-score:', fbeta,
)

# Final expression so the notebook displays the confusion matrix.
cnf_matrix = confusion_matrix(y_true=y_valid, y_pred=predict)
cnf_matrix

Recall: 0.7755102040816326 
Precision: 0.8539325842696629 
F-score: 0.79002079002079


array([[56850,    13],
       [   22,    76]])

In [67]:
# Score the already-fitted `nn` on the frame it was trained on, for
# comparison with the validation and test figures.
X_train, y_train = train.drop(columns=['Class']), train['Class'].values
predict = nn.predict(X_train)

# Recall, precision and F-beta (beta=2) of the training predictions.
recall = recall_score(y_true=y_train, y_pred=predict)
precision = precision_score(y_true=y_train, y_pred=predict)
fbeta = fbeta_score(y_true=y_train, y_pred=predict, beta=2)

print(
    'Recall:', recall,
    '\nPrecision:', precision,
    '\nF-score:', fbeta,
)

# Final expression so the notebook displays the confusion matrix.
cnf_matrix = confusion_matrix(y_true=y_train, y_pred=predict)
cnf_matrix

Recall: 0.7254237288135593 
Precision: 0.8136882129277566 
F-score: 0.7415107415107415


array([[170540,     49],
       [    81,    214]])

In [None]:
# Gradient-boosted trees (max depth 4, reg_lambda 0.5), fitted on
# `train` and scored on `test`.
X_train, y_train = train.drop(columns=['Class']), train['Class']
xg = XGBClassifier(reg_lambda=0.5, max_depth=4)
xg.fit(X_train, y_train)

X_test, y_test = test.drop(columns=['Class']), test['Class'].values
predict = xg.predict(X_test)

# Recall, precision and F-beta (beta=2) of the test predictions.
recall = recall_score(y_true=y_test, y_pred=predict)
precision = precision_score(y_true=y_test, y_pred=predict)
fbeta = fbeta_score(y_true=y_test, y_pred=predict, beta=2)

print(
    'Recall:', recall,
    '\nPrecision:', precision,
    '\nF-score:', fbeta,
)

# Final expression so the notebook displays the confusion matrix.
cnf_matrix = confusion_matrix(y_true=y_test, y_pred=predict)
cnf_matrix

In [42]:
# Score the already-fitted `xg` on the frame it was trained on.
X_train, y_train = train.drop(columns=['Class']), train['Class'].values
predict = xg.predict(X_train)

# Recall, precision and F-beta (beta=2) of the training predictions.
recall = recall_score(y_true=y_train, y_pred=predict)
precision = precision_score(y_true=y_train, y_pred=predict)
fbeta = fbeta_score(y_true=y_train, y_pred=predict, beta=2)

print(
    'Recall:', recall,
    '\nPrecision:', precision,
    '\nF-score:', fbeta,
)

# Final expression so the notebook displays the confusion matrix.
cnf_matrix = confusion_matrix(y_true=y_train, y_pred=predict)
cnf_matrix

Recall: 0.8406779661016949 
Precision: 0.9880478087649402 
F-score: 0.8665269042627531


array([[170586,      3],
       [    47,    248]])

In [69]:
# Fit one 3-component Gaussian mixture per class, each on that class's
# training rows with the 'Class' column removed. The two models share
# every setting except the convergence tolerance.
gmm_settings = dict(n_components=3, n_init=5, max_iter=500, random_state=42)

gmm_normal = GaussianMixture(tol=1e-6, **gmm_settings)
gmm_normal.fit(train_normal.drop(columns=['Class']))

gmm_anomaly = GaussianMixture(tol=1e-5, **gmm_settings)
# Kept as the final expression so the notebook shows the fitted model.
gmm_anomaly.fit(train_anomaly.drop(columns=['Class']))

GaussianMixture(covariance_type='full', init_params='kmeans', max_iter=500,
        means_init=None, n_components=3, n_init=5, precisions_init=None,
        random_state=42, reg_covar=1e-06, tol=1e-05, verbose=0,
        verbose_interval=10, warm_start=False, weights_init=None)

In [132]:
# One score per validation row: the normal mixture's score_samples
# output minus the anomaly mixture's score_samples output.
X_valid = valid.drop(columns=['Class'])
score_under_normal = gmm_normal.score_samples(X_valid)
score_under_anomaly = gmm_anomaly.score_samples(X_valid)
f_g_valid = score_under_normal - score_under_anomaly

In [203]:
# Fit a small entropy tree (at most 3 leaves) on the validation-set GMM
# score difference as its single feature, with the validation labels as
# the target.
gmm_tree = DecisionTreeClassifier(max_leaf_nodes=3, criterion='entropy')
valid_score_column = f_g_valid.reshape(-1, 1)
gmm_tree.fit(valid_score_column, valid['Class'])

# Build the same score difference for the test frame and classify it.
X_test, y_test = test.drop(columns=['Class']), test['Class'].values
f_g_test = gmm_normal.score_samples(X_test) - gmm_anomaly.score_samples(X_test)
predict_gmm_tree = gmm_tree.predict(f_g_test.reshape(-1, 1))

# Recall, precision and F-beta (beta=1) of the test predictions.
recall_gmm_tree = recall_score(y_true=y_test, y_pred=predict_gmm_tree)
precision_gmm_tree = precision_score(y_true=y_test, y_pred=predict_gmm_tree)
fbeta_gmm_tree = fbeta_score(y_true=y_test, y_pred=predict_gmm_tree, beta=1)

print(
    'Recall:', recall_gmm_tree,
    '\nPrecision:', precision_gmm_tree,
    '\nF-score:', fbeta_gmm_tree,
)

# Final expression so the notebook displays the confusion matrix.
cnf_matrix_gmm_tree = confusion_matrix(y_true=y_test, y_pred=predict_gmm_tree)
cnf_matrix_gmm_tree

Recall: 0.8080808080808081 
Precision: 0.8247422680412371 
F-score: 0.8163265306122448


array([[56846,    17],
       [   19,    80]])