In [1]:
%matplotlib notebook
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import fbeta_score, precision_score, recall_score, confusion_matrix
from sklearn.cluster import KMeans
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier

In [2]:
data = pd.read_csv('../data/creditcard.csv')
x = np.array(data.loc[1:len(data)-1, 'Time'])
y = np.array(data.loc[0:len(data)-2, 'Time'])
data.loc[1:, 'Time'] = x-y
normal = data[data['Class']==0]
anomaly = data[data['Class']==1]

In [3]:
train_normal, test_normal = train_test_split(normal, test_size=0.2, random_state=42)
valid_normal, test_normal = train_test_split(test_normal, test_size=0.5, random_state=42)
train_anomaly, test_anomaly = train_test_split(anomaly, test_size=0.2, random_state=42)
valid_anomaly, test_anomaly = train_test_split(test_anomaly, test_size=0.5, random_state=42)

for x in [train_normal, valid_normal, test_normal, train_anomaly, valid_anomaly, test_anomaly]:
    x.reset_index(drop=True, inplace=True)

print('Normal Train:', train_normal.shape, 
      'Normal Valid:', valid_normal.shape, 
      'Normal Test:', test_normal.shape)
print('Anomaly Train:', train_anomaly.shape, 
      'Anomaly Valid:', valid_anomaly.shape, 
      'Anomaly Test:', test_anomaly.shape)

Normal Train: (227452, 31) Normal Valid: (28431, 31) Normal Test: (28432, 31)
Anomaly Train: (393, 31) Anomaly Valid: (49, 31) Anomaly Test: (50, 31)


In [4]:
train = train_normal.append(train_anomaly).sample(frac=1, random_state=42).reset_index(drop=True)
valid = valid_normal.append(valid_anomaly).sample(frac=1, random_state=42).reset_index(drop=True)
test = test_normal.append(test_anomaly).sample(frac=1, random_state=42).reset_index(drop=True)

In [3]:
df = pd.DataFrame(data)
outlier_length = int(sum(label))
total_length = data.shape[0]
feature_size = data.shape[1]
# data_outlier = np.zeros([outlier_length, feature_size + 1])
data_outlier = np.ones([outlier_length, feature_size + 1])
data_normal = np.zeros([total_length - outlier_length, feature_size + 1])
# data_normal = np.kron(np.arange(1, 101).reshape(-1, 1), 
#                       np.ones([int((total_length - outlier_length) / 100), 
#                                feature_size + 1]))
#data_normal = np.append(data_normal, 99 * np.ones([feature_size + 1]).reshape(1, -1), 
#                        axis=0)
data_outlier[:, :-1] = df[label == 1]
data_normal[:, :-1] = df[label != 1]

In [4]:
np.random.shuffle(data_normal)
np.random.shuffle(data_outlier)
data_train_normal = data_normal[0:226640]
data_val_normal = data_normal[226640:254970]
data_test_normal = data_normal[254970:]
data_train_outlier = data_outlier[0:2200]
data_val_outlier = data_outlier[2200:2475]
data_test_outlier = data_outlier[2475:]
data_train = np.concatenate((data_train_normal, data_train_outlier), axis=0)
data_val = np.concatenate((data_val_normal, data_val_outlier), axis=0)
data_test = np.concatenate((data_test_normal, data_test_outlier), axis=0)
np.random.shuffle(data_train)
np.random.shuffle(data_val)
np.random.shuffle(data_test)

def shuffle():
    np.random.shuffle(data_normal)
    np.random.shuffle(data_outlier)
    data_train_normal = data_normal[0:226640]
    data_val_normal = data_normal[226640:254970]
    data_test_normal = data_normal[254970:]
    data_train_outlier = data_outlier[0:2200]
    data_val_outlier = data_outlier[2200:2475]
    data_test_outlier = data_outlier[2475:]
    data_train = np.concatenate((data_train_normal, data_train_outlier), axis=0)
    data_val = np.concatenate((data_val_normal, data_val_outlier), axis=0)
    data_test = np.concatenate((data_test_normal, data_test_outlier), axis=0)
    np.random.shuffle(data_train)
    np.random.shuffle(data_val)
    np.random.shuffle(data_test)

In [7]:
km = KMeans(n_clusters=10)

In [8]:
km.fit(data)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=10, n_init=10, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [9]:
sum(km.predict(data))

1341596

In [30]:
logistic = LogisticRegression(random_state=0, solver='newton-cg',
                         multi_class='multinomial')
logistic.fit(train.drop(columns=['Class']), train['Class'])

predict_log = clf.predict(test.drop(columns=['Class']))

x = test
recall_log = recall_score(y_true=x['Class'].values, y_pred=predict_log)
precision_log = precision_score(y_true=x['Class'].values, y_pred=predict_log)
fbeta_log = fbeta_score(y_true=x['Class'].values, y_pred=predict_log, beta=2)

print('Recall:', recall_log, '\nPrecision:', precision_log, '\nF-score:', fbeta_log)

cnf_matrix_log = confusion_matrix(y_true=x['Class'].values, y_pred=predict_log)
cnf_matrix_log

Recall: 0.64 
Precision: 0.9411764705882353 
F-score: 0.6837606837606838


array([[28430,     2],
       [   18,    32]])

In [29]:
gnb = GaussianNB()
gnb.fit(train.drop(columns=['Class']), train['Class'])

predict_gnb = gnb.predict(test.drop(columns=['Class']))

x = test
recall_gnb = recall_score(y_true=x['Class'].values, y_pred=predict_gnb)
precision_gnb = precision_score(y_true=x['Class'].values, y_pred=predict_gnb)
fbeta_gnb = fbeta_score(y_true=x['Class'].values, y_pred=predict_gnb, beta=2)

print('Recall:', recall_gnb, '\nPrecision:', precision_gnb, '\nF-score:', fbeta_gnb)

cnf_matrix_gnb = confusion_matrix(y_true=x['Class'].values, y_pred=predict_gnb)
cnf_matrix_gnb

Recall: 0.82 
Precision: 0.06047197640117994 
F-score: 0.23348519362186787


array([[27795,   637],
       [    9,    41]])

In [34]:
tree = DecisionTreeClassifier(random_state=0)
tree.fit(train.drop(columns=['Class']), train['Class'])

predict_tree = tree.predict(test.drop(columns=['Class']))

x = test
recall_tree = recall_score(y_true=x['Class'].values, y_pred=predict_tree)
precision_tree = precision_score(y_true=x['Class'].values, y_pred=predict_tree)
fbeta_tree = fbeta_score(y_true=x['Class'].values, y_pred=predict_tree, beta=2)

print('Recall:', recall_tree, '\nPrecision:', precision_tree, '\nF-score:', fbeta_tree)

cnf_matrix_tree = confusion_matrix(y_true=x['Class'].values, y_pred=predict_tree)
cnf_matrix_tree

Recall: 0.78 
Precision: 0.7090909090909091 
F-score: 0.7647058823529412


array([[28416,    16],
       [   11,    39]])

In [10]:
SVM = SVC(kernel='linear',C=0.4)
SVM.fit(train.iloc[0:20000].drop(columns=['Class']), train.iloc[0:20000]['Class'])

predict = SVM.predict(test.drop(columns=['Class']))

x = test
recall = recall_score(y_true=x['Class'].values, y_pred=predict)
precision = precision_score(y_true=x['Class'].values, y_pred=predict)
fbeta = fbeta_score(y_true=x['Class'].values, y_pred=predict, beta=2)

print('Recall:', recall, '\nPrecision:', precision, '\nF-score:', fbeta)

cnf_matrix = confusion_matrix(y_true=x['Class'].values, y_pred=predict)
cnf_matrix

Recall: 0.68 
Precision: 0.8292682926829268 
F-score: 0.7053941908713693


array([[28425,     7],
       [   16,    34]])

In [7]:
lda = LinearDiscriminantAnalysis()
lda.fit(train.drop(columns=['Class']), train['Class'])

predict = lda.predict(test.drop(columns=['Class']))

x = test
recall = recall_score(y_true=x['Class'].values, y_pred=predict)
precision = precision_score(y_true=x['Class'].values, y_pred=predict)
fbeta = fbeta_score(y_true=x['Class'].values, y_pred=predict, beta=2)

print('Recall:', recall, '\nPrecision:', precision, '\nF-score:', fbeta)

cnf_matrix = confusion_matrix(y_true=x['Class'].values, y_pred=predict)
cnf_matrix

Recall: 0.74 
Precision: 0.8809523809523809 
F-score: 0.7644628099173555


array([[28427,     5],
       [   13,    37]])

In [36]:
qda = QuadraticDiscriminantAnalysis()
qda.fit(train.drop(columns=['Class']), train['Class'])

predict = qda.predict(test.drop(columns=['Class']))

x = test
recall = recall_score(y_true=x['Class'].values, y_pred=predict)
precision = precision_score(y_true=x['Class'].values, y_pred=predict)
fbeta = fbeta_score(y_true=x['Class'].values, y_pred=predict, beta=2)

print('Recall:', recall, '\nPrecision:', precision, '\nF-score:', fbeta)

cnf_matrix = confusion_matrix(y_true=x['Class'].values, y_pred=predict)
cnf_matrix

Recall: 0.9 
Precision: 0.06089309878213803 
F-score: 0.23961661341853036


array([[27738,   694],
       [    5,    45]])

In [5]:
rfc = RandomForestClassifier()
rfc.fit(train.drop(columns=['Class']), train['Class'])

predict = rfc.predict(test.drop(columns=['Class']))

x = test
recall = recall_score(y_true=x['Class'].values, y_pred=predict)
precision = precision_score(y_true=x['Class'].values, y_pred=predict)
fbeta = fbeta_score(y_true=x['Class'].values, y_pred=predict, beta=2)

print('Recall:', recall, '\nPrecision:', precision, '\nF-score:', fbeta)

cnf_matrix = confusion_matrix(y_true=x['Class'].values, y_pred=predict)
cnf_matrix



Recall: 0.78 
Precision: 0.9285714285714286 
F-score: 0.8057851239669421


array([[28429,     3],
       [   11,    39]])

In [29]:
adb = AdaBoostClassifier(GaussianNB(),
                         n_estimators=50)
adb.fit(train.drop(columns=['Class']), train['Class'])

predict = adb.predict(test.drop(columns=['Class']))

x = test
recall = recall_score(y_true=x['Class'].values, y_pred=predict)
precision = precision_score(y_true=x['Class'].values, y_pred=predict)
fbeta = fbeta_score(y_true=x['Class'].values, y_pred=predict, beta=2)

print('Recall:', recall, '\nPrecision:', precision, '\nF-score:', fbeta)

cnf_matrix = confusion_matrix(y_true=x['Class'].values, y_pred=predict)
cnf_matrix

Recall: 0.1 
Precision: 0.38461538461538464 
F-score: 0.11737089201877934


array([[28424,     8],
       [   45,     5]])