In [2]:
%matplotlib notebook
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import fbeta_score, precision_score, recall_score, confusion_matrix
from sklearn.cluster import KMeans
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.tree import DecisionTreeClassifier

In [3]:
# Load the raw transactions.
data = pd.read_csv('../data/creditcard.csv')

# Replace 'Time' with the difference to the previous row's 'Time'.
# Row 0 has no predecessor and keeps its original value.
# The right-hand side is fully evaluated into a new array before the
# assignment, so reading from `.values` here is safe.
time_values = data['Time'].values
data.loc[1:, 'Time'] = time_values[1:] - time_values[:-1]

# Split by label: Class == 0 -> normal, Class == 1 -> anomaly.
is_anomaly = data['Class'] == 1
normal = data[data['Class'] == 0]
anomaly = data[is_anomaly]

In [4]:
# 60/20/20 train/valid/test split, performed separately for each class:
# first hold out 40 %, then cut that holdout in half.
train_normal, holdout_normal = train_test_split(normal, test_size=0.4, random_state=42)
valid_normal, test_normal = train_test_split(holdout_normal, test_size=0.5, random_state=42)
train_anomaly, holdout_anomaly = train_test_split(anomaly, test_size=0.4, random_state=42)
valid_anomaly, test_anomaly = train_test_split(holdout_anomaly, test_size=0.5, random_state=42)

# Give every split a fresh 0..n-1 index. Rebinding (instead of inplace=True in
# a loop) avoids mutating the split results and does not leak a loop variable
# into the notebook namespace.
(train_normal, valid_normal, test_normal,
 train_anomaly, valid_anomaly, test_anomaly) = (
    frame.reset_index(drop=True)
    for frame in (train_normal, valid_normal, test_normal,
                  train_anomaly, valid_anomaly, test_anomaly)
)

print('Normal Train:', train_normal.shape, 
      'Normal Valid:', valid_normal.shape, 
      'Normal Test:', test_normal.shape)
print('Anomaly Train:', train_anomaly.shape, 
      'Anomaly Valid:', valid_anomaly.shape, 
      'Anomaly Test:', test_anomaly.shape)

Normal Train: (170589, 31) Normal Valid: (56863, 31) Normal Test: (56863, 31)
Anomaly Train: (295, 31) Anomaly Valid: (98, 31) Anomaly Test: (99, 31)


In [5]:
# Cut the normal training rows positionally: the first fraction `p` goes to
# train_normal_1 and the remainder to train_normal_2.
p = 0.9
split_at = int(p * train_normal.shape[0])
train_normal_1 = train_normal.iloc[:split_at]
train_normal_2 = train_normal.iloc[split_at:]

In [6]:
# Display the (rows, columns) shape of the tail slice train_normal_2.
train_normal_2.shape

(17059, 31)

In [7]:
# Stack normal and anomalous rows for each split, then shuffle with a fixed seed.
# pd.concat replaces DataFrame.append (removed in pandas 2.0); the row order
# before shuffling is the same, so the seeded shuffle is unchanged.
train = pd.concat([train_normal, train_anomaly]).sample(frac=1, random_state=42).reset_index(drop=True)
valid = pd.concat([valid_normal, valid_anomaly]).sample(frac=1, random_state=42).reset_index(drop=True)
test = pd.concat([test_normal, test_anomaly]).sample(frac=1, random_state=42).reset_index(drop=True)

In [8]:
def _score_split(model, frame):
    """Return (recall, precision, F1) of ``model`` on one labelled frame.

    ``frame`` must contain a 'Class' column holding the true labels; all
    other columns are passed to ``model.predict``.
    """
    y_true = frame['Class'].values
    y_pred = model.predict(frame.drop(columns=['Class']))
    recall = recall_score(y_true=y_true, y_pred=y_pred)
    precision = precision_score(y_true=y_true, y_pred=y_pred)
    f_score = fbeta_score(y_true=y_true, y_pred=y_pred, beta=1)
    return recall, precision, f_score


def print_data(model):
    """Print recall, precision and F-score of ``model`` on train, valid and test.

    Reads the notebook-level ``train``, ``valid`` and ``test`` DataFrames.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict(X)``.
    """
    # (label printed before the recall value, split name, data frame).
    # The first label keeps its original spelling so the printed text is unchanged.
    splits = [
        ('Train: Recall:', 'Train', train),
        ('Valid Recall:', 'Valid', valid),
        ('Test Recall:', 'Test', test),
    ]
    for recall_label, name, frame in splits:
        recall, precision, f_score = _score_split(model, frame)
        print(recall_label, recall,
              '\n' + name + ' Precision:', precision,
              '\n' + name + ' F-score:', f_score)

In [9]:
class KNDClassifier(BaseEstimator, ClassifierMixin):
    """Decision tree trained on Euclidean distances to k-means centres.

    K-means is fitted on one sample set; every sample is then described by
    its distance to each cluster centre and a decision tree is trained on
    those distance features.

    Parameters
    ----------
    n_cluster : int, default=10
        Number of k-means clusters, i.e. number of distance features.
    tree_depth : int, default=5
        ``max_depth`` of the decision tree.
    """

    def __init__(self, n_cluster = 10, tree_depth = 5):
        self.n_cluster = n_cluster
        self.tree_depth = tree_depth

    def _distance_features(self, X):
        """Return the (n_samples, n_centres) Euclidean distances from X to ``self.centers``."""
        X = np.asarray(X)
        # Broadcasting builds the (n_samples, n_centres, n_features) difference
        # directly, without first materialising two repeated copies.
        diff = X[:, np.newaxis, :] - self.centers[np.newaxis, :, :]
        return np.sqrt(np.sum(np.square(diff), axis=2))

    def fit(self, X1, X2, y):
        """Fit k-means on ``X1``, then fit the tree on ``X2`` with labels ``y``.

        Parameters
        ----------
        X1 : array-like of shape (n1, n_features)
            Samples used to fit the k-means centres.
        X2 : array-like of shape (n2, n_features)
            Samples the tree is trained on, after conversion to distances.
        y : array-like of shape (n2,)
            Labels for ``X2``.

        Returns
        -------
        self
        """
        self.kmeans = KMeans(n_clusters=self.n_cluster, random_state=42)
        self.kmeans.fit(X1)
        self.centers = self.kmeans.cluster_centers_
        # Seed the tree as well: even with the "best" splitter, ties between
        # equally good splits are broken at random.
        self.tree = DecisionTreeClassifier(criterion='entropy',
                                           max_depth=self.tree_depth,
                                           random_state=42)
        self.tree.fit(self._distance_features(X2), y)
        return self

    def predict(self, X):
        """Predict labels for ``X`` from its distances to the fitted centres."""
        return self.tree.predict(self._distance_features(X))

In [10]:
class KBDClassifier(BaseEstimator, ClassifierMixin):
    """Decision tree trained on distances to two sets of k-means centres.

    Two k-means models are fitted on two separate sample sets; their centres
    are concatenated, every sample is described by its Euclidean distance to
    each centre, and a decision tree is trained on those distance features.

    Parameters
    ----------
    n_cluster_normal : int, default=3
        Number of clusters for the k-means fitted on ``X1``.
    n_cluster_outlier : int, default=4
        Number of clusters for the k-means fitted on ``X2``.
    tree_depth : int, default=5
        ``max_depth`` of the decision tree.
    """

    def __init__(self, n_cluster_normal=3, n_cluster_outlier=4, tree_depth = 5):
        self.n_cluster_normal = n_cluster_normal
        self.n_cluster_outlier = n_cluster_outlier
        # Kept for backward compatibility. It is refreshed in fit() and never
        # used for array shapes, because it goes stale after set_params().
        self.n_cluster = self.n_cluster_normal + self.n_cluster_outlier
        self.tree_depth = tree_depth

    def _distance_features(self, X):
        """Return the (n_samples, n_centres) Euclidean distances from X to ``self.centers``."""
        X = np.asarray(X)
        # Broadcasting builds the (n_samples, n_centres, n_features) difference
        # directly, without first materialising two repeated copies.
        diff = X[:, np.newaxis, :] - self.centers[np.newaxis, :, :]
        return np.sqrt(np.sum(np.square(diff), axis=2))

    def fit(self, X1, X2, X3, y):
        """Fit both k-means models, then fit the tree on ``X3`` with labels ``y``.

        Parameters
        ----------
        X1 : array-like of shape (n1, n_features)
            Samples for the "normal" k-means.
        X2 : array-like of shape (n2, n_features)
            Samples for the "anomaly" k-means.
        X3 : array-like of shape (n3, n_features)
            Samples the tree is trained on, after conversion to distances.
        y : array-like of shape (n3,)
            Labels for ``X3``.

        Returns
        -------
        self
        """
        self.kmeans_normal = KMeans(n_clusters=self.n_cluster_normal, random_state=42)
        self.kmeans_normal.fit(X1)
        self.centers_normal = self.kmeans_normal.cluster_centers_

        self.kmeans_anomaly = KMeans(n_clusters=self.n_cluster_outlier, random_state=42)
        self.kmeans_anomaly.fit(X2)
        self.centers_anomaly = self.kmeans_anomaly.cluster_centers_

        self.centers = np.concatenate([self.centers_normal, self.centers_anomaly], axis=0)
        # Re-derive from the fitted centres so the value is right even if the
        # cluster counts were changed after construction.
        self.n_cluster = self.centers.shape[0]

        # Seed the tree as well: even with the "best" splitter, ties between
        # equally good splits are broken at random.
        self.tree = DecisionTreeClassifier(criterion='entropy',
                                           max_depth=self.tree_depth,
                                           random_state=42)
        self.tree.fit(self._distance_features(X3), y)
        return self

    def predict(self, X):
        """Predict labels for ``X`` from its distances to the fitted centres."""
        return self.tree.predict(self._distance_features(X))

In [11]:
# KBD model: 5 centres from the normal training rows, 3 from the anomalous
# training rows, and a depth-20 tree trained on the shuffled training set.
kbd = KBDClassifier(n_cluster_normal=5, n_cluster_outlier=3, tree_depth=20)
kbd.fit(
    X1=train_normal.drop(columns=['Class']),
    X2=train_anomaly.drop(columns=['Class']),
    X3=train.drop(columns=['Class']),
    y=train['Class'],
)

KBDClassifier(n_cluster_normal=5, n_cluster_outlier=3, tree_depth=20)

In [12]:
# Report recall, precision and F-score of the fitted KBD model on the
# train, validation and test splits.
print_data(kbd)

Train: Recall: 0.8847457627118644 
Train Precision: 1.0 
Train F-score: 0.9388489208633094
Valid Recall: 0.5306122448979592 
Valid Precision: 0.6933333333333334 
Valid F-score: 0.6011560693641619
Test Recall: 0.5656565656565656 
Test Precision: 0.691358024691358 
Test F-score: 0.6222222222222221


In [167]:
# KND model: 5 centres from the normal training rows only, and a depth-5
# tree trained on the shuffled training set.
knd = KNDClassifier(n_cluster=5, tree_depth=5)
knd.fit(
    X1=train_normal.drop(columns=['Class']),
    X2=train.drop(columns=['Class']),
    y=train['Class'],
)

KNDClassifier(n_cluster=5, tree_depth=5)

In [169]:
# Report recall, precision and F-score of the fitted KND model on the
# train, validation and test splits.
print_data(knd)

Train: Recall: 0.14576271186440679 
Train Precision: 0.8269230769230769 
Train F-score: 0.2478386167146974
Valid Recall: 0.10204081632653061 
Valid Precision: 0.625 
Valid F-score: 0.17543859649122806
Test Recall: 0.1111111111111111 
Test Precision: 0.5789473684210527 
Test F-score: 0.18644067796610167
