In [1]:
%matplotlib notebook
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

In [2]:
data = np.load(file='data/forest_data.npy')
label = np.load(file='data/forest_label.npy')

In [3]:
df = pd.DataFrame(data)
outlier_length = int(sum(label))
total_length = data.shape[0]
feature_size = data.shape[1]
# data_outlier = np.zeros([outlier_length, feature_size + 1])
data_outlier = np.ones([outlier_length, feature_size + 1])
data_normal = np.zeros([total_length - outlier_length, feature_size + 1])
# data_normal = np.kron(np.arange(1, 101).reshape(-1, 1), 
#                       np.ones([int((total_length - outlier_length) / 100), 
#                                feature_size + 1]))
#data_normal = np.append(data_normal, 99 * np.ones([feature_size + 1]).reshape(1, -1), 
#                        axis=0)
data_outlier[:, :-1] = df[label == 1]
data_normal[:, :-1] = df[label != 1]

In [4]:
np.random.shuffle(data_normal)
np.random.shuffle(data_outlier)
data_train_normal = data_normal[0:226640]
data_val_normal = data_normal[226640:254970]
data_test_normal = data_normal[254970:]
data_train_outlier = data_outlier[0:2200]
data_val_outlier = data_outlier[2200:2475]
data_test_outlier = data_outlier[2475:]
data_train = np.concatenate((data_train_normal, data_train_outlier), axis=0)
data_val = np.concatenate((data_val_normal, data_val_outlier), axis=0)
data_test = np.concatenate((data_test_normal, data_test_outlier), axis=0)
np.random.shuffle(data_train)
np.random.shuffle(data_val)
np.random.shuffle(data_test)

def shuffle():
    np.random.shuffle(data_normal)
    np.random.shuffle(data_outlier)
    data_train_normal = data_normal[0:226640]
    data_val_normal = data_normal[226640:254970]
    data_test_normal = data_normal[254970:]
    data_train_outlier = data_outlier[0:2200]
    data_val_outlier = data_outlier[2200:2475]
    data_test_outlier = data_outlier[2475:]
    data_train = np.concatenate((data_train_normal, data_train_outlier), axis=0)
    data_val = np.concatenate((data_val_normal, data_val_outlier), axis=0)
    data_test = np.concatenate((data_test_normal, data_test_outlier), axis=0)
    np.random.shuffle(data_train)
    np.random.shuffle(data_val)
    np.random.shuffle(data_test)

In [7]:
km = KMeans(n_clusters=10)

In [8]:
km.fit(data)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=10, n_init=10, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [9]:
sum(km.predict(data))

1341596

In [107]:
shuffle()

clf = LogisticRegression(random_state=0, solver='lbfgs',
                         multi_class='multinomial')
clf.fit(data_train[:, :-1], data_train[:, -1])

prediction_outlier = clf.predict(data_test_outlier[:, :-1])
prediction_normal = clf.predict(data_test_normal[:, :-1])
prediction = clf.predict(data_test[:, :-1])
print('Outlier set accuracy: ', np.count_nonzero(prediction_outlier) / 
     prediction_outlier.shape[0])
print('Normal set accuracy: ', (prediction_normal.shape[0] - np.count_nonzero(prediction_normal)) 
      / prediction_normal.shape[0])

a = np.count_nonzero(prediction_outlier)
b = np.count_nonzero(prediction_normal)
print('Precision: ', a / (a + b))
print('Recall: ', a / prediction_outlier.shape[0])

Outlier set accuracy:  0.8970588235294118
Normal set accuracy:  0.9992587624863224
Precision:  0.9207547169811321
Recall:  0.8970588235294118


In [106]:
shuffle()

gnb = GaussianNB()
gnb.fit(data_train[:, :-1], data_train[:, -1])

prediction_outlier = gnb.predict(data_test_outlier[:, :-1])
prediction_normal = gnb.predict(data_test_normal[:, :-1])
prediction = gnb.predict(data_test[:, :-1])
print('Outlier set accuracy: ', np.count_nonzero(prediction_outlier) / 
     prediction_outlier.shape[0])
print('Normal set accuracy: ', (prediction_normal.shape[0] - np.count_nonzero(prediction_normal)) 
      / prediction_normal.shape[0])

a = np.count_nonzero(prediction_outlier)
b = np.count_nonzero(prediction_normal)
print('Precision: ', a / (a + b))
print('Recall: ', a / prediction_outlier.shape[0])

Outlier set accuracy:  0.9558823529411765
Normal set accuracy:  0.9944230701351876
Precision:  0.6220095693779905
Recall:  0.9558823529411765


In [13]:
shuffle()

tree = DecisionTreeClassifier(random_state=0)
tree.fit(data_train[:, :-1], data_train[:, -1])

prediction_outlier = tree.predict(data_test_outlier[:, :-1])
prediction_normal = tree.predict(data_test_normal[:, :-1])
prediction = tree.predict(data_test[:, :-1])
print('Outlier set accuracy: ', np.count_nonzero(prediction_outlier) / 
     prediction_outlier.shape[0])
print('Normal set accuracy: ', (prediction_normal.shape[0] - np.count_nonzero(prediction_normal)) 
      / prediction_normal.shape[0])

a = np.count_nonzero(prediction_outlier)
b = np.count_nonzero(prediction_normal)
print('Precision: ', a / (a + b))
print('Recall: ', a / prediction_outlier.shape[0])

Outlier set accuracy:  0.9963235294117647
Normal set accuracy:  0.9999647029755392
Precision:  0.9963235294117647
Recall:  0.9963235294117647


In [15]:
shuffle()

SVM = SVC(kernel='linear',C=0.4)
import time
a = time.time()
SVM.fit(data_train[:, :-1], data_train[:, -1])
b = time.time()
print(b - a)

prediction_outlier = SVM.predict(data_test_outlier[:, :-1])
prediction_normal = SVM.predict(data_test_normal[:, :-1])
prediction = SVM.predict(data_test[:, :-1])
print('Outlier set accuracy: ', np.count_nonzero(prediction_outlier) / 
     prediction_outlier.shape[0])
print('Normal set accuracy: ', (prediction_normal.shape[0] - np.count_nonzero(prediction_normal)) 
      / prediction_normal.shape[0])

a = np.count_nonzero(prediction_outlier)
b = np.count_nonzero(prediction_normal)
print('Precision: ', a / (a + b))
print('Recall: ', a / prediction_outlier.shape[0])

1339.3374779224396
Outlier set accuracy:  0.9301470588235294
Normal set accuracy:  0.9987999011683315
Precision:  0.8815331010452961
Recall:  0.9301470588235294


In [11]:
shuffle()

lda = LinearDiscriminantAnalysis()
lda.fit(data_train[:, :-1], data_train[:, -1])

prediction_outlier = lda.predict(data_test_outlier[:, :-1])
prediction_normal = lda.predict(data_test_normal[:, :-1])
prediction = lda.predict(data_test[:, :-1])
print('Outlier set accuracy: ', np.count_nonzero(prediction_outlier) / 
     prediction_outlier.shape[0])
print('Normal set accuracy: ', (prediction_normal.shape[0] - np.count_nonzero(prediction_normal)) 
      / prediction_normal.shape[0])

a = np.count_nonzero(prediction_outlier)
b = np.count_nonzero(prediction_normal)
print('Precision: ', a / (a + b))
print('Recall: ', a / prediction_outlier.shape[0])

Outlier set accuracy:  0.9007352941176471
Normal set accuracy:  0.9991528714129398
Precision:  0.9107806691449815
Recall:  0.9007352941176471


In [15]:
shuffle()

qda = QuadraticDiscriminantAnalysis()
qda.fit(data_train[:, :-1], data_train[:, -1])

prediction_outlier = qda.predict(data_test_outlier[:, :-1])
prediction_normal = qda.predict(data_test_normal[:, :-1])
prediction = qda.predict(data_test[:, :-1])
print('Outlier set accuracy: ', np.count_nonzero(prediction_outlier) / 
     prediction_outlier.shape[0])
print('Normal set accuracy: ', (prediction_normal.shape[0] - np.count_nonzero(prediction_normal)) 
      / prediction_normal.shape[0])

a = np.count_nonzero(prediction_outlier)
b = np.count_nonzero(prediction_normal)
print('Precision: ', a / (a + b))
print('Recall: ', a / prediction_outlier.shape[0])

Outlier set accuracy:  0.9448529411764706
Normal set accuracy:  0.9980586636546539
Precision:  0.8237179487179487
Recall:  0.9448529411764706


In [16]:
print(np.count_nonzero(prediction_outlier))
print(prediction_outlier.shape[0])

257
272


In [42]:
prediction_outlier = clf.predict(data_test_outlier[:, :-1])
prediction_normal = clf.predict(data_test_normal[:, :-1])
prediction = clf.predict(data_test[:, :-1])
print((prediction_outlier.shape[0] - np.count_nonzero(prediction_outlier)) / 
     prediction_outlier.shape[0])
print(np.count_nonzero(prediction_normal) / prediction_normal.shape[0])

a = prediction_outlier.shape[0] - np.count_nonzero(prediction_outlier)
b = prediction_normal.shape[0] - np.count_nonzero(prediction_normal)
print(a / (a + b))
print(a / prediction_outlier.shape[0])

0.6323529411764706
0.9789982704458015
0.2242503259452412
0.6323529411764706
