In [1]:
from __future__ import division
from __future__ import print_function

import os
import sys
from time import time

# temporary solution for relative imports in case pyod is not installed
# if pyod is installed, no need to use the following line
sys.path.append(
    os.path.abspath(os.path.join(os.path.dirname("__file__"), '..')))
# supress warnings for clean output
import warnings

warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from scipy.io import loadmat

from pyod.models.abod import ABOD
from pyod.models.cblof import CBLOF
from pyod.models.feature_bagging import FeatureBagging
from pyod.models.hbos import HBOS
from pyod.models.iforest import IForest
from pyod.models.knn import KNN
from pyod.models.lof import LOF
from pyod.models.mcd import MCD
from pyod.models.ocsvm import OCSVM
from pyod.models.pca import PCA
from pyod.models.auto_encoder import AutoEncoder
from pyod.models.vae import VAE
from pyod.models.mo_gaal import MO_GAAL
from pyod.models.so_gaal import SO_GAAL

from pyod.utils.utility import standardizer
from pyod.utils.utility import precision_n_scores
from sklearn.metrics import roc_auc_score
import pickle

In [2]:
from imblearn.datasets import make_imbalance

with open('train.pk', 'rb') as f:
    xx, yy = pickle.load(f)
with open('test.pk', 'rb') as f:
    X_test,y_test = pickle.load(f)
X,Y = make_imbalance(xx, yy, sampling_strategy={'normal':2700, 'injection':100, 'impersonation':100, 'flooding':100},random_state=0)
#xx, yy = make_imbalance(X, Y, sampling_strategy={'normal':3000, 'injection':1000, 'impersonation':1000, 'flooding':1000},random_state=0)

In [3]:
import pandas as pd
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', -1)

pd.options.display.max_rows
pd.set_option('display.max_rows', None)

In [4]:
random_state = np.random.RandomState(42)

df_columns = ['Data', '#Samples', '# Dimensions', 'Outlier Perc',
              'ABOD', 'CBLOF', 'FB', 'HBOS', 'IForest', 'KNN', 'LOF', 'MCD',
              'OCSVM', 'PCA', 'AutoEncoder', 'VAE', 'MO_GAAL', 'SO_GAAL']
roc_df = pd.DataFrame(columns=df_columns)
prn_df = pd.DataFrame(columns=df_columns)
time_df = pd.DataFrame(columns=df_columns)

In [5]:
outliers_fraction = 0.1
outliers_percentage = 10

X_train_norm, X_test_norm = X, X_test

In [6]:
import keras
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import LabelBinarizer

encoder = LabelEncoder()
encoded_y = encoder.fit_transform(Y)

binarizer = LabelBinarizer()
binarized_y = binarizer.fit_transform(encoded_y)
binarized_y.shape

encoded_y_test = encoder.transform(y_test)

y = keras.utils.to_categorical(encoded_y)
y_test = keras.utils.to_categorical(encoded_y_test)
print(encoded_y)

[0 0 0 ... 3 3 3]


In [7]:
np.count_nonzero(encoded_y == 3)

2700

In [8]:
for i in range(0,encoded_y.shape[0]):
    if encoded_y[i]==0:
        encoded_y[i]=1
    if encoded_y[i]==2:
        encoded_y[i]=1
for i in range(0,encoded_y.shape[0]):
    if encoded_y[i]==3:
        encoded_y[i]=0
print(np.count_nonzero(encoded_y == 0))
print(np.count_nonzero(encoded_y == 1))#outlier

for i in range(0,encoded_y_test.shape[0]):
    if encoded_y_test[i]==0:
        encoded_y_test[i]=1
    if encoded_y_test[i]==2:
        encoded_y_test[i]=1
for i in range(0,encoded_y_test.shape[0]):
    if encoded_y_test[i]==3:
        encoded_y_test[i]=0
print(np.count_nonzero(encoded_y_test == 0))
print(np.count_nonzero(encoded_y_test == 1))#outlier

2700
300
530785
44858


In [9]:
from sklearn.metrics import accuracy_score,  confusion_matrix

In [10]:
    classifiers = {'Angle-based Outlier Detector (ABOD)': ABOD(
        contamination=outliers_fraction),
        'Cluster-based Local Outlier Factor': CBLOF(
            contamination=outliers_fraction, check_estimator=False,
            random_state=random_state),
        'Feature Bagging': FeatureBagging(contamination=outliers_fraction,
                                          random_state=random_state),
        'Histogram-base Outlier Detection (HBOS)': HBOS(
            contamination=outliers_fraction),
        'Isolation Forest': IForest(contamination=outliers_fraction,
                                    random_state=random_state),
        'K Nearest Neighbors (KNN)': KNN(contamination=outliers_fraction),
        'Local Outlier Factor (LOF)': LOF(
            contamination=outliers_fraction),
        'Minimum Covariance Determinant (MCD)': MCD(
            contamination=outliers_fraction, random_state=random_state),
        'One-class SVM (OCSVM)': OCSVM(contamination=outliers_fraction),
        'Principal Component Analysis (PCA)': PCA(
            contamination=outliers_fraction, random_state=random_state),
                   
        'Auto Encoder': AutoEncoder(contamination=outliers_fraction, random_state=random_state, hidden_neurons= [16,8,8,16]),
         'VAE': VAE(contamination=outliers_fraction, random_state=random_state, encoder_neurons= [16,8,16], decoder_neurons= [16,8,16]),
        'MO_GAAL': MO_GAAL(contamination=outliers_fraction),
        'SO_GAAL': SO_GAAL(contamination=outliers_fraction)
    }
    for clf_name, clf in classifiers.items():
        clf.fit(X_train_norm)
        
        test_scores = clf.decision_function(X_train_norm)
        roc = round(roc_auc_score(encoded_y, test_scores), ndigits=4)
        prn = round(precision_n_scores(encoded_y, test_scores), ndigits=4)
        print(clf_name)
        print(clf_name,"Train dataset:")
        print('ROC:{roc}, precision @ rank n:{prn}'.format(roc=roc, prn=prn))
        
        
        test_scores1 = clf.decision_function(X_test_norm)
        roc1 = round(roc_auc_score(encoded_y_test, test_scores1), ndigits=4)
        prn1 = round(precision_n_scores(encoded_y_test, test_scores1), ndigits=4)
        print("Test dataset:")
        print('ROC:{roc1}, precision @ rank n:{prn1}'.format(roc1=roc1, prn1=prn1))
        
        

Angle-based Outlier Detector (ABOD)
Angle-based Outlier Detector (ABOD) Train dataset:
ROC:0.3172, precision @ rank n:0.0633
Test dataset:
ROC:0.8229, precision @ rank n:0.2451
Cluster-based Local Outlier Factor
Cluster-based Local Outlier Factor Train dataset:
ROC:0.7875, precision @ rank n:0.5133
Test dataset:
ROC:0.7994, precision @ rank n:0.3856
Feature Bagging
Feature Bagging Train dataset:
ROC:0.5259, precision @ rank n:0.1933
Test dataset:
ROC:0.6719, precision @ rank n:0.0718
Histogram-base Outlier Detection (HBOS)
Histogram-base Outlier Detection (HBOS) Train dataset:
ROC:0.6638, precision @ rank n:0.0433
Test dataset:
ROC:0.5096, precision @ rank n:0.0071
Isolation Forest
Isolation Forest Train dataset:
ROC:0.6963, precision @ rank n:0.05
Test dataset:
ROC:0.5941, precision @ rank n:0.0018
K Nearest Neighbors (KNN)
K Nearest Neighbors (KNN) Train dataset:
ROC:0.3725, precision @ rank n:0.1003
Test dataset:
ROC:0.7118, precision @ rank n:0.2795
Local Outlier Factor (LOF)
Local

Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100
Auto Encoder
Auto Encoder Train dataset:
ROC:0.6142, precision @ rank n:0.0233
Test dataset:
ROC:0.5864, precision @ rank n:0.0027
Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape 

Model: "functional_5"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 153)]        0                                            
__________________________________________________________________________________________________
functional_1 (Functional)       [(None, 2), (None, 2 26374       input_1[0][0]                    
__________________________________________________________________________________________________
functional_3 (Functional)       (None, 153)          2935        functional_1[0][2]               
__________________________________________________________________________________________________
dense_7 (Dense)                 (None, 153)          23562       input_1[0][0]                    
_______________________________________________________________________________________

Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100

Epoch 99/100
Epoch 100/100
VAE
VAE Train dataset:
ROC:0.6107, precision @ rank n:0.0233


ValueError: Input contains NaN, infinity or a value too large for dtype('float32').