# Test AD Models 

Test other Anomaly Detection models to compare to EIF+. 

Potential models to test: 

- Deep Isolation Forest (DIF) 
- INNE: Isolation Based AD using nearest neighbours ensemble
- SUOD: This is another ensemble model like EIF 
- Auto Encoder 

In [1]:
import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
from pyod.models.dif import DIF
from pyod.models.inne import INNE
from pyod.models.auto_encoder import AutoEncoder
from pyod.models.suod import SUOD

from scipy.io import loadmat

sys.path.append('../src')
from src.performance_report_functions import *
from src.utils import *

sys.path.append('../models')
from models.forests import *

2024-02-09 16:24:48.980488: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Load Data

In [2]:
os.chdir('../data/real/')
os.getcwd()

'/home/davidefrizzo/Desktop/PHD/ExIFFI/data/real'

In [3]:
X,y=mat_dataset('wine')

wine 

[number of samples = 129]
[percentage outliers = 0.07751937984496124]
[number features = 13]
[number outliers = 10]


In [6]:
X,y=csv_dataset('diabetes',os.getcwd()+'/')

diabetes 

[number of samples = 85916]
[percentage outliers = 0.096582708692211]
[number features = 4]
[number outliers = 8298]


In [4]:
X_train,X_test=partition_data(X,y)
X_train,X_test,X=pre_process(X_train,X_test)
X_train.shape,X_test.shape,X.shape

((119, 13), (129, 13), (129, 13))

## EIF

In [9]:
EIF=ExtendedIsolationForest(n_estimators=300,plus=0)
EIF.fit(X_train)

In [11]:
score_eif=EIF.predict(X_test)
perf=performance_eif(y,score_eif,X_test,EIF)
perf

{'Precision': 0.2,
 'Recall': 0.2,
 'f1 score': 0.20000000000000004,
 'Accuracy': 0.875968992248062,
 'Balanced Accuracy': 0.5663865546218487,
 'Average Precision': 0.1363386518400739,
 'ROC AUC Score': 0.6781512605042017}

## EIF+

In [12]:
EIF_plus=ExtendedIsolationForest(n_estimators=300,plus=1)
EIF_plus.fit(X_train)

In [13]:
score_eif_plus=EIF_plus.predict(X_test)
perf_plus=performance_eif(y,score_eif,X_test,EIF_plus)
perf_plus

{'Precision': 0.1,
 'Recall': 0.1,
 'f1 score': 0.10000000000000002,
 'Accuracy': 0.8604651162790697,
 'Balanced Accuracy': 0.5121848739495799,
 'Average Precision': 0.1363386518400739,
 'ROC AUC Score': 0.6781512605042017}

## Deep Isolation Forest (DIF)

Let's start with all the default parameters except for the contamination parameter that will be set at the real contamination of the dataset

In [8]:
p=sum(y)/len(y)
dif=DIF(contamination=p)
dif.fit(X_train)

DIF(batch_size=1000, contamination=0.07751937984496124,
  device=device(type='cpu'), hidden_activation='tanh',
  hidden_neurons=[500, 100], max_samples=256, n_ensemble=50,
  n_estimators=6, random_state=None, representation_dim=20,
  skip_connection=False)

Testing on X_test concatenation of X_train and X_test training on X_train 

In [9]:
dif_perf,y_pred=performance(dif,X_test,y)
dif_perf

{'Precision': 0.0,
 'Recall': 0.0,
 'f1 score': 0.0,
 'Accuracy': 0.9224806201550387,
 'Balanced Accuracy': 0.5,
 'Average Precision': 0.07751937984496124,
 'ROC AUC Score': 0.5}

Testing on X

In [10]:
dif_perf,y_pred=performance(dif,X,y)
dif_perf

{'Precision': 0.0,
 'Recall': 0.0,
 'f1 score': 0.0,
 'Accuracy': 0.9224806201550387,
 'Balanced Accuracy': 0.5,
 'Average Precision': 0.07751937984496124,
 'ROC AUC Score': 0.5}

## INNE 

In [11]:
p=sum(y)/len(y)
inne=INNE(contamination=p)
inne.fit(X_train)

INNE(contamination=0.07751937984496124, max_samples='auto', n_estimators=200,
   random_state=None)

Testing on X_test

In [12]:
dif_perf,y_pred=performance(inne,X_test,y)
dif_perf

{'Precision': 0.07142857142857142,
 'Recall': 0.1,
 'f1 score': 0.08333333333333333,
 'Accuracy': 0.8294573643410853,
 'Balanced Accuracy': 0.4953781512605042,
 'Average Precision': 0.07691029900332226,
 'ROC AUC Score': 0.4953781512605042}

Testing on X

In [13]:
dif_perf,y_pred=performance(inne,X,y)
dif_perf

{'Precision': 0.07692307692307693,
 'Recall': 0.1,
 'f1 score': 0.08695652173913043,
 'Accuracy': 0.8372093023255814,
 'Balanced Accuracy': 0.4995798319327731,
 'Average Precision': 0.07745974955277281,
 'ROC AUC Score': 0.49957983193277317}

## AutoEncoder

In the AutoEncoder model we have to reduce the dimensionality of the dataset in the hidden layer, so we have to pass the dimensions of each layer in the AutoEncoder in the `hidden_neurons` parameter.



In [6]:
p=sum(y)/len(y)
ae=AutoEncoder(contamination=p,hidden_neurons=[X_train.shape[1],7,7,X_train.shape[1]])
ae.fit(X_train)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 13)                182       
                                                                 
 dropout (Dropout)           (None, 13)                0         
                                                                 
 dense_1 (Dense)             (None, 13)                182       
                                                                 
 dropout_1 (Dropout)         (None, 13)                0         
                                                                 
 dense_2 (Dense)             (None, 13)                182       
                                                                 
 dropout_2 (Dropout)         (None, 13)                0         
                                                                 
 dense_3 (Dense)             (None, 7)                 9

2024-02-09 15:57:08.377740: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.


None
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epo

AutoEncoder(batch_size=32, contamination=0.07751937984496124,
      dropout_rate=0.2, epochs=100, hidden_activation='relu',
      hidden_neurons=[13, 7, 7, 13], l2_regularizer=0.1,
      loss=<function mean_squared_error at 0x7f3a1f5002c0>,
      optimizer='adam', output_activation='sigmoid', preprocessing=True,
      random_state=None, validation_size=0.1, verbose=1)

Testing on X_test

In [7]:
dif_perf,y_pred=performance(ae,X_test,y)
dif_perf



{'Precision': 0.2,
 'Recall': 0.2,
 'f1 score': 0.20000000000000004,
 'Accuracy': 0.875968992248062,
 'Balanced Accuracy': 0.5663865546218487,
 'Average Precision': 0.102015503875969,
 'ROC AUC Score': 0.5663865546218487}

Testing on X

In [8]:
dif_perf,y_pred=performance(ae,X,y)
dif_perf



{'Precision': 0.1111111111111111,
 'Recall': 0.1,
 'f1 score': 0.10526315789473685,
 'Accuracy': 0.8682170542635659,
 'Balanced Accuracy': 0.5163865546218488,
 'Average Precision': 0.08087855297157623,
 'ROC AUC Score': 0.5163865546218488}

## SUOD