# Test AD Models 

Test other Anomaly Detection models to compare to EIF+. 

Potential models to test: 

- Deep Isolation Forest (DIF) 
- INNE: Isolation Based AD using nearest neighbours ensemble
- SUOD: This is another ensemble model like EIF 
- Auto Encoder 

In [1]:
import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
from pyod.models.dif import DIF
from pyod.models.inne import INNE
from pyod.models.auto_encoder import AutoEncoder
from pyod.models.suod import SUOD

from glob import glob

sys.path.append('../src')
from src.performance_report_functions import *
from src.utils import *

sys.path.append('../models')
from models.forests import *

2024-02-14 17:51:46.611671: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Load Data

In [2]:
path=os.getcwd()
path = os.path.dirname(path)
path_real = os.path.join(path, "data", "real")
path_syn = os.path.join(path, "data", "syn")
mat_files_real = glob(os.path.join(path_real, "*.mat"))
mat_file_names_real = {os.path.basename(x).split(".")[0]: x for x in mat_files_real}
mat_files_syn = glob(os.path.join(path_syn, "*.pkl"))
mat_file_names_syn = {os.path.basename(x).split(".")[0]: x for x in mat_files_syn}
csv_files_real = glob(os.path.join(path_real, "*.csv"))
csv_file_names_real = {os.path.basename(x).split(".")[0]: x for x in csv_files_real}
dataset_names = list(mat_file_names_real.keys()) + list(mat_file_names_syn) + list(csv_file_names_real.keys())
mat_file_names_real.update(mat_file_names_syn)
mat_file_names_real.update(csv_file_names_real)
dataset_paths = mat_file_names_real.copy()

In [13]:
dataset_names=['wine']
paths=[dataset_paths[name] for name in dataset_names]
X_train,X_test,X,y=load_preprocess('StandardScaler','wine',dataset_paths['wine'])

Loading wine dataset from /home/davidefrizzo/Desktop/PHD/ExIFFI/data/real/wine.mat
wine 

[number of samples = 129]
[percentage outliers = 0.07751937984496124]
[number features = 13]
[number outliers = 10]


In [4]:
X_train,X_test,X,y=load_preprocess('StandardScaler','wine',dataset_paths['wine'])
X_test.shape

Loading wine dataset from /home/davidefrizzo/Desktop/PHD/ExIFFI/data/real/wine.mat
wine 

[number of samples = 129]
[percentage outliers = 0.07751937984496124]
[number features = 13]
[number outliers = 10]


(129, 13)

# Experiments Scenario

- Scenario I → Train and test on the entire dataset → (X_test,X_test)
- Scenario II → Train on the inliers and test on the entire dataset → (X_train,X_test)

## EIF

### Scenario I, contamination=0.1

In [14]:
EIF=ExtendedIsolationForest(n_estimators=300,plus=0)
df_perf=collect_performance_df(dataset_names,paths,10,split=False,scaler='StandardScaler',model=EIF)
df_perf

Loading wine dataset from /home/davidefrizzo/Desktop/PHD/ExIFFI/data/real/wine.mat
wine 

[number of samples = 129]
[percentage outliers = 0.07751937984496124]
[number features = 13]
[number outliers = 10]


Computing metrics: 100%|██████████| 10/10 [00:04<00:00,  2.29it/s]


Unnamed: 0,Dataset,Precision,Recall,f1 score,Accuracy,Balanced Accuracy,Average Precision,ROC AUC Score
0,wine,0.366667,0.44,0.4,0.897674,0.688067,0.206744,0.688067


### Scenario I,contamination=p

In [15]:
EIF=ExtendedIsolationForest(n_estimators=300,plus=0,contamination=np.sum(y)/len(y))
df_perf=collect_performance_df(dataset_names,paths,10,split=False,scaler='StandardScaler',model=EIF)
df_perf

Loading wine dataset from /home/davidefrizzo/Desktop/PHD/ExIFFI/data/real/wine.mat
wine 

[number of samples = 129]
[percentage outliers = 0.07751937984496124]
[number features = 13]
[number outliers = 10]


Computing metrics: 100%|██████████| 10/10 [00:03<00:00,  2.62it/s]


Unnamed: 0,Dataset,Precision,Recall,f1 score,Accuracy,Balanced Accuracy,Average Precision,ROC AUC Score
0,wine,0.34,0.34,0.34,0.897674,0.642269,0.169163,0.642269


### Scenario II, contamination=0.1

In [4]:
EIF=ExtendedIsolationForest(n_estimators=300,plus=0)
df_perf=collect_performance_df(dataset_names,paths,10,scaler='StandardScaler',model=EIF)
df_perf

Loading wine dataset from /home/davidefrizzo/Desktop/PHD/ExIFFI/data/real/wine.mat
wine 

[number of samples = 129]
[percentage outliers = 0.07751937984496124]
[number features = 13]
[number outliers = 10]


Computing metrics: 100%|██████████| 10/10 [00:03<00:00,  3.24it/s]


Unnamed: 0,Dataset,Precision,Recall,f1 score,Accuracy,Balanced Accuracy,Average Precision,ROC AUC Score
0,wine,0.5,0.6,0.545455,0.922481,0.77479,0.336008,0.77479


### Scenario II contamination=sum(y)/len(y)

In [10]:
EIF=ExtendedIsolationForest(n_estimators=300,plus=0,contamination=np.sum(y)/len(y))
df_perf=collect_performance_df(dataset_names,paths,10,scaler='StandardScaler',model=EIF)
df_perf

Loading wine dataset from /home/davidefrizzo/Desktop/PHD/ExIFFI/data/real/wine.mat
wine 

[number of samples = 129]
[percentage outliers = 0.07751937984496124]
[number features = 13]
[number outliers = 10]


Computing metrics: 100%|██████████| 10/10 [00:02<00:00,  3.74it/s]


Unnamed: 0,Dataset,Precision,Recall,f1 score,Accuracy,Balanced Accuracy,Average Precision,ROC AUC Score
0,wine,0.54,0.54,0.54,0.928682,0.750672,0.329659,0.750672


## EIF+

## `wine`

### Scenario I, contamination=0.1

In [16]:
EIF=ExtendedIsolationForest(n_estimators=300,plus=1)
df_perf=collect_performance_df(dataset_names,paths,10,split=False,scaler='StandardScaler',model=EIF)
df_perf

Loading wine dataset from /home/davidefrizzo/Desktop/PHD/ExIFFI/data/real/wine.mat
wine 

[number of samples = 129]
[percentage outliers = 0.07751937984496124]
[number features = 13]
[number outliers = 10]


Computing metrics: 100%|██████████| 10/10 [00:03<00:00,  2.71it/s]


Unnamed: 0,Dataset,Precision,Recall,f1 score,Accuracy,Balanced Accuracy,Average Precision,ROC AUC Score
0,wine,0.291667,0.35,0.318182,0.883721,0.639286,0.161221,0.639286


### Scenario I, contamination=p

In [17]:
EIF=ExtendedIsolationForest(n_estimators=300,plus=1,contamination=np.sum(y)/len(y))
df_perf=collect_performance_df(dataset_names,paths,10,split=False,scaler='StandardScaler',model=EIF)
df_perf

Loading wine dataset from /home/davidefrizzo/Desktop/PHD/ExIFFI/data/real/wine.mat
wine 

[number of samples = 129]
[percentage outliers = 0.07751937984496124]
[number features = 13]
[number outliers = 10]


Computing metrics: 100%|██████████| 10/10 [00:03<00:00,  2.50it/s]


Unnamed: 0,Dataset,Precision,Recall,f1 score,Accuracy,Balanced Accuracy,Average Precision,ROC AUC Score
0,wine,0.23,0.23,0.23,0.88062,0.582647,0.11869,0.582647


### Scenario II, contamination=0.1 

In [5]:
EIF=ExtendedIsolationForest(n_estimators=300,plus=1)
df_perf=collect_performance_df(dataset_names,paths,10,scaler='StandardScaler',model=EIF)
df_perf

Loading wine dataset from /home/davidefrizzo/Desktop/PHD/ExIFFI/data/real/wine.mat
wine 

[number of samples = 129]
[percentage outliers = 0.07751937984496124]
[number features = 13]
[number outliers = 10]


Computing metrics: 100%|██████████| 10/10 [00:02<00:00,  3.56it/s]


Unnamed: 0,Dataset,Precision,Recall,f1 score,Accuracy,Balanced Accuracy,Average Precision,ROC AUC Score
0,wine,0.591667,0.71,0.645455,0.939535,0.834412,0.444981,0.834412


### Scenario II, contamnation=sum(y)/len(y)

In [11]:
EIF=ExtendedIsolationForest(n_estimators=300,plus=1,contamination=np.sum(y)/len(y))
df_perf=collect_performance_df(dataset_names,paths,10,scaler='StandardScaler',model=EIF)
df_perf

Loading wine dataset from /home/davidefrizzo/Desktop/PHD/ExIFFI/data/real/wine.mat
wine 

[number of samples = 129]
[percentage outliers = 0.07751937984496124]
[number features = 13]
[number outliers = 10]


Computing metrics: 100%|██████████| 10/10 [00:02<00:00,  3.39it/s]


Unnamed: 0,Dataset,Precision,Recall,f1 score,Accuracy,Balanced Accuracy,Average Precision,ROC AUC Score
0,wine,0.67,0.67,0.67,0.948837,0.821134,0.476581,0.821134


## `shuttle`

In [3]:
dataset_names=['shuttle']
paths=[dataset_paths[name] for name in dataset_names]

### EIF

#### Scenario I, contamination=0.1

In [20]:
EIF=ExtendedIsolationForest(n_estimators=300,plus=0)
df_perf=collect_performance_df(dataset_names,paths,1,split=False,scaler='StandardScaler',model=EIF)
df_perf

Loading shuttle dataset from /home/davidefrizzo/Desktop/PHD/ExIFFI/data/real/shuttle.mat
shuttle 

[number of samples = 49097]
[percentage outliers = 0.0715114976475141]
[number features = 9]
[number outliers = 3511]


Computing metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Computing metrics: 100%|██████████| 1/1 [00:09<00:00,  9.37s/it]


Unnamed: 0,Dataset,Precision,Recall,f1 score,Accuracy,Balanced Accuracy,Average Precision,ROC AUC Score
0,shuttle,0.703402,0.98348,0.82019,0.969163,0.97577,0.692963,0.97577


#### Scenario II, contamination=0.1

In [21]:
EIF=ExtendedIsolationForest(n_estimators=300,plus=0)
df_perf=collect_performance_df(dataset_names,paths,1,scaler='StandardScaler',model=EIF)
df_perf

Loading shuttle dataset from /home/davidefrizzo/Desktop/PHD/ExIFFI/data/real/shuttle.mat
shuttle 

[number of samples = 49097]
[percentage outliers = 0.0715114976475141]
[number features = 9]
[number outliers = 3511]


Computing metrics: 100%|██████████| 1/1 [00:08<00:00,  8.88s/it]


Unnamed: 0,Dataset,Precision,Recall,f1 score,Accuracy,Balanced Accuracy,Average Precision,ROC AUC Score
0,shuttle,0.702994,0.982911,0.819715,0.969082,0.975464,0.692203,0.975464


### EIF+

#### Scenario I, contamination=0.1

In [22]:
EIF=ExtendedIsolationForest(n_estimators=300,plus=1)
df_perf=collect_performance_df(dataset_names,paths,5,split=False,scaler='StandardScaler',model=EIF)
df_perf

Loading shuttle dataset from /home/davidefrizzo/Desktop/PHD/ExIFFI/data/real/shuttle.mat
shuttle 

[number of samples = 49097]
[percentage outliers = 0.0715114976475141]
[number features = 9]
[number outliers = 3511]


Computing metrics: 100%|██████████| 5/5 [00:46<00:00,  9.36s/it]


Unnamed: 0,Dataset,Precision,Recall,f1 score,Accuracy,Balanced Accuracy,Average Precision,ROC AUC Score
0,shuttle,0.707802,0.989633,0.825321,0.970043,0.979083,0.701205,0.979083


#### Scenario I, contamination=0.1 + Downsample

In [4]:
EIF=ExtendedIsolationForest(n_estimators=300,plus=1)
df_perf=collect_performance_df(dataset_names,paths,5,split=False,scaler='StandardScaler',model=EIF,use_downsample=True)
df_perf

Loading shuttle dataset from /home/davidefrizzo/Desktop/PHD/ExIFFI/data/real/shuttle.mat
shuttle 

[number of samples = 49097]
[percentage outliers = 0.0715114976475141]
[number features = 9]
[number outliers = 3511]
downsampled to 2500
(2500, 9)


Computing metrics: 100%|██████████| 5/5 [00:04<00:00,  1.12it/s]


Unnamed: 0,Dataset,Precision,Recall,f1 score,Accuracy,Balanced Accuracy,Average Precision,ROC AUC Score
0,shuttle,0.7152,0.998883,0.833566,0.97144,0.984103,0.714484,0.984103


#### Scenario II, contamination=0.1 

In [5]:
EIF=ExtendedIsolationForest(n_estimators=300,plus=1)
df_perf=collect_performance_df(dataset_names,paths,5,scaler='StandardScaler',model=EIF)
df_perf

Loading shuttle dataset from /home/davidefrizzo/Desktop/PHD/ExIFFI/data/real/shuttle.mat
shuttle 

[number of samples = 49097]
[percentage outliers = 0.0715114976475141]
[number features = 9]
[number outliers = 3511]


Computing metrics: 100%|██████████| 5/5 [00:45<00:00,  9.10s/it]


Unnamed: 0,Dataset,Precision,Recall,f1 score,Accuracy,Balanced Accuracy,Average Precision,ROC AUC Score
0,shuttle,0.709053,0.991341,0.826765,0.970291,0.980006,0.703533,0.980006


#### Scenario II, contamination=0.1 + Downsample

In [6]:
EIF=ExtendedIsolationForest(n_estimators=300,plus=1)
df_perf=collect_performance_df(dataset_names,paths,5,scaler='StandardScaler',model=EIF,use_downsample=True)
df_perf

Loading shuttle dataset from /home/davidefrizzo/Desktop/PHD/ExIFFI/data/real/shuttle.mat
shuttle 

[number of samples = 49097]
[percentage outliers = 0.0715114976475141]
[number features = 9]
[number outliers = 3511]
downsampled to 2500
(2500, 9)


Computing metrics: 100%|██████████| 5/5 [00:04<00:00,  1.15it/s]


Unnamed: 0,Dataset,Precision,Recall,f1 score,Accuracy,Balanced Accuracy,Average Precision,ROC AUC Score
0,shuttle,0.716,1.0,0.834499,0.9716,0.984705,0.716,0.984705


## Deep Isolation Forest (DIF)

Let's start with all the default parameters except for the contamination parameter that will be set at the real contamination of the dataset

### N.B 

Looking at the source code of the DIF I discovered that in the `fit` method the `MinMaxScaler` is used to scale the data so it does not make sense to scale the data before fitting the model.

In [5]:
p=sum(y)/len(y)
p

0.07751937984496124

### Scenario II, contamination=0.1 + Normalization

In [9]:
DF=DIF(contamination=0.1)
df_perf=collect_performance_df(dataset_names,paths,10,model=DF)
df_perf

Loading wine dataset from /home/davidefrizzo/Desktop/PHD/ExIFFI/data/real/wine.mat
wine 

[number of samples = 129]
[percentage outliers = 0.07751937984496124]
[number features = 13]
[number outliers = 10]


Computing metrics: 100%|██████████| 10/10 [00:30<00:00,  3.04s/it]


Unnamed: 0,Dataset,Precision,Recall,f1 score,Accuracy,Balanced Accuracy,Average Precision,ROC AUC Score
0,wine,0.1,0.01,0.018182,0.923256,0.505,0.086744,0.505


### Scenario II, contamination=0.1 + No Normalization

In [6]:
DF=DIF(contamination=0.1)
df_perf=collect_performance_df(dataset_names,paths,10,model=DF,use_scaler=False)
df_perf

Loading wine dataset from /home/davidefrizzo/Desktop/PHD/ExIFFI/data/real/wine.mat
wine 

[number of samples = 129]
[percentage outliers = 0.07751937984496124]
[number features = 13]
[number outliers = 10]


Computing metrics: 100%|██████████| 10/10 [00:17<00:00,  1.71s/it]


Unnamed: 0,Dataset,Precision,Recall,f1 score,Accuracy,Balanced Accuracy,Average Precision,ROC AUC Score
0,wine,0.0,0.0,0.0,0.922481,0.5,0.077519,0.5


### Scenario II, contamination=sum(y)/len(y)

In [7]:
DF=DIF(contamination=p)
df_perf=collect_performance_df(dataset_names,paths,10,model=DF,use_scaler=False)
df_perf

Loading wine dataset from /home/davidefrizzo/Desktop/PHD/ExIFFI/data/real/wine.mat
wine 

[number of samples = 129]
[percentage outliers = 0.07751937984496124]
[number features = 13]
[number outliers = 10]


Computing metrics: 100%|██████████| 10/10 [00:08<00:00,  1.14it/s]


Unnamed: 0,Dataset,Precision,Recall,f1 score,Accuracy,Balanced Accuracy,Average Precision,ROC AUC Score
0,wine,0.0,0.0,0.0,0.922481,0.5,0.077519,0.5


## AutoEncoder

In the AutoEncoder model we have to reduce the dimensionality of the dataset in the hidden layer, so we have to pass the dimensions of each layer in the AutoEncoder in the `hidden_neurons` parameter.

### N.B

Also here by default the data are standardized with `StandardScaler` so there is no need to scale them before fitting the model.

In [None]:
p=sum(y)/len(y)
ae=AutoEncoder(contamination=p,hidden_neurons=[X_train.shape[1],7,7,X_train.shape[1]])
ae.fit(X_train)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 13)                182       
                                                                 
 dropout (Dropout)           (None, 13)                0         
                                                                 
 dense_1 (Dense)             (None, 13)                182       
                                                                 
 dropout_1 (Dropout)         (None, 13)                0         
                                                                 
 dense_2 (Dense)             (None, 13)                182       
                                                                 
 dropout_2 (Dropout)         (None, 13)                0         
                                                                 
 dense_3 (Dense)             (None, 7)                 9

2024-02-09 15:57:08.377740: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.


None
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epo

AutoEncoder(batch_size=32, contamination=0.07751937984496124,
      dropout_rate=0.2, epochs=100, hidden_activation='relu',
      hidden_neurons=[13, 7, 7, 13], l2_regularizer=0.1,
      loss=<function mean_squared_error at 0x7f3a1f5002c0>,
      optimizer='adam', output_activation='sigmoid', preprocessing=True,
      random_state=None, validation_size=0.1, verbose=1)

### Scenario II, contamination=0.1 + No Normalization  

In [10]:
ae=AutoEncoder(contamination=0.1,hidden_neurons=[X_train.shape[1],7,7,X_train.shape[1]])
df_perf=collect_performance_df(dataset_names,paths,10,scaler='StandardScaler',model=ae,use_scaler=False)
df_perf

Loading wine dataset from /home/davidefrizzo/Desktop/PHD/ExIFFI/data/real/wine.mat
wine 

[number of samples = 129]
[percentage outliers = 0.07751937984496124]
[number features = 13]
[number outliers = 10]


Computing metrics:   0%|          | 0/10 [00:00<?, ?it/s]2024-02-14 17:32:44.065070: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 13)                182       
                                                                 
 dropout (Dropout)           (None, 13)                0         
                                                                 
 dense_1 (Dense)             (None, 13)                182       
                                                                 
 dropout_1 (Dropout)         (None, 13)                0         
                                                                 
 dense_2 (Dense)             (None, 13)                182       
                                                                 
 dropout_2 (Dropout)         (None, 13)                0         
                                                                 
 dense_3 (Dense)             (None, 7)                 9

Computing metrics:  10%|█         | 1/10 [00:05<00:52,  5.81s/it]

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_7 (Dense)             (None, 13)                182       
                                                                 
 dropout_6 (Dropout)         (None, 13)                0         
                                                                 
 dense_8 (Dense)             (None, 13)                182       
                                                                 
 dropout_7 (Dropout)         (None, 13)                0         
                                                                 
 dense_9 (Dense)             (None, 13)                182       
                                                                 
 dropout_8 (Dropout)         (None, 13)                0         
                                                                 
 dense_10 (Dense)            (None, 7)                

Computing metrics:  20%|██        | 2/10 [00:09<00:38,  4.80s/it]

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_14 (Dense)            (None, 13)                182       
                                                                 
 dropout_12 (Dropout)        (None, 13)                0         
                                                                 
 dense_15 (Dense)            (None, 13)                182       
                                                                 
 dropout_13 (Dropout)        (None, 13)                0         
                                                                 
 dense_16 (Dense)            (None, 13)                182       
                                                                 
 dropout_14 (Dropout)        (None, 13)                0         
                                                                 
 dense_17 (Dense)            (None, 7)                

Computing metrics:  30%|███       | 3/10 [00:14<00:33,  4.75s/it]

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_21 (Dense)            (None, 13)                182       
                                                                 
 dropout_18 (Dropout)        (None, 13)                0         
                                                                 
 dense_22 (Dense)            (None, 13)                182       
                                                                 
 dropout_19 (Dropout)        (None, 13)                0         
                                                                 
 dense_23 (Dense)            (None, 13)                182       
                                                                 
 dropout_20 (Dropout)        (None, 13)                0         
                                                                 
 dense_24 (Dense)            (None, 7)                

Computing metrics:  40%|████      | 4/10 [00:19<00:29,  4.84s/it]

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_28 (Dense)            (None, 13)                182       
                                                                 
 dropout_24 (Dropout)        (None, 13)                0         
                                                                 
 dense_29 (Dense)            (None, 13)                182       
                                                                 
 dropout_25 (Dropout)        (None, 13)                0         
                                                                 
 dense_30 (Dense)            (None, 13)                182       
                                                                 
 dropout_26 (Dropout)        (None, 13)                0         
                                                                 
 dense_31 (Dense)            (None, 7)                

Computing metrics:  50%|█████     | 5/10 [00:23<00:22,  4.55s/it]

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_35 (Dense)            (None, 13)                182       
                                                                 
 dropout_30 (Dropout)        (None, 13)                0         
                                                                 
 dense_36 (Dense)            (None, 13)                182       
                                                                 
 dropout_31 (Dropout)        (None, 13)                0         
                                                                 
 dense_37 (Dense)            (None, 13)                182       
                                                                 
 dropout_32 (Dropout)        (None, 13)                0         
                                                                 
 dense_38 (Dense)            (None, 7)                

Computing metrics:  60%|██████    | 6/10 [00:27<00:17,  4.31s/it]

Model: "sequential_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_42 (Dense)            (None, 13)                182       
                                                                 
 dropout_36 (Dropout)        (None, 13)                0         
                                                                 
 dense_43 (Dense)            (None, 13)                182       
                                                                 
 dropout_37 (Dropout)        (None, 13)                0         
                                                                 
 dense_44 (Dense)            (None, 13)                182       
                                                                 
 dropout_38 (Dropout)        (None, 13)                0         
                                                                 
 dense_45 (Dense)            (None, 7)                

Computing metrics:  70%|███████   | 7/10 [00:31<00:12,  4.21s/it]

Model: "sequential_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_49 (Dense)            (None, 13)                182       
                                                                 
 dropout_42 (Dropout)        (None, 13)                0         
                                                                 
 dense_50 (Dense)            (None, 13)                182       
                                                                 
 dropout_43 (Dropout)        (None, 13)                0         
                                                                 
 dense_51 (Dense)            (None, 13)                182       
                                                                 
 dropout_44 (Dropout)        (None, 13)                0         
                                                                 
 dense_52 (Dense)            (None, 7)                

Computing metrics:  80%|████████  | 8/10 [00:35<00:08,  4.13s/it]

Model: "sequential_8"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_56 (Dense)            (None, 13)                182       
                                                                 
 dropout_48 (Dropout)        (None, 13)                0         
                                                                 
 dense_57 (Dense)            (None, 13)                182       
                                                                 
 dropout_49 (Dropout)        (None, 13)                0         
                                                                 
 dense_58 (Dense)            (None, 13)                182       
                                                                 
 dropout_50 (Dropout)        (None, 13)                0         
                                                                 
 dense_59 (Dense)            (None, 7)                

Computing metrics:  90%|█████████ | 9/10 [00:39<00:04,  4.06s/it]

Model: "sequential_9"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_63 (Dense)            (None, 13)                182       
                                                                 
 dropout_54 (Dropout)        (None, 13)                0         
                                                                 
 dense_64 (Dense)            (None, 13)                182       
                                                                 
 dropout_55 (Dropout)        (None, 13)                0         
                                                                 
 dense_65 (Dense)            (None, 13)                182       
                                                                 
 dropout_56 (Dropout)        (None, 13)                0         
                                                                 
 dense_66 (Dense)            (None, 7)                

Computing metrics: 100%|██████████| 10/10 [00:43<00:00,  4.33s/it]


Unnamed: 0,Dataset,Precision,Recall,f1 score,Accuracy,Balanced Accuracy,Average Precision,ROC AUC Score
0,wine,0.100526,0.2,0.133793,0.799225,0.52479,0.082121,0.52479


### Scenario II, contamination=p + No Normalization

In [11]:
ae=AutoEncoder(contamination=p,hidden_neurons=[X_train.shape[1],7,7,X_train.shape[1]])
df_perf=collect_performance_df(dataset_names,paths,1,scaler='StandardScaler',model=ae,use_scaler=False)
df_perf

Loading wine dataset from /home/davidefrizzo/Desktop/PHD/ExIFFI/data/real/wine.mat
wine 

[number of samples = 129]
[percentage outliers = 0.07751937984496124]
[number features = 13]
[number outliers = 10]


Computing metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Model: "sequential_10"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_70 (Dense)            (None, 13)                182       
                                                                 
 dropout_60 (Dropout)        (None, 13)                0         
                                                                 
 dense_71 (Dense)            (None, 13)                182       
                                                                 
 dropout_61 (Dropout)        (None, 13)                0         
                                                                 
 dense_72 (Dense)            (None, 13)                182       
                                                                 
 dropout_62 (Dropout)        (None, 13)                0         
                                                                 
 dense_73 (Dense)            (None, 7)               

Computing metrics: 100%|██████████| 1/1 [00:04<00:00,  4.44s/it]


Unnamed: 0,Dataset,Precision,Recall,f1 score,Accuracy,Balanced Accuracy,Average Precision,ROC AUC Score
0,wine,0.117647,0.2,0.148148,0.821705,0.536975,0.085545,0.536975


## Test `compute_std` function

In [2]:
os.chdir('../data/real/')

In [3]:
import time

### `wine`

In [35]:
X,y=mat_dataset('wine',os.path.join(os.getcwd(),'wine.mat'))
X_train,X_test=partition_data(X,y)
X_train,X_test,X,y=pre_process('StandardScaler',X_train,X_test)
n=make_rand_vector(X.shape[1],X.shape[1])
val=X.dot(n)

wine 

[number of samples = 129]
[percentage outliers = 0.07751937984496124]
[number features = 13]
[number outliers = 10]


### `moodify`

In [39]:
X,y=csv_dataset('moodify',os.path.join(os.getcwd(),'moodify.csv'))
X_train,X_test=partition_data(X,y)
X_train,X_test,X,y=pre_process('StandardScaler',X_train,X_test)
n=make_rand_vector(X.shape[1],X.shape[1])
val=X.dot(n)

moodify 

[number of samples = 276260]
[percentage outliers = 0.1527112140736987]
[number features = 11]
[number outliers = 42188]


### Compare Execution times `compute_std` function

In [5]:
val=np.random.rand(300000)
start_np=time.time()
val_std=np.std(val)
np_time=time.time()-start_np
print(f'numpy Time: {np_time}')
print(f'std: {val_std}')

start_numba=time.time()
val_std=compute_std(val)
numba_time=time.time()-start_numba
print(f'numba Time: {numba_time}')
print(f'std: {val_std}')

time_diff=np_time-numba_time
time_perc=(numba_time/np_time)*100
print(f'Time difference: {time_diff}')
print(f'Time percentage improvement: {time_perc}')

numpy Time: 0.0020461082458496094
std: 0.2887009189267893
numba Time: 0.0009682178497314453
std: 0.2887009189267898
Time difference: 0.001077890396118164
Time percentage improvement: 47.319972034490796


### Compare Execution times `compute_std_for` function

In [6]:
start_np=time.time()
val_std=np.std(val)
np_time=time.time()-start_np
print(f'numpy Time: {np_time}')
print(f'std: {val_std}')

start_numba=time.time()
val_std=compute_std_for(val)
numba_time=time.time()-start_numba
print(f'numba Time: {numba_time}')
print(f'std: {val_std}')

time_diff=np_time-numba_time
time_perc=(numba_time/np_time)*100
print(f'Time difference: {time_diff}')

numpy Time: 0.001087188720703125
std: 0.28873840543901796
numba Time: 0.05278801918029785
std: 0.2887384054390195
Time difference: -0.05170083045959473


### Compare Execution times `compute_std_dev` function

In [5]:
val=np.random.rand(300000)
start_np=time.time()
val_std=np.var(val)
np_time=time.time()-start_np
print(f'numpy Time: {np_time}')
print(f'std: {val_std}')

start_numba=time.time()
val_std=calculate_std_dev(val)
numba_time=time.time()-start_numba
print(f'numba Time: {numba_time}')
print(f'std: {val_std}')

time_diff=np_time-numba_time
print(f'Time difference: {time_diff}')

numpy Time: 0.0006468296051025391
std: 0.08336986677546672
numba Time: 0.001039266586303711
std: 0.2887384054390195
Time difference: -0.0003924369812011719


### Compare Execution times `range` vs `std`

In [29]:
#val=np.random.rand(300000)
start_np=time.time()
val_std=np.std(val)
np_time=time.time()-start_np
print(f'std Time: {np_time}')
print(f'std: {val_std}')
print('#'*50)

start_range=time.time()
val_range=np.max(val)-np.min(val)
range_time=time.time()-start_range
print(f'range Time: {range_time}')
print(f'range: {val_range}')
print('#'*50)

time_diff=np_time-range_time
print(f'Time difference: {time_diff}')

std Time: 0.0010101795196533203
std: 1.0658973576087336
##################################################
range Time: 0.0003421306610107422
range: 19.634186249839274
##################################################
Time difference: 0.0006680488586425781


### Compare Execution times `IQR` vs `std`

In [36]:
#val=np.random.rand(300000)
start_np=time.time()
val_std=np.std(val)
np_time=time.time()-start_np
print(f'std Time: {np_time}')
print(f'std: {val_std}')
print('#'*50)

start_IQR=time.time()
val_IQR=np.percentile(val,75)-np.percentile(val,25)
IQR_time=time.time()-start_IQR
print(f'IQR Time: {IQR_time}')
print(f'IQR: {val_IQR}')
print('#'*50)

time_diff=np_time-IQR_time
print(f'Time difference: {time_diff}')

std Time: 0.0001201629638671875
std: 0.8612610241892044
##################################################
IQR Time: 0.0007569789886474609
IQR: 1.1644698014405102
##################################################
Time difference: -0.0006368160247802734


### Compare Execution times `MAD` vs `std`

In [40]:
#val=np.random.rand(300000)
start_np=time.time()
val_std=np.std(val)
np_time=time.time()-start_np
print(f'std Time: {np_time}')
print(f'std: {val_std}')
print('#'*50)

start_MAD=time.time()
val_MAD=np.mean(np.abs(val-np.mean(val)))
MAD_time=time.time()-start_MAD
print(f'MAD Time: {MAD_time}')
print(f'MAD: {val_MAD}')
print('#'*50)

time_diff=np_time-MAD_time
print(f'Time difference: {time_diff}')

std Time: 0.0011882781982421875
std: 1.2166114762203248
##################################################
MAD Time: 0.0007545948028564453
MAD: 0.8354337822996187
##################################################
Time difference: 0.0004336833953857422
