### Importing essential modules

In [1]:
import os
import sys
import numpy as np
import pandas as pd
from time import time 
from sklearn.model_selection import train_test_split
from scipy.io import loadmat

### Importing pyod methods

In [2]:
from pyod.models.pca import PCA
from pyod.models.mcd import MCD
from pyod.models.ocsvm import OCSVM
from pyod.models.lof import LOF
from pyod.models.cblof import CBLOF
from pyod.models.knn import KNN
from pyod.models.hbos import HBOS
from pyod.models.abod import ABOD
from pyod.models.iforest import IForest
from pyod.models.feature_bagging import FeatureBagging

from warnings import filterwarnings
filterwarnings('ignore')

### Importing metrics

In [3]:
from pyod.utils.utility import standardizer
from pyod.utils.utility import precision_n_scores
from sklearn.metrics import roc_auc_score

### Assigning all the datasets to a list

In [4]:
mat_file_list = ['arrhythmia.mat','cardio.mat',
                 'glass.mat','ionosphere.mat',
                 'letter.mat','lympho.mat',
                 'mnist.mat','musk.mat',
                 'optdigits.mat','pendigits.mat',
                 'pima.mat','satellite.mat',
                 'satimage-2.mat','shuttle.mat',
                 'vertebral.mat','vowels.mat','wbc.mat']

In [6]:
mat_file_list

['arrhythmia.mat',
 'cardio.mat',
 'glass.mat',
 'ionosphere.mat',
 'letter.mat',
 'lympho.mat',
 'mnist.mat',
 'musk.mat',
 'optdigits.mat',
 'pendigits.mat',
 'pima.mat',
 'satellite.mat',
 'satimage-2.mat',
 'shuttle.mat',
 'vertebral.mat',
 'vowels.mat',
 'wbc.mat']

In [7]:
len(mat_file_list)

17

### Making a list of the column name to record the result

In [9]:
df_columns=['Data','#Sample','#Dimensions','Outlier Perc','PCA','MCD','OCSVM','LOF','CBLOF','KNN','HBOS','ABOD','IFOREST',
            'FEATUREBAGGING']

### 1. ROC Dataframe

In [10]:
roc_df=pd.DataFrame(columns=df_columns)
roc_df

Unnamed: 0,Data,#Sample,#Dimensions,Outlier Perc,PCA,MCD,OCSVM,LOF,CBLOF,KNN,HBOS,ABOD,IFOREST,FEATUREBAGGING


### 2. Precison Dataframe

In [11]:
prn_df=pd.DataFrame(columns=df_columns)
prn_df

Unnamed: 0,Data,#Sample,#Dimensions,Outlier Perc,PCA,MCD,OCSVM,LOF,CBLOF,KNN,HBOS,ABOD,IFOREST,FEATUREBAGGING


### 3. Execution Time Dataframe

In [12]:
time_df=pd.DataFrame(columns=df_columns)
time_df

Unnamed: 0,Data,#Sample,#Dimensions,Outlier Perc,PCA,MCD,OCSVM,LOF,CBLOF,KNN,HBOS,ABOD,IFOREST,FEATUREBAGGING


### Load one Dataframe to check values of X and y

In [14]:
data_1 = loadmat("Anamoly_detec_data/vowels.mat") 
data_1

{'__header__': b'MATLAB 5.0 MAT-file, written by Octave 3.8.0, 2014-12-26 08:42:13 UTC',
 '__version__': '1.0',
 '__globals__': [],
 'X': array([[ 0.58046914, -0.90253404,  0.61789919, ...,  1.60463715,
         -0.6230598 , -0.38312549],
        [ 0.78437493, -1.07736635,  0.6157809 , ...,  1.26023551,
         -0.42333934, -0.2877912 ],
        [ 0.79129238, -1.08624216,  0.66977272, ...,  1.08179729,
         -0.26720104, -0.17220348],
        ...,
        [ 0.9470763 ,  0.35810832,  0.27472497, ..., -1.08832841,
          0.3271257 ,  1.69283401],
        [ 1.58485142,  0.69359118, -0.37568588, ..., -3.07682047,
         -0.24109405,  1.94433536],
        [ 2.32735022,  0.38281412,  0.77590669, ..., -0.48257003,
         -0.59043614, -0.72199018]]),
 'y': array([[0.],
        [0.],
        [0.],
        ...,
        [1.],
        [1.],
        [1.]])}

### Exploring Mat files and finding best Algorithm to detect Anomaly

In [15]:
# Creating random state
random_state = np.random.RandomState(42)

# Processing mat files one by one : 
for mat_file in mat_file_list:
    print("\n... Processing", mat_file, '...')
    mat = loadmat(os.path.join('Anamoly_detec_data', mat_file))

    X = mat['X']
    y = mat['y'].ravel()  #ravel() function converts 2D to 1D
    
    # Counting Outlier :
    
    # Counts the number of non-zero values in the array y and divide by length of y : It gives outlier in fraction
    outliers_fraction = np.count_nonzero(y) / len(y)
    
    # Calculating Outlier percentage
    outliers_percentage = round(outliers_fraction * 100, ndigits=4)

    # Construct containers for saving results
    roc_list = [mat_file[:-4], X.shape[0], X.shape[1], outliers_percentage]
    prn_list = [mat_file[:-4], X.shape[0], X.shape[1], outliers_percentage]
    time_list = [mat_file[:-4], X.shape[0], X.shape[1], outliers_percentage]

    # Spliting Data into : 60% data for training and 40% for testing
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=random_state)
    
    # standardizing data for processing
    X_train_norm, X_test_norm = standardizer(X_train, X_test)

    # Applying all the algorithms and storing thier result in a dictionary format:
    classifiers = {'Angle-based Outlier Detector (ABOD)': ABOD(contamination=outliers_fraction),
                   
                   'Cluster-based Local Outlier Factor': CBLOF(contamination=outliers_fraction, check_estimator=False,
                                                               random_state=random_state),
                   
                   'Feature Bagging': FeatureBagging(contamination=outliers_fraction, random_state=random_state),
                   
                   'Histogram-base Outlier Detection (HBOS)': HBOS(contamination=outliers_fraction),
                   
                   'Isolation Forest': IForest(contamination=outliers_fraction, random_state=random_state),
                   
                   'K Nearest Neighbors (KNN)': KNN(contamination=outliers_fraction),
                   
                   'Local Outlier Factor (LOF)': LOF(contamination=outliers_fraction),
                   
                   'Minimum Covariance Determinant (MCD)': MCD(contamination=outliers_fraction, random_state=random_state),
                   
                   'One-class SVM (OCSVM)': OCSVM(contamination=outliers_fraction),
                   
                   'Principal Component Analysis (PCA)': PCA(contamination=outliers_fraction, random_state=random_state),
   }


    # Calculating Time taken for each algorithm
    for clf_name, clf in classifiers.items():
        # Initialize the start time 
        t0 = time() 
        
        # Fit( Train )the data
        clf.fit(X_train_norm) 
        
        # Predicting Value on Xtest
        test_scores = clf.decision_function(X_test_norm)  
        
        # Final Time
        t1 = time()   
        
        # Total time duration : t1 - t0
        duration = round(t1 - t0, ndigits=4) 
        
        # Append duration in time list
        time_list.append(duration)

        #Calculating roc and precision value of the algorithm
        roc = round(roc_auc_score(y_test, test_scores), ndigits=4)
        prn = round(precision_n_scores(y_test, test_scores), ndigits=4)

        # Print the roc , precision and executing time 
        print('{clf_name} ROC:{roc}, precision @ rank n:{prn}, '
              'execution time: {duration}s'.format(clf_name=clf_name, roc=roc, prn=prn, duration=duration))

        # Append roc and precision value to their respective list
        roc_list.append(roc)
        prn_list.append(prn)

    temp_df = pd.DataFrame(time_list).transpose()
    temp_df.columns = df_columns
    time_df = pd.concat([time_df, temp_df], axis=0)

    temp_df = pd.DataFrame(roc_list).transpose()
    temp_df.columns = df_columns
    roc_df = pd.concat([roc_df, temp_df], axis=0)
    
    temp_df = pd.DataFrame(prn_list).transpose()
    temp_df.columns = df_columns
    prn_df = pd.concat([prn_df, temp_df], axis=0)


... Processing arrhythmia.mat ...
Angle-based Outlier Detector (ABOD) ROC:0.7687, precision @ rank n:0.3571, execution time: 4.2453s
Cluster-based Local Outlier Factor ROC:0.7789, precision @ rank n:0.4643, execution time: 4.2143s
Feature Bagging ROC:0.7796, precision @ rank n:0.4643, execution time: 0.1465s
Histogram-base Outlier Detection (HBOS) ROC:0.8511, precision @ rank n:0.5714, execution time: 3.5335s
Isolation Forest ROC:0.8595, precision @ rank n:0.5714, execution time: 1.0284s
K Nearest Neighbors (KNN) ROC:0.782, precision @ rank n:0.5, execution time: 0.081s
Local Outlier Factor (LOF) ROC:0.7787, precision @ rank n:0.4643, execution time: 0.0117s
Minimum Covariance Determinant (MCD) ROC:0.8228, precision @ rank n:0.4286, execution time: 4.4417s
One-class SVM (OCSVM) ROC:0.7986, precision @ rank n:0.5, execution time: 0.0503s
Principal Component Analysis (PCA) ROC:0.7997, precision @ rank n:0.5, execution time: 0.1856s

... Processing cardio.mat ...
Angle-based Outlier Dete

One-class SVM (OCSVM) ROC:0.5171, precision @ rank n:0.0, execution time: 3.2851s
Principal Component Analysis (PCA) ROC:0.526, precision @ rank n:0.0, execution time: 0.0962s

... Processing pendigits.mat ...
Angle-based Outlier Detector (ABOD) ROC:0.667, precision @ rank n:0.0526, execution time: 3.4535s
Cluster-based Local Outlier Factor ROC:0.8082, precision @ rank n:0.1579, execution time: 0.4225s
Feature Bagging ROC:0.4889, precision @ rank n:0.0526, execution time: 7.4186s
Histogram-base Outlier Detection (HBOS) ROC:0.9348, precision @ rank n:0.2632, execution time: 0.0184s
Isolation Forest ROC:0.9414, precision @ rank n:0.2807, execution time: 1.3728s
K Nearest Neighbors (KNN) ROC:0.7371, precision @ rank n:0.0702, execution time: 1.6941s
Local Outlier Factor (LOF) ROC:0.4965, precision @ rank n:0.0702, execution time: 1.2592s
Minimum Covariance Determinant (MCD) ROC:0.8204, precision @ rank n:0.0877, execution time: 6.4029s
One-class SVM (OCSVM) ROC:0.9235, precision @ rank n:

### ROC Dataframe

In [16]:
roc_df

Unnamed: 0,Data,#Sample,#Dimensions,Outlier Perc,PCA,MCD,OCSVM,LOF,CBLOF,KNN,HBOS,ABOD,IFOREST,FEATUREBAGGING
0,arrhythmia,452,274,14.6018,0.7687,0.7789,0.7796,0.8511,0.8595,0.782,0.7787,0.8228,0.7986,0.7997
0,cardio,1831,21,9.6122,0.5892,0.8845,0.6385,0.8373,0.9527,0.734,0.588,0.8534,0.9478,0.9616
0,glass,214,9,4.2056,0.6951,0.811,0.7073,0.7073,0.7134,0.8384,0.7043,0.8293,0.6585,0.686
0,ionosphere,351,33,35.8974,0.9181,0.9176,0.9303,0.6052,0.8516,0.932,0.9227,0.9669,0.8257,0.7941
0,letter,1600,32,6.25,0.8783,0.7783,0.8947,0.6063,0.6178,0.8573,0.8765,0.8061,0.5927,0.5216
0,lympho,148,18,4.0541,0.9831,1.0,1.0,1.0,1.0,1.0,1.0,0.9153,1.0,1.0
0,mnist,7603,100,9.2069,0.7628,0.8389,0.7157,0.5766,0.7804,0.8498,0.7195,0.8713,0.854,0.8534
0,musk,3062,166,3.1679,0.2161,1.0,0.473,0.9999,1.0,0.8009,0.4629,1.0,1.0,1.0
0,optdigits,5216,64,2.8758,0.4894,0.7901,0.5062,0.8774,0.6682,0.406,0.5277,0.3822,0.5171,0.526
0,pendigits,6870,16,2.2707,0.667,0.8082,0.4889,0.9348,0.9414,0.7371,0.4965,0.8204,0.9235,0.9309


### Precision Dataframe

In [17]:
prn_df

Unnamed: 0,Data,#Sample,#Dimensions,Outlier Perc,PCA,MCD,OCSVM,LOF,CBLOF,KNN,HBOS,ABOD,IFOREST,FEATUREBAGGING
0,arrhythmia,452,274,14.6018,0.3571,0.4643,0.4643,0.5714,0.5714,0.5,0.4643,0.4286,0.5,0.5
0,cardio,1831,21,9.6122,0.1918,0.4932,0.1781,0.4521,0.6027,0.3562,0.1507,0.411,0.5342,0.6849
0,glass,214,9,4.2056,0.25,0.25,0.25,0.0,0.25,0.25,0.25,0.0,0.25,0.25
0,ionosphere,351,33,35.8974,0.8431,0.8039,0.8039,0.3922,0.6078,0.8824,0.7843,0.8627,0.6863,0.5686
0,letter,1600,32,6.25,0.4375,0.1875,0.4062,0.0938,0.0625,0.3125,0.3438,0.1875,0.125,0.125
0,lympho,148,18,4.0541,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0
0,mnist,7603,100,9.2069,0.3367,0.3912,0.3741,0.1361,0.2823,0.432,0.3673,0.2653,0.3946,0.3878
0,musk,3062,166,3.1679,0.1,1.0,0.125,0.975,1.0,0.175,0.125,1.0,1.0,1.0
0,optdigits,5216,64,2.8758,0.0152,0.0,0.0303,0.2121,0.0,0.0,0.0303,0.0,0.0,0.0
0,pendigits,6870,16,2.2707,0.0526,0.1579,0.0526,0.2632,0.2807,0.0702,0.0702,0.0877,0.3158,0.3158


### Execution Time Dataframe

In [18]:
time_df

Unnamed: 0,Data,#Sample,#Dimensions,Outlier Perc,PCA,MCD,OCSVM,LOF,CBLOF,KNN,HBOS,ABOD,IFOREST,FEATUREBAGGING
0,arrhythmia,452,274,14.6018,4.2453,4.2143,0.1465,3.5335,1.0284,0.081,0.0117,4.4417,0.0503,0.1856
0,cardio,1831,21,9.6122,0.7528,0.221,1.1146,0.0156,0.8345,0.2517,0.1004,2.3495,0.3516,0.011
0,glass,214,9,4.2056,0.0747,0.0715,0.0855,0.0094,0.5951,0.024,0.0102,0.1085,0.006,0.0052
0,ionosphere,351,33,35.8974,0.1271,0.0869,0.0811,0.027,0.6243,0.0268,0.0081,0.3998,0.008,0.008
0,letter,1600,32,6.25,0.6851,0.2427,0.7913,0.02,0.8316,0.1913,0.0771,7.2884,0.2903,0.0161
0,lympho,148,18,4.0541,0.0505,0.0707,0.067,0.008,0.6054,0.0146,0.004,0.2116,0.004,0.0072
0,mnist,7603,100,9.2069,8.1873,1.1667,16.5414,0.1164,3.3854,5.4412,1.6617,14.4434,8.131,0.2716
0,musk,3062,166,3.1679,2.6254,0.4081,2.7648,0.1219,2.1817,1.5,0.2683,40.1254,2.0442,0.3624
0,optdigits,5216,64,2.8758,4.5801,0.6958,8.1719,0.0758,1.9608,1.8139,0.7355,7.2164,3.2851,0.0962
0,pendigits,6870,16,2.2707,3.4535,0.4225,7.4186,0.0184,1.3728,1.6941,1.2592,6.4029,5.4625,0.0183
