# Import Python Packages

In [70]:
import os
import sys
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from scipy.io import loadmat

# Import Pyod packages and methods

In [71]:
from pyod.models.pca import PCA
from pyod.models.mcd import MCD
from pyod.models.ocsvm import OCSVM
from pyod.models.lof import LOF
from pyod.models.cblof import CBLOF
from pyod.models.knn import KNN
from pyod.models.hbos import HBOS
from pyod.models.abod import ABOD
from pyod.models.iforest import IForest
from pyod.models.feature_bagging import FeatureBagging

# Import Metrics Packages

In [72]:
from pyod.utils.utility import standardizer
from pyod.utils.utility import precision_n_scores
from sklearn.metrics import roc_auc_score

# Define Data

In [73]:
mat_file_list = ["arrhythmia.mat",
                 "cardio.mat",
                 "glass.mat",
                 "ionosphere.mat",
                 "letter.mat",
                 "lympho.mat",
                 "mnist.mat", 
                 "musk.mat",
                 "optdigits.mat",
                 "pendigits.mat",
                 "pima.mat",
                 "satellite.mat",
                 "satimage-2.mat",
                 "shuttle.mat",
                 "vertebral.mat",
                 "vowels.mat",
                 "wbc.mat"]

# How to load MAT file

In [74]:
data = loadmat('./Dataset/cardio.mat')

In [75]:
data

{'__header__': b'MATLAB 5.0 MAT-file, written by Octave 3.8.0, 2014-12-18 10:48:09 UTC',
 '__version__': '1.0',
 '__globals__': [],
 'X': array([[ 0.00491231,  0.69319077, -0.20364049, ...,  0.23149795,
         -0.28978574, -0.49329397],
        [ 0.11072935, -0.07990259, -0.20364049, ...,  0.09356344,
         -0.25638541, -0.49329397],
        [ 0.21654639, -0.27244466, -0.20364049, ...,  0.02459619,
         -0.25638541,  1.14001753],
        ...,
        [-0.41835583, -0.91998844, -0.16463485, ..., -1.49268341,
          0.24461959, -0.49329397],
        [-0.41835583, -0.91998844, -0.15093411, ..., -1.42371616,
          0.14441859, -0.49329397],
        [-0.41835583, -0.91998844, -0.20364049, ..., -1.28578165,
          3.58465295, -0.49329397]]),
 'y': array([[0.],
        [0.],
        [0.],
        ...,
        [1.],
        [1.],
        [1.]])}

In [76]:
len(data)

5

In [77]:
data.keys()

dict_keys(['__header__', '__version__', '__globals__', 'X', 'y'])

In [78]:
data.values()

dict_values([b'MATLAB 5.0 MAT-file, written by Octave 3.8.0, 2014-12-18 10:48:09 UTC', '1.0', [], array([[ 0.00491231,  0.69319077, -0.20364049, ...,  0.23149795,
        -0.28978574, -0.49329397],
       [ 0.11072935, -0.07990259, -0.20364049, ...,  0.09356344,
        -0.25638541, -0.49329397],
       [ 0.21654639, -0.27244466, -0.20364049, ...,  0.02459619,
        -0.25638541,  1.14001753],
       ...,
       [-0.41835583, -0.91998844, -0.16463485, ..., -1.49268341,
         0.24461959, -0.49329397],
       [-0.41835583, -0.91998844, -0.15093411, ..., -1.42371616,
         0.14441859, -0.49329397],
       [-0.41835583, -0.91998844, -0.20364049, ..., -1.28578165,
         3.58465295, -0.49329397]]), array([[0.],
       [0.],
       [0.],
       ...,
       [1.],
       [1.],
       [1.]])])

# Input (Independent) Feature Shape in MAT File format

In [79]:
type(data['X']), data['X'].shape

(numpy.ndarray, (1831, 21))

# Dependent / Target / Ouput Feature Shape

In [80]:
type(data['y']), data['y'].shape

(numpy.ndarray, (1831, 1))

# Exploring all MAT files

In [81]:
from time import time
random_state = np.random.RandomState(42)

In [None]:
for mat_file in mat_file_list:
    print("\n....Processing",mat_file,'....')
    mat = loadmat(os.path.join('Dataset',mat_file))
    
    X = mat['X']
    y = mat['y']
    outliers_fraction = np.count_nonzero(y) / len(y)
    outliers_percentage = round(outliers_fraction * 100, ndigits = 4)
    
    print("Outliers Percentage:",outliers_percentage)
    
    #construct containers to save the results
    roc_list = [mat_file[:-4], X.shape[0], X.shape[1], outliers_percentage]
    prn_list = [mat_file[:-4], X.shape[0], X.shape[1], outliers_percentage]
    time_list = [mat_file[:-4], X.shape[0], X.shape[1], outliers_percentage]
    
    #training and test data split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4, random_state=random_state)
    
    #standardizing the data from processing
    X_train_norm, X_test_norm = standardizer(X_train,X_test)
    
    classifiers = {'Angle-based Outlier Detector (ABOD)': ABOD(contamination = outliers_fraction),
                  'Cluster-based Local Outlier Factor': CBLOF(contamination = outliers_fraction, check_estimator=False, random_state=random_state),
                   'Feature Bagging':FeatureBagging(contamination=outliers_fraction, random_state=random_state),
                   'Histogram-base Outlier Detection (HBOS)': HBOS(contamination=outliers_fraction),
                   'Isolation Forest': IForest(contamination=outliers_fraction,random_state=random_state),
                   'K Nearest Neighbors (KNN)': KNN(contamination=outliers_fraction),
                   'Average KNN': KNN(contamination=outliers_fraction),
                   'Minimum Covariance Determinant (MCD)': MCD(contamination=outliers_fraction, random_state=random_state),
                   'One-class SVM (OCSVM)':OCSVM(contamination=outliers_fraction),
                   'Principal Component Analysis (PCA)':PCA(contamination=outliers_fraction, random_state=random_state)
                  }
    
    for clf_name, clf in classifiers.items():
        t0 = time()
        clf.fit(X_train_norm)
        test_scores = clf.decision_function(X_test_norm)
        t1 = time()
        duration = round(t1 - t0, ndigits=4)
        time_list.append(duration)
        
        roc = round(roc_auc_score(y_test,test_scores), ndigits=4)
        prn = round(precision_n_scores(y_test,test_scores), ndigits=4)
        
        print('{clf_name} ROC:{roc}, precision @ rank n:{prn}, execution_time: {duration}s'.format(clf_name=clf_name, roc=roc, prn=prn, duration=duration))
        roc_list.append(roc)
        prn_list.append(prn)
        
        
    #temp_df = pd.DataFrame(time_list).transpose()
    #temp_df.columns = df_columns
    #time_df = pd.concat([time_df, temp_df], axis = 0)
    
    #temp_df = pd.DataFrame(toc_list).transpose()
    #temp_df.columns = df_columns
    #roc_df = pd.concat([roc_df, temp_df], axis = 0)
    
    #temp_df = pd.DataFrame(prn_list).transpose()
    #temp_df.columns = df_columns
    #prn_df = pd.concat([prn_df, temp_df], axis = 0)
    


....Processing arrhythmia.mat ....
Outliers Percentage: 14.6018
Angle-based Outlier Detector (ABOD) ROC:0.7687, precision @ rank n:0.3571, execution_time: 0.1177s
Cluster-based Local Outlier Factor ROC:0.7684, precision @ rank n:0.4643, execution_time: 0.1047s
Feature Bagging ROC:0.7799, precision @ rank n:0.5, execution_time: 0.5137s
Histogram-base Outlier Detection (HBOS) ROC:0.8511, precision @ rank n:0.5714, execution_time: 0.0467s
Isolation Forest ROC:0.8527, precision @ rank n:0.5714, execution_time: 0.2933s
K Nearest Neighbors (KNN) ROC:0.782, precision @ rank n:0.5, execution_time: 0.0708s
Average KNN ROC:0.782, precision @ rank n:0.5, execution_time: 0.0708s




Minimum Covariance Determinant (MCD) ROC:0.8228, precision @ rank n:0.4286, execution_time: 0.4398s
One-class SVM (OCSVM) ROC:0.7986, precision @ rank n:0.5, execution_time: 0.0409s
Principal Component Analysis (PCA) ROC:0.7997, precision @ rank n:0.5, execution_time: 0.0439s

....Processing cardio.mat ....
Outliers Percentage: 9.6122
Angle-based Outlier Detector (ABOD) ROC:0.5763, precision @ rank n:0.1875, execution_time: 0.3042s
Cluster-based Local Outlier Factor ROC:0.8221, precision @ rank n:0.4844, execution_time: 0.1416s
Feature Bagging ROC:0.4879, precision @ rank n:0.1406, execution_time: 0.6353s
Histogram-base Outlier Detection (HBOS) ROC:0.8453, precision @ rank n:0.4688, execution_time: 0.006s
Isolation Forest ROC:0.9414, precision @ rank n:0.5, execution_time: 0.3012s
K Nearest Neighbors (KNN) ROC:0.6959, precision @ rank n:0.2812, execution_time: 0.1187s
Average KNN ROC:0.6959, precision @ rank n:0.2812, execution_time: 0.1167s




Minimum Covariance Determinant (MCD) ROC:0.8778, precision @ rank n:0.3906, execution_time: 0.392s
One-class SVM (OCSVM) ROC:0.9507, precision @ rank n:0.5938, execution_time: 0.0708s
Principal Component Analysis (PCA) ROC:0.9638, precision @ rank n:0.6875, execution_time: 0.003s

....Processing glass.mat ....
Outliers Percentage: 4.2056
Angle-based Outlier Detector (ABOD) ROC:0.7104, precision @ rank n:0.25, execution_time: 0.0399s
Cluster-based Local Outlier Factor ROC:0.8506, precision @ rank n:0.25, execution_time: 0.0319s
Feature Bagging ROC:0.7043, precision @ rank n:0.25, execution_time: 0.0269s
Histogram-base Outlier Detection (HBOS) ROC:0.6524, precision @ rank n:0.0, execution_time: 0.002s
Isolation Forest ROC:0.7195, precision @ rank n:0.25, execution_time: 0.2294s
K Nearest Neighbors (KNN) ROC:0.7805, precision @ rank n:0.25, execution_time: 0.007s
Average KNN ROC:0.7805, precision @ rank n:0.25, execution_time: 0.007s
Minimum Covariance Determinant (MCD) ROC:0.7165, precis



Angle-based Outlier Detector (ABOD) ROC:0.7813, precision @ rank n:0.3562, execution_time: 5.3357s
Cluster-based Local Outlier Factor ROC:0.8447, precision @ rank n:0.4007, execution_time: 0.7769s
Feature Bagging ROC:0.7259, precision @ rank n:0.3664, execution_time: 39.1712s
Histogram-base Outlier Detection (HBOS) ROC:0.5675, precision @ rank n:0.1199, execution_time: 0.0369s
Isolation Forest ROC:0.7801, precision @ rank n:0.2979, execution_time: 1.2637s
K Nearest Neighbors (KNN) ROC:0.8409, precision @ rank n:0.4144, execution_time: 4.885s
Average KNN ROC:0.8409, precision @ rank n:0.4144, execution_time: 5.58s


