# import python packagees

In [1]:
import os
import sys
import numpy as np
import pandas as pd


from sklearn.model_selection import train_test_split
from scipy.io import loadmat

# import pyod packages and methods

In [2]:
from pyod.models.pca import PCA
from pyod.models.mcd import MCD
from pyod.models.ocsvm import OCSVM
from pyod.models.lof import LOF
from pyod.models.cblof import CBLOF
from pyod.models.knn import KNN
from pyod.models.hbos import HBOS
from pyod.models.abod import ABOD
from pyod.models.iforest import IForest
from pyod.models.feature_bagging import FeatureBagging




# import metrics and packages

In [3]:
from pyod.utils.utility import standardizer
from pyod.utils.utility import precision_n_scores
from sklearn.metrics import roc_auc_score

# Define data and read x and y

In [4]:
mat_file_list = ['arrhythmia.mat','cardio.mat', 'glass.mat' ,'ionosphere.mat', 'letter.mat', 'lympho.mat','mnist.mat','musk.mat','optdigits.mat','pendigits.mat','pima.mat','satellite.mat','satimage-2.mat','shuttle.mat',
'vowels.mat',]



# how to load mat

In [5]:
data = loadmat('data/cardio.mat')
data

{'__header__': b'MATLAB 5.0 MAT-file, written by Octave 3.8.0, 2014-12-18 10:48:09 UTC',
 '__version__': '1.0',
 '__globals__': [],
 'X': array([[ 0.00491231,  0.69319077, -0.20364049, ...,  0.23149795,
         -0.28978574, -0.49329397],
        [ 0.11072935, -0.07990259, -0.20364049, ...,  0.09356344,
         -0.25638541, -0.49329397],
        [ 0.21654639, -0.27244466, -0.20364049, ...,  0.02459619,
         -0.25638541,  1.14001753],
        ...,
        [-0.41835583, -0.91998844, -0.16463485, ..., -1.49268341,
          0.24461959, -0.49329397],
        [-0.41835583, -0.91998844, -0.15093411, ..., -1.42371616,
          0.14441859, -0.49329397],
        [-0.41835583, -0.91998844, -0.20364049, ..., -1.28578165,
          3.58465295, -0.49329397]]),
 'y': array([[0.],
        [0.],
        [0.],
        ...,
        [1.],
        [1.],
        [1.]])}

len(data)

In [6]:
len(data)

5

In [7]:
data.keys()

dict_keys(['__header__', '__version__', '__globals__', 'X', 'y'])

In [8]:
data.values()

dict_values([b'MATLAB 5.0 MAT-file, written by Octave 3.8.0, 2014-12-18 10:48:09 UTC', '1.0', [], array([[ 0.00491231,  0.69319077, -0.20364049, ...,  0.23149795,
        -0.28978574, -0.49329397],
       [ 0.11072935, -0.07990259, -0.20364049, ...,  0.09356344,
        -0.25638541, -0.49329397],
       [ 0.21654639, -0.27244466, -0.20364049, ...,  0.02459619,
        -0.25638541,  1.14001753],
       ...,
       [-0.41835583, -0.91998844, -0.16463485, ..., -1.49268341,
         0.24461959, -0.49329397],
       [-0.41835583, -0.91998844, -0.15093411, ..., -1.42371616,
         0.14441859, -0.49329397],
       [-0.41835583, -0.91998844, -0.20364049, ..., -1.28578165,
         3.58465295, -0.49329397]]), array([[0.],
       [0.],
       [0.],
       ...,
       [1.],
       [1.],
       [1.]])])

# input independent feature in mat file

In [9]:
type(data['X']),data['X'].shape

(numpy.ndarray, (1831, 21))

# Dependent shape

In [10]:
type(data['y']),data['y'].shape

(numpy.ndarray, (1831, 1))

In [25]:
len(y)

452

# how to import all files in mat

In [30]:
from time import time
random_state = np.random.RandomState(42)


for mat_file in mat_file_list:
    print("\n...processing",mat_file, '----')
    mat = loadmat(os.path.join('data',mat_file))
    
    
    x = mat['X']
    y = mat['y'].ravel()
    
    
    outliers_fraction=np.count_nonzero(y) / len(y)
    outliers_percentage=round(outliers_fraction * 100, ndigits=4)
    
    #construct containers for saving results
    roc_list = [mat_file[:-4], x.shape[0] ,x.shape[1], outliers_percentage]
    prn_list = [mat_file[:-4], x.shape[0] ,x.shape[1], outliers_percentage]
    time_list = [mat_file[:-4], x.shape[0] ,x.shape[1], outliers_percentage]
    
    # 60% data for traning and 40% for testing
    X_train,y_train,X_test,y_test = train_test_split(x,y,test_size = 0.4,random_state = random_state)
    
    #standardizing the data for processing
    X_train_norm , X_test_norm = standardizer(X_train,y_train)
    
    classifiers = {'Angle-based outlier Detector (ABOD)': ABOD(contamination = outliers_fraction),
                  'cluster-based local outlier factor': CBLOF(contamination = outliers_fraction, check_estimator =False,
                                                             random_state = random_state),
                   'Feature Bagging' : FeatureBagging(contamination=outliers_fraction,random_state=random_state),
                   'Histogram-base outlier Detection(HBOS)':HBOS(contamination = outliers_fraction),
                   'Isolation Forest':IForest(contamination=outliers_fraction,random_state=random_state),
                   'K Nearest Neighbour(KNN)':KNN(contamination=outliers_fraction),
                   'Local Outlier Factor(LOF)':LOF(contamination=outliers_fraction),
                   'Minumum covarience deterinant(MCD)':MCD(contamination=outliers_fraction,random_state=random_state),
                   'one-class SVM (OCSVM)':OCSVM(contamination=outliers_fraction),
                   'principal component Analysis(PCA)' : PCA(contamination=outliers_fraction,random_state=random_state)
                  }
    
    
    for clf_name,clf in classifiers.items():
        tO = time()
        clf.fit(X_train_norm)
        test_scores = clf.decision_function(X_test_norm)
        t1 = time()
        duration = round(t1 - tO, ndigits = 4)
        time_list.append(duration)
        
        
        roc = round(roc_auc_score(y_test , test_scores), ndigits = 4)
        prn = round(precision_n_scores(y_test , test_scores), ndigits = 4)
        
        
        print('{clf_name} ROC:{roc}, precision @ rank n:{prn},'
            'execution time: {duration}s' .format(
            clf_name = clf_name ,roc=roc,prn = prn,duration=duration))
        
        
        roc_list.append(roc)
        prn_list.append(prn)
        
        
    temp_df = pd.DataFrame(time_list).transpose()
    temp_df.columns = df_columns
    time_df = pd.concat([time_df,temp_df] , axis = 0)
    
    temp_df = pd.DataFrame(roc_list).transpose()
    temp_df.columns = df_columns
    time_df = pd.concat([roc_df,temp_df] , axis = 0)
    
    temp_df = pd.DataFrame(prn_list).transpose()
    temp_df.columns = df_columns
    time_df = pd.concat([prn_df,temp_df] , axis = 0)

    


...processing arrhythmia.mat ----
Angle-based outlier Detector (ABOD) ROC:0.7687, precision @ rank n:0.3571,execution time: 0.1556s




cluster-based local outlier factor ROC:0.7684, precision @ rank n:0.4643,execution time: 0.1022s
Feature Bagging ROC:0.7799, precision @ rank n:0.5,execution time: 0.5755s
Histogram-base outlier Detection(HBOS) ROC:0.8511, precision @ rank n:0.5714,execution time: 0.0549s
Isolation Forest ROC:0.8478, precision @ rank n:0.5357,execution time: 0.4029s
K Nearest Neighbour(KNN) ROC:0.782, precision @ rank n:0.5,execution time: 0.0818s
Local Outlier Factor(LOF) ROC:0.7787, precision @ rank n:0.4643,execution time: 0.0688s




Minumum covarience deterinant(MCD) ROC:0.8228, precision @ rank n:0.4286,execution time: 0.6233s
one-class SVM (OCSVM) ROC:0.7986, precision @ rank n:0.5,execution time: 0.0409s
principal component Analysis(PCA) ROC:0.7997, precision @ rank n:0.5,execution time: 0.0568s


NameError: name 'time_df' is not defined