In [1]:
#Loading the required libraries

#Warnings package for ignoring unneccesary warnings
import warnings 
warnings.filterwarnings('ignore')

Below are the list of algorithms associated with pyOD package that we are going to learn

1. **Linear based Outlier Detection**
> 1.1 **PCA: Principal Component Analysis** uses sum of weighted projected distances to the eigenvector hyperplane as outlier score<br>
> 1.2 **MCD: Minimum Covariant Determinant** uses mahalanobis distance as the outlier scores<br>
> 1.3 **OCSVM: One-Factor Support Vector Machine** uses<br>

2. **Proximity Based Outlier Detection**
> 2.1 **LOF: Local Outlier Factor**<br>
> 2.2 **CBLOF: Clustering-Based Local Outlier Factor**<br>
> 2.3 **kNN: k-Nearst Neighbours**<br>
> 2.4 **HBOS: Histogram Based Outlier Scores**<br>

3. **Probabilistic Outlier Detection**<br>
> 3.1 **ABOD: Angular Based Outlier Detection**<br>

4. **Outlier Ensembles and Framework Combinations**<br>
> 4.1 **Isolation Forest**<br>
> 4.2 **Feature Bagging**

### Import required libraries

In [2]:
#General libraries
import os
import sys
from time import time

#data manipulation and storage libraries
import numpy as np
import pandas as pd

#matlab file reading libraries
from sklearn.model_selection import train_test_split
from scipy.io import loadmat

#import pyod packages
from pyod.models.pca import PCA
from pyod.models.mcd import MCD
from pyod.models.ocsvm import OCSVM
from pyod.models.lof import LOF
from pyod.models.cblof import CBLOF
from pyod.models.knn import KNN
from pyod.models.hbos import HBOS
from pyod.models.abod import ABOD
from pyod.models.iforest import IForest
from pyod.models.feature_bagging import FeatureBagging

#Performance packages
from pyod.utils.utility import standardizer, precision_n_scores
from sklearn.metrics import roc_auc_score

### 2. Collecting and reading files

In [3]:
#define the list of all files 
mat_files_list = [
    'arrhythmia.mat','cardio.mat','glass.mat','ionosphere.mat','letter.mat',
    'lympho.mat','mnist.mat','musk.mat','optdigits.mat','pendigits.mat',
    'pima.mat','satellite.mat','satimage-2.mat','shuttle.mat','vertebral.mat',
    'vowels.mat','wbc.mat'
]

#define the columns
df_columns = ['Data','#Samples','#Dimensions','Outlier Perc','ABOD','CBLOF','FB','HBOS','IForest','KNN','LOF','MCD','OCSVM','PCA']

In [4]:
#creating a empty results dataframes(ROC, precision scores and Time)
roc_df = pd.DataFrame(columns=df_columns)
prec_df = pd.DataFrame(columns=df_columns)
time_df = pd.DataFrame(columns=df_columns)

### 3. Exploring all matfiles and build the models

In [9]:
#declare random state
random_state = np.random.RandomState(42)

#instantiate a for loop that takes all above mat files and divide the X and y data
for mat_file in mat_files_list:
    print("\n....Processing....", mat_file)
    mat = loadmat(mat_file)
    X = mat['X']
    y = mat['y'].ravel()
    outliers_fraction = np.count_nonzero(y) / len(y)
    
    outliers_percentage = round(outliers_fraction * 100, ndigits=4)
    
    #construct containers for saving results
    roc_list = [mat_file[:-4],X.shape[0], X.shape[1], outliers_percentage]
    prec_list = [mat_file[:-4],X.shape[0], X.shape[1], outliers_percentage]
    time_list = [mat_file[:-4],X.shape[0], X.shape[1], outliers_percentage]
    
    #split the data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=random_state)
    
    #Standardizing the data for normalize values
    X_train_norm, X_test_norm = standardizer(X_train, X_test)
    
    #Define the classifiers dictionary
    classifiers = {
        'Angle-Based Outlier Detector (ABOD)' : ABOD(contamination=outliers_fraction),
        'Cluster-Based Local Outlier Factor(CBLOF)': CBLOF(contamination=outliers_fraction, check_estimator=False, random_state=101),
        'Feature Bagging(FB)': FeatureBagging(contamination=outliers_fraction, random_state=random_state),
        'Histogram Based Outlier Scores(HBOS)': HBOS(contamination=outliers_fraction),
        'Isolation Forest(IF)': IForest(contamination=outliers_fraction, random_state=random_state),
        'k-nearest Neighbours(kNN)' : KNN(contamination=outliers_fraction),
        'Local Outlier Factor(LOF)' : LOF(contamination=outliers_fraction),
        'Minimum Covariant Determinant(MCD)': MCD(contamination=outliers_fraction),
        'One-Factor Support Vector Machine(OCSVM)': OCSVM(contamination=outliers_fraction),
        'Principal Component Analysis(PCA)': PCA(contamination=outliers_fraction, random_state=random_state)
        
    }
    
    #instantiate a for loop that takes all above mat files and apply all classifiers above
    for clf_name, clf in classifiers.items():
        t0 = time()
        clf.fit(X_train_norm)
        test_scores = clf.decision_function(X_test_norm)
        t1 = time()
        duration = round(t1 - t0, ndigits=4)
        time_list.append(duration)
        
        roc = round(roc_auc_score(y_test, test_scores), ndigits=4)
        prn = round(precision_n_scores(y_test, test_scores), ndigits=4)
        print('{clf_name} ROC: {roc}, precision @ rank n:{prn}, execution time: {duration} s'.format(clf_name = clf_name,roc = roc, prn = prn, duration = duration))
        roc_list.append(roc)
        prec_list.append(prn)
    
    temp_df_1 = pd.DataFrame(time_list).transpose()
    temp_df_1.columns = df_columns
    time_df = pd.concat([time_df, temp_df_1], axis=0)
    
    temp_df_2 = pd.DataFrame(roc_list).transpose()
    temp_df_2.columns = df_columns
    roc_df = pd.concat([roc_df, temp_df_2], axis=0)
    
    temp_df_3 = pd.DataFrame(prec_list).transpose()
    temp_df_3.columns = df_columns
    prec_df = pd.concat([prec_df, temp_df_3], axis=0)
    


....Processing.... arrhythmia.mat
Angle-Based Outlier Detector (ABOD) ROC: 0.7687, precision @ rank n:0.3571, execution time: 5.7999 s
Cluster-Based Local Outlier Factor(CBLOF) ROC: 0.7747, precision @ rank n:0.5, execution time: 4.7285 s
Feature Bagging(FB) ROC: 0.7806, precision @ rank n:0.4643, execution time: 1.0101 s
Histogram Based Outlier Scores(HBOS) ROC: 0.8511, precision @ rank n:0.5714, execution time: 5.5793 s
Isolation Forest(IF) ROC: 0.8527, precision @ rank n:0.5714, execution time: 1.4086 s
k-nearest Neighbours(kNN) ROC: 0.782, precision @ rank n:0.5, execution time: 0.218 s
Local Outlier Factor(LOF) ROC: 0.7787, precision @ rank n:0.4643, execution time: 0.126 s
Minimum Covariant Determinant(MCD) ROC: 0.8228, precision @ rank n:0.4286, execution time: 3.0955 s
One-Factor Support Vector Machine(OCSVM) ROC: 0.7986, precision @ rank n:0.5, execution time: 0.083 s
Principal Component Analysis(PCA) ROC: 0.7997, precision @ rank n:0.5, execution time: 0.1867 s

....Processi

Isolation Forest(IF) ROC: 0.7121, precision @ rank n:0.0172, execution time: 2.65 s
k-nearest Neighbours(kNN) ROC: 0.363, precision @ rank n:0.0, execution time: 4.552 s
Local Outlier Factor(LOF) ROC: 0.4692, precision @ rank n:0.0172, execution time: 3.486 s
Minimum Covariant Determinant(MCD) ROC: 0.4116, precision @ rank n:0.0, execution time: 4.383 s
One-Factor Support Vector Machine(OCSVM) ROC: 0.4732, precision @ rank n:0.0, execution time: 2.37 s
Principal Component Analysis(PCA) ROC: 0.4931, precision @ rank n:0.0, execution time: 0.117 s

....Processing.... pendigits.mat
Angle-Based Outlier Detector (ABOD) ROC: 0.7067, precision @ rank n:0.0526, execution time: 5.188 s
Cluster-Based Local Outlier Factor(CBLOF) ROC: 0.7942, precision @ rank n:0.1053, execution time: 1.0444 s
Feature Bagging(FB) ROC: 0.5604, precision @ rank n:0.0702, execution time: 7.574 s
Histogram Based Outlier Scores(HBOS) ROC: 0.9246, precision @ rank n:0.2105, execution time: 0.028 s
Isolation Forest(IF) R

In [10]:
print(time_df)

         Data #Samples #Dimensions Outlier Perc     ABOD   CBLOF       FB  \
0  arrhythmia      452         274      14.6018   5.7999  4.7285   1.0101   
0      cardio     1831          21       9.6122    1.332  0.4759    1.599   
0       glass      214           9       4.2056   0.1681   0.143    0.101   
0  ionosphere      351          33      35.8974    0.286   0.214    0.186   
0      letter     1600          32         6.25    1.209   0.478    1.345   
0      lympho      148          18       4.0541    0.115   0.145    0.092   
0       mnist     7603         100       9.2069   18.028   3.732  93.0682   
0        musk     3062         166       3.1679    5.844   1.335  32.5271   
0   optdigits     5216          64       2.8758     6.89   1.852   25.414   
0   pendigits     6870          16       2.2707    5.188  1.0444    7.574   
0        pima      768           8      34.8958    0.935   0.287    0.236   
0   satellite     6435          36      31.6395    6.491    1.36   16.693   

In [11]:
print(roc_df)

         Data #Samples #Dimensions Outlier Perc    ABOD   CBLOF      FB  \
0  arrhythmia      452         274      14.6018  0.7687  0.7747  0.7806   
0      cardio     1831          21       9.6122  0.6618  0.8471  0.6281   
0       glass      214           9       4.2056   0.755  0.8554  0.8514   
0  ionosphere      351          33      35.8974  0.9275  0.9101  0.9228   
0      letter     1600          32         6.25  0.8871  0.7627  0.8956   
0      lympho      148          18       4.0541  0.8362  0.9741  0.9741   
0       mnist     7603         100       9.2069  0.7796  0.8643  0.7209   
0        musk     3062         166       3.1679  0.1806       1  0.7416   
0   optdigits     5216          64       2.8758  0.4699  0.7638  0.4581   
0   pendigits     6870          16       2.2707  0.7067  0.7942  0.5604   
0        pima      768           8      34.8958  0.7186  0.6529  0.6497   
0   satellite     6435          36      31.6395  0.5564   0.714   0.563   
0  satimage-2     5803   

In [12]:
print(prec_df)

         Data #Samples #Dimensions Outlier Perc    ABOD   CBLOF      FB  \
0  arrhythmia      452         274      14.6018  0.3571     0.5  0.4643   
0      cardio     1831          21       9.6122  0.2571  0.5714     0.1   
0       glass      214           9       4.2056  0.3333  0.3333  0.3333   
0  ionosphere      351          33      35.8974  0.8364  0.7963  0.7818   
0      letter     1600          32         6.25  0.4091  0.2727  0.5227   
0      lympho      148          18       4.0541     0.5     0.5     0.5   
0       mnist     7603         100       9.2069  0.3563  0.4368  0.3065   
0        musk     3062         166       3.1679  0.1081       1  0.2973   
0   optdigits     5216          64       2.8758       0       0  0.0172   
0   pendigits     6870          16       2.2707  0.0526  0.1053  0.0702   
0        pima      768           8      34.8958   0.534  0.4272  0.4369   
0   satellite     6435          36      31.6395  0.3819  0.5337  0.3953   
0  satimage-2     5803   