In [1]:
%load_ext autoreload
%autoreload

import numpy as np
import pandas as pd
import os
import sys
import pickle

# Put the parent of the current working directory on the import path so that
# the `custom_scripts` package imported below can be found.
module_path = os.path.abspath(os.pardir)
if module_path not in sys.path:
    # Append only once so that re-running this cell does not add duplicates.
    sys.path.append(module_path)
    
from custom_scripts.config import loader
from custom_scripts.evaluate_performance import evaluate_performance
from custom_scripts.prepare_data import prepare_data

from sklearn.ensemble import IsolationForest
from sklearn.svm import OneClassSVM
from sklearn.covariance import EllipticEnvelope
from sklearn.neighbors import LocalOutlierFactor

### Load the data

In [23]:
# Day whose data is processed; it is also embedded in the file names of the
# models saved further down.
day_of_week = 'Monday'

# Ask the project loader for the processed / normalized dataset of that day
# (presumably a file path -- confirm against custom_scripts.config) and read
# the pickled DataFrame from it.
data_file = loader(
    day_of_week=day_of_week,
    data_type='processed',
    subtype='Normalized',
)
df = pd.read_pickle(data_file)

## Preparing the Dataset for Anomaly Detection 

Here we need to separate our class variable, which in our case is "Label", from the rest of the dataset.

Anomaly detection algorithms output +1 for inliers and -1 for outliers. Therefore:

1. We need to map multiple classes to binary classes.
2. Since we are interested in detecting anomalies, the negative class is 'BENIGN'.

In [24]:
# Split the frame into features (X) and target (y); the 'Label' column is
# reduced to a binary target with 'BENIGN' as the negative class.
X, y = prepare_data(
    data=df,
    class_column='Label',
    classes='binary',
    neg_class='BENIGN',
)

## 1. Training the Isolation Forest Method

We train an Isolation Forest model to detect anomalies in the dataset. The model returns -1 for outliers and 1 for inliers.

https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.IsolationForest.html

### Train and save the model

We first train our model and save it with pickle for future retrieval.

The contamination parameter of the model is the expected fraction of outliers in the dataset and should be adjusted if necessary. It is not set explicitly in the cell below, so scikit-learn's default value is used.

In [26]:
# Constructor arguments for the Isolation Forest. The `behaviour` keyword only
# exists in older scikit-learn releases (deprecated in 0.22, removed in 0.24);
# passing it to a newer release raises TypeError. Keep it only when the
# installed version still exposes it, so the cell runs on both.
isf_params = {'random_state': 42, 'n_jobs': 3}
if 'behaviour' in IsolationForest().get_params():
    isf_params['behaviour'] = 'new'

isf = IsolationForest(**isf_params)
isf.fit(X)

# Persist the fitted model. Create the target directory first: open(..., 'wb')
# fails with FileNotFoundError if ./models does not exist yet.
filename = './models/isf_model_' + day_of_week + '.pkl'
os.makedirs(os.path.dirname(filename), exist_ok=True)

with open(filename, 'wb') as file:
    pickle.dump(isf, file)



## 2. Training the one-class SVM Method

We train a one-class SVM model to detect anomalies in the dataset. The model returns -1 for outliers and 1 for inliers.

See: https://scikit-learn.org/stable/modules/generated/sklearn.svm.OneClassSVM.html#sklearn.svm.OneClassSVM

### Train and save the model

We first train our model and save it with pickle for future retrieval. The kernel choice is **Radial Basis Function** ('rbf'), as we would like to have locality in the decision boundaries learned by our SVM.

In [6]:
# Fit a one-class SVM with an RBF kernel on the full feature matrix.
svm_model = OneClassSVM(kernel='rbf', gamma='auto')
svm_model.fit(X)

# Persist the fitted model. Create the target directory first: open(..., 'wb')
# fails with FileNotFoundError if ./models does not exist yet.
filename = './models/svm_model_' + day_of_week + '.pkl'
os.makedirs(os.path.dirname(filename), exist_ok=True)

with open(filename, 'wb') as file:
    pickle.dump(svm_model, file)

## 3. Training the Elliptic Envelope method

We train an Elliptic Envelope model, which is designed to detect outliers in a Gaussian distributed dataset.

The model returns -1 for outliers and 1 for inliers.

See: https://scikit-learn.org/stable/modules/generated/sklearn.covariance.EllipticEnvelope.html

### Train and save the model

We first train our model and save it with pickle for future retrieval.

The contamination parameter of the model is the expected fraction of outliers in the dataset and should be adjusted if necessary.

In [27]:
# Fit an Elliptic Envelope, assuming 10% of the samples are outliers
# (contamination=0.1); random_state is fixed for reproducibility.
cov = EllipticEnvelope(contamination=0.1, random_state=0)
cov.fit(X)

# Persist the fitted model. Create the target directory first: open(..., 'wb')
# fails with FileNotFoundError if ./models does not exist yet.
filename = './models/cov_model_' + day_of_week + '.pkl'
os.makedirs(os.path.dirname(filename), exist_ok=True)

with open(filename, 'wb') as file:
    pickle.dump(cov, file)



## 4. Training the Local Outlier Factor (LOF) method

We train an LOF model to detect anomalies in the dataset.
The model returns -1 for outliers and 1 for inliers.

https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.LocalOutlierFactor.html

In [28]:
# Novelty=True  -> fit LOF as a novelty detector and pickle the fitted model.
# Novelty=False -> run outlier detection directly on X and pickle the
#                  predicted labels (-1 for outliers, 1 for inliers).
Novelty = True

lof = LocalOutlierFactor(n_neighbors=35, novelty=Novelty, n_jobs=3)

if Novelty:
    lof = lof.fit(X)
    filename = './models/lof_model_' + day_of_week + '.pkl'
    lof_output = lof
else:
    # The original branch read `x_red_pca` and `new_dimension`, which are never
    # defined in this notebook and raised NameError. No PCA step exists here,
    # so predict on X and drop the PCA suffix from the file name.
    lof_anomalies = lof.fit_predict(X)
    filename = './predictions/lof_anomalies_' + day_of_week + '.pkl'
    lof_output = lof_anomalies

# Create the target directory first: open(..., 'wb') fails with
# FileNotFoundError if it does not exist yet.
os.makedirs(os.path.dirname(filename), exist_ok=True)

with open(filename, 'wb') as file:
    pickle.dump(lof_output, file)

