In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
 
import sklearn.preprocessing
from sklearn.decomposition import PCA
from sklearn.svm import OneClassSVM
from sklearn.neighbors import LocalOutlierFactor

import utils
import constants

In [10]:
def drop_and_fill(data):
    data["timestamp"] = pd.to_datetime(data["timestamp"])
    data = data.drop(["QGL", "T-JUS-CKGL"], axis=1)
    data = data.fillna(0)
    data = data.set_index("timestamp")
    
    return data

In [11]:
def split_normal_data(data, split_size=constants.TRAIN_SPLIT):
    normal_data = data[data["class"] == 0]
    
    normal_data = normal_data.drop("class", axis=1)

    split = int(split_size * len(normal_data))

    train_normal_data = normal_data.iloc[:split]
    test_normal_data = normal_data.iloc[split:]
    
    return train_normal_data, test_normal_data

In [12]:
def match_normal_samples(abnormal_data, n_test_normal):
    samples = np.arange(len(abnormal_data))
    np.random.shuffle(samples)
    samples = samples[:n_test_normal]
    
    return abnormal_data[samples]

In [13]:
def shuffle_test(test_data):
    shuffle = np.arange(len(test_data))
    np.random.shuffle(shuffle)
    
    y_test = [0 if s < len(test_data)/2 else 1 for s in shuffle]
    
    return test_data[shuffle], y_test

### Vargas (2019) benchmark

In [6]:
precision_scores = []
recall_scores = []
f1_scores = []

files = utils.read_files(constants.FILE_PATH, classes=constants.ABNORMAL_CLASSES, real_only=True)

for file in tqdm(files):
    data = pd.read_csv(file)
    data = drop_and_fill(data)
    train_normal_data, test_normal_data = split_normal_data(data, split_size=constants.TRAIN_SPLIT)
    abnormal_data = data[data["class"] != 0]
    abnormal_data = abnormal_data.drop("class", axis=1)
    
    if len(train_normal_data) < 50:
        continue
    
    scaler = sklearn.preprocessing.StandardScaler()
    train_data = scaler.fit_transform(train_normal_data)
    test_normal_data = scaler.transform(test_normal_data)
    test_abnormal_data = scaler.transform(abnormal_data)
    
    train = utils.create_sequence(train_data)
    test_normal = utils.create_sequence(test_normal_data)
    
    test_abnormal = utils.create_sequence(test_abnormal_data)
    test_abnormal = match_normal_samples(test_abnormal, len(test_normal))
    
    test = np.concatenate((test_normal, test_abnormal))
    test, y_test = shuffle_test(test)
    
    x_train = np.empty(shape=(len(train), 5, train.shape[2]))
    x_test = np.empty(shape=(len(test), 5, test.shape[2]))
    
    for i, sample in enumerate(train):
        x_train[i] = utils.get_features(sample)
        
    for i, sample in enumerate(test):
        x_test[i] = utils.get_features(sample)
    
    if len(x_train) < 8 or len(x_test) < 6:
        continue
    
    x_train = x_train.reshape(len(x_train), 5 * train.shape[2])
    x_test = x_test.reshape(len(x_test), 5 * test.shape[2])
    
    pca = PCA(n_components=4)
    x_train = pca.fit_transform(x_train)
    x_test = pca.transform(x_test)
    
    lof = LocalOutlierFactor(novelty=True, n_neighbors=5, metric="euclidean").fit(x_train)
    p = [0 if pred == 1 else 1 for pred in lof.predict(x_test)]
    
    precision = utils.precision(y_test, p)
    recall = utils.recall(y_test, p)
    f1 = utils.f1(y_test, p)

    
    precision_scores.append(precision)
    recall_scores.append(recall)
    f1_scores.append(f1)

100%|██████████| 49/49 [00:07<00:00,  6.95it/s]


In [8]:
precision_scores = np.array(precision_scores)
recall_scores = np.array(recall_scores)
f1_scores = np.array(f1_scores)

print("Scores:")
print("Precision:", (precision_scores.mean(), precision_scores.std()))
print("Recall:", (recall_scores.mean(), recall_scores.std()))
print("F1:", (f1_scores.mean(), f1_scores.std()))

Scores:
Precision: (0.6980335947793712, 0.1788824053101556)
Recall: (0.9933862433862435, 0.03019236034527335)
F1: (0.806121234521302, 0.11408482694909357)
