In [None]:
import os
import sys
if '/home/zechengh/Mastik/ad/detector/' not in sys.path:
    sys.path.append('/home/zechengh/Mastik/ad/detector/')
from collections import OrderedDict
    
import numpy as np
import torch
import matplotlib.pyplot as plt
%matplotlib inline

import utils
import ADbenchmark
import LSTMAD

import json
import collections

import torch
from pathlib import Path

from sklearn.manifold import TSNE
from sklearn.decomposition import PCA

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

np.random.seed(0)
torch.manual_seed(0)

# Do not write .pyc
sys.dont_write_bytecode = True

# Reload code when code is changed
%load_ext autoreload
%autoreload 2

id_to_feature = utils.id_to_feature
for k, v in id_to_feature.items():
    print(k, v)

In [None]:
# Raw arrays, addressed as data[bg_program][file_name].
id_to_feature = utils.id_to_feature
data = collections.defaultdict(collections.defaultdict)

for bg_program in ('none', 'mysql', 'webserver', 'streamserver', 'mltrain', 'mapreduce'):
    data_dir = f'perf/data/{bg_program}_same_core/10000us/'
    # Only the .npy files in the directory are loaded.
    npy_names = [entry for entry in os.listdir(data_dir) if entry.endswith('.npy')]
    for npy_name in npy_names:
        # The dictionary key is everything before the first dot of the file name.
        file_name = npy_name.split('.')[0]
        data[bg_program][file_name] = np.load(os.path.join(data_dir, npy_name))

# Column indices selected in utils.FeatureSelect; used to slice the arrays later.
feature_list = utils.FeatureSelect.feature_list

print("Used features:")
for feature_id in feature_list:
    print(id_to_feature[feature_id])

In [None]:
# Checkpoint file for each detector, keyed by the name used to select it later.
# NOTE(review): torch.load unpickles the file, so only load checkpoints from a
# trusted source.
checkpoint_paths = {
    'mysql': "detector/checkpoints/AnomalyDetectorMysql.ckpt",
    'streamserver': "detector/checkpoints/AnomalyDetectorSS.ckpt",
    'webserver': "detector/checkpoints/AnomalyDetectorWS.ckpt",
    'mltrain': "detector/checkpoints/AnomalyDetectorMLtrain.ckpt",
    'merged': "detector/checkpoints/AnomalyDetectorMerged.ckpt",
}

models = {}
for detector_name, checkpoint_path in checkpoint_paths.items():
    models[detector_name] = torch.load(checkpoint_path)

In [None]:
from pathlib import Path
import concurrent

# Run every loaded array through the selected detector and store the
# difference between the normalized input and the model output on disk.
model_name = 'merged'
model = models[model_name]
pred_errors = collections.defaultdict(collections.defaultdict)

for bg_program, arrays_by_file in data.items():
    for file_name, raw_array in arrays_by_file.items():

        # Keep only the selected feature columns, cast to float32, normalize.
        selected = np.float32(raw_array[:, feature_list])
        data_in = model.normalize(selected)
        data_in_tensor = torch.tensor(data_in)

        # Only the second returned value is used here.
        _, pred = model._get_reconstruction_error(data_in_tensor, gpu=True)

        # Take index 0 along the middle axis and move the result to a NumPy array.
        pred = pred[:, 0, :].detach().cpu().numpy()

        # The output is compared against the input rows from index 1 onward.
        residual = data_in[1:, :] - pred
        pred_errors[bg_program][file_name] = residual

        Path(f'detector/preprocessed/pred_errors/{model_name}/{bg_program}').mkdir(parents=True, exist_ok=True)
        np.save(f'detector/preprocessed/pred_errors/{model_name}/{bg_program}/{file_name}', residual)

In [None]:
from sklearn.neighbors import KernelDensity
# `import concurrent` alone does not load the `futures` submodule, so import
# it explicitly instead of relying on another library having done so.
import concurrent.futures

sampling = True
training_data = []
testing_data = collections.defaultdict(list)  # not populated in this cell

pred_errors = collections.defaultdict(collections.defaultdict)
model_name = 'merged'

# Reload the saved prediction errors from disk, keyed as
# pred_errors[bg_program][file_name].
for bg_program in ['none', 'mysql', 'webserver', 'streamserver', 'mltrain', 'mapreduce']:
    data_dir = f'detector/preprocessed/pred_errors/{model_name}/{bg_program}/'
    for f in os.listdir(data_dir):
        if f.endswith('.npy'):
            file_name = f.split('.')[0]
            pred_errors[bg_program][file_name] = np.load(os.path.join(data_dir, f))

# The KDE is fitted on the 'ref_and_val_normal' array of every workload.
for bg_program in pred_errors.keys():
    d = pred_errors[bg_program]['ref_and_val_normal']
    if sampling:
        # 1000 row indices drawn independently, so duplicates are possible.
        sampling_idx = np.random.randint(low=0, high=len(d), size=1000)
        d = d[sampling_idx, :]

    training_data.append(d)

training_data = np.concatenate(training_data, axis=0)

kde = KernelDensity(kernel='gaussian', bandwidth=0.5).fit(training_data)
th = 5.0  # not used in this cell


def anomaly_detection(kde, data, bg_program, file_name):
    """Score one array with the fitted KDE and save the scores to disk.

    Args:
        kde: Fitted estimator providing ``score_samples``.
        data: Array passed to ``kde.score_samples``.
        bg_program: Workload name; used as the output sub-directory.
        file_name: Output file name (``np.save`` appends ``.npy``).

    Returns:
        Always 1.

    The output directory also depends on the module-level ``model_name``.
    """
    kde_result = kde.score_samples(data)

    Path(f'detector/preprocessed/kde/{model_name}/{bg_program}').mkdir(parents=True, exist_ok=True)
    np.save(f'detector/preprocessed/kde/{model_name}/{bg_program}/{file_name}', kde_result)
    print(bg_program, file_name, kde_result)

    return 1


# Accelerate with multiprocessing. One pool serves every workload, and the
# `with` block shuts the worker processes down when all tasks are finished.
with concurrent.futures.ProcessPoolExecutor(20) as executor:
    futures = [
        executor.submit(anomaly_detection, kde, errors, workload, name)
        for workload, errors_by_file in pred_errors.items()
        for name, errors in errors_by_file.items()
    ]
    for future in concurrent.futures.as_completed(futures):
        # result() re-raises any exception from the worker; wait() alone
        # would let a failed task pass unnoticed.
        future.result()

In [None]:
model_name = 'merged'
kde_results = collections.defaultdict(collections.defaultdict)
stats = collections.defaultdict(list)

predicts = []
ked_result_all = []
# Collect the saved KDE scores of every training file for every workload.
for bg_program in ['mysql', 'webserver', 'streamserver', 'mltrain', 'mapreduce']:
    for file_name in [
            'train_normal',
            'train_normal_with_gpg',
            'train_normal_with_gcc',
            'train_normal_with_mcf',
            'train_normal_with_libquantum',
            ]:

        kde_result = np.load(f'detector/preprocessed/kde/{model_name}/{bg_program}/{file_name}.npy')
        ked_result_all.append(kde_result)

# Flatten with concatenate rather than np.array(...).reshape(-1): the per-file
# score arrays are not guaranteed to have the same length, and a ragged list
# cannot be turned into a regular array.
ked_result_all = np.concatenate([np.ravel(scores) for scores in ked_result_all])
ked_result_all.sort()

# The printed threshold is the 20th percentile of the training-normal scores:
# 20% of training normal scores fall below it and would be classified as abnormal.
print("Mean ", np.mean(ked_result_all), "Std", np.std(ked_result_all))
plt.hist(ked_result_all, bins=np.linspace(-20, 0, 100))
print("Threshold ", np.percentile(ked_result_all, 20))

In [None]:
import pandas as pd
pd.set_option('display.max_rows', None)

model_name = 'merged'
# Scores below th count as abnormal, scores at or above it as normal.
th = -3.3

kde_results = collections.defaultdict(collections.defaultdict)

# Test-case names are assembled from their parts, in the order they are reported:
# the normal runs first, then each attack alone, then each attack per co-runner.
normal_co_runners = ['gpg', 'gcc', 'mcf', 'libquantum']
attacks = [
    'l1pp', 'l3pp', 'fr', 'ff',
    'spectrev1', 'spectrev2', 'spectrev3', 'spectrev4',
    'bufferoverflow',
]
abnormal_suffixes = ['', '_with_gpg', '_with_gcc', '_with_libquantum']

test_cases = ['test_normal']
test_cases += [f'test_normal_with_{co_runner}' for co_runner in normal_co_runners]
test_cases += [
    f'test_abnormal_{attack}{suffix}'
    for suffix in abnormal_suffixes
    for attack in attacks
]

predicts = []
for bg_program in ['mysql', 'webserver', 'streamserver', 'mltrain', 'mapreduce']:
    for file_name in test_cases:

        kde_result = np.load(f'detector/preprocessed/kde/{model_name}/{bg_program}/{file_name}.npy')
        total = float(len(kde_result))

        # Fraction of samples on each side of the threshold.
        pred_normal = np.sum(kde_result >= th) / total
        pred_abnormal = np.sum(kde_result < th) / total

        # One table row: identifiers, the two fractions, then summary statistics.
        row = [bg_program, file_name, pred_normal, pred_abnormal]
        row += [stat(kde_result) for stat in (np.mean, np.std, np.min, np.max)]
        row += [np.percentile(kde_result, q) for q in (10, 90)]
        predicts.append(row)

columns = ['Workload', 'Test Case', 'Pred normal', 'Pred abnormal', 'Mean', 'Std', 'Min', 'Max', '10%', '90%']
print(pd.DataFrame(predicts, columns=columns))