In [None]:
import os
import sys
if '/home/zechengh/Mastik/ad/detector/' not in sys.path:
    sys.path.append('/home/zechengh/Mastik/ad/detector/')
from collections import OrderedDict
    
import numpy as np
import torch
import matplotlib.pyplot as plt
%matplotlib inline

import utils
import ADbenchmark
import LSTMAD

import json
import collections

import torch

from sklearn.manifold import TSNE
from sklearn.decomposition import PCA

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# Seed NumPy's and PyTorch's global RNGs so that the np.random-based
# sub-sampling done in later cells is repeatable when cells are run in order.
np.random.seed(0)
torch.manual_seed(0)

# Do not write .pyc
# NOTE(review): this flag is set after utils / ADbenchmark / LSTMAD were
# imported above, so it presumably only affects imports and reloads that
# happen from this point on - confirm that is acceptable.
sys.dont_write_bytecode = True

# Reload code when code is changed
%load_ext autoreload
%autoreload 2

# Print every entry of the mapping exposed by utils.id_to_feature.
# (presumably feature id -> feature name; verify against utils)
id_to_feature = utils.id_to_feature
for k, v in id_to_feature.items():
    print(k, v)

In [None]:
# Load every cached prediction-error array of the selected model.
# Resulting layout: pred_errors[bg_program][file_stem] -> array loaded from <file_stem>.npy
pred_errors = collections.defaultdict(collections.defaultdict)
model_name = 'merged'

for bg_program in ['none', 'mysql', 'webserver', 'streamserver', 'mltrain', 'mapreduce']:
    # The f-string already interpolates both names; passing the result through
    # str.format() again was redundant and would fail on any literal brace.
    data_dir = f'detector/preprocessed/pred_errors/{model_name}/{bg_program}/'
    # Sorted so that the insertion order of pred_errors[bg_program] (which later
    # cells iterate over) does not depend on the filesystem's listing order.
    for f in sorted(os.listdir(data_dir)):
        if f.endswith('.npy'):
            # splitext removes only the trailing ".npy", so stems containing
            # dots are kept intact instead of being cut at the first dot.
            file_name = os.path.splitext(f)[0]
            pred_errors[bg_program][file_name] = np.load(os.path.join(data_dir, f))

In [None]:
sampling = True

# Attack traces recorded without a background workload; their prediction
# errors make up the KDE training set.
attack_train_files = (
    "train_abnormal_l1pp",
    "train_abnormal_l3pp",
    "train_abnormal_fr",
    "train_abnormal_ff",
    "train_abnormal_spectrev1",
    "train_abnormal_spectrev2",
    "train_abnormal_spectrev3",
    "train_abnormal_spectrev4",
    "train_abnormal_bufferoverflow",
)

sampled_parts = []
for file_name in attack_train_files:
    # Only the first 5000 rows of each trace are eligible for training.
    head = pred_errors['none'][file_name][:5000]
    if not sampling:
        sampled_parts.append(head)
        continue
    # 1000 row indices drawn independently, i.e. with replacement.
    rows = np.random.randint(low=0, high=len(head), size=1000)
    sampled_parts.append(head[rows, :])

training_data = np.concatenate(sampled_parts, axis=0)

# Keep a copy of the training matrix on disk (np.save writes "train.npy").
np.save('train', training_data)

In [None]:
from sklearn.neighbors import KernelDensity
from pathlib import Path
# A bare `import concurrent` does not load the `concurrent.futures` submodule;
# it only appeared to work when another library had already imported it.
import concurrent.futures


def known_attack_detection(kde, data, bg_program, file_name):
    """Score ``data`` under ``kde`` and save the scores to disk.

    The scores are written to
    ``detector/preprocessed/attack_kde/<model_name>/<bg_program>/<file_name>.npy``
    (``model_name`` is read from the notebook namespace).

    Args:
        kde: fitted estimator exposing ``score_samples``.
        data: samples to score; handed unchanged to ``kde.score_samples``.
        bg_program: background-workload name, used as the output sub-directory.
        file_name: output file stem.

    Returns:
        1, as a completion marker.
    """
    kde_result = kde.score_samples(data)

    out_dir = Path(f'detector/preprocessed/attack_kde/{model_name}/{bg_program}')
    # exist_ok: several workers may try to create the same directory at once.
    out_dir.mkdir(parents=True, exist_ok=True)
    np.save(out_dir / file_name, kde_result)
    print(bg_program, file_name, kde_result)
    return 1


# Density model of the attack prediction errors collected in training_data.
kde = KernelDensity(kernel='gaussian', bandwidth=0.1).fit(training_data)

kde_results = collections.defaultdict(collections.defaultdict)
for bg_program in ['none']:
    # The context manager shuts the worker pool down even if a task fails.
    with concurrent.futures.ProcessPoolExecutor(20) as executor:
        futures = [
            executor.submit(known_attack_detection, kde, data, bg_program, file_name)
            for file_name, data in pred_errors[bg_program].items()
        ]
        # result() re-raises worker exceptions; wait() alone would silently
        # drop them and leave score files missing for the cells that load them.
        for future in concurrent.futures.as_completed(futures):
            future.result()

In [None]:
from sklearn.metrics import roc_curve, auc
import pandas as pd
pd.set_option('display.max_rows', None)

model_name = 'merged'

kde_results = collections.defaultdict(collections.defaultdict)

predicts = []


def _load_heldout_scores(bg_programs, file_names):
    """Load saved KDE scores and return rows 5000 onward of every file as one flat array.

    Args:
        bg_programs: background-workload directory names to read from.
        file_names: score-file stems (without the ``.npy`` suffix).

    Returns:
        1-D array holding the selected scores of all files in iteration order
        (an empty array when there is nothing to load).
    """
    parts = [
        np.load(f'detector/preprocessed/attack_kde/{model_name}/{bg_program}/{file_name}.npy')[5000:]
        for bg_program in bg_programs
        for file_name in file_names
    ]
    if not parts:
        return np.array([])
    # axis=None flattens every part before joining. Unlike
    # np.array(parts).reshape(-1), this also works when the files do not all
    # contain the same number of rows.
    return np.concatenate(parts, axis=None)


# Rows [5000:] only: the KDE training cell draws its samples from rows [:5000].
benign_ked_result_all = _load_heldout_scores(['none'], [
    'train_normal',
    'train_normal_with_gpg',
    'train_normal_with_bzip2',
    'train_normal_with_gcc',
    'train_normal_with_mcf',
    'train_normal_with_milc',
    'train_normal_with_namd',
    'train_normal_with_gobmk',
    'train_normal_with_soplex',
    'train_normal_with_hmmer',
    'train_normal_with_libquantum',
    'train_normal_with_h264ref',
])

# NOTE(review): train_abnormal_bufferoverflow is part of the KDE training set
# but is not listed here - confirm that the omission is intended.
attack_ked_result_all = _load_heldout_scores(['none'], [
    "train_abnormal_l1pp",
    "train_abnormal_l3pp",
    "train_abnormal_fr",
    "train_abnormal_ff",
    "train_abnormal_spectrev1",
    "train_abnormal_spectrev2",
    "train_abnormal_spectrev3",
    "train_abnormal_spectrev4",
])

def get_eer(y, y_pred):
    """Pick the decision threshold at the (approximate) equal-error-rate point.

    Args:
        y: true binary labels; 1 is treated as the positive class.
        y_pred: scores, passed to ``roc_curve`` together with ``y``.

    Returns:
        The smallest ROC threshold whose ``|FPR - FNR|`` is below 1e-4. When no
        ROC point is that close, the threshold of the point with the smallest
        ``|FPR - FNR|`` (previously ``inf`` was returned in that case, which
        discarded the computed closest point).
    """
    fpr, tpr, threshold = roc_curve(y, y_pred, pos_label=1, drop_intermediate=False)
    fnr = 1 - tpr
    gap = np.absolute(fnr - fpr)

    # For attack detector, there are a wide range of thresholds which have approx EER (diff < 1e-4)
    # Choose the smallest to be strict (highest FPR), because later larger window size can reduce FPR
    near_eer = gap < 1e-4
    if np.any(near_eer):
        return np.min(threshold[near_eer])

    # Nothing within tolerance: use the single closest operating point.
    return threshold[np.nanargmin(gap)]

# Label 1 = attack, 0 = benign; the scores are laid out attack-first to match.
n_attack = len(attack_ked_result_all)
n_benign = len(benign_ked_result_all)
y = [1] * n_attack + [0] * n_benign
y_pred = np.array([*attack_ked_result_all, *benign_ked_result_all])

eer_threshold = get_eer(y, y_pred)
print(eer_threshold)

In [None]:
# Show the held-out scores, benign first and attack second.
for scores in (benign_ked_result_all, attack_ked_result_all):
    print(scores)

In [None]:
import pandas as pd
pd.set_option('display.max_rows', None)

model_name = 'merged'
# Decision threshold on the saved KDE scores: score > th is counted as attack.
# NOTE(review): hard-coded; presumably picked from the eer_threshold printed
# by the EER cell - confirm.
th = -18

kde_results = collections.defaultdict(collections.defaultdict)

predicts = []
attack_list = ['l1pp', 'l3pp', 'fr', 'ff', 'spectrev1', 'spectrev2', 'spectrev3', 'spectrev4', 'bufferoverflow']
benign_list = ['gpg', 'bzip2', 'gcc', 'mcf', 'milc', 'namd', 'gobmk', 'soplex', 'hmmer', 'libquantum', 'h264ref']
# Benign programs for which attack + benign mixtures are evaluated in this cell.
mixed_benign_list = ['gpg', 'gcc', 'libquantum']

# Test cases in report order: benign only, attack only, attack mixed with benign.
# Generated from the lists above instead of being spelled out one by one.
test_cases = (
    ['test_normal']
    + [f'test_normal_with_{benign}' for benign in benign_list]
    + [f'test_abnormal_{attack}' for attack in attack_list]
    + [f'test_abnormal_{attack}_with_{benign}' for benign in mixed_benign_list for attack in attack_list]
)

for bg_program in ['none']:
    for file_name in test_cases:
        kde_result = np.load(f'detector/preprocessed/attack_kde/{model_name}/{bg_program}/{file_name}.npy')
        total = float(len(kde_result))

        # Model the distribution of attack (high probability -> attack)
        pred_normal = np.sum(kde_result <= th) / total
        pred_abnormal = np.sum(kde_result > th) / total

        # One report row per test case: predicted fractions plus score statistics.
        predicts.append([
            bg_program, file_name, pred_normal, pred_abnormal,
            np.mean(kde_result), np.min(kde_result), np.max(kde_result),
            np.percentile(kde_result, 10), np.percentile(kde_result, 90)])

columns = ['Workload', 'Test Case', 'Pred normal', 'Pred abnormal', 'Mean', 'Min', 'Max', '10%', '90%']
print(pd.DataFrame(predicts, columns=columns))

In [None]:
from sklearn.neighbors import KernelDensity
from pathlib import Path
import concurrent

kde_results = collections.defaultdict(collections.defaultdict)

# Traces contributing to the training set of every background workload.
file_names = [
    'train_normal',
    'train_abnormal_l1pp',
    'train_abnormal_l3pp',
    'train_abnormal_fr',
    'train_abnormal_ff',
    'train_abnormal_spectrev1',
    'train_abnormal_spectrev2',
    'train_abnormal_spectrev3',
    'train_abnormal_spectrev4',
    'train_abnormal_bufferoverflow',
]

sampling = True

sampled_parts = []
for bg_program in ('mltrain', 'mysql', 'webserver', 'streamserver', 'mapreduce'):
    for file_name in file_names:
        trace = pred_errors[bg_program][file_name]
        if not sampling:
            sampled_parts.append(trace)
            continue
        # 1000 row indices drawn independently, i.e. with replacement.
        rows = np.random.randint(low=0, high=len(trace), size=1000)
        sampled_parts.append(trace[rows, :])

# Stack the per-trace samples row-wise into a single training matrix.
training_data = np.concatenate(sampled_parts, axis=0)


In [None]:
from sklearn.neighbors import KernelDensity
from pathlib import Path
# A bare `import concurrent` does not load the `concurrent.futures` submodule;
# it only appeared to work when another library had already imported it.
import concurrent.futures


def known_attack_detection(kde, data, bg_program, file_name):
    """Score ``data`` under ``kde`` and save the scores to disk.

    The scores are written to
    ``detector/preprocessed/test_step1_attack_kde/<model_name>/<bg_program>/<file_name>.npy``
    (``model_name`` is read from the notebook namespace).

    Args:
        kde: fitted estimator exposing ``score_samples``.
        data: samples to score; handed unchanged to ``kde.score_samples``.
        bg_program: background-workload name, used as the output sub-directory.
        file_name: output file stem.

    Returns:
        1, as a completion marker.
    """
    kde_result = kde.score_samples(data)

    out_dir = Path(f'detector/preprocessed/test_step1_attack_kde/{model_name}/{bg_program}')
    # exist_ok: several workers may try to create the same directory at once.
    out_dir.mkdir(parents=True, exist_ok=True)
    np.save(out_dir / file_name, kde_result)
    print(bg_program, file_name, kde_result)
    return 1


# Density model fitted on the samples collected in training_data.
kde = KernelDensity(kernel='gaussian', bandwidth=0.1).fit(training_data)

kde_results = collections.defaultdict(collections.defaultdict)
for bg_program in ['mltrain', 'mysql', 'webserver', 'streamserver', 'mapreduce']:
    # The context manager shuts the worker pool down even if a task fails.
    with concurrent.futures.ProcessPoolExecutor(20) as executor:
        futures = [
            executor.submit(known_attack_detection, kde, data, bg_program, file_name)
            for file_name, data in pred_errors[bg_program].items()
        ]
        # result() re-raises worker exceptions; wait() alone would silently
        # drop them and leave score files missing for the cells that load them.
        for future in concurrent.futures.as_completed(futures):
            future.result()

In [None]:
import pandas as pd

from sklearn.neighbors import KernelDensity
from pathlib import Path
import concurrent

pd.set_option('display.max_rows', None)

model_name = 'merged'
# Decision threshold on the saved KDE scores: score > th is counted as attack.
# NOTE(review): hard-coded value - confirm how it was chosen.
th = -14

kde_results = collections.defaultdict(collections.defaultdict)

predicts = []
attack_list = ['l1pp', 'l3pp', 'fr', 'ff', 'spectrev1', 'spectrev2', 'spectrev3', 'spectrev4', 'bufferoverflow']

# Test cases in report order: benign only, then attack only.
test_cases = (
    ['test_normal']
    + [f'test_normal_with_{benign}' for benign in ('gpg', 'gcc', 'mcf', 'libquantum')]
    + [f'test_abnormal_{attack}' for attack in attack_list]
)

for bg_program in ['mltrain', 'mysql', 'webserver', 'streamserver', 'mapreduce']:
    for file_name in test_cases:
        # Read from test_step1_attack_kde: that is where the scoring cell for
        # these background workloads saves its results. The attack_kde tree is
        # only written for the 'none' workload in this notebook.
        kde_result = np.load(f'detector/preprocessed/test_step1_attack_kde/{model_name}/{bg_program}/{file_name}.npy')
        total = float(len(kde_result))

        # Model the distribution of attack (high probability -> attack)
        pred_normal = np.sum(kde_result <= th) / total
        pred_abnormal = np.sum(kde_result > th) / total

        # One report row per test case: predicted fractions plus score statistics.
        predicts.append([
            bg_program, file_name, pred_normal, pred_abnormal,
            np.mean(kde_result), np.min(kde_result), np.max(kde_result),
            np.percentile(kde_result, 10), np.percentile(kde_result, 90)])

columns = ['Workload', 'Test Case', 'Pred normal', 'Pred abnormal', 'Mean', 'Min', 'Max', '10%', '90%']
# Build the table once and reuse it for both the printout and the export.
results = pd.DataFrame(predicts, columns=columns)
print(results)
results.to_excel('test_step1_attack_detection.xlsx')

In [None]:
sampling = True

# Reduced attack set for this variant: fr, ff, spectrev3 and spectrev4 are
# left out of the training data.
attack_train_files = (
    "train_abnormal_l1pp",
    "train_abnormal_l3pp",
    "train_abnormal_spectrev1",
    "train_abnormal_spectrev2",
    "train_abnormal_bufferoverflow",
)

sampled_parts = []
for file_name in attack_train_files:
    # Only the first 5000 rows of each trace are eligible for training.
    head = pred_errors['none'][file_name][:5000]
    if not sampling:
        sampled_parts.append(head)
        continue
    # 1000 row indices drawn independently, i.e. with replacement.
    rows = np.random.randint(low=0, high=len(head), size=1000)
    sampled_parts.append(head[rows, :])

training_data = np.concatenate(sampled_parts, axis=0)


# `concurrent.futures` is a submodule and has to be imported explicitly; a bare
# `import concurrent` only works when another library has already loaded it.
import concurrent.futures


def known_attack_detection(kde, data, bg_program, file_name):
    """Score ``data`` under ``kde`` and save the scores to disk.

    The scores are written to
    ``detector/preprocessed/attack_kde/<model_name>/<bg_program>/<file_name>.npy``
    (``model_name`` is read from the notebook namespace), replacing any file
    already stored under that name.

    Args:
        kde: fitted estimator exposing ``score_samples``.
        data: samples to score; handed unchanged to ``kde.score_samples``.
        bg_program: background-workload name, used as the output sub-directory.
        file_name: output file stem.

    Returns:
        1, as a completion marker.
    """
    kde_result = kde.score_samples(data)

    out_dir = Path(f'detector/preprocessed/attack_kde/{model_name}/{bg_program}')
    # exist_ok: several workers may try to create the same directory at once.
    out_dir.mkdir(parents=True, exist_ok=True)
    np.save(out_dir / file_name, kde_result)
    print(bg_program, file_name, kde_result)
    return 1


# Density model fitted on the reduced attack set collected in training_data.
kde = KernelDensity(kernel='gaussian', bandwidth=0.1).fit(training_data)

kde_results = collections.defaultdict(collections.defaultdict)
for bg_program in ['none']:
    # The context manager shuts the worker pool down even if a task fails.
    with concurrent.futures.ProcessPoolExecutor(20) as executor:
        futures = [
            executor.submit(known_attack_detection, kde, data, bg_program, file_name)
            for file_name, data in pred_errors[bg_program].items()
        ]
        # result() re-raises worker exceptions; wait() alone would silently
        # drop them and leave stale or missing score files behind.
        for future in concurrent.futures.as_completed(futures):
            future.result()