In [None]:
import os
import sys
if '/home/zechengh/Mastik/ad/detector/' not in sys.path:
    sys.path.append('/home/zechengh/Mastik/ad/detector/')
from collections import OrderedDict
    
import numpy as np
import torch
import matplotlib.pyplot as plt
%matplotlib inline

import utils
import ADbenchmark
import LSTMAD

import json
import collections

import torch

from sklearn.manifold import TSNE
from sklearn.decomposition import PCA

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# Seed NumPy's legacy global RNG and torch so the random sampling done later in
# the notebook is repeatable across runs.
np.random.seed(0)
torch.manual_seed(0)

# Do not write .pyc
# NOTE(review): this flag only affects imports performed after this line; the
# project modules (utils, ADbenchmark, LSTMAD) are imported earlier in this cell,
# so it presumably has no effect on their first import - move it above the
# imports if that matters.
sys.dont_write_bytecode = True

# Reload code when code is changed
%load_ext autoreload
%autoreload 2

# Print the id -> feature mapping exposed by the project's utils module.
id_to_feature = utils.id_to_feature
for k, v in id_to_feature.items():
    print(k, v)

In [None]:
# Load every saved prediction-error array into a two-level mapping:
#   pred_errors[bg_program][file stem] -> np.ndarray loaded from the .npy file.
pred_errors = collections.defaultdict(collections.defaultdict)
model_name = 'merged'

for bg_program in ['none', 'mysql', 'webserver', 'streamserver', 'mltrain', 'mapreduce']:
    # The f-string already interpolates both names; the extra `.format(...)` call the
    # original chained on was a no-op that would raise if a path ever contained braces.
    data_dir = f'detector/preprocessed/pred_errors/{model_name}/{bg_program}/'
    #data_dir = f'perf/data/{bg_program}_same_core/10000us/'
    # sorted() makes the insertion order (and therefore the order in which later
    # cells iterate the files) independent of the filesystem's listing order.
    for npy_name in sorted(os.listdir(data_dir)):
        if npy_name.endswith('.npy'):
            # Key on everything before the first dot of the file name.
            file_name = npy_name.split('.')[0]
            pred_errors[bg_program][file_name] = np.load(os.path.join(data_dir, npy_name))

In [None]:
# Assemble the KDE training matrix from the benign ('none' background) training
# traces of a fixed set of benchmarks.
sampling = True
training_benchmarks = ['gpg', 'bzip2', 'gcc', 'mcf', 'milc', 'namd', 'gobmk']
# Full candidate list kept for reference:
# ['perlbench', 'bzip2', 'gcc', 'mcf', 'milc', 'namd',
#  'gobmk', 'soplex', 'povray', 'hmmer', 'sjeng', 'libquantum',
#  'h264ref', 'lbm', 'omnetpp', 'astar']

training_data = []
for spec_benchmark in training_benchmarks:
    # Only the first 5000 rows of each trace are eligible for training.
    benchmark_errors = pred_errors['none'][f'train_normal_with_{spec_benchmark}'][:5000]

    if sampling:
        # 1000 row indices drawn independently, so the same row may be picked twice.
        picked_rows = np.random.randint(low=0, high=len(benchmark_errors), size=1000)
        benchmark_errors = benchmark_errors[picked_rows, :]

    training_data.append(benchmark_errors)

# Stack the per-benchmark pieces row-wise into a single array.
training_data = np.concatenate(training_data, axis=0)

In [None]:
from sklearn.neighbors import KernelDensity
from pathlib import Path
# `import concurrent` alone does not load the `futures` submodule; it only worked
# when some other library happened to import it first. Import it explicitly.
import concurrent.futures

# Gaussian KDE fitted on the sampled benign prediction errors.
kde = KernelDensity(kernel='gaussian', bandwidth=0.1).fit(training_data)

kde_results = collections.defaultdict(collections.defaultdict)


def benign_program_detection(kde, data, bg_program, file_name):
    """Score one trace under the benign KDE and save the per-sample scores.

    Args:
        kde: fitted estimator exposing `score_samples`.
        data: array of samples handed to `kde.score_samples`.
        bg_program: background-program name, used as an output sub-directory.
        file_name: output file stem (np.save appends '.npy').

    Returns:
        1 once the scores have been written.

    Side effects:
        Creates detector/preprocessed/benign_kde/<model_name>/<bg_program>/ if needed,
        writes <file_name>.npy there and prints the scores. Reads the notebook-level
        `model_name`.
    """
    kde_result = kde.score_samples(data)

    Path(f'detector/preprocessed/benign_kde/{model_name}/{bg_program}').mkdir(parents=True, exist_ok=True)
    np.save(f'detector/preprocessed/benign_kde/{model_name}/{bg_program}/{file_name}', kde_result)
    print(bg_program, file_name, kde_result)
    return 1


for bg_program in ['none']:
    # The context manager shuts the worker processes down once scoring is done.
    with concurrent.futures.ProcessPoolExecutor(10) as executor:
        futures = [
            executor.submit(benign_program_detection, kde, pred_errors[bg_program][file_name], bg_program, file_name)
            for file_name in pred_errors[bg_program].keys()
        ]
        concurrent.futures.wait(futures)

    # wait() never raises worker errors; result() re-raises the first failure so a
    # missing output file is reported here instead of as a confusing np.load error
    # in a later cell.
    for future in futures:
        future.result()

In [None]:
from sklearn.metrics import roc_curve, auc
import pandas as pd
pd.set_option('display.max_rows', None)

model_name = 'merged'

kde_results = collections.defaultdict(collections.defaultdict)

predicts = []


def _load_heldout_kde_scores(model_name, bg_programs, file_names, skip=5000):
    """Load saved benign-KDE scores and return them as one flat 1-D array.

    Args:
        model_name: sub-directory of detector/preprocessed/benign_kde/ to read from.
        bg_programs: iterable of background-program directory names.
        file_names: iterable of file stems (without '.npy').
        skip: number of leading samples dropped from every file. The KDE training
            cell draws its samples from the first 5000 rows of the training traces,
            so the default keeps those rows out of threshold selection.

    Returns:
        1-D np.ndarray with the remaining scores of all files, in iteration order.
    """
    chunks = []
    for bg_program in bg_programs:
        for file_name in file_names:
            kde_result = np.load(f'detector/preprocessed/benign_kde/{model_name}/{bg_program}/{file_name}.npy')[skip:]
            chunks.append(np.ravel(kde_result))
    # np.concatenate also works when the files have different lengths, where
    # np.array(list_of_arrays).reshape(-1) would build a ragged array or raise.
    return np.concatenate(chunks)


# Scores of benign training traces (name keeps the original "ked" spelling because
# the rest of this cell refers to it).
benign_ked_result_all = _load_heldout_kde_scores(
    model_name,
    ['none'],
    [
        'train_normal',
        'train_normal_with_gpg',
        'train_normal_with_bzip2',
        'train_normal_with_gcc',
        'train_normal_with_mcf',
        'train_normal_with_milc',
        'train_normal_with_namd',
        'train_normal_with_gobmk',
        'train_normal_with_soplex',
        'train_normal_with_hmmer',
        'train_normal_with_libquantum',
        'train_normal_with_h264ref',
    ],
)

# Scores of attack training traces under the same benign KDE.
attack_ked_result_all = _load_heldout_kde_scores(
    model_name,
    ['none'],
    [
        "train_abnormal_l1pp",
        "train_abnormal_l3pp",
        "train_abnormal_fr",
        "train_abnormal_ff",
        "train_abnormal_spectrev1",
        "train_abnormal_spectrev2",
        "train_abnormal_spectrev3",
        "train_abnormal_spectrev4",
    ],
)

def get_eer(y, y_pred):
    """Return the score threshold at the equal-error-rate point of the ROC curve.

    Args:
        y: true binary labels; label 1 is treated as the positive class.
        y_pred: scores passed to `roc_curve` alongside `y`.

    Returns:
        The `roc_curve` threshold at which |FNR - FPR| is smallest (NaNs ignored).
    """
    false_pos_rate, true_pos_rate, cutoffs = roc_curve(y, y_pred, pos_label=1)
    false_neg_rate = 1 - true_pos_rate

    # The EER point is where the two error rates are closest to each other.
    rate_gap = np.abs(false_neg_rate - false_pos_rate)
    return cutoffs[np.nanargmin(rate_gap)]

# Labels: 0 for every attack score, 1 for every benign score (attack scores first,
# matching the order used to build y_pred).
y = [0] * len(attack_ked_result_all) + [1] * len(benign_ked_result_all)
y_pred = np.concatenate((attack_ked_result_all, benign_ked_result_all))

eer_threshold = get_eer(y, y_pred)
print(eer_threshold)

# Disabled alternative, kept for reference: use the 1st percentile of the benign
# scores as threshold (1% of training normal is incorrectly classified as abnormal).
#ked_result_all.sort()
#print("Threshold ", np.percentile(ked_result_all, 1))

In [None]:
import pandas as pd
pd.set_option('display.max_rows', None)

model_name = 'merged'
# Decision thresholds applied to the benign-KDE and attack-KDE scores below
# (score >= threshold counts as a match for that model).
# NOTE(review): hard-coded; the previous cell prints an EER threshold, so these
# were presumably chosen from it - confirm and consider deriving them in code.
benign_th = -14
attack_th = -14


# Only window size 1 is evaluated; the commented tail lists other sizes that can be enabled.
for window_size in [1]: # 3, 5, 10, 20, 50, 100, 200, 500]:
    # Fresh accumulators for every window size.
    kde_results = collections.defaultdict(collections.defaultdict)
    # final_decisions[bg_program][file_name] -> per-sample case array (values 1..4).
    final_decisions = collections.defaultdict(collections.defaultdict)
    # One summary row per evaluated file.
    predicts = []
    # Per-file attack-prediction rates on benign files / non-attack rates on attack files.
    false_positives = []
    false_negatives = []

    # Only traces recorded without a background program are evaluated.
    for bg_program in ['none']:
        # File names containing 'abnormal' are treated as attack traces by the
        # false-negative / false-positive bookkeeping at the end of this loop body.
        for file_name in [
            # Benign traces (alone and together with a benchmark).
            'test_normal',
            'test_normal_with_gpg',
            'test_normal_with_bzip2',
            'test_normal_with_gcc',
            'test_normal_with_mcf',
            'test_normal_with_milc',
            'test_normal_with_namd',
            'test_normal_with_gobmk',
            'test_normal_with_soplex',
            'test_normal_with_hmmer',
            'test_normal_with_libquantum',
            'test_normal_with_h264ref',
    
        ] + [
            # Attack traces recorded on their own.
            'test_abnormal_l1pp',
            'test_abnormal_l3pp',
            'test_abnormal_fr',
            'test_abnormal_ff',
            'test_abnormal_spectrev1',
            'test_abnormal_spectrev2',
            'test_abnormal_spectrev3',
            'test_abnormal_spectrev4',
            'test_abnormal_bufferoverflow',
        ] + [
            # The same attacks recorded together with gpg.
            'test_abnormal_l1pp_with_gpg',
            'test_abnormal_l3pp_with_gpg',
            'test_abnormal_fr_with_gpg',
            'test_abnormal_ff_with_gpg',
            'test_abnormal_spectrev1_with_gpg',
            'test_abnormal_spectrev2_with_gpg',
            'test_abnormal_spectrev3_with_gpg',
            'test_abnormal_spectrev4_with_gpg',
            'test_abnormal_bufferoverflow_with_gpg',       
            
            # ... together with gcc.
            'test_abnormal_l1pp_with_gcc',
            'test_abnormal_l3pp_with_gcc',
            'test_abnormal_fr_with_gcc',
            'test_abnormal_ff_with_gcc',
            'test_abnormal_spectrev1_with_gcc',
            'test_abnormal_spectrev2_with_gcc',
            'test_abnormal_spectrev3_with_gcc',
            'test_abnormal_spectrev4_with_gcc',
            'test_abnormal_bufferoverflow_with_gcc', 
        
            # ... together with libquantum.
            'test_abnormal_l1pp_with_libquantum',
            'test_abnormal_l3pp_with_libquantum',
            'test_abnormal_fr_with_libquantum',
            'test_abnormal_ff_with_libquantum',
            'test_abnormal_spectrev1_with_libquantum',
            'test_abnormal_spectrev2_with_libquantum',
            'test_abnormal_spectrev3_with_libquantum',
            'test_abnormal_spectrev4_with_libquantum',
            'test_abnormal_bufferoverflow_with_libquantum',
        ]:
        
            def consecutive_decision(decisions, consecutive_window_size, mode='all'):
                """Reduce every sliding window of binary decisions to a single 0/1 value.

                Args:
                    decisions: 1-D sequence of per-sample decisions; entries equal to 1
                        are the "positive" ones.
                    consecutive_window_size: number of consecutive samples per window.
                    mode: 'all' -> a window yields 1 only if every entry is 1;
                          'any' -> a window yields 1 if at least one entry is 1.
                          Any other value yields 0 for every window.

                Returns:
                    np.ndarray with one entry per complete window
                    (len(decisions) - consecutive_window_size + 1 entries; empty when the
                    input is shorter than the window).
                """
                # Accept plain lists too: `window == 1` must compare element-wise.
                decisions = np.asarray(decisions)
                res = []
                # `+ 1` so the last window, which ends on the final sample, is included
                # (the previous bound silently dropped it).
                for end in range(consecutive_window_size, len(decisions) + 1):
                    window_data = decisions[end - consecutive_window_size: end]
                    hit_all = mode == 'all' and np.all(window_data == 1)
                    hit_any = mode == 'any' and np.any(window_data == 1)
                    res.append(1 if (hit_all or hit_any) else 0)
                return np.array(res)
        
            # Per-sample scores of this trace under the attack KDE and the benign KDE.
            # NOTE(review): no cell visible in this notebook writes the attack_kde
            # files; presumably they are produced elsewhere - verify before running.
            attack_kde_path = f'detector/preprocessed/attack_kde/{model_name}/{bg_program}/{file_name}.npy'
            benign_kde_path = f'detector/preprocessed/benign_kde/{model_name}/{bg_program}/{file_name}.npy'

            attack_kde_result = np.load(attack_kde_path)
            benign_kde_result = np.load(benign_kde_path)

            # True where the score reaches the corresponding threshold.
            attack_kde_result_binary = np.greater_equal(attack_kde_result, attack_th)
            benign_kde_result_binary = np.greater_equal(benign_kde_result, benign_th)
        
            def combined_pred(attack_kde_result_binary, benign_kde_result_binary):
                """Combine the two per-sample boolean verdicts into one of four cases.

                Args:
                    attack_kde_result_binary: per-sample booleans, True where the
                        attack-KDE score passed its threshold.
                    benign_kde_result_binary: per-sample booleans, True where the
                        benign-KDE score passed its threshold. Must have the same length.

                Returns:
                    np.ndarray of ints, one per sample:
                        1 = attack True,  benign True
                        2 = attack True,  benign False
                        3 = attack False, benign True
                        4 = attack False, benign False

                Raises:
                    AssertionError: if the two inputs differ in length.
                """
                assert len(benign_kde_result_binary) == len(attack_kde_result_binary)

                attack_hit = np.asarray(attack_kde_result_binary, dtype=bool)
                benign_hit = np.asarray(benign_kde_result_binary, dtype=bool)

                # Vectorized replacement for the per-element Python loop.
                return np.where(
                    attack_hit,
                    np.where(benign_hit, 1, 2),
                    np.where(benign_hit, 3, 4),
                )

            def majority_vote_decision(decisions, consecutive_window_size):
                """
                    Args:
                        decisions: an array of four cases
                        consecutive_window_size: window size
                    Returns:
                        an array of four cases, one entry per complete window
                        (len(decisions) - consecutive_window_size + 1 entries)
                    Majority vote within a consecutive_window_size. On a tie the case
                    that appears first in the window wins (Counter.most_common order).
                """
                decisions = np.asarray(decisions)
                res = []
                # `+ 1` so the window ending on the last sample is voted on as well
                # (the previous bound dropped it).
                for end in range(consecutive_window_size, len(decisions) + 1):
                    window_data = decisions[end - consecutive_window_size: end]
                    winner, _count = collections.Counter(window_data).most_common(1)[0]
                    res.append(winner)
                return np.array(res)

            def consecutive_attack_decision(decisions, consecutive_window_size):
                """
                    Args:
                        decisions: an array of four cases
                        consecutive_window_size: window size
                    Returns:
                        an array of **binary (attack or none attack)**, one entry per
                        complete window (len(decisions) - consecutive_window_size + 1 entries)
                    Consecutive attack measurements (case 1,2,4) are recognized as an attack. Otherwise none-attack.
                    In other words a window is 1 only if it contains no case 3.
                """
                # Accept plain lists too: `window != 3` must compare element-wise.
                decisions = np.asarray(decisions)
                res = []
                # `+ 1` so the window ending on the last sample is evaluated; with the
                # previous bound the final measurement never influenced the result.
                for end in range(consecutive_window_size, len(decisions) + 1):
                    window_data = decisions[end - consecutive_window_size: end]
                    res.append(1 if np.all(window_data != 3) else 0)
                return np.array(res)
            
            # `total` is kept because the follow-up cell of this notebook still reads it.
            total = float(len(benign_kde_result))
            final_decision = combined_pred(attack_kde_result_binary, benign_kde_result_binary)
            #final_decision = majority_vote_decision(final_decision, window_size)
            final_decisions[bg_program][file_name] = final_decision

            # Share of samples falling into each of the four cases. np.mean over the
            # boolean mask replaces the element-by-element built-in sum() and always
            # normalizes by the length of the decision array itself, so the shares
            # still add up to 1 if the majority-vote line above is enabled.
            case_YY = np.mean(final_decision == 1)
            case_YN = np.mean(final_decision == 2)
            case_NY = np.mean(final_decision == 3)
            case_NN = np.mean(final_decision == 4)

            # Windowed binary verdict: 1 = attack, anything else = no attack.
            final_decision_binary = consecutive_attack_decision(final_decision, window_size)
            pred_none_attack = np.mean(final_decision_binary != 1)
            pred_attack = 1 - pred_none_attack

            # Attack traces: the share predicted as "no attack" is the miss rate.
            # Benign traces: the share predicted as "attack" is the false-alarm rate.
            if 'abnormal' in file_name:
                false_negatives.append(pred_none_attack)
            else:
                false_positives.append(pred_attack)

            # Row layout: name, 'Pred benign' (= case 3), 'Pred attack', then cases 1-4.
            predicts.append([file_name, case_NY, 1-case_NY, case_YY, case_YN, case_NY, case_NN])

    # Summary for this window size: mean per-file rates, then the per-file table.
    print('window_size', window_size)
    print('fpr:', np.mean(false_positives))
    print('fnr:', np.mean(false_negatives))

    columns = ['Test Case', 'Pred benign', 'Pred attack', 'case 1', 'case 2', 'case 3', 'case 4']
    # Build the table once and reuse it for both the printout and the Excel export.
    results_table = pd.DataFrame(predicts, columns=columns)
    print(results_table)
    results_table.to_excel(f'window{window_size}.xlsx')

In [None]:
import collections

predicts = []

false_positives = []
false_negatives = []


def _case_summary_row(file_name, final_decision):
    """Build one summary row from a per-sample case array (values 1..4).

    Each share is normalized by the length of `final_decision` itself. The previous
    version divided by `total`, a variable left over from the preceding cell that
    holds the length of the last file processed there, which is only correct when
    every file has that same length.

    Returns:
        [file_name, case 3 share, 1 - case 3 share, case 1, case 2, case 3, case 4 shares]
    """
    final_decision = np.asarray(final_decision)
    case_YY = np.mean(final_decision == 1)
    case_YN = np.mean(final_decision == 2)
    case_NY = np.mean(final_decision == 3)
    case_NN = np.mean(final_decision == 4)
    return [file_name, case_NY, 1-case_NY, case_YY, case_YN, case_NY, case_NN]


normal_test_files = [
    'test_normal',
    'test_normal_with_gpg',
    'test_normal_with_bzip2',
    'test_normal_with_gcc',
    'test_normal_with_mcf',
    'test_normal_with_milc',
    'test_normal_with_namd',
    'test_normal_with_gobmk',
    'test_normal_with_soplex',
    'test_normal_with_hmmer',
    'test_normal_with_libquantum',
    'test_normal_with_h264ref',
]

attack_test_files = [
    'test_abnormal_l1pp',
    'test_abnormal_l3pp',
    'test_abnormal_fr',
    'test_abnormal_ff',
    'test_abnormal_spectrev1',
    'test_abnormal_spectrev2',
    'test_abnormal_spectrev3',
    'test_abnormal_spectrev4',
    'test_abnormal_bufferoverflow',
] + [
    'test_abnormal_l1pp_with_gpg',
    'test_abnormal_l3pp_with_gpg',
    'test_abnormal_fr_with_gpg',
    'test_abnormal_ff_with_gpg',
    'test_abnormal_spectrev1_with_gpg',
    'test_abnormal_spectrev2_with_gpg',
    'test_abnormal_spectrev3_with_gpg',
    'test_abnormal_spectrev4_with_gpg',
    'test_abnormal_bufferoverflow_with_gpg',

    'test_abnormal_l1pp_with_gcc',
    'test_abnormal_l3pp_with_gcc',
    'test_abnormal_fr_with_gcc',
    'test_abnormal_ff_with_gcc',
    'test_abnormal_spectrev1_with_gcc',
    'test_abnormal_spectrev2_with_gcc',
    'test_abnormal_spectrev3_with_gcc',
    'test_abnormal_spectrev4_with_gcc',
    'test_abnormal_bufferoverflow_with_gcc',

    'test_abnormal_l1pp_with_libquantum',
    'test_abnormal_l3pp_with_libquantum',
    'test_abnormal_fr_with_libquantum',
    'test_abnormal_ff_with_libquantum',
    'test_abnormal_spectrev1_with_libquantum',
    'test_abnormal_spectrev2_with_libquantum',
    'test_abnormal_spectrev3_with_libquantum',
    'test_abnormal_spectrev4_with_libquantum',
    'test_abnormal_bufferoverflow_with_libquantum',
]

# Benign traces: every sample that is not case 3 counts as a false alarm.
for file_name in normal_test_files:
    final_decision = final_decisions['none'][file_name]
    fpr = np.mean(final_decision != 3)
    false_positives.append(fpr)
    predicts.append(_case_summary_row(file_name, final_decision))

# Attack traces: every sample that is case 3 counts as a miss.
for file_name in attack_test_files:
    final_decision = final_decisions['none'][file_name]
    fnr = np.mean(final_decision == 3)
    false_negatives.append(fnr)
    predicts.append(_case_summary_row(file_name, final_decision))

print('fpr:', np.mean(false_positives))
print('fnr:', np.mean(false_negatives))