In [1]:
"""
Imports
"""
import json, time
import numpy as np
import pandas as pd

import sys
rootPath = '../../'
sys.path.insert(0, rootPath+'Architecture/')
from groundTruth import GroundTruth
from statistics import Statistics
from dataManagement import Scaler

from ods import Sample, ODS
from multiprocessing import Process, Queue
np.seterr(invalid='ignore')

import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
import copy
from sklearn.metrics import average_precision_score
import multiprocessing, pickle

In [2]:
def get_features_node(node, rootPath):
    """Return the list of columns to use for ``node``.

    Reads ``features_node.json`` found under ``rootPath`` and concatenates the
    node's 'DataPlane' and 'ControlPlane' feature lists, followed by the
    'time' column.

    Args:
        node: Key of the node inside ``features_node.json``.
        rootPath: Path prefix of the JSON file. It is joined by plain string
            concatenation, so it must already end with a path separator.

    Returns:
        list: DataPlane features, then ControlPlane features, then 'time'.

    Raises:
        KeyError: If ``node`` or one of the two plane keys is missing.
        OSError: If the JSON file cannot be opened.
    """
    # Context manager so the file handle is closed deterministically.
    with open(rootPath + 'features_node.json') as features_file:
        features_by_node = json.load(features_file)

    node_features = features_by_node[node]
    return node_features['DataPlane'] + node_features['ControlPlane'] + ['time']

def get_interval_scores(dataset, times, scores_np, output_series):
    """Collapse every ground-truth event interval into a single scored sample.

    The ground-truth events are obtained from ``Statistics(dataset, rootPath)``
    after calling ``st.gt.loadGroundTruthBGP_testbed()``; each record of
    ``st.gt.df`` is expected to expose 'Start' and 'End'. For every event, all
    samples whose timestamp lies strictly between 'Start' and 'End' are
    replaced by one sample located at the first sample of the interval:
    its label is 1 and its score is the maximum of ``scores_np`` over the
    interval. Samples outside any event keep label 0 and their value from
    ``output_series``.

    Args:
        dataset: Dataset name. 'BGP_testbed_3' gets a special merge of some
            consecutive ground-truth records (see below).
        times: Timestamps of the samples (pandas Series or array-like),
            aligned *by position* with ``scores_np``.
        scores_np: 1-D numpy array with one score per sample.
        output_series: pandas Series with one score per sample; it is read but
            no longer modified in place.

    Returns:
        tuple: ``(label, output_series)``, two pandas Series with a fresh
        0..n-1 index and the same length.
    """
    st = Statistics(dataset, rootPath)
    st.gt.loadGroundTruthBGP_testbed()
    eventsList = st.gt.df.to_dict('records')

    if dataset == 'BGP_testbed_3':
        # Merge the two records at positions 3 and 4 into one event that
        # spans from the first 'Start' to the second 'End'.
        start = eventsList.pop(3)
        end = eventsList.pop(3)
        start['End'] = end['End']
        eventsList.append(start)

        # Merge the next three records the same way; the middle one is
        # discarded because the merged span already covers it.
        start = eventsList.pop(3)
        eventsList.pop(3)
        end = eventsList.pop(3)
        start['End'] = end['End']
        eventsList.append(start)

    n_samples = scores_np.shape[0]
    label_values = np.zeros(n_samples)
    # Private float copy: the caller's series is left untouched.
    score_values = np.array(output_series, dtype='float64')
    keep = np.ones(n_samples, dtype=bool)

    for event in eventsList:
        in_event = (times > event['Start']) & (times < event['End'])
        # Work with positions, not index labels: ``times`` usually comes from
        # a frame filtered with dropna(), so its labels may have gaps and
        # would not line up with the positional ``scores_np`` array.
        positions = np.flatnonzero(np.asarray(in_event))
        if positions.size == 0:
            # No sample falls inside this event: nothing to collapse.
            continue

        first_position = positions[0]
        keep[positions] = False
        keep[first_position] = True
        label_values[first_position] = 1
        score_values[first_position] = scores_np[positions].max()

    label = pd.Series(label_values[keep])
    output_series = pd.Series(score_values[keep])

    return label, output_series

In [3]:
# ODS hyper-parameters, forwarded unchanged to the ODS constructor below.
lamb = 0.125
beta = 0.4
radius_factor_type = 'dynamic'

# Number of leading samples handed to ODS as its starting buffer; those
# samples receive a score of 0.
sampleSkip = 30

# total_scores_ods[dataset][node] -> {'label': Series, 'scores': Series}
total_scores_ods = {}

# Flat list of per-(dataset, node) records, later turned into a DataFrame.
scores_df_list = []

# The configuration does not change between datasets: read it once and close
# the file handle deterministically.
with open(rootPath+'configuration.json') as config_file:
    datasets_config = json.load(config_file)['datasets']

for dataset in ['BGP_testbed_2', 'BGP_testbed_3', 'BGP_testbed_5', 'BGP_testbed_9', 'BGP_testbed_10']:
    total_scores_ods[dataset] = {}
    config = datasets_config[dataset]
    for node in config['nodes']:

        total_scores_ods[dataset][node] = {}
        features_node = get_features_node(node, rootPath)

        df = pd.read_csv(rootPath + config['directory']+node+config['filename'],
                         low_memory=False, dtype='float64')\
                .dropna()
        df = df[features_node]

        # 'time' is divided by 1e9 and truncated to an integer
        # (presumably nanoseconds -> seconds; confirm against the dataset).
        times = (df['time']//1e9).astype('int')
        # Non in-place drop: avoids mutating a column selection of the frame.
        df = df.drop(['time'], axis=1)

        scaler = Scaler()
        dfNormalized = scaler.normalize(df)

        bufferDF = dfNormalized[0:sampleSkip]
        testDF = dfNormalized[sampleSkip:]

        ods = ODS(lamb = lamb,\
                    epsilon = radius_factor_type,\
                    beta = beta,\
                    mu = 'auto',\
                    startingBuffer = bufferDF.values,
                    tp = 'auto')
        ods.runInitialization()

        outputCurrentNode = []
        output_score = []
        for sampleNumber in range(len(testDF)):
            sample = testDF.iloc[sampleNumber]
            # testDF starts at row ``sampleSkip`` of the normalized frame, so
            # the matching timestamp sits ``sampleSkip`` positions further in
            # ``times`` (it used to be read from position ``sampleNumber``).
            timestamp = times.iloc[sampleSkip + sampleNumber]
            sampleLabel, score = ods.runOnNewSample(Sample(sample.values, timestamp))
            outputCurrentNode.append(sampleLabel)
            output_score.append(score)
        output_score = np.array(output_score)

        # Replace infinite scores by the largest finite score of this node.
        infinite_scores = np.isinf(output_score)
        if infinite_scores.any():
            finite_scores = output_score[~infinite_scores]
            # NOTE(review): falling back to 0 when no score is finite is an
            # arbitrary choice made to avoid max() on an empty array.
            score_cap = finite_scores.max() if finite_scores.size else 0.0
            output_score = np.where(infinite_scores, score_cap, output_score)

        # Buffer samples are never scored by ODS: pad them with zeros so the
        # score array lines up with ``times``.
        scores_ods = np.append(np.zeros(sampleSkip), output_score)
        output_ods = pd.Series(copy.deepcopy(scores_ods))

        label, interval_scores = get_interval_scores(dataset, times, scores_ods, output_ods)

        total_scores_ods[dataset][node]['label'] = label
        total_scores_ods[dataset][node]['scores'] = interval_scores

        scores_df_list.append({
            'dataset': dataset,
            'node': node,
            'label': label,
            'scores': interval_scores
        })

In [4]:
"""
Persist the per-(dataset, node) ODS records as a pickled DataFrame
"""
ods_scores_df = pd.DataFrame(scores_df_list)
ods_scores_df.to_pickle('ods_scores.pkl')

In [5]:
"""
RRCF in multiprocessing
"""

def get_rrcf_tasks(datasets=('BGP_testbed_5', 'BGP_testbed_9', 'BGP_testbed_10'),
                   n_repeats=10):
    """Build the list of RRCF runs to execute.

    One task is created per (dataset, node, repetition). The nodes of each
    dataset are read from ``configuration.json`` under ``rootPath``.

    Args:
        datasets: Dataset names to include; defaults to the three datasets
            that were previously hard-coded here.
        n_repeats: Number of repetitions per (dataset, node); each gets a
            distinct 'test_id' in ``range(n_repeats)``.

    Returns:
        list of dict: Each dict has the keys 'dataset', 'node' and 'test_id'.

    Raises:
        KeyError: If a dataset is missing from the configuration.
    """
    # Read the configuration once (it was re-read for every dataset) and
    # close the file handle deterministically.
    with open(rootPath+'configuration.json') as config_file:
        datasets_config = json.load(config_file)['datasets']

    return [
        {'dataset': dataset, 'node': node, 'test_id': test_id}
        for dataset in datasets
        for node in datasets_config[dataset]['nodes']
        for test_id in range(n_repeats)
    ]

def run_rrcf(task):
    """Run one streaming RRCF evaluation and attach its results to ``task``.

    Loads the node's data, normalizes it with ``Scaler``, streams the points
    through a forest of ``rrcf.RCTree`` and averages the collusive
    displacement (codisp) of every point over all trees. The scores are then
    collapsed per ground-truth event with ``get_interval_scores``.

    Args:
        task: dict with the keys 'dataset' and 'node', and optionally
            'test_id' (used only to derive the random seed).

    Returns:
        dict: The same ``task`` object, with 'label' and 'scores' added as
        numpy arrays.
    """
    # Imported locally so the function is self-contained when it runs inside
    # a worker process.
    import zlib
    import rrcf

    dataset = task['dataset']
    node = task['node']

    # Give every (dataset, node, test_id) its own reproducible random stream.
    # Forked pool workers start from a copy of the parent's NumPy global RNG
    # state, so without re-seeding the repeated runs are not guaranteed to be
    # independent.
    # NOTE(review): assumes rrcf draws from NumPy's global RNG when no
    # random_state is given -- confirm against the installed rrcf version.
    seed_key = '{}|{}|{}'.format(dataset, node, task.get('test_id', 0))
    np.random.seed(zlib.crc32(seed_key.encode('utf-8')) & 0xffffffff)

    # Leading samples whose score is forced to 0. Reuse the notebook-level
    # ``sampleSkip`` when it exists; fall back to 30 so this function also
    # works when the ODS cell has not been executed in this kernel.
    sample_skip = globals().get('sampleSkip', 30)

    features_node = get_features_node(node, rootPath)
    with open(rootPath+'configuration.json') as config_file:
        config = json.load(config_file)['datasets'][dataset]
    df = pd.read_csv(rootPath + config['directory']+node+config['filename'],
                     low_memory=False, dtype='float64')\
            .dropna()
    df = df[features_node]

    # 'time' is divided by 1e9 and truncated to an integer
    # (presumably nanoseconds -> seconds; confirm against the dataset).
    times = (df['time']//1e9).astype('int')
    # Non in-place drop: avoids mutating a column selection of the frame.
    df = df.drop(['time'], axis=1)

    scaler = Scaler()
    points = scaler.normalize(df).values

    tree_size = 95
    num_trees = 100

    # Forest of initially empty trees.
    forest = [rrcf.RCTree() for _ in range(num_trees)]
    # Average codisp of each point, accumulated tree by tree.
    avg_codisp = np.zeros(len(points))

    for index, point in enumerate(points):
        for tree in forest:
            # Once the tree exceeds the permitted size, forget the point
            # inserted ``tree_size`` steps ago.
            # NOTE(review): with '>' the first eviction happens at
            # index == tree_size + 1 and removes index 1, so the point with
            # index 0 is never forgotten. Left unchanged to keep the scores
            # comparable with earlier runs -- confirm whether this is intended.
            if len(tree.leaves) > tree_size:
                tree.forget_point(index - tree_size)
            tree.insert_point(point, index=index)
            avg_codisp[index] += tree.codisp(index) / num_trees

    scores = avg_codisp
    scores[:sample_skip] = 0

    scores_rrcf = scores
    output_rrcf = pd.Series(copy.deepcopy(scores_rrcf))

    label, interval_scores = get_interval_scores(dataset, times, scores_rrcf, output_rrcf)

    task['label'] = label.values
    task['scores'] = interval_scores.values

    return task

In [6]:
# Upper bound on the number of worker processes.
MAX_PROCESSES = 30

# Build the task list before creating the pool, so a failure here cannot
# leave idle worker processes behind.
tasks = get_rrcf_tasks()
print(len(tasks))

# Never start more workers than there are CPUs or tasks, and always at least
# one (Pool rejects a process count of 0).
n_workers = max(1, min(MAX_PROCESSES, multiprocessing.cpu_count(), len(tasks)))

# The context manager tears the pool down even if a task raises.
with multiprocessing.Pool(processes=n_workers) as p:
    results = p.map_async(run_rrcf, tasks)
    # get() blocks until every task has finished and re-raises the first
    # worker exception, if any.
    totalRes = list(results.get())

rrcf_res = pd.DataFrame(totalRes)

330


In [7]:
"""
Persist the RRCF result table as a pickled DataFrame
"""
rrcf_res.to_pickle(path='rrcf_scores.pkl')