In [1]:
import pandas as pd, numpy as np, json, time
from sklearn.neighbors import KDTree

import sys
sys.path.insert(0,'../../../Architecture/')

import rrcf

In [2]:
%%time
df = pd.read_csv('extended_bgp_testbed_5.csv.gz')
df = df.drop(['Unnamed: 0'], axis=1)
df = df.dropna()
df = df.astype('float64')
df.shape

CPU times: user 5.51 s, sys: 254 ms, total: 5.77 s
Wall time: 5.79 s


(185984, 82)

In [3]:
# Sample sizes to benchmark: 1,000..9,000 in steps of 1,000,
# followed by 10,000..100,000 in steps of 10,000.
samples_to_use_list = [
    *range(1000, 10000, 1000),
    *range(10000, 110000, 10000),
]

In [None]:
# Benchmark: for each sample size, stream the first `sample_to_use` rows of `df`
# (minus the first `sampleSkip` rows) through a fresh RRCF forest, `num_runs`
# times, and record the elapsed seconds of each run.
results = {}
results['algorithm'] = 'RRCF'
results['times'] = []  # one list of `num_runs` elapsed-seconds values per sample size

# Benchmark parameters. They do not change between iterations, so they are
# defined once here instead of being re-assigned inside the loops.
num_runs = 3      # timed repetitions per sample size
sampleSkip = 30   # leading rows excluded from the timed stream
tree_size = 95    # maximum number of points kept in each tree
num_trees = 100   # number of trees in the forest

for sample_to_use in samples_to_use_list:

    print('RRCF: {}'.format(sample_to_use))
    results_samples = []
    dfNormalized = df[:sample_to_use]

    # The input slices are identical for every repetition, so build them once
    # per sample size rather than once per measurement.
    bufferDF = dfNormalized[0:sampleSkip]  # skipped rows; not used further in this cell
    testDF = dfNormalized[sampleSkip:]
    points = testDF.values

    for measurement in range(num_runs):
        # Create a forest of empty trees (fresh for every repetition)
        forest = [rrcf.RCTree() for _ in range(num_trees)]
        # Create a dict to store anomaly score of each point
        avg_codisp = {}

        # perf_counter() is monotonic and high-resolution; time.time() can jump
        # if the system clock is adjusted during a multi-minute run.
        start = time.perf_counter()

        for index, point in enumerate(points):
            avg_codisp[index] = 0
            # For each tree in the forest...
            for tree in forest:
                # If the tree is full, drop the oldest point (FIFO).
                # `>=` is required here: with `>` the first eviction happens at
                # index tree_size + 1 and removes index 1, so index 0 is never
                # forgotten and the tree holds tree_size + 1 points. With `>=`
                # the first eviction happens at index tree_size and removes
                # index 0, keeping exactly the last `tree_size` points.
                if len(tree.leaves) >= tree_size:
                    tree.forget_point(index - tree_size)
                # Insert the new point into the tree
                tree.insert_point(point, index=index)
                # Compute codisp on the new point and take the average among all trees
                avg_codisp[index] += tree.codisp(index) / num_trees

        end = time.perf_counter()
        time_interval = end - start
        results_samples.append(time_interval)
        print('{} - {}'.format(sample_to_use, time_interval))

    results['times'].append(results_samples)
    # Rewrite the results file after every sample size so that partial results
    # survive an interrupted run.
    with open('Results/RRCF_extended.json', 'w') as f:
        json.dump(results, f, indent=2)

RRCF: 1000
1000 - 96.73315930366516
1000 - 96.87112259864807
1000 - 94.49154949188232
RRCF: 2000
2000 - 197.90905952453613
2000 - 200.56196928024292
2000 - 201.2188844680786
RRCF: 3000
3000 - 303.39291548728943
3000 - 301.8438537120819
4000 - 409.4901831150055
4000 - 410.06203866004944
4000 - 405.4571182727814
RRCF: 5000
5000 - 509.37081718444824
5000 - 509.9335708618164
5000 - 515.2171144485474
RRCF: 6000
6000 - 616.9344561100006
6000 - 616.5821328163147
6000 - 598.1705801486969
RRCF: 7000
7000 - 697.4214684963226
7000 - 703.1724119186401
7000 - 700.5761675834656
RRCF: 8000
8000 - 801.346185207367
8000 - 805.9196536540985
