In [1]:
import logging
import numpy as np
import os
import numpy.random as rnd
from sklearn.metrics import f1_score

from ad_examples.common.utils import read_csv, dataframe_to_matrix
from ad_examples.common.gen_samples import get_synthetic_samples
from ad_examples.aad.aad_support import AadOpts, get_aad_command_args, configure_logger
from ad_examples.aad.forest_description import CompactDescriber, MinimumVolumeCoverDescriber, BayesianRulesetsDescriber, get_region_memberships
from ad_examples.aad.demo_aad import get_debug_args, detect_anomalies_and_describe

from ad_examples.loda.loda import Loda

logger = logging.getLogger(__name__)



In [2]:
def convert_scores_to_classes(scores, anomaly_ratio):
    """
    Converts list of scores to flags (0/1) - top anomalies are marked as 1.
    """
    anomaly_cnt = int(len(scores) * anomaly_ratio)
    anomaly_indices = np.array(scores).argsort()[-anomaly_cnt:][::-1]
    y_pred = np.zeros(len(scores))
    np.put(y_pred, anomaly_indices, 1)
    return y_pred


def load_data():
    print("loading csv...")
    data_df = read_csv("../notebooks/data/simple.type123.csv", header=True)

    print("transforming data...")
    x, y = dataframe_to_matrix(data_df)
    return (x, y)


def slice_data(x, y, idx_from, idx_to):
    n = x.shape[0]
    return (x[idx_from:idx_to, :], y[idx_from:idx_to])


In [3]:
def run_loda(x_old, scores_old, x, outliers_fraction):
    # ad_type="loda"
    # data_type = "type123"
    # data_size="complex"

    rnd.seed(42)

    n = x.shape[0]
    # outliers_fraction = np.sum(y) / len(y)
    # xx = yy = x_grid = Z = scores = None

    print("running LODA...")
    ad = Loda(mink=100, maxk=200)
    ad.fit(x_old)
    if scores_old == None:
        print("Calculating inital scores")
        scores_old = -ad.decision_function(x_old)

    print("Evaluating...")
    scores = -ad.decision_function(x)

    print("Combining with historic scores and converting to classes...")
    print(scores_old)
    print(scores)
    scores_combined = np.concatenate(scores_old, scores)
    y_pred_combined = convert_scores_to_classes(scores_combined, outliers_fraction)
    y_pred = y_pred_combined[len(scores_old):]

    # print("Calculating F1 scores...")
    # f1 = f1_score(y, y_pred, average=None) # average='weighted')
    # print(f1)

    return (scores_combined, y_pred)


In [4]:
(gt_x, gt_y) = load_data()
# print(gt_x.shape)

day_rec_cnt = 24 * 12
block_size = 70 * day_rec_cnt
idx_start = 160 * day_rec_cnt
idx_curr_time = idx_start
n = gt_y.shape[0]
scores_all = None
y_pred = np.zeros(0)

while idx_curr_time < n :
    print(n, idx_curr_time, block_size)
    (x1, y1) = slice_data(gt_x, gt_y, 0, idx_curr_time)
    (x2, y2) = slice_data(gt_x, gt_y, idx_curr_time, block_size)
    (scores_all, y_pred_new) = run_loda(x1, scores_all, x2, 0.01)
    y_pred = np.concatenate(y_pred, y_pred_new)
    idx_curr_time = idx_curr_time + block_size

print("finished with training, analyzing combined output")
y = gt_y[idx_start:]

print("Calculating F1 scores...")
f1 = f1_score(y, y_pred, average=None) # average='weighted')

print(f1)


loading csv...
transforming data...
104880 46080 20160
running LODA...
Calculating inital scores
Evaluating...
Combining with historic scores and converting to classes...
[399.69744409 403.78871826 446.81361814 ... 403.55507375 455.44715337
 385.80685385]
[]


TypeError: only integer scalar arrays can be converted to a scalar index

In [5]:
(gt_x, gt_y) = load_data()
# print(gt_x.shape)

day_rec_cnt = 24 * 12
block_size = 70 * day_rec_cnt
idx_start = 160 * day_rec_cnt
idx_curr_time = idx_start
n = gt_y.shape[0]
scores_all = None
y_pred = np.zeros(0)

loading csv...
transforming data...


In [17]:
print(n, idx_curr_time, block_size)
(x1, y1) = slice_data(gt_x, gt_y, 0, idx_curr_time)
(x2, y2) = slice_data(gt_x, gt_y, idx_curr_time, idx_curr_time + block_size)
print(x1.shape)
print(y1.shape)
print(x2.shape)
print(y2.shape)

46080 46080 20160
(46080, 12)
(46080,)
(20160, 12)
(20160,)


In [18]:
rnd.seed(42)

n = x1.shape[0]

print("running LODA...")
ad = Loda(mink=100, maxk=200)

running LODA...


In [19]:
ad.fit(x1)
print("Calculating inital scores")
scores_old = -ad.decision_function(x1)
print(scores_old)

Calculating inital scores
[399.69744409 403.78871826 446.81361814 ... 403.55507375 455.44715337
 385.80685385]


In [20]:
print("Evaluating...")
scores = -ad.decision_function(x2)

Evaluating...


In [24]:
print("Combining with historic scores and converting to classes...")
print(scores_old.shape)
print(scores.shape)

Combining with historic scores and converting to classes...
(46080,)
(20160,)


In [28]:
np.concatenate((np.array(scores_old), np.array(scores)), 0)
outliers_fraction = 0.01

In [29]:
scores_combined = np.concatenate((np.array(scores_old), np.array(scores)), 0)
y_pred_combined = convert_scores_to_classes(scores_combined, outliers_fraction)
y_pred = y_pred_combined[len(scores_old):]

In [31]:
np.sum(y_pred)

637.0

In [None]:

return (scores_combined, y_pred)

In [None]:
(scores_all, y_pred_new) = run_loda(x1, scores_all, x2, 0.01)
y_pred = np.concatenate(y_pred, y_pred_new)
idx_curr_time = idx_curr_time + block_size
