In [17]:
import logging
import numpy as np
import os
import numpy.random as rnd
from sklearn.metrics import f1_score

In [2]:
from ad_examples.common.utils import read_csv, dataframe_to_matrix
from ad_examples.common.gen_samples import get_synthetic_samples
from ad_examples.aad.aad_support import AadOpts, get_aad_command_args, configure_logger
from ad_examples.aad.forest_description import CompactDescriber, MinimumVolumeCoverDescriber, \
    BayesianRulesetsDescriber, get_region_memberships

from ad_examples.aad.demo_aad import get_debug_args, detect_anomalies_and_describe



In [3]:
from ad_examples.loda.loda import Loda

In [4]:
logger = logging.getLogger(__name__)

# Prepare the aad arguments. It is easier to first create the parsed args and
# then create the actual AadOpts from the args
args = get_aad_command_args(debug=True, debug_args=get_debug_args())

In [5]:
opts = AadOpts(args)
logger.debug(opts.str_opts())
np.random.seed(opts.randseed)
# load synthetic (toy 2) dataset
x, y = get_synthetic_samples(stype=2)

In [6]:
# run interactive anomaly detection loop
model, x_transformed, queried, ridxs_counts, region_extents = detect_anomalies_and_describe(x, y, opts)

baseline found:
[0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 6, 6, 6, 7, 7, 8, 8, 8, 9, 9, 9]
AAD found:
[0, 0, 0, 1, 2, 2, 3, 3, 4, 4, 5, 5, 5, 6, 7, 7, 8, 8, 8, 8, 8, 9, 10, 11, 12, 13, 13, 14, 15, 16]
UNK  4
UNK  4


## Testing simple anomaly detection

Open data and run LODA algorithm on it.

In [7]:
ad_type="loda"
sample_type="donut_"
rnd.seed(42)

In [9]:
# x, y = get_demo_samples(sample_type)
# x, y = get_synthetic_samples(stype=2)
# data_df = read_csv("./data/simple.type123.csv", header=True)
data_df = read_csv("./data/single.type123.csv", header=True)
x, y = dataframe_to_matrix(data_df)

In [10]:
n = x.shape[0]
outliers_fraction = 0.01
xx = yy = x_grid = Z = scores = None

In [11]:
ad = Loda(mink=100, maxk=200)
ad.fit(x)
scores = -ad.decision_function(x)
# Z = -ad.decision_function(x_grid)

In [12]:
print("scores:\n%s" % str(list(scores)))
top_anoms = np.argsort(-scores)[np.arange(10)]

scores:
[482.7925083727895, 434.56830508877164, 427.41014595748777, 601.3994338237019, 641.5434741690622, 709.3481357420129, 493.17354175580004, 436.953111731906, 538.8971111718367, 526.7854597519606, 346.1725139053025, 461.03622547580926, 508.5285722497696, 534.2042454256466, 586.2818806883664, 414.7338745831288, 509.23099649418157, 492.3852484405587, 416.66408595490327, 384.4963707641149, 602.8424191809879, 709.2415523385731, 363.49392478893867, 406.98903419891224, 598.4753328804304, 427.5480362443534, 539.1525933003917, 482.8148786883603, 516.368115851459, 553.8031586782922, 489.1739781327939, 504.86940808829746, 474.98916488845816, 631.606275609281, 557.7094861731721, 416.99387368756425, 338.2173986427892, 377.16680595866586, 380.1679475750481, 553.1085975052608, 735.2175544892636, 417.4483689104154, 531.3354024244265, 453.29097673325657, 494.2942467735826, 420.3830924992027, 469.1284899829982, 500.8691158215088, 444.24369327032576, 528.588342024947, 648.9127071759024, 691.52026481

In [13]:
print(top_anoms)

[98930 98918 98915 98917 98920 98921 98919 98927 98923 98933]


In [14]:
def convert_scores_to_classes(scores, anomaly_ratio):
    """
    Converts list of scores to flags (0/1) - top anomalies are marked as 1.
    """
    anomaly_cnt = int(len(scores) * anomaly_ratio)
    anomaly_indices = np.array(scores).argsort()[-anomaly_cnt:][::-1]
    y_pred = np.zeros(len(scores))
    np.put(y_pred, anomaly_indices, 1)
    return y_pred

In [22]:
y_pred = convert_scores_to_classes(scores, 0.06)

In [23]:
f1 = f1_score(y, y_pred, average='weighted')
print("F1={:f}".format(f1))

F1=0.885286


In [21]:
np.sum(y) / len(y)

0.06111811204032535