In [1]:
import os

import numpy as np
import pandas as pd
from anomaly_delays.helper_functions import generate_random_delays, read_nab
from anomaly_delays.main_functions import calculate_loss, get_scores
from tqdm import tqdm

In [2]:
# Names of the experts whose predictions are combined below. The list is
# passed to read_nab, and the loop further down asserts that there is exactly
# one score column per entry.
# NOTE(review): the order presumably fixes the column order of the expert
# score matrix -- confirm against read_nab before reordering.
experts = [
    "knncad", "numentaTM", "twitterADVec",
    "skyline", "earthgeckoSkyline", "numenta",
    "bayesChangePt", "null", "expose",
    "relativeEntropy", "htmjava", "randomCutForest",
    "random", "contextOSE", "windowedGaussian",
]

## Read predictions from NAB and calculate predictions of Fixed-share and Variable-share

In [3]:
# Seed NumPy's global RNG so repeated runs draw the same random numbers.
# NOTE(review): generate_random_delays presumably draws from this RNG -- confirm.
np.random.seed(42)

results_root = "NAB/results/numenta"

# os.listdir returns entries in arbitrary order. Sort them so the files are
# always processed in the same order, which keeps the seeded random delays
# tied to the same files on every machine.
folders = sorted(
    s
    for s in os.listdir(results_root)
    if s.startswith(("real", "artificial"))
)
alpha_range = (0, 0.01, 0.05, 0.1, 0.3)
share_range = ("Fixed", "Variable")

# Collect one frame per data file and concatenate once after the loop.
# Concatenating inside the loop re-copies everything accumulated so far on
# every iteration, and initialising `results` only on the very first
# (folder, file) pair breaks when the first folder happens to be empty.
frames = []
for folder_name in tqdm(folders):
    # Strip the "numenta" marker from each listed file name before handing it
    # to read_nab.
    # NOTE(review): replace() removes every occurrence, not only a leading
    # prefix -- confirm no data file name contains "numenta" elsewhere.
    files = sorted(
        i.replace("numenta", "")
        for i in os.listdir(os.path.join(results_root, folder_name))
    )
    for file_name in files:
        dt = read_nab(experts, folder_name, file_name)

        # Every expert must contribute exactly one "score*" column.
        score_experts = np.array(dt.filter(regex="^score", axis=1))
        assert score_experts.shape[1] == len(experts)
        target = dt["label"].values

        # Four fixed delays plus one randomly generated delay setting
        # (bounded by min_delay=20 and max_delay=100) per file.
        delays_random = generate_random_delays(
            max_length=dt.shape[0], min_delay=20, max_delay=100
        )
        delays_range = (1, 20, 50, 100, delays_random)
        scores_share = get_scores(
            target, score_experts, share_range, alpha_range, delays_range
        )

        # Join the share-algorithm scores onto the expert scores by row index;
        # validate="1:1" makes pandas raise if either index has duplicates.
        dt = pd.merge(
            dt, scores_share, left_index=True, right_index=True, validate="1:1"
        )
        dt["file_name"] = file_name
        dt["folder_name"] = folder_name
        frames.append(dt)

if not frames:
    raise FileNotFoundError(
        f"No result files found under {results_root!r}; nothing to process."
    )

# Per-file row indices are kept as they are, matching the previous
# loop-based concatenation.
results = pd.concat(frames, axis=0, sort=False)

100%|██████████| 7/7 [34:50<00:00, 298.67s/it]


## Calculate losses

In [4]:
%%time
alg_list = results.filter(regex="^score", axis=1).columns.tolist()
alg_list = [i.replace("score_", "") for i in alg_list]
losses_log = results[
    ["timestamp", "value", "label", "file_name", "folder_name"]
].copy()
losses_square = results[
    ["timestamp", "value", "label", "file_name", "folder_name"]
].copy()
for alg_ind in alg_list:
    losses_log[f"loss_{alg_ind}"] = calculate_loss(
        results["label"].values,
        results[f"score_{alg_ind}"].values,
        share_type="Fixed",
    )
    losses_square[f"loss_{alg_ind}"] = calculate_loss(
        results["label"].values,
        results[f"score_{alg_ind}"].values,
        share_type="Variable",
    )
losses_log_total = losses_log.groupby(["folder_name", "file_name"])[
    losses_log.filter(regex="^loss", axis=1).columns.tolist()
].sum()
losses_square_total = losses_square.groupby(["folder_name", "file_name"])[
    losses_square.filter(regex="^loss", axis=1).columns.tolist()
].sum()

CPU times: user 1.39 s, sys: 520 ms, total: 1.91 s
Wall time: 1.9 s


## Export results

In [5]:
%%time
if not os.path.exists("results"):
    os.makedirs("results")
results.to_csv("results/scores.csv", index=False)
losses_log.to_csv("results/losses_log.csv", index=False)
losses_square.to_csv("results/losses_square.csv", index=False)
losses_log_total.to_csv("results/losses_log_total.csv")
losses_square_total.to_csv("results/losses_square_total.csv")

CPU times: user 1min 15s, sys: 1.25 s, total: 1min 16s
Wall time: 1min 16s
