In [1]:
import csv
import os
import shlex
import shutil
import subprocess
from pathlib import Path

import numpy as np
import pandas as pd
from main_functions import calculate_loss
from nab_functions import get_scores, read_nab
from tqdm.notebook import tqdm

## List of experts

In [2]:
# Names of the NAB detectors that act as the "experts" in this notebook.
# The list is passed to read_nab when the per-file predictions are loaded and is
# appended to the detector list handed to NAB's run.py when scores are computed.
# The loading cell asserts that read_nab returns exactly one "score*" column per
# entry, so adding or removing a name here changes what that assertion expects.
algorithm_list = [
    "knncad",
    "numentaTM",
    "twitterADVec",
    "skyline",
    "earthgeckoSkyline",
    "numenta",
    "bayesChangePt",
    "null",
    "expose",
    "relativeEntropy",
    "htmjava",
    "randomCutForest",
    "random",
    "contextOSE",
    "windowedGaussian",
]

## Read predictions from NAB and calculate predictions of Fixed-share and Variable-share

In [3]:
# The "numenta" detector's result tree is used as the reference listing of
# dataset folders and files.
dir_list = os.listdir("NAB/results/numenta")
# Keep only the dataset folders (names containing "real" or "artificial").
folder_list = [s for s in dir_list if any(xs in s for xs in ["real", "artificial"])]
# Parameter grid handed to get_scores: every share type is evaluated for every alpha.
# NOTE(review): presumably alpha is the share/mixing rate of the aggregating
# algorithms -- confirm against nab_functions.get_scores.
alpha_range = [0, 0.05, 0.1, 0.2, 0.3, 0.5, 0.7, 0.9]
share_range = ["Fixed", "Variable"]

# Collect one frame per data file and concatenate once at the end; concatenating
# inside the loop re-copies all accumulated rows on every iteration.
frames = []
for folder_name in tqdm(folder_list):
    file_list = os.listdir(f"NAB/results/numenta/{folder_name}")
    # Drop the detector name from the file name so that the remainder can be
    # combined with any other detector name.
    file_list = [i.replace("numenta", "") for i in file_list]
    for file_name in file_list:
        dt = read_nab(algorithm_list, folder_name, file_name)
        # Matrix of expert predictions: one column per "score*" column of dt.
        score_experts = np.array(dt.filter(regex="^score", axis=1))
        assert score_experts.shape[1] == len(algorithm_list)
        target = dt["label"].values
        scores_share = get_scores(target, score_experts, share_range, alpha_range)
        # validate="1:1" makes pandas raise if either index has duplicate keys.
        dt = pd.merge(dt, scores_share, left_index=True, right_index=True, validate="1:1")
        dt["file_name"] = file_name
        dt["folder_name"] = folder_name
        frames.append(dt)

# Fail here with a clear message instead of leaving `results` undefined.
if not frames:
    raise ValueError("No NAB result files were found under NAB/results/numenta")
results = pd.concat(frames, axis=0, sort=False)

HBox(children=(FloatProgress(value=0.0, max=7.0), HTML(value='')))




## Calculate losses

In [4]:
%%time
# Every "score_<name>" column of `results` is one predictor to evaluate.
score_columns = results.filter(regex="^score", axis=1).columns.tolist()
alg_list = [column.replace("score_", "") for column in score_columns]

# Both loss tables start from the same identifying columns.
id_columns = ["timestamp", "value", "label", "file_name", "folder_name"]
losses_log = results[id_columns].copy()
losses_square = results[id_columns].copy()

# Per-row losses of each predictor under the two loss types.
label_values = results["label"].values
for alg_ind in alg_list:
    score_values = results[f"score_{alg_ind}"].values
    loss_column = f"loss_{alg_ind}"
    losses_log[loss_column] = calculate_loss(label_values, score_values, loss_type="log")
    losses_square[loss_column] = calculate_loss(label_values, score_values, loss_type="square")

# Total loss per data file: sum the loss columns within each (folder, file) pair.
group_keys = ["folder_name", "file_name"]
log_loss_columns = losses_log.filter(regex="^loss", axis=1).columns.tolist()
losses_log_total = losses_log.groupby(group_keys)[log_loss_columns].sum()
square_loss_columns = losses_square.filter(regex="^loss", axis=1).columns.tolist()
losses_square_total = losses_square.groupby(group_keys)[square_loss_columns].sum()

CPU times: user 649 ms, sys: 149 ms, total: 798 ms
Wall time: 797 ms


## Export results

In [5]:
%%time
# Create the output directory when it is missing; exist_ok=True makes the call a
# no-op when the directory is already there, with no separate existence check.
os.makedirs("results", exist_ok=True)

# Row-level tables: written without the row index.
results.to_csv("results/scores.csv", index=False)
losses_log.to_csv("results/losses_log.csv", index=False)
losses_square.to_csv("results/losses_square.csv", index=False)

# Per-file totals: the index holds the (folder_name, file_name) group keys, so it
# is written out with the data.
losses_log_total.to_csv("results/losses_log_total.csv")
losses_square_total.to_csv("results/losses_square_total.csv")

CPU times: user 39.8 s, sys: 643 ms, total: 40.4 s
Wall time: 40.4 s


## Export the predictions of Fixed-share and Variable-share in the NAB format

In [6]:
%%time
# Copy the NAB tree once; all files written below go into the copy under results/NAB.
if not os.path.exists("results/NAB"):
    shutil.copytree("NAB", "results/NAB")

# Names of the aggregated predictors: the "score_" prefix is stripped from every
# column of `results` whose name contains "Fixed" or "Variable".
share_list = results.filter(regex="Fixed|Variable", axis=1).columns.tolist()
share_list = [i.replace("score_", "") for i in share_list]

for folder_name in folder_list:
    file_list = os.listdir(f"results/NAB/results/numenta/{folder_name}")
    file_list = [i.replace("numenta", "") for i in file_list]
    for file_name in file_list:
        # The row selection does not depend on the predictor, so do it once per
        # file instead of once per (predictor, file) pair.
        results_temp = results[
            (results["folder_name"] == folder_name) & (results["file_name"] == file_name)
        ]
        for share_name in share_list:
            os.makedirs(f"results/NAB/results/{share_name}/{folder_name}", exist_ok=True)
            # Write timestamp, value, anomaly_score, label in that order, under
            # the path <predictor>/<folder>/<predictor><file suffix>.
            results_temp[["timestamp", "value", f"score_{share_name}", "label"]].rename(
                {f"score_{share_name}": "anomaly_score"}, axis=1
            ).to_csv(
                f"results/NAB/results/{share_name}/{folder_name}/{share_name}{file_name}",
                index=False,
            )

CPU times: user 44.1 s, sys: 1.51 s, total: 45.6 s
Wall time: 45.7 s


## Calculate NAB scores

In [7]:
%%time
# Delete the summary left by a previous run before scoring again.
# NOTE(review): presumably so that stale scores are never mistaken for the
# output of this run -- confirm how run.py treats an existing file.
final_results_path = Path("results") / "NAB/results/final_results.json"
if final_results_path.exists():
    final_results_path.unlink()

# Build argv directly: each detector name is passed as exactly one argument,
# with no round trip through a shell-style string.
nab_command = [
    "python",
    "run.py",
    "-d",
    *share_list,
    *algorithm_list,
    "--optimize",
    "--score",
    "--normalize",
]

# input=b"y\n" supplies on stdin the same bytes an `echo y` pipe would, without a
# helper process that is never waited on.
# NOTE(review): presumably this answers a confirmation prompt in run.py -- confirm.
# check=True raises CalledProcessError when run.py exits with a non-zero status
# instead of silently continuing.
subprocess.run(nab_command, cwd="results/NAB", input=b"y\n", check=True)

CPU times: user 8.29 ms, sys: 29.2 ms, total: 37.5 ms
Wall time: 6min 24s


CompletedProcess(args=['python', 'run.py', '-d', 'Fixed0', 'Fixed5', 'Fixed10', 'Fixed20', 'Fixed30', 'Fixed50', 'Fixed70', 'Fixed90', 'Variable0', 'Variable5', 'Variable10', 'Variable20', 'Variable30', 'Variable50', 'Variable70', 'Variable90', 'knncad', 'numentaTM', 'twitterADVec', 'skyline', 'earthgeckoSkyline', 'numenta', 'bayesChangePt', 'null', 'expose', 'relativeEntropy', 'htmjava', 'randomCutForest', 'random', 'contextOSE', 'windowedGaussian', '--optimize', '--score', '--normalize'], returncode=0)