# Anomaly Detection
```
CSV files required as preconditions
```

### Make anomaly datasets

In [2]:
import pandas as pd
import numpy as np
import os


def write_anomaly_datasets_with_csv(input_path, output_path, anomaly_rate=1):
    """Turn a CSV with a numeric target into a binary anomaly-labelled CSV.

    The last column of the input file is treated as the target. Rows whose
    target value is strictly greater than the ``100 - anomaly_rate``
    percentile of that column are labelled ``1``; all other rows are
    labelled ``0``. The target column is removed and the labels are stored
    in a new trailing column named ``Anomaly``.

    Args:
        input_path: Path of the CSV file to read.
        output_path: Path of the CSV file to write (no index column).
        anomaly_rate: Percentage of the upper tail of the target values
            that should be labelled as anomalous.
    """
    df = pd.read_csv(input_path)
    target_name = df.columns[-1]
    targets = np.array(df[target_name])
    threshold = np.percentile(targets, 100 - anomaly_rate)

    # Vectorised equivalent of "0 if value <= threshold else 1". Anything
    # that does not compare <= threshold (NaN included) is labelled 1.
    labels = np.where(targets <= threshold, 0, 1)

    # drop() already returns a new frame, so no explicit copy is required.
    modified_df = df.drop(columns=target_name)
    modified_df['Anomaly'] = labels

    modified_df.to_csv(output_path, index=False)


def make_input_output_path(target, poc, delay):
    """Build the source and destination CSV paths for one dataset.

    The file name is ``{poc}_{first three letters of target, lower-cased}_
    {delay}hr.csv``. The source lives under ``./datasets/{poc}/`` and the
    destination under ``./anomaly_datasets/{poc}/``. The destination
    directory is created as a side effect if it does not exist yet.

    Args:
        target: Target column name; only its first three characters are used.
        poc: Identifier used both as sub-folder and as file-name prefix.
        delay: Delay value embedded in the file name before ``hr``.

    Returns:
        A ``(input_path, output_path)`` tuple of relative path strings.
    """
    file_name = f'{poc}_{target[:3].lower()}_{delay}hr.csv'
    input_path = f'./datasets/{poc}/{file_name}'

    output_dir = f'./anomaly_datasets/{poc}'
    # makedirs creates the parent './anomaly_datasets' when needed, and
    # exist_ok=True avoids the check-then-create race of os.path.exists().
    os.makedirs(output_dir, exist_ok=True)
    output_path = f'{output_dir}/{file_name}'

    return input_path, output_path


# Target columns; only the first three letters (lower-cased) end up in the
# dataset file names.
targets = ['Avg$PerMWHr', 'Med$PerMWHr']
pocs = [
        'ALB0331',   ## Auckland
        'HAM0331',   ## Hamilton
        'WIL0331',   ## Wellington
        'ISL0661',   ## ChristChurch
        'SDN0331',   ## Dunedin
        'STK0331',   ## Nelson
       ]
delays = [
    0.5, 4, 6, 24
]
# Percentage of defined anomalies in the datasets
anomaly_rate = [0.1, 0.5, 1, 2]

# Write one anomaly-labelled CSV per (rate, target, poc, delay) combination.
for r in anomaly_rate:
    for t in targets:
        for p in pocs:
            for d in delays:
                # Named *_path so the builtin input() is not shadowed.
                input_path, output_path = make_input_output_path(t, p, d)
                # Insert the rate before the extension:
                # "<name>.csv" -> "<name>_<rate>.csv".
                stem, ext = os.path.splitext(output_path)
                write_anomaly_datasets_with_csv(input_path, f'{stem}_{r}{ext}', r)

### Define experimental loop

In [3]:
import concurrent.futures
import time

from river import anomaly
from river import stream
from sklearn.metrics import roc_auc_score, average_precision_score
import random

# Folder setting pointing at the current directory; it is not referenced by
# any of the code visible in this section.
datasets_folder = './'
def _evaluate_hst_river(file_name, window_size=256, height=8, n_trees=100, seed=None, shuffle=True, skip_anomaly=False):
    """Run one HalfSpaceTrees pass over a labelled CSV and score it.

    All columns except the last are used as features and the last column is
    cast to ``int`` and used as the label. Features are min-max scaled per
    column, then each row is first scored and afterwards learned by the
    model, in file order.

    Args:
        file_name: CSV file to evaluate.
        window_size: Passed through to ``anomaly.HalfSpaceTrees``.
        height: Passed through to ``anomaly.HalfSpaceTrees``.
        n_trees: Passed through to ``anomaly.HalfSpaceTrees``.
        seed: Passed through to ``anomaly.HalfSpaceTrees``.
        shuffle: Accepted for signature compatibility; not used in the body.
        skip_anomaly: When true, rows labelled ``1`` are scored but never
            learned.

    Returns:
        ``(roc_auc, average_precision, elapsed_seconds)`` where the elapsed
        time covers only the score/learn loop.
    """
    frame = pd.read_csv(file_name)
    features = frame.iloc[:, :-1]
    labels = frame.iloc[:, -1].astype(int)

    # Min-max scaling per column.
    # NOTE(review): a constant column gives 0/0 here and therefore NaN.
    col_min = features.min()
    col_range = features.max() - features.min()
    features = (features - col_min) / col_range

    detector = anomaly.HalfSpaceTrees(
        n_trees=n_trees,
        height=height,
        window_size=window_size,
        seed=seed,
    )

    scores = []
    started = time.time()
    for sample, label in stream.iter_pandas(features, labels):
        # Score before learning so the model never sees the row it rates.
        scores.append(float(detector.score_one(sample)))
        if not (label == 1 and skip_anomaly):
            detector.learn_one(sample)
    finished = time.time()

    auc = roc_auc_score(labels, scores)
    pr = average_precision_score(labels, scores)
    return auc, pr, finished - started


def evaluate_hst_river(file_name, window_size=256, height=8, n_trees=100, runs=10, shuffle=True, skip_anomaly=False, print_each_result=False):
    """Repeat the HalfSpaceTrees evaluation with several seeds and aggregate.

    ``_evaluate_hst_river`` is submitted to a thread pool once per run, using
    the run index (``0 .. runs - 1``) as the model seed.

    Args:
        file_name: CSV file to evaluate.
        window_size: Forwarded to ``_evaluate_hst_river``.
        height: Forwarded to ``_evaluate_hst_river``.
        n_trees: Forwarded to ``_evaluate_hst_river``.
        runs: Number of evaluations (and seeds) to execute.
        shuffle: Forwarded to ``_evaluate_hst_river`` and stored in the
            ``shuffled`` column of the returned frame.
        skip_anomaly: Forwarded to ``_evaluate_hst_river``.
        print_each_result: When true, print mean and standard deviation of
            AUC, PR and runtime.

    Returns:
        ``(mean_auc, mean_pr, mean_runtime, df)`` where ``df`` has one row per
        run with the columns ``dataset``, ``method``, ``auc``, ``pr``,
        ``runtime``, ``seed`` and ``shuffled``.

    Raises:
        Any exception raised inside a worker is re-raised by
        ``future.result()``.
    """
    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures = [
            executor.submit(_evaluate_hst_river, file_name, window_size, height, n_trees, run, shuffle,
                            skip_anomaly)
            for run in range(runs)
        ]

    # Leaving the with-block has already waited for every worker, so the
    # results are read in submission (seed) order. This keeps the order of
    # the per-run records reproducible instead of depending on set iteration.
    auc_results = []
    pr_results = []
    runtimes = []
    records = []
    for run, future in enumerate(futures):
        auc, pr, elapse = future.result()
        auc_results.append(auc)
        pr_results.append(pr)
        runtimes.append(elapse)
        records.append((file_name, 'HST-RIVER', auc, pr, elapse, run, shuffle))

    if print_each_result:
        print(f"AUC mean: {np.mean(auc_results)} , AUC std: {np.std(auc_results)}")
        print(f"PR mean: {np.mean(pr_results)}, PR std: {np.std(pr_results)}")
        print(f"Runtime mean: {np.mean(runtimes)}, Runtime std: {np.std(runtimes)}")
    df = pd.DataFrame(records, columns=['dataset', 'method', 'auc', 'pr', 'runtime', 'seed', 'shuffled'])
    # df.to_csv(os.path.join(record_folder, 'aif-ecai-records.csv'), index=False, header=None,  mode='a')
    return np.mean(auc_results), np.mean(pr_results), np.mean(runtimes), df


### Run experiments

In [None]:
def make_dateset_name(poc, delay, type, ratio):
    """Return the relative path of one anomaly dataset CSV.

    The result has the form
    ``./anomaly_datasets/{poc}/{poc}_{type}_{delay}hr_{ratio}.csv``.
    """
    folder = f'./anomaly_datasets/{poc}'
    stem = f'{poc}_{type}_{delay}hr_{ratio}'
    return f'{folder}/{stem}.csv'


# Identifiers used as dataset sub-folder names and file-name prefixes; the
# trailing comments give the city each one stands for.
pocs = [
        'ALB0331',   ## Auckland
        'HAM0331',   ## Hamilton
        'WIL0331',   ## Wellington
        'ISL0661',   ## ChristChurch
        'SDN0331',   ## Dunedin
        'STK0331',   ## Nelson
       ]

# Per-run result frames, one entry per evaluated dataset file.
results_dfs = []
# Parallel lists: index i across all of them describes one evaluated file.
p_col = []
t_col = []
d_col = []
r_col = []
auc_col = []

# Evaluate every (poc, delay, target, ratio) combination. The values are
# strings because they are interpolated directly into the file name.
for p in pocs:
    for d in ['0.5','4','6','24']:
        for t in ['avg', 'med']:
            for r in ['0.1', '0.5', '1', '2']:
                file_name = make_dateset_name(p,d,t,r)
                # Three runs per file; only the mean AUC and the per-run
                # frame are stored below, pr and time111 are not used here.
                auc, pr, time111, single_run_df = evaluate_hst_river(file_name=file_name, runs=3)
                p_col.append(p)
                t_col.append(t)
                d_col.append(d)
                r_col.append(r)
                auc_col.append(auc)
                results_dfs.append(single_run_df)

# Column-oriented summary: one entry per evaluated file in each list.
data ={
    "PoC":p_col,
    "Target":t_col,
    "Delay":d_col,
    "Ratio":r_col,
    "AUC ROC":auc_col,
}
