In [1]:
import os
from collections import defaultdict

import pandas as pd

In [2]:
# Load the three raw BATADAL CSVs: the attack-free training set, the training
# set that contains some attacks, and the test set.
train_no_anom, train_some_anom, test_with_anom = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../../data/01_raw/BATADAL_dataset03_train_no_anomaly.csv",
        "../../data/01_raw/BATADAL_dataset04_train_some_anomaly.csv",
        "../../data/01_raw/BATADAL_test_dataset_some_anomaly.csv",
    )
)

In [3]:
# The headers of this file carry leading whitespace; trim every column name.
train_some_anom.columns = train_some_anom.columns.map(str.strip)

In [4]:
# Display the (rows, columns) shape of each of the three frames side by side.
train_no_anom.shape, train_some_anom.shape, test_with_anom.shape

((8761, 45), (4177, 45), (2089, 44))

In [5]:
# Pairwise schema check: for every ordered pair (i, j) of frames, print how
# many columns exist in frame i but not in frame j, and list them if any.
frames = [train_no_anom, train_some_anom, test_with_anom]
for i, left in enumerate(frames):
    for j, right in enumerate(frames):
        only_in_left = set(left.columns.tolist()) - set(right.columns.tolist())

        print(i, j, len(only_in_left))
        if only_in_left:
            print("\t", only_in_left)

0 0 0
0 1 0
0 2 1
	 {'ATT_FLAG'}
1 0 0
1 1 0
1 2 1
	 {'ATT_FLAG'}
2 0 0
2 1 0
2 2 0


In [6]:
# Display the column names of the attack-free training set.
train_no_anom.columns

Index(['DATETIME', 'L_T1', 'L_T2', 'L_T3', 'L_T4', 'L_T5', 'L_T6', 'L_T7',
       'F_PU1', 'S_PU1', 'F_PU2', 'S_PU2', 'F_PU3', 'S_PU3', 'F_PU4', 'S_PU4',
       'F_PU5', 'S_PU5', 'F_PU6', 'S_PU6', 'F_PU7', 'S_PU7', 'F_PU8', 'S_PU8',
       'F_PU9', 'S_PU9', 'F_PU10', 'S_PU10', 'F_PU11', 'S_PU11', 'F_V2',
       'S_V2', 'P_J280', 'P_J269', 'P_J300', 'P_J256', 'P_J289', 'P_J415',
       'P_J302', 'P_J306', 'P_J307', 'P_J317', 'P_J14', 'P_J422', 'ATT_FLAG'],
      dtype='object')

# NAB
NAB expects a directory containing CSVs with `timestamp, value` as the columns. Therefore, we split each source CSV into multiple CSVs, one per sensor column.

# Train no anomaly

In [7]:
# Maps "<dataset>/<sensor>.csv" to its list of [start, end] anomaly windows;
# the dict is dumped to JSON further down to produce the label file.
anomaly_dict = defaultdict(list)

In [8]:
# Folder name and label-key prefix used for the attack-free training set.
tna_dataset_name = "train_no_anomaly"

In [9]:
# !mkdir ../data/03_primary/{tna_dataset_name}

In [10]:
# !mkdir ../results
# !mkdir ../results/NAB
# !mkdir ../results/NAB/{tna_dataset_name}

In [11]:
# Destination folder for the per-sensor CSVs of the attack-free training set.
output_dir = f"../../data/03_primary/{tna_dataset_name}"

In [12]:
# Parse DATETIME (day/month/two-digit year, then hour) into a datetime column
# named "timestamp", which is the column name the exported CSVs use.
train_no_anom["timestamp"] = pd.to_datetime(train_no_anom["DATETIME"], format="%d/%m/%y %H")

In [13]:
# Every column except the time columns and the attack flag is a sensor
# reading; the original column order is kept.
SENSOR_COLS = [
    name
    for name in train_no_anom.columns
    if name not in ("DATETIME", "ATT_FLAG", "timestamp")
]

In [14]:
# Export one CSV per sensor column (columns: timestamp, value) for the
# attack-free training set and register an empty label list for each file.
# to_csv does not create missing directories, so make sure the target exists
# before writing.
os.makedirs(output_dir, exist_ok=True)

for c in SENSOR_COLS:
    train_no_anom[["timestamp", c]].rename(columns={c:"value"}).to_csv(f"{output_dir}/{c}.csv", index=False)
    # No attacks in this set, hence no anomaly windows for the file.
    anomaly_dict[f"{tna_dataset_name}/{c}.csv"] = []

# Train with anomaly to use as test set

In [15]:
# Folder name / label-key prefix and destination folder for the training set
# that contains attacks.
twa_dataset_name = "train_with_anomaly"
twa_output_dir = f"../../data/03_primary/{twa_dataset_name}"

In [16]:
# !mkdir ../data/03_primary/{twa_dataset_name}
# !mkdir ../results/NAB/{twa_dataset_name}

In [17]:
# Attack windows transcribed from
# http://www.batadal.net/images/Attacks_TrainingDataset2.png
# Each entry is (start, end) written as day/month/four-digit-year hour.
fmt = "%d/%m/%Y %H"
train_anomalies = [
    ("13/09/2016 23", "16/09/2016 00"),
    ("26/09/2016 11", "27/09/2016 10"),
    ("09/10/2016 09", "11/10/2016 20"),
    ("29/10/2016 19", "02/11/2016 16"),
    ("26/11/2016 17", "29/11/2016 04"),
    ("06/12/2016 07", "10/12/2016 04"),
    ("14/12/2016 15", "19/12/2016 04"),
]

# Same windows with both bounds parsed into pandas timestamps.
train_anomalies_dt = [
    tuple(pd.to_datetime(bound, format=fmt) for bound in window)
    for window in train_anomalies
]

In [18]:
# train_anomalies_dt[0][0].strftime('%Y-%m-%d %H:%M:%S.%f')#[:-3]

In [19]:
# Parse DATETIME (day/month/two-digit year, hour) into "timestamp" and make it
# the index so that rows can be selected by time range.
train_some_anom = train_some_anom.assign(
    timestamp=pd.to_datetime(train_some_anom["DATETIME"], format="%d/%m/%y %H")
).set_index("timestamp")

In [20]:
# Start with every row marked as normal (0); attack windows are set to 1 below.
train_some_anom["attack"] = 0

In [21]:
# Mark every row that falls inside an attack window; label-based slicing on
# the timestamp index includes both the start and the end of the window.
for window_start, window_end in train_anomalies_dt:
    train_some_anom.loc[slice(window_start, window_end), "attack"] = 1

In [22]:
# Count rows labelled normal (0) versus attack (1).
train_some_anom["attack"].value_counts()

0    3685
1     492
Name: attack, dtype: int64

In [23]:
# Export one CSV per sensor column (columns: timestamp, value) for the
# training set with attacks, and register the attack windows for each file.
# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(twa_output_dir, exist_ok=True)

# Loop-invariant work done once: bring "timestamp" back as a regular column
# and format the attack windows as strings.
twa_frame = train_some_anom.reset_index()
twa_windows = [
    [s_anom.strftime('%Y-%m-%d %H:%M:%S.%f'), e_anom.strftime('%Y-%m-%d %H:%M:%S.%f')]
    for s_anom, e_anom in train_anomalies_dt
]

for c in SENSOR_COLS:
    twa_frame[["timestamp", c]].rename(
        columns={c:"value"}
    ).to_csv(f"{twa_output_dir}/{c}.csv", index=False)

    # Assign instead of append so that re-running this cell cannot duplicate
    # windows in the labels; every file gets its own copy of the list.
    anomaly_dict[f"{twa_dataset_name}/{c}.csv"] = [list(window) for window in twa_windows]

# Actual test set

In [25]:
# Folder name / label-key prefix and destination folder for the test set.
testwa_dataset_name = "test_with_anomaly"
testwa_output_dir = f"../../data/03_primary/{testwa_dataset_name}"

In [26]:
# !mkdir ../data/03_primary/{testwa_dataset_name}
# !mkdir ../results/NAB/{testwa_dataset_name}

In [27]:
# Attack windows transcribed from
# http://www.batadal.net/images/Attacks_TestDataset.png
# Each entry is (start, end) written as day/month/four-digit-year hour.
test_anomalies = [
    ("16/01/2017 09", "19/01/2017 06"),
    ("30/01/2017 08", "02/02/2017 00"),
    ("09/02/2017 03", "10/02/2017 09"),
    ("12/02/2017 01", "13/02/2017 07"),
    ("24/02/2017 05", "28/02/2017 08"),
    ("10/03/2017 14", "13/03/2017 21"),
    ("25/03/2017 20", "27/03/2017 01"),
]

# Same windows with both bounds parsed into pandas timestamps; `fmt` is the
# format string defined alongside the training-set windows.
test_anomalies_dt = [
    tuple(pd.to_datetime(bound, format=fmt) for bound in window)
    for window in test_anomalies
]

In [28]:
# Parse DATETIME (day/month/two-digit year, hour) into "timestamp" and make it
# the index so that rows can be selected by time range.
test_with_anom = test_with_anom.assign(
    timestamp=pd.to_datetime(test_with_anom["DATETIME"], format="%d/%m/%y %H")
).set_index("timestamp")

In [29]:
# Default every row to normal (0), then mark the rows inside each attack
# window; label-based slicing on the timestamp index includes both endpoints.
test_with_anom["attack"] = 0
for window_start, window_end in test_anomalies_dt:
    test_with_anom.loc[slice(window_start, window_end), "attack"] = 1

In [30]:
# Count rows labelled normal (0) versus attack (1).
test_with_anom["attack"].value_counts()

0    1682
1     407
Name: attack, dtype: int64

In [31]:
# Export one CSV per sensor column (columns: timestamp, value) for the test
# set, and register the attack windows for each file.
# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(testwa_output_dir, exist_ok=True)

# Loop-invariant work done once: bring "timestamp" back as a regular column
# and format the attack windows as strings.
testwa_frame = test_with_anom.reset_index()
testwa_windows = [
    [s_anom.strftime('%Y-%m-%d %H:%M:%S.%f'), e_anom.strftime('%Y-%m-%d %H:%M:%S.%f')]
    for s_anom, e_anom in test_anomalies_dt
]

for c in SENSOR_COLS:
    testwa_frame[["timestamp", c]].rename(
        columns={c:"value"}
    ).to_csv(f"{testwa_output_dir}/{c}.csv", index=False)

    # Assign instead of append so that re-running this cell cannot duplicate
    # windows in the labels; every file gets its own copy of the list.
    anomaly_dict[f"{testwa_dataset_name}/{c}.csv"] = [list(window) for window in testwa_windows]

### Save labels

In [32]:
import json

In [33]:
# Serialise all label windows collected so far and write them to the
# labels-window file.
with open("../../data/03_primary/labels-window.json", 'w') as label_file:
    label_file.write(json.dumps(anomaly_dict, indent=4))


In [38]:
# Smaller label set for debugging: keep only the entries whose key contains
# "train_with_anomaly/F_".
debug_labels = {
    path: windows
    for path, windows in anomaly_dict.items()
    if "train_with_anomaly/F_" in path
}
debug_keys = list(debug_labels)

In [39]:
# Write the debugging subset of the labels to its own JSON file.
with open("../../data/03_primary/labels-window-debug.json", 'w') as label_file:
    label_file.write(json.dumps(debug_labels, indent=4))

In [40]:
# !cp ../../data/03_primary/labels-window-debug.json /home/ec2-user/SageMaker/NAB/labels/batadal-debug-labels.json

# Version 2: combined

Because NAB detects anomalies at the level of an individual time series, using previous time steps to predict future ones, it is also worth experimenting with concatenating the three sets into one large set.

In [43]:
# Re-parse DATETIME into "timestamp" and make it the index, so this frame is
# indexed the same way as the other two before they are concatenated.
train_no_anom = train_no_anom.assign(
    timestamp=pd.to_datetime(train_no_anom["DATETIME"], format="%d/%m/%y %H")
).set_index("timestamp")

In [51]:
# Stack the three frames in order: attack-free training set, training set
# with attacks, test set. ATT_FLAG is removed from the two training frames.
combined = pd.concat(
    [
        train_no_anom.drop(columns="ATT_FLAG"),
        train_some_anom.drop(columns="ATT_FLAG"),
        test_with_anom,
    ]
)

In [54]:
# Rows with no "attack" value after the concatenation are treated as
# normal (0); then show the resulting label counts.
combined = combined.assign(attack=combined["attack"].fillna(0))
combined["attack"].value_counts()

0.0    14128
1.0      899
Name: attack, dtype: int64

In [62]:
# All attack windows of the combined series: training windows first, then
# the test windows.
combined_anomalies = [*train_anomalies_dt, *test_anomalies_dt]

In [59]:
# Destination folder for the per-sensor CSVs of the combined series.
# NOTE(review): absolute, machine-specific path, unlike the relative
# ../../data paths used for the other exports; adjust when running elsewhere.
combined_output_dir = "/home/ec2-user/SageMaker/NAB/data-batadal/combined"

In [63]:
# Export one CSV per sensor column (columns: timestamp, value) for the
# combined series, and register all attack windows under "combined/<sensor>.csv".
# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(combined_output_dir, exist_ok=True)

# Loop-invariant work done once: bring "timestamp" back as a regular column
# and format the attack windows as strings.
combined_frame = combined.reset_index()
combined_windows = [
    [s_anom.strftime('%Y-%m-%d %H:%M:%S.%f'), e_anom.strftime('%Y-%m-%d %H:%M:%S.%f')]
    for s_anom, e_anom in combined_anomalies
]

for c in SENSOR_COLS:
    combined_frame[["timestamp", c]].rename(
        columns={c:"value"}
    ).to_csv(f"{combined_output_dir}/{c}.csv", index=False)

    # Assign instead of append so that re-running this cell cannot duplicate
    # windows in the labels; every file gets its own copy of the list.
    anomaly_dict[f"combined/{c}.csv"] = [list(window) for window in combined_windows]

In [64]:
# Display every file key that now has an entry in the labels dict.
anomaly_dict.keys()

dict_keys(['train_no_anomaly/L_T1.csv', 'train_no_anomaly/L_T2.csv', 'train_no_anomaly/L_T3.csv', 'train_no_anomaly/L_T4.csv', 'train_no_anomaly/L_T5.csv', 'train_no_anomaly/L_T6.csv', 'train_no_anomaly/L_T7.csv', 'train_no_anomaly/F_PU1.csv', 'train_no_anomaly/S_PU1.csv', 'train_no_anomaly/F_PU2.csv', 'train_no_anomaly/S_PU2.csv', 'train_no_anomaly/F_PU3.csv', 'train_no_anomaly/S_PU3.csv', 'train_no_anomaly/F_PU4.csv', 'train_no_anomaly/S_PU4.csv', 'train_no_anomaly/F_PU5.csv', 'train_no_anomaly/S_PU5.csv', 'train_no_anomaly/F_PU6.csv', 'train_no_anomaly/S_PU6.csv', 'train_no_anomaly/F_PU7.csv', 'train_no_anomaly/S_PU7.csv', 'train_no_anomaly/F_PU8.csv', 'train_no_anomaly/S_PU8.csv', 'train_no_anomaly/F_PU9.csv', 'train_no_anomaly/S_PU9.csv', 'train_no_anomaly/F_PU10.csv', 'train_no_anomaly/S_PU10.csv', 'train_no_anomaly/F_PU11.csv', 'train_no_anomaly/S_PU11.csv', 'train_no_anomaly/F_V2.csv', 'train_no_anomaly/S_V2.csv', 'train_no_anomaly/P_J280.csv', 'train_no_anomaly/P_J269.csv', 't

In [65]:
# Write the labels to the NAB labels folder. The whole of anomaly_dict is
# written, i.e. the per-dataset keys as well as the "combined/" ones.
with open("/home/ec2-user/SageMaker/NAB/labels/batadal-labels-combined.json", 'w') as label_file:
    label_file.write(json.dumps(anomaly_dict, indent=4))