In [1]:
import os
import shutil

import pandas as pd

In [2]:
# Read the three raw BATADAL CSVs. Going by the file names, these are (in
# order) the training data without anomalies, the training data with some
# anomalies, and the test data.
train_no_anom, train_some_anom, test_with_anom = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/01_raw/BATADAL_dataset03_train_no_anomaly.csv",
        "../data/01_raw/BATADAL_dataset04_train_some_anomaly.csv",
        "../data/01_raw/BATADAL_test_dataset_some_anomaly.csv",
    )
)

In [3]:
# The header of this file has leading white space; trim every column name so
# the names line up with the other two frames.
train_some_anom.columns = [name.strip() for name in train_some_anom.columns]

In [4]:
# (rows, columns) of each frame, in load order.
tuple(frame.shape for frame in (train_no_anom, train_some_anom, test_with_anom))

((8761, 45), (4177, 45), (2089, 44))

In [5]:
# Pairwise schema check: for every ordered pair (i, j) of frames, report how
# many columns exist in frame i but not in frame j, and which ones they are.
column_sets = [
    set(frame.columns)
    for frame in (train_no_anom, train_some_anom, test_with_anom)
]
for i, cols_i in enumerate(column_sets):
    for j, cols_j in enumerate(column_sets):
        only_in_i = cols_i - cols_j

        print(i, j, len(only_in_i))
        if only_in_i:
            print("\t", only_in_i)

0 0 0
0 1 0
0 2 1
	 {'ATT_FLAG'}
1 0 0
1 1 0
1 2 1
	 {'ATT_FLAG'}
2 0 0
2 1 0
2 2 0


In [6]:
# Show the full list of column names in the no-anomaly training frame.
train_no_anom.columns

Index(['DATETIME', 'L_T1', 'L_T2', 'L_T3', 'L_T4', 'L_T5', 'L_T6', 'L_T7',
       'F_PU1', 'S_PU1', 'F_PU2', 'S_PU2', 'F_PU3', 'S_PU3', 'F_PU4', 'S_PU4',
       'F_PU5', 'S_PU5', 'F_PU6', 'S_PU6', 'F_PU7', 'S_PU7', 'F_PU8', 'S_PU8',
       'F_PU9', 'S_PU9', 'F_PU10', 'S_PU10', 'F_PU11', 'S_PU11', 'F_V2',
       'S_V2', 'P_J280', 'P_J269', 'P_J300', 'P_J256', 'P_J289', 'P_J415',
       'P_J302', 'P_J306', 'P_J307', 'P_J317', 'P_J14', 'P_J422', 'ATT_FLAG'],
      dtype='object')

# Training set without anomalies

In [7]:
# Destination folder for every file this notebook exports.
output_dir = "../../GDN/data/batadal"

# Create the folder up front (this replaces the manual `!mkdir` step) so the
# writes below also succeed on a fresh checkout. With exist_ok=True this is a
# no-op when the folder is already there.
os.makedirs(output_dir, exist_ok=True)

In [8]:
# Every column except the raw datetime string and the attack flag is treated
# as a sensor reading; original column order is preserved.
NON_SENSOR_COLS = {"DATETIME", "ATT_FLAG"}
SENSOR_COLS = [col for col in train_no_anom.columns if col not in NON_SENSOR_COLS]

In [9]:
# Write the sensor names to list.txt, one per line, with no trailing newline.
# `write` is used instead of `writelines` because the payload is one
# pre-joined string; `writelines` would iterate it character by character.
with open(f"{output_dir}/list.txt", "w") as f:
    f.write("\n".join(SENSOR_COLS))

In [10]:
# Export the no-anomaly training data: the 0-based row number (as "timestamp")
# followed by the sensor columns.
# index=False keeps this file consistent with the other exports in this
# notebook; without it the RangeIndex is written as an extra unnamed first
# column that merely duplicates "timestamp".
train_export = train_no_anom.reset_index().rename(columns={"index": "timestamp"})
train_export[["timestamp"] + SENSOR_COLS].to_csv(f"{output_dir}/train.csv", index=False)

# Training set with anomalies, to be used as a test set

In [11]:
# Attack windows (start, end) for the second training dataset, transcribed
# from http://www.batadal.net/images/Attacks_TrainingDataset2.png
# Each bound is written as "day/month/4-digit-year hour".
fmt = "%d/%m/%Y %H"
train_anomalies = [
    ("13/09/2016 23", "16/09/2016 00"),
    ("26/09/2016 11", "27/09/2016 10"),
    ("09/10/2016 09", "11/10/2016 20"),
    ("29/10/2016 19", "02/11/2016 16"),
    ("26/11/2016 17", "29/11/2016 04"),
    ("06/12/2016 07", "10/12/2016 04"),
    ("14/12/2016 15", "19/12/2016 04"),
]

# Same windows with both bounds parsed into pandas Timestamps.
train_anomalies_dt = [
    tuple(pd.to_datetime(bound, format=fmt) for bound in window)
    for window in train_anomalies
]

In [12]:
# Inspect the parsed (start, end) Timestamp pairs to verify the format string.
train_anomalies_dt

[(Timestamp('2016-09-13 23:00:00'), Timestamp('2016-09-16 00:00:00')),
 (Timestamp('2016-09-26 11:00:00'), Timestamp('2016-09-27 10:00:00')),
 (Timestamp('2016-10-09 09:00:00'), Timestamp('2016-10-11 20:00:00')),
 (Timestamp('2016-10-29 19:00:00'), Timestamp('2016-11-02 16:00:00')),
 (Timestamp('2016-11-26 17:00:00'), Timestamp('2016-11-29 04:00:00')),
 (Timestamp('2016-12-06 07:00:00'), Timestamp('2016-12-10 04:00:00')),
 (Timestamp('2016-12-14 15:00:00'), Timestamp('2016-12-19 04:00:00'))]

In [13]:
# Keep the original row number as a "timestamp" column, then re-index the
# frame by the parsed DATETIME strings (day/month/2-digit-year hour) so rows
# can be selected by time range.
train_some_anom = (
    train_some_anom
    .reset_index()
    .rename(columns={"index": "timestamp"})
    .assign(pdDateTime=lambda frame: pd.to_datetime(frame["DATETIME"], format="%d/%m/%y %H"))
    .set_index(["pdDateTime"])
)

In [14]:
# Start with every row labelled as "no attack".
train_some_anom = train_some_anom.assign(attack=0)

In [15]:
# Flag every row whose datetime index falls inside an attack window.
# Label-based slicing with .loc includes both the start and the end bound.
for window_start, window_end in train_anomalies_dt:
    train_some_anom.loc[slice(window_start, window_end), "attack"] = 1

In [16]:
# Count rows per label (0 = no attack, 1 = inside an attack window).
train_some_anom["attack"].value_counts()

0    3685
1     492
Name: attack, dtype: int64

In [27]:
# Export row number, sensor readings and the attack label; the datetime index
# is not written.
labelled_cols = ["timestamp", *SENSOR_COLS, "attack"]
train_some_anom[labelled_cols].to_csv(f"{output_dir}/train_some_anom.csv", index=False)

In [19]:
# Re-check the label counts after the export (same check as the earlier cell).
train_some_anom["attack"].value_counts()

0    3685
1     492
Name: attack, dtype: int64

# Actual test set

In [22]:
# Attack windows (start, end) for the test dataset, transcribed from
# http://www.batadal.net/images/Attacks_TestDataset.png
test_anomalies = [
    ("16/01/2017 09", "19/01/2017 06"),
    ("30/01/2017 08", "02/02/2017 00"),
    ("09/02/2017 03", "10/02/2017 09"),
    ("12/02/2017 01", "13/02/2017 07"),
    ("24/02/2017 05", "28/02/2017 08"),
    ("10/03/2017 14", "13/03/2017 21"),
    ("25/03/2017 20", "27/03/2017 01"),
]

# Same windows with both bounds parsed into pandas Timestamps; reuses the
# `fmt` format string defined with the training windows.
test_anomalies_dt = [
    tuple(pd.to_datetime(bound, format=fmt) for bound in window)
    for window in test_anomalies
]

In [23]:
# Inspect the parsed (start, end) Timestamp pairs for the test windows.
test_anomalies_dt

[(Timestamp('2017-01-16 09:00:00'), Timestamp('2017-01-19 06:00:00')),
 (Timestamp('2017-01-30 08:00:00'), Timestamp('2017-02-02 00:00:00')),
 (Timestamp('2017-02-09 03:00:00'), Timestamp('2017-02-10 09:00:00')),
 (Timestamp('2017-02-12 01:00:00'), Timestamp('2017-02-13 07:00:00')),
 (Timestamp('2017-02-24 05:00:00'), Timestamp('2017-02-28 08:00:00')),
 (Timestamp('2017-03-10 14:00:00'), Timestamp('2017-03-13 21:00:00')),
 (Timestamp('2017-03-25 20:00:00'), Timestamp('2017-03-27 01:00:00'))]

In [24]:
# Keep the original row number as a "timestamp" column, then re-index the
# frame by the parsed DATETIME strings (day/month/2-digit-year hour) so rows
# can be selected by time range.
test_with_anom = (
    test_with_anom
    .reset_index()
    .rename(columns={"index": "timestamp"})
    .assign(pdDateTime=lambda frame: pd.to_datetime(frame["DATETIME"], format="%d/%m/%y %H"))
    .set_index(["pdDateTime"])
)

In [25]:
# Label every row as "no attack", then flag the rows whose datetime index
# falls inside an attack window. Label-based slicing with .loc includes both
# the start and the end bound.
test_with_anom = test_with_anom.assign(attack=0)
for window_start, window_end in test_anomalies_dt:
    test_with_anom.loc[slice(window_start, window_end), "attack"] = 1

In [26]:
# Count rows per label (0 = no attack, 1 = inside an attack window).
test_with_anom["attack"].value_counts()

0    1682
1     407
Name: attack, dtype: int64

In [28]:
# Export row number, sensor readings and the attack label; the datetime index
# is not written.
labelled_cols = ["timestamp", *SENSOR_COLS, "attack"]
test_with_anom[labelled_cols].to_csv(f"{output_dir}/test_with_anom.csv", index=False)

# Make copy of test set we want to use

In [31]:
# Copy the labelled test export to test.csv (overwriting any existing file).
# shutil is used instead of `!cp` so the cell does not depend on a POSIX shell
# or on shell quoting of the interpolated path. The trailing `;` suppresses
# the returned destination path in the cell output.
shutil.copyfile(f"{output_dir}/test_with_anom.csv", f"{output_dir}/test.csv");