Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.

SPDX-License-Identifier: Apache-2.0

# Train GDN Model with BATADAL data

## Table of Contents

1. Load processed data
  * Load sensor columns
  * Load train and test CSVs
2. Load model and environment config files
3. Train GDN model 
4. Perform inference on test set

In [1]:
import sys

# The project package lives outside the notebook directory; register its
# location on the module search path so `model.GDN...` can be imported.
src_dir = '../../src/anomaly_detection_spatial_temporal_data/'
sys.path.append(src_dir)

In [2]:
import pandas as pd
import numpy as np
import yaml
from model.GDN.GDNTrainer import GDNTrainer

  from .autonotebook import tqdm as notebook_tqdm


# Load data

### Load sensor columns

In [3]:
# Directory (relative to this notebook) holding the processed BATADAL inputs
# read below: the sensor list and the train/test CSVs.
data_dir = "../../data/03_primary/iot"

In [4]:
# Read the sensor names, one per line. Blank lines (for example the empty
# entry that a trailing newline would produce with split("\n")) are dropped
# so they cannot end up as a bogus sensor name.
with open(f"{data_dir}/iot_sensor_list_batadal.txt", "r") as f:
    sensors = [name for name in f.read().splitlines() if name.strip()]

In [5]:
# Sanity check: report how many sensor names were loaded.
sensor_count = len(sensors)
print(f"Number of sensors: {sensor_count}")

Number of sensors: 43


In [6]:
# Show the sensor names in full; this list is later passed to GDNTrainer.
print(sensors)

['L_T1', 'L_T2', 'L_T3', 'L_T4', 'L_T5', 'L_T6', 'L_T7', 'F_PU1', 'S_PU1', 'F_PU2', 'S_PU2', 'F_PU3', 'S_PU3', 'F_PU4', 'S_PU4', 'F_PU5', 'S_PU5', 'F_PU6', 'S_PU6', 'F_PU7', 'S_PU7', 'F_PU8', 'S_PU8', 'F_PU9', 'S_PU9', 'F_PU10', 'S_PU10', 'F_PU11', 'S_PU11', 'F_V2', 'S_V2', 'P_J280', 'P_J269', 'P_J300', 'P_J256', 'P_J289', 'P_J415', 'P_J302', 'P_J306', 'P_J307', 'P_J317', 'P_J14', 'P_J422']


### Load train and test CSVs

In [7]:
# Load the train and test splits prepared for GDN.
train_df = pd.read_csv(f"{data_dir}/iot_gdn_train.csv")
test_df = pd.read_csv(f"{data_dir}/iot_gdn_test.csv")

# Print (rows, columns) for each split, train first. In the recorded output
# the test split has one more column than the train split (presumably the
# attack label -- TODO confirm).
for frame in (train_df, test_df):
    print(frame.shape)


(8761, 44)
(2089, 45)


# Load model and environment config files

In [8]:
# YAML file containing the GDN training parameters ("train_config") and the
# per-dataset environment settings ("env_config_iot", "env_config_wifi").
model_config_file = "../../conf/base/parameters/gdn.yml"

In [9]:
# Parse the GDN parameter file. A malformed file now raises yaml.YAMLError
# here instead of being printed and ignored, which would leave `model_config`
# undefined (or stale from an earlier run) for the cells below.
with open(model_config_file, "r") as stream:
    model_config = yaml.safe_load(stream)

print(model_config)

{'env_config_iot': {'checkpoint_save_dir': 'data/07_model_output/iot/gdn', 'report': 'best', 'device': 'cpu', 'load_model_path': ''}, 'env_config_wifi': {'checkpoint_save_dir': 'data/07_model_output/wifi/gdn', 'report': 'best', 'device': 'cpu', 'load_model_path': ''}, 'train_config': {'seed': 5, 'batch': 32, 'slide_win': 5, 'dim': 64, 'out_layer_num': 1, 'slide_stride': 1, 'topk': 5, 'out_layer_inter_dim': 128, 'val_ratio': 0.2, 'decay': 0, 'epoch': 3, 'comment': ''}}


In [10]:
# Training hyper-parameters (seed, batch size, window size, epochs, ...).
train_config = model_config["train_config"]

# This notebook trains on the BATADAL (IoT) data, so start from the IoT
# environment section rather than the Wi-Fi one. Copy it so the checkpoint
# override below does not modify `model_config` in place.
env_config = dict(model_config["env_config_iot"])

# Keep checkpoints produced by this notebook in their own directory.
env_config["checkpoint_save_dir"] = "../../data/07_model_output/gdn-iot-notebook"

# Train model

In [11]:
# With the loaded configuration, training runs for 3 epochs.
# To train for a different number of epochs, uncomment the next line
# before the trainer object is created:
# train_config["epoch"] = 5

trainer = GDNTrainer(
    sensors,
    train_df,
    test_df,
    train_config,
    env_config,
)

In [12]:
# Train the model; one progress line with the loss values is printed per epoch.
# Presumably checkpoints are written to env_config["checkpoint_save_dir"] -- TODO confirm.
trainer.run()

epoch (0 / 3) (Loss:0.10123464, ACU_loss:22.17038678)
epoch (1 / 3) (Loss:0.05779031, ACU_loss:12.65607802)
epoch (2 / 3) (Loss:0.05135712, ACU_loss:11.24720880)


# Run inference

In [13]:
# Run inference with the trained model; presumably `pred` holds the
# predictions for the test set and `labels` the matching ground-truth
# labels -- TODO confirm against GDNTrainer.predict.
pred, labels = trainer.predict()

In [None]:
# The output has 5 fewer rows than the test set: the windowing done in the
# TimeDataset constructor needs `slide_win` (= 5 in train_config) preceding
# rows for each prediction.
# Uncomment to inspect the shape of the predictions:
# pred.shape

In [None]:
# Uncomment to inspect the shape of the labels returned by trainer.predict():
# np.array(labels).shape

# References
Riccardo Taormina and Stefano Galelli and Nils Ole Tippenhauer and Elad Salomons and Avi Ostfeld and Demetrios G. Eliades and Mohsen Aghashahi and Raanju Sundararajan and Mohsen Pourahmadi and M. Katherine Banks and B. M. Brentan and Enrique Campbell and G. Lima and D. Manzi and D. Ayala-Cabrera and M. Herrera and I. Montalvo and J. Izquierdo and E. Luvizotto and Sarin E. Chandy and Amin Rasekh and Zachary A. Barker and Bruce Campbell and M. Ehsan Shafiee and Marcio Giacomoni and Nikolaos Gatsis and Ahmad Taha and Ahmed A. Abokifa and Kelsey Haddad and Cynthia S. Lo and Pratim Biswas and M. Fayzul K. Pasha and Bijay Kc and Saravanakumar Lakshmanan Somasundaram and Mashor Housh and Ziv Ohar; "The Battle Of The Attack Detection Algorithms: Disclosing Cyber Attacks On Water Distribution Networks." Journal of Water Resources Planning and Management, 144 (8), August 2018

Ailin Deng and Bryan Hooi. 2021. Graph Neural Network-Based Anomaly Detection in Multivariate Time Series. CoRR abs/2106.06947, (2021). Retrieved from https://arxiv.org/abs/2106.06947 