# CIDS: Conventional Intrusion Detection System


---


This notebook contains code for training multiple conventional (non-federated) autoencoder-based anomaly detectors for the SynCAN dataset.

# Setup

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
from psutil import virtual_memory
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
print(f'GPU: {gpu_info[599:619]}')
ram_gb = virtual_memory().total / 1e9
print(f'RAM: {ram_gb:.1f} GB')

GPU: NVIDIA A100-SXM...  
RAM: 89.6 GB


In [3]:
import numpy as np
import pandas as pd
import tensorflow as tf
from keras import layers
import matplotlib.pyplot as plt

In [4]:
signal_counts = [2, 3, 2, 1, 2, 2, 2, 1, 1, 4] # number of signals contained in each message ID

In [5]:
START_ID = 1
END_ID = 10

TRAIN_SPLIT = 0.8
VAL_SPLIT = 0.1
TIME_STEPS = 100
SEQ_STRIDE = TIME_STEPS // 2
EPOCHS = 2
BATCH_SIZE = 64
LEARNING_RATE = 0.001
DROP_OUT = True
LOSS_FUNCTION = 'mse'
METRIC = 'mean_absolute_percentage_error'
PATIENCE = 1
DIRECTORY = "/content/drive/MyDrive/Colab Notebooks/models/cids/"

In [6]:
# Download SynCAN dataset from the ETAS github
!git clone https://github.com/etas/SynCAN.git &> /dev/null
!unzip ./SynCAN/\*.zip -d ./SynCAN/. &> /dev/null
!rm ./SynCAN/*.zip &> /dev/null

In [7]:
def train_csv2df(dir_path): # imports training files into a dataframe
    data_frames = []
    csv_path = dir_path + '/train_1.csv'    # train 1 contains header
    df_temp = pd.read_csv(csv_path)
    data_frames.append(df_temp)
    for i in range(2, 5):
        csv_path = dir_path + '/train_' + str(i) + '.csv'
        df_temp = pd.read_csv(csv_path, header=None, names=['Label',  'Time', 'ID',
                                                            'Signal1', 'Signal2',  'Signal3', 'Signal4'])
        data_frames.append(df_temp)
    df = pd.concat(data_frames)
    return df

def prepare_train_df(): # convert csv, isolate message ID and remove unused columns
    df = train_csv2df('/content/SynCAN')
    print(f'{len(df):,} total messages (id1,id2,...,id10)')

    df = df[:][df.ID==MSG_ID] # use only messages with MSG_ID
    df = df.dropna(axis=1, how='all') # remove unused signal columns
    df.reset_index(drop=True, inplace=True)

    df_labels = df.iloc[:,0:1].astype(int) # dataframe of labels (removing Time and ID columns)
    df = df.iloc[:,3:] # dataframe of signal values
    df = df_labels.join(df)
    print(f'{len(df):,} messages used ({MSG_ID})\n')
    return df

def timeseries_dataset(
    data,
    targets,
    sequence_length,
    data_is_target=False,
    sequence_stride=1,
    sampling_rate=1,
    batch_size=1,
    shuffle=False,
    seed=None,
    start_index=None,
    end_index=None):
  
  if start_index is None:
    start_index = 0
  if end_index is None:
    end_index = len(data)

  # Determine the lowest dtype to store start positions (to lower memory usage).
  num_seqs = end_index - start_index - (sequence_length * sampling_rate) + 1
  if targets is not None:
    num_seqs = min(num_seqs, len(targets))
  if num_seqs < 2147483647:
    index_dtype = 'int32'
  else:
    index_dtype = 'int64'

  # Generate start positions
  start_positions = np.arange(0, num_seqs, sequence_stride, dtype=index_dtype)
  if shuffle:
    if seed is None:
      seed = np.random.randint(1e6)
    rng = np.random.RandomState(seed)
    rng.shuffle(start_positions)

  sequence_length = tf.cast(sequence_length, dtype=index_dtype)
  sampling_rate = tf.cast(sampling_rate, dtype=index_dtype)

  positions_ds = tf.data.Dataset.from_tensors(start_positions).repeat()

  # For each initial window position, generates indices of the window elements
  indices = tf.data.Dataset.zip(
      (tf.data.Dataset.range(len(start_positions)), positions_ds)).map(
          lambda i, positions: tf.range(  # pylint: disable=g-long-lambda
              positions[i],
              positions[i] + sequence_length * sampling_rate,
              sampling_rate),
          num_parallel_calls=tf.data.AUTOTUNE)

  dataset = sequences_from_indices(data, indices, start_index, end_index)
  if targets is not None:
    indices = tf.data.Dataset.zip(
        (tf.data.Dataset.range(len(start_positions)), positions_ds)).map(
            lambda i, positions: positions[i],
            num_parallel_calls=tf.data.AUTOTUNE)
    target_ds = sequences_from_indices(
        targets, indices, start_index, end_index)
    dataset = tf.data.Dataset.zip((dataset, target_ds))
  else:
    if data_is_target:
        dataset = tf.data.Dataset.zip((dataset, dataset))
  dataset = dataset.prefetch(tf.data.AUTOTUNE)
  if batch_size is not None:
    if shuffle:
      # Shuffle locally at each iteration
      dataset = dataset.shuffle(buffer_size=batch_size * 8, seed=seed)
    dataset = dataset.batch(batch_size)
  else:
    if shuffle:
      dataset = dataset.shuffle(buffer_size=1024, seed=seed)
  return dataset

def sequences_from_indices(array, indices_ds, start_index, end_index):
  dataset = tf.data.Dataset.from_tensors(array[start_index : end_index])
  dataset = tf.data.Dataset.zip((dataset.repeat(), indices_ds)).map(
      lambda steps, inds: tf.gather(steps, inds),  # pylint: disable=unnecessary-lambda
      num_parallel_calls=tf.data.AUTOTUNE)
  return dataset

def SynCAN_train_ds(df):
    values = df.to_numpy()[:,1:]
    ds = timeseries_dataset(
        values,
        None,
        data_is_target = True,
        sequence_length = TIME_STEPS,
        sequence_stride = SEQ_STRIDE,
        # batch_size = BATCH_SIZE
    )
    return ds

def get_CIDS_model(time_steps, input_dim, latent_dim, drop_out=False):
    inputs = layers.Input(shape=(time_steps, input_dim)) # shape = (time_steps, data_dimension/num_features)
    # encoder
    x = layers.Dense(latent_dim*2, activation='tanh')(inputs)
    if drop_out: x = layers.Dropout(0.2)(x)
    enc_out, enc_hidden = layers.GRU(latent_dim, return_sequences=True, return_state=True, activation="tanh")(x)
    if drop_out:
        enc_out = layers.Dropout(0.2)(enc_out)
        enc_hidden = layers.Dropout(0.2)(enc_hidden)
    # decoder
    dec_out , dec_hidden = layers.GRU(latent_dim, return_sequences=True, return_state=True, activation="tanh")(enc_out)
    if drop_out:
        dec_out = layers.Dropout(0.2)(dec_out)
        dec_hidden = layers.Dropout(0.2)(dec_hidden)
    
    outputs = layers.Dense(input_dim, activation='tanh')(dec_out)
    model = tf.keras.Model(inputs, outputs)
    return model


# Model Training Loop

In [8]:
for ID in range(START_ID, END_ID+1):
    MSG_ID = 'id'+str(ID)      # each message contains 1-4 signals that are associated with the specified message ID
    NUM_SIGNALS = signal_counts[ID-1]
    LATENT_DIM = 32 * NUM_SIGNALS
    FILEPATH = DIRECTORY+MSG_ID+"_model_"+str(TIME_STEPS)+".h5"
    df = prepare_train_df()
    data_length = len(df)
    train_size = (int(data_length*TRAIN_SPLIT) // SEQ_STRIDE) * SEQ_STRIDE
    val_size = (int(data_length*VAL_SPLIT) // SEQ_STRIDE) * SEQ_STRIDE
    test_size = ((data_length - train_size - val_size) // SEQ_STRIDE) * SEQ_STRIDE

    train_df = df.iloc[:train_size]
    val_df = df[train_size:train_size+val_size]
    test_df = df.iloc[-test_size:]

    train_ds = SynCAN_train_ds(train_df)
    val_ds = SynCAN_train_ds(val_df)
    test_ds = SynCAN_train_ds(test_df)

    datasets = {'train': train_ds, 'val': val_ds, 'test': test_ds}
    for key, item in datasets.items():
        print(f"{key.upper()}: \t{item.__len__().numpy():,} subsequences of length {TIME_STEPS}")

    callbacks_list = [
        tf.keras.callbacks.EarlyStopping(
            monitor="val_loss",
            patience=PATIENCE,
        ),
        tf.keras.callbacks.ModelCheckpoint(
            filepath=FILEPATH,
            monitor="val_loss",
            save_best_only=True,
        )
    ]

    model = get_CIDS_model(TIME_STEPS, input_dim=NUM_SIGNALS, latent_dim=LATENT_DIM, drop_out=DROP_OUT)

    model.compile(optimizer=tf.optimizers.Adam(learning_rate=LEARNING_RATE),
        loss=LOSS_FUNCTION,
        metrics=[METRIC])
    model.summary()

    history = model.fit(train_ds,
                        epochs=EPOCHS,
                        validation_data=val_ds,
                        use_multiprocessing=True,
                        workers=6,
                        batch_size=BATCH_SIZE,
                        shuffle=True,
                        callbacks=callbacks_list)

    loss, metric = model.evaluate(val_ds, verbose=0)
    print(f'Validation loss: {loss:.5}\nValidation {METRIC}: {metric:.5}')
    print()

    loss, metric = model.evaluate(test_ds, verbose=0)
    print(f'Test loss: {loss:.5}\nTest {METRIC}: {metric:.5}')        # current best: 0.000229148383368738

29,669,726 total messages (id1,id2,...,id10)
4,139,819 messages used (id1)

TRAIN: 	66,236 subsequences of length 100
VAL: 	8,278 subsequences of length 100
TEST: 	8,279 subsequences of length 100
Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 100, 2)]          0         
                                                                 
 dense (Dense)               (None, 100, 128)          384       
                                                                 
 dropout (Dropout)           (None, 100, 128)          0         
                                                                 
 gru (GRU)                   [(None, 100, 64),         37248     
                              (None, 64)]                        
                                                                 
 dropout_1 (Dropout)         (None, 100, 64)           0      