# Siamese Network for Intrusion Detection
- Dataset: NSL-KDD (https://www.unb.ca/cic/datasets/nsl.html)
- Model: Siamese network trained with triplet loss (https://arxiv.org/abs/1503.03832)
- Results: in progress

## Declarations

In [1]:
# libraries
import numpy as np 
import pandas as pd 
from sklearn.impute import SimpleImputer 
from sklearn.preprocessing import LabelEncoder, OneHotEncoder 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from tensorflow.keras.layers import *
import os, random

import matplotlib.pyplot as plt


# Show every column and every row when a DataFrame is displayed (None = no limit).
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

%matplotlib inline

# Record the TensorFlow version this notebook was executed with.
print(f'TensorFlow version: {tf.__version__}')

TensorFlow version: 2.4.1


In [2]:
# Configuration for the "Network Intrusion Detection" CSV dataset.
# NOTE(review): 'path_test_csv' points at the same file as 'path_train_csv' - confirm
# whether a separate test file was intended.
# NOTE(review): this dict labels itself with 'Name' while NSLKDD_dataset_json uses
# 'title' - confirm which key readers of these dicts expect.
NID_dataset_json = {
    'Name': 'Network Intrusion Detection',
    'path_train_csv': 'dataset/Network_Intrusion_Detection/Train_data.csv',
    'path_test_csv': 'dataset/Network_Intrusion_Detection/Train_data.csv',
    'target_field': 'class',
}

# Configuration for the NSL-KDD CSV dataset (separate train and test files).
NSLKDD_dataset_json = {
    'title': 'NSL KDD',
    'path_train_csv': 'dataset/NSL_KDD/NSL_KDD_Train.csv',
    'path_test_csv': 'dataset/NSL_KDD/NSL_KDD_Test.csv',
    'target_field': 'class',
}

# Training hyperparameters read by the training cells further down.
hyperparameter = {
    'epoch': 150,
    'batch-size': 1024,
    'loss-function': 'triplet_loss',
}


# Make sure the output folders for the saved model and its checkpoints exist.
# makedirs creates the parent 'model' folder as well, and exist_ok=True makes the
# cell safe to re-run (no check-then-create race, no conditional-expression statement).
os.makedirs(os.path.join('model', 'checkpoint'), exist_ok=True)

In [3]:
# Select which dataset configuration the rest of the notebook uses
# (NSLKDD_dataset_json is the other configuration defined above).
dataset_json = NID_dataset_json

## Data 

In [4]:

def dataset(path_csv):
    """Read a CSV file and split it into features and target.

    Args:
        path_csv: Path of the CSV file to read.

    Returns:
        Tuple ``(X, y)``: ``X`` is a DataFrame holding every column except the
        last one, ``y`` is a Series holding the last column.
    """
    frame = pd.read_csv(path_csv)

    features = frame.iloc[:, :-1]   # all columns but the last
    target = frame.iloc[:, -1]      # last column only
    return features, target

def create_batch(batch_size, x_dataset, y_dataset ):
    """Sample a batch of (anchor, positive, negative) triplets.

    For each triplet a random sample is chosen as anchor, a random sample with
    the same label as positive (may be the anchor itself) and a random sample
    with a different label as negative.

    Args:
        batch_size: Number of triplets to draw.
        x_dataset: 2-D array of shape (n_samples, n_features).
        y_dataset: 1-D sequence of labels, one per row of ``x_dataset``.

    Returns:
        List ``[x_anchors, x_positives, x_negatives]``; each entry is a float
        array of shape (batch_size, n_features).

    Raises:
        ValueError: If the dataset is empty or a drawn anchor has no sample of
            a different label to act as negative.
    """
    y_dataset = np.asarray(y_dataset)
    n_samples = x_dataset.shape[0]
    # The row width is the number of features (axis 1), not the number of samples.
    feature_length = x_dataset.shape[1]
    if n_samples == 0:
        raise ValueError("create_batch needs a non-empty dataset")

    x_anchors = np.zeros((batch_size, feature_length))
    x_positives = np.zeros((batch_size, feature_length))
    x_negatives = np.zeros((batch_size, feature_length))

    # label -> (indices with that label, indices with any other label);
    # computed once per label instead of re-scanning the labels for every triplet.
    index_cache = {}

    for i in range(batch_size):
        # We need to find an anchor, a positive example and a negative example
        random_index = random.randint(0, n_samples - 1)
        label = y_dataset[random_index]

        if label not in index_cache:
            # flatnonzero always yields a 1-D index array, even for a single match
            # (np.squeeze(np.where(...)) collapses that case to a 0-d array).
            index_cache[label] = (np.flatnonzero(y_dataset == label),
                                  np.flatnonzero(y_dataset != label))
        indices_for_pos, indices_for_neg = index_cache[label]
        if len(indices_for_neg) == 0:
            raise ValueError("create_batch needs at least two distinct labels to pick a negative example")

        x_anchors[i] = x_dataset[random_index]
        x_positives[i] = x_dataset[indices_for_pos[random.randint(0, len(indices_for_pos) - 1)]]
        x_negatives[i] = x_dataset[indices_for_neg[random.randint(0, len(indices_for_neg) - 1)]]

    return [x_anchors, x_positives, x_negatives]

def preprocess_dataset(X,y):
    """One-hot encode the categorical feature columns and integer-encode the labels.

    Prints the shapes before and after encoding, the columns that contain
    missing values and the columns treated as categorical.

    Args:
        X: Feature DataFrame; columns of dtype ``object`` are treated as categorical.
        y: Labels, one per row of ``X``.

    Returns:
        Tuple ``(X, y)``: ``X`` with every categorical column replaced by its
        one-hot (dummy) columns, ``y`` as an array of integer class ids
        produced by ``LabelEncoder``.
    """
    print ('Preprocessing...')
    columns_with_nan = [name for name in X.columns if X[name].isnull().any()]
    object_columns = list(X.select_dtypes(include=['object']).columns)

    print ("\tpre-shape: ", str(X.shape), str(y.shape))
    print ("\tnan coloumns:", columns_with_nan)
    print ("\tcategorical coloumns:", object_columns)

    # Replace each categorical column by its one-hot encoded columns.
    X_encoded = pd.get_dummies(data=X, columns=object_columns)

    # Map every distinct label to an integer id.
    y_encoded = LabelEncoder().fit_transform(y)

    print ("\tpost-shape:", str(X_encoded.shape), str(y_encoded.shape))

    return X_encoded, y_encoded
 
# Load the raw CSV selected by dataset_json and split it into features / labels.
print ('Data loading...')
X,y=dataset(dataset_json['path_train_csv'])
print ('\tFileloaded...',dataset_json['path_train_csv'])

# One-hot encode categorical features and integer-encode the labels.
X,y=preprocess_dataset(X,y)

# Hold out 10% of the rows for testing; random_state makes the split reproducible.
print ('Train/Test sets...')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0)
print ('\tTrain:',X_train.shape, y_train.shape)
print ('\tTest:',X_test.shape, y_test.shape)

# Standardize every feature to zero mean / unit variance.
# The previous "/255." was an image-pixel scaling and does not bring tabular
# features onto a common scale. The scaler is fitted on the training split only,
# so no statistics of the test split leak into training.
feature_scaler = StandardScaler().fit(X_train.values.astype(np.float64))
X_train = feature_scaler.transform(X_train.values.astype(np.float64))
X_test = feature_scaler.transform(X_test.values.astype(np.float64))
print ('Scaling...done')

# Width of one feature vector; used as the input size of the embedding model.
num_of_features=X_train.shape[1]

Data loading...
	Fileloaded... dataset/Network_Intrusion_Detection/Train_data.csv
Preprocessing...
	pre-shape:  (25192, 41) (25192,)
	nan coloumns: []
	categorical coloumns: ['protocol_type', 'service', 'flag']
	post-shape: (25192, 118) (25192,)
Train/Test sets...
	Train: (22672, 118) (22672,)
	Test: (2520, 118) (2520,)
Reshape...done


In [5]:
def create_batch(batch_size,n_features, x_dataset,y_dataset):
    """Sample a batch of (anchor, positive, negative) triplets.

    For each triplet a random sample is chosen as anchor, a random sample with
    the same label as positive (may be the anchor itself) and a random sample
    with a different label as negative.

    Args:
        batch_size: Number of triplets to draw.
        n_features: Length of one feature vector (row width of ``x_dataset``).
        x_dataset: 2-D array of shape (n_samples, n_features).
        y_dataset: 1-D sequence of labels, one per row of ``x_dataset``.

    Returns:
        List ``[x_anchors, x_positives, x_negatives]``; each entry is a float
        array of shape (batch_size, n_features).

    Raises:
        ValueError: If the dataset is empty or a drawn anchor has no sample of
            a different label to act as negative.
    """
    y_dataset = np.asarray(y_dataset)
    n_samples = x_dataset.shape[0]
    if n_samples == 0:
        raise ValueError("create_batch needs a non-empty dataset")

    x_anchors = np.zeros((batch_size, n_features))
    x_positives = np.zeros((batch_size, n_features))
    x_negatives = np.zeros((batch_size, n_features))

    # label -> (indices with that label, indices with any other label);
    # computed once per label instead of re-scanning the labels for every triplet.
    index_cache = {}

    for i in range(batch_size):
        # We need to find an anchor, a positive example and a negative example
        random_index = random.randint(0, n_samples - 1)
        label = y_dataset[random_index]

        if label not in index_cache:
            # flatnonzero always yields a 1-D index array, even for a single match
            # (np.squeeze(np.where(...)) collapses that case to a 0-d array).
            index_cache[label] = (np.flatnonzero(y_dataset == label),
                                  np.flatnonzero(y_dataset != label))
        indices_for_pos, indices_for_neg = index_cache[label]
        if len(indices_for_neg) == 0:
            raise ValueError("create_batch needs at least two distinct labels to pick a negative example")

        x_anchors[i] = x_dataset[random_index]
        x_positives[i] = x_dataset[indices_for_pos[random.randint(0, len(indices_for_pos) - 1)]]
        x_negatives[i] = x_dataset[indices_for_neg[random.randint(0, len(indices_for_neg) - 1)]]

    return [x_anchors, x_positives, x_negatives]

# Smoke test: draw a single (anchor, positive, negative) triplet from the training split.
examples = create_batch(
    batch_size=1,
    n_features=num_of_features,
    x_dataset=X_train,
    y_dataset=y_train,
)
#print ('example:', (examples[0],examples[1],examples[2]))
#print ('example:', (y_train[0],y_train[1],y_train[2]))

## Model

In [6]:
# Length of the embedding vector produced for one input sample.
emb_size = 64

# Shared embedding network: one hidden ReLU layer followed by a sigmoid
# projection of size emb_size. The same instance is applied to the anchor,
# the positive and the negative input.
embedding_model = tf.keras.models.Sequential()
embedding_model.add(
    tf.keras.layers.Dense(64, activation='relu', input_shape=(num_of_features,))
)
embedding_model.add(tf.keras.layers.Dense(emb_size, activation='sigmoid'))

embedding_model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 64)                7616      
_________________________________________________________________
dense_1 (Dense)              (None, 64)                4160      
Total params: 11,776
Trainable params: 11,776
Non-trainable params: 0
_________________________________________________________________


2023-01-11 15:11:03.331740: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [7]:
# Margin added to (positive distance - negative distance) inside the loss.
alpha = 0.2
def triplet_loss(y_true, y_pred):
    """Per-sample triplet loss on concatenated embeddings.

    Args:
        y_true: Unused; present only because Keras passes targets to every loss.
        y_pred: Tensor whose columns are [anchor | positive | negative]; the first
            two parts are ``emb_size`` wide each and the remainder is the negative.

    Returns:
        Tensor with one value per row:
        ``max(mean((a - p)^2) - mean((a - n)^2) + alpha, 0)``.
    """
    anchor_emb = y_pred[:, :emb_size]
    positive_emb = y_pred[:, emb_size:2*emb_size]
    negative_emb = y_pred[:, 2*emb_size:]

    # Mean squared difference along the embedding axis.
    dist_to_positive = tf.reduce_mean(tf.square(anchor_emb - positive_emb), axis=1)
    dist_to_negative = tf.reduce_mean(tf.square(anchor_emb - negative_emb), axis=1)

    margin_violation = dist_to_positive - dist_to_negative + alpha
    return tf.maximum(margin_violation, 0.)

def data_generator(batch_size,x_dataset,y_dataset):
    """Endlessly yield triplet batches in the ``(inputs, targets)`` form used by ``model.fit``.

    Args:
        batch_size: Number of triplets per yielded batch.
        x_dataset: Feature array the triplets are sampled from.
        y_dataset: Labels matching ``x_dataset``.

    Yields:
        Tuple ``(triplets, targets)``: ``triplets`` is the list returned by
        ``create_batch``; ``targets`` is an all-zero array of shape
        (batch_size, 3*emb_size) (``triplet_loss`` never reads its ``y_true``).
    """
    while True:
        triplets = create_batch(batch_size,num_of_features,x_dataset,y_dataset)
        placeholder_targets = np.zeros((batch_size, 3*emb_size))
        yield triplets, placeholder_targets

### Siamese network

In [8]:
# Three inputs of identical shape: anchor, positive and negative sample.
triplet_inputs = [tf.keras.layers.Input(shape=(num_of_features,)) for _ in range(3)]
input_anchor, input_positive, input_negative = triplet_inputs

# The same embedding_model instance (shared weights) embeds all three inputs.
embedding_anchor, embedding_positive, embedding_negative = [
    embedding_model(branch_input) for branch_input in triplet_inputs
]

# Concatenate along the feature axis to [anchor | positive | negative];
# this is the layout triplet_loss slices apart again.
output = tf.keras.layers.concatenate(
    [embedding_anchor, embedding_positive, embedding_negative],
    axis=1,
)

model = tf.keras.models.Model(triplet_inputs, output)
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 118)]        0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 118)]        0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            [(None, 118)]        0                                            
__________________________________________________________________________________________________
sequential (Sequential)         (None, 64)           11776       input_1[0][0]                    
                                                                 input_2[0][0]                

## Train

In [9]:
# Early stopping on the training loss: stop after 5 epochs without an
# improvement of at least 0.01; the final (not the best) weights are kept.
es = tf.keras.callbacks.EarlyStopping(
    monitor="loss", mode="min",
    min_delta=0.01, patience=5,
    baseline=None, restore_best_weights=False,
    verbose=0,
)

# Checkpointing: at the end of each epoch write the weights (weights only,
# not the full model) whenever the training loss is the best seen so far.
chk = tf.keras.callbacks.ModelCheckpoint(
    filepath='model/checkpoint/',
    monitor="loss", mode="auto",
    save_best_only=True, save_weights_only=True,
    save_freq="epoch",
    options=None, initial_value_threshold=None,
    verbose=0,
)

In [10]:
# Pull the training settings out of the hyperparameter dict.
epochs = hyperparameter['epoch']
batch_size = hyperparameter['batch-size']
# Read for reference only: compile() below receives the triplet_loss function
# itself, not this string.
loss_function = hyperparameter['loss-function']

# Number of generator batches that make up one epoch (whole batches only).
steps_per_epoch = X_train.shape[0] // batch_size

model.compile(optimizer='adam', loss=triplet_loss)

In [11]:
# Train on an endless stream of randomly sampled triplets; no validation data
# is passed, so both callbacks act on the training loss only.
training_stream = data_generator(batch_size, X_train, y_train)
history = model.fit(
    training_stream,
    epochs=epochs,
    steps_per_epoch=steps_per_epoch,
    callbacks=[es, chk],
    verbose=True,
)

Epoch 1/150
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: invalid syntax (tmpo0lfe7z3.py, line 10)
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: invalid syntax (tmpo0lfe7z3.py, line 10)


2023-01-11 15:11:03.981503: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:116] None of the MLIR optimization passes are enabled (registered 2)


Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 12/150
Epoch 13/150
Epoch 14/150
Epoch 15/150
Epoch 16/150
Epoch 17/150


In [12]:
# evaluate
#test_loss, test_acc = model.evaluate(data_generator(batch_size, X_test, y_test))
#test_loss, test_acc = model.evaluate( X_test, y_test)

## Save/Load

### save model

In [13]:
# Persist the architecture (JSON) and the weights (HDF5) as two separate files
# inside the 'model' folder.
architecture_path = os.path.join('model', "model.json")
weights_path = os.path.join('model', "model.h5")

model_json = model.to_json()
with open(architecture_path, "w") as json_file:
    json_file.write(model_json)

# serialize weights to HDF5
model.save_weights(weights_path)


### load model

In [16]:
# Rebuild the architecture from the saved JSON; the context manager closes the
# file even if reading fails.
with open(os.path.join('model',"model.json"), "r") as json_file:
    loaded_model_json = json_file.read()
model = tf.keras.models.model_from_json(loaded_model_json)

# Load the trained weights into the rebuilt model.
model.load_weights(os.path.join('model',"model.h5"))

# Compile with the loss the network was trained with. The model outputs the
# concatenated [anchor | positive | negative] embeddings, so binary
# cross-entropy and accuracy are not defined for it.
model.compile(optimizer='adam',
              loss=triplet_loss)

# Evaluate
#test_loss, test_acc = model.evaluate(X_test, y_test)