<a href="https://colab.research.google.com/github/arionas00/M2E5A2/blob/main/IDS_example.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [62]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
An example Intrusion Detection application using Dense, Conv1d and Lstm layers
please cite below works if you find it useful:
Akgun, Devrim, Selman Hizal, and Unal Cavusoglu. "A new DDoS attacks intrusion detection
model based on deep learning for cybersecurity." Computers & Security 118 (2022): 102748.

Hizal, Selman, Ünal ÇAVUŞOĞLU, and Devrim AKGÜN. "A New Deep Learning Based Intrusion
Detection System for Cloud Security." 2021 3rd International Congress on Human-Computer
Interaction, Optimization and Robotic Applications (HORA). IEEE, 2021.
"""


import matplotlib.pyplot as plt
from tensorflow.keras.utils import plot_model
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from keras.callbacks import ModelCheckpoint, EarlyStopping
import os
from tensorflow.keras.utils import to_categorical
from sklearn.utils import class_weight
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense
from tensorflow.keras.optimizers import Adam

epochs = 100
nclass = 12

def loadDataset():
    # Load dataset
    filename = 'https://github.com/kdemertzis/EKPA/raw/main/Data/pcap_data.csv'
    trainfile = pd.read_csv(filename)
    print(trainfile.head())
    print(trainfile.columns)
    print(trainfile.shape)

    data = trainfile.to_numpy()
    # Assuming the target column is the last one
    target_col_index = -1  # Adjust index based on actual dataset structure
    data = data[data[:, target_col_index] != 'DrDoS_LDAP']
    np.random.shuffle(data)

    label = data[:, target_col_index].astype('str')

    # Encoding labels
    label[label == 'WebDDoS']       = 0
    label[label == 'BENIGN']        = 1
    label[label == 'UDP-lag']       = 2
    label[label == 'DrDoS_NTP']     = 3
    label[label == 'Syn']           = 4
    label[label == 'DrDoS_SSDP']    = 5
    label[label == 'DrDoS_UDP']     = 6
    label[label == 'DrDoS_NetBIOS'] = 7
    label[label == 'DrDoS_MSSQL']   = 8
    label[label == 'DrDoS_SNMP']    = 9
    label[label == 'TFTP']          = 10
    label[label == 'DrDoS_DNS']     = 11

    # SELECT FEATURES ----------------------------------------------------
    inx_sel = -1 + np.array([38, 47, 37, 48, 11, 9, 7, 52, 10, 36, 1, 34, 4, 17, 19, 57, 21,
                             18, 22, 24, 32, 50, 23, 55, 51, 5, 3, 39, 40, 43, 58, 12, 25,
                             20, 2, 35, 67, 33, 6, 53])

    # MIN-MAX normalization
    data = data[:, inx_sel]
    dmin = data.min(axis=0)
    dmax = data.max(axis=0)
    data = (data - dmin) / (dmax - dmin)

    # Test data 20%
    train_data, test_data, train_label, test_label = train_test_split(data, label, test_size=0.20, stratify=label)

    # Train 70%, Validation 10%
    train_data, val_data, train_label, val_label = train_test_split(train_data, train_label, test_size=0.125, stratify=train_label)

    return train_data.astype('float32'), train_label.astype('int32'), val_data.astype('float32'), val_label.astype('int32'), test_data.astype('float32'), test_label.astype('int32')

# -- LOAD DATA -----------------------------------------------------------------
train_data, train_labelp, val_data, val_labelp, test_data, test_labelp = loadDataset()

# to_categorical
train_label = to_categorical(train_labelp, nclass)
val_label   = to_categorical(val_labelp, nclass)
test_label  = to_categorical(test_labelp, nclass)

print('train_data.shape=', train_data.shape)
print('test_data.shape=', test_data.shape)
print('val_data.shape=', val_data.shape)

#get the number of features
inshape = train_data.shape[1]

# Class balancing weights
class_weights = class_weight.compute_class_weight(class_weight='balanced', classes=np.unique(train_labelp), y=train_labelp)
class_weights = {i: class_weights[i] for i in range(len(class_weights))}

# -- CALLBACKS -----------------------------------------------------------------
earlyStopping = EarlyStopping(monitor='val_loss', patience=30, verbose=0, mode='min')

modelCheckPoint = ModelCheckpoint('./savemodels/model5class.weights.{epoch:03d}-{val_accuracy:.4f}.hdf5',
                                  save_best_only=True, monitor='val_accuracy', mode='max')

# -- Baseline models-----------------------------------------------------------
# Note: Ensure `models_ddos` is defined or replace with appropriate model definitions.
# For example:
model = Sequential([
    Conv1D(64, 5, activation='relu', input_shape=(inshape, 1)),
    MaxPooling1D(pool_size=2),
    Flatten(),
    Dense(64, activation='relu'),
    Dense(nclass, activation='softmax')
])

model.compile(optimizer=Adam(learning_rate=1e-4), loss='categorical_crossentropy', metrics=['accuracy'])

model.summary()

# -- TRAIN MODEL --------------------------------------------------------------
history = model.fit(train_data, train_label, shuffle=True, epochs=epochs, batch_size=256,
                    validation_data=(val_data, val_label), callbacks=[modelCheckPoint],
                    class_weight=class_weights, workers=3)

# -- Load best model ----------------------------------------------------------
str_models = os.listdir('./savemodels')
str_models = np.sort(str_models)
best_model = str_models[-1]
print('best_model=', best_model)
model.load_weights('./savemodels/'+best_model)

# --Confusion matrix ----------------------------------------------------------
print('TEST DATA-Confusion matrix:')
pred = model.predict(test_data)
pred_y = pred.argmax(axis=-1)

cm = confusion_matrix(test_labelp.astype('int32'), pred_y)
print(cm)

print('Accuracy ratios for each class')
for i, label in enumerate(["WebDDoS", "BENIGN", "UDP-lag", "DrDoS_NTP", "Syn", "DrDoS_SSDP", "DrDoS_UDP", "DrDoS_NetBIOS", "DrDoS_MSSQL", "DrDoS_SNMP", "TFTP", "DrDoS_DNS"]):
    print(f'{label} =', cm[i, i] / np.sum(cm[i, :]))

# -- Confusion matrix plot
cmo = ConfusionMatrixDisplay(cm, display_labels=["WebDDoS", "BENIGN", "UDP-lag", "DrDoS_NTP", "Syn", "DrDoS_SSDP", "DrDoS_UDP", "DrDoS_NetBIOS", "DrDoS_MSSQL", "DrDoS_SNMP", "TFTP", "DrDoS_DNS"])
fig, ax = plt.subplots(figsize=(12,12))
cmo.plot(ax=ax, xticks_rotation=45)

# Plot training and validation accuracy and loss graphs
acc = history.history['accuracy']
val_acc = history.history['val_accuracy']
loss = history.history['loss']
val_loss = history.history['val_loss']

np.save('historydata.npy', [acc, val_acc, loss, val_loss])
[acc, val_acc, loss, val_loss] = np.load('historydata.npy', allow_pickle=True)

plt.figure()
epochs = range(len(acc))
plt.plot(epochs, acc, 'b', label='Training accuracy')
plt.plot(epochs, val_acc, 'r.', label='Validation accuracy')
plt.title('Training and validation accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()

plt.figure()
plt.plot(epochs, loss, 'b', label='Training loss')
plt.plot(epochs, val_loss, 'r.', label='Validation loss')
plt.title('Training and validation loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()



   Avg_syn_flag  Avg_urg_flag  Avg_fin_flag  Avg_ack_flag  Avg_psh_flag  \
0     -0.230455      4.523595     -1.276876      0.814947     -0.016410   
1     -2.088776      2.387204      0.338205      1.419440      0.114833   
2     -0.043220      3.578791     -1.073984      0.313930     -1.774397   
3      0.065305      3.761012      1.174174      0.761965      0.857883   
4      3.875263      1.385111     -2.633112     -0.545981     -1.570862   

   Avg_rst_flag  Avg_DNS_pkt  Avg_TCP_pkt  Avg_UDP_pkt  Avg_ICMP_pkt  ...  \
0     -2.995087     1.063889    -2.371085    -2.840079     -0.283463  ...   
1      0.913599     0.847367    -0.008734    -1.058475      0.342997  ...   
2     -1.006298     0.929811    -3.432328    -1.932374      0.318437  ...   
3     -0.597540    -0.022305     1.695764     3.305753      0.792997  ...   
4     -3.460744    -1.882090    -2.115882    -2.954608      0.986303  ...   

   Min_pkts_lenght  Max_pkts_lenght  StDev_pkts_lenght  Avg_small_payload_pkt  \
0    

IndexError: index 37 is out of bounds for axis 1 with size 26