# LuNet

##### Install dependencies

In [1]:
!pip install pyarrow

Collecting pyarrow
  Downloading pyarrow-2.0.0-cp37-cp37m-manylinux2014_x86_64.whl (17.7 MB)
[K     |████████████████████████████████| 17.7 MB 6.5 MB/s eta 0:00:01       | 7.0 MB 6.5 MB/s eta 0:00:02     |███████████████████████▎        | 12.9 MB 6.5 MB/s eta 0:00:01
Installing collected packages: pyarrow
Successfully installed pyarrow-2.0.0


In [2]:
import pandas as pd
import numpy as np
import os

DIR = './Datasets/local/'

dfs = {}
names = []
for filename in os.listdir(DIR):
    dfs[filename] = pd.read_feather(DIR + filename)
    dfs[filename].columns = dfs[filename].columns.str.replace(' ', '_')
    names.append(filename)

In [3]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.python.keras.layers import LSTM, Dense, Conv1D, MaxPooling1D, Dropout, Reshape, BatchNormalization, Flatten, GlobalAveragePooling1D
from tensorflow.python.keras.models import Sequential

def create_model():
    model = Sequential()
    model.add(Conv1D(filters=64, kernel_size=3, activation='relu', input_shape=(76, 1)))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Flatten())
    model.add(BatchNormalization(axis=0))
    model.add(Reshape((-1, 64), input_shape=(64,)))
    model.add(LSTM(64, return_sequences = True, activation='tanh'))
    model.add(Dense(128))
    model.add(Dropout(.5))
    
    model.add(Conv1D(filters=128, kernel_size=3, activation='relu'))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Flatten())
    model.add(BatchNormalization(axis=0))
    model.add(Reshape((-1, 128), input_shape=(128,)))
    model.add(LSTM(128, return_sequences = True, activation='tanh'))
    model.add(Dense(256))
    model.add(Dropout(.5))
    
    model.add(Conv1D(filters=256, kernel_size=3, activation='relu'))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Flatten())
    model.add(BatchNormalization(axis=0))
    model.add(Reshape((-1, 256), input_shape=(256,)))
    model.add(LSTM(256, return_sequences = True, activation='tanh'))
    model.add(Dense(512))
    model.add(Dropout(.5))
    
    model.add(Conv1D(filters=512, kernel_size=3, activation='relu'))
    model.add(GlobalAveragePooling1D())
    model.add(Dropout(.5))
    model.add(Dense(1, activation='sigmoid'))
    return model

In [11]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
dataframe = dfs["cicdos2017"]
y = dataframe['label']
scaler = MinMaxScaler()
df = dataframe.drop(columns=['label'])
X = scaler.fit_transform(df)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2)
model = create_model()
model.compile(optimizer='Adam', loss = 'binary_crossentropy', metrics=['accuracy'])
history = model.fit(X_train.reshape(156264,76,1), y_train, validation_data=(X_val.reshape(39067,76,1), y_val), epochs = 200, batch_size=2048, callbacks = [tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=20, min_delta=0.0001)], verbose = 0)

ValueError: ('Input has undefined `axis` dimension. Input shape: ', TensorShape([None, 2368]))

In [None]:
loss, accuracy = model.evaluate(X_test.reshape(96209,76,1), y_test, batch_size = 2048, verbose = 0)
print('loss:', loss, ' accuracy: ', accuracy)

In [16]:
df2017_dos = [ "Wednesday-workingHours-DoS" ]
df2017_portscan = [ "Friday-WorkingHours-Afternoon-PortScan" ]
df2017_botnet = [ "Friday-WorkingHours-Morning-Botnet" ]
df2017_infiltration = [ "Thursday-WorkingHours-Afternoon-Infiltration" ]
df2017_webattacks = [ "Thursday-WorkingHours-Morning-WebAttacks" ]
df2017_bruteforce = [ "Tuesday-WorkingHours-Bruteforce" ]
df2017_ddos = [ "Friday-WorkingHours-Afternoon-DDoS" ]

df2018_botnet = [ "Friday-02-03-2018_TrafficForML_CICFlowMeter" ]
df2018_webattacks = [ "Thursday-22-02-2018_TrafficForML_CICFlowMeter",
                     "Friday-23-02-2018_TrafficForML_CICFlowMeter" ]
df2018_bruteforce = [ "Wednesday-14-02-2018_TrafficForML_CICFlowMeter" ]
df2018_infiltration = [ "Wednesday-28-02-2018_TrafficForML_CICFlowMeter",
                       "Thursday-01-03-2018_TrafficForML_CICFlowMeter" ]
df2018_dos = [ "Friday-16-02-2018_TrafficForML_CICFlowMeter",
                "Thursday-15-02-2018_TrafficForML_CICFlowMeter" ]
df2018_ddos = [ "Tuesday-20-02-2018_TrafficForML_CICFlowMeter",
               "Wednesday-21-02-2018_TrafficForML_CICFlowMeter",]

df2019_ddos = [ "01_12_DrDoS_DNS", "01_12_DrDoS_LDAP", "01_12_DrDoS_MSSQL", "01_12_DrDoS_NetBIOS",
                "01_12_DrDoS_NTP", "01_12_DrDoS_SNMP", "01_12_DrDoS_SSDP", "01_12_DrDoS_UDP",
                "01_12_Syn", "01_12_TFTP", "01_12_UDPLag", "03_11_LDAP", "03_11_MSSQL",
                "03_11_NetBIOS", "03_11_Portmap", "03_11_Syn", "03_11_UDP", "03_11_UDPLag" ]

dfs_ddos = [ df2019_ddos, df2018_ddos, df2017_ddos]
dfs_dos = [ df2018_dos, df2017_dos ]
dfs_botnet = [ df2017_botnet, df2018_botnet ]
dfs_infiltration = [ df2017_infiltration, df2018_infiltration]
dfs_webattacks = [ df2017_webattacks, df2018_webattacks ]
dfs_bruteforce = [ df2017_bruteforce, df2018_bruteforce ]
dfs_names = [ dfs_ddos, dfs_dos, dfs_botnet, dfs_infiltration, dfs_webattacks, dfs_bruteforce ]

In [17]:
dfs_2019 = dfs["01_12_DrDoS_DNS"]
for dataset in df2019_ddos:
    if dataset == "01_12_DrDoS_DNS":
        continue;
    dfs_2019.append(dfs[dataset])

dfs_2018 = dfs["Tuesday-20-02-2018_TrafficForML_CICFlowMeter"]
for dataset in df2018_ddos:
    if dataset == "01_12_DrDoS_DNS":
        continue;
dfs_2018.append(dfs[dataset])

dfs_2017 = dfs["Friday-WorkingHours-Afternoon-DDoS"]

ddos_datasets = {}
ddos_datasets["2017"] = dfs_2017
ddos_datasets["2018"] = dfs_2018
ddos_datasets["2019"] = dfs_2019

In [15]:
print(ddos_datasets["2019"].shape[0]*0.1)

665.5


In [18]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

dataset = ddos_datasets["2017"]
dataset.append(ddos_datasets["2018"])
#dataset.append(ddos_datasets["2018"].head(int(ddos_datasets["2019"].shape[0]*0.1)))
y = dataset['label']
scaler = MinMaxScaler()
df = dataset.drop(columns=['label'])
X = scaler.fit_transform(df)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2)

input_train_shape = X_train.shape
input_test_shape = X_test.shape 
input_val_shape = X_val.shape
input_shape = (input_train_shape[1], 1)
# Reshape the training data to include channels
input_train = X_train.reshape(input_train_shape[0], input_train_shape[1], 1)
input_val = X_val.reshape(input_val_shape[0], input_val_shape[1], 1)
input_test = X_test.reshape(input_test_shape[0], input_test_shape[1], 1)
# Parse numbers as floats
input_train = input_train.astype('float32')
input_val = input_val.astype('float32')
input_test = input_test.astype('float32')
# Normalize input data
input_train = input_train / 255
input_test = input_test / 255
input_val = input_val / 255

In [19]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.python.keras.layers import LSTM, Dense, Conv1D, MaxPooling1D, Dropout, Reshape, BatchNormalization, Flatten, GlobalAveragePooling1D
from tensorflow.python.keras.models import Sequential

def create_model():
    model = Sequential()
    model.add(Conv1D(filters=64, kernel_size=3, activation='relu', input_shape=input_shape))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Flatten())
    model.add(Reshape((-1, 64), input_shape=(64,)))
    model.add(BatchNormalization())
    model.add(LSTM(64, return_sequences = True, activation='tanh'))
    model.add(Dense(128))
    model.add(Dropout(.5))
    
    model.add(Conv1D(filters=128, kernel_size=3, activation='relu'))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Flatten())
    model.add(Reshape((-1, 128), input_shape=(128,)))
    model.add(BatchNormalization())
    model.add(LSTM(128, return_sequences = True, activation='tanh'))
    model.add(Dense(256))
    model.add(Dropout(.5))
    
    model.add(Conv1D(filters=256, kernel_size=3, activation='relu'))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Flatten())
    model.add(Reshape((-1, 256), input_shape=(256,)))
    model.add(BatchNormalization())
    model.add(LSTM(256, return_sequences = True, activation='tanh'))
    model.add(Dense(512))
    model.add(Dropout(.5))
    
    model.add(Conv1D(filters=512, kernel_size=3, activation='relu'))
    model.add(GlobalAveragePooling1D())
    model.add(Dropout(.5))
    model.add(Dense(1, activation='sigmoid'))
    return model

In [20]:
model = create_model()
model.compile(optimizer='Adam', loss = 'binary_crossentropy', metrics=['accuracy'])

In [21]:
history = model.fit(input_train, y_train, validation_data=(input_val, y_val), epochs = 30, batch_size=2048)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [22]:
loss, accuracy = model.evaluate(input_test, y_test, batch_size = 2048, verbose = 0)
print(str(int(accuracy*10000)/100) + "%")

99.07%


In [26]:
dataset_test = ddos_datasets["2019"]
y_true = dataset_test['label']
y_true = y_true.to_list()
scaler = MinMaxScaler()
df = dataset_test.drop(columns=['label'])
X = scaler.fit_transform(df)
y = model.predict(X.reshape(6655,76,1))
y_round = tf.make_ndarray(tf.make_tensor_proto(tf.round(y))).reshape([len(y)])
y_round = list(y_round)

In [27]:
from sklearn.metrics import balanced_accuracy_score

print("Gebalanceerde accuracy van model met een ongekende dataset:")
print(str(int(balanced_accuracy_score(y_true, y_round)*10000)/100) + "%")

Gebalanceerde accuracy van model met een ongekende dataset:
41.64%


In [28]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

dataset = ddos_datasets["2017"]
dataset.append(ddos_datasets["2018"])
trainingdataset, testingdataset = train_test_split(ddos_datasets["2019"], test_size=0.1)
dataset.append(trainingdataset)
y = dataset['label']
scaler = MinMaxScaler()
df = dataset.drop(columns=['label'])
X = scaler.fit_transform(df)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2)

input_train_shape = X_train.shape
input_test_shape = X_test.shape 
input_val_shape = X_val.shape
input_shape = (input_train_shape[1], 1)
# Reshape the training data to include channels
input_train = X_train.reshape(input_train_shape[0], input_train_shape[1], 1)
input_val = X_val.reshape(input_val_shape[0], input_val_shape[1], 1)
input_test = X_test.reshape(input_test_shape[0], input_test_shape[1], 1)
# Parse numbers as floats
input_train = input_train.astype('float32')
input_val = input_val.astype('float32')
input_test = input_test.astype('float32')
# Normalize input data
input_train = input_train / 255
input_test = input_test / 255
input_val = input_val / 255

In [29]:
def create_model():
    model = Sequential()
    model.add(Conv1D(filters=64, kernel_size=3, activation='relu', input_shape=input_shape))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Flatten())
    model.add(Reshape((-1, 64), input_shape=(64,)))
    model.add(BatchNormalization())
    model.add(LSTM(64, return_sequences = True, activation='tanh'))
    model.add(Dense(128))
    model.add(Dropout(.5))
    
    model.add(Conv1D(filters=128, kernel_size=3, activation='relu'))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Flatten())
    model.add(Reshape((-1, 128), input_shape=(128,)))
    model.add(BatchNormalization())
    model.add(LSTM(128, return_sequences = True, activation='tanh'))
    model.add(Dense(256))
    model.add(Dropout(.5))
    
    model.add(Conv1D(filters=256, kernel_size=3, activation='relu'))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Flatten())
    model.add(Reshape((-1, 256), input_shape=(256,)))
    model.add(BatchNormalization())
    model.add(LSTM(256, return_sequences = True, activation='tanh'))
    model.add(Dense(512))
    model.add(Dropout(.5))
    
    model.add(Conv1D(filters=512, kernel_size=3, activation='relu'))
    model.add(GlobalAveragePooling1D())
    model.add(Dropout(.5))
    model.add(Dense(1, activation='sigmoid'))
    return model

In [30]:
model = create_model()
model.compile(optimizer='Adam', loss = 'binary_crossentropy', metrics=['accuracy'])
history = model.fit(input_train, y_train, validation_data=(input_val, y_val), epochs = 30, batch_size=2048)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [31]:
loss, accuracy = model.evaluate(input_test, y_test, batch_size = 2048, verbose = 0)
print(str(int(accuracy*10000)/100) + "%")

50.13%


In [35]:
dataset_test = testingdataset
y_true = dataset_test['label']
y_true = y_true.to_list()
scaler = MinMaxScaler()
df = dataset_test.drop(columns=['label'])
X = scaler.fit_transform(df)
y = model.predict(X.reshape(666,76,1))
y_round = tf.make_ndarray(tf.make_tensor_proto(tf.round(y))).reshape([len(y)])
y_round = list(y_round)

In [36]:
print("Gebalanceerde accuracy van model met een ongekende dataset:")
print(str(int(balanced_accuracy_score(y_true, y_round)*10000)/100) + "%")

Gebalanceerde accuracy van model met een ongekende dataset:
50.0%


In [38]:
from sklearn.metrics import f1_score
print("Gebalanceerde accuracy van model met een ongekende dataset:")
print(str(int(f1_score(y_true, y_round, average='weighted')*10000)/100) + "%")

Gebalanceerde accuracy van model met een ongekende dataset:
32.83%
