In [13]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import *
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, VotingClassifier

import xgboost, lightgbm
from mlxtend.classifier import EnsembleVoteClassifier

import warnings
warnings.filterwarnings("ignore")

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from kerastuner.tuners import RandomSearch, Hyperband, BayesianOptimization

In [8]:
training_df = pd.read_csv('../UNSW_NB15/UNSW_NB15_training-set.csv')
testing_df = pd.read_csv('../UNSW_NB15/UNSW_NB15_testing-set.csv')
combined_data = pd.concat([training_df, testing_df]).drop(['id'],axis=1)

combined_data.head()

Unnamed: 0,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,sttl,...,ct_dst_sport_ltm,ct_dst_src_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,is_sm_ips_ports,attack_cat,label
0,1.1e-05,udp,-,INT,2,0,496,0,90909.0902,254,...,1,2,0,0,0,1,2,0,Normal,0
1,8e-06,udp,-,INT,2,0,1762,0,125000.0003,254,...,1,2,0,0,0,1,2,0,Normal,0
2,5e-06,udp,-,INT,2,0,1068,0,200000.0051,254,...,1,3,0,0,0,1,3,0,Normal,0
3,6e-06,udp,-,INT,2,0,900,0,166666.6608,254,...,1,3,0,0,0,2,3,0,Normal,0
4,1e-05,udp,-,INT,2,0,2126,0,100000.0025,254,...,1,3,0,0,0,2,3,0,Normal,0


In [9]:
# Determine the categorical and numerical columns
categorical_columns = combined_data.select_dtypes(include=['object']).columns
print(f'The categorical columns are: {categorical_columns} \n')

# Determine how many unique values are in each categorical column
for column in categorical_columns:
    print(f'The column {column} has {combined_data[column].nunique()} unique values')

# Convert the categorical columns to numerical
le = LabelEncoder()
for column in categorical_columns:
    combined_data[column] = le.fit_transform(combined_data[column])

print("\n", combined_data.head())

The categorical columns are: Index(['proto', 'service', 'state', 'attack_cat'], dtype='object') 

The column proto has 133 unique values
The column service has 13 unique values
The column state has 11 unique values
The column attack_cat has 10 unique values

         dur  proto  service  state  spkts  dpkts  sbytes  dbytes         rate  \
0  0.000011    119        0      5      2      0     496       0   90909.0902   
1  0.000008    119        0      5      2      0    1762       0  125000.0003   
2  0.000005    119        0      5      2      0    1068       0  200000.0051   
3  0.000006    119        0      5      2      0     900       0  166666.6608   
4  0.000010    119        0      5      2      0    2126       0  100000.0025   

   sttl  ...  ct_dst_sport_ltm  ct_dst_src_ltm  is_ftp_login  ct_ftp_cmd  \
0   254  ...                 1               2             0           0   
1   254  ...                 1               2             0           0   
2   254  ...             

In [10]:
# Use the train_test_split function to split the data into training and testing sets
X = combined_data.drop(['label', 'attack_cat'], axis=1)
y = combined_data['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(206138, 42) (51535, 42) (206138,) (51535,)


In [35]:
# GRID SEARCH FOR BINARY LABEL CLASSIFICATION (whether the data is an attack or not)

tf.config.list_physical_devices('GPU')

def build_model(hp):
    model = Sequential()
    model.add(Dense(units=hp.Choice('units_input', [100, hp.Int('units_input', 32, 256, step=32)]), 
                    activation='relu', input_shape=(X_train.shape[1],)))
    model.add(Dropout(rate=hp.Float('dropout_1', 0.0, 0.5, step=0.1)))
    model.add(Dense(units=hp.Choice('units_hidden', [50, hp.Int('units_hidden', 32, 256, step=32)]), 
                    activation='relu'))
    model.add(Dropout(rate=hp.Float('dropout_2', 0.0, 0.5, step=0.1)))
    model.add(Dense(1, activation='sigmoid'))

    learning_rate_choice = hp.Choice('learning_rate', [0.001, hp.Float('learning_rate', 1e-4, 1e-2, sampling='log')])
    model.compile(optimizer=keras.optimizers.Adam(learning_rate=learning_rate_choice),
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model

# def build_model(hp):
#     model = Sequential()
#     # Since the best units_input was 128, we want to search around this value
#     model.add(Dense(units=hp.Choice('units_input', [128, 100, 150]), 
#                     activation='tanh', input_shape=(X_train.shape[1],)))
    
#     # Best dropout_1 was 0.0, suggesting dropout may not be needed here
#     model.add(Dropout(rate=hp.Choice('dropout_1', [0.0, 0.1, 0.2])))

#     # Since the best units_hidden was 224, we can narrow down around this number as well
#     model.add(Dense(units=hp.Choice('units_hidden', [224, 200, 250]), 
#                     activation='tanh'))

#     # Best dropout_2 was 0.2, we can try around this value to fine-tune
#     model.add(Dropout(rate=hp.Choice('dropout_2', [0.2, 0.1, 0.3])))

#     # Best learning rate was approximately 0.00099, search around this learning rate
#     model.add(Dense(1, activation='sigmoid'))
#     learning_rate_choice = hp.Choice('learning_rate', [0.00099, 0.001, 0.0001])
    
#     model.compile(optimizer=keras.optimizers.Adam(learning_rate=learning_rate_choice),
#                   loss='binary_crossentropy',
#                   metrics=['accuracy'])
#     return model

# Define the grid search function
def grid_search(build_model, project_name, X_train=X_train, y_train=y_train, max_epochs=300, factor=3, directory='my_dir'):
    tuner = Hyperband(
    build_model,
    objective='val_accuracy',
    max_epochs=max_epochs, 
    factor=factor,
    directory='my_dir',
    project_name=project_name
    )

    tuner.search(X_train, y_train, epochs=max_epochs, 
             validation_split=0.2, callbacks=[tf.keras.callbacks.EarlyStopping('val_accuracy', patience=3)])
    
    return tuner

def random_search(build_model, project_name, X_train=X_train, y_train=y_train, max_trials=10, directory='my_dir'):
    tuner = RandomSearch(
        build_model,
        objective='val_accuracy',
        max_trials=max_trials,
        executions_per_trial=1,
        directory=directory,
        project_name=project_name
    )

    tuner.search(X_train, y_train, epochs=max_trials,
                 validation_split=0.2, callbacks=[tf.keras.callbacks.EarlyStopping('val_accuracy', patience=3)])
    
    return tuner



In [17]:
def evaluate_kera_tuner(tuner, X_test, y_test):
    best_model = tuner.get_best_models(num_models=1)[0]
    best_hyperparameters = tuner.get_best_hyperparameters(1)[0]

    print("Best model summary:")
    best_model.summary()
    print("Best hyperparameters:", best_hyperparameters.values)
    test_loss, test_acc = best_model.evaluate(X_test, y_test)
    print("Test Accuracy:", test_acc)

Best model summary:


Best hyperparameters: {'units_input': 128, 'dropout_1': 0.0, 'units_hidden': 224, 'dropout_2': 0.2, 'learning_rate': 0.0009895249588496025, 'tuner/epochs': 100, 'tuner/initial_epoch': 34, 'tuner/bracket': 3, 'tuner/round': 3, 'tuner/trial_id': '0204'}
[1m1611/1611[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 617us/step - accuracy: 0.9364 - loss: 0.1293
Test Accuracy: 0.9361211061477661


In [None]:
tuner = grid_search(build_model, 'binary_label', max_epochs=10)

evaluate_kera_tuner(tuner, X_test, y_test)

In [18]:
# Train the model using the best hyperparameters
model = MLPClassifier(hidden_layer_sizes=(100, 50),
                      activation='tanh',
                      solver='adam',
                      learning_rate_init= 0.001,
                      max_iter=300)

model.fit(X_train, y_train)

# Evaluate the model, turn the predictions into a numpy array
y_pred = model.predict(X_test)
y_pred = np.array(y_pred)

y_test = np.array(y_test)

accuracy = np.mean(y_pred == y_test)
print(f'The accuracy of the model is: {accuracy}')



The accuracy of the model is: 0.9399437275637916


In [28]:
def test_different_models(X_train, X_test, y_train, y_test, objective='binary'):
    RFC = RandomForestClassifier(n_estimators=150, random_state=42, n_jobs=-1)
    ETC = ExtraTreesClassifier(n_estimators=200, random_state=42, n_jobs=-1)
    XGB = xgboost.XGBClassifier(n_estimators=150, n_jobs=-1)
    GBM = lightgbm.LGBMClassifier(objective=objective, n_estimators= 500, n_jobs=-1, verbosity=-1)

    list_of_CLFs_names = []
    list_of_CLFs = [RFC, ETC, XGB, GBM]
    ranking = []

    for clf in list_of_CLFs:
        _ = clf.fit(X_train,y_train)
        pred = clf.score(X_test,y_test)
        name = str(type(clf)).split(".")[-1][:-2]
        print("Acc: %0.5f for the %s" % (pred, name))

        ranking.append(pred)
        list_of_CLFs_names.append(name)

In [29]:
test_different_models(X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test)

Acc: 0.95184 for the RandomForestClassifier
Acc: 0.95021 for the ExtraTreesClassifier
Acc: 0.94916 for the XGBClassifier
Acc: 0.95128 for the LGBMClassifier


In [20]:
# Now lets train the model on multiple classifications of attacks
# TODO: Combine all of the datasets and then make sure that if there is no attack, 
# it's also labeled instead of there being nothing in that category

X_mc = combined_data.drop(['label', 'attack_cat'], axis=1)
y_mc = combined_data['attack_cat']

X_train_mc, X_test_mc, y_train_mc, y_test_mc = train_test_split(X_mc, y_mc, test_size=0.2, random_state=42)

# Standardize the data
scaler_mc = StandardScaler()
X_train_mc = scaler_mc.fit_transform(X_train_mc)
X_test_mc = scaler_mc.transform(X_test_mc)

print(X_train_mc.shape, X_test_mc.shape, y_train_mc.shape, y_test_mc.shape)


(206138, 42) (51535, 42) (206138,) (51535,)


In [39]:
tuner_multiclass = random_search(build_model, project_name='multiclass', X_train=X_train_mc, y_train=y_train_mc, max_trials=10, directory='my_dir')

evaluate_kera_tuner(tuner_multiclass, X_test_mc, y_test_mc)

Trial 5 Complete [00h 00m 27s]
val_accuracy: 0.009095760062336922

Best val_accuracy So Far: 0.009095760062336922
Total elapsed time: 00h 02m 11s

Search: Running Trial #6

Value             |Best Value So Far |Hyperparameter
224               |128               |units_input
0.3               |0.3               |dropout_1
96                |96                |units_hidden
0.2               |0.4               |dropout_2
0.00047575        |0.00010128        |learning_rate

Epoch 1/10
[1m4406/5154[0m [32m━━━━━━━━━━━━━━━━━[0m[37m━━━[0m [1m0s[0m 908us/step - accuracy: 0.0090 - loss: -915195.9375

KeyboardInterrupt: 

In [26]:
# Train the model using the best hyperparameters
model_mc = MLPClassifier(hidden_layer_sizes=(100, 50),
                      activation='tanh',
                      solver='adam',
                      learning_rate_init= 0.001,
                      max_iter=300)

model_mc.fit(X_train_mc, y_train_mc)

# Evaluate the model, turn the predictions into a numpy array
y_pred_mc = model_mc.predict(X_test_mc)
y_pred_mc = np.array(y_pred)

y_test_mc = np.array(y_test_mc)

accuracy = np.mean(y_pred_mc == y_test_mc)
print(f'The accuracy of the model is: {accuracy}')

The accuracy of the model is: 0.7776268555350733


In [30]:
test_different_models(X_train=X_train_mc, X_test=X_test_mc, y_train=y_train_mc, y_test=y_test_mc, objective='multiclass')

Acc: 0.83000 for the RandomForestClassifier
Acc: 0.82775 for the ExtraTreesClassifier
Acc: 0.83766 for the XGBClassifier
Acc: 0.74408 for the LGBMClassifier
