In [3]:
# --- PyTorch ---------------------------------------------------------------
# NOTE(review): none of the cells visible in this part of the notebook use
# torch; confirm it is needed further down.
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

# --- Data handling ---------------------------------------------------------
import pandas as pd
import numpy as np

# --- scikit-learn: splitting, preprocessing, models, metrics ---------------
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
# NOTE(review): wildcard import - every public name of sklearn.metrics lands in
# the notebook namespace and can silently shadow (or be shadowed by) notebook
# variables. Prefer importing only the metrics that are actually used.
from sklearn.metrics import *
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, VotingClassifier

# --- Gradient boosting and ensembling helpers ------------------------------
# NOTE(review): the saved output of this cell ends with
# "ModuleNotFoundError: No module named 'xgboost'"; in that run nothing from
# this line onwards was executed.
import xgboost, lightgbm
from mlxtend.classifier import EnsembleVoteClassifier

# Blanket filter: every warning raised after this point in this process is
# hidden, not only the noisy ones. Narrow the filter if a specific warning
# needs to stay visible.
import warnings
warnings.filterwarnings("ignore")

# --- RAPIDS (GPU) libraries; versions are printed for the record -----------
import cudf
import cuml
print("cuDF version:", cudf.__version__)
print("cuML version:", cuml.__version__)

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


ModuleNotFoundError: No module named 'xgboost'

In [4]:
# Load the two UNSW-NB15 CSV files and stack them into a single frame.
training_df = pd.read_csv('../UNSW_NB15/UNSW_NB15_training-set.csv')
testing_df = pd.read_csv('../UNSW_NB15/UNSW_NB15_testing-set.csv')

# ignore_index=True gives the stacked frame a fresh 0..n-1 index. Without it
# each file keeps its own 0-based row labels, so the labels repeat and a
# label-based lookup such as combined_data.loc[0] returns more than one row.
# The 'id' column is dropped here and therefore never reaches the features.
combined_data = (
    pd.concat([training_df, testing_df], ignore_index=True)
    .drop(columns=['id'])
)

combined_data.head()

Unnamed: 0,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,sttl,...,ct_dst_sport_ltm,ct_dst_src_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,is_sm_ips_ports,attack_cat,label
0,1.1e-05,udp,-,INT,2,0,496,0,90909.0902,254,...,1,2,0,0,0,1,2,0,Normal,0
1,8e-06,udp,-,INT,2,0,1762,0,125000.0003,254,...,1,2,0,0,0,1,2,0,Normal,0
2,5e-06,udp,-,INT,2,0,1068,0,200000.0051,254,...,1,3,0,0,0,1,3,0,Normal,0
3,6e-06,udp,-,INT,2,0,900,0,166666.6608,254,...,1,3,0,0,0,2,3,0,Normal,0
4,1e-05,udp,-,INT,2,0,2126,0,100000.0025,254,...,1,3,0,0,0,2,3,0,Normal,0


In [5]:
# Identify the categorical (object-dtype) columns
categorical_columns = combined_data.select_dtypes(include=['object']).columns
print(f'The categorical columns are: {categorical_columns} \n')

# Report how many unique values each categorical column holds
for column in categorical_columns:
    print(f'The column {column} has {combined_data[column].nunique()} unique values')

# Convert the categorical columns to integer codes.
# One LabelEncoder is fitted per column and stored in `label_encoders`, keyed
# by column name, so the codes of any column can be mapped back later with
# label_encoders[column].inverse_transform(codes). Refitting a single shared
# encoder would keep only the mapping of the last column processed.
# NOTE: combined_data is overwritten in place, so re-running this cell finds
# no object columns and leaves `label_encoders` empty - reload the data first.
label_encoders = {}
for column in categorical_columns:
    le = LabelEncoder()
    combined_data[column] = le.fit_transform(combined_data[column])
    label_encoders[column] = le

print("\n", combined_data.head())

In [6]:
# Binary task: the features are every column except the two target columns,
# and the target is 'label'.
X = combined_data.drop(columns=['label', 'attack_cat'])
y = combined_data['label']

# Hold out 20% of the rows for testing; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Standardize: the scaler is fitted on the training rows only and the same
# fitted scaler is then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

print(*(part.shape for part in (X_train, X_test, y_train, y_test)))

(206138, 42) (51535, 42) (206138,) (51535,)


In [8]:
# Run a grid search to find the best hyperparameters.
# The grid has 4 * 2 * 2 * 3 = 48 combinations; with cv=3 that is 144 fits
# plus the final refit on the full training set, run on all cores.
parameter_space = {
    'hidden_layer_sizes': [(50,), (100,), (100, 50), (150, 100, 50)],
    'activation': ['tanh', 'relu'],
    'solver': ['sgd', 'adam'],
    'learning_rate_init': [0.001, 0.01, 0.1]
}

# random_state fixes the weight initialisation and batch shuffling of the MLP,
# so the same "best" combination is selected on every run (the rest of the
# notebook already seeds its split and forests with 42).
# NOTE(review): max_iter=100 may end training before convergence, and the
# notebook silences all warnings, so a ConvergenceWarning would not be shown.
mlp = MLPClassifier(max_iter=100, random_state=42)
clf = GridSearchCV(mlp, parameter_space, n_jobs=-1, cv=3)
clf.fit(X_train, y_train)

print("Best parameters found:\n", clf.best_params_)

Best parameters found:
 {'activation': 'tanh', 'hidden_layer_sizes': (100, 50), 'learning_rate_init': 0.001, 'solver': 'adam'}


In [11]:
# Train the model using the best hyperparameters reported by the grid search.
# random_state makes the fit, and therefore the reported accuracy, repeatable.
model = MLPClassifier(hidden_layer_sizes=(100, 50),
                      activation='tanh',
                      solver='adam',
                      learning_rate_init=0.001,
                      max_iter=300,
                      random_state=42)

model.fit(X_train, y_train)

# Evaluate on the held-out split. Both sides are turned into plain numpy
# arrays so the comparison below is purely positional (y_test may still be a
# pandas Series at this point); asarray avoids copying data that is already
# an array.
y_pred = np.asarray(model.predict(X_test))
y_test = np.asarray(y_test)

# Accuracy = fraction of test rows whose prediction equals the true label
accuracy = np.mean(y_pred == y_test)
print(f'The accuracy of the model is: {accuracy}')



The accuracy of the model is: 0.9400601532938779


: 

In [10]:
# Tree-based baselines to compare against the MLP
RFC = RandomForestClassifier(n_estimators=150, random_state=42, n_jobs=-1)
ETC = ExtraTreesClassifier(n_estimators=200, random_state=42, n_jobs=-1)
XGB = xgboost.XGBClassifier(n_estimators=150, n_jobs=-1)
GBM = lightgbm.LGBMClassifier(objective='binary', n_estimators= 500, n_jobs=-1, verbosity=-1)

list_of_CLFs_names = []  # class name of each classifier, in fit order
list_of_CLFs = [RFC, ETC, XGB, GBM]
ranking = []  # test accuracy of each classifier, aligned with the names list

# The loop variable is deliberately not called `clf`: that name holds the
# fitted GridSearchCV object from the hyperparameter search, and reusing it
# here would overwrite it.
for estimator in list_of_CLFs:
    estimator.fit(X_train, y_train)
    # .score() returns the mean accuracy on the given test data
    test_accuracy = estimator.score(X_test, y_test)
    name = type(estimator).__name__
    print(f"Acc: {test_accuracy:0.5f} for the {name}")

    ranking.append(test_accuracy)
    list_of_CLFs_names.append(name)

Acc: 0.95184 for the RandomForestClassifier
Acc: 0.95021 for the ExtraTreesClassifier
Acc: 0.94916 for the XGBClassifier
Acc: 0.95128 for the LGBMClassifier


In [None]:
# Now let's set the data up for multi-class classification of attack types.
# NOTE: this rebinds X, y, X_train, X_test, y_train, y_test and scaler, which
# previously held the binary ('label') setup.
X = combined_data.drop(columns=['label', 'attack_cat'])
y = combined_data['attack_cat']

# Same 80/20 split and seed as the binary experiment
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Standardize the data: fit on the training rows, then transform both splits
scaler = StandardScaler()
scaler.fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

print(*[split.shape for split in (X_train, X_test, y_train, y_test)])