# ***0. Data Loading***

In [1]:
import pandas as pd
import numpy as np

train_path = '/Users/akhilmakes/Library/CloudStorage/GoogleDrive-akhil.makes@gmail.com/My Drive/University of Surrey/AI and AI Programming/EEEM005 data for coursework/UNSWNB15_training_coursework.csv'
test_1_path = '/Users/akhilmakes/Library/CloudStorage/GoogleDrive-akhil.makes@gmail.com/My Drive/University of Surrey/AI and AI Programming/EEEM005 data for coursework/UNSWNB15_testing1_coursework.csv'
test_2_path = '/Users/akhilmakes/Library/CloudStorage/GoogleDrive-akhil.makes@gmail.com/My Drive/University of Surrey/AI and AI Programming/EEEM005 data for coursework/UNSWNB15_testing2_coursework_no_label.csv'


train_data = pd.read_csv(train_path) # Data used to train the MLP
test_1_data = pd.read_csv(test_1_path) # Data used to evaluate the performance of the model
test_2_data = pd.read_csv(test_2_path) # Data used to predict the class labels and display


# ***1. Data Pre-Processing (Task 1)***

In [25]:
# Encode the Categorical Variables - 'proto', 'service', 'state'
categorical_columns = ['proto', 'service', 'state']

for col in categorical_columns:
    unique_values = train_data[col].unique()
    mapping = {v: i for i, v in enumerate(unique_values)}
    train_data[col] = train_data[col].map(mapping)

# Ensure that all the columns in the data are have numbers in them and drop the label column
features = train_data.select_dtypes(include=['float64', 'int64']).drop(columns=['label'])

# Normalize the values in the data to values between 0 and 1
for col in features.columns:
    min = features[col].min()
    max = features[col].max()
    if max - min == 0: ## Checks if the max - min value is 0 to avoid division by 0
        features[col] = 0
    else:
        features[col] = (features[col] - min) / (max - min)

# Perform PCA to reduce dimensions of feature space and extract the most useful features
# 1. Subtract the data from its mean
# 2. Calculate the covariance matrix to measure the relationship between the features
# 3. Calculate the Eigenvalues and Eigenvectors from the covariance matrix
# 4. Sort by eigenvalue in descending order
# (A high eigenvalue indicates that the data is highly spread in that features direction.)
# 5. Calculate cumulative variance ratio
# (Used to find the number of components that retain a certain variance e.g. 97%)
# 6. Reduce the centered data to the k number of useful features

mean_sub_features = features - np.mean(features, axis=0)

cov_matrix = np.cov(mean_sub_features, rowvar=False)

eig_val, eig_vec = np.linalg.eigh(cov_matrix)

sort_index = np.argsort(eig_val)[::-1]
eig_val = eig_val[sort_index]
eig_vec = eig_vec[:, sort_index]

cumulative_variance = np.cumsum(eig_val) / np.sum(eig_val)
target_variance = 0.97
k = np.argmax(cumulative_variance >= target_variance) + 1

eigen_vectors_k = eig_vec[:, :k]
features_reduced = np.dot(mean_sub_features, eigen_vectors_k)

print(f"{k} Features retain {target_variance * 100}% of the variance of the data")

# Training data is now processed
X_train = features_reduced
y_train = train_data['label'] # Labels - 0 => Benign, 1 => Malicious

# As we reduce the dimensions of the training data, we must also do the same processing to the test data.

#testing1
for c in categorical_columns:
    unique_values = test_1_data[c].unique()
    mapping = {v: i for i, v in enumerate(unique_values)}
    test_1_data[c] = test_1_data[c].map(mapping)

test_1_features = test_1_data.select_dtypes(include=['float64', 'int64']).drop(columns=['label'])

for col in test_1_features.columns:
    test_min = test_1_features[col].min()
    test_max = test_1_features[col].max()
    if test_max - test_min == 0:
        test_1_features[col] = 0
    else:
        test_1_features[col] = (test_1_features[col] - test_min) / (test_max - test_min)

# Center test data using training mean
mean_sub_test = test_1_features - np.mean(features, axis=0)

# Project test data using training PCA
X_test_1 = np.dot(mean_sub_test, eigen_vectors_k)
y_test_1 = test_1_data['label']

#testing2
for col in categorical_columns:
    unique_values = test_2_data[col].unique()
    mapping = {v: i for i, v in enumerate(unique_values)}
    test_2_data[col] = test_2_data[col].map(mapping)

test_2_features = test_2_data.select_dtypes(include=['float64', 'int64'])

for c in test_2_features.columns:
    test_2_min = test_2_features[c].min()
    test_2_max = test_2_features[c].max()

    if test_2_max - test_2_min == 0:
        test_2_features[c] = 0
    else:
        test_2_features[c] = (test_2_features[c] - test_2_min) / (test_2_max - test_min)

# Center test data using training mean
mean_sub_test_2 = test_2_features - np.mean(features, axis=0)

# Project test data using training PCA
X_test_2 = np.dot(mean_sub_test_2, eigen_vectors_k)

print(f"Original Training Data Shape: {train_data.shape}")
print(f"Training Data Shape after PCA: {X_train.shape}")


15 Features retain 97.0% of the variance of the data
Original Training Data Shape: (20000, 44)
Training Data Shape after PCA: (20000, 15)


# ***2. Model Implementation and Training (Task 2)***

In [28]:
## MULTI LAYER PERCEPTRON

# Define the activation functions and the derivatives which will be used for backpropagation
# ReLU is used as it prevents the occurence of vanishing gradients during backpropagation
# Sigmoid function is used in the final layer to output a probability

def relu(x):
  return np.maximum(x, 0)

def relu_derivative(x):
  return (x > 0).astype(float)

def sigmoid(x):
  return 1 / (1 + np.exp(-x))

# Define the input, hidden and output layer sizes
input_layer_size = X_train.shape[1]
hidden_layer_size = int(input_layer_size * 1.5)
output_layer_size = 1

# Initialize a random seed for reproducible results
np.random.seed(20)

# Use He Initialization to assign the weights as we are using the ReLU activation
# It prevents the neurons outputting 0

## Stage 1 - Input to Hidden Layer
w1 = np.random.randn(input_layer_size, hidden_layer_size) * np.sqrt(2 / input_layer_size)
b1 = np.zeros((1, hidden_layer_size))
## Stage 2 - Hidden Layer 1
w2= np.random.randn(hidden_layer_size, hidden_layer_size) * np.sqrt(2 / hidden_layer_size)
b2 = np.zeros((1, hidden_layer_size))
## Stage 3 - Hidden Layer 2 to Output
w3 = np.random.randn(hidden_layer_size, 1) * np.sqrt(2 / hidden_layer_size)
b3 = np.zeros((1, output_layer_size))

## Forward pass of the network from the input layer to the output layer
def forward_pass(x):
  z1 = np.dot(x, w1) + b1 ## Multiply the input to the corresponding weights and add the bias term
  a1 = relu(z1) ## Apply the ReLU activation function to introduce non-linearity

## Hidden Layer 1
  z2 = np.dot(a1, w2) + b2 ## Multiply the output of the first hidden layer to the corresponding weights and bias term
  a2 = relu(z2) ## Apply the ReLU activation function

## Hidden Layer 2
  z3 = np.dot(a2, w3) + b3 ## Mulitply the output of the hidden layer with weights and add the bias term
  y_prediction = sigmoid(z3) ## Apply the sigmoid activation function to output a probability

  return z1, a1, z2, a2, z3, y_prediction

## Binary cross entropy loss function as this is a binary classification problem
def bce_loss(y_true, y_pred):
  N = y_true.shape[0]
  loss = -1/N * np.sum(y_true * np.log(y_pred) + (1 - y_true) * np.log(1-y_pred))

  return np.squeeze(loss) ## Removes singleton dimensions to always produce a scalar value


# Backpropagation through the network to calculate gradients w.r.t to the loss.
def back_prop(x, y, z1, a1, z2, a2, z3, y_prediction):
  m = x.shape[0]

# Gradients for the output layer
  dz3 = y_prediction - y
  dw3 = 1/m * np.dot(a1.T, dz3)
  db3 = 1/m * np.sum(dz3, axis=0, keepdims=True)

# Gradients for hidden layer 2
  da2 = np.dot(dz3, w3.T)
  dz2 = da2 * relu_derivative(z2)
  dw2 = 1/m * np.dot(a1.T, dz2)
  db2 = 1/m * np.sum(dz2, axis=0, keepdims=True)


# Gradients for hidden layer 1
  da1 = np.dot(dz2, w2.T)
  dz1 = da1 * relu_derivative(z1)
  dw1 = 1/m * np.dot(x.T, dz1)
  db1 = 1/m * np.sum(dz1, axis=0, keepdims=True)

  return dw1, db1, dw2, db2, dw3, db3

# Function to update the initial weights and biases using a defined learning rate according to the loss
def update_gradients(dw1, db1, dw2, db2, dw3, db3, learning_rate):
  global w1, b1, w2, b2, w3, b3
  w1 -= learning_rate * dw1
  b1 -= learning_rate * db1
  w2 -= learning_rate * dw2
  b2 -= learning_rate * db2
  w3 -= learning_rate * dw3
  b3 -= learning_rate * db3

# The main training loop for the model
# default number of epochs is 100 and learning rate is 0.01
def train(x, y, epochs=100, learning_rate=0.01):

  for i in range(epochs):
    z1, a1, z2, a2, z3, y_prediction = forward_pass(x)
    loss = bce_loss(y, y_prediction)
    dw1, db1, dw2, db2, dw3, db3 = back_prop(x, y, z1, a1, z2, a2, z3, y_prediction)
    update_gradients(dw1, db1, dw2, db2, dw3, db3, learning_rate)

    if i % 100 == 0:
      print(f"Epoch {i}: Loss = {loss:.4f}")

# Given a data point X, make a prediction either 0 or 1.
def predict(X):
    Z1, A1, Z2, A2, Z3, A3 = forward_pass(X)
    return (A3 > 0.5).astype(int)

# Train the MLP
train(X_train, y_train.values.reshape(-1, 1), epochs=1000, learning_rate=0.5)

Epoch 0: Loss = 0.6870
Epoch 100: Loss = 0.3375
Epoch 200: Loss = 0.3041
Epoch 300: Loss = 0.3039
Epoch 400: Loss = 0.2907
Epoch 500: Loss = 0.2847
Epoch 600: Loss = 0.2797
Epoch 700: Loss = 0.2759
Epoch 800: Loss = 0.2723
Epoch 900: Loss = 0.2694


# ***3. Model Performance Evaluation (Task 3)***

In [30]:
y_pred_1 = predict(X_test_1) # Model Prediction on testing1
y_true_1 = y_test_1.values.reshape(-1, 1) # Ground truth values for testing1

y_pred_2 = predict(X_test_2) # Model prediction on testing2

def evaluate(predictions, true_values):

    accuracy = np.mean(predictions == true_values) * 100 # Accuracy of the model as a percentage

    predictions = predictions.flatten()
    true_values = true_values.flatten()

    true_pos = np.sum((predictions == 1) & (true_values == 1)) # Number of malicious packet that were predicted as malicious
    false_pos = np.sum((predictions == 1) & (true_values == 0)) # Number of malicious packets that were predicted as benign
    false_neg = np.sum((predictions == 0) & (true_values == 1)) # Number of benign packets that were predicted as malicious


    precision = true_pos / (true_pos + false_pos)
    recall = true_pos / (true_pos + false_neg)
    f1_score = (2 * (precision * recall)) / (precision + recall)

    return accuracy, precision, recall, f1_score

accuracy, precision, recall, f1_score = evaluate(y_pred_1, y_true_1)

print("-------Performance Evaluation-------\n")
print(f"Accuracy: {round(accuracy,2)} %")
print(f"Precision: {round(precision,2)}")
print(f"Recall: {round(recall,2)}")
print(f"F1 Score: {round(f1_score,2)}\n")

print("-------Class Predictions-------\n")
print("UNSWNB15_testing1:")
print(f"Prediction: {y_pred_1[:25].flatten()}") # Display the first 25 predictions for testing1
print(f"Ground Truth: {y_true_1[:25].flatten()}\n")
print("UNSWNB15_testing2:")
print(f"Prediction: {y_pred_2.flatten()}")# Display the predictions for testing2


-------Performance Evaluation-------

Accuracy: 93.22 %
Precision: 0.94
Recall: 0.97
F1 Score: 0.95

-------Class Predictions-------

UNSWNB15_testing1:
Prediction: [1 0 1 0 1 0 1 1 1 0 1 1 1 1 0 1 0 1 1 1 0 1 0 1 1]
Ground Truth: [1 0 1 0 1 0 1 1 1 0 1 1 1 1 0 1 1 1 1 1 0 1 0 1 1]

UNSWNB15_testing2:
Prediction: [1 0 1 1 0 1 0 0 1 0 0 1 1 1 1 1 1 1 0 1 1 1 1 1 1]


# ***4. Performance Evaluation***

**Please report the classification accuracy your model achieved on data samples in the testing set 1 below (in percentage)**

93.22%



**Please provide the predicted class labels of the data samples in the testing set 2 below (0 or 1)**

| **Sample ID** |**Predicted Label** |
| --- | --- |
| 1 |  1 |
| 2 | 0  |
| 3 |  1 |
| 4 | 1  |
| 5 |  0 |
| 6 |  1 |
| 7 |  0 |
| 8 |  0 |
| 9 |  1 |
| 10 | 0  |
| 11 | 0  |
| 12 | 1 |
| 13 |  1 |
| 14 |  1 |
| 15 |  1 |
| 16 | 1  |
| 17 | 1  |
| 18 | 1  |
| 19 | 0  |
| 20 | 1  |
| 21 | 1  |
| 22 | 1  |
| 23 |  1 |
| 24 | 1  |
| 25 | 1  |