# Practice run of analysing/testing different models on the UNSW_NB15 dataset, before trying Deep Learning.

Prior research suggests this is a largely non-linear, less separable dataset so deep learning may be necessary, but I will try simpler, more interpretable models first for the sake of completeness, and to gain Variable Importances

Let's load our packages and data

In [8]:
#import packages:

import os
os.environ['NUMBAPRO_NVVM'] = '/usr/local/cuda/nvvm/lib64/libnvvm.so'
os.environ['NUMBAPRO_LIBDEVICE'] = '/usr/local/cuda/nvvm/libdevice/'

from google.colab import drive

try:
  import google.colab
  IN_COLAB = True
except:
  IN_COLAB = False

if IN_COLAB:
  # Check if drive is mounted by looking for the mount point in the file system.
  # This is a more robust approach than relying on potentially internal variables.
  import os
  if not os.path.exists('/content/drive'):
    drive.mount('/content/drive')
  else:
    print("Google Drive is already mounted.")
else:
  print("Not running in Google Colab. Drive mounting skipped.")


import os
from google.colab import drive
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from imblearn.over_sampling import RandomOverSampler
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from tqdm import tqdm


print("New run: Packages loaded")

Google Drive is already mounted.
New run: Packages loaded


In [2]:
#if using colabs - will need to first mount your drive

#change these for different users

# File paths (update if needed)
test_set_filepath = '/content/drive/MyDrive/Colab_Notebooks/Data/UNSW_NB15_testing-set.parquet'
training_set_filepath = '/content/drive/MyDrive/Colab_Notebooks/Data/UNSW_NB15_training-set.parquet'

# Load data
test_set = pd.read_parquet(test_set_filepath)
train_set = pd.read_parquet(training_set_filepath)

print("Data loaded")


Data loaded


The next cell does some basic analysis, and one hot encodes some of the features:

In [5]:

# Preprocessing function (modified for pandas)
def preprocess_data(data_set):
  if 'attack_cat' in data_set.columns.tolist():
    data_set = data_set.drop('attack_cat', axis=1)

  if 'proto' in data_set.columns.tolist():
    category_percentages = data_set['proto'].value_counts(normalize=True) * 100

    top_6_categories = category_percentages.head(6).index.tolist()

    data_set['proto_grouped'] = data_set['proto'].apply(lambda x: x if x in top_6_categories else 'other')

    data_set = pd.get_dummies(data_set, columns=['proto_grouped'], prefix='proto_grouped')

    data_set = data_set.drop('proto', axis=1)

  if 'proto_grouped' in data_set.columns.tolist():
      data_set = data_set.drop(['proto_grouped'], axis=1)

  categorical_cols = data_set.select_dtypes(include=['category']).columns.tolist()
  data_set = pd.get_dummies(data_set, columns=categorical_cols, prefix_sep='_')

  binary_cols = data_set.select_dtypes(include=['bool']).columns
  data_set[binary_cols] = data_set[binary_cols].astype(int)

  print(f"Data set preprocessed, columns = {data_set.columns.tolist()}")
  return data_set

train_set = preprocess_data(train_set)

class_counts = train_set['label'].value_counts()

print("Class Sizes:")
print(f"Genuine count, Label 0: {class_counts[0]} rows")
print(f"Positive class, Label 1: {class_counts[1]} rows")



Data set preprocessed, columns = ['dur', 'spkts', 'dpkts', 'sbytes', 'dbytes', 'rate', 'sload', 'dload', 'sloss', 'dloss', 'sinpkt', 'dinpkt', 'sjit', 'djit', 'swin', 'stcpb', 'dtcpb', 'dwin', 'tcprtt', 'synack', 'ackdat', 'smean', 'dmean', 'trans_depth', 'response_body_len', 'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'is_ftp_login', 'ct_ftp_cmd', 'ct_flw_http_mthd', 'is_sm_ips_ports', 'label', 'proto_grouped_arp', 'proto_grouped_ospf', 'proto_grouped_other', 'proto_grouped_sctp', 'proto_grouped_tcp', 'proto_grouped_udp', 'proto_grouped_unas', 'service_-', 'service_dhcp', 'service_dns', 'service_ftp', 'service_ftp-data', 'service_http', 'service_irc', 'service_pop3', 'service_radius', 'service_smtp', 'service_snmp', 'service_ssh', 'service_ssl', 'state_CON', 'state_ECO', 'state_FIN', 'state_INT', 'state_PAR', 'state_REQ', 'state_RST', 'state_URN', 'state_no']
Class Sizes:
Genuine count, Label 0: 56000 rows
Positive class, Label 1: 119341 rows


NOTE TO SELF -
1. THIS IS FOR BINARY CLASSIFICATION, WE WANT MULTICLASS EVENTUALLY, BUT FOR NOW WE WILL JUST DO BN


Based on the high number of columns in the Proto column, we may want to consider an Embeddings layer with the Deep Learning that we plan to undertake later. However since DT/RF perform somewhat poorly on sparse vector datasets (like one hot encoded ones) we will group all the extremely rare categories into an 'other'.


In [None]:


def run_models(model_type, X, y):
    """
    Runs Logistic Regression (LR), Decision Tree (DT), or Random Forest (RF) model using nested cross-validation with oversampling.
    """
    # Scale data only for Logistic Regression
    if model_type.upper() == 'LR':
        scaler = StandardScaler()
        X = scaler.fit_transform(X)
    else:
        X = X.values  # Convert to NumPy array for consistency

    # Define outer and inner cross-validation
    outer_cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
    inner_cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

    # Initialize the model and hyperparameter grid
    if model_type.upper() == 'LR':
        model = LogisticRegression(max_iter=1000)
        param_grid = {'C': [0.01, 0.1, 1, 10, 100]}
        print("Running nested cross-validation for Logistic Regression with oversampling.")
    elif model_type.upper() == 'DT':
        model = DecisionTreeClassifier()
        param_grid = {
            'max_depth': [3, 5, 10],
            'min_samples_split': [2, 10, 20],
            'min_samples_leaf': [1, 5, 10],
            'criterion': ['gini', 'entropy']
        }
        print("Running nested cross-validation for Decision Tree with oversampling.")
    elif model_type.upper() == 'RF':
        model = RandomForestClassifier()
        param_grid = {
            'n_estimators': [50, 100, 200],
            'max_depth': [3, 5, 10],
            'min_samples_split': [2, 10, 20],
            'min_samples_leaf': [1, 5, 10],
        }
        print("Running nested cross-validation for Random Forest with oversampling.")
    else:
        print("Invalid model type. Please choose 'LR', 'DT', or 'RF'.")
        return

    # Store outer fold scores
    outer_scores = []

    # Loop through outer folds
    for train_index, val_index in tqdm(outer_cv.split(X, y)):
        # Split data into training and validation sets
        X_train_fold, X_val_fold = X[train_index], X[val_index]
        y_train_fold, y_val_fold = y.iloc[train_index], y.iloc[val_index]

        # Oversample the training data to account for slight class imbalance
        ros = RandomOverSampler(random_state=42)
        X_train_resampled, y_train_resampled = ros.fit_resample(X_train_fold, y_train_fold)

        # Inner cross-validation with oversampling
        grid_search = GridSearchCV(
            estimator=model,
            param_grid=param_grid,
            cv=inner_cv,
            scoring='roc_auc',
            n_jobs=-1
        )

        # Fit the model on the resampled training data
        grid_search.fit(X_train_resampled, y_train_resampled)

        # Evaluate the best model on the validation data
        best_model = grid_search.best_estimator_
        y_val_pred_proba = best_model.predict_proba(X_val_fold)[:, 1]
        best_score = roc_auc_score(y_val_fold, y_val_pred_proba)

        # Store the best score
        outer_scores.append(best_score)

    # Print the average ROC AUC score
    print(f"Average Validation ROC AUC from nested cross-validation ({model_type.upper()}): {np.mean(outer_scores):.4f}")

X = train_set.drop('label', axis=1)
y = train_set['label']

# Example usage:
run_models('LR', X, y)
run_models('DT', X, y)
run_models('RF', X, y)

Running nested cross-validation for Logistic Regression with oversampling.


3it [00:47, 15.97s/it]


Average Validation ROC AUC from nested cross-validation (LR): 0.9641
Running nested cross-validation for Decision Tree with oversampling.


3it [01:50, 36.90s/it]


Average Validation ROC AUC from nested cross-validation (DT): 0.9847
Running nested cross-validation for Random Forest with oversampling.


3it [29:04, 581.47s/it]

Average Validation ROC AUC from nested cross-validation (RF): 0.9892



