# Practice run of analysing/testing different models on the UNSW_NB15 dataset, before trying Deep Learning.

Prior research suggests this is a largely non-linear, poorly separable dataset, so deep learning may be necessary. However, I will try simpler, more interpretable models first, both for the sake of completeness and to obtain variable importances.

In [1]:
#import packages:


# Detect whether this notebook is running inside Google Colab.
# The Colab-only imports live inside the try block so that running the
# notebook anywhere else falls through to the "not in Colab" branch instead
# of crashing on the import.
try:
    import google.colab
    from google.colab import drive
    IN_COLAB = True
except ImportError:
    IN_COLAB = False

if IN_COLAB:
    # Check if drive is mounted by looking for the mount point in the file system.
    # This is a more robust approach than relying on potentially internal variables.
    import os
    if not os.path.exists('/content/drive'):
        drive.mount('/content/drive')
    else:
        print("Google Drive is already mounted.")
else:
    print("Not running in Google Colab. Drive mounting skipped.")

from IPython import get_ipython
from IPython.display import display
import cudf
try:
    import cuml
    print("RAPIDS is already installed.")
except ImportError:
    print("RAPIDS not found, installing...")
    !git clone https://github.com/rapidsai/rapidsai-csp-utils.git
    !python rapidsai-csp-utils/colab/pip-install.py
    print("RAPIDS installed successfully.")
finally:
  import cuml

print("RAPIDS version:", cuml.__version__)
from cuml.preprocessing import StandardScaler
from cuml.model_selection import StratifiedKFold, GridSearchCV
from cuml.linear_model import LogisticRegression
from cuml.pipeline import Pipeline
from cuml.ensemble import RandomForestClassifier
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from tqdm import tqdm


Google Drive is already mounted.


ModuleNotFoundError: No module named 'cuml'

Let's load our packages and data

In [None]:
# If using Colab, Google Drive must be mounted first: both paths below live
# under the /content/drive mount point.

# Change these paths for different users.
test_set_filepath = '/content/drive/MyDrive/Colab_Notebooks/Data/UNSW_NB15_testing-set.parquet'
training_set_filepath = '/content/drive/MyDrive/Colab_Notebooks/Data/UNSW_NB15_training-set.parquet'

# Load the two parquet files with pandas (they are parquet, not CSV).
test_set = pd.read_parquet(test_set_filepath)
train_set = pd.read_parquet(training_set_filepath)

print("Data loaded")


The next cell does some basic analysis, and one hot encodes some of the features:

In [None]:
def preprocess_data(data_set, top_n=6):
    """Return a one-hot encoded copy of ``data_set`` ready for modelling.

    Steps, in order:

    1. Drop the ``attack_cat`` column when present.
    2. When a ``proto`` column is present, keep its ``top_n`` most frequent
       values, relabel every other value as ``'other'`` and one-hot encode
       the result into ``proto_grouped_*`` columns (``proto`` itself is
       removed).
    3. One-hot encode any remaining ``object``/``category`` columns.
    4. Convert boolean columns to integers.

    The input DataFrame is never modified; a new DataFrame is returned.

    Args:
        data_set: pandas DataFrame to preprocess.
        top_n: number of most frequent ``proto`` values that keep their own
            one-hot column. Defaults to 6.

    Returns:
        A new pandas DataFrame with the transformations above applied.
    """
    # drop() without inplace returns a new frame, so every later assignment
    # works on our own copy rather than on the caller's DataFrame.
    data_set = data_set.drop(columns=['attack_cat'], errors='ignore')

    if 'proto' in data_set.columns:
        # Work on plain strings to avoid categorical-dtype issues when
        # introducing the new 'other' value.
        proto = data_set['proto'].astype(str)

        # value_counts() sorts by frequency, so the head is the top-N set.
        top_categories = proto.value_counts().head(top_n).index

        # Keep frequent values, collapse the long tail into 'other'.
        data_set['proto_grouped'] = proto.where(proto.isin(top_categories), 'other')
        data_set = data_set.drop(columns=['proto'])

        data_set = pd.get_dummies(data_set, columns=['proto_grouped'], prefix='proto_grouped')

    # One-hot encode any remaining categorical columns
    categorical_cols = data_set.select_dtypes(include=['object', 'category']).columns.tolist()
    if categorical_cols:
        data_set = pd.get_dummies(data_set, columns=categorical_cols, prefix_sep='_')

    # Convert boolean columns (e.g. dummy columns, depending on the pandas
    # version) to integers.
    binary_cols = data_set.select_dtypes(include=['bool']).columns
    if not binary_cols.empty:
        data_set[binary_cols] = data_set[binary_cols].astype(int)

    return data_set


# Run both splits through the shared preprocessing routine.
train_set = preprocess_data(train_set)
test_set = preprocess_data(test_set)

print(f"Column sizes before further processing: Train: {train_set.shape[1]}, Test: {test_set.shape[1]}")

# The two splits are encoded independently, so their column sets can differ.
# Work out which columns each split lacks; 'label' is never back-filled.
train_columns = set(train_set.columns)
test_columns = set(test_set.columns)

missing_in_test = train_columns.difference(test_columns, {'label'})
missing_in_train = test_columns.difference(train_columns, {'label'})

# Back-fill every absent column with zeros on the split that lacks it.
for col in missing_in_test:
    test_set[col] = 0
for col in missing_in_train:
    train_set[col] = 0

# Put both frames into the same (alphabetical) column order.
common_columns = sorted(train_set.columns)
train_set, test_set = train_set[common_columns], test_set[common_columns]

# Sanity check: both frames should now report the same columns.
print(f"Number of columns in train_set: {train_set.shape[1]}")
print(f"Number of columns in test_set: {test_set.shape[1]}")

print(f"Columns in train_set: {list(train_set.columns)}")
print(f"Columns in test_set: {list(test_set.columns)}")

# Convert both pandas frames to cuDF. The test set may end up unused, but it
# is converted here so it is available if needed.
train_set = cudf.DataFrame.from_pandas(train_set)
test_set = cudf.DataFrame.from_pandas(test_set)

print("Data preprocessed")

NOTE TO SELF -
1. THIS IS FOR BINARY CLASSIFICATION. WE WANT MULTICLASS EVENTUALLY, BUT FOR NOW WE WILL JUST DO BINARY.


Given the high number of distinct categories in the `proto` column, we may want to consider an embedding layer in the deep learning work that we plan to undertake later. However, since decision trees and random forests perform somewhat poorly on sparse inputs (such as one-hot encoded features), we will group all the extremely rare categories into an 'other' category.


In [None]:
def _select_rows(data, indices):
    """Return the rows of ``data`` at positions ``indices``.

    DataFrame/Series-like objects are indexed positionally through ``.iloc``
    (plain ``frame[indices]`` would be a column lookup); anything else is
    indexed directly, as an array.
    """
    if hasattr(data, 'iloc'):
        return data.iloc[indices]
    return data[indices]


def _build_search_spec(model_type):
    """Map a model code to ``(display name, estimator, param grid)``, or ``None`` if unknown."""
    key = model_type.upper() if isinstance(model_type, str) else None

    if key == 'LR':
        return (
            "Logistic Regression",
            cuml.linear_model.LogisticRegression(max_iter=1000),
            {'C': [0.01, 0.1, 1, 10, 100]},
        )
    if key == 'RF':
        return (
            "Random Forest",
            cuml.ensemble.RandomForestClassifier(),
            {
                'n_estimators': [50, 100, 200],
                'max_depth': [3, 5, 10],
                'min_samples_split': [2, 10, 20],
                'min_samples_leaf': [1, 5, 10],
            },
        )
    return None


def run_models(model_type, X, y):
    """
    Runs Logistic Regression (LR) or Random Forest (RF) model using nested cross-validation with cuML.

    The outer loop is a 3-fold stratified split used to score the model; for
    every outer fold a grid search with its own 3-fold stratified split picks
    the hyper-parameters on the training part only, and the refitted search
    is then scored (ROC AUC) on the held-out part.

    Features are standardised inside each outer fold: the scaler is fit on
    the training rows and only applied to the validation rows, so validation
    data never influences the scaling statistics.

    Args:
        model_type: 'LR' or 'RF' (case-insensitive).
        X: feature matrix; rows are selected with ``.iloc`` when available,
            otherwise by direct indexing.
        y: target labels aligned with the rows of ``X``.

    Returns:
        The mean outer-fold ROC AUC as a float, or ``None`` (after printing a
        message) when ``model_type`` is not recognised.
    """
    # Validate the request first so a typo does not cost any GPU work.
    spec = _build_search_spec(model_type)
    if spec is None:
        print("Invalid model type. Please choose 'LR' for Logistic Regression or 'RF' for Random Forest.")
        return None
    display_name, model, param_grid = spec

    n_splits = 3
    outer_cv = cuml.model_selection.StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    inner_cv = cuml.model_selection.StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    print(f"Running nested cross-validation for {display_name}.")

    outer_scores = []

    for train_index, val_index in tqdm(outer_cv.split(X, y), total=n_splits):
        # Fit the scaler on this fold's training rows only, then reuse its
        # statistics for the validation rows.
        scaler = StandardScaler()
        X_train_fold = scaler.fit_transform(_select_rows(X, train_index))
        X_val_fold = scaler.transform(_select_rows(X, val_index))
        y_train_fold = _select_rows(y, train_index)
        y_val_fold = _select_rows(y, val_index)

        grid_search = cuml.model_selection.GridSearchCV(
            estimator=model,
            param_grid=param_grid,
            cv=inner_cv,
            scoring='roc_auc',
        )

        grid_search.fit(X_train_fold, y_train_fold)
        # score() evaluates the search on the held-out outer fold using the
        # 'roc_auc' scoring configured above.
        outer_scores.append(grid_search.score(X_val_fold, y_val_fold))

    mean_score = float(np.mean(outer_scores))
    print(f"Average Validation ROC AUC from nested cross-validation: {mean_score}")
    return mean_score