# UNSW-NB15 IDS Model Training

This notebook loads the **UNSW-NB15 training and testing sets**, preprocesses the data,
trains a PyTorch MLP classifier, evaluates it, and saves the model.

Required dataset files (relative to project root):
- `datasets/UNSW_NB15_training-set.csv`
- `datasets/UNSW_NB15_testing-set.csv`

Open this notebook from your `ml/notebooks` folder.

In [2]:
import pandas as pd
import numpy as np
from pathlib import Path

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix

import torch
from torch import nn

import joblib

print('PyTorch version:', torch.__version__)

PyTorch version: 2.9.1+cpu


## 1. Load UNSW-NB15 training and testing sets

In [3]:
# assumes this notebook is in ml/notebooks
project_root = Path.cwd().parents[1]
data_dir = project_root / 'datasets'

train_path = data_dir / 'UNSW_NB15_training-set.csv'
test_path = data_dir / 'UNSW_NB15_testing-set.csv'

print('Training path:', train_path)
print('Testing path :', test_path)

train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

print('Train shape:', train_df.shape)
print('Test shape :', test_df.shape)
train_df.head()

Training path: g:\new_DT_project\ai-soc-project\datasets\UNSW_NB15_training-set.csv
Testing path : g:\new_DT_project\ai-soc-project\datasets\UNSW_NB15_testing-set.csv
Train shape: (175341, 45)
Test shape : (82332, 45)


Unnamed: 0,id,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,...,ct_dst_sport_ltm,ct_dst_src_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,is_sm_ips_ports,attack_cat,label
0,1,0.121478,tcp,-,FIN,6,4,258,172,74.08749,...,1,1,0,0,0,1,1,0,Normal,0
1,2,0.649902,tcp,-,FIN,14,38,734,42014,78.473372,...,1,2,0,0,0,1,6,0,Normal,0
2,3,1.623129,tcp,-,FIN,8,16,364,13186,14.170161,...,1,3,0,0,0,2,6,0,Normal,0
3,4,1.681642,tcp,ftp,FIN,12,12,628,770,13.677108,...,1,3,1,1,0,2,1,0,Normal,0
4,5,0.449454,tcp,-,FIN,10,6,534,268,33.373826,...,1,40,0,0,0,2,39,0,Normal,0


## 2. Combine for consistent preprocessing

In [4]:
train_df = train_df.copy()
test_df = test_df.copy()
train_df['split'] = 'train'
test_df['split'] = 'test'

df = pd.concat([train_df, test_df], ignore_index=True)
print('Combined shape:', df.shape)
df[['label', 'attack_cat']].value_counts().head()

Combined shape: (257673, 46)


label  attack_cat
0      Normal        93000
1      Generic       58871
       Exploits      44525
       Fuzzers       24246
       DoS           16353
Name: count, dtype: int64

## 3. Preprocess
- Drop `srcip` and `dstip` (strings)
- Encode `proto`, `service`, `state` as integer codes
- Use all remaining numeric columns (except `attack_cat`, `label`, `split`) as features

In [5]:
df = df.copy()

# Drop IP address columns if present
for col in ['srcip', 'dstip']:
    if col in df.columns:
        df.drop(columns=[col], inplace=True)

# Categorical columns to encode
cat_cols = [c for c in ['proto', 'service', 'state'] if c in df.columns]
print('Categorical columns:', cat_cols)

for c in cat_cols:
    df[c] = df[c].astype('category').cat.codes

# Ensure label is int 0/1
df['label'] = df['label'].astype(int)
print('Label distribution:\n', df['label'].value_counts())

# Feature columns: everything except attack_cat, label, split
exclude = {'attack_cat', 'label', 'split'}
feature_cols = [c for c in df.columns if c not in exclude]
print('Number of feature columns:', len(feature_cols))
print('First 10 features:', feature_cols[:10])

# Make sure all features are numeric
for c in feature_cols:
    df[c] = pd.to_numeric(df[c], errors='coerce')

df[feature_cols] = df[feature_cols].fillna(0)

df[feature_cols].head()

Categorical columns: ['proto', 'service', 'state']
Label distribution:
 label
1    164673
0     93000
Name: count, dtype: int64
Number of feature columns: 43
First 10 features: ['id', 'dur', 'proto', 'service', 'state', 'spkts', 'dpkts', 'sbytes', 'dbytes', 'rate']


Unnamed: 0,id,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,...,ct_dst_ltm,ct_src_dport_ltm,ct_dst_sport_ltm,ct_dst_src_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,is_sm_ips_ports
0,1,0.121478,113,0,4,6,4,258,172,74.08749,...,1,1,1,1,0,0,0,1,1,0
1,2,0.649902,113,0,4,14,38,734,42014,78.473372,...,1,1,1,2,0,0,0,1,6,0
2,3,1.623129,113,0,4,8,16,364,13186,14.170161,...,2,1,1,3,0,0,0,2,6,0
3,4,1.681642,113,3,4,12,12,628,770,13.677108,...,2,1,1,3,1,1,0,2,1,0
4,5,0.449454,113,0,4,10,6,534,268,33.373826,...,2,2,1,40,0,0,0,2,39,0


## 4. Split back into train/test and scale

In [6]:
train_df_proc = df[df['split'] == 'train'].copy()
test_df_proc = df[df['split'] == 'test'].copy()

X_train = train_df_proc[feature_cols].values
y_train = train_df_proc['label'].values

X_test = test_df_proc[feature_cols].values
y_test = test_df_proc['label'].values

print('X_train shape:', X_train.shape)
print('X_test shape :', X_test.shape)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

X_train_t = torch.tensor(X_train_scaled, dtype=torch.float32)
y_train_t = torch.tensor(y_train, dtype=torch.float32).view(-1, 1)

X_test_t = torch.tensor(X_test_scaled, dtype=torch.float32)
y_test_t = torch.tensor(y_test, dtype=torch.float32).view(-1, 1)

X_train_t.shape, y_train_t.shape

X_train shape: (175341, 43)
X_test shape : (82332, 43)


(torch.Size([175341, 43]), torch.Size([175341, 1]))

## 5. Define and train the PyTorch MLP

In [7]:
class IDSModel(nn.Module):
    def __init__(self, in_dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(in_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 1),
            nn.Sigmoid(),
        )

    def forward(self, x):
        return self.net(x)


input_dim = X_train_t.shape[1]
model = IDSModel(input_dim)
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

n_epochs = 10
for epoch in range(n_epochs):
    model.train()
    optimizer.zero_grad()
    outputs = model(X_train_t)
    loss = criterion(outputs, y_train_t)
    loss.backward()
    optimizer.step()

    with torch.no_grad():
        model.eval()
        val_outputs = model(X_test_t)
        val_preds = (val_outputs > 0.5).float()
        val_acc = (val_preds.eq(y_test_t).sum().item()) / len(y_test_t)

    print(f'Epoch {epoch+1}/{n_epochs} - loss: {loss.item():.4f} - val_acc: {val_acc:.4f}')

Epoch 1/10 - loss: 0.6934 - val_acc: 0.5753
Epoch 2/10 - loss: 0.6760 - val_acc: 0.7349
Epoch 3/10 - loss: 0.6597 - val_acc: 0.7485
Epoch 4/10 - loss: 0.6446 - val_acc: 0.7521
Epoch 5/10 - loss: 0.6303 - val_acc: 0.7525
Epoch 6/10 - loss: 0.6165 - val_acc: 0.7532
Epoch 7/10 - loss: 0.6030 - val_acc: 0.7539
Epoch 8/10 - loss: 0.5893 - val_acc: 0.7538
Epoch 9/10 - loss: 0.5754 - val_acc: 0.7537
Epoch 10/10 - loss: 0.5612 - val_acc: 0.7531


## 6. Evaluate

In [8]:
model.eval()
with torch.no_grad():
    preds_prob = model(X_test_t).cpu().numpy().ravel()
    preds = (preds_prob > 0.5).astype(int)

print(classification_report(y_test, preds))
print('Confusion matrix:\n', confusion_matrix(y_test, preds))

              precision    recall  f1-score   support

           0       0.96      0.47      0.63     37000
           1       0.69      0.99      0.81     45332

    accuracy                           0.75     82332
   macro avg       0.83      0.73      0.72     82332
weighted avg       0.82      0.75      0.73     82332

Confusion matrix:
 [[17300 19700]
 [  628 44704]]


## 7. Save model, scaler, and feature metadata

In [9]:
model_dir = project_root / 'ml' / 'models'
model_dir.mkdir(parents=True, exist_ok=True)

model_path = model_dir / 'unsw_ids_mlp.pt'
scaler_path = model_dir / 'unsw_scaler.pkl'
meta_path = model_dir / 'unsw_features_metadata.txt'

torch.save(model.state_dict(), model_path)
joblib.dump(scaler, scaler_path)

with open(meta_path, 'w') as f:
    f.write('\n'.join(feature_cols))

print('Saved model to :', model_path)
print('Saved scaler to:', scaler_path)
print('Saved feature list to:', meta_path)

Saved model to : g:\new_DT_project\ai-soc-project\ml\models\unsw_ids_mlp.pt
Saved scaler to: g:\new_DT_project\ai-soc-project\ml\models\unsw_scaler.pkl
Saved feature list to: g:\new_DT_project\ai-soc-project\ml\models\unsw_features_metadata.txt
