# Support Vector Machine (SVM) Training
## Network Intrusion Detection using UNSW-NB15 Dataset

## Prerequisites
- Ensure the UNSW-NB15 dataset files are placed in the `data/` directory:
  - `UNSW_NB15_training-set.csv`
  - `UNSW_NB15_testing-set.csv`
- Install dependencies: `uv sync`
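- Run this notebook with its own folder as the working directory: the import cell assumes the project root is three levels above the current working directory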

## 1. Import Libraries

In [1]:
import sys
from pathlib import Path
import numpy as np
from sklearn.metrics import (
    classification_report, 
    confusion_matrix, 
    accuracy_score,
    precision_score,
    recall_score,
    f1_score
)
import time

def find_project_root(start, markers=('app', 'data')):
    """Locate the project root by walking upward from ``start``.

    Args:
        start: ``pathlib.Path`` of the directory where the search begins.
        markers: Names of sub-directories that must all exist in the root.

    Returns:
        The nearest directory at or above ``start`` that contains every
        marker directory. When none matches, the directory three levels
        above ``start`` is returned, which is what this cell assumed before.
    """
    for candidate in (start, *start.parents):
        if all((candidate / marker).is_dir() for marker in markers):
            return candidate
    # No marker match: keep the historical "three levels up" assumption.
    return start.parent.parent.parent


# Add project root to Python path so the `app` package can be imported,
# regardless of how deep below the root the kernel's working directory is.
project_root = find_project_root(Path.cwd())
sys.path.insert(0, str(project_root))

from app.utils.preprocessor import UNSWNB15Preprocessor
from app.classifiers.svm.model import SVM

## 2. Initialize Model and Preprocessor

In [2]:
# Preprocessing pipeline object for the UNSW-NB15 CSV files.
preprocessor = UNSWNB15Preprocessor()

# Hyperparameters for the project's SVM wrapper, gathered in one place.
# NOTE(review): the original comment says this uses sklearn's optimized SVC for
# memory efficiency on large datasets - confirm against app.classifiers.svm.model.
svm_params = dict(
    C=1.0,
    kernel='rbf',
    gamma='scale',
    max_iter=-1,
    random_state=42,
    cache_size=200,
)
model = SVM(**svm_params)

## 3. Load and Preprocess Data

In [3]:
# Both CSV files are expected under <project_root>/data
data_dir = project_root.joinpath('data')
train_path = data_dir.joinpath('UNSW_NB15_training-set.csv')
test_path = data_dir.joinpath('UNSW_NB15_testing-set.csv')

# load_data receives plain string paths and yields the four raw splits
raw_splits = preprocessor.load_data(train_path=str(train_path), test_path=str(test_path))
X_train, X_test, y_train, y_test = raw_splits

Loading training data from: c:\Users\ACER\OneDrive\Documents\Kerja\self\ensemble-project\data\UNSW_NB15_training-set.csv
Loading testing data from: c:\Users\ACER\OneDrive\Documents\Kerja\self\ensemble-project\data\UNSW_NB15_testing-set.csv
Training samples: 82332
Testing samples: 175341
Features: 42
Classes: 2
Training samples: 82332
Testing samples: 175341
Features: 42
Classes: 2


In [4]:
# Training split goes through fit_transform; the test split only goes through
# transform, so it is processed with whatever was fitted on the training data.
train_outputs = preprocessor.fit_transform(X_train, y_train)
X_train_processed, y_train_encoded = train_outputs

test_outputs = preprocessor.transform(X_test, y_test)
X_test_processed, y_test_encoded = test_outputs


=== Preprocessing Training Data ===
Handling missing values...
Encoding categorical features...
Creating engineered features...
Scaling features...
Encoding labels...
Final feature dimension: 52
Preprocessing complete!

=== Preprocessing Test Data ===
Encoding labels...
Final feature dimension: 52
Preprocessing complete!

=== Preprocessing Test Data ===
Test samples processed: 175341
Preprocessing complete!
Test samples processed: 175341
Preprocessing complete!


## 4. Dataset Information

In [5]:
# Class labels and feature names as reported by the preprocessor.
classes = preprocessor.get_class_names()
class_names = list(map(str, classes))
features = preprocessor.get_feature_names()

# Sizes reused by later cells (and by the saved report).
input_dim = X_train_processed.shape[1]
num_classes = len(classes)

# Assemble the whole summary first, then emit it in one print call.
banner = '=' * 60
summary_lines = [
    f'\n{banner}',
    'DATASET INFORMATION',
    banner,
    f'Input dimension: {input_dim}',
    f'Number of classes: {num_classes}',
    f'Training samples: {X_train_processed.shape[0]}',
    f'Testing samples: {X_test_processed.shape[0]}',
    f'Classes: {class_names}',
]
print('\n'.join(summary_lines))


DATASET INFORMATION
Input dimension: 52
Number of classes: 2
Training samples: 82332
Testing samples: 175341
Classes: ['0', '1']


## 5. Train SVM Model

In [6]:
# Echo the hyperparameters actually stored on the model before fitting.
rule = '=' * 60
print(f'\n{rule}')
print('TRAINING SVM MODEL')
print(rule)
print(f'C (Regularization): {model.C}')
print(f'Kernel: {model.kernel}')
print(f'Gamma: {model.gamma}')
print(f'Max iterations: {model.max_iter}')
print('\nTraining started...')

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed intervals; time.time() follows the wall clock, which can be adjusted
# (NTP sync, manual changes) while a long fit is running.
start_time = time.perf_counter()
model.fit(X_train_processed, y_train_encoded)
training_time = time.perf_counter() - start_time

print(f"Training completed in {training_time:.2f} seconds")


TRAINING SVM MODEL
C (Regularization): 1.0
Kernel: rbf
Gamma: scale
Max iterations: -1

Training started...
Training completed in 98.21 seconds
Training completed in 98.21 seconds


## 6. Make Predictions

In [7]:
# Make predictions
y_train_pred = model.predict(X_train_processed)

if y_test_encoded is not None:
    y_test_pred = model.predict(X_test_processed)
else:
    raise ValueError("y_test_encoded is None - labels are required for evaluation")

## 7. Calculate Performance Metrics

In [8]:
# Calculate metrics
train_accuracy = accuracy_score(y_train_encoded, y_train_pred)
test_accuracy = accuracy_score(y_test_encoded, y_test_pred)

train_precision = precision_score(y_train_encoded, y_train_pred, average='weighted', zero_division=0)
test_precision = precision_score(y_test_encoded, y_test_pred, average='weighted', zero_division=0)

train_recall = recall_score(y_train_encoded, y_train_pred, average='weighted', zero_division=0)
test_recall = recall_score(y_test_encoded, y_test_pred, average='weighted', zero_division=0)

train_f1 = f1_score(y_train_encoded, y_train_pred, average='weighted', zero_division=0)
test_f1 = f1_score(y_test_encoded, y_test_pred, average='weighted', zero_division=0)

## 8. Display Overall Performance Metrics

In [9]:
# Side-by-side table of the train/test metrics computed in the previous cell.
thick_rule = '=' * 60
thin_rule = '-' * 60

metric_rows = [
    ('Accuracy', train_accuracy, test_accuracy),
    ('Precision', train_precision, test_precision),
    ('Recall', train_recall, test_recall),
    ('F1-Score', train_f1, test_f1),
]

print(f'\n{thick_rule}')
print('MODEL EVALUATION')
print(thick_rule)
print('\nOverall Performance Metrics:')
print(thin_rule)
print(f'{"Metric":<20} {"Training":<20} {"Testing":<20}')
print(thin_rule)
# Every row uses the same 20-character, left-aligned columns as the header.
for label, train_value, test_value in metric_rows:
    print(f'{label:<20} {train_value:<20.4f} {test_value:<20.4f}')
print(thin_rule)


MODEL EVALUATION

Overall Performance Metrics:
------------------------------------------------------------
Metric               Training             Testing             
------------------------------------------------------------
Accuracy             0.9391               0.8801              
Precision            0.9397               0.9035              
Recall               0.9391               0.8801              
F1-Score             0.9392               0.8834              
------------------------------------------------------------


## 9. Detailed Classification Report

In [10]:
# Section banner first, then the per-class precision/recall/F1 table for the test set.
section_rule = '=' * 60
print('', section_rule, 'DETAILED CLASSIFICATION REPORT (Test Set)', section_rule, sep='\n')

test_report = classification_report(y_test_encoded, y_test_pred, target_names=class_names)
print(test_report)


DETAILED CLASSIFICATION REPORT (Test Set)
              precision    recall  f1-score   support

           0       0.74      0.96      0.84     56000
           1       0.98      0.84      0.91    119341

    accuracy                           0.88    175341
   macro avg       0.86      0.90      0.87    175341
weighted avg       0.90      0.88      0.88    175341



## 10. Confusion Matrix

In [11]:
# Section banner, then the raw test-set confusion matrix.
section_rule = '=' * 60
print('', section_rule, 'CONFUSION MATRIX (Test Set)', section_rule, sep='\n')

cm = confusion_matrix(y_test_encoded, y_test_pred)

# Legend so the printed array can be read without consulting the sklearn docs.
print('\nRows: True labels, Columns: Predicted labels')
print(f'Classes: {class_names}\n')
print(cm)


CONFUSION MATRIX (Test Set)

Rows: True labels, Columns: Predicted labels
Classes: ['0', '1']

[[ 54012   1988]
 [ 19042 100299]]


## 11. Per-Class Accuracy

In [12]:
# For each class: the fraction of its true test samples that were predicted as that class.
section_rule = '=' * 60
print('', section_rule, 'PER-CLASS ACCURACY (Test Set)', section_rule, sep='\n')

for label_idx, label_name in enumerate(class_names):
    is_label = (y_test_encoded == label_idx)
    support = int(np.sum(is_label))
    if support <= 0:
        # Class absent from the test labels: nothing to report.
        continue
    hits = np.count_nonzero(y_test_pred[is_label] == label_idx)
    label_accuracy = float(hits) / support
    print(f'{label_name:<20}: {label_accuracy:.4f} ({support} samples)')


PER-CLASS ACCURACY (Test Set)
0                   : 0.9645 (56000 samples)
1                   : 0.8404 (119341 samples)


## 12. Save Report to File

In [13]:
# Persist everything shown above (configuration, dataset sizes, metrics,
# classification report, confusion matrix, per-class accuracy) to a text file.
heavy_rule = '=' * 60
light_rule = '-' * 60

print(f'\n{heavy_rule}')
print('SAVING REPORT')
print(heavy_rule)

report_dir = project_root / 'results'
# parents=True: also create any missing intermediate directories.
report_dir.mkdir(parents=True, exist_ok=True)
report_file = report_dir / 'svm_classification_report.txt'

# Calculate confusion matrix for the report
cm = confusion_matrix(y_test_encoded, y_test_pred)

# Explicit UTF-8 so the file does not depend on the platform's default
# encoding (e.g. cp1252 on Windows).
with open(report_file, 'w', encoding='utf-8') as f:
    f.write(f'{heavy_rule}\nSVM CLASSIFICATION REPORT\n{heavy_rule}\n\n')

    # NOTE: this timestamp is taken when the report is written, not when
    # model.fit() ran.
    f.write(f'Training Date: {time.strftime("%Y-%m-%d %H:%M:%S")}\n')
    f.write(f'Training Time: {training_time:.2f} seconds\n\n')

    f.write(f'{heavy_rule}\nMODEL CONFIGURATION\n{heavy_rule}\n')
    f.write(f'C (Regularization): {model.C}\n')
    f.write(f'Kernel: {model.kernel}\n')
    f.write(f'Gamma: {model.gamma}\n')
    f.write(f'Max iterations: {model.max_iter}\n\n')

    f.write(f'{heavy_rule}\nDATASET INFORMATION\n{heavy_rule}\n')
    f.write(f'Input dimension: {input_dim}\n')
    f.write(f'Number of classes: {num_classes}\n')
    f.write(f'Training samples: {X_train_processed.shape[0]}\n')
    f.write(f'Testing samples: {X_test_processed.shape[0]}\n')
    f.write(f'Classes: {", ".join(class_names)}\n\n')

    f.write(f'{heavy_rule}\nOVERALL PERFORMANCE METRICS\n{heavy_rule}\n')
    f.write(f'{"Metric":<20} {"Training":<20} {"Testing":<20}\n')
    f.write(f'{light_rule}\n')
    for label, train_value, test_value in (
        ('Accuracy', train_accuracy, test_accuracy),
        ('Precision', train_precision, test_precision),
        ('Recall', train_recall, test_recall),
        ('F1-Score', train_f1, test_f1),
    ):
        f.write(f'{label:<20} {train_value:<20.4f} {test_value:<20.4f}\n')
    f.write(f'{light_rule}\n\n')

    f.write(f'{heavy_rule}\nDETAILED CLASSIFICATION REPORT (Test Set)\n{heavy_rule}\n')
    report_str = classification_report(y_test_encoded, y_test_pred, target_names=class_names)
    f.write(str(report_str))
    f.write('\n')

    f.write(f'{heavy_rule}\nCONFUSION MATRIX (Test Set)\n{heavy_rule}\n')
    f.write('Rows: True labels, Columns: Predicted labels\n')
    f.write(f'Classes: {", ".join(class_names)}\n\n')
    f.write(str(cm))
    f.write('\n\n')

    f.write(f'{heavy_rule}\nPER-CLASS ACCURACY (Test Set)\n{heavy_rule}\n')
    # Same computation as the per-class accuracy cell, repeated here so this
    # cell only depends on the predictions and encoded labels.
    for i, class_name in enumerate(class_names):
        class_mask = (y_test_encoded == i)
        num_samples = int(np.sum(class_mask))
        if num_samples > 0:
            class_accuracy = float(np.sum(y_test_pred[class_mask] == i)) / num_samples
            f.write(f'{class_name:<20}: {class_accuracy:.4f} ({num_samples} samples)\n')

print(f'\nReport saved to: {report_file}')
print(f'\n{heavy_rule}')
print('TRAINING COMPLETE')
print(heavy_rule)


SAVING REPORT

Report saved to: c:\Users\ACER\OneDrive\Documents\Kerja\self\ensemble-project\results\svm_classification_report.txt

TRAINING COMPLETE
