# Network Intrusion Detection - Master Training Notebook

This notebook trains all 4 models:
1. **Isolation Forest** - Fast classical baseline (~3s)
2. **One-Class SVM** - High precision classical method (~1s)
3. **Autoencoder** - Deep learning baseline (~40s)
4. **VAE** - Advanced deep learning (~5-10 min)

**Training Data:** Monday BENIGN (50K samples)

**Test Data:** Wednesday DoS/DDoS (100K samples)

**Features:**
- ✅ Automatic checkpointing (deep learning models)
- ✅ Early stopping
- ✅ Model persistence
- ✅ Comprehensive evaluation
- ✅ Visualizations

**Estimated Runtime:** 15-20 minutes total

## Setup

In [1]:
import sys
import time
import pickle
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import (
    precision_score, recall_score, f1_score,
    confusion_matrix, classification_report
)

# Add src to path
sys.path.append(str(Path.cwd().parent / 'src'))

from data.preprocessing import CICIDS2017Preprocessor
from models.isolation_forest import IsolationForestDetector
from models.one_class_svm import OneClassSVMDetector
from models.autoencoder import AutoencoderDetector
from models.vae import VAEDetector

# Plot defaults for every figure in this notebook: seaborn's white-grid
# theme and a 12x6 default figure size.
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

# Reaching this line means every import above resolved.
print("‚úÖ Imports complete")

ModuleNotFoundError: No module named 'pandas'

## Configuration

In [None]:
# Every project folder is resolved relative to the parent of the current
# working directory.
_project_root = Path.cwd().parent
DATA_DIR = _project_root / 'data' / 'raw'
MODELS_DIR = _project_root / 'models'
CHECKPOINTS_DIR = _project_root / 'checkpoints'
RESULTS_DIR = _project_root / 'results'

# Make sure the output folders exist. CHECKPOINTS_DIR has to come before its
# per-model sub-folders because mkdir() is called without parents=True.
for _output_dir in (
    MODELS_DIR,
    CHECKPOINTS_DIR,
    CHECKPOINTS_DIR / 'autoencoder',
    CHECKPOINTS_DIR / 'vae',
    RESULTS_DIR,
):
    _output_dir.mkdir(exist_ok=True)

# Sample sizes and the seed used for the random draws below
TRAIN_SAMPLES = 50000  # Monday BENIGN
TEST_SAMPLES = 100000  # Wednesday DoS/DDoS
RANDOM_STATE = 42

print(f"üìÅ Data directory: {DATA_DIR}")
print(f"üìÅ Models directory: {MODELS_DIR}")
print(f"üìÅ Checkpoints directory: {CHECKPOINTS_DIR}")
print(f"üìÅ Results directory: {RESULTS_DIR}")
print("\n‚úÖ Directories configured")

## Load and Preprocess Data

In [None]:
# --- Read the two raw day captures from DATA_DIR ---------------------------
print("Loading Monday BENIGN data (training)...")
_monday_csv = DATA_DIR / 'Monday-WorkingHours.pcap_ISCX.csv'
df_monday = pd.read_csv(_monday_csv)
print(f"Monday data shape: {df_monday.shape}")

print("\nLoading Wednesday DoS/DDoS data (testing)...")
_wednesday_csv = DATA_DIR / 'Wednesday-workingHours.pcap_ISCX.csv'
df_wednesday = pd.read_csv(_wednesday_csv)
print(f"Wednesday data shape: {df_wednesday.shape}")

# --- Seeded down-sampling ---------------------------------------------------
# Each draw is capped at the frame's own length so a short file cannot make
# sample() ask for more rows than exist.
print(f"\nSampling {TRAIN_SAMPLES} training samples...")
_n_train = min(TRAIN_SAMPLES, len(df_monday))
df_train = df_monday.sample(n=_n_train, random_state=RANDOM_STATE)

print(f"Sampling {TEST_SAMPLES} test samples...")
_n_test = min(TEST_SAMPLES, len(df_wednesday))
df_test = df_wednesday.sample(n=_n_test, random_state=RANDOM_STATE)

# --- Summary ----------------------------------------------------------------
print("\n‚úÖ Data loaded")
print(f"   Training samples: {len(df_train)}")
print(f"   Test samples: {len(df_test)}")
print("   Test attack distribution:")
# The label column is addressed with its leading space (' Label').
print(df_test[' Label'].value_counts())

In [None]:
# Preprocess data
# Turns the sampled DataFrames into (features, labels) pairs. The
# preprocessor is fitted on the training sample only and then re-used,
# already fitted, on the test sample.
print("Preprocessing data...")
preprocessor = CICIDS2017Preprocessor()

# Fit on training data (BENIGN only)
X_train, y_train = preprocessor.fit_transform(df_train)
print(f"Training data shape: {X_train.shape}")
# Sanity check: prints the distinct label values and their counts.
print(f"Training labels (should be all BENIGN=0): {np.unique(y_train, return_counts=True)}")

# Transform test data
# transform() (not fit_transform()) so nothing is re-fitted on test rows.
X_test, y_test = preprocessor.transform(df_test)
print(f"\nTest data shape: {X_test.shape}")
print(f"Test labels (0=BENIGN, 1=ATTACK): {np.unique(y_test, return_counts=True)}")
# With 0/1 labels the mean is the fraction of attack rows.
print(f"Attack rate in test set: {y_test.mean():.1%}")

print(f"\n‚úÖ Preprocessing complete")

## Helper Functions

In [None]:
def evaluate_model(y_true, y_pred, model_name):
    """Score binary predictions, print a report and return the metrics.

    Labels follow the notebook's convention: 0 = BENIGN, 1 = ATTACK, with
    ATTACK as the positive class.

    Args:
        y_true: Ground-truth 0/1 labels.
        y_pred: Predicted 0/1 labels, aligned with ``y_true``.
        model_name: Display name used in the printed report and stored
            under the ``'model'`` key of the result.

    Returns:
        dict with keys ``'model'``, ``'precision'``, ``'recall'``, ``'f1'``,
        ``'fp_rate'``, ``'tn'``, ``'fp'``, ``'fn'``, ``'tp'``. ``'fp_rate'``
        is NaN when ``y_true`` contains no benign samples.
    """
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    
    print(f"\n{'='*60}")
    print(f"{model_name} Results")
    print(f"{'='*60}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall:    {recall:.4f}")
    print(f"F1 Score:  {f1:.4f}")
    
    # Check quality gate
    if f1 >= 0.85:
        print(f"‚úÖ QUALITY GATE PASSED (F1 >= 0.85)")
    else:
        print(f"‚ùå QUALITY GATE FAILED (F1 < 0.85, gap: {0.85-f1:.4f})")
    
    # Confusion matrix
    # labels=[0, 1] pins the matrix to 2x2. Without it, a run where only one
    # class shows up in y_true/y_pred yields a 1x1 matrix and the four-way
    # unpacking below raises ValueError.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    tn, fp, fn, tp = cm.ravel()
    # FPR is undefined when there are no true negatives or false positives
    # at all; report NaN explicitly rather than relying on a 0/0 division.
    negatives = fp + tn
    fp_rate = fp / negatives if negatives else float('nan')
    
    print(f"\nConfusion Matrix:")
    print(f"  TN: {tn:,} | FP: {fp:,}")
    print(f"  FN: {fn:,} | TP: {tp:,}")
    print(f"\nFalse Positive Rate: {fp_rate:.2%}")
    print(f"{'='*60}")
    
    return {
        'model': model_name,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'fp_rate': fp_rate,
        'tn': tn,
        'fp': fp,
        'fn': fn,
        'tp': tp
    }

def save_results(results, filename):
    """Pickle ``results`` into RESULTS_DIR under ``filename`` and report it.

    Args:
        results: Any picklable object (the notebook passes metric dicts).
        filename: File name, joined onto RESULTS_DIR.
    """
    destination = RESULTS_DIR / filename
    with destination.open('wb') as sink:
        pickle.dump(results, sink)
    print(f"\nüíæ Results saved to {filename}")

print("‚úÖ Helper functions defined")

## 1. Train Isolation Forest

In [None]:
print("\n" + "=" * 80, "TRAINING ISOLATION FOREST", "=" * 80, sep="\n")

# The clock starts before the detector is built, so the reported time covers
# construction plus fitting.
start_time = time.time()

if_detector = IsolationForestDetector(
    n_estimators=100,
    contamination=0.1,
    random_state=RANDOM_STATE,
)

print("Training Isolation Forest...")
if_detector.fit(X_train)

training_time = time.time() - start_time
print(f"‚úÖ Training complete in {training_time:.2f}s")

# Score the held-out test set
print("Predicting on test set...")
y_pred_if = if_detector.predict(X_test)

# Metrics from evaluate_model plus the wall-clock training time
if_results = {
    **evaluate_model(y_test, y_pred_if, "Isolation Forest"),
    'training_time': training_time,
}

# Persist the fitted detector
model_path = MODELS_DIR / 'isolation_forest_final.pkl'
if_detector.save(model_path)
print(f"üíæ Model saved to {model_path}")

# Persist the metrics
save_results(if_results, 'isolation_forest_results.pkl')

## 2. Train One-Class SVM

In [None]:
print("\n" + "=" * 80, "TRAINING ONE-CLASS SVM", "=" * 80, sep="\n")

start_time = time.time()

# Only the leading 20,000 rows of X_train are used, to keep the SVM fit fast.
X_train_ocsvm = X_train[:20000]

ocsvm_detector = OneClassSVMDetector(kernel='rbf', gamma='scale', nu=0.01)

print(f"Training One-Class SVM on {len(X_train_ocsvm)} samples...")
ocsvm_detector.fit(X_train_ocsvm)

training_time = time.time() - start_time
print(f"‚úÖ Training complete in {training_time:.2f}s")

# Score the held-out test set
print("Predicting on test set...")
y_pred_ocsvm = ocsvm_detector.predict(X_test)

# Metrics from evaluate_model plus the wall-clock training time
ocsvm_results = {
    **evaluate_model(y_test, y_pred_ocsvm, "One-Class SVM"),
    'training_time': training_time,
}

# Persist the fitted detector
model_path = MODELS_DIR / 'ocsvm_final.pkl'
ocsvm_detector.save(model_path)
print(f"üíæ Model saved to {model_path}")

# Persist the metrics
save_results(ocsvm_results, 'ocsvm_results.pkl')

## 3. Train Autoencoder

In [None]:
print("\n" + "="*80)
print("TRAINING AUTOENCODER")
print("="*80)

start_time = time.time()

# Train
ae_detector = AutoencoderDetector(
    encoding_dims=[40, 20],
    dropout_rate=0.2,
    l2_reg=1e-5
)

print("Training Autoencoder...")
print("  - Checkpoints will be saved to checkpoints/autoencoder/")
print("  - Early stopping enabled (patience=10)")
print("  - This will take ~40 seconds...\n")

# Train with checkpointing
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

# Fit settings, named once so the checkpoint cadence below and the fit()
# call cannot drift apart.
batch_size = 256
validation_split = 0.2
checkpoint_every_n_epochs = 5

# ModelCheckpoint's `period` argument is deprecated in tf.keras 2.x and not
# accepted by Keras 3, so "every 5 epochs" is expressed through `save_freq`,
# which counts batches.
# NOTE(review): assumes AutoencoderDetector.fit forwards batch_size and
# validation_split to Keras Model.fit unchanged, so that
# floor(len(X_train) * (1 - validation_split)) rows are trained on - confirm.
n_fit_samples = int(len(X_train) * (1 - validation_split))
steps_per_epoch = max(1, -(-n_fit_samples // batch_size))  # ceiling division

checkpoint_callback = ModelCheckpoint(
    str(CHECKPOINTS_DIR / 'autoencoder' / 'ae_checkpoint_epoch_{epoch:02d}.h5'),
    save_freq=checkpoint_every_n_epochs * steps_per_epoch,  # Save every 5 epochs
    verbose=1
)

# Stop once val_loss has not improved for 10 epochs and roll back to the
# best weights seen.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True,
    verbose=1
)

history = ae_detector.fit(
    X_train,
    epochs=100,
    batch_size=batch_size,
    validation_split=validation_split,
    callbacks=[checkpoint_callback, early_stopping],
    verbose=1
)

training_time = time.time() - start_time
print(f"\n‚úÖ Training complete in {training_time:.2f}s")

# Predict
print("Predicting on test set...")
y_pred_ae = ae_detector.predict(X_test)

# Evaluate
ae_results = evaluate_model(y_test, y_pred_ae, "Autoencoder")
ae_results['training_time'] = training_time
# Keep the per-epoch training curves alongside the metrics.
ae_results['history'] = history.history

# Save model
model_path = MODELS_DIR / 'autoencoder_final.h5'
# NOTE(review): metadata_path is assigned but never used in this cell;
# presumably save() derives the companion .pkl path itself - confirm.
metadata_path = MODELS_DIR / 'autoencoder_final.pkl'
ae_detector.save(model_path)
print(f"üíæ Model saved to {model_path}")

# Save results
save_results(ae_results, 'autoencoder_results.pkl')

## 4. Train VAE

In [None]:
print("\n" + "="*80)
print("TRAINING VARIATIONAL AUTOENCODER (VAE)")
print("="*80)

start_time = time.time()

# Train
vae_detector = VAEDetector(
    latent_dim=20,
    encoder_dims=[50, 30],
    kl_weight=0.001,
    dropout_rate=0.2,
    l2_reg=1e-5
)

print("Training VAE...")
print("  - Checkpoints will be saved to checkpoints/vae/")
print("  - Early stopping enabled (patience=10)")
print("  - This will take ~5-10 minutes...\n")

# Train with checkpointing
# Imported here as well so this cell runs on its own, without depending on
# the autoencoder cell having been executed first.
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

# Fit settings, named once so the checkpoint cadence below and the fit()
# call cannot drift apart.
batch_size = 256
validation_split = 0.2
checkpoint_every_n_epochs = 5

# ModelCheckpoint's `period` argument is deprecated in tf.keras 2.x and not
# accepted by Keras 3, so "every 5 epochs" is expressed through `save_freq`,
# which counts batches.
# NOTE(review): assumes VAEDetector.fit forwards batch_size and
# validation_split to Keras Model.fit unchanged, so that
# floor(len(X_train) * (1 - validation_split)) rows are trained on - confirm.
n_fit_samples = int(len(X_train) * (1 - validation_split))
steps_per_epoch = max(1, -(-n_fit_samples // batch_size))  # ceiling division

checkpoint_callback = ModelCheckpoint(
    str(CHECKPOINTS_DIR / 'vae' / 'vae_checkpoint_epoch_{epoch:02d}.h5'),
    save_freq=checkpoint_every_n_epochs * steps_per_epoch,  # Save every 5 epochs
    verbose=1
)

# Stop once val_loss has not improved for 10 epochs and roll back to the
# best weights seen.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True,
    verbose=1
)

history = vae_detector.fit(
    X_train,
    epochs=100,
    batch_size=batch_size,
    validation_split=validation_split,
    callbacks=[checkpoint_callback, early_stopping],
    verbose=1
)

training_time = time.time() - start_time
print(f"\n‚úÖ Training complete in {training_time:.2f}s ({training_time/60:.1f} minutes)")

# Predict
print("Predicting on test set...")
y_pred_vae = vae_detector.predict(X_test)

# Evaluate
vae_results = evaluate_model(y_test, y_pred_vae, "VAE")
vae_results['training_time'] = training_time
# Keep the per-epoch training curves alongside the metrics.
vae_results['history'] = history.history

# Save model
model_path = MODELS_DIR / 'vae_final.h5'
# NOTE(review): metadata_path is assigned but never used in this cell;
# presumably save() derives the companion .pkl path itself - confirm.
metadata_path = MODELS_DIR / 'vae_final.pkl'
vae_detector.save(model_path)
print(f"üíæ Model saved to {model_path}")

# Save results
save_results(vae_results, 'vae_results.pkl')

## Summary Comparison

In [None]:
# Compile all results
# One row per model. The deep models also carry a 'history' entry, which
# becomes a column that is NaN for the two classical models.
all_results = pd.DataFrame([
    if_results,
    ocsvm_results,
    ae_results,
    vae_results
])

print("\n" + "="*80)
print("FINAL COMPARISON")
print("="*80)
print(all_results[['model', 'f1', 'precision', 'recall', 'fp_rate', 'training_time']].to_string(index=False))

# Find best model
best_model = all_results.loc[all_results['f1'].idxmax()]
print(f"\nüèÜ BEST MODEL: {best_model['model']} (F1={best_model['f1']:.4f})")

# Check quality gate
if best_model['f1'] >= 0.85:
    print(f"‚úÖ QUALITY GATE PASSED - Ready for production!")
else:
    gap = 0.85 - best_model['f1']
    print(f"‚ùå QUALITY GATE NOT MET - Need F1 improvement of {gap:.4f}")
    print(f"\nRecommendations:")
    print(f"  - Try ensemble methods (combine OCSVM + VAE)")
    print(f"  - Hyperparameter tuning")
    print(f"  - Train on full dataset (530K samples vs current 50K)")
    print(f"  - Feature engineering")

# Save final comparison
# The per-epoch 'history' dicts are left out of the CSV: they would be
# written as opaque Python-repr blobs in a single cell, and they are already
# stored in the per-model results pickles. errors='ignore' keeps this working
# if no row carries a history.
all_results.drop(columns=['history'], errors='ignore').to_csv(
    RESULTS_DIR / 'final_comparison.csv', index=False
)
print(f"\nüíæ Final comparison saved to results/final_comparison.csv")

# Visualize
fig, axes = plt.subplots(1, 2, figsize=(15, 5))

# F1 scores
axes[0].barh(all_results['model'], all_results['f1'])
axes[0].axvline(0.85, color='red', linestyle='--', label='Quality Gate (0.85)')
axes[0].set_xlabel('F1 Score')
axes[0].set_title('Model Comparison - F1 Scores')
axes[0].legend()
axes[0].grid(True, alpha=0.3)

# Precision vs Recall
axes[1].scatter(all_results['recall'], all_results['precision'], s=200)
# Label each point with its model name, nudged 5 points up and right.
for name, recall_value, precision_value in zip(
    all_results['model'], all_results['recall'], all_results['precision']
):
    axes[1].annotate(name, (recall_value, precision_value),
                     xytext=(5, 5), textcoords='offset points')
axes[1].set_xlabel('Recall')
axes[1].set_ylabel('Precision')
axes[1].set_title('Precision vs Recall')
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig(RESULTS_DIR / 'final_comparison.png', dpi=300, bbox_inches='tight')
print(f"üíæ Comparison plot saved to results/final_comparison.png")
plt.show()

## Next Steps

### 1. Review Results
- Check the comparison table and plots above
- Identify which model performed best
- Check if quality gate (F1 >= 0.85) was met

### 2. Analyze Per-Attack Performance
- Open `notebooks/03_model_comparison.ipynb`
- Analyze per-attack-type detection rates
- Identify which attacks are hardest to detect

### 3. Report Back
Come back with:
- Best model name and F1 score
- Whether quality gate was met
- Any issues encountered during training

### 4. Decide Next Phase
**If F1 >= 0.85:**
- Proceed to API development
- Deploy to production

**If F1 < 0.85:**
- Try ensemble methods
- Hyperparameter tuning
- Train on full dataset
- Feature engineering