# Network Anomaly Detection - Training on Google Colab

This notebook is set up for training the network anomaly detection model on Google Colab with GPU support.


## 1. Setup and Installation


In [None]:
# Install dependencies
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
!pip install numpy pandas scikit-learn matplotlib seaborn tqdm huggingface-hub transformers tensorboard pyyaml wandb xgboost joblib


In [None]:
# Clone repository or upload files
# Option 1: If using git
!git clone https://github.com/ajipalar/network_anomaly_detection.git

# Option 2: Upload files manually using Colab's file upload
# Then unzip if needed
# !unzip network_anomaly_detection.zip


In [None]:
%cd network_anomaly_detection

## 3. Check GPU Availability


In [None]:
import torch

# Report whether CUDA is usable and, if so, which device and CUDA build are in use.
has_cuda = torch.cuda.is_available()
print(f"CUDA available: {has_cuda}")
if has_cuda:
    gpu_report = (
        f"GPU: {torch.cuda.get_device_name(0)}",  # device index 0
        f"CUDA version: {torch.version.cuda}",
    )
    print("\n".join(gpu_report))


## 4. Configure Training


In [None]:
# Update config for Colab if needed.
# Relies on `torch` having been imported by the GPU availability cell above.
import yaml

# Load config
with open('config.yaml', 'r') as f:
    config = yaml.safe_load(f)

# Update device settings for Colab
config['device']['use_cuda'] = torch.cuda.is_available()
config['device']['cuda_device'] = 0  # CUDA device index 0

# Enable augmented data usage
config['data']['use_augmented_data'] = True

# Optionally enable wandb logging (requires wandb login)
# config['logging']['use_wandb'] = True  # Uncomment to enable

# Model type selection
# Set to "pytorch" for neural network or "xgboost" for gradient boosting
# config['model']['type'] = "xgboost"  # Uncomment to train XGBoost instead

# Save updated config.
# NOTE: the file is re-serialized from the loaded dict, so YAML comments in the
# original config.yaml are not preserved and keys are written in sorted order.
with open('config.yaml', 'w') as f:
    yaml.dump(config, f, default_flow_style=False)

print("Configuration updated for Colab")
print(f"CUDA enabled: {config['device']['use_cuda']}")
print(f"Using augmented data: {config['data']['use_augmented_data']}")
# Optional settings are read defensively so that a missing key cannot raise
# a KeyError after the file has already been rewritten.
print(f"Model type: {config['model'].get('type', 'pytorch')}")
print(f"W&B logging: {(config.get('logging') or {}).get('use_wandb', 'not set')}")


## 5. Data Augmentation


In [None]:
# Run data augmentation
# This will create augmented training/validation data and hold out test data
# (this cell only prints a status line; augment_data.py is invoked in the next cell)
print("Running data augmentation...")

In [None]:
# Run the repository's augmentation script.
# NOTE(review): flag semantics are defined in augment_data.py, which is not shown here;
# presumably --noise-scale sets the magnitude of injected noise and
# --target-positive-ratio the desired fraction of positive samples — confirm against the script.
!python augment_data.py  --noise-scale 0.1 --target-positive-ratio 0.3 

In [None]:
# Tell the reader which files to look for after the augmentation step.
augmentation_summary = [
    "\nAugmentation complete! Check data/augmented/ directory for:",
    "  - train_data_augmented.csv",
    "  - val_data.csv",
    "  - test_data.csv",
]
print("\n".join(augmentation_summary))

In [None]:
# Point the saved configuration back at the PyTorch model before neural network training.
# (`yaml` is expected to be in scope from the earlier configuration cell.)

with open('config.yaml') as config_in:
    config = yaml.safe_load(config_in)

# Select the PyTorch model type
config['model'].update(type='pytorch')

# Write the modified configuration back to disk
with open('config.yaml', 'w') as config_out:
    yaml.dump(config, stream=config_out, default_flow_style=False)

print("Configuration reset to PyTorch for neural network training")
print(f"Model type: {config['model']['type']}")


In [None]:
## 6. Train PyTorch Model (K-Fold Cross Validation)


# Train PyTorch model with K-fold cross validation
# Note: Make sure config['model']['type'] is set to "pytorch" (default)
# This will create models for each fold in checkpoints/fold_1/, fold_2/, etc.

In [None]:
# Run train.py against config.yaml with the --use-cv flag (K-fold cross validation,
# per this notebook's section heading; see train.py for the flag's exact behavior).
!python train.py --config config.yaml --use-cv

In [None]:
# Status message only: it is printed unconditionally and does not check
# whether the train.py run in the previous cell succeeded.
print("\nPyTorch K-fold CV complete! Check checkpoints/ directory for fold models.")

In [None]:
## 7. Select Best Model and Train Final PyTorch Model


# Train final PyTorch model using best weights from k-fold CV
# This script automatically:
# 1. Finds all fold checkpoints
# 2. Selects the best fold (lowest validation loss)
# 3. Initializes model with those weights
# 4. Trains on full training data (train + val combined)
# 5. Saves to checkpoints/final_model/best_model.pt

In [None]:
# Announce the final-model training step; the training command itself is in the next cell.
print("Training final PyTorch model from best CV fold...")

In [None]:
# Run train_final_model.py against the same config.yaml that the cross-validation run used.
!python train_final_model.py --config config.yaml

In [None]:
# Status message only: it is printed unconditionally and does not check
# whether the train_final_model.py run in the previous cell succeeded.
print("\nFinal PyTorch model training complete! Model saved to checkpoints/final_model/best_model.pt")

In [None]:
## 8. Train XGBoost Model


In [None]:
# Train XGBoost model
# First, update config to use XGBoost
with open('config.yaml', 'r') as f:
    config = yaml.safe_load(f)

# Set model type to XGBoost
config['model']['type'] = 'xgboost'

# Save updated config
with open('config.yaml', 'w') as f:
    yaml.dump(config, f, default_flow_style=False)

print("Configuration updated for XGBoost training")
print(f"Model type: {config['model']['type']}")

# Train XGBoost model
print("\nStarting XGBoost training...")
!python train.py --config config.yaml

print("\nXGBoost training complete! Model saved to checkpoints/best_xgboost_model.pkl")


## 9. Test PyTorch Models (K-Fold and Final)


In [None]:
# Test each PyTorch fold model on the test set
import os
import glob
import subprocess

checkpoint_dir = "checkpoints"
fold_pattern = os.path.join(checkpoint_dir, "fold_*", "best_model.pt")
fold_checkpoints = glob.glob(fold_pattern)

print(f"Found {len(fold_checkpoints)} PyTorch fold checkpoints to test\n")

fold_results = []
for checkpoint_path in sorted(fold_checkpoints):
    fold_num = checkpoint_path.split('/')[-2].split('_')[1]
    print(f"{'='*60}")
    print(f"Testing PyTorch Fold {fold_num}")
    print(f"{'='*60}")
    print(f"Checkpoint: {checkpoint_path}\n")
    
    # Test this fold model
    result = subprocess.run(
        ['python', 'test.py', '--config', 'config.yaml', '--checkpoint', checkpoint_path],
        capture_output=True,
        text=True
    )
    print(result.stdout)
    if result.stderr:
        print("Errors:", result.stderr)
    
    print(f"\nFold {fold_num} testing complete!\n")

print(f"\n{'='*60}")
print("All PyTorch fold models tested!")
print(f"{'='*60}")

# Test final PyTorch model
print("\n" + "="*60)
print("Testing Final PyTorch Model")
print("="*60)
!python test.py --config config.yaml --final-model --model-name "final_model_pytorch"

print("\nFinal PyTorch model testing complete!")


## 10. Test XGBoost Model


In [None]:
# Test XGBoost model
print("Testing XGBoost model...")
!python test.py --config config.yaml --checkpoint checkpoints/best_xgboost_model.pkl --model-name "xgboost"

print("\nXGBoost model testing complete!")


## 11. Compare Model Results


In [None]:
# View test results CSV to compare all models
import pandas as pd

try:
    results_df = pd.read_csv('test_results.csv')
    print("="*60)
    print("Model Comparison Results")
    print("="*60)
    print("\nResults sorted by precision (descending):")
    print(results_df.sort_values('precision', ascending=False).to_string(index=False))
    
    print("\n\nSummary Statistics:")
    print(results_df[['accuracy', 'precision', 'recall', 'f1_score', 'auc']].describe())
    
except FileNotFoundError:
    print("test_results.csv not found. Run test.py first to generate results.")


## 12. Monitor Training with TensorBoard


In [None]:
# Load TensorBoard extension
%load_ext tensorboard

# Start TensorBoard to view all training logs
# This will show:
# - K-fold CV training logs (runs/cv_fold_1_*, cv_fold_2_*, etc.)
# - Final model training logs (runs/final_model_*)
# - Test evaluation logs (runs/test_*)
# Note: XGBoost models don't use TensorBoard, only PyTorch models
%tensorboard --logdir runs --port 6006
