# MSCNN-LSTM-AE: Two-Stage Unsupervised NIDS

**Stage 1**: Multi-Scale CNN Autoencoder (per-flow spatial features)
**Stage 2**: LSTM Autoencoder (temporal patterns on latent sequences)

- Train: Benign CIC-IDS-2017 only
- Primary eval: CSE-CIC-IDS-2018 (unseen)
- Secondary eval: CIC-IDS-2017, all labels (benign and attack traffic)

In [None]:
# Cell 1: Mount Google Drive & setup
# Mounts Google Drive at /content/drive and creates the project folder that
# the later cells refer to as PROJECT_ROOT.
from google.colab import drive
drive.mount('/content/drive')

import os
# Plain string on purpose: later cells pass it to os.path.join, sys.path and
# CONFIG['runtime']['drive_root'].
PROJECT_ROOT = '/content/drive/MyDrive/mscnn-lstm-ae-nids'
# exist_ok=True makes re-running this cell harmless.
os.makedirs(PROJECT_ROOT, exist_ok=True)
print(f'Project root: {PROJECT_ROOT}')

In [None]:
# Cell 2: Install dependencies
# `!` is the IPython shell escape, so this line only works inside a notebook;
# -q asks pip for quiet output.
!pip install -q pyyaml joblib tqdm seaborn scikit-learn scipy

In [None]:
# Cell 3: Copy source code to Colab (upload from local or clone)
# Option A: Upload the src/ folder to Google Drive at PROJECT_ROOT/src/
# Option B: Clone from git
#
# Puts PROJECT_ROOT at the front of sys.path so `import src...` resolves to
# the uploaded package, then prints the src/ tree as a quick sanity check.

import sys
sys.path.insert(0, PROJECT_ROOT)

# Verify source files exist
src_dir = os.path.join(PROJECT_ROOT, 'src')
if not os.path.isdir(src_dir):
    print(f'ERROR: {src_dir} not found. Upload src/ folder to Google Drive.')
else:
    print('Source directory found')
    for folder, _subfolders, filenames in os.walk(src_dir):
        # Depth below src_dir = number of separators left after stripping the prefix.
        depth = folder.replace(src_dir, '').count(os.sep)
        pad = '  ' * depth
        print(f'{pad}{os.path.basename(folder)}/')
        # Files are listed one indentation step deeper than their folder.
        for name in filenames:
            print(f'{pad}  {name}')

In [None]:
# Cell 4: Configure dataset paths
# Update these paths to match your Google Drive structure
#
# Single nested dict holding every setting used by the notebook. It is passed
# to resolve_paths (Cell 5) and to run_pipeline (Cell 6).
CONFIG = {
    # Runtime settings; drive_root reuses PROJECT_ROOT defined in Cell 1.
    'runtime': {
        'colab_mode': True,
        'drive_root': PROJECT_ROOT,
        'random_seed': 42,
    },
    # Relative locations — presumably resolved against drive_root by
    # resolve_paths; confirm in src.utils.
    'paths': {
        'data_raw_cic': 'data/raw/CIC-IDS2017',
        'data_raw_cse': 'data/raw/CSE-CIC-IDS2018',
        'data_processed': 'data/processed',
        'models_dir': 'models',
        'results_dir': 'results',
    },
    'preprocessing': {
        'drop_columns': ['Flow ID'],
        # Each entry lists alternative column names — presumably the spellings
        # used by the two datasets; verify against the loader.
        'session_columns': {
            'src_ip': ['Source IP', 'Src IP'],
            'dst_ip': ['Destination IP', 'Dst IP'],
            'protocol': ['Protocol'],
            'timestamp': ['Timestamp'],
        },
        'label_candidates': ['Label', 'label', 'Class'],
        'benign_label': 'BENIGN',
        'scaler': 'robust',
        'post_scale_clip': 5.0,
        'fillna_strategy': 'median',
        # NOTE(review): threshold semantics are defined in the pipeline code,
        # not visible here — confirm before tuning.
        'feature_filter': {
            'nzv_threshold': 1e-5,
            'corr_threshold': 0.98,
        },
        'chunksize': 50000,
    },
    # 'auto' values here and below are presumably resolved by the pipeline at
    # run time (Cell 7 prints the effective window size and latent dim) —
    # TODO confirm.
    'windowing': {
        'mode': 'auto',
        'window_size': 5,
        'min_session_length': 3,
        'fallback_mode': 'per_flow',
    },
    # Stage 1 model and training settings.
    'stage1': {
        'latent_dim': 'auto',
        'conv_filters': [32, 32, 32],
        'conv_kernels': [1, 3, 5],
        'reduction_filters': 64,
        'batch_size': 256,
        'epochs': 100,
        'learning_rate': 0.001,
        'clipnorm': 1.0,
        'early_stopping_patience': 10,
        'reduce_lr_patience': 5,
        'reduce_lr_factor': 0.5,
        'min_lr': 1e-6,
    },
    # Stage 2 model and training settings; the training-related values
    # (batch_size through min_lr) are identical to stage1.
    'stage2': {
        'temporal_latent_dim': 'auto',
        'lstm_units': 32,
        'dropout': 0.3,
        'batch_size': 256,
        'epochs': 100,
        'learning_rate': 0.001,
        'clipnorm': 1.0,
        'early_stopping_patience': 10,
        'reduce_lr_patience': 5,
        'reduce_lr_factor': 0.5,
        'min_lr': 1e-6,
    },
    'scoring': {
        'alpha': 0.5,
        'alpha_degenerate': 0.7,
    },
    # Candidate values per thresholding rule — presumably the pipeline picks
    # one of them (Cell 7 prints report['thresholds']['selected']); confirm.
    'threshold': {
        'zscore_k': [1.5, 2.0, 2.5, 3.0],
        'percentiles': [95, 97, 99, 99.5],
        'iqr_k': [1.5, 2.0, 3.0],
        'target_fpr': 0.05,
    },
    'split': {
        'val_size': 0.2,
        'split_by_file': True,
    },
}

In [None]:
# Cell 5: Verify dataset directories
# Resolves the configured paths and lists the CSV files found in each raw
# dataset directory, so missing uploads are spotted before the pipeline runs.
import copy

from src.utils import resolve_paths, get_path
from src.data.loader import list_csv_files

# Deep copy, not CONFIG.copy(): CONFIG is nested, so a shallow copy would share
# the inner dicts (e.g. CONFIG['paths']) with cfg, and any in-place change made
# by resolve_paths would leak into the CONFIG that Cell 6 passes to run_pipeline.
cfg = resolve_paths(copy.deepcopy(CONFIG))

cic_dir = get_path(cfg, 'data_raw_cic')
cse_dir = get_path(cfg, 'data_raw_cse')

print(f'CIC-IDS-2017 dir: {cic_dir}')
print(f'CSE-CIC-IDS-2018 dir: {cse_dir}')

cic_files = list_csv_files(cic_dir)
cse_files = list_csv_files(cse_dir)

# The entries are used via `.name`, i.e. they are expected to be path objects.
print(f'\nCIC CSV files: {len(cic_files)}')
for f in cic_files:
    print(f'  {f.name}')

print(f'\nCSE CSV files: {len(cse_files)}')
for f in cse_files:
    print(f'  {f.name}')

In [None]:
# Cell 6: Run the full pipeline
# Sets up logging at INFO level, runs the pipeline on CONFIG and keeps the
# returned report, which Cell 7 reads.
# NOTE(review): `logging` and `set_global_seed` are imported but not used in
# this cell — presumably run_pipeline applies CONFIG['runtime']['random_seed']
# itself; confirm in src.main.
import logging
from src.utils import setup_logging, set_global_seed
from src.main import run_pipeline

setup_logging('INFO')
# CONFIG itself is passed here, not the cfg produced in Cell 5.
report = run_pipeline(CONFIG)

In [None]:
# Cell 7: Display results summary
# Prints a plain-text digest of the `report` dict returned by run_pipeline in
# Cell 6. Row prefixes are kept as literal strings so the column alignment of
# the printed output stays exactly as written.
import json

banner = '=' * 60
print(banner)
print('RESULTS SUMMARY')
print(banner)

print(f"\nFeatures: {report['n_features_original']} -> {report['n_features_final']}")
for caption, field in (
    ("2D reshape: ", 'reshape_2d'),
    ("Latent dim: ", 'latent_dim'),
    ("Window size: ", 'effective_window_size'),
):
    print(f"{caption}{report[field]}")

# (prefix, report key, format spec) for the rows shared by both stages;
# an empty spec formats the value exactly like a bare f-string field.
stage_rows = (
    ("  Params: ", 'total_params', ''),
    ("  Best val loss: ", 'best_val_loss', '.6f'),
    ("  Epochs: ", 'n_epochs', ''),
)

print("\nStage 1 (MSCNN-AE):")
for prefix, key, spec in stage_rows:
    print(f"{prefix}{report['stage1'][key]:{spec}}")

print(f"\nStage 2 ({report['stage2']['model_type']}):")
for prefix, key, spec in stage_rows:
    print(f"{prefix}{report['stage2'][key]:{spec}}")

chosen = report['thresholds']
print(f"\nThreshold: {chosen['selected']} = {chosen['selected_threshold']:.6f}")

# (prefix, metrics key) for the rows shared by both evaluation datasets.
metric_rows = (
    ("  ROC-AUC: ", 'roc_auc'),
    ("  PR-AUC:  ", 'pr_auc'),
    ("  F1:      ", 'f1'),
    ("  FPR:     ", 'fpr'),
)

print("\nCIC-2017:")
cic = report['cic_metrics']
for prefix, key in metric_rows:
    print(f"{prefix}{cic[key]:.4f}")

print("\nCSE-2018 (PRIMARY):")
cse = report['cse_metrics']
for prefix, key in metric_rows:
    print(f"{prefix}{cse[key]:.4f}")

gen = report['generalization']
print(f"\nGeneralization: {gen['verdict']}")
print(f"  AUC drop: {gen['auc_drop']:.4f}")
print(f"  F1 drop:  {gen['f1_drop']:.4f}")

In [None]:
# Cell 8: Display generated plots
# Shows each expected plot from the results directory inline, and lists the
# ones that are missing. Relies on `cfg` and `get_path` from Cell 5.
from IPython.display import Image, display
from pathlib import Path

# Coerce to Path so the `/` joins below (and in Cells 9-10) work even if
# get_path returns a plain string; Path(Path(...)) is a no-op otherwise.
results_path = Path(get_path(cfg, 'results_dir'))

plots = [
    'stage1_training_curves.png',
    'stage2_training_curves.png',
    'domain_shift.png',
    'session_lengths.png',
    'roc_curves_combined.png',
    'pr_curves_combined.png',
    'cic2017_error_dist.png',
    'cse2018_error_dist.png',
    'cic2017_cm.png',
    'cse2018_cm.png',
    'cic2017_dr.png',
    'cse2018_dr.png',
    'cic2017_violin.png',
    'cse2018_violin.png',
    'threshold_comparison.png',
]

for p in plots:
    fp = results_path / p
    if fp.exists():
        print(f'\n--- {p} ---')
        display(Image(filename=str(fp), width=700))
    else:
        # A missing plot is reported, not raised: not every run produces all of them.
        print(f'  [not found] {p}')

In [None]:
# Cell 9: Per-attack detection rates
# Loads and displays the per-dataset detection-rate tables from the results
# directory. Relies on `results_path` and `display` from Cell 8.
import pandas as pd

for ds in ['cic2017', 'cse2018']:
    dr_path = results_path / f'{ds}_detection_rates.csv'
    if dr_path.exists():
        print(f'\n{ds.upper()} Detection Rates:')
        dr = pd.read_csv(dr_path)
        display(dr)
    else:
        # Report the gap instead of skipping silently, in the same format
        # Cell 8 uses for missing plots.
        print(f'  [not found] {dr_path.name}')

In [None]:
# Cell 10: Domain shift details
# Displays the first 15 rows of the domain-shift feature table. Relies on
# `results_path` and `display` from Cell 8.
import pandas as pd  # imported here so the cell does not depend on Cell 9

shift_path = results_path / 'domain_shift_features.csv'
if shift_path.exists():
    shift_df = pd.read_csv(shift_path)
    print('Top 15 features with highest domain shift:')
    # head(15) takes the first rows as stored — assumes the CSV is already
    # sorted by shift, highest first; TODO confirm against the writer.
    display(shift_df.head(15))
else:
    # Report the gap instead of skipping silently, in the same format
    # Cell 8 uses for missing plots.
    print(f'  [not found] {shift_path.name}')