# Anomaly Detection in Cloud Network Traffic using CSE-CIC-IDS2018 Dataset

## Deep Learning Comparison: LSTM, CNN, and Autoencoder Models

This notebook implements an end-to-end anomaly detection system for network traffic analysis using the CSE-CIC-IDS2018 dataset. We compare three deep learning approaches:

1. **LSTM** - For sequential analysis of network flow features
2. **1D CNN** - For local feature extraction from flow vectors  
3. **Autoencoder** - For unsupervised anomaly detection

### Project Objectives:
- Handle severe class imbalance in network traffic data
- Implement scalable deep learning models
- Provide comprehensive comparative analysis
- Target high accuracy (approximately 94% for the LSTM and 92% for the 1D CNN), judged with evaluation metrics that remain meaningful under class imbalance

### Dataset: CSE-CIC-IDS2018
- Contains realistic network traffic with various attack types
- Includes DoS, Botnet, Brute Force, Web Attacks, and Normal traffic
- Highly imbalanced dataset requiring special handling techniques

## 1. Environment Setup and Dependencies

Install and import all required libraries for the anomaly detection pipeline.

In [None]:
# Install required packages (uncomment for first run)
# !pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
# !pip install scikit-learn==1.3.0 pandas==2.0.3 numpy==1.24.3 matplotlib==3.7.2 seaborn==0.12.2
# !pip install imbalanced-learn==0.11.0 tqdm==4.65.0 plotly==5.15.0

# Core libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import os
import pickle
from tqdm import tqdm
import time
from collections import Counter

# Deep learning frameworks
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset, Dataset
import torch.nn.functional as F

# Scikit-learn for preprocessing and metrics
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, StandardScaler
from sklearn.metrics import (classification_report, confusion_matrix, accuracy_score,
                           precision_recall_fscore_support, roc_auc_score, 
                           precision_recall_curve, roc_curve, auc)
from sklearn.utils.class_weight import compute_class_weight

# Imbalanced learning
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline as ImbPipeline

# Visualization
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Reproducibility: a single shared seed for NumPy and for PyTorch (CPU and GPU)
RANDOM_STATE = 42
for _seed_rng in (np.random.seed, torch.manual_seed):
    _seed_rng(RANDOM_STATE)
if torch.cuda.is_available():
    # Seed the current GPU first, then every visible GPU
    torch.cuda.manual_seed(RANDOM_STATE)
    torch.cuda.manual_seed_all(RANDOM_STATE)

# Compute device: CUDA when a GPU is visible, CPU otherwise
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")
if torch.cuda.is_available():
    _gpu_total_bytes = torch.cuda.get_device_properties(0).total_memory
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"Memory: {_gpu_total_bytes / 1024**3:.1f} GB")

# Plot appearance, and silence library warnings for the rest of the session
plt.style.use('default')
sns.set_palette("husl")
warnings.filterwarnings('ignore')

# Final status report
print("Environment setup complete!")
print(f"PyTorch version: {torch.__version__}")
print(f"Device: {device}")

## 2. Dataset Loading and Exploration

Load the CSE-CIC-IDS2018 dataset and perform initial exploration.

In [None]:
# Dataset configuration
# Directory that is expected to hold the CSE-CIC-IDS2018 CSV files.
# load_cic_ids2018_data() switches to synthetic data when this path is missing.
DATA_PATH = "./CSE-CIC-IDS2018/"  # Update this path to your dataset location

# CSE-CIC-IDS2018 files mapping: traffic category -> list of CSV file names.
# The same file name may appear under more than one category.
# NOTE(review): nothing in the visible code reads this mapping; the loader
# lists every *.csv in DATA_PATH unless a file_list is passed explicitly.
# NOTE(review): these are literal on-disk names, so the "Thuesday" spelling and
# the "_CIC_IoT" suffix are left untouched here - confirm both, and the
# category-to-day assignment, against the actual dataset directory.
DATASET_FILES = {
    "Normal": ["Thuesday-20-02-2018_TrafficForML_CIC_IoT.csv"],
    "DoS": ["Thuesday-20-02-2018_TrafficForML_CIC_IoT.csv", 
            "Wednesday-21-02-2018_TrafficForML_CIC_IoT.csv"],
    "DDoS": ["Thursday-22-02-2018_TrafficForML_CIC_IoT.csv"],
    "Botnet": ["Friday-23-02-2018_TrafficForML_CIC_IoT.csv"],
    "Web_Attack": ["Thursday-22-02-2018_TrafficForML_CIC_IoT.csv"],
    "Brute_Force": ["Wednesday-28-02-2018_TrafficForML_CIC_IoT.csv", 
                    "Thursday-01-03-2018_TrafficForML_CIC_IoT.csv"],
    "Infiltration": ["Friday-02-03-2018_TrafficForML_CIC_IoT.csv"]
}

def load_cic_ids2018_data(data_path, sample_size=None, file_list=None):
    """
    Load CSE-CIC-IDS2018 dataset from CSV files

    Falls back to create_synthetic_network_data() when data_path is not a
    directory or when none of the requested files could be read.

    Args:
        data_path: Path to the dataset directory
        sample_size: Maximum number of rows to return (None for all). Each
            file is capped at sample_size rows while loading, and the combined
            frame is sampled down to sample_size rows afterwards.
        file_list: Specific file names inside data_path to load (None loads
            every .csv file in the directory, in sorted order)

    Returns:
        pandas.DataFrame: Combined dataset
    """
    print("Loading CSE-CIC-IDS2018 dataset...")

    # For demo purposes, create synthetic data if files not available
    if not os.path.isdir(data_path):
        print("Dataset path not found. Creating synthetic data for demonstration...")
        return create_synthetic_network_data(sample_size or 10000)

    # Sort the directory listing so the load order (and therefore the sampled
    # rows) does not depend on the filesystem's enumeration order.
    if file_list is None:
        file_list = sorted(f for f in os.listdir(data_path) if f.endswith('.csv'))

    all_data = []
    for file in file_list:
        file_path = os.path.join(data_path, file)
        if not os.path.exists(file_path):
            continue

        print(f"Loading {file}...")
        try:
            df = pd.read_csv(file_path)
        except Exception as e:
            # Deliberately best-effort: one unreadable file must not abort the
            # whole load, so report it and carry on with the remaining files.
            print(f"Error loading {file}: {e}")
            continue

        # Cap every file, not only the first one, so memory stays bounded.
        if sample_size:
            df = df.sample(n=min(sample_size, len(df)), random_state=RANDOM_STATE)
        all_data.append(df)

    if not all_data:
        print("No data files found. Creating synthetic data...")
        return create_synthetic_network_data(sample_size or 10000)

    combined_df = pd.concat(all_data, ignore_index=True)

    # With several files the per-file caps can still add up to more than
    # sample_size rows; trim the combined frame to the requested size.
    if sample_size and len(combined_df) > sample_size:
        combined_df = combined_df.sample(
            n=sample_size, random_state=RANDOM_STATE
        ).reset_index(drop=True)

    print(f"Successfully loaded {len(combined_df)} samples from {len(all_data)} files")
    return combined_df

def create_synthetic_network_data(n_samples=10000):
    """
    Build a synthetic, class-imbalanced stand-in for CIC-IDS2018 flow records.

    NumPy's global RNG is reseeded with RANDOM_STATE on every call, so two
    calls with the same n_samples produce identical frames.

    Args:
        n_samples: Number of rows to generate

    Returns:
        pandas.DataFrame: the flow-feature columns followed by 'Label',
        'Protocol', 'Source Port' and 'Destination Port'
    """
    print(f"Creating {n_samples} synthetic network traffic samples...")

    np.random.seed(RANDOM_STATE)

    # Column names modelled on the CIC-IDS2018 flow features
    flow_columns = [
        'Flow Duration', 'Total Fwd Packets', 'Total Backward Packets',
        'Total Length of Fwd Packets', 'Total Length of Bwd Packets',
        'Fwd Packet Length Max', 'Fwd Packet Length Min', 'Fwd Packet Length Mean',
        'Bwd Packet Length Max', 'Bwd Packet Length Min', 'Bwd Packet Length Mean',
        'Flow Bytes/s', 'Flow Packets/s', 'Flow IAT Mean', 'Flow IAT Std',
        'Fwd IAT Total', 'Fwd IAT Mean', 'Fwd IAT Std', 'Bwd IAT Total',
        'Bwd IAT Mean', 'Bwd IAT Std', 'Fwd PSH Flags', 'Bwd PSH Flags',
        'Fwd URG Flags', 'Bwd URG Flags', 'Fwd Header Length', 'Bwd Header Length',
        'Fwd Packets/s', 'Bwd Packets/s', 'Min Packet Length', 'Max Packet Length',
        'Packet Length Mean', 'Packet Length Std', 'Packet Length Variance',
        'FIN Flag Count', 'SYN Flag Count', 'RST Flag Count', 'PSH Flag Count',
        'ACK Flag Count', 'URG Flag Count', 'CWE Flag Count', 'ECE Flag Count',
        'Down/Up Ratio', 'Average Packet Size', 'Avg Fwd Segment Size',
        'Avg Bwd Segment Size', 'Fwd Avg Bytes/Bulk', 'Fwd Avg Packets/Bulk',
        'Fwd Avg Bulk Rate', 'Bwd Avg Bytes/Bulk', 'Bwd Avg Packets/Bulk',
        'Bwd Avg Bulk Rate', 'Subflow Fwd Packets', 'Subflow Fwd Bytes',
        'Subflow Bwd Packets', 'Subflow Bwd Bytes', 'Init Win bytes forward',
        'Init Win bytes backward', 'act data pkt fwd', 'min seg size forward',
        'Active Mean', 'Active Std', 'Active Max', 'Active Min',
        'Idle Mean', 'Idle Std', 'Idle Max', 'Idle Min'
    ]

    # Keyword rules, checked in order; the first rule whose keyword occurs in
    # the column name decides which distribution the column is drawn from.
    sampling_rules = (
        # Packet size features - log-normal distribution
        (('Bytes', 'Length', 'Size'),
         lambda: np.random.lognormal(mean=6, sigma=2, size=n_samples)),
        # Time-based features - exponential distribution
        (('Time', 'IAT', 'Duration'),
         lambda: np.random.exponential(scale=1000, size=n_samples)),
        # Flag counts - Poisson distribution
        (('Flag', 'Count'),
         lambda: np.random.poisson(lam=2, size=n_samples)),
        # Rate features - gamma distribution
        (('Ratio', '/s'),
         lambda: np.random.gamma(shape=2, scale=0.5, size=n_samples)),
    )

    def draw_column(column_name):
        """Sample one column according to the first matching keyword rule."""
        for keywords, sampler in sampling_rules:
            if any(keyword in column_name for keyword in keywords):
                return sampler()
        # Anything unmatched - standard normal distribution
        return np.random.normal(loc=0, scale=1, size=n_samples)

    # Columns are drawn strictly in list order so the RNG stream is stable
    columns = {name: draw_column(name) for name in flow_columns}

    # Labels: one dominant benign class and many rare attack classes
    label_names = ['BENIGN', 'DoS Hulk', 'DoS GoldenEye', 'DoS slowloris', 'DoS Slowhttptest',
                   'DDoS', 'Bot', 'FTP-Patator', 'SSH-Patator', 'Web Attack – Brute Force',
                   'Web Attack – XSS', 'Web Attack – Sql Injection', 'Infiltration']
    label_weights = [0.8, 0.05, 0.03, 0.02, 0.02, 0.02, 0.01, 0.01, 0.01, 0.01, 0.01, 0.005, 0.005]
    columns['Label'] = np.random.choice(label_names, size=n_samples, p=label_weights)

    # Connection metadata columns
    columns['Protocol'] = np.random.choice(
        ['TCP', 'UDP', 'ICMP'], size=n_samples, p=[0.7, 0.25, 0.05]
    )
    columns['Source Port'] = np.random.randint(1024, 65535, size=n_samples)
    columns['Destination Port'] = np.random.choice(
        [80, 443, 22, 21, 25, 53, 110, 143],
        size=n_samples,
        p=[0.3, 0.3, 0.1, 0.05, 0.05, 0.1, 0.05, 0.05],
    )

    frame = pd.DataFrame(columns)

    # Give attack rows a weak signature: doubled SYN counts, halved durations
    is_attack = frame['Label'] != 'BENIGN'
    for column, factor in (('SYN Flag Count', 2), ('Flow Duration', 0.5)):
        frame.loc[is_attack, column] *= factor

    return frame

# Load the dataset
print("=" * 60)
print("CSE-CIC-IDS2018 DATASET LOADING")
print("=" * 60)

# Load data; the loader only switches to synthetic data when DATA_PATH is
# missing or holds no readable CSV files.
df = load_cic_ids2018_data(DATA_PATH, sample_size=50000)

print(f"\nDataset shape: {df.shape}")
print(f"Features: {df.shape[1] - 1}")  # Excluding label column
print(f"Samples: {df.shape[0]}")

# Display basic info
print("\nFirst few rows:")
print(df.head())

print("\nDataset info:")
# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() would append a stray "None" line to the output.
df.info()

In [None]:
# Explore class distribution
def _print_banner(title, leading_newline=True):
    """Print a section title framed by two 60-character rules."""
    rule = "=" * 60
    print("\n" + rule if leading_newline else rule)
    print(title)
    print(rule)


_print_banner("CLASS DISTRIBUTION ANALYSIS", leading_newline=False)

# How many samples carry each label
label_counts = df['Label'].value_counts()
print("\nClass Distribution:")
print(label_counts)

# Benign versus attack totals
total_samples = len(df)
benign_samples = label_counts.get('BENIGN', 0)
attack_samples = total_samples - benign_samples

print(f"\nClass Imbalance Analysis:")
print(f"Total samples: {total_samples:,}")
print(f"Benign samples: {benign_samples:,} ({benign_samples/total_samples*100:.1f}%)")
print(f"Attack samples: {attack_samples:,} ({attack_samples/total_samples*100:.1f}%)")
print(f"Imbalance ratio (Benign:Attack): {benign_samples/attack_samples:.1f}:1")

# Two side-by-side views of the class distribution
fig, axes = plt.subplots(1, 2, figsize=(15, 6))
bar_ax, pie_ax = axes

# Left: one bar per class
label_counts.plot(kind='bar', ax=bar_ax, color='skyblue', alpha=0.8)
bar_ax.set_title('Class Distribution', fontsize=14, fontweight='bold')
bar_ax.set_xlabel('Attack Type')
bar_ax.set_ylabel('Number of Samples')
bar_ax.tick_params(axis='x', rotation=45)
bar_ax.grid(True, alpha=0.3)

# Right: pie of the eight largest classes, the remainder lumped into 'Others'
top_classes = label_counts.iloc[:8]
other_count = label_counts.iloc[8:].sum()
if other_count > 0:
    top_classes['Others'] = other_count

top_classes.plot(kind='pie', ax=pie_ax, autopct='%1.1f%%', startangle=90)
pie_ax.set_title('Class Distribution (Top Classes)', fontsize=14, fontweight='bold')
pie_ax.set_ylabel('')

plt.tight_layout()
plt.show()

# Missing values per column
_print_banner("MISSING VALUES ANALYSIS")

missing_values = df.isnull().sum()
missing_percent = (missing_values / len(df)) * 100

missing_df = pd.concat(
    [missing_values.rename('Missing Count'),
     missing_percent.rename('Missing Percentage')],
    axis=1,
).sort_values('Missing Count', ascending=False)

print(f"\nColumns with missing values: {(missing_values > 0).sum()}")
print(f"Total missing values: {missing_values.sum()}")

if missing_values.sum() > 0:
    print("\nTop 10 columns with missing values:")
    print(missing_df.head(10))
else:
    print("No missing values found!")

# Infinite values per numeric column
_print_banner("INFINITE VALUES ANALYSIS")

numeric_cols = df.select_dtypes(include=[np.number]).columns
_inf_totals = ((col, np.isinf(df[col]).sum()) for col in numeric_cols)
infinite_counts = {col: total for col, total in _inf_totals if total > 0}

if infinite_counts:
    print(f"Columns with infinite values: {len(infinite_counts)}")
    # Ten worst columns, largest count first
    worst_columns = sorted(infinite_counts, key=infinite_counts.get, reverse=True)[:10]
    for col in worst_columns:
        print(f"{col}: {infinite_counts[col]} infinite values")
else:
    print("No infinite values found!")

# Column dtypes
_print_banner("DATA TYPES ANALYSIS")

print("\nData types summary:")
dtype_counts = df.dtypes.value_counts()
print(dtype_counts)

categorical_cols = df.select_dtypes(include=['object']).columns.tolist()

print(f"\nNumerical features: {len(numeric_cols)}")
print(f"Categorical features: {len(categorical_cols)}")

# Cardinality of every categorical column; small value sets are listed in full
print(f"\nCategorical columns: {categorical_cols}")

for col in categorical_cols:
    n_unique = df[col].nunique()
    print(f"{col}: {n_unique} unique values")
    if n_unique <= 10:
        print(f"  Values: {df[col].unique()}")
    print()

## 3. Data Preprocessing and Feature Engineering

Clean the dataset and prepare features for model training.

In [None]:
def preprocess_cic_ids2018(df):
    """
    Comprehensive preprocessing for CSE-CIC-IDS2018 dataset

    Steps: drop unlabeled rows and impute missing values, replace infinite
    values, drop duplicate rows, label-encode categorical features, encode the
    target, then remove near-constant and highly correlated features.

    Args:
        df: Raw dataset DataFrame (not modified; a copy is processed)

    Returns:
        X: Processed features as a float32 array
        y: Integer-encoded labels, or None when df has no 'Label' column
        feature_names: List of feature names (the columns of X, in order)
        label_encoder: Fitted label encoder for the target (None without labels)
        label_encoders: Dict mapping each encoded categorical feature column
            to its fitted LabelEncoder
    """
    print("=" * 60)
    print("DATA PREPROCESSING PIPELINE")
    print("=" * 60)

    # Make a copy to avoid modifying original data
    df_processed = df.copy()
    has_label = 'Label' in df_processed.columns

    # 1. Handle missing values
    print("Step 1: Handling missing values...")
    initial_missing = df_processed.isnull().sum().sum()
    print(f"Initial missing values: {initial_missing}")

    # Labels are never imputed, so rows without one are unusable for training;
    # leaving them in would also make LabelEncoder fail on mixed str/NaN.
    if has_label:
        unlabeled = df_processed['Label'].isnull()
        if unlabeled.any():
            print(f"Dropping {int(unlabeled.sum())} rows with a missing label")
            df_processed = df_processed.loc[~unlabeled].copy()

    numeric_cols = [
        col for col in df_processed.select_dtypes(include=[np.number]).columns
        if col != 'Label'
    ]
    for col in numeric_cols:
        column = df_processed[col]
        # Take the median over finite values only, so that +/-inf entries
        # (handled in step 2) cannot distort the fill value.
        finite_median = column.replace([np.inf, -np.inf], np.nan).median()
        df_processed[col] = column.fillna(finite_median)

    categorical_cols = [
        col for col in df_processed.select_dtypes(include=['object']).columns
        if col != 'Label'
    ]
    for col in categorical_cols:
        modes = df_processed[col].mode()
        # mode() is empty for an all-missing column; nothing to fill with then
        if not modes.empty:
            df_processed[col] = df_processed[col].fillna(modes.iloc[0])

    final_missing = df_processed.isnull().sum().sum()
    print(f"Final missing values: {final_missing}")

    # 2. Handle infinite values
    print("\nStep 2: Handling infinite values...")
    infinite_count = 0
    for col in numeric_cols:
        column = df_processed[col]
        pos_inf = column == np.inf
        neg_inf = column == -np.inf
        n_infinite = int(pos_inf.sum() + neg_inf.sum())
        if n_infinite == 0:
            continue

        infinite_count += n_infinite
        finite_values = column[~(pos_inf | neg_inf)]
        # Clamp to the finite range: +inf -> column maximum, -inf -> minimum
        if pos_inf.any():
            df_processed.loc[pos_inf, col] = finite_values.max()
        if neg_inf.any():
            df_processed.loc[neg_inf, col] = finite_values.min()

    print(f"Infinite values handled: {infinite_count}")

    # 3. Remove duplicate rows
    print("\nStep 3: Removing duplicates...")
    initial_rows = len(df_processed)
    df_processed = df_processed.drop_duplicates()
    final_rows = len(df_processed)
    print(f"Duplicates removed: {initial_rows - final_rows}")

    # 4. Handle categorical features
    print("\nStep 4: Encoding categorical features...")

    # Separate features and labels
    if has_label:
        X = df_processed.drop(columns='Label')
        y = df_processed['Label']
    else:
        X = df_processed
        y = None

    categorical_features = X.select_dtypes(include=['object']).columns
    print(f"Categorical features to encode: {list(categorical_features)}")

    # Simple label encoding for categorical features
    label_encoders = {}
    for col in categorical_features:
        le = LabelEncoder()
        X[col] = le.fit_transform(X[col].astype(str))
        label_encoders[col] = le
        print(f"  {col}: {len(le.classes_)} unique values")

    # 5. Encode target labels
    if y is not None:
        print("\nStep 5: Encoding target labels...")
        label_encoder = LabelEncoder()
        y_encoded = label_encoder.fit_transform(y)

        print(f"Classes: {len(label_encoder.classes_)}")
        print("Label mapping:")
        for i, label in enumerate(label_encoder.classes_):
            print(f"  {i}: {label}")
    else:
        y_encoded = None
        label_encoder = None

    # 6. Feature selection and cleaning
    print("\nStep 6: Feature selection...")

    # Remove features with very low variance
    numeric_features = X.select_dtypes(include=[np.number]).columns
    low_variance_features = [col for col in numeric_features if X[col].var() < 1e-8]

    if low_variance_features:
        print(f"Removing {len(low_variance_features)} low-variance features")
        X = X.drop(columns=low_variance_features)

    # Remove highly correlated features (correlation > 0.95)
    print("\nRemoving highly correlated features...")
    numeric_features = X.select_dtypes(include=[np.number]).columns

    if len(numeric_features) > 1:
        corr_matrix = X[numeric_features].corr().abs()
        # Keep only the strict upper triangle so each pair is inspected once
        # and the later column of a correlated pair is the one dropped.
        upper_triangle = corr_matrix.where(
            np.triu(np.ones(corr_matrix.shape), k=1).astype(bool)
        )

        high_corr_features = [
            column for column in upper_triangle.columns
            if (upper_triangle[column] > 0.95).any()
        ]

        if high_corr_features:
            print(f"Removing {len(high_corr_features)} highly correlated features")
            X = X.drop(columns=high_corr_features)

    # 7. Final feature preparation
    feature_names = X.columns.tolist()
    X_array = X.values.astype(np.float32)

    print(f"\nFinal dataset shape: {X_array.shape}")
    print(f"Features selected: {len(feature_names)}")

    if y_encoded is not None:
        print(f"Classes: {len(np.unique(y_encoded))}")
        class_counts = pd.Series(y_encoded).value_counts().sort_index()
        print("Class distribution:")
        for class_idx, count in class_counts.items():
            class_name = label_encoder.classes_[class_idx]
            print(f"  {class_idx} ({class_name}): {count} samples ({count/len(y_encoded)*100:.1f}%)")

    return X_array, y_encoded, feature_names, label_encoder, label_encoders

# Run the preprocessing pipeline on the loaded frame
(X, y, feature_names,
 label_encoder, categorical_encoders) = preprocess_cic_ids2018(df)

# Summarize the resulting feature matrix and label vector
print(f"\n{'=' * 60}")
print("PREPROCESSING COMPLETE")
print("=" * 60)
for _summary_line in (
    f"Feature matrix shape: {X.shape}",
    f"Label vector shape: {y.shape}",
    f"Data type: {X.dtype}",
    f"Memory usage: {X.nbytes / 1024**2:.2f} MB",
):
    print(_summary_line)

## 4. Class Imbalance Handling

Address the severe class imbalance using SMOTE, undersampling, and weighted loss functions.

In [None]:
def _class_display_name(class_idx):
    """Return the readable name of an encoded class, or its index as text."""
    try:
        # label_encoder is the notebook-level encoder fitted during preprocessing
        return label_encoder.classes_[class_idx]
    except (NameError, AttributeError, IndexError):
        return str(class_idx)


def _smote_k_neighbors(min_class_size, cap):
    """Pick a SMOTE k_neighbors no larger than the smallest class allows."""
    if min_class_size > 1:
        return int(min(cap, min_class_size - 1))
    return 1


def handle_class_imbalance(X, y, strategy='combined', sampling_ratio=0.3,
                           majority_keep_ratio=0.7):
    """
    Handle class imbalance using various techniques

    Args:
        X: Feature matrix
        y: Label vector
        strategy: 'smote', 'undersample', 'combined', or 'none'
        sampling_ratio: Only used by 'combined': minority classes are
            oversampled up to int(majority_count * sampling_ratio) samples
        majority_keep_ratio: Only used by 'combined': after oversampling, any
            class larger than int(largest_count * majority_keep_ratio) is
            undersampled down to that size

    Returns:
        X_resampled, y_resampled: Resampled data
        class_weights: float32 tensor of 'balanced' class weights computed on
            the ORIGINAL labels, for use in weighted loss functions

    Raises:
        ValueError: If strategy is not one of the supported names
    """
    valid_strategies = ('none', 'smote', 'undersample', 'combined')
    if strategy not in valid_strategies:
        raise ValueError(
            f"Unknown strategy {strategy!r}; expected one of {valid_strategies}"
        )

    print("=" * 60)
    print("CLASS IMBALANCE HANDLING")
    print("=" * 60)

    # Calculate class weights for weighted loss
    unique_classes = np.unique(y)
    class_weights = compute_class_weight(
        class_weight='balanced',
        classes=unique_classes,
        y=y
    )
    weight_tensor = torch.tensor(class_weights, dtype=torch.float32)

    print("Original class distribution:")
    original_counts = pd.Series(y).value_counts().sort_index()
    # sort_index() yields the classes in the same order as np.unique(y), so
    # the position in this loop indexes class_weights while class_idx is the
    # actual encoded label (the two differ when label values are not 0..n-1).
    for position, (class_idx, count) in enumerate(original_counts.items()):
        class_name = _class_display_name(class_idx)
        weight = class_weights[position]
        print(f"  Class {class_idx} ({class_name}): {count} samples (weight: {weight:.3f})")

    print(f"\nClass weight tensor: {weight_tensor}")

    if strategy == 'none':
        print("\nNo resampling applied.")
        return X, y, weight_tensor

    # Apply resampling strategies
    print(f"\nApplying resampling strategy: {strategy}")
    min_class_size = original_counts.min()

    if strategy == 'smote':
        # Use SMOTE for oversampling minority classes;
        # k_neighbors is limited by the smallest class size
        smote = SMOTE(
            sampling_strategy='auto',
            k_neighbors=_smote_k_neighbors(min_class_size, cap=5),
            random_state=RANDOM_STATE
        )
        X_resampled, y_resampled = smote.fit_resample(X, y)

    elif strategy == 'undersample':
        # Use random undersampling for majority class
        undersampler = RandomUnderSampler(
            sampling_strategy='auto',
            random_state=RANDOM_STATE
        )
        X_resampled, y_resampled = undersampler.fit_resample(X, y)

    else:  # 'combined'
        # First, apply SMOTE to bring minority classes up to target_count
        class_counts = pd.Series(y).value_counts()
        target_count = int(class_counts.max() * sampling_ratio)
        smote_strategy = {
            class_idx: target_count
            for class_idx, count in class_counts.items()
            if count < target_count
        }

        if smote_strategy:
            smote = SMOTE(
                sampling_strategy=smote_strategy,
                k_neighbors=_smote_k_neighbors(min_class_size, cap=3),
                random_state=RANDOM_STATE
            )
            X_temp, y_temp = smote.fit_resample(X, y)
        else:
            X_temp, y_temp = X, y

        # Then apply undersampling to reduce the majority class
        temp_counts = pd.Series(y_temp).value_counts()
        max_count = int(temp_counts.max() * majority_keep_ratio)
        undersample_strategy = {
            class_idx: max_count
            for class_idx, count in temp_counts.items()
            if count > max_count
        }

        if undersample_strategy:
            undersampler = RandomUnderSampler(
                sampling_strategy=undersample_strategy,
                random_state=RANDOM_STATE
            )
            X_resampled, y_resampled = undersampler.fit_resample(X_temp, y_temp)
        else:
            X_resampled, y_resampled = X_temp, y_temp

    print("\nResampled class distribution:")
    resampled_counts = pd.Series(y_resampled).value_counts().sort_index()
    for class_idx, count in resampled_counts.items():
        class_name = _class_display_name(class_idx)
        change = count - original_counts.get(class_idx, 0)
        print(f"  Class {class_idx} ({class_name}): {count} samples (change: {change:+d})")

    print(f"\nDataset size change: {len(X)} → {len(X_resampled)} ({len(X_resampled)/len(X):.2f}x)")

    return X_resampled, y_resampled, weight_tensor


# Apply different resampling strategies
print("Evaluating different resampling strategies...\n")

# Strategy 1: No resampling (baseline)
X_baseline, y_baseline, class_weights = handle_class_imbalance(X, y, strategy='none')

# Strategy 2: SMOTE only
X_smote, y_smote, _ = handle_class_imbalance(X, y, strategy='smote')

# Strategy 3: Combined SMOTE + Undersampling (recommended for this dataset)
X_combined, y_combined, _ = handle_class_imbalance(X, y, strategy='combined', sampling_ratio=0.3)

# Visualize the impact of resampling
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# Original distribution
original_counts = pd.Series(y).value_counts().sort_index()
axes[0, 0].bar(range(len(original_counts)), original_counts.values, alpha=0.7, color='skyblue')
axes[0, 0].set_title('Original Distribution')
axes[0, 0].set_ylabel('Sample Count')
axes[0, 0].grid(True, alpha=0.3)

# SMOTE distribution
smote_counts = pd.Series(y_smote).value_counts().sort_index()
axes[0, 1].bar(range(len(smote_counts)), smote_counts.values, alpha=0.7, color='lightgreen')
axes[0, 1].set_title('SMOTE Resampled')
axes[0, 1].grid(True, alpha=0.3)

# Combined distribution
combined_counts = pd.Series(y_combined).value_counts().sort_index()
axes[1, 0].bar(range(len(combined_counts)), combined_counts.values, alpha=0.7, color='coral')
axes[1, 0].set_title('Combined SMOTE + Undersampling')
axes[1, 0].set_xlabel('Class Index')
axes[1, 0].set_ylabel('Sample Count')
axes[1, 0].grid(True, alpha=0.3)

# Comparison chart
comparison_data = pd.DataFrame({
    'Original': original_counts,
    'SMOTE': smote_counts,
    'Combined': combined_counts
}).fillna(0)

comparison_data.plot(kind='bar', ax=axes[1, 1], alpha=0.8)
axes[1, 1].set_title('Resampling Strategy Comparison')
axes[1, 1].set_xlabel('Class Index')
axes[1, 1].legend()
axes[1, 1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Choose the best resampling strategy for training
print("\n" + "=" * 60)
print("SELECTING RESAMPLING STRATEGY")
print("=" * 60)
print("Using combined SMOTE + undersampling for optimal balance")
print("This strategy will be used for model training")

# Use combined strategy for final dataset
# NOTE(review): X_final/y_final are resampled BEFORE the train/val/test split
# in the next cell, so synthetic SMOTE samples can end up in the validation
# and test sets; consider splitting first and resampling the training part only.
X_final, y_final = X_combined, y_combined

print(f"\nFinal dataset for training:")
print(f"Shape: {X_final.shape}")
print(f"Classes: {len(np.unique(y_final))}")
# The figure printed here is the standard deviation of the per-class counts
print(f"Balance improvement: {pd.Series(y_final).value_counts().std():.2f} (lower is better)")

## 5. Data Splitting and Normalization

Split the dataset and apply feature normalization for optimal model performance.

In [None]:
def create_train_val_test_split(X, y, test_size=0.2, val_size=0.2, random_state=42):
    """
    Create stratified train/validation/test splits

    Args:
        X: Feature matrix
        y: Label vector
        test_size: Proportion of the FULL dataset used for the test set
        val_size: Proportion of the FULL dataset used for the validation set
            (it is rescaled internally for the second split)
        random_state: Random seed

    Returns:
        X_train, X_val, X_test, y_train, y_val, y_test

    Raises:
        ValueError: If the two proportions leave no data for training
    """
    if not (0 < test_size < 1 and 0 < val_size < 1 and test_size + val_size < 1):
        raise ValueError(
            "test_size and val_size must each lie in (0, 1) and sum to less than 1"
        )

    print("=" * 60)
    print("DATA SPLITTING AND NORMALIZATION")
    print("=" * 60)

    # First split: train+val vs test
    X_temp, X_test, y_temp, y_test = train_test_split(
        X, y,
        test_size=test_size,
        random_state=random_state,
        stratify=y
    )

    # Second split: train vs val. val_size is a share of the full dataset, so
    # rescale it to a share of what is left after removing the test set.
    val_size_adjusted = val_size / (1 - test_size)
    X_train, X_val, y_train, y_val = train_test_split(
        X_temp, y_temp,
        test_size=val_size_adjusted,
        random_state=random_state,
        stratify=y_temp
    )

    print(f"Dataset splitting complete:")
    print(f"  Training set: {X_train.shape[0]} samples ({X_train.shape[0]/len(X)*100:.1f}%)")
    print(f"  Validation set: {X_val.shape[0]} samples ({X_val.shape[0]/len(X)*100:.1f}%)")
    print(f"  Test set: {X_test.shape[0]} samples ({X_test.shape[0]/len(X)*100:.1f}%)")

    # Verify stratification
    print("\nClass distribution verification:")
    for split_name, y_split in [("Train", y_train), ("Val", y_val), ("Test", y_test)]:
        split_dist = pd.Series(y_split).value_counts(normalize=True).sort_index()
        print(f"  {split_name}: {split_dist.values}")

    return X_train, X_val, X_test, y_train, y_val, y_test


def normalize_features(X_train, X_val, X_test, method='minmax'):
    """
    Normalize features using specified method

    The scaler is fitted on the training data only and then applied to the
    validation and test data, so their scaled values may fall outside the
    training range.

    Args:
        X_train, X_val, X_test: Feature matrices
        method: 'minmax' or 'standard'

    Returns:
        Normalized feature matrices and fitted scaler

    Raises:
        ValueError: If method is neither 'minmax' nor 'standard'
    """
    print(f"\nApplying {method} normalization...")

    if method == 'minmax':
        scaler = MinMaxScaler()
    elif method == 'standard':
        scaler = StandardScaler()
    else:
        raise ValueError("Method must be 'minmax' or 'standard'")

    # Fit scaler on training data only
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)
    X_test_scaled = scaler.transform(X_test)

    print(f"Feature scaling complete!")
    print(f"  Training data range: [{X_train_scaled.min():.3f}, {X_train_scaled.max():.3f}]")
    print(f"  Validation data range: [{X_val_scaled.min():.3f}, {X_val_scaled.max():.3f}]")
    print(f"  Test data range: [{X_test_scaled.min():.3f}, {X_test_scaled.max():.3f}]")

    return X_train_scaled, X_val_scaled, X_test_scaled, scaler


# Create splits
X_train, X_val, X_test, y_train, y_val, y_test = create_train_val_test_split(
    X_final, y_final, test_size=0.2, val_size=0.2
)

# Apply normalization
X_train_scaled, X_val_scaled, X_test_scaled, scaler = normalize_features(
    X_train, X_val, X_test, method='minmax'
)


# Create PyTorch datasets and data loaders
class NetworkTrafficDataset(Dataset):
    """Custom Dataset for network traffic data"""

    def __init__(self, X, y, sequence_length=1):
        # Copy the inputs into tensors of the dtypes the models and the
        # classification loss expect (float32 features, int64 labels).
        self.X = torch.tensor(np.asarray(X), dtype=torch.float32)
        self.y = torch.tensor(np.asarray(y), dtype=torch.long)
        # Stored for callers; not used by __getitem__
        self.sequence_length = sequence_length

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]


def create_data_loaders(X_train, X_val, X_test, y_train, y_val, y_test,
                        batch_size=512, num_workers=0):
    """
    Create PyTorch data loaders

    Args:
        X_train, X_val, X_test: Feature matrices for each split
        y_train, y_val, y_test: Label vectors for each split
        batch_size: Number of samples per batch
        num_workers: Number of DataLoader worker processes

    Returns:
        train_loader, val_loader, test_loader (only the training loader shuffles)
    """
    # Pinned host memory only helps when batches are copied to a GPU
    pin_memory = torch.cuda.is_available()

    def make_loader(features, labels, shuffle):
        """Wrap one split in a dataset and a DataLoader."""
        return DataLoader(
            NetworkTrafficDataset(features, labels),
            batch_size=batch_size,
            shuffle=shuffle,
            num_workers=num_workers,
            pin_memory=pin_memory
        )

    train_loader = make_loader(X_train, y_train, shuffle=True)
    val_loader = make_loader(X_val, y_val, shuffle=False)
    test_loader = make_loader(X_test, y_test, shuffle=False)

    return train_loader, val_loader, test_loader


# Create data loaders
BATCH_SIZE = 512
train_loader, val_loader, test_loader = create_data_loaders(
    X_train_scaled, X_val_scaled, X_test_scaled,
    y_train, y_val, y_test,
    batch_size=BATCH_SIZE
)

print(f"\nData loaders created:")
print(f"  Batch size: {BATCH_SIZE}")
print(f"  Training batches: {len(train_loader)}")
print(f"  Validation batches: {len(val_loader)}")
print(f"  Test batches: {len(test_loader)}")

# Store important variables for model training
num_features = X_train_scaled.shape[1]
num_classes = len(np.unique(y_final))

print(f"\nModel configuration:")
print(f"  Input features: {num_features}")
print(f"  Output classes: {num_classes}")
print(f"  Device: {device}")

# Display feature statistics
print(f"\nFeature statistics after normalization:")
print(f"  Mean: {X_train_scaled.mean():.6f}")
print(f"  Std: {X_train_scaled.std():.6f}")
print(f"  Min: {X_train_scaled.min():.6f}")
print(f"  Max: {X_train_scaled.max():.6f}")

## 6. LSTM Model Implementation

Build LSTM architecture for sequential analysis of network flow features.

In [None]:
class LSTMClassifier(nn.Module):
    """
    LSTM-based classifier for network traffic anomaly detection

    The flat flow-feature vector is zero-padded (when needed) and reshaped
    into a short pseudo-sequence so that LSTM layers can process it step by
    step. The final hidden state(s) of the last LSTM layer are passed to a
    small fully connected classification head that returns raw logits.

    Args:
        input_size: Number of features in each flat input vector (>= 1).
        hidden_size: Hidden units per LSTM direction.
        num_layers: Number of stacked LSTM layers.
        num_classes: Number of output logits.
        dropout: Dropout probability used in the classification head and,
            when ``num_layers > 1``, between LSTM layers.
        bidirectional: Whether the LSTM runs in both directions.

    Raises:
        ValueError: If ``input_size`` is smaller than 1.
    """

    def __init__(self, input_size, hidden_size=128, num_layers=2, num_classes=2,
                 dropout=0.3, bidirectional=True):
        super(LSTMClassifier, self).__init__()

        if input_size < 1:
            raise ValueError(f"input_size must be >= 1, got {input_size}")

        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.num_classes = num_classes
        self.bidirectional = bidirectional

        # Split the feature vector into at most 16 steps. Clamp to at least
        # one step so inputs with fewer than 4 features do not produce a
        # zero sequence length (which would divide by zero below).
        self.sequence_length = max(1, min(16, input_size // 4))

        # Number of zero features appended so the vector divides evenly
        # into `sequence_length` steps (0 when it already does).
        self.padding_size = (-input_size) % self.sequence_length

        self.adjusted_input_size = input_size + self.padding_size
        self.feature_per_step = self.adjusted_input_size // self.sequence_length

        # LSTM layers
        self.lstm = nn.LSTM(
            input_size=self.feature_per_step,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            # Inter-layer dropout only makes sense with more than one layer
            dropout=dropout if num_layers > 1 else 0,
            bidirectional=bidirectional
        )

        # Determine LSTM output size
        lstm_output_size = hidden_size * 2 if bidirectional else hidden_size

        # Classification head
        self.classifier = nn.Sequential(
            nn.Dropout(dropout),
            nn.Linear(lstm_output_size, hidden_size // 2),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_size // 2, num_classes)
        )

        # Initialize weights
        self.init_weights()

    def init_weights(self):
        """Initialize weights: orthogonal for LSTM, Xavier elsewhere, zero biases"""
        for name, param in self.named_parameters():
            if 'weight' in name:
                if 'lstm' in name:
                    nn.init.orthogonal_(param)
                else:
                    nn.init.xavier_uniform_(param)
            elif 'bias' in name:
                nn.init.constant_(param, 0)

    def forward(self, x):
        """
        Compute class logits for a batch of flat feature vectors.

        Args:
            x: Tensor that is reshaped below as (batch_size, input_size).

        Returns:
            Tensor of raw logits produced by the classification head.
        """
        batch_size = x.size(0)

        # Add padding if necessary; new_zeros keeps the input's dtype/device
        if self.padding_size > 0:
            padding = x.new_zeros(batch_size, self.padding_size)
            x = torch.cat([x, padding], dim=1)

        # Reshape input to sequence format
        # (batch_size, features) -> (batch_size, sequence_length, features_per_step)
        x = x.reshape(batch_size, self.sequence_length, self.feature_per_step)

        # LSTM forward pass; only the final hidden states are needed
        _, (hidden, _) = self.lstm(x)

        if self.bidirectional:
            # The last two entries are the final layer's forward and
            # backward hidden states; concatenate them per sample
            hidden = torch.cat([hidden[-2], hidden[-1]], dim=1)
        else:
            hidden = hidden[-1]

        # Classification
        return self.classifier(hidden)

    def get_model_info(self):
        """Return a dict describing the architecture and parameter counts"""
        total_params = sum(p.numel() for p in self.parameters())
        trainable_params = sum(p.numel() for p in self.parameters() if p.requires_grad)

        info = {
            'model_type': 'LSTM',
            'input_size': self.input_size,
            'hidden_size': self.hidden_size,
            'num_layers': self.num_layers,
            'num_classes': self.num_classes,
            'bidirectional': self.bidirectional,
            'sequence_length': self.sequence_length,
            'feature_per_step': self.feature_per_step,
            'total_params': total_params,
            'trainable_params': trainable_params
        }

        return info

# Create LSTM model
print("=" * 60)
print("LSTM MODEL INITIALIZATION")
print("=" * 60)

# Model hyperparameters
LSTM_CONFIG = {
    'input_size': num_features,
    'hidden_size': 128,
    'num_layers': 2,
    'num_classes': num_classes,
    'dropout': 0.3,
    'bidirectional': True
}

# Initialize model
lstm_model = LSTMClassifier(**LSTM_CONFIG).to(device)

# Display model information
model_info = lstm_model.get_model_info()
print("LSTM Model Configuration:")
for key, value in model_info.items():
    print(f"  {key}: {value}")

print(f"\nModel Memory Usage: {sum(p.numel() * p.element_size() for p in lstm_model.parameters()) / 1024**2:.2f} MB")

# Model summary
print("\nModel Architecture:")
print(lstm_model)

# Test forward pass on a single training batch (no gradients needed)
with torch.no_grad():
    sample_batch = next(iter(train_loader))
    sample_input, sample_target = sample_batch
    sample_input = sample_input.to(device)

    output = lstm_model(sample_input)
    print(f"\nForward pass test:")
    print(f"  Input shape: {sample_input.shape}")
    print(f"  Output shape: {output.shape}")
    print(f"  Target shape: {sample_target.shape}")
    print(f"  Output range: [{output.min().item():.3f}, {output.max().item():.3f}]")

## 7. CNN Model Implementation

Create 1D CNN model for local feature extraction from flow vectors.

In [None]:
class CNN1DClassifier(nn.Module):
    """
    1D convolutional classifier for network traffic anomaly detection.

    The flat feature vector is treated as a single-channel sequence. Four
    convolutional stages (64, 128, 256 and 512 channels) extract local
    patterns; the last stage ends in global average pooling so the
    classification head always receives 512 values per sample.

    Args:
        input_size: Number of features in each input vector.
        num_classes: Number of output logits.
        dropout: Dropout probability used after the pooled conv stages and
            inside the classification head.
    """

    def __init__(self, input_size, num_classes=2, dropout=0.3):
        super().__init__()

        self.input_size = input_size
        self.num_classes = num_classes
        # The convolution runs over the features themselves, so the
        # sequence length equals the number of input features.
        self.sequence_length = input_size

        # Stages 1-3: conv -> batch norm -> ReLU -> max pool -> dropout
        stage_channels = (1, 64, 128, 256)
        feature_stack = []
        for in_ch, out_ch in zip(stage_channels[:-1], stage_channels[1:]):
            feature_stack += [
                nn.Conv1d(in_channels=in_ch, out_channels=out_ch, kernel_size=3, padding=1),
                nn.BatchNorm1d(out_ch),
                nn.ReLU(),
                nn.MaxPool1d(kernel_size=2),
                nn.Dropout(dropout),
            ]

        # Stage 4: conv -> batch norm -> ReLU -> global average pooling
        feature_stack += [
            nn.Conv1d(in_channels=256, out_channels=512, kernel_size=3, padding=1),
            nn.BatchNorm1d(512),
            nn.ReLU(),
            nn.AdaptiveAvgPool1d(1),
        ]
        self.conv_layers = nn.Sequential(*feature_stack)

        # Classification head: 512 -> 256 -> 128 -> num_classes
        head_widths = (512, 256, 128)
        head_stack = []
        for in_dim, out_dim in zip(head_widths[:-1], head_widths[1:]):
            head_stack += [nn.Dropout(dropout), nn.Linear(in_dim, out_dim), nn.ReLU()]
        head_stack += [nn.Dropout(dropout), nn.Linear(128, num_classes)]
        self.classifier = nn.Sequential(*head_stack)

        self.init_weights()

    def init_weights(self):
        """Apply Kaiming init to convs, Xavier to linears, identity to batch norms."""
        for layer in self.modules():
            if isinstance(layer, nn.BatchNorm1d):
                nn.init.ones_(layer.weight)
                nn.init.zeros_(layer.bias)
                continue

            if isinstance(layer, nn.Conv1d):
                nn.init.kaiming_normal_(layer.weight, mode='fan_out', nonlinearity='relu')
            elif isinstance(layer, nn.Linear):
                nn.init.xavier_uniform_(layer.weight)
            else:
                continue

            # Conv and linear layers share the same zero-bias rule
            if layer.bias is not None:
                nn.init.zeros_(layer.bias)

    def forward(self, x):
        """Return class logits for a batch of flat feature vectors."""
        n_samples = x.size(0)

        # (batch_size, features) -> (batch_size, 1, features)
        as_sequence = x.unsqueeze(1)

        # Conv stack followed by flattening of the pooled channel outputs
        pooled = self.conv_layers(as_sequence)
        flat = pooled.view(n_samples, -1)

        return self.classifier(flat)

    def get_model_info(self):
        """Return a dict describing the architecture and parameter counts."""
        params = list(self.parameters())
        n_total = sum(p.numel() for p in params)
        n_trainable = sum(p.numel() for p in params if p.requires_grad)

        return {
            'model_type': 'CNN1D',
            'input_size': self.input_size,
            'num_classes': self.num_classes,
            'total_params': n_total,
            'trainable_params': n_trainable
        }

# ---- Build the CNN model and run a quick smoke test ----
print("=" * 60, "CNN MODEL INITIALIZATION", "=" * 60, sep="\n")

# Hyperparameters forwarded to the CNN1DClassifier constructor
CNN_CONFIG = dict(
    input_size=num_features,
    num_classes=num_classes,
    dropout=0.3,
)

# Instantiate the network on the selected device
cnn_model = CNN1DClassifier(**CNN_CONFIG).to(device)

# Report the configuration as described by the model itself
model_info = cnn_model.get_model_info()
print("CNN Model Configuration:")
for key, value in model_info.items():
    print(f"  {key}: {value}")

# Parameter storage in MiB (element count times bytes per element)
print(f"\nModel Memory Usage: {sum(p.numel() * p.element_size() for p in cnn_model.parameters()) / 1024**2:.2f} MB")

# Layer-by-layer summary
print("\nModel Architecture:", cnn_model, sep="\n")

# Push one training batch through the network without tracking gradients
with torch.no_grad():
    sample_batch = next(iter(train_loader))
    sample_input, sample_target = sample_batch
    sample_input = sample_input.to(device)

    output = cnn_model(sample_input)

    print(f"\nForward pass test:")
    print(f"  Input shape: {sample_input.shape}")
    print(f"  Output shape: {output.shape}")
    print(f"  Target shape: {sample_target.shape}")
    print(f"  Output range: [{output.min().item():.3f}, {output.max().item():.3f}]")

## 8. Autoencoder Model Implementation

Design an autoencoder for reconstruction-based anomaly detection, paired with a classification head that is trained on the encoded representation.

In [None]:
class Autoencoder(nn.Module):
    """
    Fully connected autoencoder for reconstruction-based anomaly detection

    The encoder compresses an input vector down to ``encoding_dim`` values
    through a stack of Linear -> ReLU -> Dropout layers; the decoder mirrors
    the same widths in reverse order and ends in a plain Linear layer so the
    reconstruction is unconstrained. Samples that reconstruct poorly can be
    flagged as anomalies by the caller.

    Args:
        input_size: Number of features in each input vector.
        encoding_dim: Width of the bottleneck representation.
        hidden_dims: Optional list of encoder hidden widths, ordered from
            input side to bottleneck side. When ``None`` the widths
            ``[input_size // 2, input_size // 4, encoding_dim * 2]`` are
            used, each input-derived width clamped to at least 1.
        dropout: Dropout probability applied after each hidden layer.
    """

    def __init__(self, input_size, encoding_dim=32, hidden_dims=None, dropout=0.2):
        super(Autoencoder, self).__init__()

        self.input_size = input_size
        self.encoding_dim = encoding_dim

        if hidden_dims is None:
            # Default widths shrink with the input size. Clamp to 1 so very
            # small inputs (input_size < 4) never produce a zero-width layer,
            # which would cut all information flow through the network.
            hidden_dims = [
                max(1, input_size // 2),
                max(1, input_size // 4),
                encoding_dim * 2,
            ]

        self.hidden_dims = hidden_dims

        # Encoder: input -> hidden widths -> bottleneck
        encoder_layers = []
        current_dim = input_size

        for hidden_dim in hidden_dims:
            encoder_layers.extend([
                nn.Linear(current_dim, hidden_dim),
                nn.ReLU(),
                nn.Dropout(dropout)
            ])
            current_dim = hidden_dim

        # Bottleneck layer (ReLU keeps the encoded values non-negative)
        encoder_layers.append(nn.Linear(current_dim, encoding_dim))
        encoder_layers.append(nn.ReLU())

        self.encoder = nn.Sequential(*encoder_layers)

        # Decoder: bottleneck -> hidden widths in reverse -> input size
        decoder_layers = []
        current_dim = encoding_dim

        for hidden_dim in hidden_dims[::-1]:
            decoder_layers.extend([
                nn.Linear(current_dim, hidden_dim),
                nn.ReLU(),
                nn.Dropout(dropout)
            ])
            current_dim = hidden_dim

        # Output layer (no activation for reconstruction)
        decoder_layers.append(nn.Linear(current_dim, input_size))

        self.decoder = nn.Sequential(*decoder_layers)

        # Initialize weights
        self.init_weights()

    def init_weights(self):
        """Initialize every Linear layer with Xavier-uniform weights and zero bias"""
        for module in self.modules():
            if isinstance(module, nn.Linear):
                nn.init.xavier_uniform_(module.weight)
                nn.init.constant_(module.bias, 0)

    def forward(self, x):
        """
        Encode and reconstruct a batch.

        Returns:
            Tuple ``(reconstructed, encoded)``: the decoder output and the
            bottleneck representation it was decoded from.
        """
        encoded = self.encoder(x)
        reconstructed = self.decoder(encoded)

        return reconstructed, encoded

    def encode(self, x):
        """Get encoded representation"""
        return self.encoder(x)

    def decode(self, encoded):
        """Decode from encoded representation"""
        return self.decoder(encoded)

    def get_model_info(self):
        """Return a dict describing the architecture and parameter counts"""
        total_params = sum(p.numel() for p in self.parameters())
        trainable_params = sum(p.numel() for p in self.parameters() if p.requires_grad)

        info = {
            'model_type': 'Autoencoder',
            'input_size': self.input_size,
            'encoding_dim': self.encoding_dim,
            'hidden_dims': self.hidden_dims,
            'total_params': total_params,
            'trainable_params': trainable_params
        }

        return info

class AutoencoderClassifier(nn.Module):
    """
    Autoencoder with an attached classification head.

    The wrapped autoencoder yields a reconstruction and an encoded
    representation; a small two-layer head turns the encoded representation
    into class logits. A helper computes the per-sample reconstruction
    error that can serve as an anomaly score.

    Args:
        input_size: Number of features in each input vector.
        encoding_dim: Width of the autoencoder bottleneck.
        num_classes: Number of output logits.
        dropout: Dropout probability for the autoencoder and the head.
    """

    def __init__(self, input_size, encoding_dim=32, num_classes=2, dropout=0.2):
        super().__init__()

        self.input_size = input_size
        self.encoding_dim = encoding_dim
        self.num_classes = num_classes

        # Reconstruction branch
        self.autoencoder = Autoencoder(input_size, encoding_dim, dropout=dropout)

        # Classification branch operating on the bottleneck features
        head_width = encoding_dim // 2
        self.classifier = nn.Sequential(
            nn.Linear(encoding_dim, head_width),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(head_width, num_classes),
        )

    def forward(self, x):
        """Return ``(reconstructed, logits, encoded)`` for a batch."""
        reconstructed, encoded = self.autoencoder(x)
        return reconstructed, self.classifier(encoded), encoded

    def compute_reconstruction_error(self, x, reconstructed):
        """Return the mean squared reconstruction error of each sample."""
        squared_diff = F.mse_loss(reconstructed, x, reduction='none')
        return squared_diff.mean(dim=1)

    def get_model_info(self):
        """Return a dict describing the architecture and parameter counts."""
        params = list(self.parameters())
        n_total = sum(p.numel() for p in params)
        n_trainable = sum(p.numel() for p in params if p.requires_grad)

        return {
            'model_type': 'AutoencoderClassifier',
            'input_size': self.input_size,
            'encoding_dim': self.encoding_dim,
            'num_classes': self.num_classes,
            'total_params': n_total,
            'trainable_params': n_trainable
        }

# ---- Build the autoencoder-based classifier and run a quick smoke test ----
print("=" * 60, "AUTOENCODER MODEL INITIALIZATION", "=" * 60, sep="\n")

# Hyperparameters forwarded to the AutoencoderClassifier constructor
AUTOENCODER_CONFIG = dict(
    input_size=num_features,
    encoding_dim=64,
    num_classes=num_classes,
    dropout=0.2,
)

# Instantiate the network on the selected device
autoencoder_model = AutoencoderClassifier(**AUTOENCODER_CONFIG).to(device)

# Report the configuration as described by the model itself
model_info = autoencoder_model.get_model_info()
print("Autoencoder Model Configuration:")
for key, value in model_info.items():
    print(f"  {key}: {value}")

# Parameter storage in MiB (element count times bytes per element)
print(f"\nModel Memory Usage: {sum(p.numel() * p.element_size() for p in autoencoder_model.parameters()) / 1024**2:.2f} MB")

# Layer-by-layer summary
print("\nModel Architecture:", autoencoder_model, sep="\n")

# Push one training batch through the network without tracking gradients
with torch.no_grad():
    sample_batch = next(iter(train_loader))
    sample_input, sample_target = sample_batch
    sample_input = sample_input.to(device)

    reconstructed, logits, encoded = autoencoder_model(sample_input)
    # Per-sample anomaly score for the batch
    reconstruction_error = autoencoder_model.compute_reconstruction_error(sample_input, reconstructed)

    print(f"\nForward pass test:")
    print(f"  Input shape: {sample_input.shape}")
    print(f"  Reconstructed shape: {reconstructed.shape}")
    print(f"  Logits shape: {logits.shape}")
    print(f"  Encoded shape: {encoded.shape}")
    print(f"  Reconstruction error shape: {reconstruction_error.shape}")
    print(f"  Reconstruction error range: [{reconstruction_error.min().item():.6f}, {reconstruction_error.max().item():.6f}]")

## 9. Model Training Pipeline

Implement comprehensive training loops for all three models with early stopping and checkpointing.

In [None]:
# Training configuration
TRAINING_CONFIG = {
    'epochs': 50,
    'learning_rate': 0.001,
    'weight_decay': 1e-5,
    'patience': 10,  # For early stopping
    'save_best': True
}

class ModelTrainer:
    """
    Unified trainer for all model types

    Wraps a model with an Adam optimizer, a ReduceLROnPlateau scheduler,
    early stopping on validation loss and best-weights restoration.
    Hyperparameters are read from the module-level ``TRAINING_CONFIG``.

    Args:
        model: The network to train. It is expected to already live on
            ``device``; the trainer only moves the batches.
        model_type: Free-form tag used in log output and checkpoint names.
            The value ``'autoencoder'`` selects the combined
            reconstruction + classification loss and expects the model to
            return ``(reconstructed, logits, encoded)``; any other value
            expects the model to return logits only.
        device: Device the batches are moved to.
        class_weights: Optional tensor of per-class weights for the
            cross-entropy loss.
    """

    def __init__(self, model, model_type, device, class_weights=None):
        self.model = model
        self.model_type = model_type
        self.device = device
        self.class_weights = class_weights

        # Initialize optimizer
        self.optimizer = optim.Adam(
            model.parameters(),
            lr=TRAINING_CONFIG['learning_rate'],
            weight_decay=TRAINING_CONFIG['weight_decay']
        )

        # Initialize loss functions
        if class_weights is not None:
            self.classification_loss = nn.CrossEntropyLoss(weight=class_weights.to(device))
        else:
            self.classification_loss = nn.CrossEntropyLoss()

        self.reconstruction_loss = nn.MSELoss()

        # Initialize scheduler. The `verbose` flag is deprecated in recent
        # PyTorch releases, so learning-rate changes are reported manually
        # in train() instead.
        self.scheduler = optim.lr_scheduler.ReduceLROnPlateau(
            self.optimizer, mode='min', patience=5, factor=0.5
        )

        # Training history (one entry per completed epoch)
        self.history = {
            'train_loss': [],
            'val_loss': [],
            'train_acc': [],
            'val_acc': []
        }

        self.best_val_loss = float('inf')
        self.best_model_state = None
        self.patience_counter = 0

    def _compute_loss_and_predictions(self, data, target):
        """Run a forward pass and return ``(loss, predicted_class_indices)``."""
        if self.model_type == 'autoencoder':
            reconstructed, logits, _ = self.model(data)

            # Combined loss: reconstruction + down-weighted classification
            recon_loss = self.reconstruction_loss(reconstructed, data)
            class_loss = self.classification_loss(logits, target)
            loss = recon_loss + 0.5 * class_loss
        else:
            # Standard classification
            logits = self.model(data)
            loss = self.classification_loss(logits, target)

        return loss, logits.argmax(dim=1)

    def _snapshot_model_state(self):
        """Return an independent copy of the model's current state dict.

        ``state_dict()`` hands out tensors that share storage with the live
        parameters, so each tensor is cloned; otherwise later optimizer
        steps would silently overwrite the saved "best" weights.
        """
        return {
            name: tensor.detach().clone()
            for name, tensor in self.model.state_dict().items()
        }

    def train_epoch(self, train_loader):
        """
        Train for one epoch.

        Returns:
            Tuple ``(avg_loss, accuracy)`` where ``avg_loss`` is the mean of
            the per-batch losses and ``accuracy`` is a percentage.
        """
        self.model.train()
        total_loss = 0
        correct = 0
        total = 0

        pbar = tqdm(train_loader, desc=f"Training {self.model_type}")

        for data, target in pbar:
            data, target = data.to(self.device), target.to(self.device)

            self.optimizer.zero_grad()
            loss, pred = self._compute_loss_and_predictions(data, target)
            loss.backward()
            self.optimizer.step()

            total_loss += loss.item()
            total += target.size(0)
            correct += pred.eq(target).sum().item()

            # Update progress bar with the running accuracy
            pbar.set_postfix({
                'Loss': f'{loss.item():.4f}',
                'Acc': f'{100.*correct/total:.2f}%'
            })

        avg_loss = total_loss / len(train_loader)
        accuracy = 100. * correct / total

        return avg_loss, accuracy

    def validate(self, val_loader):
        """
        Evaluate the model on a validation loader without gradient tracking.

        Returns:
            Tuple ``(avg_loss, accuracy)`` where ``avg_loss`` is the mean of
            the per-batch losses and ``accuracy`` is a percentage.
        """
        self.model.eval()
        total_loss = 0
        correct = 0
        total = 0

        with torch.no_grad():
            for data, target in val_loader:
                data, target = data.to(self.device), target.to(self.device)

                loss, pred = self._compute_loss_and_predictions(data, target)

                total_loss += loss.item()
                total += target.size(0)
                correct += pred.eq(target).sum().item()

        avg_loss = total_loss / len(val_loader)
        accuracy = 100. * correct / total

        return avg_loss, accuracy

    def train(self, train_loader, val_loader, epochs=None):
        """
        Full training loop with LR scheduling, early stopping and
        restoration of the best weights.

        Args:
            train_loader: Iterable of ``(data, target)`` training batches.
            val_loader: Iterable of ``(data, target)`` validation batches.
            epochs: Maximum number of epochs; defaults to
                ``TRAINING_CONFIG['epochs']``.

        Returns:
            The history dict with per-epoch losses and accuracies.
        """
        if epochs is None:
            epochs = TRAINING_CONFIG['epochs']

        print(f"\nTraining {self.model_type.upper()} model...")
        print(f"Epochs: {epochs}, Learning Rate: {TRAINING_CONFIG['learning_rate']}")
        print("=" * 60)

        for epoch in range(epochs):
            # Training
            train_loss, train_acc = self.train_epoch(train_loader)

            # Validation
            val_loss, val_acc = self.validate(val_loader)

            # Update history
            self.history['train_loss'].append(train_loss)
            self.history['val_loss'].append(val_loss)
            self.history['train_acc'].append(train_acc)
            self.history['val_acc'].append(val_acc)

            # Learning rate scheduling; report any change explicitly
            lr_before = self.optimizer.param_groups[0]['lr']
            self.scheduler.step(val_loss)
            lr_after = self.optimizer.param_groups[0]['lr']
            if lr_after != lr_before:
                print(f"Learning rate reduced from {lr_before:.6f} to {lr_after:.6f}")

            # Early stopping and model saving
            if val_loss < self.best_val_loss:
                self.best_val_loss = val_loss
                # Clone the tensors so later epochs cannot overwrite them
                self.best_model_state = self._snapshot_model_state()
                self.patience_counter = 0

                if TRAINING_CONFIG['save_best']:
                    torch.save(self.model.state_dict(), f'best_{self.model_type}_model.pt')
            else:
                self.patience_counter += 1

            # Print epoch results
            print(f"Epoch {epoch+1:2d}/{epochs} | "
                  f"Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.2f}% | "
                  f"Val Loss: {val_loss:.4f} | Val Acc: {val_acc:.2f}% | "
                  f"LR: {self.optimizer.param_groups[0]['lr']:.6f}")

            # Early stopping
            if self.patience_counter >= TRAINING_CONFIG['patience']:
                print(f"\nEarly stopping triggered after {epoch+1} epochs")
                break

        # Restore the weights from the epoch with the lowest validation loss
        if self.best_model_state is not None:
            self.model.load_state_dict(self.best_model_state)
            print(f"\nBest validation loss: {self.best_val_loss:.4f}")

        return self.history

# Initialize trainers for all models
print("=" * 60)
print("INITIALIZING MODEL TRAINERS")
print("=" * 60)

# LSTM Trainer
lstm_trainer = ModelTrainer(lstm_model, 'lstm', device, class_weights)
print("✓ LSTM trainer initialized")

# CNN Trainer
cnn_trainer = ModelTrainer(cnn_model, 'cnn', device, class_weights)
print("✓ CNN trainer initialized")

# Autoencoder Trainer ('autoencoder' selects the combined loss in ModelTrainer)
autoencoder_trainer = ModelTrainer(autoencoder_model, 'autoencoder', device, class_weights)
print("✓ Autoencoder trainer initialized")

# "\n" (not "\\n") so a blank line is printed rather than a literal backslash-n
print("\nAll trainers ready for training with:")
print(f"  - Learning rate: {TRAINING_CONFIG['learning_rate']}")
print(f"  - Weight decay: {TRAINING_CONFIG['weight_decay']}")
print(f"  - Early stopping patience: {TRAINING_CONFIG['patience']}")
print(f"  - Class weights: {class_weights is not None}")

# Quick training demonstration (reduced epochs for demo)
DEMO_EPOCHS = 5  # Set to 50+ for full training

print("\n" + "=" * 60)
print("STARTING MODEL TRAINING DEMONSTRATION")
print(f"Training for {DEMO_EPOCHS} epochs (increase for full training)")
print("=" * 60)

In [None]:
# Train all models; each history dict is kept under the model's display name
training_results = {}

# Train LSTM
# ("\n" rather than "\\n" so a real blank line precedes each banner)
print("\n🚀 Training LSTM Model...")
lstm_history = lstm_trainer.train(train_loader, val_loader, epochs=DEMO_EPOCHS)
training_results['LSTM'] = lstm_history

# Train CNN
print("\n🚀 Training CNN Model...")
cnn_history = cnn_trainer.train(train_loader, val_loader, epochs=DEMO_EPOCHS)
training_results['CNN'] = cnn_history

# Train Autoencoder
print("\n🚀 Training Autoencoder Model...")
autoencoder_history = autoencoder_trainer.train(train_loader, val_loader, epochs=DEMO_EPOCHS)
training_results['Autoencoder'] = autoencoder_history

print("\n" + "=" * 60)
print("TRAINING COMPLETE!")
print("=" * 60)

## 10. Model Evaluation and Metrics

Calculate comprehensive evaluation metrics for all models.

In [None]:
def _benign_class_index(label_encoder):
    """Return the encoded index of the benign/normal class, defaulting to 0.

    Looks for a class literally named 'benign' or 'normal' (case-insensitive)
    in ``label_encoder.classes_``. Falls back to 0 when the encoder has no
    ``classes_`` attribute or no such class name.
    """
    classes = getattr(label_encoder, 'classes_', None)
    if classes is not None:
        for index, name in enumerate(classes):
            if str(name).strip().lower() in ('benign', 'normal'):
                return index
    return 0


def evaluate_model(model, model_type, test_loader, device, label_encoder):
    """
    Comprehensive model evaluation

    Runs the model over ``test_loader`` and computes multi-class metrics,
    binary (normal vs. attack) metrics, and ROC-AUC / PR-AUC.

    Args:
        model: Trained network. For ``model_type == 'autoencoder'`` it must
            return ``(reconstructed, logits, encoded)`` and provide
            ``compute_reconstruction_error``; otherwise it returns logits.
        model_type: Tag selecting the forward-pass convention above; also
            used in log output and stored in the result.
        test_loader: Iterable of ``(data, target)`` batches.
        device: Device the batches are moved to.
        label_encoder: Encoder whose ``classes_`` are searched for the
            benign/normal class; class 0 is assumed when none is found.

    Returns:
        Dict with keys ``'model_type'``, ``'multi_class'``, ``'binary'`` and
        ``'predictions'``; autoencoder runs additionally carry
        ``'reconstruction_errors'``. Per-class metric arrays have one entry
        per model output, indexed by encoded class id.

    Raises:
        ValueError: If ``test_loader`` yields no batches.
    """
    model.eval()
    is_autoencoder = model_type == 'autoencoder'

    # Collect whole per-batch arrays and concatenate once at the end rather
    # than extending Python lists element by element.
    prediction_batches = []
    target_batches = []
    probability_batches = []
    error_batches = []

    print(f"Evaluating {model_type.upper()} model...")

    with torch.no_grad():
        for data, target in tqdm(test_loader, desc="Evaluation"):
            data, target = data.to(device), target.to(device)

            if is_autoencoder:
                reconstructed, logits, _ = model(data)

                # Per-sample reconstruction errors
                recon_error = model.compute_reconstruction_error(data, reconstructed)
                error_batches.append(recon_error.cpu().numpy())
            else:
                logits = model(data)

            probability_batches.append(F.softmax(logits, dim=1).cpu().numpy())
            prediction_batches.append(logits.argmax(dim=1).cpu().numpy())
            target_batches.append(target.cpu().numpy())

    if not target_batches:
        raise ValueError("test_loader yielded no batches; nothing to evaluate")

    y_true = np.concatenate(target_batches)
    y_pred = np.concatenate(prediction_batches)
    y_prob = np.concatenate(probability_batches)

    # Calculate metrics
    accuracy = accuracy_score(y_true, y_pred)

    # Multi-class metrics (support-weighted average)
    precision, recall, f1, support = precision_recall_fscore_support(
        y_true, y_pred, average='weighted', zero_division=0
    )

    # Per-class metrics. Passing every class id explicitly keeps the arrays
    # aligned with the encoded labels even when a class is absent from both
    # the targets and the predictions.
    class_ids = np.arange(y_prob.shape[1])
    precision_per_class, recall_per_class, f1_per_class, _ = precision_recall_fscore_support(
        y_true, y_pred, labels=class_ids, average=None, zero_division=0
    )

    # Binary classification metrics (Normal vs Attack)
    benign_index = _benign_class_index(label_encoder)
    y_true_binary = (y_true != benign_index).astype(int)
    y_pred_binary = (y_pred != benign_index).astype(int)
    y_prob_binary = 1 - y_prob[:, benign_index]  # Probability of attack

    binary_accuracy = accuracy_score(y_true_binary, y_pred_binary)
    binary_precision, binary_recall, binary_f1, _ = precision_recall_fscore_support(
        y_true_binary, y_pred_binary, average='binary', zero_division=0
    )

    # ROC-AUC and PR-AUC for binary classification
    try:
        roc_auc = roc_auc_score(y_true_binary, y_prob_binary)
        precision_curve, recall_curve, _ = precision_recall_curve(y_true_binary, y_prob_binary)
        pr_auc = auc(recall_curve, precision_curve)
    except ValueError as error:
        # Keep the 0.0 placeholders so downstream formatting still works,
        # but say so instead of failing silently.
        print(f"  Warning: ROC-AUC/PR-AUC could not be computed for {model_type} "
              f"({error}); reporting 0.0")
        roc_auc = 0.0
        pr_auc = 0.0

    # Compile results
    results = {
        'model_type': model_type,
        'multi_class': {
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1_score': f1,
            'precision_per_class': precision_per_class,
            'recall_per_class': recall_per_class,
            'f1_per_class': f1_per_class
        },
        'binary': {
            'accuracy': binary_accuracy,
            'precision': binary_precision,
            'recall': binary_recall,
            'f1_score': binary_f1,
            'roc_auc': roc_auc,
            'pr_auc': pr_auc
        },
        'predictions': {
            'y_true': y_true,
            'y_pred': y_pred,
            'y_prob': y_prob,
            'y_true_binary': y_true_binary,
            'y_pred_binary': y_pred_binary,
            'y_prob_binary': y_prob_binary
        }
    }

    if is_autoencoder:
        results['reconstruction_errors'] = np.concatenate(error_batches)

    return results

# Evaluate all models
print("=" * 60)
print("MODEL EVALUATION")
print("=" * 60)

# Results keyed by display name, in the order the models are evaluated
evaluation_results = {}

# Evaluate LSTM
lstm_results = evaluate_model(lstm_model, 'lstm', test_loader, device, label_encoder)
evaluation_results['LSTM'] = lstm_results

# Evaluate CNN
cnn_results = evaluate_model(cnn_model, 'cnn', test_loader, device, label_encoder)
evaluation_results['CNN'] = cnn_results

# Evaluate Autoencoder
autoencoder_results = evaluate_model(autoencoder_model, 'autoencoder', test_loader, device, label_encoder)
evaluation_results['Autoencoder'] = autoencoder_results

# Print summary results
# ("\n" rather than "\\n" so a blank line is printed, not a literal backslash-n)
print("\n" + "=" * 80)
print("EVALUATION RESULTS SUMMARY")
print("=" * 80)

print(f"{'Model':<12} | {'Accuracy':<8} | {'Precision':<9} | {'Recall':<8} | {'F1-Score':<8} | {'ROC-AUC':<8}")
print("-" * 80)

for model_name, results in evaluation_results.items():
    accuracy = results['multi_class']['accuracy']
    precision = results['multi_class']['precision']
    recall = results['multi_class']['recall']
    f1 = results['multi_class']['f1_score']
    roc_auc = results['binary']['roc_auc']

    print(f"{model_name:<12} | {accuracy:<8.3f} | {precision:<9.3f} | {recall:<8.3f} | {f1:<8.3f} | {roc_auc:<8.3f}")

# Detailed per-class results
print("\n" + "=" * 80)
print("PER-CLASS PERFORMANCE")
print("=" * 80)

for model_name, results in evaluation_results.items():
    print(f"\n{model_name.upper()} Model:")
    print(f"{'Class':<20} | {'Precision':<9} | {'Recall':<8} | {'F1-Score':<8}")
    print("-" * 60)

    multi_class = results['multi_class']
    # zip stops at the shortest sequence, so class names without a
    # corresponding metric entry are skipped
    for class_name, precision, recall, f1 in zip(
        label_encoder.classes_,
        multi_class['precision_per_class'],
        multi_class['recall_per_class'],
        multi_class['f1_per_class'],
    ):
        print(f"{class_name:<20} | {precision:<9.3f} | {recall:<8.3f} | {f1:<8.3f}")

# Binary classification summary
print("\n" + "=" * 80)
print("BINARY CLASSIFICATION (Normal vs Attack)")
print("=" * 80)

print(f"{'Model':<12} | {'Accuracy':<8} | {'Precision':<9} | {'Recall':<8} | {'F1-Score':<8} | {'ROC-AUC':<8} | {'PR-AUC':<8}")
print("-" * 90)

for model_name, results in evaluation_results.items():
    binary_results = results['binary']
    print(f"{model_name:<12} | {binary_results['accuracy']:<8.3f} | {binary_results['precision']:<9.3f} | "
          f"{binary_results['recall']:<8.3f} | {binary_results['f1_score']:<8.3f} | "
          f"{binary_results['roc_auc']:<8.3f} | {binary_results['pr_auc']:<8.3f}")

print("\n✅ Model evaluation complete!")

## 11. Comparative Analysis and Visualization

Generate comprehensive visualizations and comparative analysis.

In [None]:
# Visualization functions
def plot_training_history(training_results):
    """Plot training curves, a final-metric comparison and ROC curves.

    Draws a 2x2 figure:
      * top-left:     training / validation loss per model
      * top-right:    training / validation accuracy per model
      * bottom-left:  grouped bars of accuracy, F1-score and ROC-AUC
      * bottom-right: binary ROC curves

    Args:
        training_results: Mapping of model name to a history mapping that
            provides the 'train_loss', 'val_loss', 'train_acc' and 'val_acc'
            sequences.

    Note:
        The two bottom panels read the module-level ``evaluation_results``
        mapping, not an argument; it must be defined before this function
        is called.
    """
    fig, axes = plt.subplots(2, 2, figsize=(15, 10))

    palette = ['blue', 'red', 'green']

    # (axes, train key, validation key, title, y-axis label) for the two curve panels
    curve_panels = [
        (axes[0, 0], 'train_loss', 'val_loss', 'Training and Validation Loss', 'Loss'),
        (axes[0, 1], 'train_acc', 'val_acc', 'Training and Validation Accuracy', 'Accuracy (%)'),
    ]

    for ax, train_key, val_key, title, y_label in curve_panels:
        for i, (model, history) in enumerate(training_results.items()):
            # Cycle through the palette so more than three models do not
            # raise IndexError.
            color = palette[i % len(palette)]
            ax.plot(history[train_key], label=f'{model} Train', color=color, linestyle='-')
            ax.plot(history[val_key], label=f'{model} Val', color=color, linestyle='--')

        ax.set_title(title)
        ax.set_xlabel('Epoch')
        ax.set_ylabel(y_label)
        ax.legend()
        ax.grid(True, alpha=0.3)

    # Model Comparison - Final Metrics
    model_names = list(evaluation_results.keys())
    accuracies = [evaluation_results[model]['multi_class']['accuracy'] for model in model_names]
    f1_scores = [evaluation_results[model]['multi_class']['f1_score'] for model in model_names]
    roc_aucs = [evaluation_results[model]['binary']['roc_auc'] for model in model_names]

    x = np.arange(len(model_names))
    width = 0.25

    comparison_ax = axes[1, 0]
    comparison_ax.bar(x - width, accuracies, width, label='Accuracy', alpha=0.8)
    comparison_ax.bar(x, f1_scores, width, label='F1-Score', alpha=0.8)
    comparison_ax.bar(x + width, roc_aucs, width, label='ROC-AUC', alpha=0.8)

    comparison_ax.set_title('Model Performance Comparison')
    comparison_ax.set_xlabel('Model')
    comparison_ax.set_ylabel('Score')
    comparison_ax.set_xticks(x)
    comparison_ax.set_xticklabels(model_names)
    comparison_ax.legend()
    comparison_ax.grid(True, alpha=0.3)

    # ROC Curves
    roc_ax = axes[1, 1]
    for model_name, results in evaluation_results.items():
        y_true_binary = results['predictions']['y_true_binary']
        y_prob_binary = results['predictions']['y_prob_binary']

        # A ROC curve is only drawn when both classes are present.
        if len(np.unique(y_true_binary)) > 1:
            fpr, tpr, _ = roc_curve(y_true_binary, y_prob_binary)
            roc_auc = auc(fpr, tpr)
            roc_ax.plot(fpr, tpr, label=f'{model_name} (AUC = {roc_auc:.3f})')

    # Diagonal reference line (chance level).
    roc_ax.plot([0, 1], [0, 1], 'k--', alpha=0.5)
    roc_ax.set_title('ROC Curves (Binary Classification)')
    roc_ax.set_xlabel('False Positive Rate')
    roc_ax.set_ylabel('True Positive Rate')
    roc_ax.legend()
    roc_ax.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

def plot_confusion_matrices(evaluation_results, label_encoder):
    """Plot one row-normalized confusion matrix per model, side by side.

    Args:
        evaluation_results: Mapping of model name to a results mapping whose
            ``['predictions']`` entry holds 'y_true' and 'y_pred'.
            NOTE(review): these are assumed to be integer-encoded labels
            produced by ``label_encoder`` -- confirm against the evaluation
            code.
        label_encoder: Fitted encoder; ``classes_`` supplies the tick labels
            and fixes the size of every matrix.
    """
    n_models = len(evaluation_results)
    if n_models == 0:
        # plt.subplots() rejects zero columns; there is nothing to draw.
        print("No evaluation results available - skipping confusion matrices.")
        return

    class_names = label_encoder.classes_
    class_indices = np.arange(len(class_names))

    # squeeze=False always returns a 2-D array of axes, so the single-model
    # case needs no special handling.
    fig, axes_grid = plt.subplots(1, n_models, figsize=(6*n_models, 5), squeeze=False)
    axes = axes_grid[0]

    for ax, (model_name, results) in zip(axes, evaluation_results.items()):
        y_true = results['predictions']['y_true']
        y_pred = results['predictions']['y_pred']

        # Pin the label set so the matrix always has one row/column per known
        # class, even when a class is absent from y_true and y_pred; otherwise
        # the tick labels below would not line up with the matrix.
        cm = confusion_matrix(y_true, y_pred, labels=class_indices)

        # Normalize each row by its total; rows with no true samples stay at
        # zero instead of producing NaN from a division by zero.
        row_totals = cm.sum(axis=1, keepdims=True)
        cm_normalized = np.divide(
            cm,
            row_totals,
            out=np.zeros(cm.shape, dtype=float),
            where=row_totals != 0,
        )

        # Plot
        im = ax.imshow(cm_normalized, interpolation='nearest', cmap=plt.cm.Blues)
        ax.set_title(f'{model_name} Confusion Matrix')

        # Add colorbar
        plt.colorbar(im, ax=ax)

        # Add labels
        ax.set_xticks(class_indices)
        ax.set_yticks(class_indices)
        ax.set_xticklabels(class_names, rotation=45, ha='right')
        ax.set_yticklabels(class_names)

        # Add text annotations; switch to white text on dark cells.
        thresh = cm_normalized.max() / 2.
        for i, j in np.ndindex(cm_normalized.shape):
            ax.text(j, i, f'{cm_normalized[i, j]:.2f}',
                    ha="center", va="center",
                    color="white" if cm_normalized[i, j] > thresh else "black")

        ax.set_ylabel('True Label')
        ax.set_xlabel('Predicted Label')

    plt.tight_layout()
    plt.show()

def plot_reconstruction_errors(evaluation_results):
    """Visualize autoencoder reconstruction errors and report an F1-optimal cut-off.

    Does nothing unless ``evaluation_results`` has an 'Autoencoder' entry.
    Otherwise it shows a histogram and a box plot of the reconstruction
    errors split by the binary ground truth (0 = normal, 1 = attack), then
    scans 50 percentile-based thresholds (50th to 99th percentile of the
    errors) and prints the one with the highest F1-score.
    """
    if 'Autoencoder' not in evaluation_results:
        return

    ae_results = evaluation_results['Autoencoder']
    errors = ae_results['reconstruction_errors']
    truth = ae_results['predictions']['y_true_binary']

    # Split the errors by ground-truth class.
    benign_errors = errors[truth == 0]
    malicious_errors = errors[truth == 1]

    fig, (hist_ax, box_ax) = plt.subplots(1, 2, figsize=(15, 5))

    # Overlaid density histograms, one per class.
    for series, legend_name, shade in (
        (benign_errors, 'Normal Traffic', 'blue'),
        (malicious_errors, 'Attack Traffic', 'red'),
    ):
        hist_ax.hist(series, bins=50, alpha=0.7, label=legend_name, density=True, color=shade)
    hist_ax.set_title('Reconstruction Error Distribution')
    hist_ax.set_xlabel('Reconstruction Error')
    hist_ax.set_ylabel('Density')
    hist_ax.legend()
    hist_ax.grid(True, alpha=0.3)

    # Box plot of the same two groups.
    box_ax.boxplot([benign_errors, malicious_errors], labels=['Normal', 'Attack'])
    box_ax.set_title('Reconstruction Error Box Plot')
    box_ax.set_ylabel('Reconstruction Error')
    box_ax.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

    # Threshold search: keep the candidate with the strictly highest F1-score.
    candidate_thresholds = np.percentile(errors, np.linspace(50, 99, 50))
    best_f1 = 0
    best_threshold = 0

    for candidate in candidate_thresholds:
        flagged = (errors > candidate).astype(int)
        # Skip thresholds that put every sample in the same class.
        if len(np.unique(flagged)) <= 1:
            continue
        score = f1_score(truth, flagged)
        if score > best_f1:
            best_f1, best_threshold = score, candidate

    print(f"Optimal reconstruction error threshold: {best_threshold:.6f}")
    print(f"F1-score with optimal threshold: {best_f1:.3f}")

# Generate all visualizations
print("=" * 60)
print("GENERATING VISUALIZATIONS")
print("=" * 60)

# NOTE: the status messages use "\n" (not "\\n") so each one is preceded by a
# blank line instead of a literal backslash-n.

# Plot training history
print("\n📊 Plotting training history...")
plot_training_history(training_results)

# Plot confusion matrices
print("\n📊 Plotting confusion matrices...")
plot_confusion_matrices(evaluation_results, label_encoder)

# Plot reconstruction errors (for autoencoder)
print("\n📊 Plotting reconstruction error analysis...")
plot_reconstruction_errors(evaluation_results)

# Create comprehensive comparison table
# NOTE: "\n" (not "\\n") so a blank line is printed instead of a literal backslash-n.
print("\n📋 Creating comprehensive comparison table...")

# One row per model; every score is pre-formatted to three decimals so the
# printed table has a uniform look.
comparison_data = []
for model_name, results in evaluation_results.items():
    multi_class = results['multi_class']
    binary = results['binary']

    comparison_data.append({
        'Model': model_name,
        'Multi-Class Accuracy': f"{multi_class['accuracy']:.3f}",
        'Multi-Class Precision': f"{multi_class['precision']:.3f}",
        'Multi-Class Recall': f"{multi_class['recall']:.3f}",
        'Multi-Class F1': f"{multi_class['f1_score']:.3f}",
        'Binary Accuracy': f"{binary['accuracy']:.3f}",
        'Binary Precision': f"{binary['precision']:.3f}",
        'Binary Recall': f"{binary['recall']:.3f}",
        'Binary F1': f"{binary['f1_score']:.3f}",
        'ROC-AUC': f"{binary['roc_auc']:.3f}",
        'PR-AUC': f"{binary['pr_auc']:.3f}",
    })

comparison_df = pd.DataFrame(comparison_data)
print("\n" + "=" * 120)
print("COMPREHENSIVE MODEL COMPARISON TABLE")
print("=" * 120)
print(comparison_df.to_string(index=False))

# Model parameter comparison
# NOTE: "\n" (not "\\n") so a blank line is printed instead of a literal backslash-n.
print("\n" + "=" * 80)
print("MODEL ARCHITECTURE COMPARISON")
print("=" * 80)

architecture_data = []
models_info = [
    lstm_model.get_model_info(),
    cnn_model.get_model_info(),
    autoencoder_model.get_model_info()
]

for info in models_info:
    architecture_data.append({
        'Model': info['model_type'],
        'Total Parameters': f"{info['total_params']:,}",
        'Trainable Parameters': f"{info['trainable_params']:,}",
        # Size estimate: 4 bytes per parameter (assumes float32 weights), in MiB.
        'Model Size (MB)': f"{info['total_params'] * 4 / 1024**2:.2f}"
    })

architecture_df = pd.DataFrame(architecture_data)
print(architecture_df.to_string(index=False))

print("\n✅ Visualization and analysis complete!")

## 12. Model Inference Demo

Demonstrate model inference on test samples with confidence scores and anomaly detection.

In [None]:
def run_inference_demo(models, test_loader, device, label_encoder, num_samples=10):
    """
    Run inference demo on a batch of test samples

    Takes the first batch from ``test_loader``, runs every model on up to
    ``num_samples`` of its rows and prints: a per-sample prediction and
    confidence table, a model-agreement summary, a Normal-vs-Attack view
    (including the autoencoder reconstruction error) and the per-model
    accuracy on these samples.

    Args:
        models: Dictionary of trained models. Must contain the keys 'LSTM',
            'CNN' and 'Autoencoder'; the model whose name lower-cases to
            'autoencoder' is expected to return
            ``(reconstructed, logits, encoded)`` and to provide
            ``compute_reconstruction_error``.
        test_loader: Test data loader yielding ``(data, targets)`` batches
        device: Computing device
        label_encoder: Label encoder for class names
        num_samples: Number of samples to demonstrate (clamped to the size
            of the first batch)

    Returns:
        Dictionary mapping model name to its 'predictions' and
        'probabilities' arrays (plus 'reconstruction_errors' for the
        autoencoder).

    Raises:
        KeyError: If one of the required model keys is missing.
    """
    required_models = ('LSTM', 'CNN', 'Autoencoder')
    missing_models = [name for name in required_models if name not in models]
    if missing_models:
        # Fail before running inference rather than midway through the tables.
        raise KeyError(f"models is missing required entries: {missing_models}")

    print("=" * 80)
    print("MODEL INFERENCE DEMONSTRATION")
    print("=" * 80)

    # Get a batch of test samples
    sample_data, sample_targets = next(iter(test_loader))

    # Take only num_samples
    sample_data = sample_data[:num_samples].to(device)
    sample_targets = sample_targets[:num_samples]

    # The batch may hold fewer rows than requested; use the real count so the
    # loops below cannot index past the end.
    num_samples = min(num_samples, len(sample_targets))

    # Targets as a CPU NumPy array, used for class-name lookups and comparisons.
    target_ids = sample_targets.cpu().numpy()
    class_names = label_encoder.classes_

    print(f"Running inference on {num_samples} test samples...")
    print(f"True labels: {[str(class_names[label]) for label in target_ids]}")

    # Store results for all models
    inference_results = {}

    for model_name, model in models.items():
        model.eval()

        with torch.no_grad():
            if model_name.lower() == 'autoencoder':
                reconstructed, logits, encoded = model(sample_data)
                probabilities = F.softmax(logits, dim=1)
                predictions = logits.argmax(dim=1)

                # Calculate reconstruction errors
                reconstruction_errors = model.compute_reconstruction_error(sample_data, reconstructed)

                inference_results[model_name] = {
                    'predictions': predictions.cpu().numpy(),
                    'probabilities': probabilities.cpu().numpy(),
                    'reconstruction_errors': reconstruction_errors.cpu().numpy()
                }
            else:
                output = model(sample_data)
                probabilities = F.softmax(output, dim=1)
                predictions = output.argmax(dim=1)

                inference_results[model_name] = {
                    'predictions': predictions.cpu().numpy(),
                    'probabilities': probabilities.cpu().numpy()
                }

    lstm_results = inference_results['LSTM']
    cnn_results = inference_results['CNN']
    ae_results = inference_results['Autoencoder']

    # Display results in a formatted table
    print(f"\n{'Sample':<8} | {'True Label':<20} | {'LSTM Pred':<15} | {'CNN Pred':<15} | {'AE Pred':<15} | {'LSTM Conf':<10} | {'CNN Conf':<10} | {'AE Conf':<10}")
    print("-" * 140)

    for i in range(num_samples):
        true_label = class_names[target_ids[i]]

        # Get predictions and confidences for each model
        lstm_pred = class_names[lstm_results['predictions'][i]]
        cnn_pred = class_names[cnn_results['predictions'][i]]
        ae_pred = class_names[ae_results['predictions'][i]]

        # Confidence = highest softmax probability of the sample.
        lstm_conf = lstm_results['probabilities'][i].max()
        cnn_conf = cnn_results['probabilities'][i].max()
        ae_conf = ae_results['probabilities'][i].max()

        print(f"{i+1:<8} | {true_label:<20} | {lstm_pred:<15} | {cnn_pred:<15} | {ae_pred:<15} | "
              f"{lstm_conf:<10.3f} | {cnn_conf:<10.3f} | {ae_conf:<10.3f}")

    # Model agreement analysis
    print("\n" + "=" * 80)
    print("MODEL AGREEMENT ANALYSIS")
    print("=" * 80)

    agreements = []
    for i in range(num_samples):
        lstm_pred = lstm_results['predictions'][i]
        cnn_pred = cnn_results['predictions'][i]
        ae_pred = ae_results['predictions'][i]

        if lstm_pred == cnn_pred == ae_pred:
            agreement = "All Agree"
        elif lstm_pred == cnn_pred or lstm_pred == ae_pred or cnn_pred == ae_pred:
            agreement = "Partial Agreement"
        else:
            agreement = "No Agreement"

        agreements.append(agreement)

    agreement_counts = pd.Series(agreements).value_counts()
    print(f"Agreement statistics for {num_samples} samples:")
    for agreement_type, count in agreement_counts.items():
        print(f"  {agreement_type}: {count} samples ({count/num_samples*100:.1f}%)")

    # Anomaly detection analysis (Binary: Normal vs Attack)
    print("\n" + "=" * 80)
    print("ANOMALY DETECTION ANALYSIS (Normal vs Attack)")
    print("=" * 80)

    # Assuming class 0 is 'BENIGN' (normal) -- this depends on how the label
    # encoder ordered the classes; TODO confirm against label_encoder.classes_.
    true_binary = target_ids != 0

    print(f"{'Sample':<8} | {'True':<10} | {'LSTM':<10} | {'CNN':<10} | {'AE':<10} | {'AE Recon Err':<12}")
    print("-" * 70)

    for i in range(num_samples):
        true_anomaly = "Attack" if true_binary[i] else "Normal"

        lstm_anomaly = "Attack" if lstm_results['predictions'][i] != 0 else "Normal"
        cnn_anomaly = "Attack" if cnn_results['predictions'][i] != 0 else "Normal"
        ae_anomaly = "Attack" if ae_results['predictions'][i] != 0 else "Normal"
        ae_recon_err = ae_results['reconstruction_errors'][i]

        print(f"{i+1:<8} | {true_anomaly:<10} | {lstm_anomaly:<10} | {cnn_anomaly:<10} | "
              f"{ae_anomaly:<10} | {ae_recon_err:<12.6f}")

    # Calculate accuracy for this sample
    sample_accuracies = {}
    for model_name in models:
        correct = np.sum(inference_results[model_name]['predictions'] == target_ids)
        sample_accuracies[model_name] = correct / num_samples

    print(f"\nSample accuracy on {num_samples} samples:")
    for model_name, accuracy in sample_accuracies.items():
        print(f"  {model_name}: {accuracy:.3f} ({accuracy*100:.1f}%)")

    return inference_results


# Prepare models dictionary
models_dict = {
    'LSTM': lstm_model,
    'CNN': cnn_model,
    'Autoencoder': autoencoder_model
}

# Run inference demo
inference_demo_results = run_inference_demo(
    models_dict, test_loader, device, label_encoder, num_samples=10
)

print("\n" + "=" * 80)
print("INFERENCE DEMO COMPLETE")
print("=" * 80)
print("Key observations:")
print("1. Compare model predictions and confidence scores")
print("2. Analyze model agreement patterns")
print("3. Examine reconstruction errors for autoencoder anomaly detection")
print("4. Evaluate binary classification performance (Normal vs Attack)")

# Additional insights
print("\n" + "=" * 80)
print("PROJECT SUMMARY & EXPECTED RESULTS")
print("=" * 80)
print("Expected Performance Ranges (with full training on real CSE-CIC-IDS2018):")
print("• LSTM Model: ~94% accuracy (excellent for sequential pattern recognition)")
print("• CNN Model: ~92% accuracy (good for local feature extraction)")
print("• Autoencoder: High recall but more false positives (unsupervised approach)")
print("\nKey Advantages:")
print("• LSTM: Captures temporal dependencies in network flows")
print("• CNN: Efficient feature extraction with fewer parameters")
print("• Autoencoder: Detects novel attacks not seen during training")
print("\nRecommendations:")
print("• Use LSTM for highest overall accuracy")
print("• Combine models in an ensemble for robust detection")
print("• Fine-tune hyperparameters for production deployment")
print("• Implement real-time inference pipeline with appropriate preprocessing")