Network Traffic Anomaly Detection using Deep Learning

Dataset: UNSW-NB15 (Parquet format)
Objective: Automatic identification of network attacks using Autoencoder-based approach

This project demonstrates AI/ML techniques for cybersecurity, specifically:
- Unsupervised anomaly detection
- Deep learning architectures
- Network traffic analysis
- Feature engineering for cybersecurity

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    classification_report, confusion_matrix, 
    roc_auc_score, roc_curve, precision_recall_curve
)

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset

# Pin the NumPy and PyTorch random number generators to one shared seed
# so that repeated runs draw the same random numbers.
_RANDOM_SEED = 42
np.random.seed(_RANDOM_SEED)
torch.manual_seed(_RANDOM_SEED)


Data loading and exploration
Load the UNSW-NB15 dataset from parquet files.
    
    The dataset can be downloaded as follows:
    dataset_name = "UNSW-NB15"
    subset = ['Network-Flows', 'Packet-Fields', 'Payload-Bytes']   # as in the example, or 'all'
    files = [3, 5, 10] 
    from nids_datasets import Dataset, DatasetInfo
    data = Dataset(dataset=dataset_name, subset=subset, files=files)

    data.download() 
    

In [2]:
def load_unsw_nb15(data_dir="UNSW-NB15"):
    """Load the UNSW-NB15 network-flow table as a DataFrame.

    Reads ``<data_dir>/Network-Flows/UNSW_Flow.parquet``. When that file is
    missing, a download is attempted through the optional ``nids_datasets``
    package and the file is looked up again.

    Args:
        data_dir: Directory expected to contain the ``Network-Flows`` folder.

    Returns:
        pandas.DataFrame holding the flow records.

    Raises:
        FileNotFoundError: The parquet file is absent and could not be
            obtained (``nids_datasets`` not installed, or the download did
            not produce the expected file).
    """
    parquet_file = Path(data_dir) / "Network-Flows" / "UNSW_Flow.parquet"

    if not parquet_file.exists():
        try:
            # Optional dependency: only needed when the data is not on disk yet.
            from nids_datasets import Dataset
        except ImportError as err:
            raise FileNotFoundError(
                f"Dataset file not found: {parquet_file}. Install 'nids_datasets' "
                "to download it automatically, or place the file there manually."
            ) from err

        data = Dataset(
            dataset="UNSW-NB15",
            subset=['Network-Flows', 'Packet-Fields', 'Payload-Bytes'],
            files=[3, 5, 10],
        )
        data.download()

        # NOTE(review): the download location is chosen by nids_datasets
        # (presumably ./UNSW-NB15 in the working directory) -- confirm it
        # matches a non-default data_dir.
        if not parquet_file.exists():
            raise FileNotFoundError(
                f"Download finished but {parquet_file} is still missing."
            )

    # Read errors propagate to the caller instead of being turned into a
    # string that downstream code would mistake for a DataFrame.
    df = pd.read_parquet(parquet_file)
    print(f"Dataset loaded successfully: {df.shape[0]:,} samples, {df.shape[1]} features\n")
    return df
        

Data preprocessing and feature engineering

In [None]:
class NetworkDataPreprocessor:
    """Convert a raw flow DataFrame into a scaled feature matrix and labels.

    After ``fit_transform`` the fitted objects are kept on the instance:
    ``label_encoders`` maps each categorical column to its ``LabelEncoder``,
    ``scalers['main']`` holds the ``StandardScaler`` and ``feature_names``
    lists the feature columns in matrix order.
    """

    def __init__(self):
        self.scalers = {}
        self.label_encoders = {}
        self.feature_names = None

    def fit_transform(self, df):
        """Fit the encoders/scaler on ``df`` and return the transformed data.

        Args:
            df: DataFrame with a ``binary_label`` column (0 = normal,
                1 = attack) and, optionally, an ``attack_label`` column.
                The frame itself is left untouched.

        Returns:
            Tuple ``(X_scaled, y)``: the standardized feature matrix and the
            integer label array taken from ``binary_label``.

        Raises:
            ValueError: ``binary_label`` is not present in ``df``.
        """
        print("Preprocessing data...\n")

        if 'binary_label' not in df.columns:
            raise ValueError("Could not find label column. Available columns: " + str(list(df.columns)))

        # Work on a private copy: the steps below add and overwrite columns,
        # which must not leak into the caller's frame (and would trigger
        # chained-assignment warnings on frames derived from another frame).
        df = df.copy()
        df['label'] = df['binary_label'].astype(int)

        # Show attack type distribution if available
        if 'attack_label' in df.columns:
            print(f"\n Attack type distribution:")
            attack_counts = df['attack_label'].value_counts()
            for attack_type, count in attack_counts.head(10).items():
                print(f"   {attack_type}: {count:,} ({count/len(df)*100:.1f}%)")

        # Vectorized counts instead of Python-level sum() over every row.
        n_normal = int((df['label'] == 0).sum())
        n_attack = int((df['label'] == 1).sum())
        print(f"\n Label distribution:")
        print(f"   Normal (0): {n_normal:,} ({n_normal/len(df)*100:.1f}%)")
        print(f"   Attack (1): {n_attack:,} ({n_attack/len(df)*100:.1f}%)")
        print()

        # The model must never see whether a sample is an attack or not.
        exclude_cols = ['label', 'binary_label', 'attack_label']

        # Drop identifiers of a single connection/flow to avoid overfitting.
        # Matching is by substring, so any column whose lower-cased name
        # contains one of these fragments is removed.
        exclude_patterns = ['id', 'ip', 'addr', 'port', 'time', 'src', 'dst',
                            'source', 'dest', 'destination']

        for col in df.columns:
            col_lower = col.lower()
            if any(pattern in col_lower for pattern in exclude_patterns):
                if col not in exclude_cols:
                    exclude_cols.append(col)

        print(f"Excluding {len(exclude_cols)} columns: {exclude_cols[:5]}...")
        print()

        remaining_cols = [c for c in df.columns if c not in exclude_cols]

        categorical_cols = []  # string/object columns and low-cardinality ints
        numerical_cols = []

        for col in remaining_cols:
            dtype_str = str(df[col].dtype)
            # 'str' is included for pandas versions that report text columns
            # under that dtype name -- TODO confirm against the pandas in use.
            if dtype_str in ('object', 'string', 'str', 'category'):
                categorical_cols.append(col)
            elif df[col].nunique() < 20 and 'int' in dtype_str.lower():
                # Low cardinality integers might be categorical (protocol flags)
                categorical_cols.append(col)
            else:
                numerical_cols.append(col)

        # Fill missing numerical values with the column median.
        for col in numerical_cols:
            if df[col].isna().any():
                df[col] = df[col].fillna(df[col].median())

        # Replace categorical values with integer codes. Plain column
        # assignment swaps the whole column, so the result is an integer
        # column; `.loc[:, col] = ...` would write into the existing
        # object/category column and keep (or clash with) its dtype.
        for col in categorical_cols:
            le = LabelEncoder()
            df[col] = le.fit_transform(df[col].astype(str))
            self.label_encoders[col] = le

        feature_cols = numerical_cols + categorical_cols
        X = df[feature_cols].values

        # Standardize every feature to zero mean and unit variance.
        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(X)

        self.scalers['main'] = scaler
        self.feature_names = feature_cols

        print(f"Final feature dimension: {X_scaled.shape[1]}\n")

        return X_scaled, df['label'].values


Deep Autoencoder for anomaly detection in network traffic: a custom neural network

Architecture
 
    - Encoder: Progressively compresses input to low-dimensional latent space  
    - Decoder: Reconstructs original input from latent representation  
    - Dropout: Regularization to prevent overfitting  
    - BatchNorm: Stabilizes training  
    
    Anomaly detection principle:
    Normal traffic is well-reconstructed (low error)
    Anomalous traffic has high reconstruction error

In [4]:
class DeepAutoencoder(nn.Module):
    """Fully connected autoencoder used to score network-traffic samples.

    The encoder narrows the input through 256 -> 128 -> 64 units down to
    ``latent_dim``; the decoder mirrors that path back up to ``input_dim``.
    Every hidden stage is Linear -> BatchNorm1d -> ReLU -> Dropout.
    """

    def __init__(self, input_dim, latent_dim=16):
        super().__init__()

        def dense_stage(n_in, n_out, drop_p):
            # One hidden stage: affine map, batch normalization,
            # non-linearity, then dropout for regularization.
            return [
                nn.Linear(n_in, n_out),
                nn.BatchNorm1d(n_out),
                nn.ReLU(),
                nn.Dropout(drop_p),
            ]

        # Encoder: widen to 256 units, then compress step by step to the
        # bottleneck (the last Linear has no activation).
        self.encoder = nn.Sequential(
            *dense_stage(input_dim, 256, 0.3),
            *dense_stage(256, 128, 0.2),
            *dense_stage(128, 64, 0.2),
            nn.Linear(64, latent_dim),
        )

        # Decoder: expand the latent vector back to the input width.
        self.decoder = nn.Sequential(
            *dense_stage(latent_dim, 64, 0.2),
            *dense_stage(64, 128, 0.2),
            *dense_stage(128, 256, 0.3),
            nn.Linear(256, input_dim),
        )

    def forward(self, x):
        """Return the reconstruction of ``x`` (encode, then decode)."""
        return self.decoder(self.encoder(x))

Training pipeline
Operational logic

In [5]:
class AnomalyDetector:
    """Training and scoring pipeline built around ``DeepAutoencoder``.

    The model is trained to reconstruct its input; the per-sample mean
    squared reconstruction error is then used as the anomaly score.
    ``history`` collects the per-epoch ``train_loss`` and ``val_loss``.
    """

    def __init__(self, input_dim, latent_dim=16, device='cpu'):
        self.device = device
        self.model = DeepAutoencoder(input_dim, latent_dim).to(device)
        # Mean squared error between input and reconstruction.
        self.criterion = nn.MSELoss()
        # Adam with a small weight decay as extra regularization.
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=1e-3, weight_decay=1e-5)
        # Halve the learning rate when the monitored loss has not improved
        # for 3 epochs.
        self.scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            self.optimizer, mode='min', patience=3, factor=0.5
        )
        self.history = {'train_loss': [], 'val_loss': []}

    def train_epoch(self, train_loader):
        """Run one optimization pass over ``train_loader``.

        Returns:
            Mean of the per-batch training losses.
        """
        self.model.train()
        total_loss = 0.0

        for (data,) in train_loader:
            data = data.to(self.device)

            # Reset gradients left over from the previous step.
            self.optimizer.zero_grad()
            reconstructed = self.model(data)
            loss = self.criterion(reconstructed, data)
            loss.backward()
            self.optimizer.step()

            total_loss += loss.item()

        return total_loss / len(train_loader)

    def validate(self, val_loader):
        """Return the mean per-batch loss on ``val_loader`` (no weight update)."""
        self.model.eval()
        total_loss = 0.0

        with torch.no_grad():
            for (data,) in val_loader:
                data = data.to(self.device)
                reconstructed = self.model(data)
                total_loss += self.criterion(reconstructed, data).item()

        return total_loss / len(val_loader)

    def fit(self, X_train, X_val, epochs=30, batch_size=256, patience=5):
        """Train the autoencoder with early stopping on the validation loss.

        Args:
            X_train: 2-D array-like of training samples.
            X_val: 2-D array-like of validation samples.
            epochs: Maximum number of epochs.
            batch_size: Mini-batch size for both loaders.
            patience: Number of consecutive non-improving epochs tolerated
                before training stops.

        Raises:
            ValueError: ``X_train`` has fewer than two samples or ``X_val``
                is empty.

        After training, the weights of the epoch with the lowest validation
        loss are loaded back into the model.
        """
        print("Starting training\n")

        train_dataset = TensorDataset(torch.as_tensor(X_train, dtype=torch.float32))
        val_dataset = TensorDataset(torch.as_tensor(X_val, dtype=torch.float32))

        if len(train_dataset) < 2:
            raise ValueError("X_train must contain at least two samples")
        if len(val_dataset) == 0:
            raise ValueError("X_val must not be empty")

        # BatchNorm1d cannot compute batch statistics from a single sample in
        # training mode, so a trailing batch of size 1 would abort training.
        drop_singleton = (
            len(train_dataset) > batch_size and len(train_dataset) % batch_size == 1
        )
        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True,
                                  drop_last=drop_singleton)
        val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

        best_val_loss = float('inf')
        best_state = None
        patience_counter = 0

        for epoch in range(epochs):
            train_loss = self.train_epoch(train_loader)
            val_loss = self.validate(val_loader)

            self.history['train_loss'].append(train_loss)
            self.history['val_loss'].append(val_loss)

            self.scheduler.step(val_loss)

            print(f"Epoch {epoch+1:2d}/{epochs} | "
                  f"Train Loss: {train_loss:.6f} | "
                  f"Val Loss: {val_loss:.6f}")

            if val_loss < best_val_loss:
                best_val_loss = val_loss
                patience_counter = 0
                # Snapshot the best weights; state_dict() returns references,
                # so the tensors have to be cloned.
                best_state = {
                    name: tensor.detach().clone()
                    for name, tensor in self.model.state_dict().items()
                }
            else:
                patience_counter += 1
                if patience_counter >= patience:
                    print(f"\n Early stopping at epoch {epoch+1}")
                    break

        # Without this the model would keep the weights of the last (possibly
        # several epochs worse) iteration.
        if best_state is not None:
            self.model.load_state_dict(best_state)

        print("\n Training completed!\n")

    def predict_reconstruction_error(self, X, batch_size=4096):
        """Return the per-sample mean squared reconstruction error.

        Args:
            X: 2-D array-like of samples.
            batch_size: Number of samples pushed through the model at once;
                bounds the memory needed on ``self.device``.

        Returns:
            1-D NumPy array with one error value per row of ``X``.
        """
        self.model.eval()
        X_tensor = torch.as_tensor(X, dtype=torch.float32)
        chunks = []

        with torch.no_grad():
            for start in range(0, len(X_tensor), batch_size):
                batch = X_tensor[start:start + batch_size].to(self.device)
                reconstructed = self.model(batch)
                # Average the squared errors along the feature dimension.
                chunks.append(((reconstructed - batch) ** 2).mean(dim=1).cpu())

        if not chunks:
            return np.empty(0, dtype=np.float32)
        return torch.cat(chunks).numpy()

    

Evaluation: 

Take the reconstruction errors calculated by the Autoencoder and transform them into clear performance metrics to evaluate the effectiveness of the model in detecting anomalies

In [6]:
def evaluate_model(detector, X_normal, X_attack, y_true):
    """Score normal and attack samples and report detection metrics.

    Args:
        detector: Object exposing ``predict_reconstruction_error(X)``.
        X_normal: Samples of normal traffic.
        X_attack: Samples of attack traffic.
        y_true: Binary labels for the concatenation ``[X_normal, X_attack]``
            in exactly that order (0 = normal, 1 = attack).

    Returns:
        Dict with ``auc``, ``threshold``, ``errors_normal``,
        ``errors_attack`` and ``predictions`` (0/1 per sample, same order as
        ``y_true``).

    Raises:
        ValueError: ``y_true`` does not have one label per scored sample.

    Note:
        The threshold is selected on the same samples it is then evaluated
        on, so the classification report is an optimistic estimate.
    """
    print("Evaluating model performance\n")

    errors_normal = detector.predict_reconstruction_error(X_normal)
    errors_attack = detector.predict_reconstruction_error(X_attack)

    # Scores are ordered normal-first, attack-second, matching y_true.
    all_errors = np.concatenate([errors_normal, errors_attack])

    y_true = np.asarray(y_true)
    if len(y_true) != len(all_errors):
        raise ValueError(
            f"y_true has {len(y_true)} labels but {len(all_errors)} samples were scored"
        )

    # Threshold-independent ranking quality.
    auc = roc_auc_score(y_true, all_errors)

    # Pick the ROC point maximizing Youden's J statistic (TPR - FPR).
    fpr, tpr, thresholds = roc_curve(y_true, all_errors)
    optimal_idx = np.argmax(tpr - fpr)
    optimal_threshold = thresholds[optimal_idx]

    # roc_curve counts a sample as positive when score >= threshold, so the
    # same comparison is needed here to land on the selected ROC point; a
    # strict '>' would flip the samples lying exactly on the threshold.
    predictions = (all_errors >= optimal_threshold).astype(int)

    print("EVALUATION RESULTS")
    print(f"\n ROC-AUC Score: {auc:.4f}")
    print(f"\n Optimal Threshold: {optimal_threshold:.6f}")
    print(f"\n Reconstruction Error Statistics:")
    print(f"   Normal Traffic:  {errors_normal.mean():.6f} ± {errors_normal.std():.6f}")
    print(f"   Attack Traffic:  {errors_attack.mean():.6f} ± {errors_attack.std():.6f}")
    print(f"   Separation:      {(errors_attack.mean() / errors_normal.mean()):.2f}x")

    print(f"\n Classification Report:")
    print(classification_report(y_true, predictions,
                                target_names=['Normal', 'Attack'],
                                digits=4))

    return {
        'auc': auc,
        'threshold': optimal_threshold,
        'errors_normal': errors_normal,
        'errors_attack': errors_attack,
        'predictions': predictions
    }


Main execution pipeline 

Parameters:
    
    use_subset : bool
        If True, uses a subset of data for faster training

    subset_size : int
        Number of samples to use if use_subset=True
        
    contamination_rate : float
        Fraction of attack samples added to the training set, relative to the number of normal training samples (0.0 = pure unsupervised, normal traffic only; 0.05 = 5% attacks)

In [None]:
def main(use_subset=True, subset_size=200000, contamination_rate=0.05):
    """Run the full pipeline: load, preprocess, train, evaluate.

    Args:
        use_subset: If True and the dataset is larger than ``subset_size``,
            train on a label-stratified subset.
        subset_size: Number of samples kept when subsetting.
        contamination_rate: Attack samples added to the training set, as a
            fraction of the number of normal training samples (0.0 trains on
            normal traffic only).

    Returns:
        Tuple ``(detector, results, preprocessor)``; ``results`` is the dict
        returned by ``evaluate_model`` plus a ``config`` entry.

    Raises:
        RuntimeError: The loader did not return a DataFrame.
    """
    df = load_unsw_nb15()
    if not isinstance(df, pd.DataFrame):
        # Guard against a loader that reports failure through its return
        # value instead of raising.
        raise RuntimeError(f"Dataset could not be loaded: {df}")

    # Subset for faster training/testing if dataset too big
    if use_subset and len(df) > subset_size:
        print(f"Using subset of {subset_size:,} samples for faster training")
        print(f" (Full dataset: {len(df):,} samples)")

        # Stratified sampling to maintain class balance
        if 'binary_label' in df.columns:
            df, _ = train_test_split(
                df,
                train_size=subset_size,
                stratify=df['binary_label'],
                random_state=42
            )
        else:
            df = df.sample(n=subset_size, random_state=42)
        print(f"   Subset created: {len(df):,} samples\n")

    preprocessor = NetworkDataPreprocessor()
    X, y = preprocessor.fit_transform(df)

    n_normal = int((y == 0).sum())
    n_attack = int((y == 1).sum())
    print(f"Dataset statistics:")
    print(f" Total samples: {len(y):,}")
    print(f" Normal traffic: {n_normal:,} ({n_normal/len(y)*100:.1f}%)")
    print(f" Attack traffic: {n_attack:,} ({n_attack/len(y)*100:.1f}%)")
    print()

    X_normal = X[y == 0]
    X_attack = X[y == 1]

    # 80% of the normal traffic is used for training.
    X_train_normal, X_holdout_normal = train_test_split(X_normal, test_size=0.2, random_state=42)

    # The held-out normal traffic is split into two disjoint halves:
    # one drives early stopping / LR scheduling, the other is only used for
    # the final evaluation, so test samples never influence training.
    X_val, X_test_normal = train_test_split(X_holdout_normal, test_size=0.5, random_state=42)

    # 40% of the attacks go to the test set; the rest may contaminate training.
    X_attack_test, X_attack_remaining = train_test_split(X_attack, test_size=0.6, random_state=42)

    # Training set: normal traffic plus an optional share of attacks, capped
    # by the number of attack samples actually available.
    n_contamination = min(
        int(len(X_train_normal) * contamination_rate),
        len(X_attack_remaining),
    )
    if n_contamination > 0:
        X_train = np.vstack([X_train_normal, X_attack_remaining[:n_contamination]])
    else:
        # No contamination requested (or possible): train on normal traffic only.
        X_train = X_train_normal

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print(f" Device: {device}")
    if device == 'cuda':
        print(f"  GPU: {torch.cuda.get_device_name(0)}\n")

    detector = AnomalyDetector(input_dim=X.shape[1], latent_dim=16, device=device)

    # Fewer epochs for larger datasets to keep the run time bounded.
    if use_subset and subset_size <= 100000:
        epochs = 30
    elif use_subset and subset_size <= 500000:
        epochs = 20
    else:
        epochs = 15

    detector.fit(X_train, X_val, epochs=epochs, batch_size=256)

    # Labels follow the order in which evaluate_model concatenates scores:
    # normal samples first, attack samples second.
    y_test = np.concatenate([np.zeros(len(X_test_normal)), np.ones(len(X_attack_test))])

    results = evaluate_model(detector, X_test_normal, X_attack_test, y_test)

    results['config'] = {
        'subset_size': subset_size if use_subset else len(df),
        'contamination_rate': contamination_rate,
        'epochs': epochs,
        'device': device,
        'features': X.shape[1]
    }

    return detector, results, preprocessor

Examples of different main parameters 
Quick test (fast, ~2-3 minutes):
    detector, results, preprocessor = main(use_subset=True, subset_size=50000, contamination_rate=0.05)

Medium test (balanced, ~5-8 minutes):
    detector, results, preprocessor = main(use_subset=True, subset_size=200000, contamination_rate=0.05)

Full dataset (best results, ~20-30 minutes):
    detector, results, preprocessor = main(use_subset=False, contamination_rate=0.05)

Pure unsupervised (no contamination):
    detector, results, preprocessor = main(use_subset=True, subset_size=200000, contamination_rate=0.0)

In [None]:

if __name__ == "__main__":
    # Script entry point: run the pipeline on a 500,000-sample subset with
    # 5% attack contamination in the training set.
    detector, results, preprocessor = main(use_subset=True, subset_size=500000, contamination_rate=0.05)
    
    