In [1]:
# --- STEP 1: INSTALL DEPENDENCIES ---
# Run this cell once to install the required libraries for Graph Neural Networks
!pip install -q torch-scatter -f https://data.pyg.org/whl/torch-2.0.0+cu118.html
!pip install -q torch-sparse -f https://data.pyg.org/whl/torch-2.0.0+cu118.html
!pip install -q torch-geometric
!pip install -q pandas numpy scikit-learn

# --- STEP 2: IMPORT MODULES ---
import os
import time
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# PyTorch Geometric (Graph Library)
from torch_geometric.data import Data
from torch_geometric.nn import TransformerConv, GCN2Conv

# Data Preprocessing
from sklearn.preprocessing import MinMaxScaler

# --- STEP 3: DEVICE CONFIGURATION ---
# Automatically detects if GPU is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Setup Complete. Using Device: {device}")

# --- STEP 4: HYPERPARAMETERS (From DIGNN Paper) ---
CONFIG = {
    'snapshot_size': 2048,   # Paper optimal: 2048 flows per graph
    'window_size': 10,       # Paper optimal: Sequence length of 10
    'hidden_dim': 64,        # Dimension of hidden layers
    'heads': 2,              # Attention heads for TransformerConv
    'epochs': 50,            # Sufficient for convergence
    'lr': 0.001              # Learning Rate
}

Setup Complete. Using Device: cpu


In [5]:
# --- BLOCK 3: LINE GRAPH GENERATION (Run BEFORE Block 2) ---
import torch
from torch_geometric.data import Data

def create_snapshots(df, feat_cols, label_col, snapshot_size):
    """
    Converts a DataFrame chunk into a list of Graph Snapshots.
    Nodes = Network Flows.
    Edges = Shared IP Addresses (Line Graph Methodology).
    """
    # Don't print for every chunk to keep output clean
    # print(f"  Generating snapshots...") 
    
    snapshots = []
    num_samples = len(df)
    
    # Iterate through the dataframe chunk in steps of 'snapshot_size'
    for i in range(0, num_samples, snapshot_size):
        # 1. Slice the Window
        window = df.iloc[i : i + snapshot_size].reset_index(drop=True)
        if len(window) < 10: continue # Skip tiny tails
        
        # 2. Node Features (X) & Labels (Y)
        x = torch.tensor(window[feat_cols].values, dtype=torch.float)
        y = torch.tensor(window[label_col].values, dtype=torch.long)
        
        # 3. Edge Construction (Line Graph Logic)
        # We connect flows that share a Source IP or Destination IP.
        
        # Check if topology columns exist (they should, based on Block 2)
        if 'srcip' in window.columns and 'dstip' in window.columns:
            ip_map = {}
            srcs = window['srcip'].values
            dsts = window['dstip'].values
            
            # Map IP -> List of Flow Indices
            for idx, (s, d) in enumerate(zip(srcs, dsts)):
                if s not in ip_map: ip_map[s] = []
                if d not in ip_map: ip_map[d] = []
                ip_map[s].append(idx)
                ip_map[d].append(idx)
                
            src_e, dst_e = [], []
            
            # Create Edges
            for ip, nodes in ip_map.items():
                if len(nodes) > 1:
                    # Chain Connection: Connect 0-1, 1-2, 2-3...
                    # This creates a path through all flows sharing the IP.
                    # It is much more memory efficient than Fully Connected.
                    for k in range(len(nodes)-1):
                        u, v = nodes[k], nodes[k+1]
                        # Undirected Graph (Add both directions)
                        src_e.extend([u, v])
                        dst_e.extend([v, u])
                        
            if src_e:
                edge_index = torch.tensor([src_e, dst_e], dtype=torch.long)
            else:
                # Fallback: No shared IPs in this small window (rare)
                edge_index = torch.empty((2, 0), dtype=torch.long)
        else:
            # Fallback if topology cols missing
            edge_index = torch.empty((2, 0), dtype=torch.long)
            
        # Create PyG Data Object
        data = Data(x=x, edge_index=edge_index, y=y)
        snapshots.append(data)
        
    return snapshots

# --- USAGE LINES ---
# This function is usually NOT called manually. 
# It is called automatically by 'process_data_incrementally' in Block 2.

In [8]:
# --- BLOCK 2: INCREMENTAL PROCESSING (RAM FRIENDLY) ---
import pandas as pd
import numpy as np
import os
import gc  # Garbage Collector
from sklearn.preprocessing import MinMaxScaler

def process_data_incrementally(file_paths, features_csv, snapshot_size):
    """
    Processes UNSW-NB15 CSV files incrementally using the Paper's snapshot size.
    """
    
    # --- SETUP HEADERS ---
    print(f"Step 1: Parsing Feature Headers from {features_csv}...")
    if not os.path.exists(features_csv):
        raise FileNotFoundError(f"Features file not found: {features_csv}")
        
    feat_df = pd.read_csv(features_csv, encoding='cp1252')
    headers = feat_df['Name'].str.strip().str.lower().tolist()
    
    # Scaler for consistent normalization across files
    scaler = MinMaxScaler()
    is_scaler_fitted = False
    
    all_snapshots = [] 
    input_dim = 0      
    
    # --- LOOP THROUGH FILES ---
    for fpath in file_paths:
        if not os.path.exists(fpath):
            print(f"Skipping missing file: {fpath}")
            continue
            
        print(f"\nProcessing {fpath}...")
        
        # 1. LOAD SINGLE FILE
        df = pd.read_csv(fpath, header=None, names=headers, low_memory=False)
        
        # 2. HANDLE LABELS
        if 'label' in df.columns:
            df['label'] = pd.to_numeric(df['label'], errors='coerce').fillna(0).astype(int)
            label_col = 'label'
        elif 'attack_cat' in df.columns:
            df['label'] = df['attack_cat'].apply(lambda x: 0 if str(x).strip().lower() == 'normal' else 1)
            label_col = 'label'
        else:
            df.rename(columns={df.columns[-1]: 'label'}, inplace=True)
            label_col = 'label'
            
        # 3. SORT (Time)
        if 'stime' in df.columns:
            df = df.sort_values('stime').reset_index(drop=True)
            
        # 4. PREPARE FEATURES
        topo_cols = ['srcip', 'sport', 'dstip', 'dsport', 'proto']
        cat_cols = ['proto', 'service', 'state']
        
        # One-Hot Encoding
        existing_cat = [c for c in cat_cols if c in df.columns]
        if existing_cat:
            df = pd.get_dummies(df, columns=existing_cat, drop_first=True)
            
        # Select Features
        exclude_cols = topo_cols + [label_col, 'id', 'attack_cat']
        feat_cols = [c for c in df.columns if c not in exclude_cols]
        
        # Force Numeric
        for c in feat_cols:
            df[c] = pd.to_numeric(df[c], errors='coerce')
        df.fillna(0, inplace=True)
        
        # Filter to only numeric columns to avoid dtype conversion errors
        numeric_dtypes = [np.float64, np.float32, np.int64, np.int32, np.int16, np.int8, np.uint64, np.uint32, np.uint16, np.uint8]
        feat_cols = [c for c in feat_cols if df[c].dtype in numeric_dtypes]
        
        # 5. SCALE
        if not is_scaler_fitted:
            print("  Fitting Scaler on first file...")
            scaler.fit(df[feat_cols])
            input_dim = len(feat_cols)
            is_scaler_fitted = True
        
        # align columns
        current_cols = [c for c in feat_cols if c in df.columns]
        if len(current_cols) == input_dim:
             df[current_cols] = scaler.transform(df[current_cols])

        # 6. GENERATE SNAPSHOTS (Using Configured Size)
        print(f"  Generating snapshots (Size: {snapshot_size})...")
        
        # Note: Ensure Block 3 (create_snapshots) is already run/defined before this!
        file_snapshots = create_snapshots(df, feat_cols, label_col, snapshot_size)
        all_snapshots.extend(file_snapshots)
        
        print(f"  Done. Total snapshots so far: {len(all_snapshots)}")
        
        # 7. CLEAR RAM
        del df
        del file_snapshots
        gc.collect() 
        
    return all_snapshots, input_dim

# --- USAGE LINES ---
# 1. Define File Paths
data_files = [
    'UNSW-NB15_1.csv',
    'UNSW-NB15_2.csv',
    'UNSW-NB15_3.csv',
    'UNSW-NB15_4.csv'
]
features_file = 'NUSW-NB15_features.csv'

# 2. Run Processing using CONFIG
if os.path.exists(features_file):
    # Passing CONFIG['snapshot_size'] (which is 2048)
    all_snapshots, input_dim = process_data_incrementally(
        data_files, 
        features_file, 
        snapshot_size=CONFIG['snapshot_size']
    )
    print(f"Final Dataset Ready: {len(all_snapshots)} snapshots.")
else:
    print(f"Features file '{features_file}' not found.")

Step 1: Parsing Feature Headers from NUSW-NB15_features.csv...

Processing UNSW-NB15_1.csv...
  Fitting Scaler on first file...
  Generating snapshots (Size: 2048)...
  Done. Total snapshots so far: 342

Processing UNSW-NB15_2.csv...
  Generating snapshots (Size: 2048)...
  Done. Total snapshots so far: 684

Processing UNSW-NB15_3.csv...
  Generating snapshots (Size: 2048)...
  Done. Total snapshots so far: 1026

Processing UNSW-NB15_4.csv...
  Generating snapshots (Size: 2048)...
  Done. Total snapshots so far: 1241
Final Dataset Ready: 1241 snapshots.


In [12]:
# --- BLOCK 4: DIGNN MODEL ARCHITECTURE ---
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import TransformerConv, GCN2Conv

class IGNN_GRU(nn.Module):
    """
    Integrated GNN Cell (Section 3.5 of Paper).
    Replaces standard GRU linear gates with GCNII convolutions.
    Handles variable-sized graphs by using graph-level pooling of hidden states.
    """
    def __init__(self, in_ch, out_ch, layer=1):
        super().__init__()
        # GCNII Layers for Reset(r), Update(z), Candidate(h)
        self.gcn_r = GCN2Conv(out_ch, alpha=0.1, theta=0.5, layer=layer+1)
        self.gcn_z = GCN2Conv(out_ch, alpha=0.1, theta=0.5, layer=layer+1)
        self.gcn_h = GCN2Conv(out_ch, alpha=0.1, theta=0.5, layer=layer+1)
        
        # Projection layer
        self.lin_x = nn.Linear(in_ch, out_ch)

    def forward(self, x, h_prev, edge_index):
        # x: [num_nodes, hid_dim]
        # h_prev: [hid_dim] or [1, hid_dim] - pooled hidden state from previous timestep
        # edge_index: graph connectivity
        
        x_h = self.lin_x(x)
        x_0 = x_h
        
        # Ensure h_prev is [num_nodes, hid_dim] by repeating if needed
        if h_prev.dim() == 1:
            h_prev = h_prev.unsqueeze(0)  # [hid_dim] -> [1, hid_dim]
        
        if h_prev.size(0) != x_h.size(0):
            # Repeat h_prev to match number of nodes
            h_prev = h_prev.expand(x_h.size(0), -1)
        
        # GCN gates
        r = torch.sigmoid(self.gcn_r(x_h, x_0, edge_index) + self.gcn_r(h_prev, x_0, edge_index))
        z = torch.sigmoid(self.gcn_z(x_h, x_0, edge_index) + self.gcn_z(h_prev, x_0, edge_index))
        h_tilde = torch.tanh(self.gcn_h(x_h, x_0, edge_index) + self.gcn_h(r*h_prev, x_0, edge_index))
        
        # Update state
        h_new = (1-z)*h_tilde + z*h_prev
        
        # Pool to graph level: average across all nodes
        h_pooled = h_new.mean(dim=0)  # [num_nodes, hid_dim] -> [hid_dim]
        
        return h_new, h_pooled

class DIGNN(nn.Module):
    """
    Full DIGNN-A Architecture (Modified for Variable Graph Sizes):
    Spatial (TransformerConv) -> Temporal (IG-NN with pooling) -> Classifier
    """
    def __init__(self, in_dim, hid_dim, heads=2):
        super().__init__()
        # Spatial Module
        self.spatial = TransformerConv(in_dim, hid_dim, heads=heads, concat=False)
        
        # Temporal Module
        self.temporal = IGNN_GRU(hid_dim, hid_dim)
        
        # Classifier
        self.clf = nn.Linear(hid_dim, 2)

    def forward(self, seq):
        # seq: List of Data objects (sliding window)
        h = None
        
        for data in seq:
            x, ei = data.x, data.edge_index
            
            # Spatial processing
            x_s = F.elu(self.spatial(x, ei))
            
            # Initialize hidden state if first step
            if h is None:
                h = torch.zeros(x_s.size(1), device=x_s.device)
            
            # Temporal evolution - returns (per_node_state, pooled_state)
            _, h = self.temporal(x_s, h, ei)
        
        # Final classification on pooled state
        logits = self.clf(h)  # [hid_dim] -> [2]
        return F.log_softmax(logits, dim=0)

# --- USAGE LINES ---
# model = DIGNN(in_dim=100, hid_dim=CONFIG['hidden_dim'], heads=CONFIG['heads'])
# model.to(device)


In [13]:
# --- BLOCK 5: MAIN EXECUTION ---
import torch.optim as optim
import time

# 1. DEFINE FILES (Update these to your exact paths)
data_files = [
    'UNSW-NB15_1.csv',
    'UNSW-NB15_2.csv',
    'UNSW-NB15_3.csv',
    'UNSW-NB15_4.csv'
]
features_file = 'NUSW-NB15_features.csv'

# 2. CHECK & RUN PROCESSING
if os.path.exists(features_file):
    print("--- STARTING PIPELINE ---")
    
    # Call Block 2 Function (which calls Block 3)
    # Uses CONFIG['snapshot_size'] = 2048
    all_snapshots, input_dim = process_data_incrementally(
        data_files, 
        features_file, 
        snapshot_size=CONFIG['snapshot_size']
    )
    
    print(f"\nDataset Ready. Total Snapshots: {len(all_snapshots)}")
    
    # 3. SPLIT TRAIN/TEST (Time-based 80/20)
    split_idx = int(len(all_snapshots) * 0.8)
    train_snapshots = all_snapshots[:split_idx]
    test_snapshots = all_snapshots[split_idx:]
    
    print(f"Train Size: {len(train_snapshots)} | Test Size: {len(test_snapshots)}")
    
    # 4. TRAINING LOOP
    if len(train_snapshots) > CONFIG['window_size']:
        print(f"\n--- TRAINING ({CONFIG['epochs']} Epochs) ---")
        
        # Initialize Model (Block 4)
        model = DIGNN(in_dim=input_dim, hid_dim=CONFIG['hidden_dim'], heads=CONFIG['heads']).to(device)
        
        optimizer = optim.Adam(model.parameters(), lr=CONFIG['lr'])
        criterion = nn.NLLLoss()  # Use NLLLoss since output is log_softmax
        
        model.train()
        
        for ep in range(CONFIG['epochs']):
            total_loss = 0
            steps = 0
            start_time = time.time()
            
            # Sliding Window Loop
            # We slide over the list of snapshots: [0..9] -> predict 9, [1..10] -> predict 10...
            for i in range(len(train_snapshots) - CONFIG['window_size']):
                # Get Sequence
                sequence = train_snapshots[i : i+CONFIG['window_size']]
                
                # Move to Device
                seq_gpu = [d.to(device) for d in sequence]
                
                # Target is the MAJORITY label of the last snapshot (graph-level prediction)
                y_target = seq_gpu[-1].y
                # Use majority class or mean label for graph-level target
                graph_label = (y_target.float().mean() > 0.5).long()
                
                optimizer.zero_grad()
                output = model(seq_gpu)  # Output shape: [2] (log probabilities for 2 classes)
                
                # Calculate Loss
                loss = criterion(output.unsqueeze(0), graph_label.unsqueeze(0))  # Add batch dimension
                loss.backward()
                optimizer.step()
                
                total_loss += loss.item()
                steps += 1
            
            avg_loss = total_loss / steps if steps > 0 else 0
            print(f"Epoch {ep+1}/{CONFIG['epochs']} | Loss: {avg_loss:.4f} | Time: {time.time()-start_time:.1f}s")
            
        # Save Model
        torch.save(model.state_dict(), 'dignn_nb15_complete.pth')
        print("Model Saved.")
        
        # 5. TESTING LOOP
        print("\n--- EVALUATION ---")
        model.eval()
        correct = 0
        total_samples = 0
        
        with torch.no_grad():
            for i in range(len(test_snapshots) - CONFIG['window_size']):
                sequence = test_snapshots[i : i+CONFIG['window_size']]
                seq_gpu = [d.to(device) for d in sequence]
                
                # Graph-level target
                y_target = seq_gpu[-1].y
                graph_label = (y_target.float().mean() > 0.5).long()
                
                output = model(seq_gpu)  # Output shape: [2]
                pred = output.argmax(dim=0)  # Get class with highest probability
                
                correct += (pred == graph_label).sum().item()
                total_samples += 1
                
        acc = correct / total_samples if total_samples > 0 else 0
        print(f"Test Accuracy: {acc:.4f}")
        
    else:
        print("Error: Not enough data for the specified window size.")
else:
    print(f"Error: Features file '{features_file}' not found.")


--- STARTING PIPELINE ---
Step 1: Parsing Feature Headers from NUSW-NB15_features.csv...

Processing UNSW-NB15_1.csv...
  Fitting Scaler on first file...
  Generating snapshots (Size: 2048)...
  Done. Total snapshots so far: 342

Processing UNSW-NB15_2.csv...
  Generating snapshots (Size: 2048)...
  Done. Total snapshots so far: 684

Processing UNSW-NB15_3.csv...
  Generating snapshots (Size: 2048)...
  Done. Total snapshots so far: 1026

Processing UNSW-NB15_4.csv...
  Generating snapshots (Size: 2048)...
  Done. Total snapshots so far: 1241

Dataset Ready. Total Snapshots: 1241
Train Size: 992 | Test Size: 249

--- TRAINING (50 Epochs) ---
Epoch 1/50 | Loss: 0.1235 | Time: 157.4s
Epoch 2/50 | Loss: 0.1193 | Time: 168.2s
Epoch 3/50 | Loss: 0.1180 | Time: 165.8s
Epoch 4/50 | Loss: 0.1160 | Time: 179.1s
Epoch 5/50 | Loss: 0.1127 | Time: 160.9s
Epoch 6/50 | Loss: 0.1156 | Time: 176.6s
Epoch 7/50 | Loss: 0.1135 | Time: 166.2s
Epoch 8/50 | Loss: 0.1143 | Time: 166.8s
Epoch 9/50 | Loss: 0.1