In [None]:
"""
Training the original DANTE model in 10 epochs
"""

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset, DistributedSampler
import torch.distributed as dist
import os
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score

Datasets

In [None]:
# extractXYogDANTE

In [None]:
# ==============================================================================
# DATASET SETUP (re-using the structure defined previously)
# ==============================================================================

class InsiderThreatDataset(Dataset):
    """Map-style dataset pairing pickled ID sequences with per-sample labels.

    Args:
        X_path: Path to a pickle whose object exposes ``.tolist()`` and yields
            one equal-length sequence of integer IDs per sample (e.g. a pandas
            Series of lists/arrays).
        y_path: Path to a pickle whose object exposes ``.values`` and holds one
            numeric label per sample.

    Attributes:
        X: ``torch.long`` tensor with one row per sample (Long is required for
            ``nn.Embedding`` input).
        y: ``torch.float32`` tensor of shape ``(N, 1)``.

    Raises:
        ValueError: If the number of sequences and the number of labels differ.
    """

    def __init__(self, X_path, y_path):
        # NOTE(security): pd.read_pickle can execute arbitrary code while
        # unpickling; only load files that you produced yourself.
        sequences = pd.read_pickle(X_path)
        labels = pd.read_pickle(y_path)

        # Stack into a single ndarray before handing the data to torch.
        # Building a tensor straight from a Python list of per-sample
        # sequences converts element by element, which is very slow and makes
        # torch emit a UserWarning when the elements are numpy arrays.
        self.X = torch.as_tensor(np.asarray(sequences.tolist()), dtype=torch.long)
        # unsqueeze(1) turns the labels into (N, 1) for binary classification.
        self.y = torch.tensor(labels.values.astype(float), dtype=torch.float32).unsqueeze(1)

        if len(self.X) != len(self.y):
            raise ValueError(
                f"X has {len(self.X)} samples but y has {len(self.y)} labels"
            )

    def __len__(self):
        """Return the number of samples."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return the ``(sequence, label)`` pair stored at ``idx``."""
        return self.X[idx], self.y[idx]

Model Definition

In [None]:
# This file contains the DANTE.v2 model definition (InsiderClassifier)

import torch
import torch.nn as nn
import torch.nn.functional as F

# Model Definition:

class LSTM_Encoder(nn.Module):
    """Embedding + stacked LSTM encoder with a token-reconstruction head.

    The output depends on the module's mode:

    * training mode: per-timestep log-probabilities over the vocabulary,
      shape ``(batch, seq_len, input_size)``, for a reconstruction loss.
    * eval mode: the raw LSTM output sequence, shape
      ``(batch, seq_len, lstm_hidden_size)``, for a downstream classifier.

    Args:
        padding_idx: Forwarded to ``nn.Embedding``.
        input_size: Vocabulary size (number of embeddings / decoder outputs).
        embedding_dim: Width of each token embedding (the LSTM input width).
        lstm_hidden_size: Hidden width of every LSTM layer.
        num_layers: Number of stacked LSTM layers.
        dropout_rate: Dropout used between LSTM layers and before the decoder.
    """

    def __init__(self, padding_idx=None, input_size=201, embedding_dim=40, lstm_hidden_size=40, num_layers=3, dropout_rate=0.5):
        super().__init__()

        # Sizes kept on the instance for later use.
        self.input_size = input_size
        self.lstm_hidden_size = lstm_hidden_size

        # Token ID -> dense vector.
        self.embedding = nn.Embedding(
            num_embeddings=input_size,
            embedding_dim=embedding_dim,
            padding_idx=padding_idx,
        )

        # Optional one-hot fallback used by forward() when no embedding is set.
        self.one_hot_encoder = F.one_hot

        # Recurrent core; consumes the embedded tokens batch-first.
        self.lstm_encoder = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=lstm_hidden_size,
            num_layers=num_layers,
            dropout=dropout_rate,
            batch_first=True,
        )

        # Reconstruction head: hidden state -> vocabulary log-probabilities.
        self.dropout = nn.Dropout(p=dropout_rate)
        self.decoder = nn.Linear(in_features=lstm_hidden_size, out_features=input_size)
        self.log_softmax = nn.LogSoftmax(dim=2)

    def forward(self, sequence):
        """Encode ``sequence`` of shape ``(batch, seq_len)`` holding token IDs."""
        # Input processing: embedding lookup, or one-hot when no embedding.
        tokens = (
            self.embedding(sequence)
            if self.embedding
            else self.one_hot_encoder(sequence, num_classes=self.input_size).float()
        )

        # (batch, seq_len, lstm_hidden_size); the final (h, c) state is unused.
        encoded, _ = self.lstm_encoder(tokens)

        # Inference: hand the hidden sequence straight to the classifier.
        if not self.training:
            return encoded

        # Training: dropout -> decoder -> log-softmax over the vocabulary axis.
        return self.log_softmax(self.decoder(self.dropout(encoded)))


class CNN_Classifier(nn.Module):
    """Two-layer CNN that scores an encoded sequence as one of two classes.

    Input is a single-channel "image" of shape
    ``(batch, 1, seq_length, lstm_hidden_size)``. The output has shape
    ``(batch, 2)`` and is the raw output of the final linear layer;
    ``self.softmax`` is not applied in ``forward``.

    Args:
        seq_length: Height of the input (number of timesteps).
        lstm_hidden_size: Width of the input (encoder hidden size).
    """

    def __init__(self, seq_length=200, lstm_hidden_size=40):
        super().__init__()

        self.seq_length = seq_length
        self.lstm_hidden_size = lstm_hidden_size

        # The 5x5 convolutions use padding=2, so they keep the spatial size;
        # each 2x2 / stride-2 max-pool halves it (rounding down).
        final_seq_dim = self.seq_length // 4
        final_hidden_dim = self.lstm_hidden_size // 4

        # Convolutional layers
        self.conv1 = nn.Conv2d(1, 32, kernel_size=5, padding=2)
        self.maxpool1 = nn.MaxPool2d(2, stride=2)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=5, padding=2)
        self.maxpool2 = nn.MaxPool2d(2, stride=2)

        # Flattened feature count after the second pooling layer.
        linear_input_size = 64 * final_seq_dim * final_hidden_dim

        # nn.Flatten instead of a lambda attribute: a lambda stored on the
        # module makes the whole model unpicklable (e.g. torch.save(model)).
        # nn.Flatten has no parameters, so state_dict keys are unchanged.
        self.flatten = nn.Flatten()
        self.linear = nn.Linear(linear_input_size, 2)  # 2 classes (malicious / not malicious)
        # Kept for attribute compatibility; forward() does not call it.
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, x):
        """Return unnormalised class scores of shape ``(batch, 2)``."""
        x = self.conv1(x)
        x = self.maxpool1(x)
        x = self.conv2(x)
        x = self.maxpool2(x)
        x = self.flatten(x)
        x = self.linear(x)

        return x

class InsiderClassifier(nn.Module):
    """Frozen pre-trained ``LSTM_Encoder`` followed by a trainable ``CNN_Classifier``.

    ``forward`` encodes the ID sequences with the encoder (always in eval mode
    and without gradient tracking), squashes the hidden states with a sigmoid,
    adds a channel dimension and lets the CNN produce two class scores per
    sample.

    Args:
        lstm_checkpoint: Path (or file-like object) of a saved ``LSTM_Encoder``
            state dict, passed to ``torch.load``.
        device: Device the whole model is moved to; a string or a
            ``torch.device``.
    """

    def __init__(self, lstm_checkpoint, device='cuda'):
        super().__init__()

        # Assuming similar architecture as in the original code:
        lstm_hidden_size = 40
        seq_length = 200  # Adjusted (max_length=250)

        self.lstm_encoder = LSTM_Encoder(lstm_hidden_size=lstm_hidden_size)
        self.load_encoder(lstm_checkpoint, device=device)
        self._freeze_encoder()

        self.sigmoid = nn.Sigmoid()
        self.cnn_classifier = CNN_Classifier(seq_length=seq_length, lstm_hidden_size=lstm_hidden_size)

        # Move the entire model to the correct device upon initialization
        self.to(device)

    def _freeze_encoder(self):
        """Put the encoder in eval mode and stop gradient tracking for its parameters."""
        self.lstm_encoder.eval()
        # requires_grad_() flips the flag on every parameter. Assigning
        # `module.requires_grad = False` only creates a plain Python attribute
        # on the module and leaves all parameters trainable.
        self.lstm_encoder.requires_grad_(False)

    def train(self, mode=True):
        """Switch train/eval mode for everything except the encoder.

        Only the CNN classifier is trained; the encoder is always put back in
        eval mode so that it keeps returning hidden states instead of
        reconstruction log-probabilities.
        """
        super().train(mode)
        self._freeze_encoder()
        return self

    def load_encoder(self, checkpoint, device):
        """Load encoder weights from ``checkpoint`` and move the encoder to ``device``.

        ``strict=True`` makes any key mismatch between the checkpoint and
        ``LSTM_Encoder`` raise instead of being silently ignored.
        """
        # NOTE(security): torch.load unpickles the file; only load checkpoints
        # from a trusted source.
        # map_location lets the checkpoint load regardless of the device it
        # was saved from.
        state = torch.load(checkpoint, map_location=torch.device(device))
        self.lstm_encoder.load_state_dict(state, strict=True)
        self.lstm_encoder.to(device)
        return self

    def forward(self, x):
        """Return class scores of shape ``(batch, 2)`` for ID sequences ``x`` of shape ``(batch, seq_len)``."""
        # Ensure input data is on the same device as the model
        device = next(self.parameters()).device
        x = x.to(device)

        with torch.no_grad():
            hidden_state = self.lstm_encoder(x)
            hidden_state = self.sigmoid(hidden_state)

        # hidden_state shape: (batch_size, seq_len, lstm_hidden_size)
        # CNN expects (N, C, H, W). We add the channel dimension (C=1)
        scores = self.cnn_classifier(hidden_state.unsqueeze(1))

        return scores


In [None]:
# ==============================================================================
# CONFIGURATION VARIABLES (change these for your specific environment)
# ==============================================================================

# --- A. DEVICE / SINGLE-GPU CONFIG ---
# 'cpu', 'cuda', or 'cuda:0'
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'


# --- B. DDP (Distributed Data Parallel) CONFIG ---
# Set to True to enable DDP for multi-GPU/multi-node training
USE_DDP = False
# DDP parameters (only relevant if USE_DDP is True).
# Read from the environment the same way the training function reads
# LOCAL_RANK: a hard-coded rank would make every launched process claim
# rank 0. Without these variables the values fall back to a single process.
RANK = int(os.environ.get("RANK", 0))              # Rank of the current process (0 to WORLD_SIZE - 1)
WORLD_SIZE = int(os.environ.get("WORLD_SIZE", 1))  # Total number of participating GPUs/processes
BACKEND = 'nccl' # Communication backend (usually 'nccl' for GPUs)
INIT_METHOD = 'env://' # How processes find each other (e.g., environment variables)

# --- C. HYPERPARAMETERS & DATA LOADING ---
BATCH_SIZE = 64
LEARNING_RATE = 1e-3 # default learning rate for torch.optim.Adam
# Data paths (Assuming X.pkl contains padded action_id sequences, y.pkl contains labels)
X_PATH = 'X_train_ogDANTE.pkl'
Y_PATH = 'y_train_ogDANTE.pkl'
NUM_WORKERS = 0 # How many subprocesses to use for data loading

# ==============================================================================
# MAIN TRAINING FUNCTION (UPDATED)
# ==============================================================================

def _report_training_window(epoch, batch_idx, loss_sum, n_batches, preds, labels):
    """Print mean loss and positive-class metrics for the batches since the last report."""
    window_preds = np.array(preds)
    window_labels = np.array(labels)

    accuracy = accuracy_score(window_labels, window_preds)
    f1 = f1_score(window_labels, window_preds, average='binary', zero_division=0)
    # Recall and precision for the malicious (positive / class 1) class
    recall = recall_score(window_labels, window_preds, average='binary', zero_division=0)
    precision = precision_score(window_labels, window_preds, average='binary', zero_division=0)

    avg_loss = loss_sum / n_batches
    print(f"[Epoch {epoch+1}, Batch {batch_idx+1}] "
          f"Loss: {avg_loss:.4f} | "
          f"Acc: {accuracy:.4f} | "
          f"F1: {f1:.4f} | "
          f"Prec: {precision:.4f} | "
          f"Recall: {recall:.4f}")


def train_cnn_classifier(EPOCHS=10, OUTPUT_FILENAME='ogDANTE_logsoft.pkl', LSTM_CHECKPOINT_PATH='kkogDANTE'):
    """Train the CNN head of ``InsiderClassifier`` and save the model's state dict.

    Reads the module-level configuration (``USE_DDP``, ``RANK``,
    ``WORLD_SIZE``, ``BACKEND``, ``INIT_METHOD``, ``DEVICE``, ``X_PATH``,
    ``Y_PATH``, ``BATCH_SIZE``, ``LEARNING_RATE``, ``NUM_WORKERS``).

    Args:
        EPOCHS: Number of passes over the training data.
        OUTPUT_FILENAME: Where the final ``state_dict`` is written
            (only by rank 0 when DDP is enabled).
        LSTM_CHECKPOINT_PATH: Pre-trained ``LSTM_Encoder`` state dict that
            ``InsiderClassifier`` loads.

    Progress is printed every ``report_every`` batches, plus once at the end of
    each epoch for any remaining batches, so no batch is left out of the
    reported metrics.
    """
    report_every = 100  # batches per progress report

    # ------------------------------------
    # 1. DDP and Device Initialization
    # ------------------------------------
    if USE_DDP:
        # Initialize the distributed process group
        dist.init_process_group(BACKEND, init_method=INIT_METHOD, rank=RANK, world_size=WORLD_SIZE)
        # Use the local rank (device ID) as the actual training device
        local_rank = int(os.environ["LOCAL_RANK"]) if "LOCAL_RANK" in os.environ else 0
        current_device = torch.device(f'cuda:{local_rank}')
        print(f"DDP: Rank {RANK} initialized on device {current_device}")
    else:
        # Single-device setup
        current_device = torch.device(DEVICE)
        print(f"Single Device: Initializing on device {current_device}")

    # From here on the process group (if any) exists; the finally clause tears
    # it down even when data loading or training raises.
    try:
        # ------------------------------------
        # 2. Data Loading
        # ------------------------------------
        dataset = InsiderThreatDataset(X_PATH, Y_PATH)

        if USE_DDP:
            # DistributedSampler does the shuffling, so the DataLoader must not
            sampler = DistributedSampler(dataset, num_replicas=WORLD_SIZE, rank=RANK, shuffle=True)
            dataloader = DataLoader(
                dataset,
                batch_size=BATCH_SIZE,
                sampler=sampler,
                num_workers=NUM_WORKERS
            )
        else:
            # Standard DataLoader for single device
            dataloader = DataLoader(
                dataset,
                batch_size=BATCH_SIZE,
                shuffle=True,
                num_workers=NUM_WORKERS
            )

        # ------------------------------------
        # 3. Model, Loss, and Optimizer
        # ------------------------------------
        model = InsiderClassifier(
            lstm_checkpoint=LSTM_CHECKPOINT_PATH,
            device=current_device
        )

        # DDP wrapping (if applicable)
        if USE_DDP:
            model = nn.parallel.DistributedDataParallel(model, device_ids=[local_rank])

        criterion = nn.NLLLoss() # original DANTE used this one
        logsoftmax = nn.LogSoftmax(dim=1)

        # LEARNING_RATE equals Adam's default (1e-3); passing it makes the
        # configuration constant take effect.
        optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

        print("Starting unweighted training...")

        # ------------------------------------
        # 4. Training Loop
        # ------------------------------------
        for epoch in range(EPOCHS):
            if USE_DDP:
                dataloader.sampler.set_epoch(epoch)

            model.train()

            # Accumulators for the current reporting window
            running_loss = 0.0
            window_batches = 0
            temp_preds, temp_labels = [], []
            last_batch_idx = -1

            for i, (X_batch, y_batch) in enumerate(dataloader):
                last_batch_idx = i
                if X_batch.shape[1] != 200:
                    print("actual batch shape is", X_batch.shape)
                X_batch = X_batch.long().to(current_device)
                # NLLLoss expects class indices as an (N,) Long tensor
                y_batch_long = y_batch.long().squeeze(1).to(current_device)

                optimizer.zero_grad()
                scores = model(X_batch)
                # The model returns raw scores; NLLLoss needs log-probabilities
                loss = criterion(logsoftmax(scores), y_batch_long)
                loss.backward()
                optimizer.step()

                running_loss += loss.item()
                window_batches += 1

                # --- METRICS COLLECTION ---
                _, predicted_classes = torch.max(scores, 1)
                temp_preds.extend(predicted_classes.cpu().tolist())
                temp_labels.extend(y_batch_long.cpu().tolist())

                if (i + 1) % report_every == 0:
                    _report_training_window(
                        epoch, i, running_loss, window_batches, temp_preds, temp_labels
                    )
                    # Reset counters for the next window
                    running_loss = 0.0
                    window_batches = 0
                    temp_preds, temp_labels = [], []

            # Report the batches after the last full window instead of
            # silently dropping them.
            if window_batches:
                _report_training_window(
                    epoch, last_batch_idx, running_loss, window_batches, temp_preds, temp_labels
                )

        # ------------------------------------
        # 5. Final Model Saving
        # ------------------------------------
        # In DDP, ensure only Rank 0 saves the model
        if not USE_DDP or RANK == 0:
            # Get the actual model state, unwrapping DDP if necessary
            model_to_save = model.module if USE_DDP else model

            torch.save(model_to_save.state_dict(), OUTPUT_FILENAME)
            print(f"\nTraining Complete. Final model parameters saved to: {OUTPUT_FILENAME}")
    finally:
        if USE_DDP:
            dist.destroy_process_group()

In [None]:
def train_lstm_encoder(EPOCHS=2, LSTM_CHECKPOINT_PATH='kkogDANTE'):
    """Pre-train ``LSTM_Encoder`` to reconstruct its own input sequence and save it.

    Uses the module-level ``DEVICE``, ``X_PATH``, ``Y_PATH``, ``BATCH_SIZE``
    and ``NUM_WORKERS`` settings.

    Args:
        EPOCHS: Number of passes over the data.
        LSTM_CHECKPOINT_PATH: Destination of the encoder's ``state_dict``.
    """
    # ------------------------------------
    # 1. Device and data
    # ------------------------------------
    current_device = torch.device(DEVICE)
    print(f"Starting LSTM Encoder training on device {current_device}...")

    # The dataset also loads the labels; they are simply ignored below.
    loader = DataLoader(
        InsiderThreatDataset(X_PATH, Y_PATH),
        batch_size=BATCH_SIZE,
        shuffle=True,
        num_workers=NUM_WORKERS,
    )

    # ------------------------------------
    # 2. Model, loss, optimizer
    # ------------------------------------
    # The bare encoder is trained here (not wrapped in InsiderClassifier).
    # Training mode makes it return log-probabilities from the decoder head.
    encoder = LSTM_Encoder().to(current_device)
    encoder.train()

    nll = nn.NLLLoss()
    adam = optim.Adam(encoder.parameters())  # default learning rate, as in the original

    # ------------------------------------
    # 3. Training loop
    # ------------------------------------
    for epoch_idx in range(EPOCHS):
        epoch_loss = 0.0

        for token_ids, _unused_labels in loader:
            token_ids = token_ids.to(current_device)

            adam.zero_grad()

            # (B, S, V) log-probabilities over the vocabulary per timestep.
            log_probs = encoder(token_ids)

            # NLLLoss wants the class axis second: (B, V, S) against (B, S)
            # integer targets, which here are the input IDs themselves.
            batch_loss = nll(log_probs.permute(0, 2, 1), token_ids.long())

            batch_loss.backward()
            adam.step()

            epoch_loss += batch_loss.item()

        avg_loss = epoch_loss / len(loader)
        print(f"Epoch {epoch_idx+1}/{EPOCHS} | Avg Reconstruction Loss: {avg_loss:.4f}", end='\t')
        print()

    # ------------------------------------
    # 4. Save the checkpoint
    # ------------------------------------
    torch.save(encoder.state_dict(), f'{LSTM_CHECKPOINT_PATH}')
    print(f"\nLSTM Encoder training complete. Checkpoint saved to: {LSTM_CHECKPOINT_PATH}")

In [None]:
# Pre-train the LSTM encoder with its defaults (2 epochs); the weights are
# saved to the checkpoint path 'kkogDANTE'.
train_lstm_encoder()

Starting LSTM Encoder training on device cuda...


  self.X = torch.tensor(self.X.tolist(), dtype=torch.long)


Epoch 1/2 | Avg Reconstruction Loss: 0.2398	
Epoch 2/2 | Avg Reconstruction Loss: 0.0127	

LSTM Encoder training complete. Checkpoint saved to: kkogDANTE


In [None]:
# Train the CNN classifier with its defaults (10 epochs); it loads the encoder
# checkpoint 'kkogDANTE' and saves the result to 'ogDANTE_logsoft.pkl'.
train_cnn_classifier()

Single Device: Initializing on device cuda
Starting unweighted training...
[Epoch 1, Batch 100] Loss: 0.3180 | Acc: 0.9881 | F1: 0.0000 | Prec: 0.0000 | Recall: 0.0000
[Epoch 1, Batch 200] Loss: 0.1261 | Acc: 0.9964 | F1: 0.0000 | Prec: 0.0000 | Recall: 0.0000
[Epoch 1, Batch 300] Loss: 0.1124 | Acc: 0.9959 | F1: 0.0000 | Prec: 0.0000 | Recall: 0.0000
[Epoch 1, Batch 400] Loss: 0.0914 | Acc: 0.9980 | F1: 0.0000 | Prec: 0.0000 | Recall: 0.0000
[Epoch 1, Batch 500] Loss: 0.0238 | Acc: 0.9986 | F1: 0.0000 | Prec: 0.0000 | Recall: 0.0000
[Epoch 1, Batch 600] Loss: 0.0393 | Acc: 0.9983 | F1: 0.0000 | Prec: 0.0000 | Recall: 0.0000
[Epoch 1, Batch 700] Loss: 0.0207 | Acc: 0.9980 | F1: 0.0000 | Prec: 0.0000 | Recall: 0.0000
[Epoch 1, Batch 800] Loss: 0.0137 | Acc: 0.9989 | F1: 0.0000 | Prec: 0.0000 | Recall: 0.0000
[Epoch 1, Batch 900] Loss: 0.0198 | Acc: 0.9980 | F1: 0.0000 | Prec: 0.0000 | Recall: 0.0000
[Epoch 1, Batch 1000] Loss: 0.0097 | Acc: 0.9988 | F1: 0.0000 | Prec: 0.0000 | Recall: 0