# Transformer with All Features Combined

This notebook implements a Transformer Encoder for syscall-based malware detection using **all features combined**:
- **Syscall**: Categorical feature (embedded)
- **Return Value (Ret)**: Categorical feature (embedded)
- **Parameters**: Text feature (sentence transformer embeddings)

The three feature representations are concatenated at each timestep and projected to a hidden dimension before being fed to the Transformer.

In [1]:
import sys
sys.path.insert(0, '../../configs')
from config_loader import get_split_with_labels

import time
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, f1_score
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

# Config
SPLIT = '70'
WINDOW_SIZES = [2000]  # Different sliding window lengths to test
# WINDOW_SIZES = [250, 500, 1000, 2000]  # Different sliding window lengths to test
BATCH_SIZE = 32
EPOCHS = 6
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {DEVICE}")

# Embedding dimensions
SYSCALL_EMBED_DIM = 32
RETVAL_EMBED_DIM = 32

# Load sentence transformer model for parameter embeddings
print("Loading sentence transformer model...")
sentence_model = SentenceTransformer('all-MiniLM-L6-v2', device=DEVICE)
PARAM_EMBED_DIM = sentence_model.get_sentence_embedding_dimension()
print(f"Parameter embedding dimension: {PARAM_EMBED_DIM}")
print(f"Total combined embedding dimension: {SYSCALL_EMBED_DIM + RETVAL_EMBED_DIM + PARAM_EMBED_DIM}")

Using device: cuda
Loading sentence transformer model...
Parameter embedding dimension: 384
Total combined embedding dimension: 448


In [2]:
# Load data
train_files, test_files = get_split_with_labels(SPLIT)
print(f"Train files: {len(train_files)}, Test files: {len(test_files)}")

def load_runs_all_features(file_path):
    """Load all features (syscall, ret, parameters) grouped by run.
    
    Returns list of runs, where each run is a dict with:
        - 'syscalls': list of syscall names
        - 'retvals': list of return values
        - 'params': list of parameter strings
    """
    df = pd.read_csv(file_path)
    runs = []
    for run_id, group in df.groupby('run'):
        run_data = {
            'syscalls': group['syscall'].tolist(),
            'retvals': group['Ret'].tolist(),
            'params': group['parameters'].tolist()
        }
        runs.append(run_data)
    return runs

# Count runs per label for training and test sets
def count_runs_per_label(file_label_pairs):
    """Count total runs per label."""
    counts = {'benign': 0, 'malicious': 0}
    for path, label in file_label_pairs:
        runs = load_runs_all_features(path)
        counts[label] += len(runs)
    return counts

train_counts = count_runs_per_label(train_files)
test_counts = count_runs_per_label(test_files)

print(f"\nTraining set:")
print(f"  Total runs: {sum(train_counts.values())}")
print(f"  Benign runs: {train_counts['benign']}")
print(f"  Malicious runs: {train_counts['malicious']}")

print(f"\nTest set:")
print(f"  Total runs: {sum(test_counts.values())}")
print(f"  Benign runs: {test_counts['benign']}")
print(f"  Malicious runs: {test_counts['malicious']}")

Train files: 21, Test files: 9

Training set:
  Total runs: 1986
  Benign runs: 1484
  Malicious runs: 502

Test set:
  Total runs: 810
  Benign runs: 489
  Malicious runs: 321


In [3]:
# Build encoders for syscalls and return values
print("Building encoders for syscalls and return values...")

all_syscalls = []
all_retvals = []
all_params = set()

for path, _ in train_files + test_files:
    for run_data in load_runs_all_features(path):
        all_syscalls.extend(run_data['syscalls'])
        all_retvals.extend(run_data['retvals'])
        for param in run_data['params']:
            if pd.isna(param):
                all_params.add('<EMPTY>')
            else:
                all_params.add(str(param))

# Build syscall encoder
syscall_encoder = LabelEncoder()
syscall_encoder.fit(all_syscalls)
syscall_vocab_size = len(syscall_encoder.classes_) + 1  # +1 for PAD token
print(f"Syscall vocabulary size: {syscall_vocab_size} (including PAD)")

# Build return value encoder
retval_encoder = LabelEncoder()
retval_encoder.fit(all_retvals)
retval_vocab_size = len(retval_encoder.classes_) + 1  # +1 for PAD token
print(f"Return value vocabulary size: {retval_vocab_size} (including PAD)")

# PAD index for embeddings
PAD_IDX = 0

print(f"\nUnique parameter strings: {len(all_params)}")

Building encoders for syscalls and return values...
Syscall vocabulary size: 81 (including PAD)
Return value vocabulary size: 42586 (including PAD)

Unique parameter strings: 266019


In [4]:
# Pre-compute sentence embeddings for all unique parameter strings
print("Computing sentence embeddings for parameters (this may take a few minutes)...")

all_params_list = list(all_params)
param_embeddings = sentence_model.encode(
    all_params_list,
    show_progress_bar=True,
    batch_size=256,
    convert_to_numpy=True
)

# Create a mapping from parameter string to embedding
param_to_embedding = {param: emb for param, emb in zip(all_params_list, param_embeddings)}
print(f"Parameter embeddings computed. Shape per embedding: {PARAM_EMBED_DIM}")

# Create zero embedding for padding
PAD_PARAM_EMBEDDING = np.zeros(PARAM_EMBED_DIM, dtype=np.float32)

Computing sentence embeddings for parameters (this may take a few minutes)...


Batches:   0%|          | 0/1040 [00:00<?, ?it/s]

Parameter embeddings computed. Shape per embedding: 384


In [5]:
class AllFeaturesDataset(Dataset):
    """Dataset that combines syscall, return value, and parameter features.
    
    For each timestep, we have:
    - Syscall index (to be embedded by the model)
    - Return value index (to be embedded by the model)
    - Parameter embedding (pre-computed sentence embedding)
    """
    def __init__(self, file_label_pairs, syscall_encoder, retval_encoder, 
                 param_to_embedding, window_size, param_embed_dim, pad_param_embedding):
        self.syscall_seqs = []
        self.retval_seqs = []
        self.param_embeddings = []
        self.labels = []
        label_map = {'benign': 0, 'malicious': 1}
        
        for path, label in file_label_pairs:
            runs = load_runs_all_features(path)
            for run_data in runs:
                syscalls = run_data['syscalls']
                retvals = run_data['retvals']
                params = run_data['params']
                
                seq_len = min(len(syscalls), window_size)
                
                # Encode syscalls (+1 to reserve 0 for PAD)
                encoded_syscalls = syscall_encoder.transform(syscalls[:seq_len]) + 1
                if len(encoded_syscalls) < window_size:
                    encoded_syscalls = np.pad(encoded_syscalls, 
                                              (0, window_size - len(encoded_syscalls)), 
                                              constant_values=PAD_IDX)
                
                # Encode return values (+1 to reserve 0 for PAD)
                encoded_retvals = retval_encoder.transform(retvals[:seq_len]) + 1
                if len(encoded_retvals) < window_size:
                    encoded_retvals = np.pad(encoded_retvals, 
                                             (0, window_size - len(encoded_retvals)), 
                                             constant_values=PAD_IDX)
                
                # Get parameter embeddings
                param_embs = []
                for i in range(window_size):
                    if i < len(params):
                        param = params[i]
                        if pd.isna(param):
                            key = '<EMPTY>'
                        else:
                            key = str(param)
                        param_embs.append(param_to_embedding[key])
                    else:
                        param_embs.append(pad_param_embedding)
                
                self.syscall_seqs.append(encoded_syscalls)
                self.retval_seqs.append(encoded_retvals)
                self.param_embeddings.append(np.array(param_embs, dtype=np.float32))
                self.labels.append(label_map[label])
        
        self.syscall_seqs = np.array(self.syscall_seqs)
        self.retval_seqs = np.array(self.retval_seqs)
        self.param_embeddings = np.array(self.param_embeddings)
        self.labels = np.array(self.labels)
    
    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, idx):
        return (
            torch.tensor(self.syscall_seqs[idx], dtype=torch.long),
            torch.tensor(self.retval_seqs[idx], dtype=torch.long),
            torch.tensor(self.param_embeddings[idx], dtype=torch.float32),
            torch.tensor(self.labels[idx], dtype=torch.long)
        )

In [6]:
class TransformerEncoderAllFeatures(nn.Module):
    """Transformer Encoder that combines all features:
    - Syscall embeddings (learned)
    - Return value embeddings (learned)
    - Parameter embeddings (pre-computed sentence embeddings)
    
    All three are concatenated at each timestep, projected to hidden dim,
    and then fed to the Transformer encoder with positional encoding.
    """
    def __init__(self, syscall_vocab_size, retval_vocab_size,
                 syscall_embed_dim, retval_embed_dim, param_embed_dim,
                 max_len, hidden_dim=128, num_heads=4, num_layers=2, 
                 ff_dim=256, dropout=0.1):
        super().__init__()
        
        # Embedding layers for categorical features
        self.syscall_embedding = nn.Embedding(syscall_vocab_size, syscall_embed_dim, padding_idx=PAD_IDX)
        self.retval_embedding = nn.Embedding(retval_vocab_size, retval_embed_dim, padding_idx=PAD_IDX)
        
        # Total input dimension after concatenation
        total_embed_dim = syscall_embed_dim + retval_embed_dim + param_embed_dim
        
        # Project concatenated embeddings to hidden dimension
        self.input_projection = nn.Linear(total_embed_dim, hidden_dim)
        
        # Learnable positional encoding
        self.pos_encoding = nn.Parameter(torch.zeros(1, max_len, hidden_dim))
        
        # Transformer encoder layers
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=hidden_dim,
            nhead=num_heads,
            dim_feedforward=ff_dim,
            dropout=dropout,
            batch_first=True
        )
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        
        # Classification head
        self.fc = nn.Sequential(
            nn.Linear(hidden_dim, 64),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(64, 2)
        )
        
        self._init_weights()
    
    def _init_weights(self):
        nn.init.normal_(self.pos_encoding, std=0.02)
    
    def forward(self, syscalls, retvals, param_embs):
        # Embed categorical features
        syscall_emb = self.syscall_embedding(syscalls)  # (batch, seq_len, syscall_embed_dim)
        retval_emb = self.retval_embedding(retvals)      # (batch, seq_len, retval_embed_dim)
        
        # param_embs already has shape (batch, seq_len, param_embed_dim)
        
        # Concatenate all embeddings
        x = torch.cat([syscall_emb, retval_emb, param_embs], dim=2)  # (batch, seq_len, total_embed_dim)
        
        # Create padding mask based on syscalls (PAD positions have index 0)
        padding_mask = (syscalls == PAD_IDX)  # True for padded positions
        
        # Project to hidden dimension and add positional encoding
        x = self.input_projection(x) + self.pos_encoding[:, :x.size(1), :]
        
        # Transformer encoder
        x = self.transformer(x, src_key_padding_mask=padding_mask)
        
        # Global average pooling (excluding padding)
        mask_expanded = (~padding_mask).unsqueeze(-1).float()
        x = (x * mask_expanded).sum(dim=1) / (mask_expanded.sum(dim=1) + 1e-9)
        
        return self.fc(x)

In [7]:
# Run experiments with different window sizes
results = []

for window_size in WINDOW_SIZES:
    print(f"\n{'='*60}")
    print(f"EXPERIMENT: Window Size = {window_size}")
    print(f"{'='*60}")
    
    # Create datasets
    train_dataset = AllFeaturesDataset(
        train_files, syscall_encoder, retval_encoder,
        param_to_embedding, window_size, PARAM_EMBED_DIM, PAD_PARAM_EMBEDDING
    )
    test_dataset = AllFeaturesDataset(
        test_files, syscall_encoder, retval_encoder,
        param_to_embedding, window_size, PARAM_EMBED_DIM, PAD_PARAM_EMBEDDING
    )
    
    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)
    
    print(f"Train samples: {len(train_dataset)}, Test samples: {len(test_dataset)}")
    
    # Print input shapes
    print(f"\n--- Input Shapes ---")
    print(f"  Syscall input shape:   (batch_size, {window_size})")
    print(f"  Retval input shape:    (batch_size, {window_size})")
    print(f"  Param emb input shape: (batch_size, {window_size}, {PARAM_EMBED_DIM})")
    print(f"  After embedding concat: (batch_size, {window_size}, {SYSCALL_EMBED_DIM + RETVAL_EMBED_DIM + PARAM_EMBED_DIM})")
    
    # Create model
    model = TransformerEncoderAllFeatures(
        syscall_vocab_size=syscall_vocab_size,
        retval_vocab_size=retval_vocab_size,
        syscall_embed_dim=SYSCALL_EMBED_DIM,
        retval_embed_dim=RETVAL_EMBED_DIM,
        param_embed_dim=PARAM_EMBED_DIM,
        max_len=window_size,
        hidden_dim=128,
        num_heads=4,
        num_layers=2,
        ff_dim=256,
        dropout=0.1
    ).to(DEVICE)
    
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    
    # Print model parameter sizes per layer
    print(f"\n--- Model Architecture & Parameters ---")
    total_params = 0
    trainable_params = 0
    for name, param in model.named_parameters():
        param_count = param.numel()
        total_params += param_count
        if param.requires_grad:
            trainable_params += param_count
        print(f"  {name}: {list(param.shape)} = {param_count:,} params")
    print(f"  {'─'*50}")
    print(f"  Total parameters:     {total_params:,}")
    print(f"  Trainable parameters: {trainable_params:,}")
    
    # Training
    print(f"\nTraining...")
    train_start_time = time.time()
    for epoch in range(EPOCHS):
        model.train()
        total_loss = 0
        correct = 0
        total = 0
        
        pbar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{EPOCHS}", leave=True)
        for syscalls, retvals, param_embs, labels in pbar:
            syscalls = syscalls.to(DEVICE)
            retvals = retvals.to(DEVICE)
            param_embs = param_embs.to(DEVICE)
            labels = labels.to(DEVICE)
            
            optimizer.zero_grad()
            outputs = model(syscalls, retvals, param_embs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            
            total_loss += loss.item()
            _, predicted = outputs.max(1)
            correct += (predicted == labels).sum().item()
            total += labels.size(0)
            
            pbar.set_postfix({'loss': f'{total_loss/total:.4f}', 'acc': f'{correct/total:.4f}'})
        
        train_acc = correct / total
        avg_loss = total_loss / len(train_loader)
    
    train_time = time.time() - train_start_time
    print(f"Training time: {train_time:.2f}s")
    
    # Testing
    print(f"\nEvaluating...")
    test_start_time = time.time()
    model.eval()
    all_preds = []
    all_labels = []
    
    with torch.no_grad():
        for syscalls, retvals, param_embs, labels in test_loader:
            syscalls = syscalls.to(DEVICE)
            retvals = retvals.to(DEVICE)
            param_embs = param_embs.to(DEVICE)
            
            outputs = model(syscalls, retvals, param_embs)
            _, predicted = outputs.max(1)
            all_preds.extend(predicted.cpu().numpy())
            all_labels.extend(labels.numpy())
    
    test_time = time.time() - test_start_time
    print(f"Test time: {test_time:.2f}s")
    
    # Calculate metrics
    cm = confusion_matrix(all_labels, all_preds)
    tn, fp, fn, tp = cm.ravel()
    detection_rate = tp / (tp + fn) if (tp + fn) > 0 else 0.0
    false_positive_rate = fp / (fp + tn) if (fp + tn) > 0 else 0.0
    f1_weighted = f1_score(all_labels, all_preds, average='weighted')
    
    # Store results
    results.append({
        'Window Size': window_size,
        'Detection Rate': detection_rate,
        'False Positive Rate': false_positive_rate,
        'F1-score (weighted)': f1_weighted,
        'Train Time (s)': train_time,
        'Test Time (s)': test_time
    })
    
    # Print detailed results
    print(f"\nClassification Report:")
    print(classification_report(all_labels, all_preds, target_names=['benign', 'malicious']))
    
    labels_names = ['benign', 'malicious']
    cm_df = pd.DataFrame(cm, index=[f'True: {l}' for l in labels_names], columns=[f'Pred: {l}' for l in labels_names])
    print(f"Confusion Matrix:")
    print(cm_df)
    
    print(f"\nDetection Rate: {detection_rate:.4f}")
    print(f"False Positive Rate: {false_positive_rate:.4f}")
    print(f"F1-score (weighted): {f1_weighted:.4f}")
    
    # Clear GPU memory between experiments
    del model
    torch.cuda.empty_cache()


EXPERIMENT: Window Size = 2000
Train samples: 1986, Test samples: 810

--- Input Shapes ---
  Syscall input shape:   (batch_size, 2000)
  Retval input shape:    (batch_size, 2000)
  Param emb input shape: (batch_size, 2000, 384)
  After embedding concat: (batch_size, 2000, 448)

--- Model Architecture & Parameters ---
  pos_encoding: [1, 2000, 128] = 256,000 params
  syscall_embedding.weight: [81, 32] = 2,592 params
  retval_embedding.weight: [42586, 32] = 1,362,752 params
  input_projection.weight: [128, 448] = 57,344 params
  input_projection.bias: [128] = 128 params
  transformer.layers.0.self_attn.in_proj_weight: [384, 128] = 49,152 params
  transformer.layers.0.self_attn.in_proj_bias: [384] = 384 params
  transformer.layers.0.self_attn.out_proj.weight: [128, 128] = 16,384 params
  transformer.layers.0.self_attn.out_proj.bias: [128] = 128 params
  transformer.layers.0.linear1.weight: [256, 128] = 32,768 params
  transformer.layers.0.linear1.bias: [256] = 256 params
  transformer.l

Epoch 1/6: 100%|██████████| 63/63 [27:39<00:00, 26.34s/it, loss=0.0078, acc=0.9149]
Epoch 2/6: 100%|██████████| 63/63 [27:39<00:00, 26.34s/it, loss=0.0041, acc=0.9658]
Epoch 3/6: 100%|██████████| 63/63 [27:39<00:00, 26.34s/it, loss=0.0034, acc=0.9612]
Epoch 4/6: 100%|██████████| 63/63 [27:39<00:00, 26.34s/it, loss=0.0004, acc=0.9990]
Epoch 5/6: 100%|██████████| 63/63 [27:39<00:00, 26.34s/it, loss=0.0003, acc=0.9985]
Epoch 6/6: 100%|██████████| 63/63 [27:39<00:00, 26.34s/it, loss=0.0002, acc=0.9990]


Training time: 9956.35s

Evaluating...
Test time: 25.86s

Classification Report:
              precision    recall  f1-score   support

      benign       1.00      0.81      0.90       489
   malicious       0.78      1.00      0.87       321

    accuracy                           0.89       810
   macro avg       0.89      0.91      0.89       810
weighted avg       0.91      0.89      0.89       810

Confusion Matrix:
                 Pred: benign  Pred: malicious
True: benign              397               92
True: malicious             0              321

Detection Rate: 1.0000
False Positive Rate: 0.1881
F1-score (weighted): 0.8876


### Earlier results for 250, 500, and 1000 window size

```plaintext

============================================================
EXPERIMENT: Window Size = 250
============================================================
Train samples: 1986, Test samples: 810

--- Input Shapes ---
  Syscall input shape:   (batch_size, 250)
  Retval input shape:    (batch_size, 250)
  Param emb input shape: (batch_size, 250, 384)
  After embedding concat: (batch_size, 250, 448)

--- Model Architecture & Parameters ---
  pos_encoding: [1, 250, 128] = 32,000 params
  syscall_embedding.weight: [81, 32] = 2,592 params
  retval_embedding.weight: [42586, 32] = 1,362,752 params
  input_projection.weight: [128, 448] = 57,344 params
  input_projection.bias: [128] = 128 params
  transformer.layers.0.self_attn.in_proj_weight: [384, 128] = 49,152 params
  transformer.layers.0.self_attn.in_proj_bias: [384] = 384 params
  transformer.layers.0.self_attn.out_proj.weight: [128, 128] = 16,384 params
  transformer.layers.0.self_attn.out_proj.bias: [128] = 128 params
  transformer.layers.0.linear1.weight: [256, 128] = 32,768 params
  transformer.layers.0.linear1.bias: [256] = 256 params
  transformer.layers.0.linear2.weight: [128, 256] = 32,768 params
  transformer.layers.0.linear2.bias: [128] = 128 params
  transformer.layers.0.norm1.weight: [128] = 128 params
  transformer.layers.0.norm1.bias: [128] = 128 params
  transformer.layers.0.norm2.weight: [128] = 128 params
  transformer.layers.0.norm2.bias: [128] = 128 params
  transformer.layers.1.self_attn.in_proj_weight: [384, 128] = 49,152 params
  transformer.layers.1.self_attn.in_proj_bias: [384] = 384 params
  transformer.layers.1.self_attn.out_proj.weight: [128, 128] = 16,384 params
  transformer.layers.1.self_attn.out_proj.bias: [128] = 128 params
  transformer.layers.1.linear1.weight: [256, 128] = 32,768 params
  transformer.layers.1.linear1.bias: [256] = 256 params
  transformer.layers.1.linear2.weight: [128, 256] = 32,768 params
  transformer.layers.1.linear2.bias: [128] = 128 params
  transformer.layers.1.norm1.weight: [128] = 128 params
  transformer.layers.1.norm1.bias: [128] = 128 params
  transformer.layers.1.norm2.weight: [128] = 128 params
  transformer.layers.1.norm2.bias: [128] = 128 params
  fc.0.weight: [64, 128] = 8,192 params
  fc.0.bias: [64] = 64 params
  fc.3.weight: [2, 64] = 128 params
  fc.3.bias: [2] = 2 params
  ──────────────────────────────────────────────────
  Total parameters:     1,728,162
  Trainable parameters: 1,728,162

Training...
Epoch 1/6: 100%|██████████| 63/63 [00:19<00:00,  3.26it/s, loss=0.0081, acc=0.9038]
Epoch 2/6: 100%|██████████| 63/63 [00:20<00:00,  3.07it/s, loss=0.0010, acc=0.9950]
Epoch 3/6: 100%|██████████| 63/63 [00:20<00:00,  3.04it/s, loss=0.0009, acc=0.9955]
Epoch 4/6: 100%|██████████| 63/63 [00:20<00:00,  3.02it/s, loss=0.0009, acc=0.9955]
Epoch 5/6: 100%|██████████| 63/63 [00:20<00:00,  3.02it/s, loss=0.0009, acc=0.9955]
Epoch 6/6: 100%|██████████| 63/63 [00:20<00:00,  3.03it/s, loss=0.0020, acc=0.9955]
Training time: 123.06s

Evaluating...
Test time: 2.12s

Classification Report:
              precision    recall  f1-score   support

      benign       0.99      0.82      0.90       489
   malicious       0.79      0.99      0.88       321

    accuracy                           0.89       810
   macro avg       0.89      0.91      0.89       810
weighted avg       0.91      0.89      0.89       810

Confusion Matrix:
                 Pred: benign  Pred: malicious
True: benign              402               87
True: malicious             3              318

Detection Rate: 0.9907
False Positive Rate: 0.1779
F1-score (weighted): 0.8901

============================================================
EXPERIMENT: Window Size = 500
============================================================
Train samples: 1986, Test samples: 810

--- Input Shapes ---
  Syscall input shape:   (batch_size, 500)
  Retval input shape:    (batch_size, 500)
  Param emb input shape: (batch_size, 500, 384)
  After embedding concat: (batch_size, 500, 448)

--- Model Architecture & Parameters ---
  pos_encoding: [1, 500, 128] = 64,000 params
  syscall_embedding.weight: [81, 32] = 2,592 params
  retval_embedding.weight: [42586, 32] = 1,362,752 params
  input_projection.weight: [128, 448] = 57,344 params
  input_projection.bias: [128] = 128 params
  transformer.layers.0.self_attn.in_proj_weight: [384, 128] = 49,152 params
  transformer.layers.0.self_attn.in_proj_bias: [384] = 384 params
  transformer.layers.0.self_attn.out_proj.weight: [128, 128] = 16,384 params
  transformer.layers.0.self_attn.out_proj.bias: [128] = 128 params
  transformer.layers.0.linear1.weight: [256, 128] = 32,768 params
  transformer.layers.0.linear1.bias: [256] = 256 params
  transformer.layers.0.linear2.weight: [128, 256] = 32,768 params
  transformer.layers.0.linear2.bias: [128] = 128 params
  transformer.layers.0.norm1.weight: [128] = 128 params
  transformer.layers.0.norm1.bias: [128] = 128 params
  transformer.layers.0.norm2.weight: [128] = 128 params
  transformer.layers.0.norm2.bias: [128] = 128 params
  transformer.layers.1.self_attn.in_proj_weight: [384, 128] = 49,152 params
  transformer.layers.1.self_attn.in_proj_bias: [384] = 384 params
  transformer.layers.1.self_attn.out_proj.weight: [128, 128] = 16,384 params
  transformer.layers.1.self_attn.out_proj.bias: [128] = 128 params
  transformer.layers.1.linear1.weight: [256, 128] = 32,768 params
  transformer.layers.1.linear1.bias: [256] = 256 params
  transformer.layers.1.linear2.weight: [128, 256] = 32,768 params
  transformer.layers.1.linear2.bias: [128] = 128 params
  transformer.layers.1.norm1.weight: [128] = 128 params
  transformer.layers.1.norm1.bias: [128] = 128 params
  transformer.layers.1.norm2.weight: [128] = 128 params
  transformer.layers.1.norm2.bias: [128] = 128 params
  fc.0.weight: [64, 128] = 8,192 params
  fc.0.bias: [64] = 64 params
  fc.3.weight: [2, 64] = 128 params
  fc.3.bias: [2] = 2 params
  ──────────────────────────────────────────────────
  Total parameters:     1,760,162
  Trainable parameters: 1,760,162

Training...
Epoch 1/6: 100%|██████████| 63/63 [00:57<00:00,  1.10it/s, loss=0.0100, acc=0.8666]
Epoch 2/6: 100%|██████████| 63/63 [00:59<00:00,  1.06it/s, loss=0.0008, acc=0.9955]
Epoch 3/6: 100%|██████████| 63/63 [01:00<00:00,  1.04it/s, loss=0.0007, acc=0.9970]
Epoch 4/6: 100%|██████████| 63/63 [01:00<00:00,  1.05it/s, loss=0.0006, acc=0.9970]
Epoch 5/6: 100%|██████████| 63/63 [00:59<00:00,  1.05it/s, loss=0.0006, acc=0.9919]
Epoch 6/6: 100%|██████████| 63/63 [01:00<00:00,  1.05it/s, loss=0.0004, acc=0.9975]
Training time: 357.17s

Evaluating...
Test time: 5.46s

Classification Report:
              precision    recall  f1-score   support

      benign       1.00      0.81      0.90       489
   malicious       0.78      1.00      0.88       321

    accuracy                           0.89       810
   macro avg       0.89      0.91      0.89       810
weighted avg       0.91      0.89      0.89       810

Confusion Matrix:
                 Pred: benign  Pred: malicious
True: benign              398               91
True: malicious             0              321

Detection Rate: 1.0000
False Positive Rate: 0.1861
F1-score (weighted): 0.8889

============================================================
EXPERIMENT: Window Size = 1000
============================================================
Train samples: 1986, Test samples: 810

--- Input Shapes ---
  Syscall input shape:   (batch_size, 1000)
  Retval input shape:    (batch_size, 1000)
  Param emb input shape: (batch_size, 1000, 384)
  After embedding concat: (batch_size, 1000, 448)

--- Model Architecture & Parameters ---
  pos_encoding: [1, 1000, 128] = 128,000 params
  syscall_embedding.weight: [81, 32] = 2,592 params
  retval_embedding.weight: [42586, 32] = 1,362,752 params
  input_projection.weight: [128, 448] = 57,344 params
  input_projection.bias: [128] = 128 params
  transformer.layers.0.self_attn.in_proj_weight: [384, 128] = 49,152 params
  transformer.layers.0.self_attn.in_proj_bias: [384] = 384 params
  transformer.layers.0.self_attn.out_proj.weight: [128, 128] = 16,384 params
  transformer.layers.0.self_attn.out_proj.bias: [128] = 128 params
  transformer.layers.0.linear1.weight: [256, 128] = 32,768 params
  transformer.layers.0.linear1.bias: [256] = 256 params
  transformer.layers.0.linear2.weight: [128, 256] = 32,768 params
  transformer.layers.0.linear2.bias: [128] = 128 params
  transformer.layers.0.norm1.weight: [128] = 128 params
  transformer.layers.0.norm1.bias: [128] = 128 params
  transformer.layers.0.norm2.weight: [128] = 128 params
  transformer.layers.0.norm2.bias: [128] = 128 params
  transformer.layers.1.self_attn.in_proj_weight: [384, 128] = 49,152 params
  transformer.layers.1.self_attn.in_proj_bias: [384] = 384 params
  transformer.layers.1.self_attn.out_proj.weight: [128, 128] = 16,384 params
  transformer.layers.1.self_attn.out_proj.bias: [128] = 128 params
  transformer.layers.1.linear1.weight: [256, 128] = 32,768 params
  transformer.layers.1.linear1.bias: [256] = 256 params
  transformer.layers.1.linear2.weight: [128, 256] = 32,768 params
  transformer.layers.1.linear2.bias: [128] = 128 params
  transformer.layers.1.norm1.weight: [128] = 128 params
  transformer.layers.1.norm1.bias: [128] = 128 params
  transformer.layers.1.norm2.weight: [128] = 128 params
  transformer.layers.1.norm2.bias: [128] = 128 params
  fc.0.weight: [64, 128] = 8,192 params
  fc.0.bias: [64] = 64 params
  fc.3.weight: [2, 64] = 128 params
  fc.3.bias: [2] = 2 params
  ──────────────────────────────────────────────────
  Total parameters:     1,824,162
  Trainable parameters: 1,824,162

Training...
Epoch 1/6: 100%|██████████| 63/63 [03:12<00:00,  3.06s/it, loss=0.0081, acc=0.9028]
Epoch 2/6: 100%|██████████| 63/63 [03:22<00:00,  3.21s/it, loss=0.0035, acc=0.9708]
Epoch 3/6: 100%|██████████| 63/63 [03:33<00:00,  3.39s/it, loss=0.0048, acc=0.9653]
Epoch 4/6: 100%|██████████| 63/63 [03:24<00:00,  3.25s/it, loss=0.0083, acc=0.8938]
Epoch 5/6: 100%|██████████| 63/63 [03:26<00:00,  3.28s/it, loss=0.0004, acc=0.9985]
Epoch 6/6: 100%|██████████| 63/63 [03:26<00:00,  3.27s/it, loss=0.0004, acc=0.9985]
Training time: 1226.17s

Evaluating...
Test time: 11.58s

Classification Report:
              precision    recall  f1-score   support

      benign       1.00      0.81      0.90       489
   malicious       0.78      1.00      0.87       321

    accuracy                           0.89       810
   macro avg       0.89      0.91      0.89       810
weighted avg       0.91      0.89      0.89       810

Confusion Matrix:
                 Pred: benign  Pred: malicious
True: benign              397               92
True: malicious             0              321

Detection Rate: 1.0000
False Positive Rate: 0.1881
F1-score (weighted): 0.8876

============================================================
EXPERIMENT: Window Size = 2000
============================================================
```

In [None]:
# Summary Results Table
print("\n" + "="*80)
print("SUMMARY OF RESULTS (Transformer - All Features Combined)")
print("="*80)

# Include earlier results for 250, 500, and 1000 window sizes
all_results = [
    {'Window Size': 250, 'Detection Rate': 0.9907, 'False Positive Rate': 0.1779, 'F1-score (weighted)': 0.8901, 'Train Time (s)': 123.06, 'Test Time (s)': 2.12},
    {'Window Size': 500, 'Detection Rate': 1.0000, 'False Positive Rate': 0.1861, 'F1-score (weighted)': 0.8889, 'Train Time (s)': 357.17, 'Test Time (s)': 5.46},
    {'Window Size': 1000, 'Detection Rate': 1.0000, 'False Positive Rate': 0.1881, 'F1-score (weighted)': 0.8876, 'Train Time (s)': 1226.17, 'Test Time (s)': 11.58},
] + results

results_df = pd.DataFrame(all_results)
results_df['Detection Rate'] = results_df['Detection Rate'].apply(lambda x: f"{x:.4f}")
results_df['False Positive Rate'] = results_df['False Positive Rate'].apply(lambda x: f"{x:.4f}")
results_df['F1-score (weighted)'] = results_df['F1-score (weighted)'].apply(lambda x: f"{x:.4f}")
results_df['Train Time (s)'] = results_df['Train Time (s)'].apply(lambda x: f"{x:.2f}")
results_df['Test Time (s)'] = results_df['Test Time (s)'].apply(lambda x: f"{x:.2f}")

print(results_df.to_string(index=False))

print(f"\nFeatures combined:")
print(f"  - Syscall embedding: {SYSCALL_EMBED_DIM} dims")
print(f"  - Return value embedding: {RETVAL_EMBED_DIM} dims")
print(f"  - Parameter embedding (sentence transformer): {PARAM_EMBED_DIM} dims")
print(f"  - Total input: {SYSCALL_EMBED_DIM + RETVAL_EMBED_DIM + PARAM_EMBED_DIM} dims per timestep")
print(f"  - Projected to hidden dim: 128 dims")
print(f"\nTransformer Architecture:")
print(f"  - Number of heads: 4")
print(f"  - Number of layers: 2")
print(f"  - Feed-forward dim: 256")
print(f"  - Dropout: 0.1")


SUMMARY OF RESULTS (Transformer - All Features Combined)
 Window Size Detection Rate False Positive Rate F1-score (weighted) Train Time (s) Test Time (s)
         250         0.9907              0.1779              0.8901         123.06          2.12
         500         1.0000              0.1861              0.8889         357.17          5.46
        1000         1.0000              0.1881              0.8876        1226.17         11.58
        2000         1.0000              0.1881              0.8876        9956.35         25.86

Features combined:
  - Syscall embedding: 32 dims
  - Return value embedding: 32 dims
  - Parameter embedding (sentence transformer): 384 dims
  - Total input: 448 dims per timestep
  - Projected to hidden dim: 128 dims

Transformer Architecture:
  - Number of heads: 4
  - Number of layers: 2
  - Feed-forward dim: 256
  - Dropout: 0.1
