# 1D CNN with All Features Combined

This notebook implements a 1D CNN for syscall-based malware detection using **all features combined**:
- **Syscall**: Categorical feature (embedded)
- **Return Value (Ret)**: Categorical feature (embedded)
- **Parameters**: Text feature (sentence transformer embeddings)

The three feature representations are concatenated at each timestep to form a unified representation.

In [1]:
import sys
sys.path.insert(0, '../../configs')
from config_loader import get_split_with_labels

import time
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, f1_score
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

# Config
SPLIT = '70'
WINDOW_SIZES = [2000]  # Different sliding window lengths to test
# WINDOW_SIZES = [250, 500, 1000, 2000]  # Different sliding window lengths to test
BATCH_SIZE = 32
EPOCHS = 20
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {DEVICE}")

# Embedding dimensions
SYSCALL_EMBED_DIM = 32
RETVAL_EMBED_DIM = 32

# Load sentence transformer model for parameter embeddings
print("Loading sentence transformer model...")
sentence_model = SentenceTransformer('all-MiniLM-L6-v2', device='cpu')
PARAM_EMBED_DIM = sentence_model.get_sentence_embedding_dimension()
print(f"Parameter embedding dimension: {PARAM_EMBED_DIM}")
print(f"Total combined embedding dimension: {SYSCALL_EMBED_DIM + RETVAL_EMBED_DIM + PARAM_EMBED_DIM}")

  from .autonotebook import tqdm as notebook_tqdm


Using device: cpu
Loading sentence transformer model...
Parameter embedding dimension: 384
Total combined embedding dimension: 448


In [2]:
# Load data
train_files, test_files = get_split_with_labels(SPLIT)
print(f"Train files: {len(train_files)}, Test files: {len(test_files)}")

def load_runs_all_features(file_path):
    """Load all features (syscall, ret, parameters) grouped by run.
    
    Returns list of runs, where each run is a dict with:
        - 'syscalls': list of syscall names
        - 'retvals': list of return values
        - 'params': list of parameter strings
    """
    df = pd.read_csv(file_path)
    runs = []
    for run_id, group in df.groupby('run'):
        run_data = {
            'syscalls': group['syscall'].tolist(),
            'retvals': group['Ret'].tolist(),
            'params': group['parameters'].tolist()
        }
        runs.append(run_data)
    return runs

# Count runs per label for training and test sets
def count_runs_per_label(file_label_pairs):
    """Count total runs per label."""
    counts = {'benign': 0, 'malicious': 0}
    for path, label in file_label_pairs:
        runs = load_runs_all_features(path)
        counts[label] += len(runs)
    return counts

train_counts = count_runs_per_label(train_files)
test_counts = count_runs_per_label(test_files)

print(f"\nTraining set:")
print(f"  Total runs: {sum(train_counts.values())}")
print(f"  Benign runs: {train_counts['benign']}")
print(f"  Malicious runs: {train_counts['malicious']}")

print(f"\nTest set:")
print(f"  Total runs: {sum(test_counts.values())}")
print(f"  Benign runs: {test_counts['benign']}")
print(f"  Malicious runs: {test_counts['malicious']}")

Train files: 21, Test files: 9

Training set:
  Total runs: 1986
  Benign runs: 1484
  Malicious runs: 502

Test set:
  Total runs: 810
  Benign runs: 489
  Malicious runs: 321


In [3]:
# Build encoders for syscalls and return values
print("Building encoders for syscalls and return values...")

all_syscalls = []
all_retvals = []
all_params = set()

for path, _ in train_files + test_files:
    for run_data in load_runs_all_features(path):
        all_syscalls.extend(run_data['syscalls'])
        all_retvals.extend(run_data['retvals'])
        for param in run_data['params']:
            if pd.isna(param):
                all_params.add('<EMPTY>')
            else:
                all_params.add(str(param))

# Build syscall encoder
syscall_encoder = LabelEncoder()
syscall_encoder.fit(all_syscalls)
syscall_vocab_size = len(syscall_encoder.classes_) + 1  # +1 for PAD token
print(f"Syscall vocabulary size: {syscall_vocab_size} (including PAD)")

# Build return value encoder
retval_encoder = LabelEncoder()
retval_encoder.fit(all_retvals)
retval_vocab_size = len(retval_encoder.classes_) + 1  # +1 for PAD token
print(f"Return value vocabulary size: {retval_vocab_size} (including PAD)")

# PAD index for embeddings
PAD_IDX = 0

print(f"\nUnique parameter strings: {len(all_params)}")

Building encoders for syscalls and return values...
Syscall vocabulary size: 81 (including PAD)
Return value vocabulary size: 42586 (including PAD)

Unique parameter strings: 266019


In [4]:
# Pre-compute sentence embeddings for all unique parameter strings
print("Computing sentence embeddings for parameters (this may take a few minutes)...")

all_params_list = list(all_params)
param_embeddings = sentence_model.encode(
    all_params_list,
    show_progress_bar=True,
    batch_size=256,
    convert_to_numpy=True
)

# Create a mapping from parameter string to embedding
param_to_embedding = {param: emb for param, emb in zip(all_params_list, param_embeddings)}
print(f"Parameter embeddings computed. Shape per embedding: {PARAM_EMBED_DIM}")

# Create zero embedding for padding
PAD_PARAM_EMBEDDING = np.zeros(PARAM_EMBED_DIM, dtype=np.float32)

Computing sentence embeddings for parameters (this may take a few minutes)...


Batches: 100%|██████████| 1040/1040 [19:56<00:00,  1.15s/it]


Parameter embeddings computed. Shape per embedding: 384


In [5]:
class AllFeaturesDataset(Dataset):
    """Dataset that combines syscall, return value, and parameter features.
    
    For each timestep, we have:
    - Syscall index (to be embedded by the model)
    - Return value index (to be embedded by the model)
    - Parameter embedding (pre-computed sentence embedding)
    """
    def __init__(self, file_label_pairs, syscall_encoder, retval_encoder, 
                 param_to_embedding, window_size, param_embed_dim, pad_param_embedding):
        self.syscall_seqs = []
        self.retval_seqs = []
        self.param_embeddings = []
        self.labels = []
        label_map = {'benign': 0, 'malicious': 1}
        
        for path, label in file_label_pairs:
            runs = load_runs_all_features(path)
            for run_data in runs:
                syscalls = run_data['syscalls']
                retvals = run_data['retvals']
                params = run_data['params']
                
                seq_len = min(len(syscalls), window_size)
                
                # Encode syscalls (+1 to reserve 0 for PAD)
                encoded_syscalls = syscall_encoder.transform(syscalls[:seq_len]) + 1
                if len(encoded_syscalls) < window_size:
                    encoded_syscalls = np.pad(encoded_syscalls, 
                                              (0, window_size - len(encoded_syscalls)), 
                                              constant_values=PAD_IDX)
                
                # Encode return values (+1 to reserve 0 for PAD)
                encoded_retvals = retval_encoder.transform(retvals[:seq_len]) + 1
                if len(encoded_retvals) < window_size:
                    encoded_retvals = np.pad(encoded_retvals, 
                                             (0, window_size - len(encoded_retvals)), 
                                             constant_values=PAD_IDX)
                
                # Get parameter embeddings
                param_embs = []
                for i in range(window_size):
                    if i < len(params):
                        param = params[i]
                        if pd.isna(param):
                            key = '<EMPTY>'
                        else:
                            key = str(param)
                        param_embs.append(param_to_embedding[key])
                    else:
                        param_embs.append(pad_param_embedding)
                
                self.syscall_seqs.append(encoded_syscalls)
                self.retval_seqs.append(encoded_retvals)
                self.param_embeddings.append(np.array(param_embs, dtype=np.float32))
                self.labels.append(label_map[label])
        
        self.syscall_seqs = np.array(self.syscall_seqs)
        self.retval_seqs = np.array(self.retval_seqs)
        self.param_embeddings = np.array(self.param_embeddings)
        self.labels = np.array(self.labels)
    
    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, idx):
        return (
            torch.tensor(self.syscall_seqs[idx], dtype=torch.long),
            torch.tensor(self.retval_seqs[idx], dtype=torch.long),
            torch.tensor(self.param_embeddings[idx], dtype=torch.float32),
            torch.tensor(self.labels[idx], dtype=torch.long)
        )

In [6]:
class CNN1DAllFeatures(nn.Module):
    """1D CNN that combines all features:
    - Syscall embeddings (learned)
    - Return value embeddings (learned)
    - Parameter embeddings (pre-computed sentence embeddings)
    
    All three are concatenated at each timestep before being fed to the CNN.
    """
    def __init__(self, syscall_vocab_size, retval_vocab_size, 
                 syscall_embed_dim, retval_embed_dim, param_embed_dim,
                 num_filters=64, kernel_sizes=[3, 5, 7]):
        super().__init__()
        
        # Embedding layers for categorical features
        self.syscall_embedding = nn.Embedding(syscall_vocab_size, syscall_embed_dim, padding_idx=PAD_IDX)
        self.retval_embedding = nn.Embedding(retval_vocab_size, retval_embed_dim, padding_idx=PAD_IDX)
        
        # Total input dimension after concatenation
        total_embed_dim = syscall_embed_dim + retval_embed_dim + param_embed_dim
        
        # 1D Convolution layers
        self.convs = nn.ModuleList([
            nn.Conv1d(total_embed_dim, num_filters, k, padding=k//2)
            for k in kernel_sizes
        ])
        
        # Fully connected layers
        self.fc = nn.Sequential(
            nn.Linear(num_filters * len(kernel_sizes), 64),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(64, 2)
        )
    
    def forward(self, syscalls, retvals, param_embs):
        # Embed categorical features
        syscall_emb = self.syscall_embedding(syscalls)  # (batch, seq_len, syscall_embed_dim)
        retval_emb = self.retval_embedding(retvals)      # (batch, seq_len, retval_embed_dim)
        
        # param_embs already has shape (batch, seq_len, param_embed_dim)
        
        # Concatenate all embeddings
        x = torch.cat([syscall_emb, retval_emb, param_embs], dim=2)  # (batch, seq_len, total_embed_dim)
        x = x.permute(0, 2, 1)  # (batch, total_embed_dim, seq_len)
        
        # Apply convolutions with global max pooling
        conv_outs = []
        for conv in self.convs:
            c = torch.relu(conv(x))
            c = torch.max(c, dim=2)[0]  # Global max pooling
            conv_outs.append(c)
        
        x = torch.cat(conv_outs, dim=1)
        return self.fc(x)

### Results for 250, 500, and 1000 window size
```plaintext
============================================================
EXPERIMENT: Window Size = 250
============================================================
Train samples: 1986, Test samples: 810

--- Input Shapes ---
  Syscall input shape:   (batch_size, 250)
  Retval input shape:    (batch_size, 250)
  Param emb input shape: (batch_size, 250, 384)
  After embedding concat: (batch_size, 250, 448)

--- Model Architecture & Parameters ---
  syscall_embedding.weight: [81, 32] = 2,592 params
  retval_embedding.weight: [42586, 32] = 1,362,752 params
  convs.0.weight: [64, 448, 3] = 86,016 params
  convs.0.bias: [64] = 64 params
  convs.1.weight: [64, 448, 5] = 143,360 params
  convs.1.bias: [64] = 64 params
  convs.2.weight: [64, 448, 7] = 200,704 params
  convs.2.bias: [64] = 64 params
  fc.0.weight: [64, 192] = 12,288 params
  fc.0.bias: [64] = 64 params
  fc.3.weight: [2, 64] = 128 params
  fc.3.bias: [2] = 2 params
  ──────────────────────────────────────────────────
  Total parameters:     1,808,098
  Trainable parameters: 1,808,098

Training...
Epoch 1/20: 100%|██████████| 63/63 [00:08<00:00,  7.48it/s, loss=0.0061, acc=0.9285]
Epoch 2/20: 100%|██████████| 63/63 [00:09<00:00,  6.68it/s, loss=0.0004, acc=0.9950]
Epoch 3/20: 100%|██████████| 63/63 [00:09<00:00,  6.51it/s, loss=0.0003, acc=0.9960]
Epoch 4/20: 100%|██████████| 63/63 [00:09<00:00,  6.35it/s, loss=0.0001, acc=0.9980]
Epoch 5/20: 100%|██████████| 63/63 [00:10<00:00,  6.22it/s, loss=0.0001, acc=0.9990]
Epoch 6/20: 100%|██████████| 63/63 [00:10<00:00,  6.04it/s, loss=0.0001, acc=0.9990]
Epoch 7/20: 100%|██████████| 63/63 [00:10<00:00,  6.00it/s, loss=0.0000, acc=1.0000]
Epoch 8/20: 100%|██████████| 63/63 [00:10<00:00,  6.06it/s, loss=0.0000, acc=1.0000]
Epoch 9/20: 100%|██████████| 63/63 [00:10<00:00,  6.11it/s, loss=0.0000, acc=1.0000]
Epoch 10/20: 100%|██████████| 63/63 [00:10<00:00,  5.96it/s, loss=0.0000, acc=1.0000]
Epoch 11/20: 100%|██████████| 63/63 [00:10<00:00,  6.01it/s, loss=0.0000, acc=1.0000]
Epoch 12/20: 100%|██████████| 63/63 [00:10<00:00,  5.94it/s, loss=0.0000, acc=1.0000]
Epoch 13/20: 100%|██████████| 63/63 [00:10<00:00,  6.00it/s, loss=0.0000, acc=1.0000]
Epoch 14/20: 100%|██████████| 63/63 [00:10<00:00,  5.98it/s, loss=0.0000, acc=1.0000]
Epoch 15/20: 100%|██████████| 63/63 [00:10<00:00,  6.01it/s, loss=0.0000, acc=1.0000]
Epoch 16/20: 100%|██████████| 63/63 [00:10<00:00,  5.90it/s, loss=0.0000, acc=1.0000]
Epoch 17/20: 100%|██████████| 63/63 [00:10<00:00,  6.01it/s, loss=0.0000, acc=1.0000]
Epoch 18/20: 100%|██████████| 63/63 [00:10<00:00,  5.88it/s, loss=0.0000, acc=1.0000]
Epoch 19/20: 100%|██████████| 63/63 [00:10<00:00,  5.92it/s, loss=0.0000, acc=1.0000]
Epoch 20/20: 100%|██████████| 63/63 [00:10<00:00,  5.92it/s, loss=0.0000, acc=1.0000]
Training time: 205.56s

Evaluating...
Test time: 1.06s

Classification Report:
              precision    recall  f1-score   support

      benign       0.99      0.99      0.99       489
   malicious       0.98      0.99      0.98       321

    accuracy                           0.99       810
   macro avg       0.99      0.99      0.99       810
weighted avg       0.99      0.99      0.99       810

Confusion Matrix:
                 Pred: benign  Pred: malicious
True: benign              482                7
True: malicious             3              318

Detection Rate: 0.9907
False Positive Rate: 0.0143
F1-score (weighted): 0.9877

============================================================
EXPERIMENT: Window Size = 500
============================================================
Train samples: 1986, Test samples: 810

--- Input Shapes ---
  Syscall input shape:   (batch_size, 500)
  Retval input shape:    (batch_size, 500)
  Param emb input shape: (batch_size, 500, 384)
  After embedding concat: (batch_size, 500, 448)

--- Model Architecture & Parameters ---
  syscall_embedding.weight: [81, 32] = 2,592 params
  retval_embedding.weight: [42586, 32] = 1,362,752 params
  convs.0.weight: [64, 448, 3] = 86,016 params
  convs.0.bias: [64] = 64 params
  convs.1.weight: [64, 448, 5] = 143,360 params
  convs.1.bias: [64] = 64 params
  convs.2.weight: [64, 448, 7] = 200,704 params
  convs.2.bias: [64] = 64 params
  fc.0.weight: [64, 192] = 12,288 params
  fc.0.bias: [64] = 64 params
  fc.3.weight: [2, 64] = 128 params
  fc.3.bias: [2] = 2 params
  ──────────────────────────────────────────────────
  Total parameters:     1,808,098
  Trainable parameters: 1,808,098

Training...
Epoch 1/20: 100%|██████████| 63/63 [00:16<00:00,  3.89it/s, loss=0.0040, acc=0.9466]
Epoch 2/20: 100%|██████████| 63/63 [00:18<00:00,  3.44it/s, loss=0.0003, acc=0.9980]
Epoch 3/20: 100%|██████████| 63/63 [00:19<00:00,  3.29it/s, loss=0.0001, acc=0.9995]
Epoch 4/20: 100%|██████████| 63/63 [00:19<00:00,  3.17it/s, loss=0.0000, acc=1.0000]
Epoch 5/20: 100%|██████████| 63/63 [00:20<00:00,  3.10it/s, loss=0.0000, acc=1.0000]
Epoch 6/20: 100%|██████████| 63/63 [00:20<00:00,  3.11it/s, loss=0.0000, acc=1.0000]
Epoch 7/20: 100%|██████████| 63/63 [00:20<00:00,  3.13it/s, loss=0.0000, acc=1.0000]
Epoch 8/20: 100%|██████████| 63/63 [00:20<00:00,  3.13it/s, loss=0.0000, acc=1.0000]
Epoch 9/20: 100%|██████████| 63/63 [00:20<00:00,  3.10it/s, loss=0.0000, acc=1.0000]
Epoch 10/20: 100%|██████████| 63/63 [00:20<00:00,  3.13it/s, loss=0.0000, acc=1.0000]
Epoch 11/20: 100%|██████████| 63/63 [00:20<00:00,  3.10it/s, loss=0.0000, acc=1.0000]
Epoch 12/20: 100%|██████████| 63/63 [00:20<00:00,  3.14it/s, loss=0.0000, acc=1.0000]
Epoch 13/20: 100%|██████████| 63/63 [00:19<00:00,  3.15it/s, loss=0.0000, acc=1.0000]
Epoch 14/20: 100%|██████████| 63/63 [00:20<00:00,  3.08it/s, loss=0.0000, acc=1.0000]
Epoch 15/20: 100%|██████████| 63/63 [00:20<00:00,  3.09it/s, loss=0.0000, acc=1.0000]
Epoch 16/20: 100%|██████████| 63/63 [00:20<00:00,  3.11it/s, loss=0.0000, acc=1.0000]
Epoch 17/20: 100%|██████████| 63/63 [00:20<00:00,  3.09it/s, loss=0.0000, acc=1.0000]
Epoch 18/20: 100%|██████████| 63/63 [00:20<00:00,  3.08it/s, loss=0.0000, acc=1.0000]
Epoch 19/20: 100%|██████████| 63/63 [00:20<00:00,  3.07it/s, loss=0.0000, acc=1.0000]
Epoch 20/20: 100%|██████████| 63/63 [00:20<00:00,  3.14it/s, loss=0.0000, acc=1.0000]
Training time: 397.73s

Evaluating...
Test time: 2.01s

Classification Report:
              precision    recall  f1-score   support

      benign       1.00      0.84      0.91       489
   malicious       0.80      1.00      0.89       321

    accuracy                           0.90       810
   macro avg       0.90      0.92      0.90       810
weighted avg       0.92      0.90      0.90       810

Confusion Matrix:
                 Pred: benign  Pred: malicious
True: benign              411               78
True: malicious             1              320

Detection Rate: 0.9969
False Positive Rate: 0.1595
F1-score (weighted): 0.9035

============================================================
EXPERIMENT: Window Size = 1000
============================================================
Train samples: 1986, Test samples: 810

--- Input Shapes ---
  Syscall input shape:   (batch_size, 1000)
  Retval input shape:    (batch_size, 1000)
  Param emb input shape: (batch_size, 1000, 384)
  After embedding concat: (batch_size, 1000, 448)

--- Model Architecture & Parameters ---
  syscall_embedding.weight: [81, 32] = 2,592 params
  retval_embedding.weight: [42586, 32] = 1,362,752 params
  convs.0.weight: [64, 448, 3] = 86,016 params
  convs.0.bias: [64] = 64 params
  convs.1.weight: [64, 448, 5] = 143,360 params
  convs.1.bias: [64] = 64 params
  convs.2.weight: [64, 448, 7] = 200,704 params
  convs.2.bias: [64] = 64 params
  fc.0.weight: [64, 192] = 12,288 params
  fc.0.bias: [64] = 64 params
  fc.3.weight: [2, 64] = 128 params
  fc.3.bias: [2] = 2 params
  ──────────────────────────────────────────────────
  Total parameters:     1,808,098
  Trainable parameters: 1,808,098

Training...
Epoch 1/20: 100%|██████████| 63/63 [00:28<00:00,  2.23it/s, loss=0.0054, acc=0.9290]
Epoch 2/20: 100%|██████████| 63/63 [00:34<00:00,  1.83it/s, loss=0.0003, acc=0.9985]
Epoch 3/20: 100%|██████████| 63/63 [00:36<00:00,  1.73it/s, loss=0.0001, acc=1.0000]
Epoch 4/20: 100%|██████████| 63/63 [00:37<00:00,  1.67it/s, loss=0.0000, acc=1.0000]
Epoch 5/20: 100%|██████████| 63/63 [00:38<00:00,  1.65it/s, loss=0.0000, acc=1.0000]
Epoch 6/20: 100%|██████████| 63/63 [00:38<00:00,  1.65it/s, loss=0.0000, acc=1.0000]
Epoch 7/20: 100%|██████████| 63/63 [00:38<00:00,  1.63it/s, loss=0.0000, acc=1.0000]
Epoch 8/20: 100%|██████████| 63/63 [00:39<00:00,  1.61it/s, loss=0.0000, acc=1.0000]
Epoch 9/20: 100%|██████████| 63/63 [00:38<00:00,  1.62it/s, loss=0.0000, acc=1.0000]
Epoch 10/20: 100%|██████████| 63/63 [00:38<00:00,  1.62it/s, loss=0.0000, acc=1.0000]
Epoch 11/20: 100%|██████████| 63/63 [00:38<00:00,  1.63it/s, loss=0.0000, acc=1.0000]
Epoch 12/20: 100%|██████████| 63/63 [00:38<00:00,  1.64it/s, loss=0.0000, acc=1.0000]
Epoch 13/20: 100%|██████████| 63/63 [00:38<00:00,  1.62it/s, loss=0.0000, acc=1.0000]
Epoch 14/20: 100%|██████████| 63/63 [00:38<00:00,  1.64it/s, loss=0.0000, acc=1.0000]
Epoch 15/20: 100%|██████████| 63/63 [00:39<00:00,  1.61it/s, loss=0.0000, acc=1.0000]
Epoch 16/20: 100%|██████████| 63/63 [00:39<00:00,  1.59it/s, loss=0.0000, acc=1.0000]
Epoch 17/20: 100%|██████████| 63/63 [00:39<00:00,  1.60it/s, loss=0.0000, acc=1.0000]
Epoch 18/20: 100%|██████████| 63/63 [00:39<00:00,  1.60it/s, loss=0.0000, acc=1.0000]
Epoch 19/20: 100%|██████████| 63/63 [00:38<00:00,  1.62it/s, loss=0.0000, acc=1.0000]
Epoch 20/20: 100%|██████████| 63/63 [00:39<00:00,  1.61it/s, loss=0.0000, acc=1.0000]
Training time: 758.89s

Evaluating...
Test time: 3.96s

Classification Report:
              precision    recall  f1-score   support

      benign       1.00      0.94      0.97       489
   malicious       0.92      1.00      0.96       321

    accuracy                           0.96       810
   macro avg       0.96      0.97      0.96       810
weighted avg       0.97      0.96      0.96       810

Confusion Matrix:
                 Pred: benign  Pred: malicious
True: benign              460               29
True: malicious             0              321

Detection Rate: 1.0000
False Positive Rate: 0.0593
F1-score (weighted): 0.9644
```

In [7]:
# Run experiments with different window sizes
results = []

for window_size in WINDOW_SIZES:
    print(f"\n{'='*60}")
    print(f"EXPERIMENT: Window Size = {window_size}")
    print(f"{'='*60}")
    
    # Create datasets
    train_dataset = AllFeaturesDataset(
        train_files, syscall_encoder, retval_encoder,
        param_to_embedding, window_size, PARAM_EMBED_DIM, PAD_PARAM_EMBEDDING
    )
    test_dataset = AllFeaturesDataset(
        test_files, syscall_encoder, retval_encoder,
        param_to_embedding, window_size, PARAM_EMBED_DIM, PAD_PARAM_EMBEDDING
    )
    
    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)
    
    print(f"Train samples: {len(train_dataset)}, Test samples: {len(test_dataset)}")
    
    # Print input shapes
    print(f"\n--- Input Shapes ---")
    print(f"  Syscall input shape:   (batch_size, {window_size})")
    print(f"  Retval input shape:    (batch_size, {window_size})")
    print(f"  Param emb input shape: (batch_size, {window_size}, {PARAM_EMBED_DIM})")
    print(f"  After embedding concat: (batch_size, {window_size}, {SYSCALL_EMBED_DIM + RETVAL_EMBED_DIM + PARAM_EMBED_DIM})")
    
    # Create model
    model = CNN1DAllFeatures(
        syscall_vocab_size=syscall_vocab_size,
        retval_vocab_size=retval_vocab_size,
        syscall_embed_dim=SYSCALL_EMBED_DIM,
        retval_embed_dim=RETVAL_EMBED_DIM,
        param_embed_dim=PARAM_EMBED_DIM,
        num_filters=64
    ).to(DEVICE)
    
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    
    # Print model parameter sizes per layer
    print(f"\n--- Model Architecture & Parameters ---")
    total_params = 0
    trainable_params = 0
    for name, param in model.named_parameters():
        param_count = param.numel()
        total_params += param_count
        if param.requires_grad:
            trainable_params += param_count
        print(f"  {name}: {list(param.shape)} = {param_count:,} params")
    print(f"  {'─'*50}")
    print(f"  Total parameters:     {total_params:,}")
    print(f"  Trainable parameters: {trainable_params:,}")
    
    # Training
    print(f"\nTraining...")
    train_start_time = time.time()
    for epoch in range(EPOCHS):
        model.train()
        total_loss = 0
        correct = 0
        total = 0
        
        pbar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{EPOCHS}", leave=True)
        for syscalls, retvals, param_embs, labels in pbar:
            syscalls = syscalls.to(DEVICE)
            retvals = retvals.to(DEVICE)
            param_embs = param_embs.to(DEVICE)
            labels = labels.to(DEVICE)
            
            optimizer.zero_grad()
            outputs = model(syscalls, retvals, param_embs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            
            total_loss += loss.item()
            _, predicted = outputs.max(1)
            correct += (predicted == labels).sum().item()
            total += labels.size(0)
            
            pbar.set_postfix({'loss': f'{total_loss/total:.4f}', 'acc': f'{correct/total:.4f}'})
        
        train_acc = correct / total
        avg_loss = total_loss / len(train_loader)
    
    train_time = time.time() - train_start_time
    print(f"Training time: {train_time:.2f}s")
    
    # Testing
    print(f"\nEvaluating...")
    test_start_time = time.time()
    model.eval()
    all_preds = []
    all_labels = []
    
    with torch.no_grad():
        for syscalls, retvals, param_embs, labels in test_loader:
            syscalls = syscalls.to(DEVICE)
            retvals = retvals.to(DEVICE)
            param_embs = param_embs.to(DEVICE)
            
            outputs = model(syscalls, retvals, param_embs)
            _, predicted = outputs.max(1)
            all_preds.extend(predicted.cpu().numpy())
            all_labels.extend(labels.numpy())
    
    test_time = time.time() - test_start_time
    print(f"Test time: {test_time:.2f}s")
    
    # Calculate metrics
    cm = confusion_matrix(all_labels, all_preds)
    tn, fp, fn, tp = cm.ravel()
    detection_rate = tp / (tp + fn) if (tp + fn) > 0 else 0.0
    false_positive_rate = fp / (fp + tn) if (fp + tn) > 0 else 0.0
    f1_weighted = f1_score(all_labels, all_preds, average='weighted')
    
    # Store results
    results.append({
        'Window Size': window_size,
        'Detection Rate': detection_rate,
        'False Positive Rate': false_positive_rate,
        'F1-score (weighted)': f1_weighted,
        'Train Time (s)': train_time,
        'Test Time (s)': test_time
    })
    
    # Print detailed results
    print(f"\nClassification Report:")
    print(classification_report(all_labels, all_preds, target_names=['benign', 'malicious']))
    
    labels_names = ['benign', 'malicious']
    cm_df = pd.DataFrame(cm, index=[f'True: {l}' for l in labels_names], columns=[f'Pred: {l}' for l in labels_names])
    print(f"Confusion Matrix:")
    print(cm_df)
    
    print(f"\nDetection Rate: {detection_rate:.4f}")
    print(f"False Positive Rate: {false_positive_rate:.4f}")
    print(f"F1-score (weighted): {f1_weighted:.4f}")
    
    # Clear GPU memory between experiments
    del model
    torch.cuda.empty_cache()


EXPERIMENT: Window Size = 2000
Train samples: 1986, Test samples: 810

--- Input Shapes ---
  Syscall input shape:   (batch_size, 2000)
  Retval input shape:    (batch_size, 2000)
  Param emb input shape: (batch_size, 2000, 384)
  After embedding concat: (batch_size, 2000, 448)

--- Model Architecture & Parameters ---
  syscall_embedding.weight: [81, 32] = 2,592 params
  retval_embedding.weight: [42586, 32] = 1,362,752 params
  convs.0.weight: [64, 448, 3] = 86,016 params
  convs.0.bias: [64] = 64 params
  convs.1.weight: [64, 448, 5] = 143,360 params
  convs.1.bias: [64] = 64 params
  convs.2.weight: [64, 448, 7] = 200,704 params
  convs.2.bias: [64] = 64 params
  fc.0.weight: [64, 192] = 12,288 params
  fc.0.bias: [64] = 64 params
  fc.3.weight: [2, 64] = 128 params
  fc.3.bias: [2] = 2 params
  ──────────────────────────────────────────────────
  Total parameters:     1,808,098
  Trainable parameters: 1,808,098

Training...


Epoch 1/20: 100%|██████████| 63/63 [00:59<00:00,  1.06it/s, loss=0.0047, acc=0.9391]
Epoch 2/20: 100%|██████████| 63/63 [01:10<00:00,  1.12s/it, loss=0.0002, acc=0.9990]
Epoch 3/20: 100%|██████████| 63/63 [01:14<00:00,  1.18s/it, loss=0.0000, acc=0.9995]
Epoch 4/20: 100%|██████████| 63/63 [01:15<00:00,  1.20s/it, loss=0.0000, acc=1.0000]
Epoch 5/20: 100%|██████████| 63/63 [01:16<00:00,  1.21s/it, loss=0.0000, acc=1.0000]
Epoch 6/20: 100%|██████████| 63/63 [01:17<00:00,  1.22s/it, loss=0.0000, acc=1.0000]
Epoch 7/20: 100%|██████████| 63/63 [01:16<00:00,  1.22s/it, loss=0.0000, acc=1.0000]
Epoch 8/20: 100%|██████████| 63/63 [01:17<00:00,  1.23s/it, loss=0.0000, acc=1.0000]
Epoch 9/20: 100%|██████████| 63/63 [01:16<00:00,  1.21s/it, loss=0.0000, acc=1.0000]
Epoch 10/20: 100%|██████████| 63/63 [01:17<00:00,  1.22s/it, loss=0.0000, acc=1.0000]
Epoch 11/20: 100%|██████████| 63/63 [01:18<00:00,  1.25s/it, loss=0.0000, acc=1.0000]
Epoch 12/20: 100%|██████████| 63/63 [01:16<00:00,  1.22s/it, lo

Training time: 1513.77s

Evaluating...
Test time: 7.93s

Classification Report:
              precision    recall  f1-score   support

      benign       1.00      0.96      0.98       489
   malicious       0.94      1.00      0.97       321

    accuracy                           0.98       810
   macro avg       0.97      0.98      0.98       810
weighted avg       0.98      0.98      0.98       810

Confusion Matrix:
                 Pred: benign  Pred: malicious
True: benign              470               19
True: malicious             0              321

Detection Rate: 1.0000
False Positive Rate: 0.0389
F1-score (weighted): 0.9766


In [8]:
# Summary Results Table
print("\n" + "="*80)
print("SUMMARY OF RESULTS (All Features Combined)")
print("="*80)

results_df = pd.DataFrame(results)
results_df['Detection Rate'] = results_df['Detection Rate'].apply(lambda x: f"{x:.4f}")
results_df['False Positive Rate'] = results_df['False Positive Rate'].apply(lambda x: f"{x:.4f}")
results_df['F1-score (weighted)'] = results_df['F1-score (weighted)'].apply(lambda x: f"{x:.4f}")
results_df['Train Time (s)'] = results_df['Train Time (s)'].apply(lambda x: f"{x:.2f}")
results_df['Test Time (s)'] = results_df['Test Time (s)'].apply(lambda x: f"{x:.2f}")

print(results_df.to_string(index=False))

print(f"\nFeatures combined:")
print(f"  - Syscall embedding: {SYSCALL_EMBED_DIM} dims")
print(f"  - Return value embedding: {RETVAL_EMBED_DIM} dims")
print(f"  - Parameter embedding (sentence transformer): {PARAM_EMBED_DIM} dims")
print(f"  - Total: {SYSCALL_EMBED_DIM + RETVAL_EMBED_DIM + PARAM_EMBED_DIM} dims per timestep")


SUMMARY OF RESULTS (All Features Combined)
 Window Size Detection Rate False Positive Rate F1-score (weighted) Train Time (s) Test Time (s)
         250         0.9907              0.0143              0.9877         205.56          1.06
         500         0.9969              0.1595              0.9035         397.73          2.01
        1000         1.0000              0.0593              0.9644         758.89          3.96
        2000         1.0000              0.0389              0.9766        1513.77          7.93

Features combined:
  - Syscall embedding: 32 dims
  - Return value embedding: 32 dims
  - Parameter embedding (sentence transformer): 384 dims
  - Total: 448 dims per timestep
