## Feature: Syscall only

In [7]:
import sys
sys.path.insert(0, '../../configs')
from config_loader import get_split_with_labels

import time
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, f1_score
import warnings
warnings.filterwarnings('ignore')

# Config
SPLIT = '70'
WINDOW_SIZES = [250, 500, 1000, 2000]  # Different sliding window lengths to test
BATCH_SIZE = 32
EPOCHS = 6
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {DEVICE}")

Using device: cuda


In [8]:
# Load data
train_files, test_files = get_split_with_labels(SPLIT)
print(f"Train files: {len(train_files)}, Test files: {len(test_files)}")

def load_runs(file_path):
    """Load syscalls grouped by run."""
    df = pd.read_csv(file_path)
    runs = df.groupby('run')['syscall'].apply(list).tolist()
    return runs

# Count runs per label for training and test sets
def count_runs_per_label(file_label_pairs):
    """Count total runs per label."""
    counts = {'benign': 0, 'malicious': 0}
    for path, label in file_label_pairs:
        runs = load_runs(path)
        counts[label] += len(runs)
    return counts

train_counts = count_runs_per_label(train_files)
test_counts = count_runs_per_label(test_files)

print(f"\nTraining set:")
print(f"  Total runs: {sum(train_counts.values())}")
print(f"  Benign runs: {train_counts['benign']}")
print(f"  Malicious runs: {train_counts['malicious']}")

print(f"\nTest set:")
print(f"  Total runs: {sum(test_counts.values())}")
print(f"  Benign runs: {test_counts['benign']}")
print(f"  Malicious runs: {test_counts['malicious']}")

# Collect all syscalls for building vocabulary
all_syscalls = []
for path, _ in train_files + test_files:
    for run in load_runs(path):
        all_syscalls.extend(run)

# Build syscall encoder (add PAD token at index 0)
syscall_encoder = LabelEncoder()
syscall_encoder.fit(all_syscalls)
vocab_size = len(syscall_encoder.classes_) + 1  # +1 for PAD token
PAD_IDX = 0
print(f"\nVocabulary size: {vocab_size} (including PAD)")

Train files: 21, Test files: 9

Training set:
  Total runs: 1986
  Benign runs: 1484
  Malicious runs: 502

Test set:
  Total runs: 810
  Benign runs: 489
  Malicious runs: 321

Vocabulary size: 81 (including PAD)


In [9]:
class SyscallDataset(Dataset):
    def __init__(self, file_label_pairs, encoder, max_len):
        self.sequences = []
        self.labels = []
        label_map = {'benign': 0, 'malicious': 1}
        
        for path, label in file_label_pairs:
            runs = load_runs(path)
            for run_syscalls in runs:
                # Encode syscalls (+1 to reserve 0 for PAD)
                encoded = encoder.transform(run_syscalls) + 1
                
                # Truncate or pad to max_len
                if len(encoded) > max_len:
                    encoded = encoded[:max_len]
                else:
                    encoded = np.pad(encoded, (0, max_len - len(encoded)), constant_values=PAD_IDX)
                
                self.sequences.append(encoded)
                self.labels.append(label_map[label])
        
        self.sequences = np.array(self.sequences)
        self.labels = np.array(self.labels)
    
    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, idx):
        return (torch.tensor(self.sequences[idx], dtype=torch.long),
                torch.tensor(self.labels[idx], dtype=torch.long))

In [10]:
class TransformerEncoder(nn.Module):
    def __init__(self, vocab_size, max_len, embed_dim=64, num_heads=4, num_layers=2, ff_dim=128, dropout=0.1):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=PAD_IDX)
        self.pos_encoding = nn.Parameter(torch.zeros(1, max_len, embed_dim))
        
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=embed_dim,
            nhead=num_heads,
            dim_feedforward=ff_dim,
            dropout=dropout,
            batch_first=True
        )
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        
        self.fc = nn.Sequential(
            nn.Linear(embed_dim, 64),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(64, 2)
        )
        
        self._init_weights()
    
    def _init_weights(self):
        nn.init.normal_(self.pos_encoding, std=0.02)
    
    def forward(self, x):
        # Create padding mask (True for PAD positions)
        padding_mask = (x == PAD_IDX)
        
        # Embedding + positional encoding
        x = self.embedding(x) + self.pos_encoding[:, :x.size(1), :]
        
        # Transformer encoder
        x = self.transformer(x, src_key_padding_mask=padding_mask)
        
        # Global average pooling (excluding padding)
        mask_expanded = (~padding_mask).unsqueeze(-1).float()
        x = (x * mask_expanded).sum(dim=1) / (mask_expanded.sum(dim=1) + 1e-9)
        
        return self.fc(x)

In [11]:
# Run experiments with different window sizes
results = []

for window_size in WINDOW_SIZES:
    print(f"\n{'='*60}")
    print(f"EXPERIMENT: Window Size = {window_size}")
    print(f"{'='*60}")
    
    # Create datasets
    train_dataset = SyscallDataset(train_files, syscall_encoder, window_size)
    test_dataset = SyscallDataset(test_files, syscall_encoder, window_size)
    
    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)
    
    print(f"Train samples: {len(train_dataset)}, Test samples: {len(test_dataset)}")
    
    # Create model
    model = TransformerEncoder(vocab_size, max_len=window_size).to(DEVICE)
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    
    print(f"Model parameters: {sum(p.numel() for p in model.parameters()):,}")
    
    # Training
    print(f"\nTraining...")
    train_start_time = time.time()
    for epoch in range(EPOCHS):
        model.train()
        total_loss = 0
        correct = 0
        total = 0
        
        for x, y in train_loader:
            x, y = x.to(DEVICE), y.to(DEVICE)
            
            optimizer.zero_grad()
            outputs = model(x)
            loss = criterion(outputs, y)
            loss.backward()
            optimizer.step()
            
            total_loss += loss.item()
            _, predicted = outputs.max(1)
            correct += (predicted == y).sum().item()
            total += y.size(0)
        
        train_acc = correct / total
        avg_loss = total_loss / len(train_loader)
        
        if (epoch + 1) % 5 == 0 or epoch == 0:
            print(f"Epoch {epoch+1}/{EPOCHS} - Loss: {avg_loss:.4f}, Acc: {train_acc:.4f}")
    train_time = time.time() - train_start_time
    print(f"Training time: {train_time:.2f}s")
    
    # Testing
    print(f"\nEvaluating...")
    test_start_time = time.time()
    model.eval()
    all_preds = []
    all_labels = []
    
    with torch.no_grad():
        for x, y in test_loader:
            x = x.to(DEVICE)
            outputs = model(x)
            _, predicted = outputs.max(1)
            all_preds.extend(predicted.cpu().numpy())
            all_labels.extend(y.numpy())
    test_time = time.time() - test_start_time
    print(f"Test time: {test_time:.2f}s")
    
    # Calculate metrics
    cm = confusion_matrix(all_labels, all_preds)
    tn, fp, fn, tp = cm.ravel()
    detection_rate = tp / (tp + fn) if (tp + fn) > 0 else 0.0
    false_positive_rate = fp / (fp + tn) if (fp + tn) > 0 else 0.0
    f1_weighted = f1_score(all_labels, all_preds, average='weighted')
    
    # Store results
    results.append({
        'Window Size': window_size,
        'Detection Rate': detection_rate,
        'False Positive Rate': false_positive_rate,
        'F1-score (weighted)': f1_weighted,
        'Train Time (s)': train_time,
        'Test Time (s)': test_time
    })
    
    # Print detailed results
    print(f"\nClassification Report:")
    print(classification_report(all_labels, all_preds, target_names=['benign', 'malicious']))
    
    labels = ['benign', 'malicious']
    cm_df = pd.DataFrame(cm, index=[f'True: {l}' for l in labels], columns=[f'Pred: {l}' for l in labels])
    print(f"Confusion Matrix:")
    print(cm_df)
    
    print(f"\nDetection Rate: {detection_rate:.4f}")
    print(f"False Positive Rate: {false_positive_rate:.4f}")
    print(f"F1-score (weighted): {f1_weighted:.4f}")


EXPERIMENT: Window Size = 250
Train samples: 1986, Test samples: 810
Model parameters: 92,418

Training...
Epoch 1/6 - Loss: 0.2768, Acc: 0.8766
Epoch 5/6 - Loss: 0.0227, Acc: 0.9940
Training time: 189.58s

Evaluating...
Test time: 2.55s

Classification Report:
              precision    recall  f1-score   support

      benign       0.99      0.82      0.90       489
   malicious       0.78      0.99      0.87       321

    accuracy                           0.89       810
   macro avg       0.89      0.90      0.89       810
weighted avg       0.91      0.89      0.89       810

Confusion Matrix:
                 Pred: benign  Pred: malicious
True: benign              400               89
True: malicious             3              318

Detection Rate: 0.9907
False Positive Rate: 0.1820
F1-score (weighted): 0.8877

EXPERIMENT: Window Size = 500
Train samples: 1986, Test samples: 810
Model parameters: 108,418

Training...
Epoch 1/6 - Loss: 0.2772, Acc: 0.8877
Epoch 5/6 - Loss: 0.0150

In [12]:
# Summary Results Table
print("\n" + "="*80)
print("SUMMARY OF RESULTS")
print("="*80)

results_df = pd.DataFrame(results)
results_df['Detection Rate'] = results_df['Detection Rate'].apply(lambda x: f"{x:.4f}")
results_df['False Positive Rate'] = results_df['False Positive Rate'].apply(lambda x: f"{x:.4f}")
results_df['F1-score (weighted)'] = results_df['F1-score (weighted)'].apply(lambda x: f"{x:.4f}")
results_df['Train Time (s)'] = results_df['Train Time (s)'].apply(lambda x: f"{x:.2f}")
results_df['Test Time (s)'] = results_df['Test Time (s)'].apply(lambda x: f"{x:.2f}")

print(results_df.to_string(index=False))


SUMMARY OF RESULTS
 Window Size Detection Rate False Positive Rate F1-score (weighted) Train Time (s) Test Time (s)
         250         0.9907              0.1820              0.8877         189.58          2.55
         500         0.9938              0.1840              0.8877         627.14          4.90
        1000         1.0000              0.1881              0.8876        2265.30          8.43
        2000         1.0000              0.1840              0.8901        8463.57         14.74


## Feature: Return values only

In [18]:
import sys
sys.path.insert(0, '../../configs')
from config_loader import get_split_with_labels

import time
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, f1_score
import warnings
warnings.filterwarnings('ignore')

# Config
SPLIT = '70'
WINDOW_SIZES = [250, 500, 1000, 2000]  # Different sliding window lengths to test
BATCH_SIZE = 32
EPOCHS = 6
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {DEVICE}")

Using device: cuda


In [19]:
# Load data
train_files, test_files = get_split_with_labels(SPLIT)
print(f"Train files: {len(train_files)}, Test files: {len(test_files)}")

def load_runs_retval(file_path):
    """Load return values grouped by run."""
    df = pd.read_csv(file_path)
    runs = df.groupby('run')['Ret'].apply(list).tolist()
    return runs

# Count runs per label for training and test sets
def count_runs_per_label(file_label_pairs):
    """Count total runs per label."""
    counts = {'benign': 0, 'malicious': 0}
    for path, label in file_label_pairs:
        runs = load_runs_retval(path)
        counts[label] += len(runs)
    return counts

train_counts = count_runs_per_label(train_files)
test_counts = count_runs_per_label(test_files)

print(f"\nTraining set:")
print(f"  Total runs: {sum(train_counts.values())}")
print(f"  Benign runs: {train_counts['benign']}")
print(f"  Malicious runs: {train_counts['malicious']}")

print(f"\nTest set:")
print(f"  Total runs: {sum(test_counts.values())}")
print(f"  Benign runs: {test_counts['benign']}")
print(f"  Malicious runs: {test_counts['malicious']}")

# Collect all return values for building vocabulary
all_retvals = []
for path, _ in train_files + test_files:
    for run in load_runs_retval(path):
        all_retvals.extend(run)

# Build return value encoder (add PAD token at index 0)
retval_encoder = LabelEncoder()
retval_encoder.fit(all_retvals)
vocab_size = len(retval_encoder.classes_) + 1  # +1 for PAD token
PAD_IDX = 0
print(f"\nVocabulary size: {vocab_size} (including PAD)")

Train files: 21, Test files: 9

Training set:
  Total runs: 1986
  Benign runs: 1484
  Malicious runs: 502

Test set:
  Total runs: 810
  Benign runs: 489
  Malicious runs: 321

Vocabulary size: 42586 (including PAD)


In [20]:
class RetvalDataset(Dataset):
    def __init__(self, file_label_pairs, encoder, max_len):
        self.sequences = []
        self.labels = []
        label_map = {'benign': 0, 'malicious': 1}
        
        for path, label in file_label_pairs:
            runs = load_runs_retval(path)
            for run_retvals in runs:
                # Encode return values (+1 to reserve 0 for PAD)
                encoded = encoder.transform(run_retvals) + 1
                
                # Truncate or pad to max_len
                if len(encoded) > max_len:
                    encoded = encoded[:max_len]
                else:
                    encoded = np.pad(encoded, (0, max_len - len(encoded)), constant_values=PAD_IDX)
                
                self.sequences.append(encoded)
                self.labels.append(label_map[label])
        
        self.sequences = np.array(self.sequences)
        self.labels = np.array(self.labels)
    
    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, idx):
        return (torch.tensor(self.sequences[idx], dtype=torch.long),
                torch.tensor(self.labels[idx], dtype=torch.long))

In [21]:
class TransformerEncoder(nn.Module):
    def __init__(self, vocab_size, max_len, embed_dim=64, num_heads=4, num_layers=2, ff_dim=128, dropout=0.1):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=PAD_IDX)
        self.pos_encoding = nn.Parameter(torch.zeros(1, max_len, embed_dim))
        
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=embed_dim,
            nhead=num_heads,
            dim_feedforward=ff_dim,
            dropout=dropout,
            batch_first=True
        )
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        
        self.fc = nn.Sequential(
            nn.Linear(embed_dim, 64),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(64, 2)
        )
        
        self._init_weights()
    
    def _init_weights(self):
        nn.init.normal_(self.pos_encoding, std=0.02)
    
    def forward(self, x):
        # Create padding mask (True for PAD positions)
        padding_mask = (x == PAD_IDX)
        
        # Embedding + positional encoding
        x = self.embedding(x) + self.pos_encoding[:, :x.size(1), :]
        
        # Transformer encoder
        x = self.transformer(x, src_key_padding_mask=padding_mask)
        
        # Global average pooling (excluding padding)
        mask_expanded = (~padding_mask).unsqueeze(-1).float()
        x = (x * mask_expanded).sum(dim=1) / (mask_expanded.sum(dim=1) + 1e-9)
        
        return self.fc(x)

In [22]:
# Run experiments with different window sizes
results = []

for window_size in WINDOW_SIZES:
    print(f"\n{'='*60}")
    print(f"EXPERIMENT: Window Size = {window_size}")
    print(f"{'='*60}")
    
    # Create datasets
    train_dataset = RetvalDataset(train_files, retval_encoder, window_size)
    test_dataset = RetvalDataset(test_files, retval_encoder, window_size)
    
    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)
    
    print(f"Train samples: {len(train_dataset)}, Test samples: {len(test_dataset)}")
    
    # Create model
    model = TransformerEncoder(vocab_size, max_len=window_size).to(DEVICE)
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    
    print(f"Model parameters: {sum(p.numel() for p in model.parameters()):,}")
    
    # Training
    print(f"\nTraining...")
    train_start_time = time.time()
    for epoch in range(EPOCHS):
        model.train()
        total_loss = 0
        correct = 0
        total = 0
        
        for x, y in train_loader:
            x, y = x.to(DEVICE), y.to(DEVICE)
            
            optimizer.zero_grad()
            outputs = model(x)
            loss = criterion(outputs, y)
            loss.backward()
            optimizer.step()
            
            total_loss += loss.item()
            _, predicted = outputs.max(1)
            correct += (predicted == y).sum().item()
            total += y.size(0)
        
        train_acc = correct / total
        avg_loss = total_loss / len(train_loader)
        
        if (epoch + 1) % 5 == 0 or epoch == 0:
            print(f"Epoch {epoch+1}/{EPOCHS} - Loss: {avg_loss:.4f}, Acc: {train_acc:.4f}")
    train_time = time.time() - train_start_time
    print(f"Training time: {train_time:.2f}s")
    
    # Testing
    print(f"\nEvaluating...")
    test_start_time = time.time()
    model.eval()
    all_preds = []
    all_labels = []
    
    with torch.no_grad():
        for x, y in test_loader:
            x = x.to(DEVICE)
            outputs = model(x)
            _, predicted = outputs.max(1)
            all_preds.extend(predicted.cpu().numpy())
            all_labels.extend(y.numpy())
    test_time = time.time() - test_start_time
    print(f"Test time: {test_time:.2f}s")
    
    # Calculate metrics
    cm = confusion_matrix(all_labels, all_preds)
    tn, fp, fn, tp = cm.ravel()
    detection_rate = tp / (tp + fn) if (tp + fn) > 0 else 0.0
    false_positive_rate = fp / (fp + tn) if (fp + tn) > 0 else 0.0
    f1_weighted = f1_score(all_labels, all_preds, average='weighted')
    
    # Store results
    results.append({
        'Window Size': window_size,
        'Detection Rate': detection_rate,
        'False Positive Rate': false_positive_rate,
        'F1-score (weighted)': f1_weighted,
        'Train Time (s)': train_time,
        'Test Time (s)': test_time
    })
    
    # Print detailed results
    print(f"\nClassification Report:")
    print(classification_report(all_labels, all_preds, target_names=['benign', 'malicious']))
    
    labels = ['benign', 'malicious']
    cm_df = pd.DataFrame(cm, index=[f'True: {l}' for l in labels], columns=[f'Pred: {l}' for l in labels])
    print(f"Confusion Matrix:")
    print(cm_df)
    
    print(f"\nDetection Rate: {detection_rate:.4f}")
    print(f"False Positive Rate: {false_positive_rate:.4f}")
    print(f"F1-score (weighted): {f1_weighted:.4f}")


EXPERIMENT: Window Size = 250
Train samples: 1986, Test samples: 810
Model parameters: 2,812,738

Training...
Epoch 1/6 - Loss: 0.3241, Acc: 0.8620
Epoch 5/6 - Loss: 0.0238, Acc: 0.9955
Training time: 197.73s

Evaluating...
Test time: 2.54s

Classification Report:
              precision    recall  f1-score   support

      benign       0.99      0.83      0.90       489
   malicious       0.79      0.99      0.88       321

    accuracy                           0.89       810
   macro avg       0.89      0.91      0.89       810
weighted avg       0.91      0.89      0.89       810

Confusion Matrix:
                 Pred: benign  Pred: malicious
True: benign              404               85
True: malicious             3              318

Detection Rate: 0.9907
False Positive Rate: 0.1738
F1-score (weighted): 0.8925

EXPERIMENT: Window Size = 500
Train samples: 1986, Test samples: 810
Model parameters: 2,828,738

Training...
Epoch 1/6 - Loss: 0.3219, Acc: 0.8620
Epoch 5/6 - Loss: 0

In [23]:
# Summary Results Table
print("\n" + "="*80)
print("SUMMARY OF RESULTS")
print("="*80)

results_df = pd.DataFrame(results)
results_df['Detection Rate'] = results_df['Detection Rate'].apply(lambda x: f"{x:.4f}")
results_df['False Positive Rate'] = results_df['False Positive Rate'].apply(lambda x: f"{x:.4f}")
results_df['F1-score (weighted)'] = results_df['F1-score (weighted)'].apply(lambda x: f"{x:.4f}")
results_df['Train Time (s)'] = results_df['Train Time (s)'].apply(lambda x: f"{x:.2f}")
results_df['Test Time (s)'] = results_df['Test Time (s)'].apply(lambda x: f"{x:.2f}")

print(results_df.to_string(index=False))


SUMMARY OF RESULTS
 Window Size Detection Rate False Positive Rate F1-score (weighted) Train Time (s) Test Time (s)
         250         0.9907              0.1738              0.8925         197.73          2.54
         500         0.9969              0.0082              0.9938          67.34          0.40
        1000         1.0000              0.1840              0.8901        2278.55          8.49
        2000         1.0000              0.1840              0.8901        8481.32         14.81


## Feature: Parameters only

In [1]:
import sys
sys.path.insert(0, '../../configs')
from config_loader import get_split_with_labels

import time
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import classification_report, confusion_matrix, f1_score
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

# Config
SPLIT = '70'
WINDOW_SIZES = [250, 500, 1000, 2000]  # Different sliding window lengths to test
BATCH_SIZE = 32
EPOCHS = 6
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {DEVICE}")

# Load sentence transformer model for semantic embeddings
# Using a lightweight model that produces 384-dim embeddings
# Keep on CPU to save GPU memory for training
print("Loading sentence transformer model...")
sentence_model = SentenceTransformer('all-MiniLM-L6-v2', device='cpu')
EMBEDDING_DIM = sentence_model.get_sentence_embedding_dimension()
print(f"Sentence embedding dimension: {EMBEDDING_DIM}")

Using device: cuda
Loading sentence transformer model...


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Sentence embedding dimension: 384


In [2]:
# Load data
train_files, test_files = get_split_with_labels(SPLIT)
print(f"Train files: {len(train_files)}, Test files: {len(test_files)}")

def load_runs_params_raw(file_path):
    """Load raw parameter strings grouped by run (list of param strings per run).
    
    Returns list of runs, where each run is a list of parameter strings (one per syscall).
    """
    df = pd.read_csv(file_path)
    runs = []
    for run_id, group in df.groupby('run'):
        # Keep parameters as list of strings (one per syscall)
        run_params = group['parameters'].tolist()
        runs.append(run_params)
    return runs

# Count runs per label for training and test sets
def count_runs_per_label(file_label_pairs):
    """Count total runs per label."""
    counts = {'benign': 0, 'malicious': 0}
    for path, label in file_label_pairs:
        runs = load_runs_params_raw(path)
        counts[label] += len(runs)
    return counts

train_counts = count_runs_per_label(train_files)
test_counts = count_runs_per_label(test_files)

print(f"\nTraining set:")
print(f"  Total runs: {sum(train_counts.values())}")
print(f"  Benign runs: {train_counts['benign']}")
print(f"  Malicious runs: {train_counts['malicious']}")

print(f"\nTest set:")
print(f"  Total runs: {sum(test_counts.values())}")
print(f"  Benign runs: {test_counts['benign']}")
print(f"  Malicious runs: {test_counts['malicious']}")

Train files: 21, Test files: 9

Training set:
  Total runs: 1986
  Benign runs: 1484
  Malicious runs: 502

Test set:
  Total runs: 810
  Benign runs: 489
  Malicious runs: 321


In [3]:
# Pre-compute sentence embeddings for all unique parameter strings
# This avoids re-encoding the same strings multiple times and saves memory
print("Collecting unique parameter strings...")

unique_params = set()
for path, _ in train_files + test_files:
    for run_params in load_runs_params_raw(path):
        for param_str in run_params:
            # Convert to string and handle NaN
            if pd.isna(param_str):
                unique_params.add('<EMPTY>')
            else:
                unique_params.add(str(param_str))

unique_params = list(unique_params)
print(f"Unique parameter strings: {len(unique_params)}")

# Compute embeddings for all unique strings in batches
print("Computing sentence embeddings (this may take a few minutes)...")
param_embeddings = sentence_model.encode(
    unique_params, 
    show_progress_bar=True, 
    batch_size=256,
    convert_to_numpy=True
)

# Create a mapping from parameter string to embedding
param_to_embedding = {param: emb for param, emb in zip(unique_params, param_embeddings)}
print(f"Embeddings computed. Shape per embedding: {EMBEDDING_DIM}")

# Create a zero embedding for padding
PAD_EMBEDDING = np.zeros(EMBEDDING_DIM, dtype=np.float32)

Collecting unique parameter strings...
Unique parameter strings: 266019
Computing sentence embeddings (this may take a few minutes)...


Batches:   0%|          | 0/1040 [00:00<?, ?it/s]

Embeddings computed. Shape per embedding: 384


In [4]:
class ParamsEmbeddingDataset(Dataset):
    """Dataset that uses pre-computed sentence embeddings for parameter strings.
    
    Instead of tokenizing each parameter string into multiple tokens,
    we embed each parameter string as a single vector using a sentence transformer.
    This reduces sequence length from window_size * tokens_per_syscall to just window_size.
    """
    def __init__(self, file_label_pairs, param_to_embedding, window_size, embed_dim, pad_embedding):
        """
        Args:
            file_label_pairs: List of (file_path, label) tuples
            param_to_embedding: Dict mapping parameter strings to embeddings
            window_size: Number of syscalls (parameter strings) to consider from each run
            embed_dim: Dimension of sentence embeddings
            pad_embedding: Zero embedding for padding
        """
        self.embeddings = []
        self.labels = []
        self.window_size = window_size
        self.embed_dim = embed_dim
        label_map = {'benign': 0, 'malicious': 1}
        
        for path, label in file_label_pairs:
            runs = load_runs_params_raw(path)
            for run_params in runs:
                # Take first window_size parameter strings from this run
                params_to_use = run_params[:window_size]
                
                # Get embedding for each parameter string
                run_embeddings = []
                for param_str in params_to_use:
                    if pd.isna(param_str):
                        key = '<EMPTY>'
                    else:
                        key = str(param_str)
                    run_embeddings.append(param_to_embedding[key])
                
                # Pad to window_size if needed
                while len(run_embeddings) < window_size:
                    run_embeddings.append(pad_embedding)
                
                self.embeddings.append(np.array(run_embeddings, dtype=np.float32))
                self.labels.append(label_map[label])
        
        self.embeddings = np.array(self.embeddings)  # Shape: (num_samples, window_size, embed_dim)
        self.labels = np.array(self.labels)
    
    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, idx):
        return (torch.tensor(self.embeddings[idx], dtype=torch.float32),
                torch.tensor(self.labels[idx], dtype=torch.long))

In [5]:
class TransformerEncoderForEmbeddings(nn.Module):
    """Transformer encoder that takes pre-computed embeddings as input.
    
    Unlike the standard version that has an embedding layer,
    this version expects inputs to already be embedded (float tensors).
    """
    def __init__(self, input_dim, max_len, hidden_dim=128, num_heads=4, num_layers=2, ff_dim=256, dropout=0.1):
        super().__init__()
        
        # Project input embeddings to hidden dimension
        self.input_projection = nn.Linear(input_dim, hidden_dim)
        self.pos_encoding = nn.Parameter(torch.zeros(1, max_len, hidden_dim))
        
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=hidden_dim,
            nhead=num_heads,
            dim_feedforward=ff_dim,
            dropout=dropout,
            batch_first=True
        )
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        
        self.fc = nn.Sequential(
            nn.Linear(hidden_dim, 64),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(64, 2)
        )
        
        self._init_weights()
    
    def _init_weights(self):
        nn.init.normal_(self.pos_encoding, std=0.02)
    
    def forward(self, x):
        # x shape: (batch_size, seq_len, input_dim)
        # Detect padding by checking if embedding is all zeros
        padding_mask = (x.abs().sum(dim=-1) == 0)  # True for padded positions
        
        # Project to hidden dimension and add positional encoding
        x = self.input_projection(x) + self.pos_encoding[:, :x.size(1), :]
        
        # Transformer encoder
        x = self.transformer(x, src_key_padding_mask=padding_mask)
        
        # Global average pooling (excluding padding)
        mask_expanded = (~padding_mask).unsqueeze(-1).float()
        x = (x * mask_expanded).sum(dim=1) / (mask_expanded.sum(dim=1) + 1e-9)
        
        return self.fc(x)

In [6]:
# Run experiments with different window sizes (window_size = number of syscalls)
# With sentence embeddings, sequence length = window_size (instead of window_size * tokens_per_syscall)
results = []

for window_size in WINDOW_SIZES:
    print(f"\n{'='*60}")
    print(f"EXPERIMENT: Window Size = {window_size} syscalls")
    print(f"{'='*60}")
    
    # Create datasets with pre-computed embeddings
    train_dataset = ParamsEmbeddingDataset(
        train_files, param_to_embedding, window_size, EMBEDDING_DIM, PAD_EMBEDDING
    )
    test_dataset = ParamsEmbeddingDataset(
        test_files, param_to_embedding, window_size, EMBEDDING_DIM, PAD_EMBEDDING
    )
    
    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)
    
    print(f"Train samples: {len(train_dataset)}, Test samples: {len(test_dataset)}")
    print(f"Sequence length: {window_size} (1 embedding per syscall parameter)")
    print(f"Input embedding dim: {EMBEDDING_DIM}")
    
    # Create model - takes pre-computed embeddings as input
    model = TransformerEncoderForEmbeddings(
        input_dim=EMBEDDING_DIM, 
        max_len=window_size,
        hidden_dim=128,
        num_heads=4,
        num_layers=2,
        ff_dim=256
    ).to(DEVICE)
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    
    print(f"Model parameters: {sum(p.numel() for p in model.parameters()):,}")
    
    # Training
    print(f"\nTraining...")
    train_start_time = time.time()
    for epoch in range(EPOCHS):
        model.train()
        total_loss = 0
        correct = 0
        total = 0
        
        pbar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{EPOCHS}", leave=True)
        for x, y in pbar:
            x, y = x.to(DEVICE), y.to(DEVICE)
            
            optimizer.zero_grad()
            outputs = model(x)
            loss = criterion(outputs, y)
            loss.backward()
            optimizer.step()
            
            total_loss += loss.item()
            _, predicted = outputs.max(1)
            correct += (predicted == y).sum().item()
            total += y.size(0)
            
            # Update progress bar with current metrics
            pbar.set_postfix({'loss': f'{total_loss/total:.4f}', 'acc': f'{correct/total:.4f}'})
        
        train_acc = correct / total
        avg_loss = total_loss / len(train_loader)
    train_time = time.time() - train_start_time
    print(f"Training time: {train_time:.2f}s")
    
    # Testing
    print(f"\nEvaluating...")
    test_start_time = time.time()
    model.eval()
    all_preds = []
    all_labels = []
    
    with torch.no_grad():
        for x, y in test_loader:
            x = x.to(DEVICE)
            outputs = model(x)
            _, predicted = outputs.max(1)
            all_preds.extend(predicted.cpu().numpy())
            all_labels.extend(y.numpy())
    test_time = time.time() - test_start_time
    print(f"Test time: {test_time:.2f}s")
    
    # Calculate metrics
    cm = confusion_matrix(all_labels, all_preds)
    tn, fp, fn, tp = cm.ravel()
    detection_rate = tp / (tp + fn) if (tp + fn) > 0 else 0.0
    false_positive_rate = fp / (fp + tn) if (fp + tn) > 0 else 0.0
    f1_weighted = f1_score(all_labels, all_preds, average='weighted')
    
    # Store results
    results.append({
        'Window Size (syscalls)': window_size,
        'Seq Length': window_size,
        'Detection Rate': detection_rate,
        'False Positive Rate': false_positive_rate,
        'F1-score (weighted)': f1_weighted,
        'Train Time (s)': train_time,
        'Test Time (s)': test_time
    })
    
    # Print detailed results
    print(f"\nClassification Report:")
    print(classification_report(all_labels, all_preds, target_names=['benign', 'malicious']))
    
    labels = ['benign', 'malicious']
    cm_df = pd.DataFrame(cm, index=[f'True: {l}' for l in labels], columns=[f'Pred: {l}' for l in labels])
    print(f"Confusion Matrix:")
    print(cm_df)
    
    print(f"\nDetection Rate: {detection_rate:.4f}")
    print(f"False Positive Rate: {false_positive_rate:.4f}")
    print(f"F1-score (weighted): {f1_weighted:.4f}")
    
    # Clear GPU memory between experiments
    del model
    torch.cuda.empty_cache()


EXPERIMENT: Window Size = 250 syscalls
Train samples: 1986, Test samples: 810
Sequence length: 250 (1 embedding per syscall parameter)
Input embedding dim: 384
Model parameters: 354,626

Training...


Epoch 1/6: 100%|██████████| 63/63 [00:05<00:00, 12.33it/s, loss=0.0169, acc=0.7533]
Epoch 2/6: 100%|██████████| 63/63 [00:04<00:00, 12.64it/s, loss=0.0224, acc=0.6883]
Epoch 3/6: 100%|██████████| 63/63 [00:05<00:00, 12.59it/s, loss=0.0124, acc=0.8293]
Epoch 4/6: 100%|██████████| 63/63 [00:05<00:00, 12.59it/s, loss=0.0023, acc=0.9854]
Epoch 5/6: 100%|██████████| 63/63 [00:04<00:00, 12.60it/s, loss=0.0010, acc=0.9950]
Epoch 6/6: 100%|██████████| 63/63 [00:04<00:00, 12.60it/s, loss=0.0009, acc=0.9950]


Training time: 30.12s

Evaluating...
Test time: 0.55s

Classification Report:
              precision    recall  f1-score   support

      benign       0.99      0.82      0.90       489
   malicious       0.78      0.99      0.87       321

    accuracy                           0.89       810
   macro avg       0.89      0.91      0.89       810
weighted avg       0.91      0.89      0.89       810

Confusion Matrix:
                 Pred: benign  Pred: malicious
True: benign              401               88
True: malicious             3              318

Detection Rate: 0.9907
False Positive Rate: 0.1800
F1-score (weighted): 0.8889

EXPERIMENT: Window Size = 500 syscalls
Train samples: 1986, Test samples: 810
Sequence length: 500 (1 embedding per syscall parameter)
Input embedding dim: 384
Model parameters: 386,626

Training...


Epoch 1/6: 100%|██████████| 63/63 [00:14<00:00,  4.23it/s, loss=0.0105, acc=0.8560]
Epoch 2/6: 100%|██████████| 63/63 [00:15<00:00,  3.98it/s, loss=0.0027, acc=0.9839]
Epoch 3/6: 100%|██████████| 63/63 [00:17<00:00,  3.70it/s, loss=0.0004, acc=0.9975]
Epoch 4/6: 100%|██████████| 63/63 [00:16<00:00,  3.73it/s, loss=0.0004, acc=0.9980]
Epoch 5/6: 100%|██████████| 63/63 [02:15<00:00,  2.15s/it, loss=0.0004, acc=0.9980]
Epoch 6/6: 100%|██████████| 63/63 [02:19<00:00,  2.22s/it, loss=0.0010, acc=0.9935]


Training time: 339.68s

Evaluating...
Test time: 8.26s

Classification Report:
              precision    recall  f1-score   support

      benign       1.00      0.90      0.94       489
   malicious       0.86      1.00      0.92       321

    accuracy                           0.94       810
   macro avg       0.93      0.95      0.93       810
weighted avg       0.94      0.94      0.94       810

Confusion Matrix:
                 Pred: benign  Pred: malicious
True: benign              438               51
True: malicious             1              320

Detection Rate: 0.9969
False Positive Rate: 0.1043
F1-score (weighted): 0.9364

EXPERIMENT: Window Size = 1000 syscalls
Train samples: 1986, Test samples: 810
Sequence length: 1000 (1 embedding per syscall parameter)
Input embedding dim: 384
Model parameters: 450,626

Training...


Epoch 1/6: 100%|██████████| 63/63 [07:44<00:00,  7.37s/it, loss=0.0138, acc=0.8132]
Epoch 2/6: 100%|██████████| 63/63 [07:44<00:00,  7.37s/it, loss=0.0008, acc=0.9975]
Epoch 3/6: 100%|██████████| 63/63 [07:44<00:00,  7.37s/it, loss=0.0007, acc=0.9955]
Epoch 4/6: 100%|██████████| 63/63 [07:44<00:00,  7.37s/it, loss=0.0003, acc=0.9990]
Epoch 5/6: 100%|██████████| 63/63 [07:44<00:00,  7.37s/it, loss=0.0002, acc=0.9990]
Epoch 6/6: 100%|██████████| 63/63 [07:44<00:00,  7.37s/it, loss=0.0003, acc=0.9975]


Training time: 2786.50s

Evaluating...
Test time: 14.08s

Classification Report:
              precision    recall  f1-score   support

      benign       1.00      0.83      0.91       489
   malicious       0.79      1.00      0.89       321

    accuracy                           0.90       810
   macro avg       0.90      0.92      0.90       810
weighted avg       0.92      0.90      0.90       810

Confusion Matrix:
                 Pred: benign  Pred: malicious
True: benign              406               83
True: malicious             0              321

Detection Rate: 1.0000
False Positive Rate: 0.1697
F1-score (weighted): 0.8986

EXPERIMENT: Window Size = 2000 syscalls
Train samples: 1986, Test samples: 810
Sequence length: 2000 (1 embedding per syscall parameter)
Input embedding dim: 384
Model parameters: 578,626

Training...


Epoch 1/6: 100%|██████████| 63/63 [27:49<00:00, 26.49s/it, loss=0.0114, acc=0.8474]
Epoch 2/6: 100%|██████████| 63/63 [27:45<00:00, 26.44s/it, loss=0.0004, acc=0.9985]
Epoch 3/6: 100%|██████████| 63/63 [27:45<00:00, 26.43s/it, loss=0.0004, acc=0.9985]
Epoch 4/6: 100%|██████████| 63/63 [27:45<00:00, 26.43s/it, loss=0.0009, acc=0.9935]
Epoch 5/6: 100%|██████████| 63/63 [27:45<00:00, 26.44s/it, loss=0.0004, acc=0.9985]
Epoch 6/6: 100%|██████████| 63/63 [27:45<00:00, 26.43s/it, loss=0.0004, acc=0.9985]


Training time: 9996.82s

Evaluating...
Test time: 24.92s

Classification Report:
              precision    recall  f1-score   support

      benign       1.00      0.82      0.90       489
   malicious       0.78      1.00      0.88       321

    accuracy                           0.89       810
   macro avg       0.89      0.91      0.89       810
weighted avg       0.91      0.89      0.89       810

Confusion Matrix:
                 Pred: benign  Pred: malicious
True: benign              399               90
True: malicious             0              321

Detection Rate: 1.0000
False Positive Rate: 0.1840
F1-score (weighted): 0.8901


In [7]:
# Summary Results Table
print("\n" + "="*80)
print("SUMMARY OF RESULTS (Sentence Embeddings)")
print("="*80)

results_df = pd.DataFrame(results)
results_df['Detection Rate'] = results_df['Detection Rate'].apply(lambda x: f"{x:.4f}")
results_df['False Positive Rate'] = results_df['False Positive Rate'].apply(lambda x: f"{x:.4f}")
results_df['F1-score (weighted)'] = results_df['F1-score (weighted)'].apply(lambda x: f"{x:.4f}")
results_df['Train Time (s)'] = results_df['Train Time (s)'].apply(lambda x: f"{x:.2f}")
results_df['Test Time (s)'] = results_df['Test Time (s)'].apply(lambda x: f"{x:.2f}")

print(results_df.to_string(index=False))

print("\nNote: Using sentence embeddings reduces sequence length from")
print("window_size * ~13 tokens to just window_size embeddings,")
print("enabling larger window sizes within GPU memory constraints.")


SUMMARY OF RESULTS (Sentence Embeddings)
 Window Size (syscalls)  Seq Length Detection Rate False Positive Rate F1-score (weighted) Train Time (s) Test Time (s)
                    250         250         0.9907              0.1800              0.8889          30.12          0.55
                    500         500         0.9969              0.1043              0.9364         339.68          8.26
                   1000        1000         1.0000              0.1697              0.8986        2786.50         14.08
                   2000        2000         1.0000              0.1840              0.8901        9996.82         24.92

Note: Using sentence embeddings reduces sequence length from
window_size * ~13 tokens to just window_size embeddings,
enabling larger window sizes within GPU memory constraints.


## Features: Syscalls + Return values + Parameters