## Feature: Syscall only

In [35]:
import sys
sys.path.insert(0, '../../configs')
from config_loader import get_split_with_labels

import time
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, f1_score
import warnings
warnings.filterwarnings('ignore')

# Config
SPLIT = '70'
WINDOW_SIZES = [250, 500, 1000, 2000]  # Different sliding window lengths to test
# WINDOW_SIZES = [500]  # Different sliding window lengths to test
BATCH_SIZE = 32
EPOCHS = 20
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {DEVICE}")

Using device: cuda


In [36]:
# Load data
train_files, test_files = get_split_with_labels(SPLIT)
print(f"Train files: {len(train_files)}, Test files: {len(test_files)}")

def load_runs(file_path):
    """Load syscalls grouped by run."""
    df = pd.read_csv(file_path)
    runs = df.groupby('run')['syscall'].apply(list).tolist()
    return runs

# Count runs per label for training and test sets
def count_runs_per_label(file_label_pairs):
    """Count total runs per label."""
    counts = {'benign': 0, 'malicious': 0}
    for path, label in file_label_pairs:
        runs = load_runs(path)
        counts[label] += len(runs)
    return counts

train_counts = count_runs_per_label(train_files)
test_counts = count_runs_per_label(test_files)

print(f"\nTraining set:")
print(f"  Total runs: {sum(train_counts.values())}")
print(f"  Benign runs: {train_counts['benign']}")
print(f"  Malicious runs: {train_counts['malicious']}")

print(f"\nTest set:")
print(f"  Total runs: {sum(test_counts.values())}")
print(f"  Benign runs: {test_counts['benign']}")
print(f"  Malicious runs: {test_counts['malicious']}")

# Collect all syscalls for building vocabulary
all_syscalls = []
for path, _ in train_files + test_files:
    for run in load_runs(path):
        all_syscalls.extend(run)

# Build syscall encoder (add PAD token at index 0)
syscall_encoder = LabelEncoder()
syscall_encoder.fit(all_syscalls)
vocab_size = len(syscall_encoder.classes_) + 1  # +1 for PAD token
PAD_IDX = 0
print(f"\nVocabulary size: {vocab_size} (including PAD)")

Train files: 21, Test files: 9

Training set:
  Total runs: 1986
  Benign runs: 1484
  Malicious runs: 502

Test set:
  Total runs: 810
  Benign runs: 489
  Malicious runs: 321

Vocabulary size: 81 (including PAD)


In [37]:
class SyscallDataset(Dataset):
    def __init__(self, file_label_pairs, encoder, max_len):
        self.sequences = []
        self.labels = []
        label_map = {'benign': 0, 'malicious': 1}
        
        for path, label in file_label_pairs:
            runs = load_runs(path)
            for run_syscalls in runs:
                # Encode syscalls (+1 to reserve 0 for PAD)
                encoded = encoder.transform(run_syscalls) + 1
                
                # Truncate or pad to max_len
                if len(encoded) > max_len:
                    encoded = encoded[:max_len]
                else:
                    encoded = np.pad(encoded, (0, max_len - len(encoded)), constant_values=PAD_IDX)
                
                self.sequences.append(encoded)
                self.labels.append(label_map[label])
        
        self.sequences = np.array(self.sequences)
        self.labels = np.array(self.labels)
    
    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, idx):
        return (torch.tensor(self.sequences[idx], dtype=torch.long),
                torch.tensor(self.labels[idx], dtype=torch.long))

In [38]:
class CNN1D(nn.Module):
    def __init__(self, vocab_size, embed_dim=64, num_filters=64, kernel_sizes=[3, 5, 7]):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=PAD_IDX)
        
        self.convs = nn.ModuleList([
            nn.Conv1d(embed_dim, num_filters, k, padding=k//2)
            for k in kernel_sizes
        ])
        
        self.fc = nn.Sequential(
            nn.Linear(num_filters * len(kernel_sizes), 64),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(64, 2)
        )
    
    def forward(self, x):
        x = self.embedding(x)  # (batch, seq_len, embed_dim)
        x = x.permute(0, 2, 1)  # (batch, embed_dim, seq_len)
        
        conv_outs = []
        for conv in self.convs:
            c = torch.relu(conv(x))
            c = torch.max(c, dim=2)[0]  # Global max pooling
            conv_outs.append(c)
        
        x = torch.cat(conv_outs, dim=1)
        return self.fc(x)

In [39]:
# Run experiments with different window sizes
results = []

for window_size in WINDOW_SIZES:
    print(f"\n{'='*60}")
    print(f"EXPERIMENT: Window Size = {window_size}")
    print(f"{'='*60}")
    
    # Create datasets
    train_dataset = SyscallDataset(train_files, syscall_encoder, window_size)
    test_dataset = SyscallDataset(test_files, syscall_encoder, window_size)
    
    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)
    
    print(f"Train samples: {len(train_dataset)}, Test samples: {len(test_dataset)}")
    
    # Create model
    model = CNN1D(vocab_size).to(DEVICE)
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    
    # Training
    print(f"\nTraining...")
    train_start_time = time.time()
    for epoch in range(EPOCHS):
        model.train()
        total_loss = 0
        correct = 0
        total = 0
        
        for x, y in train_loader:
            x, y = x.to(DEVICE), y.to(DEVICE)
            
            optimizer.zero_grad()
            outputs = model(x)
            loss = criterion(outputs, y)
            loss.backward()
            optimizer.step()
            
            total_loss += loss.item()
            _, predicted = outputs.max(1)
            correct += (predicted == y).sum().item()
            total += y.size(0)
        
        train_acc = correct / total
        avg_loss = total_loss / len(train_loader)
        
        if (epoch + 1) % 5 == 0 or epoch == 0:
            print(f"Epoch {epoch+1}/{EPOCHS} - Loss: {avg_loss:.4f}, Acc: {train_acc:.4f}")
    train_time = time.time() - train_start_time
    print(f"Training time: {train_time:.2f}s")
    
    # Testing
    print(f"\nEvaluating...")
    test_start_time = time.time()
    model.eval()
    all_preds = []
    all_labels = []
    
    with torch.no_grad():
        for x, y in test_loader:
            x = x.to(DEVICE)
            outputs = model(x)
            _, predicted = outputs.max(1)
            all_preds.extend(predicted.cpu().numpy())
            all_labels.extend(y.numpy())
    test_time = time.time() - test_start_time
    print(f"Test time: {test_time:.2f}s")
    
    # Calculate metrics
    cm = confusion_matrix(all_labels, all_preds)
    tn, fp, fn, tp = cm.ravel()
    detection_rate = tp / (tp + fn) if (tp + fn) > 0 else 0.0
    false_positive_rate = fp / (fp + tn) if (fp + tn) > 0 else 0.0
    f1_weighted = f1_score(all_labels, all_preds, average='weighted')
    
    # Store results
    results.append({
        'Window Size': window_size,
        'Detection Rate': detection_rate,
        'False Positive Rate': false_positive_rate,
        'F1-score (weighted)': f1_weighted,
        'Train Time (s)': train_time,
        'Test Time (s)': test_time
    })
    
    # Print detailed results
    print(f"\nClassification Report:")
    print(classification_report(all_labels, all_preds, target_names=['benign', 'malicious']))
    
    labels = ['benign', 'malicious']
    cm_df = pd.DataFrame(cm, index=[f'True: {l}' for l in labels], columns=[f'Pred: {l}' for l in labels])
    print(f"Confusion Matrix:")
    print(cm_df)
    
    print(f"\nDetection Rate: {detection_rate:.4f}")
    print(f"False Positive Rate: {false_positive_rate:.4f}")
    print(f"F1-score (weighted): {f1_weighted:.4f}")


EXPERIMENT: Window Size = 250
Train samples: 1986, Test samples: 810

Training...
Epoch 1/20 - Loss: 0.1934, Acc: 0.9235
Epoch 5/20 - Loss: 0.0071, Acc: 0.9965
Epoch 10/20 - Loss: 0.0017, Acc: 0.9995
Epoch 15/20 - Loss: 0.0001, Acc: 1.0000
Epoch 20/20 - Loss: 0.0001, Acc: 1.0000
Training time: 9.03s

Evaluating...
Test time: 0.07s

Classification Report:
              precision    recall  f1-score   support

      benign       0.99      0.82      0.90       489
   malicious       0.78      0.99      0.87       321

    accuracy                           0.89       810
   macro avg       0.89      0.90      0.89       810
weighted avg       0.91      0.89      0.89       810

Confusion Matrix:
                 Pred: benign  Pred: malicious
True: benign              400               89
True: malicious             3              318

Detection Rate: 0.9907
False Positive Rate: 0.1820
F1-score (weighted): 0.8877

EXPERIMENT: Window Size = 500
Train samples: 1986, Test samples: 810

Train

In [40]:
# Summary Results Table
print("\n" + "="*80)
print("SUMMARY OF RESULTS")
print("="*80)

results_df = pd.DataFrame(results)
results_df['Detection Rate'] = results_df['Detection Rate'].apply(lambda x: f"{x:.4f}")
results_df['False Positive Rate'] = results_df['False Positive Rate'].apply(lambda x: f"{x:.4f}")
results_df['F1-score (weighted)'] = results_df['F1-score (weighted)'].apply(lambda x: f"{x:.4f}")
results_df['Train Time (s)'] = results_df['Train Time (s)'].apply(lambda x: f"{x:.2f}")
results_df['Test Time (s)'] = results_df['Test Time (s)'].apply(lambda x: f"{x:.2f}")

print(results_df.to_string(index=False))


SUMMARY OF RESULTS
 Window Size Detection Rate False Positive Rate F1-score (weighted) Train Time (s) Test Time (s)
         250         0.9907              0.1820              0.8877           9.03          0.07
         500         0.9969              0.0327              0.9791          14.67          0.10
        1000         1.0000              0.0695              0.9583          26.58          0.18
        2000         1.0000              0.0307              0.9815         184.97          3.61


## Feature: Return values only

In [1]:
import sys
sys.path.insert(0, '../../configs')
from config_loader import get_split_with_labels

import time
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, f1_score
import warnings
warnings.filterwarnings('ignore')

# Config
SPLIT = '70'
WINDOW_SIZES = [250, 500, 1000, 2000]  # Different sliding window lengths to test
BATCH_SIZE = 32
EPOCHS = 20
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {DEVICE}")


Using device: cpu


In [2]:
# Load data
train_files, test_files = get_split_with_labels(SPLIT)
print(f"Train files: {len(train_files)}, Test files: {len(test_files)}")

def load_runs_retval(file_path):
    """Load return values grouped by run."""
    df = pd.read_csv(file_path)
    runs = df.groupby('run')['Ret'].apply(list).tolist()
    return runs

# Count runs per label for training and test sets
def count_runs_per_label(file_label_pairs):
    """Count total runs per label."""
    counts = {'benign': 0, 'malicious': 0}
    for path, label in file_label_pairs:
        runs = load_runs_retval(path)
        counts[label] += len(runs)
    return counts

train_counts = count_runs_per_label(train_files)
test_counts = count_runs_per_label(test_files)

print(f"\nTraining set:")
print(f"  Total runs: {sum(train_counts.values())}")
print(f"  Benign runs: {train_counts['benign']}")
print(f"  Malicious runs: {train_counts['malicious']}")

print(f"\nTest set:")
print(f"  Total runs: {sum(test_counts.values())}")
print(f"  Benign runs: {test_counts['benign']}")
print(f"  Malicious runs: {test_counts['malicious']}")

# Collect all return values for building vocabulary
all_retvals = []
for path, _ in train_files + test_files:
    for run in load_runs_retval(path):
        all_retvals.extend(run)

# Build return value encoder (add PAD token at index 0)
retval_encoder = LabelEncoder()
retval_encoder.fit(all_retvals)
vocab_size = len(retval_encoder.classes_) + 1  # +1 for PAD token
PAD_IDX = 0
print(f"\nVocabulary size: {vocab_size} (including PAD)")


Train files: 21, Test files: 9

Training set:
  Total runs: 1986
  Benign runs: 1484
  Malicious runs: 502

Test set:
  Total runs: 810
  Benign runs: 489
  Malicious runs: 321

Vocabulary size: 42586 (including PAD)


In [3]:
class RetvalDataset(Dataset):
    def __init__(self, file_label_pairs, encoder, max_len):
        self.sequences = []
        self.labels = []
        label_map = {'benign': 0, 'malicious': 1}
        
        for path, label in file_label_pairs:
            runs = load_runs_retval(path)
            for run_retvals in runs:
                # Encode return values (+1 to reserve 0 for PAD)
                encoded = encoder.transform(run_retvals) + 1
                
                # Truncate or pad to max_len
                if len(encoded) > max_len:
                    encoded = encoded[:max_len]
                else:
                    encoded = np.pad(encoded, (0, max_len - len(encoded)), constant_values=PAD_IDX)
                
                self.sequences.append(encoded)
                self.labels.append(label_map[label])
        
        self.sequences = np.array(self.sequences)
        self.labels = np.array(self.labels)
    
    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, idx):
        return (torch.tensor(self.sequences[idx], dtype=torch.long),
                torch.tensor(self.labels[idx], dtype=torch.long))


In [4]:
class CNN1D(nn.Module):
    def __init__(self, vocab_size, embed_dim=64, num_filters=64, kernel_sizes=[3, 5, 7]):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=PAD_IDX)
        
        self.convs = nn.ModuleList([
            nn.Conv1d(embed_dim, num_filters, k, padding=k//2)
            for k in kernel_sizes
        ])
        
        self.fc = nn.Sequential(
            nn.Linear(num_filters * len(kernel_sizes), 64),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(64, 2)
        )
    
    def forward(self, x):
        x = self.embedding(x)  # (batch, seq_len, embed_dim)
        x = x.permute(0, 2, 1)  # (batch, embed_dim, seq_len)
        
        conv_outs = []
        for conv in self.convs:
            c = torch.relu(conv(x))
            c = torch.max(c, dim=2)[0]  # Global max pooling
            conv_outs.append(c)
        
        x = torch.cat(conv_outs, dim=1)
        return self.fc(x)


In [5]:
# Run experiments with different window sizes
results = []

for window_size in WINDOW_SIZES:
    print(f"\n{'='*60}")
    print(f"EXPERIMENT: Window Size = {window_size}")
    print(f"{'='*60}")
    
    # Create datasets
    train_dataset = RetvalDataset(train_files, retval_encoder, window_size)
    test_dataset = RetvalDataset(test_files, retval_encoder, window_size)
    
    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)
    
    print(f"Train samples: {len(train_dataset)}, Test samples: {len(test_dataset)}")
    
    # Create model
    model = CNN1D(vocab_size).to(DEVICE)
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    
    # Training
    print(f"\nTraining...")
    train_start_time = time.time()
    for epoch in range(EPOCHS):
        model.train()
        total_loss = 0
        correct = 0
        total = 0
        
        for x, y in train_loader:
            x, y = x.to(DEVICE), y.to(DEVICE)
            
            optimizer.zero_grad()
            outputs = model(x)
            loss = criterion(outputs, y)
            loss.backward()
            optimizer.step()
            
            total_loss += loss.item()
            _, predicted = outputs.max(1)
            correct += (predicted == y).sum().item()
            total += y.size(0)
        
        train_acc = correct / total
        avg_loss = total_loss / len(train_loader)
        
        if (epoch + 1) % 5 == 0 or epoch == 0:
            print(f"Epoch {epoch+1}/{EPOCHS} - Loss: {avg_loss:.4f}, Acc: {train_acc:.4f}")
    train_time = time.time() - train_start_time
    print(f"Training time: {train_time:.2f}s")
    
    # Testing
    print(f"\nEvaluating...")
    test_start_time = time.time()
    model.eval()
    all_preds = []
    all_labels = []
    
    with torch.no_grad():
        for x, y in test_loader:
            x = x.to(DEVICE)
            outputs = model(x)
            _, predicted = outputs.max(1)
            all_preds.extend(predicted.cpu().numpy())
            all_labels.extend(y.numpy())
    test_time = time.time() - test_start_time
    print(f"Test time: {test_time:.2f}s")
    
    # Calculate metrics
    cm = confusion_matrix(all_labels, all_preds)
    tn, fp, fn, tp = cm.ravel()
    detection_rate = tp / (tp + fn) if (tp + fn) > 0 else 0.0
    false_positive_rate = fp / (fp + tn) if (fp + tn) > 0 else 0.0
    f1_weighted = f1_score(all_labels, all_preds, average='weighted')
    
    # Store results
    results.append({
        'Window Size': window_size,
        'Detection Rate': detection_rate,
        'False Positive Rate': false_positive_rate,
        'F1-score (weighted)': f1_weighted,
        'Train Time (s)': train_time,
        'Test Time (s)': test_time
    })
    
    # Print detailed results
    print(f"\nClassification Report:")
    print(classification_report(all_labels, all_preds, target_names=['benign', 'malicious']))
    
    labels = ['benign', 'malicious']
    cm_df = pd.DataFrame(cm, index=[f'True: {l}' for l in labels], columns=[f'Pred: {l}' for l in labels])
    print(f"Confusion Matrix:")
    print(cm_df)
    
    print(f"\nDetection Rate: {detection_rate:.4f}")
    print(f"False Positive Rate: {false_positive_rate:.4f}")
    print(f"F1-score (weighted): {f1_weighted:.4f}")



EXPERIMENT: Window Size = 250
Train samples: 1986, Test samples: 810

Training...
Epoch 1/20 - Loss: 0.2172, Acc: 0.9129
Epoch 5/20 - Loss: 0.0023, Acc: 0.9995
Epoch 10/20 - Loss: 0.0004, Acc: 1.0000
Epoch 15/20 - Loss: 0.0002, Acc: 1.0000
Epoch 20/20 - Loss: 0.0001, Acc: 1.0000
Training time: 83.13s

Evaluating...
Test time: 0.26s

Classification Report:
              precision    recall  f1-score   support

      benign       0.93      0.96      0.94       489
   malicious       0.93      0.88      0.91       321

    accuracy                           0.93       810
   macro avg       0.93      0.92      0.93       810
weighted avg       0.93      0.93      0.93       810

Confusion Matrix:
                 Pred: benign  Pred: malicious
True: benign              469               20
True: malicious            37              284

Detection Rate: 0.8847
False Positive Rate: 0.0409
F1-score (weighted): 0.9293

EXPERIMENT: Window Size = 500
Train samples: 1986, Test samples: 810

Trai

In [6]:
# Summary Results Table
print("\n" + "="*80)
print("SUMMARY OF RESULTS")
print("="*80)

results_df = pd.DataFrame(results)
results_df['Detection Rate'] = results_df['Detection Rate'].apply(lambda x: f"{x:.4f}")
results_df['False Positive Rate'] = results_df['False Positive Rate'].apply(lambda x: f"{x:.4f}")
results_df['F1-score (weighted)'] = results_df['F1-score (weighted)'].apply(lambda x: f"{x:.4f}")
results_df['Train Time (s)'] = results_df['Train Time (s)'].apply(lambda x: f"{x:.2f}")
results_df['Test Time (s)'] = results_df['Test Time (s)'].apply(lambda x: f"{x:.2f}")

print(results_df.to_string(index=False))



SUMMARY OF RESULTS
 Window Size Detection Rate False Positive Rate F1-score (weighted) Train Time (s) Test Time (s)
         250         0.8847              0.0409              0.9293          83.13          0.26
         500         0.9969              0.0020              0.9975         140.61          0.43
        1000         0.9969              0.0532              0.9669         232.78          0.74
        2000         1.0000              0.0941              0.9437         527.77          3.24


## Feature: Parameters only

In [1]:
import sys
sys.path.insert(0, '../../configs')
from config_loader import get_split_with_labels

import time
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import classification_report, confusion_matrix, f1_score
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

# Config
SPLIT = '70'
WINDOW_SIZES = [2000]  # Different sliding window lengths to test
# WINDOW_SIZES = [250, 500, 1000, 2000]  # Different sliding window lengths to test
BATCH_SIZE = 32
EPOCHS = 20
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {DEVICE}")

# Load sentence transformer model for semantic embeddings
print("Loading sentence transformer model...")
sentence_model = SentenceTransformer('all-MiniLM-L6-v2', device='cpu')
EMBEDDING_DIM = sentence_model.get_sentence_embedding_dimension()
print(f"Sentence embedding dimension: {EMBEDDING_DIM}")


  from .autonotebook import tqdm as notebook_tqdm


Using device: cpu
Loading sentence transformer model...
Sentence embedding dimension: 384


In [2]:
# Load data
train_files, test_files = get_split_with_labels(SPLIT)
print(f"Train files: {len(train_files)}, Test files: {len(test_files)}")

def load_runs_params_raw(file_path):
    """Load raw parameter strings grouped by run (list of param strings per run).
    
    Returns list of runs, where each run is a list of parameter strings (one per syscall).
    """
    df = pd.read_csv(file_path)
    runs = []
    for run_id, group in df.groupby('run'):
        # Keep parameters as list of strings (one per syscall)
        run_params = group['parameters'].tolist()
        runs.append(run_params)
    return runs

# Count runs per label for training and test sets
def count_runs_per_label(file_label_pairs):
    """Count total runs per label."""
    counts = {'benign': 0, 'malicious': 0}
    for path, label in file_label_pairs:
        runs = load_runs_params_raw(path)
        counts[label] += len(runs)
    return counts

train_counts = count_runs_per_label(train_files)
test_counts = count_runs_per_label(test_files)

print(f"\nTraining set:")
print(f"  Total runs: {sum(train_counts.values())}")
print(f"  Benign runs: {train_counts['benign']}")
print(f"  Malicious runs: {train_counts['malicious']}")

print(f"\nTest set:")
print(f"  Total runs: {sum(test_counts.values())}")
print(f"  Benign runs: {test_counts['benign']}")
print(f"  Malicious runs: {test_counts['malicious']}")


Train files: 21, Test files: 9

Training set:
  Total runs: 1986
  Benign runs: 1484
  Malicious runs: 502

Test set:
  Total runs: 810
  Benign runs: 489
  Malicious runs: 321


In [3]:
# Pre-compute sentence embeddings for all unique parameter strings
print("Collecting unique parameter strings...")

unique_params = set()
for path, _ in train_files + test_files:
    for run_params in load_runs_params_raw(path):
        for param_str in run_params:
            # Convert to string and handle NaN
            if pd.isna(param_str):
                unique_params.add('<EMPTY>')
            else:
                unique_params.add(str(param_str))

unique_params = list(unique_params)
print(f"Unique parameter strings: {len(unique_params)}")

# Compute embeddings for all unique strings in batches
print("Computing sentence embeddings (this may take a few minutes)...")
param_embeddings = sentence_model.encode(
    unique_params, 
    show_progress_bar=True, 
    batch_size=256,
    convert_to_numpy=True
)

# Create a mapping from parameter string to embedding
param_to_embedding = {param: emb for param, emb in zip(unique_params, param_embeddings)}
print(f"Embeddings computed. Shape per embedding: {EMBEDDING_DIM}")

# Create a zero embedding for padding
PAD_EMBEDDING = np.zeros(EMBEDDING_DIM, dtype=np.float32)


Collecting unique parameter strings...
Unique parameter strings: 266019
Computing sentence embeddings (this may take a few minutes)...


Batches: 100%|██████████| 1040/1040 [19:58<00:00,  1.15s/it]


Embeddings computed. Shape per embedding: 384


In [4]:
class ParamsEmbeddingDataset(Dataset):
    """Dataset that uses pre-computed sentence embeddings for parameter strings.
    
    Instead of tokenizing each parameter string into multiple tokens,
    we embed each parameter string as a single vector using a sentence transformer.
    This reduces sequence length from window_size * tokens_per_syscall to just window_size.
    """
    def __init__(self, file_label_pairs, param_to_embedding, window_size, embed_dim, pad_embedding):
        """
        Args:
            file_label_pairs: List of (file_path, label) tuples
            param_to_embedding: Dict mapping parameter strings to embeddings
            window_size: Number of syscalls (parameter strings) to consider from each run
            embed_dim: Dimension of sentence embeddings
            pad_embedding: Zero embedding for padding
        """
        self.embeddings = []
        self.labels = []
        self.window_size = window_size
        self.embed_dim = embed_dim
        label_map = {'benign': 0, 'malicious': 1}
        
        for path, label in file_label_pairs:
            runs = load_runs_params_raw(path)
            for run_params in runs:
                # Take first window_size parameter strings from this run
                params_to_use = run_params[:window_size]
                
                # Get embedding for each parameter string
                run_embeddings = []
                for param_str in params_to_use:
                    if pd.isna(param_str):
                        key = '<EMPTY>'
                    else:
                        key = str(param_str)
                    run_embeddings.append(param_to_embedding[key])
                
                # Pad to window_size if needed
                while len(run_embeddings) < window_size:
                    run_embeddings.append(pad_embedding)
                
                self.embeddings.append(np.array(run_embeddings, dtype=np.float32))
                self.labels.append(label_map[label])
        
        self.embeddings = np.array(self.embeddings)  # Shape: (num_samples, window_size, embed_dim)
        self.labels = np.array(self.labels)
    
    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, idx):
        return (torch.tensor(self.embeddings[idx], dtype=torch.float32),
                torch.tensor(self.labels[idx], dtype=torch.long))


In [5]:
class CNN1DForEmbeddings(nn.Module):
    """1D CNN that takes pre-computed embeddings as input.
    
    Unlike the standard version that has an embedding layer,
    this version expects inputs to already be embedded (float tensors).
    """
    def __init__(self, input_dim, num_filters=64, kernel_sizes=[3, 5, 7]):
        super().__init__()
        
        self.convs = nn.ModuleList([
            nn.Conv1d(input_dim, num_filters, k, padding=k//2)
            for k in kernel_sizes
        ])
        
        self.fc = nn.Sequential(
            nn.Linear(num_filters * len(kernel_sizes), 64),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(64, 2)
        )
    
    def forward(self, x):
        # x shape: (batch, seq_len, input_dim)
        x = x.permute(0, 2, 1)  # (batch, input_dim, seq_len)
        
        conv_outs = []
        for conv in self.convs:
            c = torch.relu(conv(x))
            c = torch.max(c, dim=2)[0]  # Global max pooling
            conv_outs.append(c)
        
        x = torch.cat(conv_outs, dim=1)
        return self.fc(x)


In [6]:
# Run experiments with different window sizes (window_size = number of syscalls)
# With sentence embeddings, sequence length = window_size (instead of window_size * tokens_per_syscall)
results = []

for window_size in WINDOW_SIZES:
    print(f"\n{'='*60}")
    print(f"EXPERIMENT: Window Size = {window_size} syscalls")
    print(f"{'='*60}")
    
    # Create datasets with pre-computed embeddings
    train_dataset = ParamsEmbeddingDataset(
        train_files, param_to_embedding, window_size, EMBEDDING_DIM, PAD_EMBEDDING
    )
    test_dataset = ParamsEmbeddingDataset(
        test_files, param_to_embedding, window_size, EMBEDDING_DIM, PAD_EMBEDDING
    )
    
    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)
    
    print(f"Train samples: {len(train_dataset)}, Test samples: {len(test_dataset)}")
    print(f"Sequence length: {window_size} (1 embedding per syscall parameter)")
    print(f"Input embedding dim: {EMBEDDING_DIM}")
    
    # Create model - takes pre-computed embeddings as input
    model = CNN1DForEmbeddings(input_dim=EMBEDDING_DIM, num_filters=64).to(DEVICE)
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    
    print(f"Model parameters: {sum(p.numel() for p in model.parameters()):,}")
    
    # Training
    print(f"\nTraining...")
    train_start_time = time.time()
    for epoch in range(EPOCHS):
        model.train()
        total_loss = 0
        correct = 0
        total = 0
        
        pbar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{EPOCHS}", leave=True)
        for x, y in pbar:
            x, y = x.to(DEVICE), y.to(DEVICE)
            
            optimizer.zero_grad()
            outputs = model(x)
            loss = criterion(outputs, y)
            loss.backward()
            optimizer.step()
            
            total_loss += loss.item()
            _, predicted = outputs.max(1)
            correct += (predicted == y).sum().item()
            total += y.size(0)
            
            # Update progress bar with current metrics
            pbar.set_postfix({'loss': f'{total_loss/total:.4f}', 'acc': f'{correct/total:.4f}'})
        
        train_acc = correct / total
        avg_loss = total_loss / len(train_loader)
    train_time = time.time() - train_start_time
    print(f"Training time: {train_time:.2f}s")
    
    # Testing
    print(f"\nEvaluating...")
    test_start_time = time.time()
    model.eval()
    all_preds = []
    all_labels = []
    
    with torch.no_grad():
        for x, y in test_loader:
            x = x.to(DEVICE)
            outputs = model(x)
            _, predicted = outputs.max(1)
            all_preds.extend(predicted.cpu().numpy())
            all_labels.extend(y.numpy())
    test_time = time.time() - test_start_time
    print(f"Test time: {test_time:.2f}s")
    
    # Calculate metrics
    cm = confusion_matrix(all_labels, all_preds)
    tn, fp, fn, tp = cm.ravel()
    detection_rate = tp / (tp + fn) if (tp + fn) > 0 else 0.0
    false_positive_rate = fp / (fp + tn) if (fp + tn) > 0 else 0.0
    f1_weighted = f1_score(all_labels, all_preds, average='weighted')
    
    # Store results
    results.append({
        'Window Size (syscalls)': window_size,
        'Seq Length': window_size,
        'Detection Rate': detection_rate,
        'False Positive Rate': false_positive_rate,
        'F1-score (weighted)': f1_weighted,
        'Train Time (s)': train_time,
        'Test Time (s)': test_time
    })
    
    # Print detailed results
    print(f"\nClassification Report:")
    print(classification_report(all_labels, all_preds, target_names=['benign', 'malicious']))
    
    labels = ['benign', 'malicious']
    cm_df = pd.DataFrame(cm, index=[f'True: {l}' for l in labels], columns=[f'Pred: {l}' for l in labels])
    print(f"Confusion Matrix:")
    print(cm_df)
    
    print(f"\nDetection Rate: {detection_rate:.4f}")
    print(f"False Positive Rate: {false_positive_rate:.4f}")
    print(f"F1-score (weighted): {f1_weighted:.4f}")
    
    # Clear GPU memory between experiments
    del model
    torch.cuda.empty_cache()



EXPERIMENT: Window Size = 2000 syscalls
Train samples: 1986, Test samples: 810
Sequence length: 2000 (1 embedding per syscall parameter)
Input embedding dim: 384
Model parameters: 381,314

Training...


Epoch 1/20: 100%|██████████| 63/63 [00:37<00:00,  1.70it/s, loss=0.0076, acc=0.9013]
Epoch 2/20: 100%|██████████| 63/63 [00:42<00:00,  1.47it/s, loss=0.0003, acc=0.9985]
Epoch 3/20: 100%|██████████| 63/63 [00:45<00:00,  1.38it/s, loss=0.0002, acc=0.9985]
Epoch 4/20: 100%|██████████| 63/63 [00:45<00:00,  1.37it/s, loss=0.0001, acc=0.9990]
Epoch 5/20: 100%|██████████| 63/63 [00:49<00:00,  1.27it/s, loss=0.0001, acc=0.9995]
Epoch 6/20: 100%|██████████| 63/63 [00:49<00:00,  1.28it/s, loss=0.0000, acc=1.0000]
Epoch 7/20: 100%|██████████| 63/63 [00:50<00:00,  1.25it/s, loss=0.0000, acc=1.0000]
Epoch 8/20: 100%|██████████| 63/63 [00:53<00:00,  1.17it/s, loss=0.0000, acc=1.0000]
Epoch 9/20: 100%|██████████| 63/63 [00:55<00:00,  1.14it/s, loss=0.0000, acc=1.0000]
Epoch 10/20: 100%|██████████| 63/63 [00:54<00:00,  1.15it/s, loss=0.0000, acc=1.0000]
Epoch 11/20: 100%|██████████| 63/63 [00:54<00:00,  1.16it/s, loss=0.0000, acc=1.0000]
Epoch 12/20: 100%|██████████| 63/63 [00:56<00:00,  1.11it/s, lo

Training time: 1035.87s

Evaluating...
Test time: 7.39s

Classification Report:
              precision    recall  f1-score   support

      benign       1.00      0.96      0.98       489
   malicious       0.94      1.00      0.97       321

    accuracy                           0.98       810
   macro avg       0.97      0.98      0.98       810
weighted avg       0.98      0.98      0.98       810

Confusion Matrix:
                 Pred: benign  Pred: malicious
True: benign              470               19
True: malicious             0              321

Detection Rate: 1.0000
False Positive Rate: 0.0389
F1-score (weighted): 0.9766


#### Earlier results 250 -> 1000 window size

```plaintext
============================================================
EXPERIMENT: Window Size = 250 syscalls
============================================================
Train samples: 1986, Test samples: 810
Sequence length: 250 (1 embedding per syscall parameter)
Input embedding dim: 384
Model parameters: 381,314

Training...
Epoch 1/20: 100%|██████████| 63/63 [00:04<00:00, 13.63it/s, loss=0.0087, acc=0.8832]
Epoch 2/20: 100%|██████████| 63/63 [00:05<00:00, 12.52it/s, loss=0.0007, acc=0.9955]
Epoch 3/20: 100%|██████████| 63/63 [00:05<00:00, 11.98it/s, loss=0.0006, acc=0.9935]
Epoch 4/20: 100%|██████████| 63/63 [00:05<00:00, 11.24it/s, loss=0.0004, acc=0.9945]
Epoch 5/20: 100%|██████████| 63/63 [00:05<00:00, 10.65it/s, loss=0.0003, acc=0.9965]
Epoch 6/20: 100%|██████████| 63/63 [00:06<00:00, 10.10it/s, loss=0.0002, acc=0.9970]
Epoch 7/20: 100%|██████████| 63/63 [00:06<00:00,  9.90it/s, loss=0.0002, acc=0.9990]
Epoch 8/20: 100%|██████████| 63/63 [00:06<00:00, 10.01it/s, loss=0.0002, acc=0.9985]
Epoch 9/20: 100%|██████████| 63/63 [00:06<00:00,  9.65it/s, loss=0.0002, acc=0.9975]
Epoch 10/20: 100%|██████████| 63/63 [00:06<00:00,  9.48it/s, loss=0.0001, acc=0.9995]
Epoch 11/20: 100%|██████████| 63/63 [00:07<00:00,  8.87it/s, loss=0.0001, acc=0.9990]
Epoch 12/20: 100%|██████████| 63/63 [00:07<00:00,  8.99it/s, loss=0.0001, acc=0.9990]
Epoch 13/20: 100%|██████████| 63/63 [00:07<00:00,  8.97it/s, loss=0.0000, acc=1.0000]
Epoch 14/20: 100%|██████████| 63/63 [00:06<00:00,  9.18it/s, loss=0.0000, acc=1.0000]
Epoch 15/20: 100%|██████████| 63/63 [00:07<00:00,  8.81it/s, loss=0.0000, acc=1.0000]
Epoch 16/20: 100%|██████████| 63/63 [00:07<00:00,  8.66it/s, loss=0.0000, acc=1.0000]
Epoch 17/20: 100%|██████████| 63/63 [00:07<00:00,  8.78it/s, loss=0.0000, acc=1.0000]
Epoch 18/20: 100%|██████████| 63/63 [00:07<00:00,  8.78it/s, loss=0.0000, acc=1.0000]
Epoch 19/20: 100%|██████████| 63/63 [00:07<00:00,  8.81it/s, loss=0.0000, acc=1.0000]
Epoch 20/20: 100%|██████████| 63/63 [00:07<00:00,  8.62it/s, loss=0.0000, acc=1.0000]
Training time: 129.75s

Evaluating...
Test time: 0.86s

Classification Report:
              precision    recall  f1-score   support

      benign       0.99      0.95      0.97       489
   malicious       0.93      0.99      0.96       321

    accuracy                           0.97       810
   macro avg       0.96      0.97      0.97       810
weighted avg       0.97      0.97      0.97       810

Confusion Matrix:
                 Pred: benign  Pred: malicious
True: benign              465               24
True: malicious             3              318

Detection Rate: 0.9907
False Positive Rate: 0.0491
F1-score (weighted): 0.9668

============================================================
...
Input embedding dim: 384
Model parameters: 381,314

Training...
Output is truncated. View as a scrollable element or open in a text editor. Adjust cell output settings...
Epoch 1/20: 100%|██████████| 63/63 [00:08<00:00,  7.22it/s, loss=0.0080, acc=0.8988]
Epoch 2/20: 100%|██████████| 63/63 [00:10<00:00,  6.30it/s, loss=0.0004, acc=0.9980]
Epoch 3/20: 100%|██████████| 63/63 [00:10<00:00,  5.85it/s, loss=0.0002, acc=0.9985]
Epoch 4/20: 100%|██████████| 63/63 [00:11<00:00,  5.49it/s, loss=0.0001, acc=0.9985]
Epoch 5/20: 100%|██████████| 63/63 [00:11<00:00,  5.43it/s, loss=0.0001, acc=0.9990]
Epoch 6/20: 100%|██████████| 63/63 [00:12<00:00,  5.04it/s, loss=0.0000, acc=0.9995]
Epoch 7/20: 100%|██████████| 63/63 [00:12<00:00,  4.89it/s, loss=0.0000, acc=1.0000]
Epoch 8/20: 100%|██████████| 63/63 [00:12<00:00,  4.94it/s, loss=0.0001, acc=0.9995]
Epoch 9/20: 100%|██████████| 63/63 [00:13<00:00,  4.68it/s, loss=0.0000, acc=1.0000]
Epoch 10/20: 100%|██████████| 63/63 [00:13<00:00,  4.61it/s, loss=0.0000, acc=1.0000]
Epoch 11/20: 100%|██████████| 63/63 [00:14<00:00,  4.47it/s, loss=0.0000, acc=1.0000]
Epoch 12/20: 100%|██████████| 63/63 [00:13<00:00,  4.51it/s, loss=0.0000, acc=1.0000]
Epoch 13/20: 100%|██████████| 63/63 [00:14<00:00,  4.48it/s, loss=0.0000, acc=1.0000]
Epoch 14/20: 100%|██████████| 63/63 [00:13<00:00,  4.57it/s, loss=0.0000, acc=1.0000]
Epoch 15/20: 100%|██████████| 63/63 [00:13<00:00,  4.52it/s, loss=0.0000, acc=1.0000]
Epoch 16/20: 100%|██████████| 63/63 [00:14<00:00,  4.39it/s, loss=0.0000, acc=1.0000]
Epoch 17/20: 100%|██████████| 63/63 [00:14<00:00,  4.46it/s, loss=0.0000, acc=1.0000]
Epoch 18/20: 100%|██████████| 63/63 [00:14<00:00,  4.42it/s, loss=0.0000, acc=1.0000]
Epoch 19/20: 100%|██████████| 63/63 [00:14<00:00,  4.44it/s, loss=0.0000, acc=1.0000]
Epoch 20/20: 100%|██████████| 63/63 [00:13<00:00,  4.54it/s, loss=0.0000, acc=1.0000]
Training time: 258.57s

Evaluating...
Test time: 1.69s

Classification Report:
              precision    recall  f1-score   support

      benign       1.00      0.99      1.00       489
   malicious       0.99      1.00      0.99       321

    accuracy                           1.00       810
   macro avg       0.99      1.00      0.99       810
weighted avg       1.00      1.00      1.00       810

Confusion Matrix:
                 Pred: benign  Pred: malicious
True: benign              486                3
True: malicious             1              320

Detection Rate: 0.9969
False Positive Rate: 0.0061
F1-score (weighted): 0.9951

============================================================
...
Input embedding dim: 384
Model parameters: 381,314

Training...
Output is truncated. View as a scrollable element or open in a text editor. Adjust cell output settings...
Epoch 1/20: 100%|██████████| 63/63 [00:17<00:00,  3.67it/s, loss=0.0078, acc=0.8902]
Epoch 2/20: 100%|██████████| 63/63 [00:21<00:00,  2.91it/s, loss=0.0003, acc=0.9985]
Epoch 3/20: 100%|██████████| 63/63 [00:21<00:00,  2.88it/s, loss=0.0002, acc=0.9990]
Epoch 4/20: 100%|██████████| 63/63 [00:23<00:00,  2.66it/s, loss=0.0001, acc=0.9990]
Epoch 5/20: 100%|██████████| 63/63 [00:24<00:00,  2.53it/s, loss=0.0001, acc=0.9990]
Epoch 6/20: 100%|██████████| 63/63 [00:25<00:00,  2.50it/s, loss=0.0000, acc=0.9995]
Epoch 7/20: 100%|██████████| 63/63 [00:25<00:00,  2.49it/s, loss=0.0000, acc=1.0000]
Epoch 8/20: 100%|██████████| 63/63 [00:26<00:00,  2.40it/s, loss=0.0000, acc=1.0000]
Epoch 9/20: 100%|██████████| 63/63 [00:26<00:00,  2.39it/s, loss=0.0000, acc=1.0000]
Epoch 10/20: 100%|██████████| 63/63 [00:26<00:00,  2.36it/s, loss=0.0000, acc=1.0000]
Epoch 11/20: 100%|██████████| 63/63 [00:26<00:00,  2.35it/s, loss=0.0000, acc=1.0000]
Epoch 12/20: 100%|██████████| 63/63 [00:26<00:00,  2.39it/s, loss=0.0000, acc=1.0000]
Epoch 13/20: 100%|██████████| 63/63 [00:27<00:00,  2.31it/s, loss=0.0000, acc=1.0000]
Epoch 14/20: 100%|██████████| 63/63 [00:27<00:00,  2.31it/s, loss=0.0000, acc=1.0000]
Epoch 15/20: 100%|██████████| 63/63 [00:26<00:00,  2.40it/s, loss=0.0000, acc=1.0000]
Epoch 16/20: 100%|██████████| 63/63 [00:26<00:00,  2.36it/s, loss=0.0000, acc=1.0000]
Epoch 17/20: 100%|██████████| 63/63 [00:27<00:00,  2.33it/s, loss=0.0000, acc=1.0000]
Epoch 18/20: 100%|██████████| 63/63 [00:27<00:00,  2.29it/s, loss=0.0000, acc=1.0000]
Epoch 19/20: 100%|██████████| 63/63 [00:27<00:00,  2.33it/s, loss=0.0000, acc=1.0000]
Epoch 20/20: 100%|██████████| 63/63 [00:27<00:00,  2.28it/s, loss=0.0000, acc=1.0000]
Training time: 509.14s

Evaluating...
Test time: 3.33s

Classification Report:
              precision    recall  f1-score   support

      benign       1.00      0.96      0.98       489
   malicious       0.94      1.00      0.97       321

    accuracy                           0.97       810
   macro avg       0.97      0.98      0.97       810
weighted avg       0.97      0.97      0.97       810

Confusion Matrix:
                 Pred: benign  Pred: malicious
True: benign              467               22
True: malicious             0              321

Detection Rate: 1.0000
False Positive Rate: 0.0450
F1-score (weighted): 0.9730

============================================================
EXPERIMENT: Window Size = 2000 syscalls
============================================================
```

In [7]:
# Summary Results Table
print("\n" + "="*80)
print("SUMMARY OF RESULTS (Sentence Embeddings)")
print("="*80)

results_df = pd.DataFrame(results)
results_df['Detection Rate'] = results_df['Detection Rate'].apply(lambda x: f"{x:.4f}")
results_df['False Positive Rate'] = results_df['False Positive Rate'].apply(lambda x: f"{x:.4f}")
results_df['F1-score (weighted)'] = results_df['F1-score (weighted)'].apply(lambda x: f"{x:.4f}")
results_df['Train Time (s)'] = results_df['Train Time (s)'].apply(lambda x: f"{x:.2f}")
results_df['Test Time (s)'] = results_df['Test Time (s)'].apply(lambda x: f"{x:.2f}")

print(results_df.to_string(index=False))

print("\nNote: Using sentence embeddings reduces sequence length from")
print("window_size * ~13 tokens to just window_size embeddings,")
print("enabling larger window sizes within GPU memory constraints.")



SUMMARY OF RESULTS (Sentence Embeddings)
 Window Size (syscalls)  Seq Length Detection Rate False Positive Rate F1-score (weighted) Train Time (s) Test Time (s)
                    250         250         0.9907              0.0491              0.9668         129.75          0.86
                    500         500         0.9969              0.0061              0.9951         258.57          1.69
                   1000        1000         1.0000              0.0450              0.9730         509.14          3.33
                   2000        2000         1.0000              0.0389              0.9766        1035.87          7.39

Note: Using sentence embeddings reduces sequence length from
window_size * ~13 tokens to just window_size embeddings,
enabling larger window sizes within GPU memory constraints.
