## Feature: Syscalls + Return values + Parameters (Combined)

In [1]:
import sys
sys.path.insert(0, '../../configs')
from config_loader import get_split_with_labels

import time
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, f1_score
from sentence_transformers import SentenceTransformer
import warnings
warnings.filterwarnings('ignore')

# Config
SPLIT = '70'
WINDOW_SIZES = [250, 500, 1000, 2000]  # Different sliding window lengths to test
RETVAL_HASH_DIM = 256  # Reduced dimension for return values via feature hashing

print("SVM with RBF Kernel - Combined Features Classification")
print("Features: Syscalls + Return Values (hashed) + Parameters (Sentence Embeddings)")

  from .autonotebook import tqdm as notebook_tqdm


SVM with RBF Kernel - Combined Features Classification
Features: Syscalls + Return Values (hashed) + Parameters (Sentence Embeddings)


In [2]:
# Load data
train_files, test_files = get_split_with_labels(SPLIT)
print(f"Train files: {len(train_files)}, Test files: {len(test_files)}")

def load_runs_combined(file_path):
    """Load syscalls, return values, and parameters grouped by run.
    
    Returns:
        List of tuples: [(syscalls_list, retvals_list, params_list), ...]
    """
    df = pd.read_csv(file_path)
    runs = []
    for run_id, group in df.groupby('run'):
        syscalls = group['syscall'].tolist()
        retvals = group['Ret'].tolist()
        params = group['parameters'].tolist()
        runs.append((syscalls, retvals, params))
    return runs

# Count runs per label
def count_runs_per_label(file_label_pairs):
    """Count total runs per label."""
    counts = {'benign': 0, 'malicious': 0}
    for path, label in file_label_pairs:
        runs = load_runs_combined(path)
        counts[label] += len(runs)
    return counts

train_counts = count_runs_per_label(train_files)
test_counts = count_runs_per_label(test_files)

print(f"\nTraining set:")
print(f"  Total runs: {sum(train_counts.values())}")
print(f"  Benign runs: {train_counts['benign']}")
print(f"  Malicious runs: {train_counts['malicious']}")

print(f"\nTest set:")
print(f"  Total runs: {sum(test_counts.values())}")
print(f"  Benign runs: {test_counts['benign']}")
print(f"  Malicious runs: {test_counts['malicious']}")

Train files: 21, Test files: 9

Training set:
  Total runs: 1986
  Benign runs: 1484
  Malicious runs: 502

Test set:
  Total runs: 810
  Benign runs: 489
  Malicious runs: 321


In [3]:
# Build vocabulary for syscalls (return values will use feature hashing)
print("Building vocabularies...")

all_syscalls = []

for path, _ in train_files + test_files:
    for syscalls, _, _ in load_runs_combined(path):
        all_syscalls.extend(syscalls)

# Build syscall encoder
syscall_encoder = LabelEncoder()
syscall_encoder.fit(all_syscalls)
syscall_vocab_size = len(syscall_encoder.classes_)
print(f"Syscall vocabulary size: {syscall_vocab_size}")

# Return values use feature hashing to reduce dimensionality
print(f"Return value dimension (feature hashing): {RETVAL_HASH_DIM}")

Building vocabularies...
Syscall vocabulary size: 80
Return value dimension (feature hashing): 256


In [4]:
# Load sentence transformer and pre-compute parameter embeddings
print("\nLoading sentence transformer model...")
sentence_model = SentenceTransformer('all-MiniLM-L6-v2', device='cpu')
EMBEDDING_DIM = sentence_model.get_sentence_embedding_dimension()
print(f"Sentence embedding dimension: {EMBEDDING_DIM}")

# Collect unique parameter strings
print("\nCollecting unique parameter strings...")
unique_params = set()
for path, _ in train_files + test_files:
    for _, _, params in load_runs_combined(path):
        for param_str in params:
            if pd.isna(param_str):
                unique_params.add('<EMPTY>')
            else:
                unique_params.add(str(param_str))

unique_params = list(unique_params)
print(f"Unique parameter strings: {len(unique_params)}")

# Compute embeddings for all unique strings
print("Computing sentence embeddings (this may take a few minutes)...")
param_embeddings = sentence_model.encode(
    unique_params, 
    show_progress_bar=True, 
    batch_size=256,
    convert_to_numpy=True
)

# Create mapping from parameter string to embedding
param_to_embedding = {param: emb for param, emb in zip(unique_params, param_embeddings)}
PAD_EMBEDDING = np.zeros(EMBEDDING_DIM, dtype=np.float32)
print(f"Embeddings computed successfully.")


Loading sentence transformer model...
Sentence embedding dimension: 384

Collecting unique parameter strings...
Unique parameter strings: 266019
Computing sentence embeddings (this may take a few minutes)...


Batches: 100%|██████████| 1040/1040 [19:27<00:00,  1.12s/it]


Embeddings computed successfully.


In [5]:
def extract_syscall_frequency(syscalls, encoder, max_len):
    """Extract syscall frequency histogram features."""
    syscalls = syscalls[:max_len]
    encoded = encoder.transform(syscalls)
    vocab_size = len(encoder.classes_)
    freq = np.zeros(vocab_size)
    for idx in encoded:
        freq[idx] += 1
    total = len(encoded)
    if total > 0:
        freq = freq / total
    return freq

def extract_retval_hashed(retvals, hash_dim, max_len):
    """Extract return value features using feature hashing.
    
    Uses a hash function to map return values to a fixed-size vector,
    reducing dimensionality from ~42k to hash_dim (e.g., 256).
    """
    retvals = retvals[:max_len]
    freq = np.zeros(hash_dim)
    for rv in retvals:
        # Hash the return value to get bucket index
        bucket = hash(str(rv)) % hash_dim
        freq[bucket] += 1
    total = len(retvals)
    if total > 0:
        freq = freq / total
    return freq

def extract_param_embedding(params, param_to_embedding, pad_embedding, max_len):
    """Extract aggregated parameter sentence embedding (mean pooling)."""
    params = params[:max_len]
    embeddings = []
    for param_str in params:
        if pd.isna(param_str):
            key = '<EMPTY>'
        else:
            key = str(param_str)
        embeddings.append(param_to_embedding.get(key, pad_embedding))
    
    if embeddings:
        embeddings = np.array(embeddings)
        return embeddings.mean(axis=0)
    return pad_embedding

def extract_combined_features(syscalls, retvals, params, 
                              syscall_encoder, retval_hash_dim,
                              param_to_embedding, pad_embedding, max_len):
    """Extract and concatenate all three feature types.
    
    Returns:
        Combined feature vector: [syscall_freq | retval_hashed | param_embed]
    """
    syscall_feat = extract_syscall_frequency(syscalls, syscall_encoder, max_len)
    retval_feat = extract_retval_hashed(retvals, retval_hash_dim, max_len)
    param_feat = extract_param_embedding(params, param_to_embedding, pad_embedding, max_len)
    
    # Concatenate all features
    return np.concatenate([syscall_feat, retval_feat, param_feat])

print(f"Feature extraction functions defined.")
print(f"Combined feature dimensions: {syscall_vocab_size} (syscall) + {RETVAL_HASH_DIM} (retval hashed) + {EMBEDDING_DIM} (params)")
print(f"Total feature dimension: {syscall_vocab_size + RETVAL_HASH_DIM + EMBEDDING_DIM}")

Feature extraction functions defined.
Combined feature dimensions: 80 (syscall) + 256 (retval hashed) + 384 (params)
Total feature dimension: 720


In [6]:
def prepare_dataset_combined(file_label_pairs, syscall_encoder, retval_hash_dim,
                              param_to_embedding, pad_embedding, max_len):
    """Prepare combined feature matrix and labels."""
    X = []
    y = []
    label_map = {'benign': 0, 'malicious': 1}
    
    for path, label in file_label_pairs:
        runs = load_runs_combined(path)
        for syscalls, retvals, params in runs:
            if len(syscalls) > 0:  # Need at least 1 syscall
                features = extract_combined_features(
                    syscalls, retvals, params,
                    syscall_encoder, retval_hash_dim,
                    param_to_embedding, pad_embedding, max_len
                )
                X.append(features)
                y.append(label_map[label])
    
    return np.array(X), np.array(y)

print(f"Using combined features: syscall frequency + return value hashed + parameter embeddings")

Using combined features: syscall frequency + return value hashed + parameter embeddings


In [7]:
# Run experiments with different window sizes
results = []

for window_size in WINDOW_SIZES:
    print(f"\n{'='*60}")
    print(f"EXPERIMENT: Window Size = {window_size}")
    print(f"{'='*60}")
    
    # Prepare datasets
    X_train, y_train = prepare_dataset_combined(
        train_files, syscall_encoder, RETVAL_HASH_DIM,
        param_to_embedding, PAD_EMBEDDING, window_size
    )
    X_test, y_test = prepare_dataset_combined(
        test_files, syscall_encoder, RETVAL_HASH_DIM,
        param_to_embedding, PAD_EMBEDDING, window_size
    )
    
    print(f"Train samples: {len(X_train)}, Test samples: {len(X_test)}")
    print(f"Feature dimension: {X_train.shape[1]}")
    
    # Scale features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    # Create SVM model with RBF kernel
    model = SVC(kernel='rbf', C=1.0, gamma='scale', random_state=42)
    
    # Training
    print(f"\nTraining SVM with RBF kernel...")
    train_start_time = time.time()
    model.fit(X_train_scaled, y_train)
    train_time = time.time() - train_start_time
    print(f"Training time: {train_time:.2f}s")
    
    # Testing
    print(f"\nEvaluating...")
    test_start_time = time.time()
    y_pred = model.predict(X_test_scaled)
    test_time = time.time() - test_start_time
    print(f"Test time: {test_time:.2f}s")
    
    # Calculate metrics
    cm = confusion_matrix(y_test, y_pred)
    tn, fp, fn, tp = cm.ravel()
    detection_rate = tp / (tp + fn) if (tp + fn) > 0 else 0.0
    false_positive_rate = fp / (fp + tn) if (fp + tn) > 0 else 0.0
    f1_weighted = f1_score(y_test, y_pred, average='weighted')
    
    # Store results
    results.append({
        'Window Size': window_size,
        'Detection Rate': detection_rate,
        'False Positive Rate': false_positive_rate,
        'F1-score (weighted)': f1_weighted,
        'Train Time (s)': train_time,
        'Test Time (s)': test_time
    })
    
    # Print detailed results
    print(f"\nClassification Report:")
    print(classification_report(y_test, y_pred, target_names=['benign', 'malicious']))
    
    labels = ['benign', 'malicious']
    cm_df = pd.DataFrame(cm, index=[f'True: {l}' for l in labels], columns=[f'Pred: {l}' for l in labels])
    print(f"Confusion Matrix:")
    print(cm_df)
    
    print(f"\nDetection Rate: {detection_rate:.4f}")
    print(f"False Positive Rate: {false_positive_rate:.4f}")
    print(f"F1-score (weighted): {f1_weighted:.4f}")


EXPERIMENT: Window Size = 250
Train samples: 1986, Test samples: 810
Feature dimension: 720

Training SVM with RBF kernel...
Training time: 0.08s

Evaluating...
Test time: 0.05s

Classification Report:
              precision    recall  f1-score   support

      benign       0.89      0.08      0.15       489
   malicious       0.41      0.98      0.58       321

    accuracy                           0.44       810
   macro avg       0.65      0.53      0.37       810
weighted avg       0.70      0.44      0.32       810

Confusion Matrix:
                 Pred: benign  Pred: malicious
True: benign               41              448
True: malicious             5              316

Detection Rate: 0.9844
False Positive Rate: 0.9162
F1-score (weighted): 0.3234

EXPERIMENT: Window Size = 500
Train samples: 1986, Test samples: 810
Feature dimension: 720

Training SVM with RBF kernel...
Training time: 0.08s

Evaluating...
Test time: 0.06s

Classification Report:
              precision    r

In [8]:
# Summary Results Table
print("\n" + "="*80)
print("SUMMARY OF RESULTS (Combined Features)")
print("="*80)

results_df = pd.DataFrame(results)
results_df['Detection Rate'] = results_df['Detection Rate'].apply(lambda x: f"{x:.4f}")
results_df['False Positive Rate'] = results_df['False Positive Rate'].apply(lambda x: f"{x:.4f}")
results_df['F1-score (weighted)'] = results_df['F1-score (weighted)'].apply(lambda x: f"{x:.4f}")
results_df['Train Time (s)'] = results_df['Train Time (s)'].apply(lambda x: f"{x:.2f}")
results_df['Test Time (s)'] = results_df['Test Time (s)'].apply(lambda x: f"{x:.2f}")

print(results_df.to_string(index=False))

print(f"\nFeature composition:")
print(f"  - Syscall frequency histogram: {syscall_vocab_size} dimensions")
print(f"  - Return value (feature hashing): {RETVAL_HASH_DIM} dimensions")
print(f"  - Parameter sentence embeddings: {EMBEDDING_DIM} dimensions")
print(f"  - Total: {syscall_vocab_size + RETVAL_HASH_DIM + EMBEDDING_DIM} dimensions")


SUMMARY OF RESULTS (Combined Features)
 Window Size Detection Rate False Positive Rate F1-score (weighted) Train Time (s) Test Time (s)
         250         0.9844              0.9162              0.3234           0.08          0.05
         500         0.9907              0.8753              0.3694           0.08          0.06
        1000         0.9938              0.8691              0.3766           0.09          0.05
        2000         0.9938              0.8855              0.3592           0.09          0.05

Feature composition:
  - Syscall frequency histogram: 80 dimensions
  - Return value (feature hashing): 256 dimensions
  - Parameter sentence embeddings: 384 dimensions
  - Total: 720 dimensions
