## Feature: Syscall only

In [2]:
import sys
sys.path.insert(0, '../../configs')
from config_loader import get_split_with_labels

import time
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, f1_score
import warnings
warnings.filterwarnings('ignore')

# Config
# Split identifier handed to get_split_with_labels(); presumably the train
# percentage (the loader reports 21 train / 9 test files) -- TODO confirm.
SPLIT = '70'
# Prefix lengths to test. Despite the "window" name, each run is truncated to
# its first N events (sequence[:N]) before features are extracted; nothing is
# slid along the sequence.
WINDOW_SIZES = [250, 500, 1000, 2000]

print("XGBoost - Syscall Classification")


XGBoost - Syscall Classification


In [3]:
# Load data: get_split_with_labels() returns a (train, test) pair; later cells
# iterate each element as (file_path, label) tuples and concatenate them with +.
train_files, test_files = get_split_with_labels(SPLIT)
print(f"Train files: {len(train_files)}, Test files: {len(test_files)}")

def load_runs(file_path):
    """Load the syscall sequence of every run stored in one CSV file.

    Only the ``run`` and ``syscall`` columns are parsed; any other column in
    the file is skipped by the reader, which avoids materialising data this
    function never looks at.

    Args:
        file_path: Path (or file-like object) of a CSV that contains at least
            the columns ``run`` and ``syscall``.

    Returns:
        list[list]: One list of syscall values per distinct ``run`` value.
        Runs are ordered by sorted run key (pandas ``groupby`` default);
        events inside a run keep their row order.
    """
    events = pd.read_csv(file_path, usecols=['run', 'syscall'])
    runs = events.groupby('run')['syscall'].apply(list).tolist()
    return runs

# Tally how many runs each class contributes across a set of files
def count_runs_per_label(file_label_pairs):
    """Return the number of runs per label.

    Args:
        file_label_pairs: Iterable of (csv_path, label) tuples, where label is
            either 'benign' or 'malicious'.

    Returns:
        dict: {'benign': n_benign_runs, 'malicious': n_malicious_runs}.
    """
    totals = dict.fromkeys(('benign', 'malicious'), 0)
    for csv_path, label in file_label_pairs:
        n_runs = len(load_runs(csv_path))
        totals[label] += n_runs
    return totals

train_counts = count_runs_per_label(train_files)
test_counts = count_runs_per_label(test_files)

# Same three-line report for each partition
for header, label_counts in (
    ("\nTraining set:", train_counts),
    ("\nTest set:", test_counts),
):
    print(header)
    print(f"  Total runs: {sum(label_counts.values())}")
    print(f"  Benign runs: {label_counts['benign']}")
    print(f"  Malicious runs: {label_counts['malicious']}")

# Gather every syscall observed in either partition; the encoder vocabulary is
# the set of distinct values in this combined stream.
# NOTE(review): test files contribute to the vocabulary. Fitting on the
# training files alone would make encoder.transform() raise on syscalls that
# only occur in the test files.
all_syscalls = [
    syscall
    for path, _ in train_files + test_files
    for run in load_runs(path)
    for syscall in run
]

# Fit the syscall -> integer id encoder
syscall_encoder = LabelEncoder().fit(all_syscalls)
vocab_size = len(syscall_encoder.classes_)
print(f"\nVocabulary size: {vocab_size}")


Train files: 21, Test files: 9

Training set:
  Total runs: 1986
  Benign runs: 1484
  Malicious runs: 502

Test set:
  Total runs: 810
  Benign runs: 489
  Malicious runs: 321

Vocabulary size: 80


In [4]:
def extract_frequency_features(syscalls, encoder, max_len):
    """Build a normalized syscall-frequency histogram for one sequence.

    Args:
        syscalls: Sequence of syscall values, all known to ``encoder``.
        encoder: Fitted encoder exposing ``classes_`` and ``transform``
            (a LabelEncoder in this notebook).
        max_len: Only the first ``max_len`` events of ``syscalls`` are used.

    Returns:
        np.ndarray: float64 vector of length ``len(encoder.classes_)``. Entry
        ``i`` is the fraction of the kept events whose encoded id is ``i``;
        the vector is all zeros when no events are kept.
    """
    # Truncate sequence to max_len
    window = syscalls[:max_len]
    vocab_size = len(encoder.classes_)

    # Cast to an integer dtype: transform() of an empty input can come back as
    # a float array, which np.bincount would reject.
    encoded = np.asarray(encoder.transform(window), dtype=np.intp)

    # One vectorized pass instead of a Python-level loop over every event
    freq = np.bincount(encoded, minlength=vocab_size).astype(np.float64)

    # Normalize by total count (skipped for an empty window to avoid 0/0)
    total = encoded.size
    if total > 0:
        freq = freq / total

    return freq

def prepare_dataset(file_label_pairs, encoder, max_len):
    """Turn labelled trace files into a feature matrix and a label vector.

    Every non-empty run found in the files becomes one sample whose features
    come from extract_frequency_features().

    Args:
        file_label_pairs: Iterable of (csv_path, label) tuples with label in
            {'benign', 'malicious'}.
        encoder: Fitted syscall encoder forwarded to the feature extractor.
        max_len: Number of leading events of each run to use.

    Returns:
        tuple: (X, y) as numpy arrays; y holds 0 for benign, 1 for malicious.
    """
    class_ids = {'benign': 0, 'malicious': 1}
    feature_rows, targets = [], []

    for csv_path, label in file_label_pairs:
        for run_syscalls in load_runs(csv_path):
            if len(run_syscalls) == 0:
                continue  # a run without events yields no sample
            feature_rows.append(
                extract_frequency_features(run_syscalls, encoder, max_len)
            )
            targets.append(class_ids[label])

    return np.array(feature_rows), np.array(targets)

print(f"Using syscall frequency histogram as features")


Using syscall frequency histogram as features


In [5]:
# Run experiments with different window sizes
results = []

# Integer class ids as encoded by prepare_dataset() (benign=0, malicious=1)
# and their display names, in the same order. Passing the ids explicitly keeps
# the confusion matrix 2x2 even if a class is absent from y_test and y_pred,
# so the four-way unpacking below cannot fail.
label_ids = [0, 1]
labels = ['benign', 'malicious']

for window_size in WINDOW_SIZES:
    print(f"\n{'='*60}")
    print(f"EXPERIMENT: Window Size = {window_size}")
    print(f"{'='*60}")

    # Prepare datasets: each run is truncated to its first `window_size` events
    X_train, y_train = prepare_dataset(train_files, syscall_encoder, window_size)
    X_test, y_test = prepare_dataset(test_files, syscall_encoder, window_size)

    print(f"Train samples: {len(X_train)}, Test samples: {len(X_test)}")
    print(f"Feature dimension: {X_train.shape[1]}")

    # Scale features; statistics are fitted on the training split only
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Create XGBoost model
    # NOTE(review): use_label_encoder is reported as deprecated by recent
    # XGBoost releases -- confirm against the installed version before removing.
    model = XGBClassifier(
        n_estimators=100,
        max_depth=6,
        learning_rate=0.1,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        use_label_encoder=False,
        eval_metric='logloss'
    )

    # Training (perf_counter is the high-resolution clock for short intervals)
    print(f"\nTraining XGBoost...")
    train_start_time = time.perf_counter()
    model.fit(X_train_scaled, y_train)
    train_time = time.perf_counter() - train_start_time
    print(f"Training time: {train_time:.2f}s")

    # Testing
    print(f"\nEvaluating...")
    test_start_time = time.perf_counter()
    y_pred = model.predict(X_test_scaled)
    test_time = time.perf_counter() - test_start_time
    print(f"Test time: {test_time:.2f}s")

    # Calculate metrics; malicious (1) is the positive class
    cm = confusion_matrix(y_test, y_pred, labels=label_ids)
    tn, fp, fn, tp = cm.ravel()
    detection_rate = tp / (tp + fn) if (tp + fn) > 0 else 0.0
    false_positive_rate = fp / (fp + tn) if (fp + tn) > 0 else 0.0
    f1_weighted = f1_score(y_test, y_pred, average='weighted')

    # Store results
    results.append({
        'Window Size': window_size,
        'Detection Rate': detection_rate,
        'False Positive Rate': false_positive_rate,
        'F1-score (weighted)': f1_weighted,
        'Train Time (s)': train_time,
        'Test Time (s)': test_time
    })

    # Print detailed results
    print(f"\nClassification Report:")
    print(classification_report(y_test, y_pred, labels=label_ids, target_names=labels))

    cm_df = pd.DataFrame(
        cm,
        index=[f'True: {name}' for name in labels],
        columns=[f'Pred: {name}' for name in labels],
    )
    print(f"Confusion Matrix:")
    print(cm_df)

    print(f"\nDetection Rate: {detection_rate:.4f}")
    print(f"False Positive Rate: {false_positive_rate:.4f}")
    print(f"F1-score (weighted): {f1_weighted:.4f}")



EXPERIMENT: Window Size = 250
Train samples: 1986, Test samples: 810
Feature dimension: 80

Training XGBoost...
Training time: 0.36s

Evaluating...
Test time: 0.01s

Classification Report:
              precision    recall  f1-score   support

      benign       0.99      0.82      0.90       489
   malicious       0.79      0.99      0.88       321

    accuracy                           0.89       810
   macro avg       0.89      0.91      0.89       810
weighted avg       0.91      0.89      0.89       810

Confusion Matrix:
                 Pred: benign  Pred: malicious
True: benign              402               87
True: malicious             3              318

Detection Rate: 0.9907
False Positive Rate: 0.1779
F1-score (weighted): 0.8901

EXPERIMENT: Window Size = 500
Train samples: 1986, Test samples: 810
Feature dimension: 80

Training XGBoost...
Training time: 0.14s

Evaluating...
Test time: 0.00s

Classification Report:
              precision    recall  f1-score   support


In [6]:
# Summary Results Table
banner = "="*80
print("\n" + banner)
print("SUMMARY OF RESULTS")
print(banner)

results_df = pd.DataFrame(results)

# Render each metric column as fixed-precision text for display
column_formats = {
    'Detection Rate': "{:.4f}",
    'False Positive Rate': "{:.4f}",
    'F1-score (weighted)': "{:.4f}",
    'Train Time (s)': "{:.2f}",
    'Test Time (s)': "{:.2f}",
}
for column, template in column_formats.items():
    results_df[column] = results_df[column].apply(template.format)

print(results_df.to_string(index=False))



SUMMARY OF RESULTS
 Window Size Detection Rate False Positive Rate F1-score (weighted) Train Time (s) Test Time (s)
         250         0.9907              0.1779              0.8901           0.36          0.01
         500         0.9969              0.1861              0.8876           0.14          0.00
        1000         1.0000              0.1861              0.8889           0.16          0.00
        2000         0.7352              0.1861              0.7831           0.24          0.00


## Feature: Return values only

In [2]:
import sys
sys.path.insert(0, '../../configs')
from config_loader import get_split_with_labels

import time
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, f1_score
import warnings
warnings.filterwarnings('ignore')

# Config
# Split identifier handed to get_split_with_labels(); presumably the train
# percentage (the loader reports 21 train / 9 test files) -- TODO confirm.
SPLIT = '70'
# Prefix lengths to test. Despite the "window" name, each run is truncated to
# its first N events (sequence[:N]) before features are extracted; nothing is
# slid along the sequence.
WINDOW_SIZES = [250, 500, 1000, 2000]

print("XGBoost - Return Value Classification")

XGBoost - Return Value Classification


In [3]:
# Load data: get_split_with_labels() returns a (train, test) pair; later cells
# iterate each element as (file_path, label) tuples and concatenate them with +.
train_files, test_files = get_split_with_labels(SPLIT)
print(f"Train files: {len(train_files)}, Test files: {len(test_files)}")

def load_runs_retval(file_path):
    """Load the return-value sequence of every run stored in one CSV file.

    Only the ``run`` and ``Ret`` columns are parsed; any other column in the
    file is skipped by the reader, which avoids materialising data this
    function never looks at.

    Args:
        file_path: Path (or file-like object) of a CSV that contains at least
            the columns ``run`` and ``Ret``.

    Returns:
        list[list]: One list of return values per distinct ``run`` value.
        Runs are ordered by sorted run key (pandas ``groupby`` default);
        values inside a run keep their row order.
    """
    events = pd.read_csv(file_path, usecols=['run', 'Ret'])
    runs = events.groupby('run')['Ret'].apply(list).tolist()
    return runs

# Tally how many runs each class contributes across a set of files
def count_runs_per_label(file_label_pairs):
    """Return the number of runs per label.

    Args:
        file_label_pairs: Iterable of (csv_path, label) tuples, where label is
            either 'benign' or 'malicious'.

    Returns:
        dict: {'benign': n_benign_runs, 'malicious': n_malicious_runs}.
    """
    totals = dict.fromkeys(('benign', 'malicious'), 0)
    for csv_path, label in file_label_pairs:
        n_runs = len(load_runs_retval(csv_path))
        totals[label] += n_runs
    return totals

train_counts = count_runs_per_label(train_files)
test_counts = count_runs_per_label(test_files)

# Same three-line report for each partition
for header, label_counts in (
    ("\nTraining set:", train_counts),
    ("\nTest set:", test_counts),
):
    print(header)
    print(f"  Total runs: {sum(label_counts.values())}")
    print(f"  Benign runs: {label_counts['benign']}")
    print(f"  Malicious runs: {label_counts['malicious']}")

# Gather every return value observed in either partition; the encoder
# vocabulary is the set of distinct values in this combined stream.
# NOTE(review): test files contribute to the vocabulary. Fitting on the
# training files alone would make encoder.transform() raise on values that
# only occur in the test files.
all_retvals = [
    retval
    for path, _ in train_files + test_files
    for run in load_runs_retval(path)
    for retval in run
]

# Fit the return value -> integer id encoder
retval_encoder = LabelEncoder().fit(all_retvals)
vocab_size = len(retval_encoder.classes_)
print(f"\nVocabulary size: {vocab_size}")

Train files: 21, Test files: 9

Training set:
  Total runs: 1986
  Benign runs: 1484
  Malicious runs: 502

Test set:
  Total runs: 810
  Benign runs: 489
  Malicious runs: 321

Vocabulary size: 42585


In [4]:
def extract_frequency_features(retvals, encoder, max_len):
    """Build a normalized return-value frequency histogram for one sequence.

    Args:
        retvals: Sequence of return values, all known to ``encoder``.
        encoder: Fitted encoder exposing ``classes_`` and ``transform``
            (a LabelEncoder in this notebook).
        max_len: Only the first ``max_len`` entries of ``retvals`` are used.

    Returns:
        np.ndarray: float64 vector of length ``len(encoder.classes_)``. Entry
        ``i`` is the fraction of the kept entries whose encoded id is ``i``;
        the vector is all zeros when no entries are kept.
    """
    # Truncate sequence to max_len
    window = retvals[:max_len]
    vocab_size = len(encoder.classes_)

    # Cast to an integer dtype: transform() of an empty input can come back as
    # a float array, which np.bincount would reject.
    encoded = np.asarray(encoder.transform(window), dtype=np.intp)

    # One vectorized pass instead of a Python-level loop over every entry
    freq = np.bincount(encoded, minlength=vocab_size).astype(np.float64)

    # Normalize by total count (skipped for an empty window to avoid 0/0)
    total = encoded.size
    if total > 0:
        freq = freq / total

    return freq

def prepare_dataset(file_label_pairs, encoder, max_len):
    """Turn labelled trace files into a feature matrix and a label vector.

    Every non-empty run found in the files becomes one sample whose features
    come from extract_frequency_features().

    Args:
        file_label_pairs: Iterable of (csv_path, label) tuples with label in
            {'benign', 'malicious'}.
        encoder: Fitted return-value encoder forwarded to the feature
            extractor.
        max_len: Number of leading entries of each run to use.

    Returns:
        tuple: (X, y) as numpy arrays; y holds 0 for benign, 1 for malicious.
    """
    class_ids = {'benign': 0, 'malicious': 1}
    feature_rows, targets = [], []

    for csv_path, label in file_label_pairs:
        for run_retvals in load_runs_retval(csv_path):
            if len(run_retvals) == 0:
                continue  # a run without entries yields no sample
            feature_rows.append(
                extract_frequency_features(run_retvals, encoder, max_len)
            )
            targets.append(class_ids[label])

    return np.array(feature_rows), np.array(targets)

print(f"Using return value frequency histogram as features")

Using return value frequency histogram as features


In [5]:
# Run experiments with different window sizes
results = []

# Integer class ids as encoded by prepare_dataset() (benign=0, malicious=1)
# and their display names, in the same order. Passing the ids explicitly keeps
# the confusion matrix 2x2 even if a class is absent from y_test and y_pred,
# so the four-way unpacking below cannot fail.
label_ids = [0, 1]
labels = ['benign', 'malicious']

for window_size in WINDOW_SIZES:
    print(f"\n{'='*60}")
    print(f"EXPERIMENT: Window Size = {window_size}")
    print(f"{'='*60}")

    # Prepare datasets: each run is truncated to its first `window_size` entries
    X_train, y_train = prepare_dataset(train_files, retval_encoder, window_size)
    X_test, y_test = prepare_dataset(test_files, retval_encoder, window_size)

    print(f"Train samples: {len(X_train)}, Test samples: {len(X_test)}")
    print(f"Feature dimension: {X_train.shape[1]}")

    # Scale features; statistics are fitted on the training split only
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Create XGBoost model
    # NOTE(review): use_label_encoder is reported as deprecated by recent
    # XGBoost releases -- confirm against the installed version before removing.
    model = XGBClassifier(
        n_estimators=100,
        max_depth=6,
        learning_rate=0.1,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        use_label_encoder=False,
        eval_metric='logloss'
    )

    # Training (perf_counter is the high-resolution clock for short intervals)
    print(f"\nTraining XGBoost...")
    train_start_time = time.perf_counter()
    model.fit(X_train_scaled, y_train)
    train_time = time.perf_counter() - train_start_time
    print(f"Training time: {train_time:.2f}s")

    # Testing
    print(f"\nEvaluating...")
    test_start_time = time.perf_counter()
    y_pred = model.predict(X_test_scaled)
    test_time = time.perf_counter() - test_start_time
    print(f"Test time: {test_time:.2f}s")

    # Calculate metrics; malicious (1) is the positive class
    cm = confusion_matrix(y_test, y_pred, labels=label_ids)
    tn, fp, fn, tp = cm.ravel()
    detection_rate = tp / (tp + fn) if (tp + fn) > 0 else 0.0
    false_positive_rate = fp / (fp + tn) if (fp + tn) > 0 else 0.0
    f1_weighted = f1_score(y_test, y_pred, average='weighted')

    # Store results
    results.append({
        'Window Size': window_size,
        'Detection Rate': detection_rate,
        'False Positive Rate': false_positive_rate,
        'F1-score (weighted)': f1_weighted,
        'Train Time (s)': train_time,
        'Test Time (s)': test_time
    })

    # Print detailed results
    print(f"\nClassification Report:")
    print(classification_report(y_test, y_pred, labels=label_ids, target_names=labels))

    cm_df = pd.DataFrame(
        cm,
        index=[f'True: {name}' for name in labels],
        columns=[f'Pred: {name}' for name in labels],
    )
    print(f"Confusion Matrix:")
    print(cm_df)

    print(f"\nDetection Rate: {detection_rate:.4f}")
    print(f"False Positive Rate: {false_positive_rate:.4f}")
    print(f"F1-score (weighted): {f1_weighted:.4f}")


EXPERIMENT: Window Size = 250
Train samples: 1986, Test samples: 810
Feature dimension: 42585

Training XGBoost...
Training time: 5.66s

Evaluating...
Test time: 0.04s

Classification Report:
              precision    recall  f1-score   support

      benign       0.99      0.82      0.90       489
   malicious       0.79      0.99      0.88       321

    accuracy                           0.89       810
   macro avg       0.89      0.91      0.89       810
weighted avg       0.91      0.89      0.89       810

Confusion Matrix:
                 Pred: benign  Pred: malicious
True: benign              403               86
True: malicious             3              318

Detection Rate: 0.9907
False Positive Rate: 0.1759
F1-score (weighted): 0.8913

EXPERIMENT: Window Size = 500
Train samples: 1986, Test samples: 810
Feature dimension: 42585

Training XGBoost...
Training time: 5.26s

Evaluating...
Test time: 0.04s

Classification Report:
              precision    recall  f1-score   su

In [6]:
# Summary Results Table
banner = "="*80
print("\n" + banner)
print("SUMMARY OF RESULTS")
print(banner)

results_df = pd.DataFrame(results)

# Render each metric column as fixed-precision text for display
column_formats = {
    'Detection Rate': "{:.4f}",
    'False Positive Rate': "{:.4f}",
    'F1-score (weighted)': "{:.4f}",
    'Train Time (s)': "{:.2f}",
    'Test Time (s)': "{:.2f}",
}
for column, template in column_formats.items():
    results_df[column] = results_df[column].apply(template.format)

print(results_df.to_string(index=False))


SUMMARY OF RESULTS
 Window Size Detection Rate False Positive Rate F1-score (weighted) Train Time (s) Test Time (s)
         250         0.9907              0.1759              0.8913           5.66          0.04
         500         0.9969              0.0818              0.9498           5.26          0.04
        1000         1.0000              0.0041              0.9975           5.63          0.04
        2000         1.0000              0.0020              0.9988           6.06          0.04


## Feature: Parameters only

In [2]:
import sys
sys.path.insert(0, '../../configs')
from config_loader import get_split_with_labels

import time
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, f1_score
from sentence_transformers import SentenceTransformer
import warnings
warnings.filterwarnings('ignore')

# Config
# Split identifier handed to get_split_with_labels(); presumably the train
# percentage (the loader reports 21 train / 9 test files) -- TODO confirm.
SPLIT = '70'
# Prefix lengths to test. Despite the "window" name, each run is truncated to
# its first N events (run_params[:N]) before pooling; nothing is slid along
# the sequence.
WINDOW_SIZES = [250, 500, 1000, 2000]

print("XGBoost - Parameters Classification (Sentence Embeddings)")

# Load the sentence transformer used to embed raw parameter strings.
# device='cpu' pins inference to the CPU.
print("Loading sentence transformer model...")
sentence_model = SentenceTransformer('all-MiniLM-L6-v2', device='cpu')
# Embedding width reported by the model; later cells use it to size the zero
# padding vector.
EMBEDDING_DIM = sentence_model.get_sentence_embedding_dimension()
print(f"Sentence embedding dimension: {EMBEDDING_DIM}")

XGBoost - Parameters Classification (Sentence Embeddings)
Loading sentence transformer model...
Sentence embedding dimension: 384


In [3]:
# Load data: get_split_with_labels() returns a (train, test) pair; later cells
# iterate each element as (file_path, label) tuples and concatenate them with +.
train_files, test_files = get_split_with_labels(SPLIT)
print(f"Train files: {len(train_files)}, Test files: {len(test_files)}")

def load_runs_params_raw(file_path):
    """Load the raw parameter strings of every run stored in one CSV file.

    Only the ``run`` and ``parameters`` columns are parsed; any other column
    in the file is skipped by the reader, which avoids materialising data this
    function never looks at.

    Args:
        file_path: Path (or file-like object) of a CSV that contains at least
            the columns ``run`` and ``parameters``.

    Returns:
        list[list]: One list per distinct ``run`` value, holding one
        parameter entry per syscall row. Missing parameters are kept as NaN.
        Runs are ordered by sorted run key (pandas ``groupby`` default);
        entries inside a run keep their row order.
    """
    events = pd.read_csv(file_path, usecols=['run', 'parameters'])
    # Keep parameters as list of values (one per syscall)
    return [
        run_params.tolist()
        for _, run_params in events.groupby('run')['parameters']
    ]

# Tally how many runs each class contributes across a set of files
def count_runs_per_label(file_label_pairs):
    """Return the number of runs per label.

    Args:
        file_label_pairs: Iterable of (csv_path, label) tuples, where label is
            either 'benign' or 'malicious'.

    Returns:
        dict: {'benign': n_benign_runs, 'malicious': n_malicious_runs}.
    """
    totals = dict.fromkeys(('benign', 'malicious'), 0)
    for csv_path, label in file_label_pairs:
        n_runs = len(load_runs_params_raw(csv_path))
        totals[label] += n_runs
    return totals

train_counts = count_runs_per_label(train_files)
test_counts = count_runs_per_label(test_files)

# Same three-line report for each partition
for header, label_counts in (
    ("\nTraining set:", train_counts),
    ("\nTest set:", test_counts),
):
    print(header)
    print(f"  Total runs: {sum(label_counts.values())}")
    print(f"  Benign runs: {label_counts['benign']}")
    print(f"  Malicious runs: {label_counts['malicious']}")

Train files: 21, Test files: 9

Training set:
  Total runs: 1986
  Benign runs: 1484
  Malicious runs: 502

Test set:
  Total runs: 810
  Benign runs: 489
  Malicious runs: 321


In [4]:
# Pre-compute sentence embeddings for all unique parameter strings
print("Collecting unique parameter strings...")

unique_param_set = set()
for path, _ in train_files + test_files:
    for run_params in load_runs_params_raw(path):
        for param_str in run_params:
            # Missing parameters (NaN) all share one placeholder key;
            # everything else is keyed by its string form.
            unique_param_set.add('<EMPTY>' if pd.isna(param_str) else str(param_str))

# Sort so the input order given to encode() is identical on every kernel
# start; plain set iteration order for strings can change between Python
# processes.
unique_params = sorted(unique_param_set)
print(f"Unique parameter strings: {len(unique_params)}")

# Compute embeddings for all unique strings in batches
print("Computing sentence embeddings (this may take a few minutes)...")
param_embeddings = sentence_model.encode(
    unique_params,
    show_progress_bar=True,
    batch_size=256,
    convert_to_numpy=True
)

# Create a mapping from parameter string to embedding
param_to_embedding = dict(zip(unique_params, param_embeddings))
print(f"Embeddings computed. Shape per embedding: {EMBEDDING_DIM}")

# Create a zero embedding, used as the fallback for strings without an entry
# in the mapping and as the feature vector of an empty window
PAD_EMBEDDING = np.zeros(EMBEDDING_DIM, dtype=np.float32)

Collecting unique parameter strings...
Unique parameter strings: 266019
Computing sentence embeddings (this may take a few minutes)...


Batches:   0%|          | 0/1040 [00:00<?, ?it/s]

Embeddings computed. Shape per embedding: 384


In [5]:
def extract_sentence_embedding_features(run_params, param_to_embedding, pad_embedding, window_size):
    """Mean-pool the parameter embeddings of a run's leading syscalls.

    Each of the first ``window_size`` parameter entries is looked up in
    ``param_to_embedding`` (NaN entries under the key '<EMPTY>', all others
    under ``str(entry)``); entries without a mapping fall back to
    ``pad_embedding``. The looked-up vectors are averaged element-wise.

    Args:
        run_params: List of parameter entries (one per syscall) for a run.
        param_to_embedding: Dict mapping parameter strings to embeddings.
        pad_embedding: Fallback vector for unmapped entries; also returned
            as-is when the window is empty.
        window_size: Maximum number of leading syscalls to consider.

    Returns:
        A vector with the same dimensionality as the embeddings (384 for the
        model loaded in this notebook).
    """
    window = run_params[:window_size]
    if len(window) == 0:
        return pad_embedding

    vectors = [
        param_to_embedding.get(
            '<EMPTY>' if pd.isna(entry) else str(entry),
            pad_embedding,
        )
        for entry in window
    ]
    # Mean pooling across all syscall parameter embeddings
    return np.array(vectors).mean(axis=0)

def prepare_dataset_params(file_label_pairs, param_to_embedding, pad_embedding, window_size):
    """Turn labelled trace files into pooled-embedding features and labels.

    Every non-empty run found in the files becomes one sample whose features
    come from extract_sentence_embedding_features().

    Args:
        file_label_pairs: Iterable of (csv_path, label) tuples with label in
            {'benign', 'malicious'}.
        param_to_embedding: Dict mapping parameter strings to embeddings.
        pad_embedding: Fallback vector forwarded to the feature extractor.
        window_size: Number of leading syscalls of each run to use.

    Returns:
        tuple: (X, y) as numpy arrays; y holds 0 for benign, 1 for malicious.
    """
    class_ids = {'benign': 0, 'malicious': 1}
    feature_rows, targets = [], []

    for csv_path, label in file_label_pairs:
        for run_params in load_runs_params_raw(csv_path):
            if len(run_params) == 0:
                continue  # a run without syscalls yields no sample
            feature_rows.append(
                extract_sentence_embedding_features(
                    run_params, param_to_embedding, pad_embedding, window_size
                )
            )
            targets.append(class_ids[label])

    return np.array(feature_rows), np.array(targets)

print(f"Using sentence embeddings with mean pooling as features")
print(f"Feature dimension: {EMBEDDING_DIM}")

Using sentence embeddings with mean pooling as features
Feature dimension: 384


In [6]:
# Run experiments with different window sizes
results = []

# Integer class ids as encoded by prepare_dataset_params() (benign=0,
# malicious=1) and their display names, in the same order. Passing the ids
# explicitly keeps the confusion matrix 2x2 even if a class is absent from
# y_test and y_pred, so the four-way unpacking below cannot fail.
label_ids = [0, 1]
labels = ['benign', 'malicious']

for window_size in WINDOW_SIZES:
    print(f"\n{'='*60}")
    print(f"EXPERIMENT: Window Size = {window_size}")
    print(f"{'='*60}")

    # Prepare datasets: each run is truncated to its first `window_size` syscalls
    X_train, y_train = prepare_dataset_params(train_files, param_to_embedding, PAD_EMBEDDING, window_size)
    X_test, y_test = prepare_dataset_params(test_files, param_to_embedding, PAD_EMBEDDING, window_size)

    print(f"Train samples: {len(X_train)}, Test samples: {len(X_test)}")
    print(f"Feature dimension: {X_train.shape[1]}")

    # Scale features; statistics are fitted on the training split only
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Create XGBoost model
    # NOTE(review): use_label_encoder is reported as deprecated by recent
    # XGBoost releases -- confirm against the installed version before removing.
    model = XGBClassifier(
        n_estimators=100,
        max_depth=6,
        learning_rate=0.1,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        use_label_encoder=False,
        eval_metric='logloss'
    )

    # Training (perf_counter is the high-resolution clock for short intervals)
    print(f"\nTraining XGBoost...")
    train_start_time = time.perf_counter()
    model.fit(X_train_scaled, y_train)
    train_time = time.perf_counter() - train_start_time
    print(f"Training time: {train_time:.2f}s")

    # Testing
    print(f"\nEvaluating...")
    test_start_time = time.perf_counter()
    y_pred = model.predict(X_test_scaled)
    test_time = time.perf_counter() - test_start_time
    print(f"Test time: {test_time:.2f}s")

    # Calculate metrics; malicious (1) is the positive class
    cm = confusion_matrix(y_test, y_pred, labels=label_ids)
    tn, fp, fn, tp = cm.ravel()
    detection_rate = tp / (tp + fn) if (tp + fn) > 0 else 0.0
    false_positive_rate = fp / (fp + tn) if (fp + tn) > 0 else 0.0
    f1_weighted = f1_score(y_test, y_pred, average='weighted')

    # Store results
    results.append({
        'Window Size': window_size,
        'Detection Rate': detection_rate,
        'False Positive Rate': false_positive_rate,
        'F1-score (weighted)': f1_weighted,
        'Train Time (s)': train_time,
        'Test Time (s)': test_time
    })

    # Print detailed results
    print(f"\nClassification Report:")
    print(classification_report(y_test, y_pred, labels=label_ids, target_names=labels))

    cm_df = pd.DataFrame(
        cm,
        index=[f'True: {name}' for name in labels],
        columns=[f'Pred: {name}' for name in labels],
    )
    print(f"Confusion Matrix:")
    print(cm_df)

    print(f"\nDetection Rate: {detection_rate:.4f}")
    print(f"False Positive Rate: {false_positive_rate:.4f}")
    print(f"F1-score (weighted): {f1_weighted:.4f}")


EXPERIMENT: Window Size = 250
Train samples: 1986, Test samples: 810
Feature dimension: 384

Training XGBoost...
Training time: 5.83s

Evaluating...
Test time: 0.02s

Classification Report:
              precision    recall  f1-score   support

      benign       0.98      0.82      0.89       489
   malicious       0.78      0.98      0.87       321

    accuracy                           0.88       810
   macro avg       0.88      0.90      0.88       810
weighted avg       0.90      0.88      0.88       810

Confusion Matrix:
                 Pred: benign  Pred: malicious
True: benign              402               87
True: malicious             8              313

Detection Rate: 0.9751
False Positive Rate: 0.1779
F1-score (weighted): 0.8840

EXPERIMENT: Window Size = 500
Train samples: 1986, Test samples: 810
Feature dimension: 384

Training XGBoost...
Training time: 4.92s

Evaluating...
Test time: 0.01s

Classification Report:
              precision    recall  f1-score   suppor

In [7]:
# Summary Results Table
banner = "="*80
print("\n" + banner)
print("SUMMARY OF RESULTS (Sentence Embeddings)")
print(banner)

results_df = pd.DataFrame(results)

# Render each metric column as fixed-precision text for display
column_formats = {
    'Detection Rate': "{:.4f}",
    'False Positive Rate': "{:.4f}",
    'F1-score (weighted)': "{:.4f}",
    'Train Time (s)': "{:.2f}",
    'Test Time (s)': "{:.2f}",
}
for column, template in column_formats.items():
    results_df[column] = results_df[column].apply(template.format)

print(results_df.to_string(index=False))

# NOTE(review): the "131k+" figure below is not derived anywhere in this
# notebook -- confirm its source.
print("\nNote: Using sentence embeddings reduces feature dimension from")
print("131k+ token frequencies to just 384 semantic embedding dimensions.")


SUMMARY OF RESULTS (Sentence Embeddings)
 Window Size Detection Rate False Positive Rate F1-score (weighted) Train Time (s) Test Time (s)
         250         0.9751              0.1779              0.8840           5.83          0.02
         500         0.9907              0.1881              0.8840           4.92          0.01
        1000         0.9907              0.1861              0.8852           4.68          0.01
        2000         0.9938              0.1881              0.8852           4.57          0.01

Note: Using sentence embeddings reduces feature dimension from
131k+ token frequencies to just 384 semantic embedding dimensions.
