# Data Preprocessing and Graph Construction

Load data, split train/test, and build k-NN graphs for GAE training.


In [1]:
import pandas as pd
import numpy as np
import gzip
import pickle
from pathlib import Path
from sklearn.model_selection import train_test_split
import torch

import warnings
warnings.filterwarnings('ignore')


## Configuration


In [2]:
DATASET_PATH = "../e2-master"
SCENARIO = "pandex"
PROVIDER = "trace"

TEST_SIZE = 0.3
RANDOM_SEED = 42

K_NEIGHBORS = 5
OUTPUT_DIR = "data"

BASE_PATH = Path(DATASET_PATH) / SCENARIO / PROVIDER
Path(OUTPUT_DIR).mkdir(exist_ok=True)

print(f"Dataset: {SCENARIO}/{PROVIDER}")
print(f"k-NN neighbors: {K_NEIGHBORS}")
print(f"Test size: {TEST_SIZE}")
print(f"Output: {OUTPUT_DIR}/")


Dataset: pandex/trace
k-NN neighbors: 3
Test size: 0.3
Output: data/


## Load ProcessAll


In [3]:
process_file = BASE_PATH / "ProcessAll.csv.gz"

with gzip.open(process_file, 'rt') as f:
    df = pd.read_csv(f)

print(f"Loaded {len(df):,} processes")
print(f"Shape: {df.shape}")


Loaded 272,376 processes
Shape: (272376, 300)


## Extract Features


In [4]:
id_column = df.columns[0]
feature_columns = [col for col in df.columns if col != id_column]

process_ids = df[id_column].values
X = df[feature_columns].values.astype(np.float32)

print(f"Process IDs: {len(process_ids):,}")
print(f"Feature matrix: {X.shape}")
print(f"Sparsity: {(X == 0).mean():.1%}")


Process IDs: 272,376
Feature matrix: (272376, 299)
Sparsity: 98.0%


## Load Ground Truth


In [5]:
attack_uuids = set()
gt_files = list(BASE_PATH.glob(f"{PROVIDER}_{SCENARIO}_*.csv"))

for gt_file in gt_files:
    gt_df = pd.read_csv(gt_file)
    if 'uuid' in gt_df.columns:
        uuids = gt_df['uuid'].unique()
        attack_uuids.update(uuids)
        print(f"{gt_file.name}: {len(uuids)} attacks")

y = np.array([1 if pid in attack_uuids else 0 for pid in process_ids], dtype=np.int32)

print(f"\nTotal attacks: {len(attack_uuids)}")
print(f"Normal: {(y == 0).sum():,} ({(y == 0).mean():.1%})")
print(f"Attack: {(y == 1).sum():,} ({(y == 1).mean():.1%})")


trace_pandex_micro.csv: 82 attacks
trace_pandex_drakon2.csv: 25 attacks
trace_pandex_drakon.csv: 47 attacks

Total attacks: 153
Normal: 272,351 (100.0%)
Attack: 25 (0.0%)


## Train/Test Split (Unsupervised)


In [6]:
normal_mask = (y == 0)
anomaly_mask = (y == 1)

X_normal = X[normal_mask]
y_normal = y[normal_mask]
ids_normal = process_ids[normal_mask]

X_anomaly = X[anomaly_mask]
y_anomaly = y[anomaly_mask]
ids_anomaly = process_ids[anomaly_mask]

X_train, X_test_normal, y_train, y_test_normal, ids_train, ids_test_normal = train_test_split(
    X_normal, y_normal, ids_normal,
    test_size=TEST_SIZE,
    random_state=RANDOM_SEED
)

X_test = np.vstack([X_test_normal, X_anomaly])
y_test = np.concatenate([y_test_normal, y_anomaly])
ids_test = np.concatenate([ids_test_normal, ids_anomaly])

print("Train (normal only):")
print(f"  Samples: {len(X_train):,}")
print(f"  Attack: {(y_train == 1).sum()}")

print("\nTest (normal + all attacks):")
print(f"  Samples: {len(X_test):,}")
print(f"  Normal: {(y_test == 0).sum():,}")
print(f"  Attack: {(y_test == 1).sum():,}")


Train (normal only):
  Samples: 190,645
  Attack: 0

Test (normal + all attacks):
  Samples: 81,731
  Normal: 81,706
  Attack: 25


## Build k-NN Graphs


In [7]:
from utils.graph_construction import create_graph_data

print(f"Building train graph (k={K_NEIGHBORS}):")
train_graph = create_graph_data(X_train, y_train, k_neighbors=K_NEIGHBORS)

print(f"\nBuilding test graph (k={K_NEIGHBORS}):")
test_graph = create_graph_data(X_test, y_test, k_neighbors=K_NEIGHBORS)

Building train graph (k=3):
  [1/4] Normalizing features... (0.2s)
  [2/4] Finding k=3 neighbors for 190,645 nodes... (745.5s)
  [3/4] Building edge list... (0.0s)
  [4/4] Removing duplicates... (2.1s)
  Graph built: 190,645 nodes, 1,328,345 edges (total: 747.8s)

Building test graph (k=3):
  [1/4] Normalizing features... (0.1s)
  [2/4] Finding k=3 neighbors for 81,731 nodes... (99.1s)
  [3/4] Building edge list... (0.0s)
  [4/4] Removing duplicates... (0.8s)
  Graph built: 81,731 nodes, 567,521 edges (total: 100.1s)


## Save Processed Data


In [8]:
output_path = Path(OUTPUT_DIR)

torch.save(train_graph, output_path / "train_graph.pt")
torch.save(test_graph, output_path / "test_graph.pt")

print(f"Saved to {OUTPUT_DIR}/:")
print(f"  train_graph.pt, test_graph.pt")

Saved to data/:
  train_graph.pt, test_graph.pt
