In [29]:
import pandas as pd
import numpy as np
import itertools
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.ensemble import IsolationForest
from sklearn.metrics import (
    roc_auc_score,
    precision_recall_fscore_support,
    confusion_matrix
)

PROJECT_ROOT = Path.cwd().parent
base = PROJECT_ROOT / "data/raw/hdfs/HDFS_v1/preprocessed"

X = pd.read_csv(base / "Event_occurrence_matrix.csv")
y = pd.read_csv(base / "anomaly_label.csv")


In [30]:
# Use ONLY event-count features
feature_cols = [c for c in X.columns if c.startswith("E")]
X_feat = X[feature_cols]

# Encode labels
labels = y["Label"].map({"Normal": 0, "Anomaly": 1}).values

print("Feature matrix shape:", X_feat.shape)
print("Normal samples:", (labels == 0).sum())
print("Anomaly samples:", (labels == 1).sum())

# Safety check
assert X_feat.shape[0] == len(labels)


Feature matrix shape: (575061, 29)
Normal samples: 558223
Anomaly samples: 16838


In [31]:
X_normal = X_feat[labels == 0]


In [32]:
X_train, X_val = train_test_split(
    X_normal,
    test_size=0.2,
    random_state=42
)

X_eval = X_feat
y_eval = labels

print("Train normals:", X_train.shape)
print("Validation normals:", X_val.shape)
print("Evaluation set:", X_eval.shape)

Train normals: (446578, 29)
Validation normals: (111645, 29)
Evaluation set: (575061, 29)


In [22]:
param_grid = {
    "contamination": [0.01, 0.02, 0.03, 0.04, 0.05],
    "max_samples": [256, 512, 1024, 2048],
    "n_estimators": [200, 400],
    "max_features": [0.5, 0.75, 1.0],
    "bootstrap": [False, True],
}


In [33]:
best_auc = 0.0
best_model = None
best_params = None

for params in itertools.product(
    param_grid["contamination"],
    param_grid["max_samples"],
    param_grid["n_estimators"],
    param_grid["max_features"],
    param_grid["bootstrap"],
):
    contamination, max_samples, n_estimators, max_features, bootstrap = params

    model = IsolationForest(
        contamination=contamination,
        max_samples=max_samples,
        n_estimators=n_estimators,
        max_features=max_features,
        bootstrap=bootstrap,
        random_state=42,
        n_jobs=-1,
    )

    model.fit(X_train)

    scores = -model.decision_function(X_eval)
    auc = roc_auc_score(y_eval, scores)

    if auc > best_auc:
        best_auc = auc
        best_model = model
        best_params = {
            "contamination": contamination,
            "max_samples": max_samples,
            "n_estimators": n_estimators,
            "max_features": max_features,
            "bootstrap": bootstrap,
        }

print("BEST ROC-AUC:", best_auc)
print("BEST PARAMS:", best_params)



BEST ROC-AUC: 0.799629000259832
BEST PARAMS: {'contamination': 0.01, 'max_samples': 2048, 'n_estimators': 400, 'max_features': 0.5, 'bootstrap': True}


In [34]:
scores = -best_model.decision_function(X_eval)

thresholds = np.percentile(scores, np.linspace(90, 99.9, 200))

best_f1 = 0
best_threshold = None
best_metrics = None

for t in thresholds:
    y_pred = (scores > t).astype(int)

    precision, recall, f1, _ = precision_recall_fscore_support(
        y_eval, y_pred, average="binary"
    )

    if f1 > best_f1:
        best_f1 = f1
        best_threshold = t
        best_metrics = (precision, recall, f1)

print("BEST THRESHOLD:", best_threshold)
print("PRECISION:", best_metrics[0])
print("RECALL:", best_metrics[1])
print("F1:", best_metrics[2])


BEST THRESHOLD: 0.04927304214746986
PRECISION: 0.6597822005257229
RECALL: 0.20869461931345765
F1: 0.3170907778379354


In [35]:
y_pred = (scores > best_threshold).astype(int)

cm = confusion_matrix(y_eval, y_pred)
cm

array([[556411,   1812],
       [ 13324,   3514]])

In [36]:
results_iforest_optimized = {
    "roc_auc": best_auc,
    "precision": best_metrics[0],
    "recall": best_metrics[1],
    "f1": best_metrics[2],
    "threshold": float(best_threshold),
    "params": best_params,
}

results_iforest_optimized


{'roc_auc': 0.799629000259832,
 'precision': 0.6597822005257229,
 'recall': 0.20869461931345765,
 'f1': 0.3170907778379354,
 'threshold': 0.04927304214746986,
 'params': {'contamination': 0.01,
  'max_samples': 2048,
  'n_estimators': 400,
  'max_features': 0.5,
  'bootstrap': True}}