In [1]:
import pandas as pd
from sklearn.ensemble import IsolationForest
from sklearn.metrics import classification_report


In [2]:
# Read the synthetic dataset CSV into a DataFrame.
# The path is relative to the notebook's working directory; adjust it if the file lives elsewhere.
dataset_path = "../data/raw/k8_synthetic_dataset.csv"
data = pd.read_csv(dataset_path)

# Show the first rows as this cell's output for a quick sanity check.
data.head()


Unnamed: 0,cpu_usage,memory_usage,network_io,disk_io,label
0,54.967142,41.71005,337.849431,107.373466,0.0
1,48.617357,44.39819,253.891734,92.133224,0.0
2,56.476885,57.472936,343.480296,100.574896,0.0
3,65.230299,56.103703,367.781893,125.569037,0.0
4,47.658466,49.790984,320.671745,103.821981,0.0


In [3]:
# Split the frame into model inputs and ground truth:
# X holds the four resource-usage columns, y_true the 'label' column.
feature_columns = ['cpu_usage', 'memory_usage', 'network_io', 'disk_io']
X = data[feature_columns]
y_true = data['label']


In [4]:
# Build and train the Isolation Forest baseline in one step.
# contamination=0.05 tells the model to treat 5% of the samples as outliers;
# random_state is fixed so repeated runs give the same forest.
# fit() returns the estimator itself, so the fitted model is bound directly.
iso_forest = IsolationForest(contamination=0.05, random_state=42).fit(X)

# Score the same rows the model was trained on.
# predict() yields -1 for anomalies and 1 for normal points.
y_pred = iso_forest.predict(X)


In [5]:
# Convert predictions: -1 → 1 (Anomaly), 1 → 0 (Normal)
# The labels are derived from a fresh predict() call instead of from the
# previous value of y_pred, so this cell is safe to re-run. Remapping y_pred
# from itself would turn every label into 0 on a second execution, because the
# already-converted 0/1 values never equal -1.
y_pred = (iso_forest.predict(X) == -1).astype(int)


In [6]:
# Print evaluation report
# labels=[0, 1] pins the row order so 'Normal' always describes class 0 and
# 'Anomaly' class 1, instead of relying on the label order inferred from the
# data. It also keeps the call from failing when only one class is present.
report = classification_report(
    y_true,
    y_pred,
    labels=[0, 1],
    target_names=['Normal', 'Anomaly'],
    digits=4,
)

print("📊 Isolation Forest Baseline Performance:\n")
print(report)


📊 Isolation Forest Baseline Performance:

              precision    recall  f1-score   support

      Normal     0.9895    0.9895    0.9895       285
     Anomaly     0.8000    0.8000    0.8000        15

    accuracy                         0.9800       300
   macro avg     0.8947    0.8947    0.8947       300
weighted avg     0.9800    0.9800    0.9800       300

