In [1]:
# Block 1: Imports and Load Data (Subset)
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_kddcup99
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import IsolationForest
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Load the 10% 'SA' subset of KDD Cup 99 as a DataFrame.
# The loader uses random_state when selecting the abnormal samples of the 'SA'
# subset, so it is pinned here to make Restart & Run All reproducible.
kdd_data = fetch_kddcup99(subset='SA', percent10=True, as_frame=True, random_state=42)
# Work on a copy so adding the label column never writes into (or triggers a
# chained-assignment warning on) the frame held inside the returned Bunch.
df = kdd_data['data'].copy()
target = kdd_data['target']

# Binary ground truth: 1 for any non-'normal.' record (anomaly), 0 for normal traffic.
df['label'] = (target != b'normal.').astype(int)
# Defensive only: remove a raw 'target' column if the feature frame carries one.
df = df.drop(columns=['target'], errors='ignore')

In [2]:
# Block 2: Preprocessing
# Declare feature types by name rather than inferring them from dtypes.
# NOTE(review): with as_frame=True the KDD loader appears to deliver object-dtype
# columns even for numeric features; dtype-based selection would then route every
# feature through the one-hot encoder and leave nothing to scale. Confirm with
# df.dtypes — the explicit split below is correct either way.
KDD_CATEGORICAL = ['protocol_type', 'service', 'flag']

feature_cols = df.columns.drop('label')
is_categorical = feature_cols.isin(KDD_CATEGORICAL)
categorical_cols = feature_cols[is_categorical]
numerical_cols = feature_cols[~is_categorical]

# Per-type transformers: standardise numeric features, one-hot encode symbolic ones.
numerical_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Route each column group to its transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ],
    # Every feature is assigned above and 'label' is not part of X, so nothing is
    # expected to fall through; passthrough only guards against silent drops.
    remainder='passthrough'
)

# Separate features and labels. Numeric features are cast to float so they are
# treated as numbers by the scaler regardless of the dtype the loader returned.
X = df[feature_cols].astype({col: 'float64' for col in numerical_cols})
y = df['label'] # 0: normal, 1: anomaly

In [3]:
# Block 3: Apply Preprocessing
# Fit the column transformer on the full feature frame and encode it in the
# same pass; the result is the matrix the model is trained and scored on.
X_processed = preprocessor.fit_transform(X=X)

In [4]:
# Block 4: Train Isolation Forest
# The share of labelled anomalies is used as the contamination, i.e. the fraction
# of samples the forest will flag as outliers.
# NOTE: this value is derived from the ground-truth labels, which a genuinely
# unsupervised setting would not have; use contamination='auto' in that case.
anomaly_rate = y.mean()
print(f"Estimated anomaly rate: {anomaly_rate:.4f}")

# IsolationForest only accepts a numeric contamination in (0, 0.5]. Fall back to
# 'auto' when the observed rate lies outside that range (e.g. a sample with no
# anomalies, or one dominated by them) instead of failing at construction.
contamination = float(anomaly_rate) if 0 < anomaly_rate <= 0.5 else 'auto'
iso_forest = IsolationForest(n_estimators=100, contamination=contamination, random_state=42, n_jobs=-1)
iso_forest.fit(X_processed)

Estimated anomaly rate: 0.0336


In [5]:
# Block 5: Predict Anomalies
# IsolationForest.predict labels inliers (normal) as 1 and outliers (anomalies) as -1.
y_pred_iso = iso_forest.predict(X_processed)

# Re-encode to this notebook's convention (0: normal, 1: anomaly):
# anything that is not an inlier becomes 1.
y_pred_mapped = (y_pred_iso != 1).astype(int)

In [6]:
# Block 6: Evaluate Performance
# Compare the remapped predictions with the ground-truth labels.
class_report = classification_report(y, y_pred_mapped, target_names=['Normal (0)', 'Anomaly (1)'])
conf_matrix = confusion_matrix(y, y_pred_mapped)
accuracy = accuracy_score(y, y_pred_mapped)

# One print call, one item per line: headline accuracy, confusion matrix, per-class report.
print(
    f"Isolation Forest Anomaly Detection Accuracy: {accuracy:.4f}\n",
    "Confusion Matrix:",
    conf_matrix,
    "\nClassification Report:",
    class_report,
    sep="\n",
)

# Note: with classes this imbalanced, accuracy alone is misleading; read the
# precision/recall/F1 of the anomaly class instead.

Isolation Forest Anomaly Detection Accuracy: 0.9507

Confusion Matrix:
[[94797  2481]
 [ 2481   896]]

Classification Report:
              precision    recall  f1-score   support

  Normal (0)       0.97      0.97      0.97     97278
 Anomaly (1)       0.27      0.27      0.27      3377

    accuracy                           0.95    100655
   macro avg       0.62      0.62      0.62    100655
weighted avg       0.95      0.95      0.95    100655

