# Day 2: Real ML Anomaly Detection with Isolation Forest

Moving from statistical methods to machine learning for better anomaly detection

## Step 1: Load Data

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import IsolationForest
from sklearn.metrics import precision_score, recall_score, f1_score

# Read the daily cost export, parse the "date" column into datetimes and
# order the rows chronologically (later cells shift/roll over row order).
df = (
    pd.read_csv("../data/cloud_cost_daily.csv")
    .assign(date=lambda frame: pd.to_datetime(frame["date"]))
    .sort_values("date")
)

print("Data shape: {}".format(df.shape))
df.head()

## Step 2: Feature Engineering

Convert raw data into ML-ready features that capture time-series patterns

In [None]:
# Lag features: the cost one and two rows earlier gives each row short-term
# context (rows are assumed to be one per day, sorted by date).
df["cost_lag_1"] = df["cost"].shift(1)
df["cost_lag_2"] = df["cost"].shift(2)

# Rolling statistics over a 3-row window that ends on, and includes, the
# current row.
df["rolling_mean_3"] = df["cost"].rolling(3).mean()
df["rolling_std_3"] = df["cost"].rolling(3).std()

# Row-over-row relative change. pct_change() yields +/-inf when the previous
# cost is 0; dropna() does not remove infinities and scikit-learn's input
# validation rejects them, so map them to NaN and let the dropna() below
# discard those rows together with the warm-up rows.
df["cost_change"] = df["cost"].pct_change().replace([np.inf, -np.inf], np.nan)

# Remove rows with NaN values (the first rows have no lag/rolling history)
df = df.dropna()

print(f"Features created. Shape after cleaning: {df.shape}")
df[["date", "cost", "cost_lag_1", "rolling_mean_3", "cost_change"]].head()

## Step 3: Train Isolation Forest Model

In [None]:
# Columns fed to the model; the order here is the order the model is fit on.
features = [
    "cost", "cost_lag_1", "cost_lag_2",
    "rolling_mean_3", "rolling_std_3", "cost_change",
]

X = df[features]

# Isolation Forest: 100 trees, 20% of rows expected to be anomalous,
# fixed seed so repeated runs give the same result.
model = IsolationForest(n_estimators=100, contamination=0.2, random_state=42)

# fit_predict returns a label per row: -1 = anomaly, 1 = normal.
# Note that "anomaly_score" therefore holds that label, not a continuous score.
df["anomaly_score"] = model.fit_predict(X)
df["is_anomaly"] = df["anomaly_score"].eq(-1)

print("Anomalies detected: {}".format(df["is_anomaly"].sum()))
print("Anomaly rate: {:.1%}".format(df["is_anomaly"].mean()))

## Step 4: Visual Validation (CRITICAL FOR DEMO)

In [None]:
# Daily cost as a line, with the rows the model flagged overlaid as red dots.
plt.figure(figsize=(12, 6))
plt.plot(df["date"], df["cost"], linewidth=2, label="Daily Cost")
plt.scatter(
    df.loc[df["is_anomaly"], "date"],
    df.loc[df["is_anomaly"], "cost"],
    s=100,
    color="red",
    zorder=5,  # draw the markers on top of the line
    label="ML Detected Anomaly",
)
plt.title("Isolation Forest - Cloud Cost Anomaly Detection")
plt.xlabel("Date")
plt.ylabel("Cost ($)")
plt.legend()
plt.xticks(rotation=45)
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

# Tabular view of the flagged rows, rounded for display
print("\nDetected Anomalies:")
df.loc[df["is_anomaly"], ["date", "cost", "cost_change"]].round(3)

## Step 5: Create Ground Truth for Validation

Since we don't have real labels, we simulate them with a simple business rule: any day whose cost exceeds $120 is treated as a true anomaly.

In [None]:
# Proxy labels: a row counts as a true anomaly when its cost exceeds $120.
df["true_anomaly"] = df["cost"].gt(120)

print("True anomalies (cost > $120): {}".format(df["true_anomaly"].sum()))
print("ML detected anomalies: {}".format(df["is_anomaly"].sum()))

# Side-by-side view of every row flagged by either the rule or the model
comparison = df[["date", "cost", "true_anomaly", "is_anomaly"]]
comparison[comparison[["true_anomaly", "is_anomaly"]].any(axis=1)]

## Step 6: Calculate Validation Metrics (HACKATHON REQUIREMENT)

In [None]:
# Score the model's flags against the proxy labels, in the order
# precision, recall, F1.
precision, recall, f1 = (
    scorer(df["true_anomaly"], df["is_anomaly"])
    for scorer in (precision_score, recall_score, f1_score)
)

print("=== MODEL PERFORMANCE METRICS ===")
print("Precision: {:.3f} (How many alerts were correct)".format(precision))
print("Recall: {:.3f} (How many true spikes we caught)".format(recall))
print("F1-Score: {:.3f} (Overall balance)".format(f1))

# Confusion-matrix cells counted directly from the two boolean columns
true_positives = (df["is_anomaly"] & df["true_anomaly"]).sum()
false_positives = (df["is_anomaly"] & ~df["true_anomaly"]).sum()
false_negatives = (~df["is_anomaly"] & df["true_anomaly"]).sum()

print("\n=== DETAILED BREAKDOWN ===")
print("True Positives: {} (Correctly caught spikes)".format(true_positives))
print("False Positives: {} (False alarms)".format(false_positives))
print("False Negatives: {} (Missed spikes)".format(false_negatives))

## Step 7: Business Impact Analysis (CRITICAL FOR JUDGES)

In [None]:
# Share of anomalous spend assumed to be avoidable through early detection.
# This is a planning assumption, not a quantity measured from the data.
PREVENTABLE_FRACTION = 0.3

# Split the cost column by the model's verdict
anomaly_costs = df.loc[df["is_anomaly"], "cost"]
normal_costs = df.loc[~df["is_anomaly"], "cost"]

# Calculate business metrics. mean() of an empty selection is NaN, so fall
# back to 0.0 to keep the report readable when one of the groups is empty.
total_anomaly_cost = anomaly_costs.sum()
avg_anomaly_cost = anomaly_costs.mean() if not anomaly_costs.empty else 0.0
avg_normal_cost = normal_costs.mean() if not normal_costs.empty else 0.0

# Estimate potential savings from the assumed preventable share.
# NOTE(review): the total covers every row in df; the "monthly" label below is
# only accurate if the dataset spans one month - confirm.
estimated_monthly_savings = total_anomaly_cost * PREVENTABLE_FRACTION
cost_increase_factor = avg_anomaly_cost / avg_normal_cost if avg_normal_cost > 0 else 0

print("=== BUSINESS IMPACT ANALYSIS ===")
print(f"Total anomalous spending detected: ${total_anomaly_cost:.2f}")
print(f"Average anomaly cost: ${avg_anomaly_cost:.2f}")
print(f"Average normal cost: ${avg_normal_cost:.2f}")
print(f"Cost increase factor: {cost_increase_factor:.1f}x")
print(f"\n💰 ESTIMATED MONTHLY SAVINGS: ${estimated_monthly_savings:.2f}")
# The prevented share is the assumption itself; printing it directly avoids
# the 0/0 that savings / total produced when no anomalies were flagged.
print(f"📊 ROI: Early detection prevents {PREVENTABLE_FRACTION:.1%} of anomaly costs")

## Step 8: Model Comparison (Baseline vs ML)

In [None]:
# Compare with a rolling z-score baseline (presumably the Day 1 method).
#
# The z-score is measured against the window that ends on the PREVIOUS row.
# rolling_mean_3 / rolling_std_3 include the current row, and for a point
# inside its own 3-sample window |x - mean| / sample_std can never exceed
# (n - 1) / sqrt(n) ~= 1.155, so thresholding that value at 2 never flags
# anything. Shifting by one row scores each day against its recent history.
prior_mean = df["rolling_mean_3"].shift(1)
prior_std = df["rolling_std_3"].shift(1)
df["z_score"] = (df["cost"] - prior_mean) / prior_std

# The first row has no prior window (NaN z-score) and compares as False.
# A flat prior window (std == 0) gives +/-inf for any change, which is flagged.
df["baseline_anomaly"] = df["z_score"].abs() > 2

# Baseline metrics against the same proxy labels used for the ML model
baseline_precision = precision_score(df["true_anomaly"], df["baseline_anomaly"])
baseline_recall = recall_score(df["true_anomaly"], df["baseline_anomaly"])
baseline_f1 = f1_score(df["true_anomaly"], df["baseline_anomaly"])

print("=== METHOD COMPARISON ===")
print(f"{'Metric':<12} {'Baseline (Z-Score)':<18} {'ML (Isolation Forest)':<22} {'Improvement':<12}")
print("-" * 67)
for label, base, ml in (
    ("Precision", baseline_precision, precision),
    ("Recall", baseline_recall, recall),
    ("F1-Score", baseline_f1, f1),
):
    # Ratio of ML score to baseline score; undefined when the baseline is 0
    improvement = f"{ml / base:.2f}x" if base > 0 else "N/A"
    print(f"{label:<12} {base:<18.3f} {ml:<22.3f} {improvement:<12}")

## Step 9: Save Model for API (Day 3 Preparation)

In [None]:
import joblib
import os

# Artifact locations, defined once so the save calls and the log lines
# below cannot drift apart.
model_dir = "../models"
model_path = f"{model_dir}/isolation_forest_model.pkl"
features_path = f"{model_dir}/feature_names.txt"

# Create models directory (no error if it already exists)
os.makedirs(model_dir, exist_ok=True)

# Save trained model
joblib.dump(model, model_path)

# Save feature names, one per line and in training order, so a consumer can
# build its input columns in the same order the model was fit on.
# Explicit encoding keeps the file identical across platforms.
with open(features_path, "w", encoding="utf-8") as f:
    f.write("\n".join(features))

print("✅ Model saved for API deployment")
print(f"Model file: {model_path}")
print(f"Features: {features}")