# Day 1: Baseline Anomaly Detection

Z-score-based anomaly detection for daily cloud cost data.

## Step 1: Load data

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Load the daily cost export, parse the "date" column into timestamps,
# and order the rows chronologically so later rolling windows follow time.
df = (
    pd.read_csv("../data/cloud_cost_daily.csv")
    .assign(date=lambda frame: pd.to_datetime(frame["date"]))
    .sort_values("date")
)

print(f"Data shape: {df.shape}")
df.head()

## Step 2: Rolling statistics

In [None]:
# Rolling baseline built from the 3 observations *before* each row.
#
# shift(1) keeps a day's cost out of its own baseline. If the point being
# scored is part of its own n-sample window, its |z| is mathematically bounded
# by (n - 1) / sqrt(n) -- about 1.15 for n = 3 -- so the |z| > 2 rule in Step 3
# could never flag anything.
prior_cost = df["cost"].shift(1)
df["rolling_mean"] = prior_cost.rolling(window=3).mean()
df["rolling_std"] = prior_cost.rolling(window=3).std()

# The first 3 rows are NaN because they do not have 3 prior observations yet.
df[["date", "cost", "rolling_mean", "rolling_std"]].head(5)

## Step 3: Z-Score anomaly detection

In [None]:
# Standardize each day's cost against its rolling mean/std, then flag rows
# whose absolute z-score exceeds 2. Rows with a NaN z-score compare as False
# and are therefore never flagged.
deviation = df["cost"] - df["rolling_mean"]
df["z_score"] = deviation / df["rolling_std"]
df["anomaly"] = df["z_score"].abs().gt(2)

print(f"Anomalies detected: {df['anomaly'].sum()}")
df.loc[df["anomaly"], ["date", "cost", "z_score"]]

## Step 4: Visualize (VERY IMPORTANT)

In [None]:
# Plot the daily cost series and overlay the flagged days as red markers.
flagged = df[df["anomaly"]]  # filter once, reuse for both scatter coordinates

fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(df["date"], df["cost"], label="Daily Cost")
ax.scatter(flagged["date"], flagged["cost"], color="red", label="Anomaly")

ax.legend()
ax.set_title("Cloud Cost Anomaly Detection")
ax.set_xlabel("Date")
ax.set_ylabel("Cost ($)")
plt.xticks(rotation=45)  # slant the date labels so they do not overlap
fig.tight_layout()
plt.show()

## Baseline Metrics

In [None]:
# Basic detection stats.
# `anomaly` is a boolean column, so its sum is the number of flagged rows.
total_anomalies = df['anomaly'].sum()
# Guard the rate against an empty frame, where the division would be 0/0.
anomaly_rate = (total_anomalies / len(df)) * 100 if len(df) else 0.0

# Mean cost of flagged vs. unflagged rows; the mean of an empty selection is NaN.
avg_anomaly_cost = df.loc[df['anomaly'], 'cost'].mean()
avg_normal_cost = df.loc[~df['anomaly'], 'cost'].mean()

print(f"Total anomalies: {total_anomalies}")
print(f"Anomaly rate: {anomaly_rate:.1f}%")

# Report "n/a" instead of "$nan" when a group has no rows.
if pd.isna(avg_anomaly_cost):
    print("Average anomaly cost: n/a (no anomalies detected)")
else:
    print(f"Average anomaly cost: ${avg_anomaly_cost:.2f}")

if pd.isna(avg_normal_cost):
    print("Average normal cost: n/a (no normal rows)")
else:
    print(f"Average normal cost: ${avg_normal_cost:.2f}")

# The ratio is only meaningful when both means exist and the divisor is non-zero.
if pd.isna(avg_anomaly_cost) or pd.isna(avg_normal_cost) or avg_normal_cost == 0:
    print("Cost increase factor: n/a")
else:
    print(f"Cost increase factor: {avg_anomaly_cost/avg_normal_cost:.1f}x")