# Anomaly Detection using Gaussian Distribution

This notebook demonstrates the implementation of an anomaly detection system using a Gaussian (normal) distribution. Anomaly detection is widely used in applications such as:
- Fraud detection in financial transactions
- Fault detection in manufacturing systems
- Network intrusion detection
- Health monitoring systems

## Theory Overview
The algorithm works by:
1. Modeling the normal behavior of the system using a Gaussian distribution
2. Computing the probability of new examples
3. Flagging examples with very low probability as anomalies

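Each feature is modeled by its own Gaussian distribution, characterized by two parameters:
- μ (mu): mean of the distribution
- σ² (sigma squared): variance of the distribution

The features are treated as independent, so the probability density of an example $x$ with $n$ features is the product of the per-feature densities: $p(x) = \prod_{j=1}^{n} \mathcal{N}(x_j;\ \mu_j, \sigma_j^2)$.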

## 1. Setup and Imports

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import norm

# Set random seed for reproducibility
np.random.seed(42)

# Set plot style.
# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8*' and
# Matplotlib 3.8 removed the old 'seaborn' name, where it now raises OSError.
# Prefer the new name and fall back to the legacy one on older releases.
_seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(_seaborn_style)

## 2. Data Generation

We'll generate synthetic data with known anomalies to demonstrate the algorithm:
- Normal data points will follow a Gaussian distribution
- Anomalies will be generated from a different distribution

In [None]:
# Draw the "normal" population: 1000 two-feature points, each feature ~ N(10, 2^2)
n_normal = 1000
normal_data = np.random.normal(10, 2, (n_normal, 2))

# Draw the anomalies: 50 two-feature points spread uniformly over [0, 20)
n_anomalies = 50
anomalies = np.random.uniform(0, 20, (n_anomalies, 2))

# Scatter both groups on a single set of axes
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(*normal_data.T, alpha=0.5, label='Normal Data')
ax.scatter(*anomalies.T, c='red', alpha=0.7, label='Anomalies')
ax.set_title('Generated Dataset with Normal Points and Anomalies')
ax.set_xlabel('Feature 1')
ax.set_ylabel('Feature 2')
ax.legend()
ax.grid(True)
plt.show()

## 3. Implementing Gaussian Distribution Functions

We'll implement functions to:
1. Calculate parameters (μ, σ²) of the Gaussian distribution
2. Compute probability density for new examples

In [None]:
def estimate_gaussian_parameters(X):
    """
    Fits an independent Gaussian to every feature (column) of X

    Parameters:
        X: numpy array of shape (m, n); rows are examples, columns are features

    Returns:
        mu: numpy array of shape (n,) with the mean of each column
        sigma2: numpy array of shape (n,) with the variance of each column
                (np.var default, i.e. the sum of squares is divided by m)
    """
    # Both statistics are reduced over the example axis (axis 0)
    return np.mean(X, axis=0), np.var(X, axis=0)

def compute_gaussian_probability(X, mu, sigma2):
    """
    Evaluates the joint density of each example, treating features as
    independent Gaussians (the joint density is the product of the
    per-feature densities)

    Parameters:
        X: numpy array of shape (m, n)
        mu: numpy array of shape (n,) with per-feature means
        sigma2: numpy array of shape (n,) with per-feature variances

    Returns:
        p: numpy array of shape (m,) containing one density value per example
    """
    n_features = len(mu)

    # norm.pdf is parameterised by the standard deviation, not the variance
    std = np.sqrt(np.asarray(sigma2)[:n_features])

    # One broadcast call yields an (m, n) table of per-feature densities
    per_feature = norm.pdf(X[:, :n_features], np.asarray(mu), std)

    # Collapse the feature axis by multiplication
    return per_feature.prod(axis=1)

## 4. Training the Model

We'll:
1. Estimate Gaussian parameters from normal data
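2. Compute the probability density of the normal data and the anomalies under the fitted model and compare their log-probability distributions (the anomaly threshold itself is selected in Section 5)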

In [None]:
# Estimate parameters
mu, sigma2 = estimate_gaussian_parameters(normal_data)

# Compute probabilities for normal data and anomalies
p_normal = compute_gaussian_probability(normal_data, mu, sigma2)
p_anomaly = compute_gaussian_probability(anomalies, mu, sigma2)

# The densities span several orders of magnitude, so they are plotted on a
# log scale. Take the logs once and reuse them in both subplots.
log_p_normal = np.log(p_normal)
log_p_anomaly = np.log(p_anomaly)

# Use one set of bin edges for both histograms; with independent bins the two
# groups get different bin widths and their bar heights are not comparable.
shared_bins = np.histogram_bin_edges(
    np.concatenate([log_p_normal, log_p_anomaly]), bins=50
)

# Visualize probability distribution
plt.figure(figsize=(12, 5))

plt.subplot(1, 2, 1)
plt.hist(log_p_normal, bins=shared_bins, alpha=0.5, label='Normal Data')
plt.hist(log_p_anomaly, bins=shared_bins, alpha=0.5, label='Anomalies')
plt.title('Log Probability Distribution')
plt.xlabel('Log Probability')
plt.ylabel('Count')
plt.legend()

plt.subplot(1, 2, 2)
plt.boxplot([log_p_normal, log_p_anomaly])
# boxplot's `labels=` keyword is deprecated since Matplotlib 3.9 (renamed to
# `tick_labels`). Labelling the ticks directly works on every version; the
# boxes sit at the default positions 1..N.
plt.xticks([1, 2], ['Normal', 'Anomaly'])
plt.title('Log Probability Boxplot')
plt.ylabel('Log Probability')

plt.tight_layout()
plt.show()

## 5. Selecting the Anomaly Threshold

We'll select a threshold that best separates normal data from anomalies. Points with probability below this threshold will be classified as anomalies.

In [None]:
def select_threshold(p_normal, p_anomaly, n_thresholds=1000):
    """
    Selects the best threshold for anomaly detection
    using F1 score as the metric

    Candidate thresholds are spaced evenly in log-probability between the
    smallest and largest (non-zero) density of the normal data; the largest
    value itself is not a candidate. A point is predicted anomalous when its
    density is strictly below the threshold. When several candidates reach
    the same F1 score, the smallest one is returned.

    Parameters:
        p_normal: array-like of densities of the known-normal examples
        p_anomaly: array-like of densities of the known-anomalous examples
        n_thresholds: number of candidate thresholds to evaluate

    Returns:
        best_epsilon: float, selected threshold on the linear density scale
        best_f1: float, F1 score obtained with that threshold
        Both are 0.0 when no candidate achieves a positive F1 score or when
        no candidates can be formed (an input is empty, or p_normal has no
        positive finite density).

    Raises:
        ValueError: if n_thresholds is smaller than 1
    """
    if n_thresholds < 1:
        raise ValueError('n_thresholds must be at least 1')

    p_normal = np.asarray(p_normal, dtype=float).ravel()
    p_anomaly = np.asarray(p_anomaly, dtype=float).ravel()

    # Densities that underflowed to 0 map to -inf; that is intended (they sit
    # below every candidate), so silence the divide-by-zero warning.
    with np.errstate(divide='ignore', invalid='ignore'):
        log_normal = np.sort(np.log(p_normal))
        log_anomaly = np.sort(np.log(p_anomaly))

    # Only finite values can delimit the candidate range
    finite_log_normal = log_normal[np.isfinite(log_normal)]
    if finite_log_normal.size == 0 or log_anomaly.size == 0:
        return 0.0, 0.0

    log_min, log_max = finite_log_normal[0], finite_log_normal[-1]
    if log_max > log_min:
        # linspace gives an exact candidate count, unlike arange with a float step
        candidates = np.linspace(log_min, log_max, n_thresholds, endpoint=False)
    else:
        # All normal densities are identical: there is a single sensible candidate
        candidates = np.array([log_min])

    # Compare in log space so the first candidate equals the smallest normal
    # density exactly (no exp/log round-trip error at the boundary).
    # searchsorted(..., side='left') counts the values strictly below each candidate.
    tp = np.searchsorted(log_anomaly, candidates, side='left')
    fp = np.searchsorted(log_normal, candidates, side='left')
    fn = log_anomaly.size - tp

    # F1 = 2PR / (P + R) = 2*tp / (2*tp + fp + fn); the denominator is at
    # least the number of anomalies, hence never zero here.
    f1 = 2 * tp / (2 * tp + fp + fn)

    best = int(np.argmax(f1))  # argmax returns the first maximum
    if f1[best] <= 0:
        return 0.0, 0.0

    return float(np.exp(candidates[best])), float(f1[best])

# Search for the density cut-off that maximises F1 on the labelled points,
# then report the cut-off and the score it achieves
epsilon, f1_score = select_threshold(p_normal, p_anomaly)
print(f'Best threshold: {epsilon:.6f}', f'F1 score: {f1_score:.4f}', sep='\n')

## 6. Visualizing Results

Let's create a contour plot to visualize the decision boundary of our anomaly detection system.

In [None]:
# Create a grid of points covering every data point, padded by 1 on each side.
# NumPy reductions are used instead of the builtin min/max, which would
# iterate the arrays element by element in Python.
all_points = np.vstack([normal_data, anomalies])
x_min, y_min = all_points.min(axis=0) - 1
x_max, y_max = all_points.max(axis=0) + 1
xx, yy = np.meshgrid(np.linspace(x_min, x_max, 100), np.linspace(y_min, y_max, 100))
grid_points = np.c_[xx.ravel(), yy.ravel()]

# Compute probabilities for grid points and fold them back into the grid shape
p_grid = compute_gaussian_probability(grid_points, mu, sigma2)
p_grid = p_grid.reshape(xx.shape)

# Plot the density of the fitted model as filled contours
plt.figure(figsize=(12, 8))
plt.contourf(xx, yy, p_grid, levels=np.linspace(0, np.max(p_grid), 20), cmap='viridis', alpha=0.3)
plt.colorbar(label='Probability Density')

# Plot data points
plt.scatter(normal_data[:, 0], normal_data[:, 1], alpha=0.5, label='Normal Data')
plt.scatter(anomalies[:, 0], anomalies[:, 1], c='red', alpha=0.7, label='True Anomalies')

# Plot threshold contour: points outside this curve have density below epsilon
plt.contour(xx, yy, p_grid, levels=[epsilon], colors='r', linestyles='dashed', linewidths=2)
# The contour carries no label of its own, so add an empty line with the same
# style purely to give the threshold an entry in the legend.
plt.plot([], [], color='r', linestyle='dashed', linewidth=2, label='Threshold (epsilon)')

plt.title('Anomaly Detection Results')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.legend()
plt.grid(True)
plt.show()