# Anomaly Detection Project

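This notebook demonstrates several anomaly detection techniques on synthetic two-dimensional data and compares their results. It covers:
- Statistical methods (Z-score, IQR)
- Isolation Forest
- Local Outlier Factor (LOF)
- One-Class SVM
- Visualization of each method's results and of the agreement between methods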

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.svm import OneClassSVM
from scipy import stats

# Set style for visualizations.
# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8*'
# and newer releases no longer ship the plain 'seaborn' name, so use whichever
# of the two names this Matplotlib installation actually provides.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')
sns.set_palette('deep')

In [None]:
def generate_sample_data(n_samples=1000, n_outliers=50, random_state=None):
    """
    Generate sample data with outliers

    Normal samples are drawn from a correlated 2-D Gaussian centred on the
    origin; outliers are drawn uniformly from the square [-4, 4) x [-4, 4),
    so some of them can land inside the normal cluster.

    Parameters:
    -----------
    n_samples : int
        Number of normal samples
    n_outliers : int
        Number of outlier samples
    random_state : int, numpy.random.RandomState or None, optional
        Seed (or ready-made RandomState) used to draw the samples. When None
        (the default) the global NumPy random state is used, so existing
        callers that rely on ``np.random.seed`` keep working unchanged.

    Returns:
    --------
    X : ndarray of shape (n_samples + n_outliers, 2)
        Normal samples first, followed by the outlier samples.
    y : ndarray of shape (n_samples + n_outliers,)
        Float labels: 0 for normal samples, 1 for outliers.
    """
    # Pick the random source: the global np.random module and a RandomState
    # expose the same multivariate_normal / uniform methods.
    if random_state is None:
        rng = np.random
    elif isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    # Generate normal data
    normal_data = rng.multivariate_normal(
        mean=[0, 0],
        cov=[[1, 0.5], [0.5, 1]],
        size=n_samples
    )

    # Generate outliers
    outliers = rng.uniform(low=-4, high=4, size=(n_outliers, 2))

    # Combine data
    X = np.vstack([normal_data, outliers])

    # Create labels (0 for normal, 1 for outliers)
    y = np.zeros(n_samples + n_outliers)
    y[n_samples:] = 1

    return X, y

# Generate sample data
# Seed NumPy's global random state first so that a fresh "Restart & Run All"
# reproduces the same dataset (generate_sample_data draws from np.random by
# default). 42 matches the random_state used for the Isolation Forest.
np.random.seed(42)
X, true_labels = generate_sample_data()
df = pd.DataFrame(X, columns=['Feature 1', 'Feature 2'])

In [None]:
def detect_outliers_zscore(data, threshold=3):
    """
    Detect outliers using Z-score method

    A sample is flagged when the absolute z-score of at least one of its
    features exceeds ``threshold``. Z-scores are computed per feature (column).

    Parameters:
    -----------
    data : array-like of shape (n_samples,) or (n_samples, n_features)
        Numeric data (DataFrame, ndarray, list, ...). 1-D input is treated as
        a single feature.
    threshold : float
        Absolute z-score above which a value counts as an outlier.

    Returns:
    --------
    ndarray of bool, shape (n_samples,)
        True where the sample is flagged as an outlier.
    """
    # Work on a plain 2-D float array so the result is always a NumPy mask,
    # whatever container the caller passed in.
    values = np.asarray(data, dtype=float)
    if values.ndim == 1:
        values = values.reshape(-1, 1)

    # nan_policy='omit' keeps one missing value from turning every z-score in
    # its column into NaN (which would silently disable detection there).
    z_scores = np.abs(stats.zscore(values, axis=0, nan_policy='omit'))

    # NaN z-scores (missing values, constant columns) compare as False.
    with np.errstate(invalid='ignore'):
        return np.any(z_scores > threshold, axis=1)

def detect_outliers_iqr(data, multiplier=1.5):
    """
    Detect outliers using IQR method

    A sample is flagged when at least one of its features lies below
    ``Q1 - multiplier * IQR`` or above ``Q3 + multiplier * IQR``, where the
    quartiles are computed per feature (column).

    Parameters:
    -----------
    data : pandas.DataFrame or array-like
        Numeric data. Anything that is not already a DataFrame (ndarray, list,
        Series, ...) is wrapped in one; 1-D input becomes a single column.
    multiplier : float
        Width of the fences in units of the IQR (1.5 by default).

    Returns:
    --------
    pandas.Series of bool
        One entry per row, indexed like the (wrapped) input; True where the
        sample is flagged as an outlier.
    """
    frame = data if isinstance(data, pd.DataFrame) else pd.DataFrame(data)

    q1 = frame.quantile(0.25)
    q3 = frame.quantile(0.75)
    iqr = q3 - q1

    lower_fence = q1 - multiplier * iqr
    upper_fence = q3 + multiplier * iqr

    outlier_mask = ((frame < lower_fence) | (frame > upper_fence)).any(axis=1)
    return outlier_mask

# Run every detector on the same sample; each one yields a boolean mask in
# which True marks a point flagged as an outlier.
def _flag_outliers(estimator, samples):
    """Fit *estimator* on *samples* and flag the points it labels -1."""
    return estimator.fit_predict(samples) == -1

# Statistical rules (applied to the DataFrame view of the data)
zscore_outliers = detect_outliers_zscore(df)
iqr_outliers = detect_outliers_iqr(df)

# Isolation Forest (seeded so the fitted trees are repeatable)
iso_forest = IsolationForest(contamination=0.1, random_state=42)
iso_forest_outliers = _flag_outliers(iso_forest, X)

# Local Outlier Factor
lof = LocalOutlierFactor(contamination=0.1)
lof_outliers = _flag_outliers(lof, X)

# One-Class SVM with an RBF kernel
ocsvm = OneClassSVM(kernel='rbf', nu=0.1)
ocsvm_outliers = _flag_outliers(ocsvm, X)

In [None]:
# Visualization function
def plot_results(X, outliers, title):
    """
    Scatter-plot the first two columns of X, with flagged points in red.

    Parameters:
    -----------
    X : array-like of shape (n_samples, n_features)
        Data to plot; only columns 0 and 1 are drawn.
    outliers : array-like of shape (n_samples,)
        Outlier mask. Any truthy/falsy values are accepted (bool arrays,
        pandas boolean Series, 0/1 integer or float labels).
    title : str
        Figure title.
    """
    points = np.asarray(X)
    # Coerce to a real boolean mask before negating: `~` on a 0/1 integer
    # array is a bitwise NOT (yielding -1/-2), which NumPy would then treat as
    # row positions instead of a mask, and `~` on floats raises TypeError.
    mask = np.asarray(outliers).astype(bool)

    plt.figure(figsize=(10, 6))
    plt.scatter(points[~mask, 0], points[~mask, 1], c='blue', label='Normal')
    plt.scatter(points[mask, 0], points[mask, 1], c='red', label='Outlier')
    plt.title(title)
    plt.xlabel('Feature 1')
    plt.ylabel('Feature 2')
    plt.legend()
    plt.show()

# Draw one scatter plot per detection method, in the same order as above
for outlier_mask, plot_title in (
    (zscore_outliers, 'Z-score Method'),
    (iqr_outliers, 'IQR Method'),
    (iso_forest_outliers, 'Isolation Forest'),
    (lof_outliers, 'Local Outlier Factor'),
    (ocsvm_outliers, 'One-Class SVM'),
):
    plot_results(X, outlier_mask, plot_title)

In [None]:
# Collect the masks side by side: one boolean column per method, one row per sample
results = pd.DataFrame(dict([
    ('Z-score', zscore_outliers),
    ('IQR', iqr_outliers),
    ('Isolation Forest', iso_forest_outliers),
    ('LOF', lof_outliers),
    ('One-Class SVM', ocsvm_outliers),
]))

# Agreement is measured as the pairwise correlation between the flag columns
agreement_matrix = results.corr()

# Heatmap of the agreement matrix
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(agreement_matrix, annot=True, cmap='YlOrRd', ax=ax)
ax.set_title('Agreement between Different Detection Methods')
plt.show()

# Summary: how many points each method flagged, as a count and as a share
outlier_counts = results.sum()
outlier_share = (outlier_counts / len(results) * 100).round(2)

print('\nNumber of outliers detected by each method:')
print(outlier_counts)
print('\nPercentage of data points flagged as outliers:')
print(outlier_share)