# 04: Concept Drift Detection & Mitigation

This notebook implements a full pipeline for detecting, analyzing, and mitigating concept drift in Federated Learning for Industrial IoT (IIoT) anomaly detection using Autoencoders.

---

## **Notebook Objectives**

1. **Detect Drift**  
   - Use statistical methods: the Kolmogorov–Smirnov (KS) test with Bonferroni correction, and the Population Stability Index (PSI)
   - Identify which features have drifted and how severely

2. **Classify Drift**  
   - Use feature variance to distinguish:
     - **Healthy Drift** (low-impact features, low variance)
     - **Unhealthy Drift** (high-variance or security-relevant features)

3. **Simulate Drift Types**
   - Gradual Drift: Noise + systematic shift over time
   - Sudden Drift: Abrupt scaling at a midpoint
   - Healthy Drift: Subtle changes on low-impact features

4. **Run Federated Learning with Drift Injection**
   - Inject drift in Client 3 from Round 6 onward
   - Track detection and model performance over 20 FL rounds

5. **Mitigate Unhealthy Drift**
   - **Local Retraining**: Client 3 adapts locally
   - **Model Reset**: Revert to latest global model
   - **Client Freezing**: Temporarily exclude from aggregation

6. **Adaptive Aggregation**
   - Apply Drift-Aware Weighting based on KS-test drift score
   - Adjust client contribution dynamically each round

7. **Stress-Test Limits**
   - Test resilience under extreme drift conditions or multi-client drift
   - Evaluate when strategies break or fail

8. **Summarize Findings**
   - Compare all mitigation strategies across metrics (F1, Recall, FP/FN)
   - Identify which approaches are best for which drift scenarios



In [1]:
# === Part 1: Setup and Preprocessing ===

import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler
import joblib

# Reproducibility: seed NumPy's global RNG (used by np.random.* calls in later cells).
SEED = 42
np.random.seed(SEED)

# === Paths ===
# The project root defaults to the original workstation location. Set the
# FL_IDS_PROJECT_ROOT environment variable to run the notebook from another
# checkout without editing this cell; with the variable unset, every path
# below is identical to the previously hard-coded value.
PROJECT_ROOT = os.environ.get(
    "FL_IDS_PROJECT_ROOT", "D:/August-Thesis/FL-IDS-Surveillance"
).rstrip("/\\")

client_id = "client_3"
client_data_path = f"{PROJECT_ROOT}/data/processed/federated/unsupervised/{client_id}/train.csv"

# Output directory for the two scalers
scaler_dir = f"{PROJECT_ROOT}/notebooks/results/scalers"
os.makedirs(scaler_dir, exist_ok=True)

# === Load client training data ===
df_clean = pd.read_csv(client_data_path, low_memory=False)

# Drop the label and the excluded column (ignored if absent), then keep only
# numeric columns so both scalers can be fitted.
X_clean = (
    df_clean
    .drop(columns=["Attack_label", "http.request.method"], errors="ignore")
    .select_dtypes(include="number")
)

feature_names = X_clean.columns.tolist()

# === Fit the MinMaxScaler for AE training ===
minmax_scaler = MinMaxScaler()
X_clean_minmax = minmax_scaler.fit_transform(X_clean)

# === Fit StandardScaler for statistical drift detection ===
standard_scaler = StandardScaler()
X_clean_standard = standard_scaler.fit_transform(X_clean)

# === Save the scalers ===
minmax_path = os.path.join(scaler_dir, f"minmax_scaler_{client_id}.pkl")
standard_path = os.path.join(scaler_dir, f"standard_scaler_{client_id}.pkl")

joblib.dump(minmax_scaler, minmax_path)
joblib.dump(standard_scaler, standard_path)

print("Clean client data loaded and scaled.")
print(f"Saved MinMaxScaler to: {minmax_path}")
print(f"Saved StandardScaler to: {standard_path}")


Clean client data loaded and scaled.
Saved MinMaxScaler to: D:/August-Thesis/FL-IDS-Surveillance/notebooks/results/scalers\minmax_scaler_client_3.pkl
Saved StandardScaler to: D:/August-Thesis/FL-IDS-Surveillance/notebooks/results/scalers\standard_scaler_client_3.pkl


In [2]:
# === Part 2: Drift Detection Utilities ===
from scipy import stats
import numpy as np

def _compute_psi(expected, actual, buckets=10, floor=1e-6):
    """Population Stability Index of ``actual`` against ``expected`` for one feature.

    Bin edges are ``buckets`` equal-width intervals spanning the range of
    ``expected`` and are applied unchanged to ``actual``. Values of ``actual``
    outside that range are clipped into the first/last bin so they are counted
    rather than dropped. Empty bins are replaced by ``floor`` to keep the
    logarithm finite.
    """
    expected = np.asarray(expected, dtype=float)
    actual = np.asarray(actual, dtype=float)
    ref_min, ref_max = expected.min(), expected.max()

    if ref_max > ref_min:
        edges = np.linspace(ref_min, ref_max, buckets + 1)
        expected_counts = np.histogram(expected, edges)[0]
        actual_counts = np.histogram(np.clip(actual, ref_min, ref_max), edges)[0]
    else:
        # Constant reference column: equal-width bins are undefined, so compare
        # "equals the reference constant" against "anything else".
        matches = np.isclose(actual, ref_min)
        expected_counts = np.array([expected.size, 0])
        actual_counts = np.array([matches.sum(), actual.size - matches.sum()])

    expected_percents = expected_counts / expected.size
    actual_percents = actual_counts / actual.size

    expected_percents = np.where(expected_percents == 0, floor, expected_percents)
    actual_percents = np.where(actual_percents == 0, floor, actual_percents)

    return np.sum((actual_percents - expected_percents) * np.log(actual_percents / expected_percents))


def ks_psi_drift_detection(X_ref, X_new, feature_names, significance_level=0.05, psi_threshold=0.2):
    """
    Detect per-feature drift between a reference sample and a new sample.

    Two detectors are run on every column:

    * Two-sample KS-test, flagged when the p-value is below
      ``significance_level / n_features`` (Bonferroni correction).
    * PSI, flagged when the score is greater than ``psi_threshold``.

    PSI bins are taken from the reference column and reused for the new
    column. Binning each sample on its own min/max would cancel out any shift
    or rescaling between the two samples and hide exactly the drift PSI is
    meant to expose.

    Parameters
    ----------
    X_ref : 2-D array-like, rows are samples and columns are features
        Reference data.
    X_new : 2-D array-like with the same column layout as ``X_ref``
        Data to compare. It must be on the same scale as ``X_ref`` (for
        example both transformed with the same fitted StandardScaler).
    feature_names : sequence
        Name for each column, indexed by column position.
    significance_level : float
        Family-wise significance level for the KS-test before correction.
    psi_threshold : float
        PSI value above which a feature is reported as drifted.

    Returns
    -------
    dict
        ``ks_drifted_features`` / ``psi_drifted_features``: lists of flagged
        feature names; ``ks_pvalues`` / ``psi_scores``: per-column values in
        column order; ``ks_drift`` / ``psi_drift``: True when the matching
        list is non-empty.
    """
    X_ref = np.asarray(X_ref)
    X_new = np.asarray(X_new)
    n_features = X_ref.shape[1]

    # KS-Test (with Bonferroni correction)
    ks_pvalues = []
    for i in range(n_features):
        _, p_value = stats.ks_2samp(X_ref[:, i], X_new[:, i])
        ks_pvalues.append(p_value)

    # With zero columns there is nothing to correct for; avoid dividing by zero.
    corrected_alpha = significance_level / n_features if n_features else significance_level
    drifted_ks_features = [feature_names[i] for i, p in enumerate(ks_pvalues) if p < corrected_alpha]

    # PSI per feature, using reference-derived bins
    psi_scores = [_compute_psi(X_ref[:, i], X_new[:, i]) for i in range(n_features)]
    drifted_psi_features = [feature_names[i] for i, score in enumerate(psi_scores) if score > psi_threshold]

    # Final Results
    return {
        'ks_drifted_features': drifted_ks_features,
        'ks_pvalues': ks_pvalues,
        'psi_drifted_features': drifted_psi_features,
        'psi_scores': psi_scores,
        'ks_drift': len(drifted_ks_features) > 0,
        'psi_drift': len(drifted_psi_features) > 0,
    }


In [3]:
# === Part 3: Drift Simulation functions ===
import numpy as np
import pandas as pd

def simulate_gradual_drift(df, features, noise_factor=0.2, shift_factor=0.3):
    """
    Return a copy of ``df`` whose selected columns drift progressively by row.

    For each name in ``features`` that is a column of the frame, a linear
    offset and Gaussian noise are added. Both grow from zero at the first row
    to their maximum at the last row, and both are expressed as multiples of
    the column's standard deviation in the input frame:

    * offset at the last row: ``shift_factor * std``
    * noise standard deviation at the last row: ``noise_factor * std``

    Names not present in the frame are skipped. Noise is drawn from NumPy's
    global random state. The input frame is not modified.
    """
    drifted = df.copy()
    # 0 at the first row, 1 at the last: scales both the offset and the noise spread.
    ramp = np.linspace(0, 1, len(df))

    for column in features:
        if column not in drifted.columns:
            continue
        spread = df[column].std()
        offset = shift_factor * spread * ramp
        jitter = np.random.normal(0, noise_factor * spread * ramp)
        drifted[column] = drifted[column].astype(float) + offset + jitter

    return drifted


def simulate_sudden_drift(df, features, sudden_factor=1.5, drift_point=0.5):
    """
    Return a copy of ``df`` whose selected columns are abruptly rescaled.

    Rows from position ``int(drift_point * len(df))`` to the end have each
    listed column multiplied by ``sudden_factor``; earlier rows keep their
    values. Affected columns are converted to float. Names in ``features``
    that are not columns of the frame are skipped, and the input frame is not
    modified.

    The split is by row position, not index label, so it behaves the same on
    frames whose index is not the default 0..n-1 range (for example after
    filtering or shuffling).
    """
    df_drifted = df.copy()
    n_rows = len(df)
    drift_idx = int(drift_point * n_rows)
    # Positional boolean mask. A label slice such as .loc[drift_idx:] would
    # select the wrong rows (or raise) on a non-default index.
    after_drift = np.arange(n_rows) >= drift_idx

    for feature in features:
        if feature in df_drifted.columns:
            df_drifted[feature] = df_drifted[feature].astype(float)
            df_drifted.loc[after_drift, feature] *= sudden_factor

    return df_drifted


def simulate_healthy_drift(df, features, noise_factor=0.02, shift_factor=0.01):
    """
    Return a copy of ``df`` with a small, slowly growing drift on some columns.

    Applies the same transformation as ``simulate_gradual_drift`` with much
    smaller default magnitudes: each listed column that exists in the frame
    receives a linear offset reaching ``shift_factor * std`` at the last row,
    plus zero-mean Gaussian noise whose standard deviation reaches
    ``noise_factor * std`` at the last row. ``std`` is the column's standard
    deviation in the input frame.

    Unknown feature names are ignored. Noise comes from NumPy's global random
    state. The input frame is left untouched.
    """
    result = df.copy()
    progress = np.linspace(0, 1, len(df))  # 0 -> 1 across the rows

    present = [name for name in features if name in result.columns]
    for name in present:
        column_std = df[name].std()
        drift_offset = shift_factor * column_std * progress
        drift_noise = np.random.normal(0, noise_factor * column_std * progress)
        result[name] = result[name].astype(float) + drift_offset + drift_noise

    return result


In [4]:
# === Select Features for Drift Simulation ===

# Per-feature variance of the unscaled numeric features, computed once and
# reused for both selections below.
feature_variance = X_clean.var()

# Top 5 high-variance features => Unhealthy Drift
top5_drift_features = feature_variance.sort_values(ascending=False).head(5).index.tolist()

# 3 features => Healthy Drift
# After the ascending sort, tail(8) keeps the eight highest-variance features
# and head(3) takes the three lowest of those. With at least eight features
# that is the 6th-8th highest variance, i.e. just outside the top 5 above.
# NOTE(review): these are mid-ranked features, not the lowest-variance ones;
# confirm this is the intended "low-impact" set.
healthy_features = feature_variance.sort_values().tail(8).head(3).index.tolist()

# === Simulate Drifted Datasets ===
# The gradual and healthy simulators draw from NumPy's global RNG, so the call
# order here affects the generated noise.

df_gradual = simulate_gradual_drift(df_clean, top5_drift_features, noise_factor=0.15, shift_factor=0.25)
df_sudden = simulate_sudden_drift(df_clean, top5_drift_features, sudden_factor=1.5, drift_point=0.5)
df_healthy = simulate_healthy_drift(df_clean, healthy_features)

# === Save Drifted Datasets ===

# Write next to the client's train.csv so the output location follows the
# client/path configured in the setup cell instead of a second hard-coded path.
save_dir = os.path.dirname(client_data_path)
os.makedirs(save_dir, exist_ok=True)

drifted_datasets = {
    "train_gradual_drift.csv": df_gradual,
    "train_sudden_drift.csv": df_sudden,
    "train_healthy_drift.csv": df_healthy,
}
for file_name, drifted_df in drifted_datasets.items():
    drifted_df.to_csv(os.path.join(save_dir, file_name), index=False)

print("Drifted datasets saved:")
print(f"  Gradual Drift: {top5_drift_features}")
print(f"  Sudden Drift: {top5_drift_features}")
print(f"  Healthy Drift: {healthy_features}")


Drifted datasets saved:
  Gradual Drift: ['tcp.ack_raw', 'tcp.seq', 'tcp.ack', 'tcp.dstport', 'tcp.srcport']
  Sudden Drift: ['tcp.ack_raw', 'tcp.seq', 'tcp.ack', 'tcp.dstport', 'tcp.srcport']
  Healthy Drift: ['udp.stream', 'udp.port', 'tcp.checksum']
