In [None]:
import numpy as np
import pandas as pd

def inject_anomalies(df, anomaly_fraction=0.01, random_seed=None):
    """
    Injects synthetic point anomalies into a copy of a sensor DataFrame.

    A random subset of rows is selected and each selected row receives exactly
    one anomaly type, drawn with the probabilities in ``anomaly_probs``:

    - 'shift':  additive offset on one channel group or one random column
    - 'spike':  value multiplied by a signed factor (group or single column)
    - 'noise':  independent Gaussian noise added to every numeric column
    - 'freeze': value replaced by the value of the previous row
    - 'drift':  small signed additive offset on one random column

    Every anomaly touches only the selected row; 'shift' and 'drift' are NOT
    propagated to later rows. Channels of a group share one offset/factor.

    Parameters:
    - df: DataFrame containing the sensor data. It is not modified. Rows are
      addressed by position, so any index (datetime, non-contiguous, ...) is
      accepted. Only integer/float columns are perturbed; other columns are
      copied through unchanged. Column names are assumed to be unique.
    - anomaly_fraction: Fraction of rows to make anomalous (default: 1%)
    - random_seed: Optional random seed for reproducibility. When given,
      NumPy's global random state is reseeded with it.

    Returns:
    - DataFrame with injected anomalies (same index and columns as df;
      integer columns that received a float perturbation become float)
    - Float array of length len(df) indicating which rows were selected
      (1=anomaly, 0=normal). A selected row is flagged even when the drawn
      anomaly could not change it (e.g. 'freeze' on the first row, or a
      group none of whose channels exist in df).

    Raises:
    - ValueError: if anomaly_fraction is not within [0, 1]
    """
    if not 0 <= anomaly_fraction <= 1:
        raise ValueError(
            f"anomaly_fraction must be between 0 and 1, got {anomaly_fraction!r}"
        )

    if random_seed is not None:
        np.random.seed(random_seed)

    # Work on a copy so the caller's frame is never touched
    df_anomalous = df.copy()
    anomaly_indicator = np.zeros(len(df))

    n_anomalies = int(len(df) * anomaly_fraction)

    # Positions of the columns that can take arithmetic perturbations.
    # Integer columns are remembered so they can be upcast before a float
    # is written into them.
    numeric_cols = []
    integer_cols = set()
    for pos, dtype in enumerate(df.dtypes):
        kind = getattr(dtype, 'kind', 'O')
        if kind in ('i', 'u', 'f'):
            numeric_cols.append(pos)
            if kind != 'f':
                integer_cols.add(pos)

    if n_anomalies == 0 or not numeric_cols:
        return df_anomalous, anomaly_indicator

    # Select random row POSITIONS (not index labels) for the anomalies
    anomaly_rows = np.random.choice(len(df), size=n_anomalies, replace=False)
    anomaly_indicator[anomaly_rows] = 1

    # Define anomaly types and their probabilities
    anomaly_types = ['shift', 'spike', 'noise', 'freeze', 'drift']
    anomaly_probs = [0.3, 0.3, 0.2, 0.1, 0.1]  # Adjust based on your expected anomaly distribution

    # Group channels by their correlation patterns
    cooling_group = ['cooling_channel0', 'cooling_channel1', 'cooling_channel10', 'cooling_channel11']
    temp_group = ['temperature_channel1', 'temperature_channel2', 'temperature_channel5']
    maxigauge_correlated = ['maxigauge_channel3', 'maxigauge_channel5']
    maxigauge_uncorrelated = ['maxigauge_channel1', 'maxigauge_channel2', 'maxigauge_channel4', 'maxigauge_channel6']

    col_of = {df.columns[pos]: pos for pos in numeric_cols}

    def present(names):
        # Column positions of the listed channels that exist (and are numeric) in df
        return [col_of[name] for name in names if name in col_of]

    group_cols = {
        'cooling': present(cooling_group),
        'temp': present(temp_group),
        'maxigauge_corr': present(maxigauge_correlated),
        'maxigauge': present(maxigauge_correlated + maxigauge_uncorrelated),
    }

    # Channel family of each column; anything not listed counts as 'other'
    kind_of = {}
    for family in ('cooling', 'temp', 'maxigauge_corr'):
        for col in group_cols[family]:
            kind_of[col] = family

    # Hard-coded magnitudes per channel family (presumably tuned to the
    # spread of each family in the original data set - verify for your data)
    shift_limit = {'cooling': 5, 'temp': 10, 'maxigauge_corr': 100, 'other': 50}
    spike_upper = {'cooling': 5, 'temp': 5, 'maxigauge': 10, 'maxigauge_corr': 10, 'other': 10}
    noise_sigma = {'cooling': 2, 'temp': 40, 'maxigauge_corr': 150, 'other': 20}
    drift_range = {'cooling': (0.1, 0.5), 'temp': (1, 5), 'maxigauge_corr': (10, 50), 'other': (1, 10)}

    def as_float(col):
        # Upcast an integer column once, right before its first float write
        if col in integer_cols:
            name = df_anomalous.columns[col]
            df_anomalous[name] = df_anomalous[name].astype(float)
            integer_cols.discard(col)

    def add(row, col, amount):
        as_float(col)
        df_anomalous.iat[row, col] += amount

    def scale(row, col, factor):
        as_float(col)
        df_anomalous.iat[row, col] *= factor

    def random_column():
        return numeric_cols[np.random.choice(len(numeric_cols))]

    # For each anomaly row, inject one anomaly preserving group correlations
    for row in anomaly_rows:
        row = int(row)
        anomaly_type = np.random.choice(anomaly_types, p=anomaly_probs)

        if anomaly_type == 'shift':
            # Additive offset - correlated groups are shifted together
            group = str(np.random.choice(['cooling', 'temp', 'maxigauge_corr', 'single']))
            if group == 'single':
                col = random_column()
                limit = shift_limit[kind_of.get(col, 'other')]
                add(row, col, np.random.uniform(-limit, limit))
            else:
                limit = shift_limit[group]
                shift_amount = np.random.uniform(-limit, limit)
                for col in group_cols[group]:
                    add(row, col, shift_amount)

        elif anomaly_type == 'spike':
            # Multiplicative spike - the factor can be positive or negative
            group = str(np.random.choice(['cooling', 'temp', 'maxigauge', 'single']))
            if group == 'single':
                col = random_column()
                targets = [col]
                upper = spike_upper[kind_of.get(col, 'other')]
            else:
                targets = group_cols[group]
                upper = spike_upper[group]
            spike_factor = np.random.choice([-1, 1]) * np.random.uniform(2, upper)
            for col in targets:
                scale(row, col, spike_factor)

        elif anomaly_type == 'noise':
            # Independent Gaussian noise on every numeric column
            for col in numeric_cols:
                sigma = noise_sigma[kind_of.get(col, 'other')]
                add(row, col, np.random.normal(0, sigma))

        elif anomaly_type == 'freeze':
            # Repeat the previous row's value for a random column or group
            group = str(np.random.choice(['cooling', 'temp', 'maxigauge_corr', 'single']))
            targets = [random_column()] if group == 'single' else group_cols[group]
            if row > 0:  # the first row has no predecessor to copy from
                for col in targets:
                    df_anomalous.iat[row, col] = df_anomalous.iat[row - 1, col]

        elif anomaly_type == 'drift':
            # Simplified drift: a single signed step applied to this row only
            col = random_column()
            drift_direction = np.random.choice([-1, 1])
            low, high = drift_range[kind_of.get(col, 'other')]
            drift_rate = np.random.uniform(low, high)
            add(row, col, drift_direction * drift_rate)

    return df_anomalous, anomaly_indicator
