# Design a Perturbed Dataset

### Load the Dataset and Data Module

In [None]:
import numpy as np
import pandas as pd
from torch.utils.data import Dataset



class TSDataset(Dataset):
    """Time Series Dataset

    A sample consists of a (random) time window + consecutive time horizon.

    Args:
        df (pd.DataFrame): dataframe containing the data. Takes precedence over file_path.
        file_path (str): path to a .csv or .parquet file containing the data. Only used if df is None.
        input_len (int): length of input sequence
        target_len (int): length of target sequence
        stride (int): stride between samples. Only used if n_samples is None and samples are thus drawn sequentially.
        n_samples (int): number of samples to draw. If None, all possible samples are drawn sequentially. Else, samples are drawn randomly (with replacement).
        mean_vals (pd.Series or np.array): mean values for each feature.
        sd_vals (pd.Series or np.array): standard deviation values for each feature. The data is only standardized if both mean_vals and sd_vals are given.
        continuous_features (list): list of continuous features. If None, inferred from the number of unique values.
        discrete_features (list): list of discrete features. If None, inferred from the number of unique values.
        seed (int): seed for reproducibility. If None, an unseeded generator is used.

    Raises:
        ValueError: if neither df nor file_path is given, or if the file format is not supported.
    """
    def __init__(self,
                 df=None,
                 file_path=None,
                 input_len=90,
                 target_len=30,
                 stride=1,
                 n_samples=None,
                 mean_vals=None,
                 sd_vals=None,
                 continuous_features=None,
                 discrete_features=None,
                 seed=42
                 ):
        super().__init__()
        if df is not None:
            self.df = df
        elif file_path is not None:
            if file_path.endswith(".parquet"):
                self.df = pd.read_parquet(file_path)
            elif file_path.endswith(".csv"):
                self.df = pd.read_csv(file_path)
            else:
                raise ValueError("File format not supported.")
            try:  # Try to convert the first column to datetime. If not possible, ignore it.
                self.df.set_index(pd.to_datetime(self.df.iloc[:,0], format="%Y-%m-%d %H:%M:%S"), inplace=True)
                self.df.drop(self.df.columns[0], axis=1, inplace=True)
            except ValueError:
                pass
        else:
            raise ValueError("Either df or file_path must be given.")

        self.input_len = input_len
        self.target_len = target_len
        self.stride = stride  # only used if n_samples is None and samples are drawn sequentially

        self.random_sampling = n_samples is not None
        # In sequential mode the number of samples follows from the data length, so that
        # n_samples, len(self) and len(self.sample_idxs) always agree.
        self.n_samples = self.__len__() if n_samples is None else n_samples
        self.n_features = self.df.shape[1]
        self.feature_names = self.df.columns
        self.continuous_features, self.discrete_features = self.split_hybrid_data(continuous_features, discrete_features)

        # Standardize the data if both mean and standard deviation values are given
        self.mean_vals = mean_vals
        self.sd_vals = sd_vals
        if mean_vals is not None and sd_vals is not None:
            self.scale_data()

        if seed is not None:
            self.rng = np.random.default_rng(seed)  # Using a local random number generator
        else:
            self.rng = np.random.default_rng()  # Default random generator without a fixed seed

        self.sample_idxs = self._create_sample_indices()

    def split_hybrid_data(self, continuous_features=None, discrete_features=None):
        """Split the time series data features into continuous and discrete features.

        A feature with more than 32 unique values counts as continuous, otherwise as discrete.
        Lists passed explicitly are used as they are.

        Returns:
            tuple: (continuous_features, discrete_features)
        """
        continuous_threshold = 32
        if continuous_features is None or discrete_features is None:
            # Count the unique values once for all columns instead of once per column and list
            unique_counts = list(zip(self.df.columns, self.df.nunique()))
        if continuous_features is None:
            continuous_features = [name for name, count in unique_counts if count > continuous_threshold]
        if discrete_features is None:
            discrete_features = [name for name, count in unique_counts if count <= continuous_threshold]
        assert len(continuous_features) + len(discrete_features) == self.n_features, "All features must be assigned to either continuous or discrete features."
        return continuous_features, discrete_features

    def set_scaler_params(self, mean_vals=None, sd_vals=None):
        """Set the parameters for scaling the data.
        Args:
            mean_vals (pd.Series or np.array): mean values for each feature. If None, the means of the data are used.
            sd_vals (pd.Series or np.array): standard deviation values for each feature. If None, the standard deviations of the data are used.
        """
        self.mean_vals = mean_vals if mean_vals is not None else self.df.mean()
        self.sd_vals = sd_vals if sd_vals is not None else self.df.std()

    def scale_data(self):
        """Standardize the data: subtract mean_vals and divide by sd_vals."""
        assert self.mean_vals is not None and self.sd_vals is not None, "Mean and standard deviation values must be set first."
        # Avoid division by zero by replacing sd value of 0 with 1 (for constant features).
        # Work on a copy so that the object handed in by the caller is not modified.
        if isinstance(self.sd_vals, pd.Series):
            self.sd_vals = self.sd_vals.replace(0, 1.0)
        else:
            sd = np.asarray(self.sd_vals, dtype=float)
            self.sd_vals = np.where(sd == 0, 1.0, sd)
        # Standardize the data
        self.df = (self.df - self.mean_vals) / self.sd_vals

    def inverse_scale_data(self, scaled_data):
        """Map standardized data back to the original scale.

        Args:
            scaled_data: array-like with one column per feature
        Returns:
            pd.DataFrame: data multiplied by sd_vals and shifted by mean_vals
        """
        df_ = pd.DataFrame(scaled_data, columns=self.df.columns)
        return (df_ * self.sd_vals) + self.mean_vals

    def _n_sequential_samples(self):
        """Number of window start positions available when samples are drawn sequentially."""
        # -2*input_len because some perturbations might require more than input_len time steps
        n_starts = int((self.df.shape[0] - 2 * self.input_len - self.target_len) / self.stride) + 1
        return max(n_starts, 0)

    def _create_sample_indices(self):
        """Create an array of indices for sampling"""
        if self.random_sampling:
            sample_idxs = self.rng.integers(low=0, high=self.df.shape[0] - 2 * self.input_len - self.target_len, size=self.n_samples)  # -2*input_len because some perturbations might require more than input_len time steps
        else:
            sample_idxs = np.arange(self._n_sequential_samples()) * self.stride
        return sample_idxs

    def __len__(self):
        """Number of samples"""
        if self.random_sampling:
            return self.n_samples
        # Must match the number of start indices, otherwise the last indices cannot be accessed
        return self._n_sequential_samples()

    def __getitem__(self, index):
        """Get one sample.
        A sample consists of a time window of length input_len and a consecutive time horizon of length target_len.
        Returns:
            x (np.array): input sequence
            y (np.array): target sequence
        """
        start_idx = self.sample_idxs[index]
        end_idx = start_idx + self.input_len + self.target_len
        df_ = self.df.iloc[start_idx:end_idx]
        x = df_.iloc[:self.input_len].to_numpy().astype(np.float32)
        y = df_.iloc[self.input_len:].to_numpy().astype(np.float32)
        return x, y


In [None]:
# Reference (unperturbed) three-tank dataset: load it, derive the scaler from the data itself, standardize.
three_tank_filepath = '../data/processed/three_tank_data.csv'
ds = TSDataset(
    file_path=three_tank_filepath,
    input_len=90,
    target_len=30,
    n_samples=100,
    mean_vals=None,
    sd_vals=None,
    seed=42,
)
ds.set_scaler_params()
ds.scale_data()
# Constructor arguments shared by all perturbed three-tank datasets below.
three_tank_args = {
    "file_path": three_tank_filepath,
    "mean_vals": ds.mean_vals,
    "sd_vals": ds.sd_vals,
    "n_samples": 100,
    "seed": 42,
}


In [None]:
# Reference (unperturbed) SWaT dataset: load it, derive the scaler from the data itself, standardize.
swat_filepath = '../data/processed/SWaT_Dataset_Normal_v1.parquet'
ds0 = TSDataset(
    file_path=swat_filepath,
    input_len=90,
    target_len=30,
    n_samples=100,
    mean_vals=None,
    sd_vals=None,
    seed=42,
)
ds0.set_scaler_params()
ds0.scale_data()
# Constructor arguments shared by all perturbed SWaT datasets below.
swat_args = {
    "file_path": swat_filepath,
    "mean_vals": ds0.mean_vals,
    "sd_vals": ds0.sd_vals,
    "n_samples": 100,
    "seed": 42,
}
ds0.df.shape

In [None]:
import matplotlib.pyplot as plt

In [None]:
# concat and plot
def plot(ds, idx, original_ds=None):
    """Plot sample ``idx`` of a perturbed dataset (input and target concatenated).

    Affected sensors are drawn in color, all other sensors in grey.
    If ``original_ds`` is given, the affected sensors of its sample ``idx``
    are overlaid as dashed lines in the matching color.
    """
    def _full_window(dataset):
        # Input window followed by the target horizon along the time axis
        inputs, targets = dataset[idx]
        return np.concatenate([inputs, targets], axis=0)

    series = _full_window(ds)
    affected_cols = [ds.df.columns.get_loc(name) for name in ds.affected_sensors]

    # Remember the color of each affected sensor for the optional overlay
    colors = {}
    for col in range(ds.n_features):
        if col not in affected_cols:
            plt.plot(series[:, col], color='grey', alpha=0.5)
            continue
        line, = plt.plot(series[:, col])
        colors[col] = line.get_color()

    if original_ds is not None:
        reference = _full_window(original_ds)
        for col in range(ds.n_features):
            if col in affected_cols:
                plt.plot(reference[:, col], linestyle='--', color=colors[col])

    title = f"{ds.__class__.__name__}, affected sensors: {ds.affected_sensors}"
    max_title_len = 60
    if len(title) > max_title_len:
        # Print the full title before shortening it for the figure
        print(title)
        title = title[:max_title_len] + "..."
    plt.title(title)
    plt.show()

def plot_both(perturbed_ds, perturbed_ds0, idx):
    """Plot sample ``idx`` of both perturbed datasets against the notebook-level references ``ds`` and ``ds0``."""
    for perturbed, reference in ((perturbed_ds, ds), (perturbed_ds0, ds0)):
        plot(perturbed, idx, original_ds=reference)

In [None]:
# Per-feature mean of the standardized three-tank data (should be close to 0 after scale_data)
ds.df.mean()

In [None]:
# Per-feature standard deviation of the standardized three-tank data (close to 1, or 0 for constant features)
ds.df.std()

In [None]:
# Count the distinct values of every feature of ds0 that was classified as discrete
ds0.df[ds0.discrete_features].nunique()

### Now test perturbed datasets

In [None]:
class OffsetDataset(TSDataset):
    """Add a constant offset to randomly chosen continuous features of the input window.

    Args:
        severity (float): value in [0, 1]; scales the offset linearly from 0 to 2.
        target_prct_affected_sensors (float): targeted share of features to perturb. At least one
            sensor is targeted; the count is capped at the number of continuous features.
        **kwargs: forwarded to TSDataset.
    """
    def __init__(self, severity=1., target_prct_affected_sensors=0.05, **kwargs):
        super().__init__(**kwargs)
        assert 0 <= severity <= 1, "Severity must be between 0 and 1."
        self.severity = severity
        self.target_prct_affected_sensors = target_prct_affected_sensors
        self.affected_sensors, self.offset = self.set_params(severity)

    def set_params(self, severity):
        """Draw the affected sensors and derive the offset from the severity.

        Returns:
            tuple: (affected_sensors, offset)
        """
        min_prct_affected_sensors = 1 / self.n_features + 1e-9  # at least one sensor must be affected
        prct_affected_sensors = max(min_prct_affected_sensors, self.target_prct_affected_sensors)
        n_affected_sensors = min(int(self.n_features * prct_affected_sensors), len(self.continuous_features))
        affected_sensors = self.rng.choice(self.continuous_features, n_affected_sensors, replace=False)  # continuous features only

        min_offset = 0.
        max_offset = 2.
        offset = min_offset + severity * (max_offset - min_offset)

        return affected_sensors, offset

    def __getitem__(self, index):
        """Return sample ``index`` with the offset added to the affected sensors of the input.

        The target sequence y is returned unchanged.
        """
        # The sample is built from this dataset only; it no longer depends on a notebook-level reference dataset.
        x, y = super().__getitem__(index)
        for sensor in self.affected_sensors:
            sensor_idx = self.df.columns.get_loc(sensor)
            x[:, sensor_idx] += self.offset
        return x, y

# Offset perturbation at medium severity on both datasets, compared against the originals.
severity = 0.5
pert_ds = OffsetDataset(severity=severity, target_prct_affected_sensors=0.7, **three_tank_args)
pert_ds0 = OffsetDataset(severity=severity, **swat_args)
plot_both(pert_ds, pert_ds0, 10)

In [None]:
# Same offset experiment again: rebuild both perturbed datasets and plot sample 10.
severity = 0.5
pert_ds = OffsetDataset(severity=severity, target_prct_affected_sensors=0.7, **three_tank_args)
pert_ds0 = OffsetDataset(severity=severity, **swat_args)
plot_both(pert_ds, pert_ds0, 10)

In [None]:
class DyingSignalDataset(TSDataset):
    """Multiply randomly chosen continuous features of the input window with a constant factor.

    The factor goes linearly from 1 (severity 0) to 0 (severity 1). The target sequence is not modified.
    """
    def __init__(self, severity=1., target_prct_affected_sensors=0.05, **kwargs):
        super().__init__(**kwargs)
        assert 0 <= severity <= 1, "Severity must be between 0 and 1."
        self.severity = severity
        # Scaling a flat sensor causes no disturbance, so five times the requested share is targeted (capped at 1)
        self.target_prct_affected_sensors = min(target_prct_affected_sensors * 5, 1)
        self.affected_sensors, self.factor = self.set_params(severity)

    def set_params(self, severity):
        """Draw the affected sensors and derive the factor; returns (affected_sensors, factor)."""
        # The lower bound guarantees that at least one sensor is targeted
        share = max(1 / self.n_features + 1e-9, self.target_prct_affected_sensors)
        count = min(int(self.n_features * share), len(self.continuous_features))
        chosen = self.rng.choice(self.continuous_features, count, replace=False)  # continuous features only

        factor_at_zero, factor_at_one = 1., 0.
        factor = factor_at_zero + severity * (factor_at_one - factor_at_zero)
        return chosen, factor

    def __getitem__(self, index):
        """Return sample ``index`` with the affected input columns scaled by the factor."""
        x, y = super().__getitem__(index)
        for name in self.affected_sensors:
            col = self.df.columns.get_loc(name)
            x[:, col] *= self.factor
        return x, y

In [None]:
# Dying-signal perturbation at medium severity on both datasets.
severity = 0.5
pert_ds = DyingSignalDataset(severity=severity, target_prct_affected_sensors=0.7, **three_tank_args)
pert_ds0 = DyingSignalDataset(severity=severity, **swat_args)
plot_both(pert_ds, pert_ds0, 10)

In [None]:
class NoisyDataset(TSDataset):
    """Add Gaussian noise to randomly chosen continuous features of the input window.

    The noise standard deviation goes linearly from 0 (severity 0) to 1 (severity 1).
    All noise is drawn once at construction time, one slice per sample.
    """
    def __init__(self, severity=1., target_prct_affected_sensors=0.05, **kwargs):
        super().__init__(**kwargs)
        assert 0 <= severity <= 1, "Severity must be between 0 and 1."
        self.severity = severity
        self.target_prct_affected_sensors = target_prct_affected_sensors
        self.affected_sensors, self.sd = self.set_params(severity)
        self.noise = self._create_noise()

    def set_params(self, severity):
        """Draw the affected sensors and derive the noise level; returns (affected_sensors, sd)."""
        # The lower bound guarantees that at least one sensor is targeted
        share = max(1 / self.n_features + 1e-9, self.target_prct_affected_sensors)
        count = min(int(self.n_features * share), len(self.continuous_features))
        chosen = self.rng.choice(self.continuous_features, count, replace=False)  # continuous features only

        lowest_sd, highest_sd = 0., 1
        sd = lowest_sd + severity * (highest_sd - lowest_sd)
        return chosen, sd

    def _create_noise(self):
        """Return float32 noise of shape (n_samples, input_len, n_features), zero for unaffected features."""
        shape = (self.n_samples, self.input_len, self.n_features)
        # Noise is drawn for every feature and masked afterwards
        drawn = self.rng.normal(0, self.sd, shape)
        is_affected = np.zeros(self.n_features, dtype=bool)
        for name in self.affected_sensors:
            is_affected[self.df.columns.get_loc(name)] = True
        return np.where(is_affected, drawn, 0.0).astype(np.float32)

    def __getitem__(self, index):
        """Return sample ``index`` with its precomputed noise added to the input."""
        x, y = super().__getitem__(index)
        return x + self.noise[index], y


In [None]:
# Gaussian-noise perturbation at medium severity on both datasets.
severity = 0.5
pert_ds = NoisyDataset(severity=severity, target_prct_affected_sensors=0.7, **three_tank_args)
pert_ds0 = NoisyDataset(severity=severity, **swat_args)
plot_both(pert_ds, pert_ds0, 10)

In [None]:
class FlatSensorDataset(TSDataset):
    """Hold randomly chosen sensors at their last value for a fixed duration.

    The nominal duration goes linearly from 1 step (severity 0) to input_len steps (severity 1).
    Sensors are drawn from all features; start positions are drawn per sample and feature.
    """
    def __init__(self, severity=1., target_prct_affected_sensors=0.05,  **kwargs):
        super().__init__(**kwargs)
        assert 0 <= severity <= 1, "Severity must be between 0 and 1."
        self.severity = severity
        self.target_prct_affected_sensors = target_prct_affected_sensors
        self.affected_sensors, self.flat_duration = self.set_params(severity)
        # Starts begin at 1 so that a preceding value exists; only the columns of affected sensors are read
        start_upper_bound = self.input_len - self.flat_duration + 2
        self.flat_start_pos = self.rng.integers(1, start_upper_bound, size=(self.n_samples, self.n_features))

    def set_params(self, severity):
        """Draw the affected sensors and derive the duration; returns (affected_sensors, flat_duration)."""
        # The lower bound guarantees that at least one sensor is targeted
        share = max(1 / self.n_features + 1e-9, self.target_prct_affected_sensors)
        chosen = self.rng.choice(self.feature_names, int(self.n_features * share), replace=False)

        shortest, longest = 1, self.input_len
        duration = int(shortest + severity * (longest - shortest))
        return chosen, duration

    def __getitem__(self, index):
        """Return sample ``index`` with each affected input column frozen over its flat window."""
        x, y = super().__getitem__(index)
        for name in self.affected_sensors:
            col = self.df.columns.get_loc(name)
            begin = self.flat_start_pos[index, col]
            stop = begin + self.flat_duration
            # Repeat the value right before the window
            x[begin:stop, col] = x[begin - 1, col]
        return x, y


In [None]:
# Flat-sensor perturbation at medium severity on both datasets.
severity = 0.5
pert_ds = FlatSensorDataset(severity=severity, target_prct_affected_sensors=0.7, **three_tank_args)
pert_ds0 = FlatSensorDataset(severity=severity, **swat_args)
plot_both(pert_ds, pert_ds0, 10)

In [None]:
class MissingValueDataset(TSDataset):
    """Remove a random time window from the data.

    A stretch of consecutive rows is cut out of the input window of every sample and the
    following rows move up, so x and y keep their lengths.

    Args:
        severity (float): value in [0, 1]; scales the number of removed rows linearly
            from 1 to int(input_len * 0.5).
        target_prct_affected_sensors (float): accepted for interface compatibility with the
            other perturbations but not used, since removing rows affects all sensors.
        **kwargs: forwarded to TSDataset.
    """
    def __init__(self, severity=1., target_prct_affected_sensors=1., **kwargs):
        super().__init__(**kwargs)
        assert 0 <= severity <= 1, "Severity must be between 0 and 1."
        self.severity = severity
        self.affected_sensors, self.missing_duration = self.set_params(severity)
        self.missing_start_pos = self.rng.integers(0, self.input_len - self.missing_duration, size=(self.n_samples))

    def set_params(self, severity):
        """Derive the number of removed rows from the severity.

        Returns:
            tuple: (affected_sensors, missing_duration)
        """
        affected_sensors = self.feature_names.values  # all sensors are affected, regardless of choice of target_prct_affected_sensors

        min_missing_duration = 1
        max_missing_duration = int(self.input_len * 0.5)
        missing_duration = min_missing_duration + int(severity * (max_missing_duration - min_missing_duration))

        return affected_sensors, missing_duration

    def __getitem__(self, index):
        """Return sample ``index`` with missing_duration consecutive rows removed from the input."""
        start_idx = self.sample_idxs[index]
        # Read missing_duration extra rows so that x and y keep their lengths after the removal
        end_idx = start_idx + self.input_len + self.target_len + self.missing_duration
        window = self.df.iloc[start_idx:end_idx].to_numpy()

        # Remove the rows by position. Dropping by index label would remove additional rows
        # whenever the index contains duplicate labels (e.g. repeated timestamps).
        missing_start = self.missing_start_pos[index]
        missing_end = missing_start + self.missing_duration
        keep = np.ones(len(window), dtype=bool)
        keep[missing_start:missing_end] = False
        window = window[keep].astype(np.float32)

        x = window[:self.input_len]
        y = window[self.input_len:]

        return x, y


In [None]:
# Missing-value perturbation at medium severity on both datasets.
severity = 0.5
pert_ds = MissingValueDataset(severity=severity, **three_tank_args)
pert_ds0 = MissingValueDataset(severity=severity, **swat_args)
plot_both(pert_ds, pert_ds0, 10)

In [None]:
class OutlierDataset(TSDataset):
    """Add a single outlier to randomly chosen continuous features of the input window.

    The outlier magnitude goes linearly from 1 (severity 0) to 100 (severity 1).
    Outlier positions are drawn once at construction time, per sample and affected sensor.
    """
    def __init__(self, severity=1., target_prct_affected_sensors=0.05, **kwargs):
        super().__init__(**kwargs)
        assert 0 <= severity <= 1, "Severity must be between 0 and 1."
        self.severity = severity
        self.target_prct_affected_sensors = target_prct_affected_sensors
        self.affected_sensors, self.hickup_value = self.set_params(severity)
        self.fault_mask = self._create_fault_mask()

    def set_params(self, severity):
        """Draw the affected sensors and derive the outlier size; returns (affected_sensors, hickup_value)."""
        # The lower bound guarantees that at least one sensor is targeted
        share = max(1 / self.n_features + 1e-9, self.target_prct_affected_sensors)
        count = min(int(self.n_features * share), len(self.continuous_features))
        chosen = self.rng.choice(self.continuous_features, count, replace=False)  # continuous features only

        smallest, largest = 1, 100
        magnitude = smallest + severity * (largest - smallest)
        return chosen, magnitude

    def _create_fault_mask(self):
        """Return a float32 mask of shape (n_samples, input_len, n_features) holding one outlier per sample and affected sensor."""
        mask = np.zeros((self.n_samples, self.input_len, self.n_features), dtype=np.float32)
        cols = [self.df.columns.get_loc(name) for name in self.affected_sensors]
        for row in range(self.n_samples):
            for col in cols:
                # Positions are drawn from 1 to input_len - 1
                position = self.rng.integers(1, self.input_len)
                mask[row, position, col] = self.hickup_value
        return mask

    def __getitem__(self, index):
        """Return sample ``index`` with its outlier mask added to the input."""
        x, y = super().__getitem__(index)
        return x + self.fault_mask[index], y


In [None]:
# Outlier perturbation at medium severity on both datasets.
severity = 0.5
pert_ds = OutlierDataset(severity=severity, **three_tank_args)
pert_ds0 = OutlierDataset(severity=severity, **swat_args)
plot_both(pert_ds, pert_ds0, 10)

In [None]:
class FasterSamplingDataset(TSDataset):
    """Irregularly sample the data by warping the time axis of the input sequence.

    During a window of int(input_len * 0.5) steps the affected sensors are read with a
    time step between 1 (severity 0) and 3 (severity 1). Time stamps past the end of the
    input are filled with the last input value by the interpolation.
    """
    def __init__(self, severity=1., target_prct_affected_sensors=0.05, **kwargs):
        super().__init__(**kwargs)
        assert 0 <= severity <= 1, "Severity must be between 0 and 1."
        self.severity = severity
        self.target_prct_affected_sensors = target_prct_affected_sensors
        self.affected_sensors, self.warp_factor, self.warp_duration = self.set_params(severity)
        # The warp starts in the first half of the input; one start per sample, shared by all affected sensors
        first_half = int(self.input_len * 0.5)
        self.warp_start_pos = self.rng.integers(0, first_half, size=(self.n_samples))

    def set_params(self, severity):
        """Draw the affected sensors and derive the warp; returns (affected_sensors, warp_factor, warp_duration)."""
        # The lower bound guarantees that at least one sensor is targeted
        share = max(1 / self.n_features + 1e-9, self.target_prct_affected_sensors)
        chosen = self.rng.choice(self.feature_names, int(self.n_features * share), replace=False)

        slowest, fastest = 1., 3.
        factor = slowest + severity * (fastest - slowest)
        duration = int(self.input_len * 0.5)
        return chosen, factor, duration

    def __getitem__(self, index):
        """Return sample ``index`` with the warp window of each affected input column resampled."""
        x, y = super().__getitem__(index)

        begin = self.warp_start_pos[index]
        stop = begin + self.warp_duration
        regular_grid = np.arange(self.input_len)
        # Time stamps advancing by warp_factor per step; a factor of 1 reproduces the original positions
        warped_grid = np.cumsum(np.full(self.warp_duration, self.warp_factor)) + begin - 1

        for name in self.affected_sensors:
            col = self.df.columns.get_loc(name)
            x[begin:stop, col] = np.interp(warped_grid, regular_grid, x[:, col])

        return x, y

In [None]:
# Faster-sampling perturbation at medium severity on both datasets.
severity = 0.5
pert_ds = FasterSamplingDataset(severity=severity, target_prct_affected_sensors=0.7, **three_tank_args)
pert_ds0 = FasterSamplingDataset(severity=severity, target_prct_affected_sensors=0.2, **swat_args)
plot_both(pert_ds, pert_ds0, 1)


In [None]:
class SlowerSamplingDataset(TSDataset):
    """Irregularly sample the data by warping the time axis of the input sequence.

    During a window of int(input_len * 0.5) steps the affected sensors are read with a
    time step between 1 (severity 0) and 0 (severity 1). After the window the original
    values follow directly, so the signal jumps back to the original time axis.
    """
    def __init__(self, severity=1., target_prct_affected_sensors=0.05, **kwargs):
        super().__init__(**kwargs)
        assert 0 <= severity <= 1, "Severity must be between 0 and 1."
        self.severity = severity
        self.target_prct_affected_sensors = target_prct_affected_sensors
        self.affected_sensors, self.warp_factor, self.warp_duration = self.set_params(severity)
        # The warp starts in the first half of the input; one start per sample, shared by all affected sensors
        first_half = int(self.input_len * 0.5)
        self.warp_start_pos = self.rng.integers(0, first_half, size=(self.n_samples))

    def set_params(self, severity):
        """Draw the affected sensors and derive the warp; returns (affected_sensors, warp_factor, warp_duration)."""
        # The lower bound guarantees that at least one sensor is targeted
        share = max(1 / self.n_features + 1e-9, self.target_prct_affected_sensors)
        chosen = self.rng.choice(self.feature_names, int(self.n_features * share), replace=False)

        regular, frozen = 1., 0.
        factor = regular + severity * (frozen - regular)
        duration = int(self.input_len * 0.5)
        return chosen, factor, duration

    def __getitem__(self, index):
        """Return sample ``index`` with the warp window of each affected input column resampled."""
        x, y = super().__getitem__(index)

        begin = self.warp_start_pos[index]
        stop = begin + self.warp_duration
        regular_grid = np.arange(self.input_len)
        # Time stamps advancing by warp_factor per step; a factor of 1 reproduces the original positions
        warped_grid = np.cumsum(np.full(self.warp_duration, self.warp_factor)) + begin - 1

        for name in self.affected_sensors:
            col = self.df.columns.get_loc(name)
            x[begin:stop, col] = np.interp(warped_grid, regular_grid, x[:, col])

        return x, y


In [None]:
# Slower-sampling perturbation at medium severity on both datasets.
severity = .5
pert_ds = SlowerSamplingDataset(severity=severity, target_prct_affected_sensors=0.7, **three_tank_args)
pert_ds0 = SlowerSamplingDataset(severity=severity, target_prct_affected_sensors=0.2, **swat_args)
plot_both(pert_ds, pert_ds0, 1)

In [None]:
class WrongValueDataset(TSDataset):
    """A discrete sensor or actuator shows a wrong value.

    Randomly chosen discrete features of the input window are set to the fixed value 2 for a
    nominal duration between 1 step (severity 0) and input_len steps (severity 1).
    """
    def __init__(self, severity=1., target_prct_affected_sensors=0.05, **kwargs):
        super().__init__(**kwargs)
        assert 0 <= severity <= 1, "Severity must be between 0 and 1."
        self.severity = severity
        self.target_prct_affected_sensors = target_prct_affected_sensors
        self.affected_sensors, self.wrong_duration = self.set_params(severity)
        # One start position per (sample, feature); only the columns of affected sensors are read
        start_upper_bound = self.input_len - self.wrong_duration + 2
        self.wrong_start_pos = self.rng.integers(1, start_upper_bound, size=(self.n_samples, self.n_features))

    def set_params(self, severity):
        """Draw the affected sensors and derive the duration; returns (affected_sensors, wrong_duration).

        Raises:
            ValueError: if the dataset has no discrete features.
        """
        if len(self.discrete_features) == 0:
            raise ValueError("No discrete features available.")
        # The lower bound guarantees that at least one sensor is targeted
        share = max(1 / self.n_features + 1e-9, self.target_prct_affected_sensors)
        count = min(int(self.n_features * share), len(self.discrete_features))
        chosen = self.rng.choice(self.discrete_features, count, replace=False)  # discrete features only

        shortest, longest = 1, self.input_len
        duration = int(shortest + severity * (longest - shortest))
        return chosen, duration

    def __getitem__(self, index):
        """Return sample ``index`` with the wrong value written into each affected input column."""
        x, y = super().__getitem__(index)
        for name in self.affected_sensors:
            col = self.df.columns.get_loc(name)
            begin = self.wrong_start_pos[index, col]
            x[begin:begin + self.wrong_duration, col] = 2  # set a fixed wrong value
        return x, y



In [None]:
# Wrong-value perturbation on the SWaT data only.
severity = 0.5
pert_ds0 = WrongValueDataset(severity=severity, **swat_args)
plot(pert_ds0, 10, ds0)

In [None]:
class OscillationDataset(TSDataset):
    """A discrete sensor or actuator oscillates between two values.

    Within a window of the input, randomly chosen discrete features switch randomly between the
    value right before the window and a different value. The window length goes from 1 step
    (severity 0) to input_len - 1 steps (severity 1). The oscillation is drawn on every access.
    """
    def __init__(self, severity=1., target_prct_affected_sensors=0.05, **kwargs):
        super().__init__(**kwargs)
        assert 0 <= severity <= 1, "Severity must be between 0 and 1."
        self.severity = severity
        self.target_prct_affected_sensors = target_prct_affected_sensors
        self.affected_sensors, self.osc_duration = self.set_params(severity)
        # One start position per (sample, feature); only the columns of affected sensors are read
        start_upper_bound = self.input_len - self.osc_duration + 1
        self.osc_start_pos = self.rng.integers(1, start_upper_bound, size=(self.n_samples, self.n_features))

    def set_params(self, severity):
        """Draw the affected sensors and derive the duration; returns (affected_sensors, osc_duration).

        Raises:
            ValueError: if the dataset has no discrete features.
        """
        if len(self.discrete_features) == 0:
            raise ValueError("No discrete features available.")
        # The lower bound guarantees that at least one sensor is targeted
        share = max(1 / self.n_features + 1e-9, self.target_prct_affected_sensors)
        count = min(int(self.n_features * share), len(self.discrete_features))
        chosen = self.rng.choice(self.discrete_features, count, replace=False)  # discrete features only

        shortest = 1
        longest = self.input_len - 1  # start value is not affected (technically it is, but it is not visible)
        duration = int(shortest + severity * (longest - shortest))
        return chosen, duration

    def __getitem__(self, index):
        """Return sample ``index`` with each affected input column oscillating inside its window."""
        x, y = super().__getitem__(index)
        for name in self.affected_sensors:
            col = self.df.columns.get_loc(name)
            begin = self.osc_start_pos[index, col]
            stop = begin + self.osc_duration

            held = x[begin - 1, col]
            column = self.df[name]
            if column.nunique() == 1:
                # A feature with a single value is standardized to 0, so 1 serves as the second value
                other = 1
            else:
                candidates = column.unique().astype(np.float32)
                other = self.rng.choice(candidates[candidates != held])  # force a different value
            x[begin:stop, col] = self.rng.choice([held, other], size=self.osc_duration, replace=True)

        return x, y


In [None]:
# Oscillation perturbation on the SWaT data only.
severity = 0.5
pert_ds0 = OscillationDataset(severity=severity, **swat_args)
plot(pert_ds0, 10, ds0)

The affected actuator only has two standardized values: [-18.366959  ,   0.05444548] (the -18 is very rare).

### Other Perturbations

In [None]:
class ClippedIrregularSamplingDataset(TSDataset):
    """Irregularly sample the data by randomly warping the time axis of the input sequence.

    The time steps are drawn from a normal distribution around 1 and clipped to [0, 2], so the
    warped time stamps never decrease. The time steps are drawn anew on every access.

    Args:
        severity (float): value in [0, 1]; scales the standard deviation of the time steps
            linearly from 0 to 2.
        target_prct_affected_sensors (float): targeted share of features to perturb. At least one
            sensor is targeted; the count is capped at the number of continuous features.
        **kwargs: forwarded to TSDataset.
    """
    def __init__(self, severity=1., target_prct_affected_sensors= 0.05, **kwargs):
        super().__init__(**kwargs)
        assert 0 <= severity <= 1, "Severity must be between 0 and 1."
        self.severity = severity
        self.target_prct_affected_sensors = target_prct_affected_sensors
        self.affected_sensors, self.warp_factor = self.set_params(severity)

    def set_params(self, severity):
        """Draw the affected sensors and derive the warp factor from the severity.

        Returns:
            tuple: (affected_sensors, warp_factor)
        """
        min_prct_affected_sensors = 1 / self.n_features + 1e-9  # at least one sensor must be affected
        prct_affected_sensors = max(min_prct_affected_sensors, self.target_prct_affected_sensors)
        # Sensors are drawn without replacement from the continuous features only, so the count must not exceed their number
        n_affected_sensors = min(int(self.n_features * prct_affected_sensors), len(self.continuous_features))
        affected_sensors = self.rng.choice(self.continuous_features, n_affected_sensors, replace=False)  # continuous features only

        min_warp_factor = 0.
        max_warp_factor = 2.
        warp_factor = min_warp_factor + severity * (max_warp_factor - min_warp_factor)

        return affected_sensors, warp_factor

    def __getitem__(self, index):
        """Return sample ``index`` with the affected input columns resampled at the warped time stamps."""
        x, y = super().__getitem__(index)

        original_time_index = np.arange(x.shape[0])

        noisy_time = self.rng.normal(loc=1.0, scale=self.warp_factor, size=x.shape[0] - 1)
        noisy_time = np.clip(noisy_time, 0., 2.)  # Make sure the timestamps do not overlap
        noisy_time = np.insert(noisy_time, 0, 0)  # First time point is always the same
        noisy_time_index = np.cumsum(noisy_time)

        for sensor in self.affected_sensors:
            sensor_idx = self.df.columns.get_loc(sensor)
            x[:, sensor_idx] = np.interp(noisy_time_index, original_time_index, x[:, sensor_idx])

        return x, y

In [None]:
# Clipped irregular-sampling perturbation at medium severity on both datasets.
severity = 0.5
pert_ds = ClippedIrregularSamplingDataset(severity=severity, **three_tank_args)
pert_ds0 = ClippedIrregularSamplingDataset(severity=severity, **swat_args)
plot_both(pert_ds, pert_ds0, 1)

In [None]:
class SortedIrregularSamplingDataset(TSDataset):
    """Irregularly sample the data by randomly warping the time axis of the input sequence.

    Random step sizes may be negative, so the cumulative time positions are sorted to
    keep them non-decreasing. Positions that fall outside the original time range are
    clamped to the first/last value by ``np.interp``.

    Args:
        severity (float): perturbation strength in [0, 1]; scales the standard deviation of the step sizes.
        target_prct_affected_sensors (float): desired fraction of sensors to perturb.
        **kwargs: forwarded unchanged to the parent constructor.
    """
    def __init__(self, severity=1., target_prct_affected_sensors=0.05, **kwargs):
        super().__init__(**kwargs)
        assert 0 <= severity <= 1, "Severity must be between 0 and 1."
        self.severity = severity
        self.target_prct_affected_sensors = target_prct_affected_sensors
        self.affected_sensors, self.warp_factor = self.set_params(severity)

    def set_params(self, severity):
        """Pick the affected sensors and derive the warp factor from the severity.

        Returns:
            tuple: ``(affected_sensors, warp_factor)``; the warp factor grows linearly
                from 0 (severity 0) to 2 (severity 1).
        """
        min_prct_affected_sensors = 1 / self.n_features + 1e-9  # at least one sensor must be affected
        prct_affected_sensors = max(min_prct_affected_sensors, self.target_prct_affected_sensors)
        n_affected_sensors = int(self.n_features * prct_affected_sensors)
        # The count is derived from all features, but only continuous features can be drawn.
        # Cap it so sampling without replacement cannot ask for more than is available.
        n_affected_sensors = min(n_affected_sensors, len(self.continuous_features))
        affected_sensors = self.rng.choice(self.continuous_features, n_affected_sensors, replace=False)  # continuous features only

        min_warp_factor = 0.
        max_warp_factor = 2.
        warp_factor = min_warp_factor + severity * (max_warp_factor - min_warp_factor)

        return affected_sensors, warp_factor

    def __getitem__(self, index):
        """Return sample ``index`` with the affected sensors resampled on a warped time axis.

        Only the input ``x`` is perturbed; the target ``y`` is returned as produced by the parent.
        """
        x, y = super().__getitem__(index)

        original_time_index = np.arange(x.shape[0])

        noisy_time = self.rng.normal(loc=1.0, scale=self.warp_factor, size=x.shape[0] - 1)
        noisy_time = np.insert(noisy_time, 0, 0)  # First time point is always the same
        noisy_time_index = np.cumsum(noisy_time)
        # Unclipped steps can be negative; sorting restores a non-decreasing time axis.
        noisy_time_index = np.sort(noisy_time_index)

        for sensor in self.affected_sensors:
            sensor_idx = self.df.columns.get_loc(sensor)
            x[:, sensor_idx] = np.interp(noisy_time_index, original_time_index, x[:, sensor_idx])

        return x, y

In [None]:
# Sorted irregular sampling at medium severity on both datasets.
severity = 0.5
pert_ds = SortedIrregularSamplingDataset(**three_tank_args, severity=severity)
pert_ds0 = SortedIrregularSamplingDataset(**swat_args, severity=severity)
plot_both(pert_ds, pert_ds0, 1)

In [None]:
class WarpedWindowDataset(TSDataset):
    """Speed up a window of each sample by a factor of 2.

    Inside the window only every second row of the data frame is kept; the rows before
    and after it are taken as they are. The window length grows with the severity and
    its start position is drawn once per sample at construction time.

    Args:
        severity (float): perturbation strength in [0, 1]; controls the window length.
        target_prct_affected_sensors (float): desired fraction of sensors to perturb.
        **kwargs: forwarded unchanged to the parent constructor.
    """
    def __init__(self, severity=1., target_prct_affected_sensors=0.05, **kwargs):
        super().__init__(**kwargs)
        assert 0 <= severity <= 1, "Severity must be between 0 and 1."
        self.severity = severity
        self.target_prct_affected_sensors = target_prct_affected_sensors
        self.affected_sensors, self.warp_duration = self.set_params(severity)
        # Latest start offset at which the window still fits inside the input sequence.
        latest_start = self.input_len - self.warp_duration
        self.warp_start_pos = self.rng.integers(0, latest_start + 1, size=(self.n_samples))

    def set_params(self, severity):
        """Return ``(affected_sensors, warp_duration)`` for the given severity.

        The duration grows linearly from 1 step (severity 0) to ``input_len`` steps (severity 1).
        """
        floor_prct = 1 / self.n_features + 1e-9  # at least one sensor must be affected
        prct = max(floor_prct, self.target_prct_affected_sensors)
        n_affected = int(self.n_features * prct)
        # NOTE(review): the drawn sensor indices are stored but __getitem__ warps all columns.
        affected = self.rng.choice(self.n_features, n_affected, replace=False)

        shortest, longest = 1, self.input_len
        warp_duration = shortest + int(severity * (longest - shortest))

        return affected, warp_duration

    def __getitem__(self, index):
        """Return ``(x, y)`` for sample ``index`` with the warped window spliced in."""
        first = self.sample_idxs[index]
        warp_first = first + self.warp_start_pos[index]
        warp_last = warp_first + 2 * self.warp_duration
        # The window consumes warp_duration extra rows, so read that many more in total.
        # NOTE(review): if this runs past the end of the frame, iloc slicing silently truncates.
        last = first + self.input_len + self.target_len + self.warp_duration

        rows = self.df.iloc
        segments = [
            rows[first:warp_first].to_numpy(),
            rows[warp_first:warp_last:2].to_numpy(),  # every second row -> twice as fast
            rows[warp_last:last].to_numpy(),
        ]
        window = np.concatenate(segments, axis=0)

        split = self.input_len
        x = window[:split].astype(np.float32)
        y = window[split:].astype(np.float32)
        return x, y

In [None]:
# Warped-window perturbation at medium severity on both datasets.
severity = 0.5
pert_ds = WarpedWindowDataset(**three_tank_args, severity=severity)
pert_ds0 = WarpedWindowDataset(**swat_args, severity=severity)
plot_both(pert_ds, pert_ds0, 10)

In [None]:
# warp the frequency domain
from scipy.fftpack import fft, ifft

class FrequencyWarpedDataset(TSDataset):
    """Warp the time axis of the whole data frame with random step sizes.

    The spacing between consecutive samples is drawn from a normal distribution with
    mean 1, the resulting time axis is rescaled to the original time span, and every
    feature is linearly interpolated at the warped time points.

    The warp is applied once, at construction time, to ``self.df``. Samples are then
    served from the warped frame by the inherited ``__getitem__``, so no per-sample
    warping is needed.

    Args:
        warp_factor (float): standard deviation of the normal distribution
        **kwargs: forwarded unchanged to the parent constructor.
    """
    def __init__(self, warp_factor=0.2, **kwargs):
        super().__init__(**kwargs)
        self.warp_factor = warp_factor
        self.df = self.frequency_warp()

    def frequency_warp(self):
        """Return a copy of ``self.df`` whose values are resampled on a warped time axis.

        Returns:
            pd.DataFrame: same columns and same index as ``self.df``.
        """
        index = self.df.index
        # Numeric view of the time axis (nanoseconds for a DatetimeIndex).
        # Assumes the index is sorted in increasing order, as np.interp requires.
        if isinstance(index, pd.DatetimeIndex):
            original_time_index = np.asarray(pd.to_numeric(index), dtype=float)
        else:
            original_time_index = np.asarray(index, dtype=float)

        # Generate the warped time index
        warp = self.rng.normal(loc=1.0, scale=self.warp_factor, size=len(original_time_index) - 1)
        warp = np.insert(warp, 0, 1)
        noisy_time_index = np.cumsum(warp)
        # Rescale to the original time span; np.interp clamps anything outside it,
        # so the interpolation below never has to extrapolate.
        noisy_time_index = np.interp(
            noisy_time_index,
            (noisy_time_index[0], noisy_time_index[-1]),
            (original_time_index[0], original_time_index[-1]),
        )

        # Interpolate each feature from the original time index to the warped time index
        warped = {
            feature: np.interp(noisy_time_index, original_time_index, self.df[feature].to_numpy())
            for feature in self.df.columns
        }

        # Keep the original index so the frame stays on its regular time grid.
        return pd.DataFrame(warped, index=index)

In [None]:
from scipy.fft import fft, ifft


class FrequencyDataset(TSDataset):
    """
    Perturb every feature by adding Gaussian noise to its Fourier spectrum.

    The whole data frame is transformed once at construction time: FFT, add noise,
    inverse FFT, keep the real part.
    Args:
        noise_sd (float): standard deviation of the Gaussian noise to be added in the frequency domain.
        ignore_imaginary (bool): whether to ignore the imaginary part after inverse FFT. Defaults to True.
    """
    def __init__(self, noise_sd=0.01, ignore_imaginary=True, **kwargs):
        super().__init__(**kwargs)
        self.noise_sd = noise_sd
        self.ignore_imaginary = ignore_imaginary
        self.df = self.apply_frequency_noise()

    def _noisy_signal(self, values):
        """Return ``values`` after one FFT -> add noise -> inverse FFT round trip."""
        spectrum = fft(values)
        # The noise is real-valued and is added to the complex spectrum.
        spectrum_noisy = spectrum + self.rng.normal(0, self.noise_sd, len(spectrum))
        signal = ifft(spectrum_noisy)

        # Check and handle imaginary parts
        if not self.ignore_imaginary:
            if np.max(np.abs(signal.imag)) > 1e-10:
                raise ValueError("Significant imaginary component detected after inverse FFT")

        return signal.real

    def apply_frequency_noise(self):
        """Return a new frame, indexed like ``self.df``, with every column perturbed."""
        noisy_columns = {
            feature: self._noisy_signal(self.df[feature].to_numpy())
            for feature in self.df.columns
        }
        return pd.DataFrame(noisy_columns, index=self.df.index)

In [None]:
# Frequency-domain noise on the three-tank data, reusing the scaling statistics of `ds`.
fds = FrequencyDataset(
    file_path="../data/processed/three_tank_data.csv",
    n_samples=100,
    seed=42,
    mean_vals=ds.mean_vals,
    sd_vals=ds.sd_vals,
    noise_sd=10,
)
plot(fds, 1, ds)

Adding frequency-domain noise does not really work for the three-tank data. Try SWaT instead.

In [None]:
# plot the frequency domain
def plot_frequency_domain(df, original_df=None):
    """Plot the FFT magnitude of every column of ``df``.

    Args:
        df (pd.DataFrame): frame whose columns are plotted, one labelled line each.
        original_df (pd.DataFrame): optional reference frame; its columns are overlaid
            as unlabelled grey dashed lines.
    """
    def magnitude(frame, column):
        return np.abs(fft(frame[column].to_numpy()))

    plt.figure(figsize=(15, 5))
    for column in df.columns:
        plt.plot(magnitude(df, column), label=column)
    # The legend is built before the overlay is drawn, so it lists the columns of df only.
    plt.legend()

    reference_columns = [] if original_df is None else original_df.columns
    for column in reference_columns:
        plt.plot(magnitude(original_df, column), color='grey', linestyle='--', alpha=0.5)
    plt.show()

# Spectrum of the noisy data, with the unperturbed data as grey reference.
plot_frequency_domain(df=fds.df, original_df=ds.df)

In [None]:
# Spectrum of the difference between the unperturbed and the noisy data.
plot_frequency_domain(df=ds.df.sub(fds.df))

In [None]:
class FrequencyFilterDataset(TSDataset):
    """Scale one frequency band of every feature by a random factor.

    The spectrum is split into three bands (low, middle, high). One band and one factor
    are drawn once at construction time and applied to every column of the data frame.

    Args:
        factor_range (tuple): ``(min_factor, max_factor)`` from which the factor is drawn uniformly.
        **kwargs: forwarded unchanged to the parent constructor.

    Raises:
        ValueError: if ``factor_range`` does not have two entries or is not ordered.
    """
    def __init__(self, factor_range=(0.75, 1.25), **kwargs):
        super().__init__(**kwargs)
        if len(factor_range) != 2 or factor_range[0] > factor_range[1]:
            raise ValueError("factor_range must be a tuple (min_factor, max_factor) with min_factor <= max_factor")
        self.factor_range = factor_range
        self.df = self.apply_frequency_perturbation()

    def apply_frequency_perturbation(self):
        """Return a new frame, indexed like ``self.df``, with one band of every column rescaled."""
        df_new = pd.DataFrame(index=self.df.index)
        # Band and factor are drawn once and shared by all features.
        band = self.rng.choice([0, 1, 2])
        factor = self.rng.uniform(*self.factor_range)
        for feature in self.df.columns:
            feature_data = self.df[feature].to_numpy()
            df_new[feature] = self.perturb_feature(feature_data, factor, band)
        return df_new

    def perturb_feature(self, data, factor, band):
        """Multiply one frequency band of ``data`` by ``factor`` and return the real signal.

        Args:
            data (np.ndarray): one-dimensional time series.
            factor (float): multiplier applied to the selected band.
            band (int): 0 (low), 1 (middle) or 2 (high).

        Returns:
            np.ndarray: perturbed time series, same length as ``data``.
        """
        freq_data = fft(data)
        n = len(data)
        half_n = (n + 1) // 2
        # Number of non-negative frequency bins; for even n this includes the Nyquist bin.
        n_bins = n // 2 + 1

        # Calculate start and end indices for each band
        band_ranges = [(0, half_n // 3), (half_n // 3, 2 * half_n // 3), (2 * half_n // 3, n_bins)]
        start, end = band_ranges[band]

        # Bin k of a real signal is mirrored at bin n - k. Both must be scaled alike,
        # otherwise the spectrum loses its conjugate symmetry and `.real` silently
        # drops part of the signal. union1d removes the duplicates that arise for
        # the self-mirrored DC and Nyquist bins, so no bin is scaled twice.
        positive_bins = np.arange(start, end)
        selected_bins = np.union1d(positive_bins, (n - positive_bins) % n)
        freq_data[selected_bins] *= factor

        time_data_altered = ifft(freq_data).real
        return time_data_altered

In [None]:
# Amplify one frequency band of the three-tank data by a fixed factor of 10.
ffds = FrequencyFilterDataset(
    file_path="../data/processed/three_tank_data.csv",
    n_samples=100,
    seed=42,
    mean_vals=ds.mean_vals,
    sd_vals=ds.sd_vals,
    factor_range=(10, 10),
)
plot(ffds, 30, ds)

In [None]:
seed = 1000

# Reference SWaT dataset; its scaling statistics are reused for the perturbed copy below.
ds0 = TSDataset(
    file_path="../data/processed/SWaT_Dataset_Normal_v1_sensors.parquet",
    first_column_is_date=True,
    seed=seed,
    n_samples=10,
)
ds0.set_scaler_params()
ds0.scale_data()

# Same data with one frequency band removed entirely (factor 0).
ffds0 = FrequencyFilterDataset(
    file_path="../data/processed/SWaT_Dataset_Normal_v1_sensors.parquet",
    first_column_is_date=True,
    seed=seed,
    n_samples=10,
    mean_vals=ds0.mean_vals,
    sd_vals=ds0.sd_vals,
    factor_range=(0, 0),
)
plot(ffds0, 1, ds0)

In [None]:
def single_sided_amplitude_spectrum(time_series, sampling_rate):
    """Return the single-sided amplitude spectrum of a real-valued signal.

    Args:
        time_series (array-like): one-dimensional real signal.
        sampling_rate (float): samples per unit of time.

    Returns:
        tuple: ``(frequencies, amplitudes)`` for the first ``n // 2`` bins, i.e. the
            non-negative frequencies below the Nyquist frequency.
    """
    values = np.asarray(time_series)
    n = len(values)
    y_fft = np.fft.fft(values)

    frequencies = np.fft.fftfreq(n, d=1/sampling_rate)
    positive_freqs = frequencies[:n//2]
    # Each positive-frequency bin shares its energy with a mirrored negative bin,
    # hence the factor 2 when folding the spectrum to one side.
    positive_amplitude = np.abs(y_fft[:n//2]) * 2/n
    # The DC bin has no mirrored partner, so it must not be doubled.
    if positive_amplitude.size:
        positive_amplitude[0] /= 2

    return positive_freqs, positive_amplitude


def plot_time_series_and_frequency_spectrum(time, time_series, sampling_rate):
    """Plot a signal next to its single-sided amplitude spectrum.

    Args:
        time (array-like): x-axis values for the time-domain plot.
        time_series (array-like): one-dimensional real signal.
        sampling_rate (float): samples per unit of time; sets the frequency axis.
    """
    positive_freqs, positive_amplitude = single_sided_amplitude_spectrum(time_series, sampling_rate)

    # Plotting
    fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(15, 5))
    ax[0].plot(time, time_series)
    ax[0].set_title('Time Series')
    ax[0].set_xlabel('Time')
    ax[0].set_ylabel('Amplitude')

    ax[1].stem(positive_freqs, positive_amplitude)
    ax[1].set_title('Frequency Spectrum')
    ax[1].set_xlabel('Frequency (Hz)')
    ax[1].set_ylabel('Amplitude')

    plt.show()

# LIT101 before and after the frequency filter.
plot_time_series_and_frequency_spectrum(
    time=ds0.df.index, time_series=ds0.df['LIT101'], sampling_rate=1
)
plot_time_series_and_frequency_spectrum(
    time=ffds0.df.index, time_series=ffds0.df['LIT101'], sampling_rate=1
)