# Generate the 3 tank dataset


In [None]:
cd ..

In [None]:
from typing import Tuple

import numpy as np
import pandas as pd
import pytorch_lightning as pl
from torch.utils.data import Dataset, Subset, DataLoader
from scipy.integrate import odeint
from tqdm import tqdm

### Simulation Class

In [None]:
class ThreeTankSimulation:
    """Simulates the three tank system.

    States (one setting of inflows and valve coefficients plus a duration) are
    registered with ``add_state`` and replayed in cycles by ``simulate``.
    The system is simulated using the scipy odeint function.
    """

    def __init__(self, tank_1_lvl=0, tank_2_lvl=0, tank_3_lvl=0, seed=42):
        # initial water levels, used as the start value of the first section
        self.tank_levels = np.array([tank_1_lvl, tank_2_lvl, tank_3_lvl])
        # seed for all noise drawn via the global numpy random state
        self.seed = seed
        # one row per registered state; the index holds the state name
        self.state_df = pd.DataFrame(columns=["q1", "q3", "kv1", "kv2", "kv3", "duration"])

    def add_state(self, q1: float, q3: float, kv1: float, kv2: float, kv3: float, duration: int, name=None) -> None:
        """Add a state to the state dataframe.
        A state consists of specific settings to the system's parameters.
        Args:
            q1 (float): inflow tank 1
            q3 (float): inflow tank 3
            kv1 (float): coefficient of the valve between tank 1 and 2
            kv2 (float): coefficient of the valve between tank 2 and 3
            kv3 (float): coefficient of the outgoing valve on tank 3
            duration (int): number of time steps of the state
            name (string): the name of the state. If omitted, the state is
                stored under the next free integer label.
        """
        if name is None:
            # DataFrame.append returned a new frame (which was discarded here)
            # and no longer exists in pandas 2.x, so unnamed states were never
            # stored. Enlarge in place under a fresh integer label instead.
            name = len(self.state_df)
            while name in self.state_df.index:
                name += 1
        self.state_df.loc[name] = [q1, q3, kv1, kv2, kv3, duration]

    @staticmethod
    def _system_dynamics_function(x, t, q1, q3, kv1, kv2, kv3):
        """Right-hand side of the tank ODE in the form expected by odeint.

        ``x`` holds the three tank levels, ``t`` is unused (the system is
        time-invariant within a section). Returns the three level derivatives.
        """
        # ensure non-negative tank levels
        x1, x2, x3 = x * (x > 0)
        # signed flows through the connecting valves (direction follows the level difference)
        flow_12 = kv1 * np.sign(x1 - x2) * np.sqrt(np.abs(x1 - x2))
        flow_23 = kv2 * np.sign(x2 - x3) * np.sqrt(np.abs(x2 - x3))
        # ODE
        dh1_dt = q1 - flow_12
        dh2_dt = flow_12 - flow_23
        dh3_dt = q3 + flow_23 - kv3 * np.sqrt(x3)

        return dh1_dt, dh2_dt, dh3_dt

    def _compute_section(self, duration: int = 10, x0: np.array = None,
                         kv1: float = 1, kv2: float = 1, kv3: float = 1,
                         q1: float = 1, q3: float = 1):
        """Integrate one state for ``duration`` steps starting at levels ``x0``.

        Returns the clipped trajectory (one row per time step) and its last
        row, which serves as the start value of the next section.
        """
        if x0 is None:
            # built per call instead of sharing one mutable default array
            x0 = np.array([30, 10, 50])
        t = np.arange(int(duration))
        y = odeint(self._system_dynamics_function, x0, t, (q1, q3, kv1, kv2, kv3))
        # non-negativity
        y = y * (y > 0)
        y_stop = y[-1, :]
        return y, y_stop

    @staticmethod
    def _duplicate_row(row, factor):
        """Concatenate ``factor`` copies of ``row`` side by side."""
        return pd.concat([row.copy()] * factor, axis=1)

    def _configuration_seq(self, cycle: list, nb_of_cycles: int,
                           sd_q: float, sd_kv: float, sd_dur: float,
                           leaky: bool, periodic_inflow: bool) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """generates configuration dataframes
        The configuration dataframe has one row per executed state (the
        cycle repeated ``nb_of_cycles`` times).
        Outputs original state configuration and configuration with noise.
        """
        # generate cycle of states; strings are looked up by name, anything else by position
        seq = []
        for _ in range(nb_of_cycles):
            for state in cycle:
                if isinstance(state, str):
                    seq.append(self.state_df.loc[state])
                else:
                    seq.append(self.state_df.iloc[state, :])
        seq_df = pd.concat(seq, axis=1).T.astype({"duration": int})
        seq_len = seq_df.shape[0]

        # add periodic inflow
        if periodic_inflow:
            amplitude = 0.5 * seq_df["q1"].max()
            wave = amplitude * np.cos(np.linspace(np.pi, 5 * np.pi, 2000))
            for col in ("q1", "q3"):
                active = seq_df[col] > 0
                # np.resize takes the first n samples and repeats the wave when
                # more than 2000 inflow states exist (the fixed-length slice
                # used before raised a shape error in that case)
                seq_df.loc[active, col] += np.resize(wave, int(active.sum()))

        # add noise
        np.random.seed(self.seed)
        seq_df_noise = seq_df.copy()
        if sd_q is not None:
            q_noise = np.random.normal(0, sd_q, 2 * seq_len)
            seq_df_noise["q1"] = seq_df["q1"] + q_noise[:seq_len]
            seq_df_noise["q3"] = seq_df["q3"] + q_noise[seq_len:]
            if not leaky:
                # no leaky inflow (set back to 0 if no inflow). Assign the
                # result back: `where(..., inplace=True)` on a column selection
                # is chained assignment and does nothing under Copy-on-Write.
                for col in ("q1", "q3"):
                    seq_df_noise[col] = seq_df_noise[col].where(seq_df[col] > 0, other=0)
        if sd_kv is not None:
            kv_noise = np.random.normal(0, sd_kv, 3 * seq_len)
            seq_df_noise["kv1"] = seq_df["kv1"] + kv_noise[:seq_len]
            seq_df_noise["kv2"] = seq_df["kv2"] + kv_noise[seq_len:2 * seq_len]
            seq_df_noise["kv3"] = seq_df["kv3"] + kv_noise[2 * seq_len:]
            if not leaky:
                # no leaky valve
                for col in ("kv1", "kv2", "kv3"):
                    seq_df_noise[col] = seq_df_noise[col].where(seq_df[col] > 0, other=0)
        if sd_dur is not None:
            dur_noise = np.random.normal(0, sd_dur, seq_len)
            seq_df_noise["duration"] = (seq_df["duration"] + dur_noise).round().astype(int)
        # no negative inflow etc.
        seq_df = seq_df.where(seq_df >= 0, 0)
        seq_df_noise = seq_df_noise.where(seq_df_noise >= 0, 0)

        return seq_df, seq_df_noise

    @staticmethod
    def _export_config(seq_df, seq_df_noise, export_path):
        """exports state configuration dataframe
        Transforms the dataframe so that the state at every time step is exported.
        Only the noisy configuration is written; ``seq_df`` is accepted for
        interface compatibility. The target file name is ``export_path``
        without its last four characters (e.g. ".csv") plus "_config.csv".
        """
        per_step_rows = []
        for _, row_noise in seq_df_noise.iterrows():
            duration = int(row_noise.duration)  # actual duration
            per_step_rows += [row_noise] * duration
        seq0_df = pd.concat(per_step_rows, axis=1).T
        seq0_df.to_csv(f"{export_path[:-4]}_config.csv", index=False)

    def simulate(self, cycle: list, nb_of_cycles: int = 10,
                 sd_q: float = None, sd_kv: float = None, sd_dur: float = None, sd_white_noise: float = None,
                 leaky: bool = False, periodic_inflow: bool = False,
                 export_path: str = None) -> np.array:
        """Simulates the dynamics in the three-tank system
        Args:
            cycle (list): sequence of states that compose a typical cycle.
                          Either list of integers or list of state names.
            nb_of_cycles (int): number of successive cycles to simulate
            sd_q (float): if set, white noise with this standard deviation is added to the inflow
            sd_kv (float): if set, white noise with this standard deviation is added to the valve coefficients
            sd_dur (float): if set, white noise with this standard deviation is added to the duration
            sd_white_noise (float): if set, white noise with this standard deviation is added to the simulated levels
            leaky (bool): if true, add noise on closed valves or stopped inflow
            periodic_inflow (bool): if true, add periodic variation to the inflow
            export_path (str): if set, save simulation data at export path
        Returns:
            array with one row per time step and one column per tank level
        """
        seq_denoised, seq = self._configuration_seq(cycle, nb_of_cycles, sd_q, sd_kv, sd_dur, leaky, periodic_inflow)

        y_ls = []
        y_stop = self.tank_levels
        for config in tqdm(seq.itertuples(), total=len(seq)):
            # each section starts where the previous one ended
            y, y_stop = self._compute_section(duration=config.duration, x0=y_stop,
                                              kv1=config.kv1, kv2=config.kv2, kv3=config.kv3,
                                              q1=config.q1, q3=config.q3)
            y_ls.append(y)
        y_out = np.concatenate(y_ls)

        if sd_white_noise is not None:
            np.random.seed(self.seed)
            y_out += np.random.normal(0, sd_white_noise, y_out.shape)

        if export_path is not None:
            y_df = pd.DataFrame(y_out, columns=['h1', 'h2', 'h3'])
            y_df.to_csv(export_path, index=False)
            self._export_config(seq_denoised, seq, export_path)

        return y_out


### Plot the Dataset

In [None]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.io as pio

In [None]:
def plot_simulation(data: np.array, interval=None, title=None, export_path=None, height=None, width=None):
    """Plot the three tank level signals as line traces and show the figure.

    Args:
        data: 2-D array whose columns are plotted as h1, h2 and h3
        interval: optional (start, stop) pair; only these rows are plotted
        title: figure title
        export_path: if set, the figure is also written to this path
        height: figure height passed to the layout
        width: figure width passed to the layout
    """
    if interval is not None:
        first_row, last_row = interval[0], interval[1]
        data = data[first_row:last_row, :]

    fig = make_subplots(rows=1, cols=1, shared_xaxes=True)

    # one line trace per column; zip stops at the shorter of columns / labels
    # labels = [r'$h_1$', r'$h_2$', r'$h_3$']
    labels = ['h1', 'h2', 'h3']
    for column, label in zip(data.T, labels):
        trace = go.Scatter(
            x=np.arange(column.shape[0]),
            y=column,
            name=label,
            mode="lines",
            opacity=1,
        )
        fig.add_trace(trace, row=1, col=1)

    fig.update_xaxes(title_text='Time Step')
    fig.update_yaxes(title_text='Water Level')
    fig.update_layout(
        title=title,
        font=dict(family="Serif", size=18),
        margin=dict(l=5, t=50, b=5, r=5),
        height=height,
        width=width,
    )

    if export_path is not None:
        pio.write_image(fig, export_path)

    fig.show()

## Generate 3 Tank Systems

- there are 3 tanks
- there are 4 system states, each with a fixed duration of 50
    1. fill tank 1 (q1)
    2. mix tanks 1 and 2 (v12)
    3. mix tanks 2 and 3 (v23)
    4. empty tank 3 (v3)
    
- then there are four additional states that are not part of the first dataset to train on

    5. nothing happens (rest)
    6. fill tank 1 while emptying tank 3 (q1+v3)
    7. mix tanks 1, 2 and 3 at the same time (v12+v23)
    8. fill tank 3 (q3)

In [None]:
# initial tank levels and the shared inflow / valve coefficient / phase length
tank_1_lvl, tank_2_lvl, tank_3_lvl = 7, 3, 0
q = 0.1
kv = 0.1
duration = 50

# state name -> (q1, q3, kv1, kv2, kv3); registered in this order
base_states = {
    "q1":      (q, 0, 0,  0,  0),
    "v12":     (0, 0, kv, 0,  0),
    "v23":     (0, 0, 0,  kv, 0),
    "v3":      (0, 0, 0,  0,  kv),
    "rest":    (0, 0, 0,  0,  0),
    "q1+v3":   (q, 0, 0,  0,  kv),
    "v12+v23": (0, 0, kv, kv, 0),
    "q3":      (0, q, 0,  0,  0),
}
system = ThreeTankSimulation(tank_1_lvl=tank_1_lvl, tank_2_lvl=tank_2_lvl, tank_3_lvl=tank_3_lvl)
for state_name, (state_q1, state_q3, state_kv1, state_kv2, state_kv3) in base_states.items():
    system.add_state(q1=state_q1, q3=state_q3, kv1=state_kv1, kv2=state_kv2, kv3=state_kv3,
                     duration=duration, name=state_name)

# similar system with different initial conditions:
# levels, inflow and valve coefficients are scaled by c, only the four standard states exist
c = 1.5
scaled_states = {
    "q1":  (c*q, 0, 0,    0,    0),
    "v12": (0,   0, c*kv, 0,    0),
    "v23": (0,   0, 0,    c*kv, 0),
    "v3":  (0,   0, 0,    0,    c*kv),
}
system2 = ThreeTankSimulation(tank_1_lvl=c*tank_1_lvl, tank_2_lvl=c*tank_2_lvl, tank_3_lvl=0)
for state_name, (state_q1, state_q3, state_kv1, state_kv2, state_kv3) in scaled_states.items():
    system2.add_state(q1=state_q1, q3=state_q3, kv1=state_kv1, kv2=state_kv2, kv3=state_kv3,
                      duration=duration, name=state_name)

## Generate Different Scenarios
- in every filling phase, the inflow varies, resulting in a diverse but physically feasible 3 tank dataset
- to be more realistic, some noise is added to the simulation
- the standard scenario: fill T1 -> mix T1 & T2 -> mix T2 & T3 -> empty T3
    - with different initial tank level
    - with varying amount of inflow variation
    - with varying duration of each phase
- other scenarios:
    - fill T1 & empty T3 -> mix T1 & T2 -> mix T2 & T3
    - fill T1 & empty T3 -> rest -> mix T1 & T2 -> mix T2 & T3
    - fill T1 & fill T3 -> mix T1 & T2 -> mix T2 & T3 -> empty T3
    - fill T1 -> mix T1 & T2 & T3 -> empty T3
- furthermore, a scenario with faulty sensors is added later in the DataLoader

### Standard Scenario

In [None]:
# Standard deviations of the noise sources used by the scenario cells below.
# sd_noise is passed to simulate() as sd_white_noise (noise on the tank levels).
sd_noise = 0.5
# NOTE(review): every simulate() call visible in this notebook passes sd_q=None,
# so this value appears to be unused at the moment.
sd_q = 0.05  # for one scenario, we add noise on the inflow
sd_dur = duration * 0.1  # for one scenario, we add noise on the duration

In [None]:

# Standard scenario: 1000 repetitions of the training cycle with periodic
# inflow and white noise on the levels. Writes simulation_standard.csv and,
# via simulate(), the matching *_config.csv next to it.
print("start standard simulation")
y1 = system.simulate(cycle=["q1", "v12", "v23", "q1", "v12", "v23", "v3"],
                    nb_of_cycles=1000,
                    sd_q=None, sd_kv=None, sd_dur=None, sd_white_noise=sd_noise,
                    periodic_inflow=True,
                    export_path="data/processed/simulation_standard.csv"
                    )
# plots the complete simulated series (no interval given)
plot_simulation(y1, title="Standard Dataset")

In [None]:
# Plot only the first 1000 time steps of the standard simulation and export the figure as PDF.
plot_simulation(y1, interval=[0,1000], title='Standard Simulation', 
                export_path='visualizations/simulation_standard.pdf', height=400, width=800)

### Simulations with Different Settings, but the Same Cycle

In [None]:
# "scale" scenario: same cycle as the standard scenario, but run on system2,
# whose initial levels, inflow and valve coefficients are scaled by c.
print("start scale simulation")
y12 = system2.simulate(cycle=["q1", "v12", "v23", "q1", "v12", "v23", "v3"],
                    nb_of_cycles=1000,
                    sd_q=None, sd_kv=None, sd_dur=None, sd_white_noise=sd_noise,
                    periodic_inflow=True,
                    export_path="data/processed/simulation_scale.csv"
                    )
# plot_simulation(y12, title="Similar System with Different Initial Conditions")

In [None]:
# "duration" scenario: standard cycle, but the length of every phase is
# perturbed with Gaussian noise of standard deviation sd_dur.
print("start duration simulation")
y2 = system.simulate(cycle=["q1", "v12", "v23", "q1", "v12", "v23", "v3"],
                    nb_of_cycles=1000,
                    sd_q=None, sd_kv=None, sd_dur=sd_dur, sd_white_noise=sd_noise,
                    periodic_inflow=True,
                    export_path="data/processed/simulation_duration.csv"
                    )
# plot_simulation(y2, title="Standard Dataset with Variation on Phase Duration")


In [None]:
# "noise" scenario: standard cycle with three times the white-noise level on the tank levels.
print("start noise simulation")
y3 = system.simulate(cycle=["q1", "v12", "v23", "q1", "v12", "v23", "v3"],
                    nb_of_cycles=1000,
                    sd_q=None, sd_kv=None, sd_dur=None, sd_white_noise=sd_noise * 3,
                    periodic_inflow=True,
                    export_path="data/processed/simulation_noise.csv"
                    )
# plot_simulation(y3, title="Standard Dataset with more Noise")


### Simulations with same Settings, but Different Cycle


In [None]:
# "switch" scenario: the first half of the cycle mixes tanks 2/3 before
# tanks 1/2 (v23 and v12 swapped); the second half keeps the standard order.
print("start switch simulation")
y4 = system.simulate(cycle=["q1", "v23", "v12", "q1", "v12", "v23", "v3"],
                    nb_of_cycles=1000,
                    sd_q=None, sd_kv=None, sd_dur=None, sd_white_noise=sd_noise,
                    periodic_inflow=True,
                    export_path="data/processed/simulation_switch.csv"
                    )
# plot_simulation(y4, title="Switch Order of Mixing")


In [None]:
# "q1+v3" scenario: the cycle ends with the combined state that fills
# tank 1 while emptying tank 3 instead of separate q1 / v3 phases.
print("start q1+v3 simulation")
y41 = system.simulate(cycle=["v12", "v23", "q1", "v12", "v23", "q1+v3"],
                    nb_of_cycles=1000,
                    sd_q=None, sd_kv=None, sd_dur=None, sd_white_noise=sd_noise,
                    periodic_inflow=True,
                    export_path="data/processed/simulation_q1+v3.csv"
                    )
# plot_simulation(y41, title="Fill Tank 1 while Emptying Tank 3")


In [None]:
# "q1+v3+rest" scenario: same as the q1+v3 cycle, followed by a rest phase
# in which all inflows and valves are off.
print("start q1+v3+rest simulation")
y42 = system.simulate(cycle=["v12", "v23", "q1", "v12", "v23", "q1+v3", "rest"],
                    nb_of_cycles=1000,
                    sd_q=None, sd_kv=None, sd_dur=None, sd_white_noise=sd_noise,
                    periodic_inflow=True,
                    export_path="data/processed/simulation_q1+v3+rest.csv"
                    )
# plot_simulation(y42, title="Fill Tank 1 while Emptying Tank 3, then Rest")


In [None]:
# "v12+v23" scenario: the first mixing step opens both connecting valves
# at once (state v12+v23); the second half keeps the standard order.
print("start v12+v23 simulation")
y43 = system.simulate(cycle=["q1", "v12+v23", "q1", "v12", "v23", "v3"],
                    nb_of_cycles=1000,
                    sd_q=None, sd_kv=None, sd_dur=None, sd_white_noise=sd_noise,
                    periodic_inflow=True,
                    export_path="data/processed/simulation_v12+v23.csv"
                    )
# plot_simulation(y43, title="Mix Tank 1, 2 and 3")


## Build DataLoader

In [None]:
import numpy as np
import pandas as pd
import pytorch_lightning as pl
from torch.utils.data import Dataset, Subset, DataLoader

In [None]:
class ThreeTankDataset(Dataset):
    """Three tank dataset
    A sample consists of a random time window + consecutive time window
    Args:
        file (str): path to csv file containing the data
        input_len (int): length of input sequence
        pred_len (int): length of prediction sequence
        nb_of_samples (int): number of samples to draw
        ordered_samples (bool): if true, samples are arranged in order of time
        faulty_input (bool): if true, input sequence is faulty
        seed (int): random seed
        phase_len (int): duration of one phase as defined in the simulation;
            dead-sensor faults never reach into the last ``phase_len`` input steps
    Raises:
        ValueError: if the file has too few rows for one sample, or if
            ``faulty_input`` is set and ``input_len`` is too short for ``phase_len``
    """
    def __init__(self,
                 file,
                 input_len=250,  # should contain at least 4 phases (one standard cycle)
                 pred_len=50,
                 nb_of_samples=1000,
                 ordered_samples=True,
                 faulty_input=False,
                 seed=42,
                 phase_len=50
                 ):
        super().__init__()
        # read data
        self.X = pd.read_csv(file)

        self.input_len = input_len
        self.pred_len = pred_len
        self.phase_len = phase_len
        self.nb_of_samples = nb_of_samples
        self.nb_of_features = self.X.shape[1]

        self.ordered_samples = ordered_samples
        self.faulty_input = faulty_input
        self.sample_idxs = self._create_samples(seed)
        self.fault_mask = self._create_faults(seed)

    def _create_samples(self, seed):
        """Create array of random start numbers"""
        # exclusive upper bound for a window start
        max_start = self.X.shape[0] - self.input_len - self.pred_len
        if max_start <= 0:
            # fail with a readable message instead of randint's "low >= high"
            raise ValueError(
                f"data has {self.X.shape[0]} rows, which is too short for "
                f"input_len={self.input_len} plus pred_len={self.pred_len}"
            )
        np.random.seed(seed)
        start_idxs = np.random.randint(0, max_start, self.nb_of_samples)
        if self.ordered_samples:  # for now important due to train/test split in datamodule
            start_idxs = np.sort(start_idxs)
        return start_idxs

    def _create_faults(self, seed):
        """Create a mask that simulates faulty sensors

        The mask is multiplied element-wise with the input window: 1 keeps a
        value, 0 simulates a dead sensor, a factor between 2 and 10 simulates
        a point anomaly. Without ``faulty_input`` the mask is all ones.
        """
        fault_mask = np.ones((self.nb_of_samples, self.input_len, self.nb_of_features), dtype=np.float32)
        if self.faulty_input:
            if self.input_len < self.phase_len + 2:
                # the dead-sensor branch needs at least one valid start position
                raise ValueError(
                    f"input_len={self.input_len} must be at least phase_len + 2 "
                    f"(= {self.phase_len + 2}) to place dead-sensor faults"
                )
            np.random.seed(seed)
            for i in range(self.nb_of_samples):
                # choose a random sensor and a random position in the sequence
                sensor = np.random.randint(0, self.nb_of_features)
                # either add a point anomaly or simulate a dead sensor
                if np.random.rand() < 0.5:
                    pos = np.random.randint(0, self.input_len)
                    fault_mask[i, pos, sensor] = np.random.rand() * 8 + 2  # random value between 2 and 10
                else:
                    # make sure the fault is not at the end of the sequence. The last {phase_len} steps should not be faulty
                    pos = np.random.randint(0, self.input_len - self.phase_len - 1)
                    # choose a random number of consecutive timesteps to be faulty
                    # no longer than a phase and no longer than the remaining sequence
                    nb_of_steps = np.random.randint(1, min(self.phase_len, self.input_len - self.phase_len - pos))
                    fault_mask[i, pos:pos+nb_of_steps, sensor] = 0
        return fault_mask

    def __len__(self):
        """Size of dataset"""
        return self.nb_of_samples

    def __getitem__(self, index):
        """Get one sample
        Simple setup: always yield two samples, x1(t) and concurrent sample x2(t+input_len) (without configurations).
        Note that the model can effectively see x2 via a different x1 if dataloader is not chronological.
        Returns:
            x1 (float32 array, input_len rows) and x2 (float32 array, pred_len rows)
        """
        start_idx = self.sample_idxs[index]
        split_idx = start_idx + self.input_len
        x1 = self.X.iloc[start_idx: split_idx].to_numpy(dtype=np.float32)
        x2 = self.X.iloc[split_idx: split_idx + self.pred_len].to_numpy(dtype=np.float32)
        if self.faulty_input:
            # element-wise multiplication of input with fault mask
            x1 = x1 * self.fault_mask[index]
        return x1, x2
    

class ThreeTankDataModule(pl.LightningDataModule):
    """Data module for three tank dataset

    Trains on the standard simulation and optionally evaluates on a list of
    out-of-distribution (ood) datasets: the standard data with faulty sensors
    plus one dataset per remaining scenario.
    Args:
        eval_ood (bool): if true, evaluate on out-of-distribution data
        batch_size (int): batch size
        num_workers (int): number of workers for dataloader
        pin_memory (bool): pin memory for dataloader
        train_split (float): fraction of data used for training
        val_split (float): fraction of data used for validation
    """
    def __init__(self,
                 eval_ood=True,
                 batch_size=64, num_workers=8, pin_memory=False,
                 train_split=0.5, val_split=0.25):
        super(ThreeTankDataModule, self).__init__()

        # scenarios[0] is the training data, scenarios[1] the faulty-sensor
        # variant of it, the rest map to data/processed/simulation_<name>.csv
        self.scenarios = [
            "standard",
            "fault",
            "noise",
            "duration",
            "scale",
            "switch",
            "q1+v3",
            "q1+v3+rest",
            "v12+v23"
        ]
        self.eval_ood = eval_ood

        self.batch_size = batch_size
        self.num_workers = num_workers
        self.pin_memory = pin_memory

        self.train_split = train_split
        self.val_split = val_split

        self.train_dataset = None
        self.val_dataset = None
        self.test_dataset = None

        # scenario name -> dataset, filled by setup()
        self.ds_dict = {}

    def setup(self, stage=None) -> None:
        """Load all datasets and split the standard one into train/val/test."""
        # standard training set to train on
        ds_std = ThreeTankDataset(
            "data/processed/simulation_standard.csv",
            nb_of_samples=1000
            )
        # [train | val | test] for standard training set  TODO purge
        val_start_idx = int(len(ds_std) * self.train_split)
        test_start_idx = int(len(ds_std) * (self.train_split + self.val_split))

        self.ds_std_train = Subset(ds_std, range(val_start_idx))
        self.ds_std_val = Subset(ds_std, range(val_start_idx, test_start_idx))
        self.ds_std_test = Subset(ds_std, range(test_start_idx, len(ds_std)))

        # list of ood datasets to validate model on
        self.ds_ood_list = [
            ThreeTankDataset(
                "data/processed/simulation_standard.csv",
                nb_of_samples=100,
                ordered_samples=True,
                faulty_input=True,  # simulate faulty sensors
                seed=1234  # sample from training dataset but use different seed to avoid overfitting
                )
        ]
        self.ds_ood_list += [
            ThreeTankDataset(
                f"data/processed/simulation_{scenario}.csv",
                nb_of_samples=100,
                )
            for scenario in self.scenarios[2:]
        ]

        # name-based access for interactive_sample_plot, which reads ds_dict;
        # the order of [train] + ood list matches self.scenarios
        self.ds_dict = dict(zip(self.scenarios, [self.ds_std_train] + self.ds_ood_list))

    def _make_loader(self, dataset) -> DataLoader:
        """Wrap a dataset in a DataLoader using the module's loader settings."""
        return DataLoader(
            dataset,
            batch_size=self.batch_size,
            num_workers=self.num_workers,
            pin_memory=self.pin_memory
        )

    def _ood_loaders(self, part) -> list:
        """Loaders over the "val" or "test" slice of every ood dataset.

        Each ood dataset is cut with the same train/val fractions as the
        standard dataset. Returns an empty list if ``eval_ood`` is false.
        """
        if not self.eval_ood:
            return []
        loaders = []
        for ds in self.ds_ood_list:
            val_start = int(self.train_split * len(ds))
            test_start = int((self.train_split + self.val_split) * len(ds))
            if part == "val":
                indices = range(val_start, test_start)
            else:
                indices = range(test_start, len(ds))
            loaders.append(self._make_loader(Subset(ds, indices)))
        return loaders

    def train_dataloader(self) -> DataLoader:
        # batches of (x1, x2): [batch_size, input_len, features], [batch_size, pred_len, features]
        return self._make_loader(self.ds_std_train)

    def val_dataloader(self) -> list:
        """Standard validation loader followed by the ood validation loaders."""
        return [self._make_loader(self.ds_std_val)] + self._ood_loaders("val")

    def test_dataloader(self) -> list:
        """Standard test loader followed by the ood test loaders."""
        return [self._make_loader(self.ds_std_test)] + self._ood_loaders("test")


In [None]:
# Smoke test: build the data module with default settings, load all datasets
# (requires the CSV files written by the simulation cells) and fetch one
# training sample as (input window, prediction window).
dm = ThreeTankDataModule()
dm.setup()
x1, x2 = dm.ds_std_train[42]

In [None]:
# shape of the prediction window of the sample fetched above
x2.shape

In [None]:
# shape of the input part (x1) of the first training batch
dl = dm.train_dataloader()
next(iter(dl))[0].shape

In [None]:
def plot_sample(ds, index):
    """Show one dataset sample: input and prediction window as one series.

    The three signals are drawn as lines; a dashed vertical line marks where
    the input window ends and the prediction window begins.
    """
    # (x1, s1), (x2, s2) = ds[index]
    inputs, targets = ds[index]
    series = np.concatenate((inputs, targets))
    steps = np.arange(series.shape[0])

    fig = go.Figure()
    for column, label in enumerate(['h1', 'h2', 'h3']):
        fig.add_trace(go.Scatter(x=steps, y=series[:, column], name=label,
                                 mode="lines", opacity=1))

    fig.add_vline(x=len(inputs), line_dash="dash")
    fig.update_xaxes(title_text=r'time')
    fig.update_layout(title_text=f"Sample {index}",
                      font_family="Serif", font_size=14,
                      margin_l=5, margin_t=50, margin_b=5, margin_r=5)
    fig.show()

In [None]:
# visual check of training sample 42
plot_sample(dm.ds_std_train, 42)

In [None]:
import ipywidgets as widgets
from IPython.display import display

In [None]:
def plot_sample_data(ds, index, title=None):
    """Build the plotly traces and layout for one dataset sample.

    Returns a (traces, layout) pair instead of showing a figure, so callers
    can create or update a figure widget with it. The layout contains a
    dashed vertical line at the end of the input window.
    """
    inputs, targets = ds[index]
    series = np.concatenate((inputs, targets))
    steps = np.arange(series.shape[0])

    traces = []
    for column, label in enumerate(['h1', 'h2', 'h3']):
        traces.append(go.Scatter(x=steps, y=series[:, column], name=label,
                                 mode="lines", opacity=1))

    boundary = len(inputs)
    layout = go.Layout(title_text=title,
                       font_family="Serif", font_size=14,
                       margin_l=5, margin_t=50, margin_b=5, margin_r=5,
                       xaxis_title="time")
    # y spans the full plot height because yref is 'paper'
    layout.shapes = [dict(type='line', x0=boundary, x1=boundary, y0=0, y1=1,
                          yref='paper', xref='x', line=dict(dash='dash'))]

    return traces, layout

def interactive_sample_plot(datamodule, sample_index=0):
    """Show a figure widget with dropdowns to browse datasets and samples.

    Args:
        datamodule: a set-up data module exposing ``scenarios`` and either a
            ``ds_dict`` (scenario name -> dataset) or the attributes
            ``ds_std_train`` and ``ds_ood_list``
        sample_index: sample shown initially
    """
    scenarios = list(datamodule.scenarios)
    datasets = getattr(datamodule, "ds_dict", None)
    if not datasets:
        # The data module may not provide ds_dict (reading it unconditionally
        # raised AttributeError). Rebuild the mapping from the attributes that
        # setup() creates: the training subset followed by the ood datasets,
        # in the order of the scenario names.
        datasets = dict(zip(scenarios, [datamodule.ds_std_train] + list(datamodule.ds_ood_list)))

    def on_change(change):
        # observe() without `names` reports every trait change; react to value changes only
        if change['type'] == 'change' and change['name'] == 'value':
            new_data, new_layout = plot_sample_data(datasets[dataset_dropdown.value], sample_dropdown.value, title=dataset_dropdown.value)
            for i, trace in enumerate(new_data):
                fig.data[i].update(trace)
            fig.layout.update(new_layout)

    # options come from the data module instead of a duplicated literal list
    dataset_dropdown = widgets.Dropdown(options=scenarios,
                                        value=scenarios[0],
                                        description='Dataset:')
    dataset_dropdown.observe(on_change)

    # offer only indices that are valid for every selectable dataset
    max_sample_index = min(len(ds) for ds in datasets.values()) - 1

    sample_dropdown = widgets.Dropdown(options=list(range(max_sample_index + 1)),
                                       value=sample_index,
                                       description='Sample:')
    sample_dropdown.observe(on_change)

    data, layout = plot_sample_data(datasets[scenarios[0]], sample_index, title=scenarios[0])
    fig = go.FigureWidget(data=data, layout=layout)

    display(widgets.VBox([dataset_dropdown, sample_dropdown, fig]))

In [None]:
# interactive browser over all scenarios of the data module created above
interactive_sample_plot(dm)  # TODO copy from notebook 4

## Data Augmentation

In [None]:
from scipy.interpolate import interp1d

In [None]:
# reload the exported standard simulation (columns h1, h2, h3) as the base for all augmentations below
data = pd.read_csv("data/processed/simulation_standard.csv")

In [None]:
def add_noise(data, sd=0.1):
    """Return a copy of ``data`` with Gaussian noise added to h1, h2 and h3.

    The noise has mean 0 and standard deviation ``sd`` and is drawn from the
    global numpy random state, one draw per column. The input is not modified.
    """
    noisy = data.copy()
    n_rows = len(noisy)
    # draw in the fixed column order h1 -> h2 -> h3
    for column in ("h1", "h2", "h3"):
        noisy[column] = noisy[column] + np.random.normal(0, sd, n_rows)
    return noisy
# two noisier variants of the standard data (sd 1 and sd 2), exported as
# simulation_standard+.csv and simulation_standard++.csv
data_noisy = add_noise(data, 1)
data_noisy.to_csv("data/processed/simulation_standard+.csv", index=False)
data_noisy2 = add_noise(data, 2)
data_noisy2.to_csv("data/processed/simulation_standard++.csv", index=False)

In [None]:
# plot only the noise that was added (noisy minus original) for the first 5000 steps
plot_simulation(data_noisy.to_numpy()-data.to_numpy(), interval=[0, 5000], title='Added Noise', 
                # export_path='visualizations/simulation_standard+noise.pdf', 
                # height=400, width=800
                )

In [None]:
# plot the first 5000 steps of the sd=2 variant
plot_simulation(data_noisy2.to_numpy(), interval=[0, 5000], title='Standard Simulation with even more Added Noise', 
                # export_path='visualizations/simulation_standard+noise.pdf', 
                # height=400, width=800
                )

In [None]:
def augment_frequency(df, std_dev=0.1):
    """Augment every column of ``df`` by perturbing it in the frequency domain.

    Each column is transformed with the FFT, real-valued Gaussian noise
    (mean 0, standard deviation ``std_dev``) is added to the spectrum, and
    the real part of the inverse FFT is returned. Noise is drawn from the
    global numpy random state, one draw per column in column order.
    """
    n_rows = df.shape[0]

    # forward transform of every column first; no random numbers involved
    spectra = {column: np.fft.fft(df[column]) for column in df.columns}

    augmented = {}
    for column, spectrum in spectra.items():
        perturbation = np.random.normal(0, std_dev, n_rows)
        # back to the time domain; the imaginary part is dropped
        augmented[column] = np.real(np.fft.ifft(spectrum + perturbation))

    return pd.DataFrame(augmented, index=df.index, columns=df.columns)

# frequency-domain augmentation of the standard data, exported as simulation_frequency.csv
data_augmented = augment_frequency(data, std_dev=500)
data_augmented.to_csv("data/processed/simulation_frequency.csv", index=False)


In [None]:
# difference between augmented and original data for the first 5000 steps
plot_simulation(data_augmented.to_numpy()-data.to_numpy(), interval=[0, 5000], title='Augmented Data')

In [None]:
# the augmented data itself for the first 5000 steps
plot_simulation(data_augmented.to_numpy(), interval=[0, 5000], title='Augmented Data')

In [None]:
def random_time_warp(df, warp_factor=0.2):
    """Return a copy of ``df`` whose h1, h2, h3 signals are randomly warped in time.

    Random step sizes (normal with mean 1 and standard deviation
    ``warp_factor``) are accumulated into a distorted time axis, which is
    rescaled to the range of the original index. The signals are then
    linearly interpolated at these distorted positions. The result keeps the
    number of rows and gets a fresh 0..n-1 index.
    """
    n_rows = len(df.index)

    # the first step is fixed to 1 so the warped axis starts at the original start point
    random_steps = np.random.normal(loc=1.0, scale=warp_factor, size=n_rows - 1)
    warped_time = np.cumsum(np.concatenate(([1.0], random_steps)))

    # map [first, last] warped time onto [first, last] original index value
    source_range = (warped_time[0], warped_time[-1])
    target_range = (df.index[0], df.index[-1])
    sample_positions = np.interp(warped_time, source_range, target_range)

    warped = df.copy()
    for column in ['h1', 'h2', 'h3']:
        resample = interp1d(df.index, df[column].values, fill_value='extrapolate')
        warped[column] = resample(sample_positions)

    warped.index = range(len(df))
    return warped


In [None]:
# time-warped variant of the standard data; step sizes are drawn with standard deviation 1
df_warped = random_time_warp(data, warp_factor=1)

In [None]:
# last 5000 rows of the original and the warped signals side by side for comparison
df0 = pd.concat([data.iloc[-5000:], df_warped.iloc[-5000:]], axis=1)
df0.columns = ['h1', 'h2', 'h3', 'h1_warped', 'h2_warped', 'h3_warped']
df0.shape

In [None]:
# plot the data: one trace per column of df0 (original and warped signals)
fig = go.Figure()
for col in df0.columns:
    fig.add_trace(go.Scatter(x=df0.index, y=df0[col], name=col))
fig.show()

In [None]:
# export the time-warped data as simulation_time_warp.csv
df_warped.to_csv("data/processed/simulation_time_warp.csv", index=False)

In [None]:
# plot the complete time-warped series (no interval given)
plot_simulation(df_warped.to_numpy(), interval=None, title='Time Warped Data')