# mount drive

In [46]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# importing libraries

In [47]:
import pandas as pd
import numpy as np
import kagglehub
import os
import json
import joblib
from sklearn.preprocessing import MinMaxScaler
from scipy.signal import savgol_filter

# Project Paths

In [48]:
BASE_PATH = "/content/drive/MyDrive/DepiProject"
RAW_PATH = os.path.join(BASE_PATH, "raw_data")
SAVE_PATH = os.path.join(BASE_PATH, "swat_preprocessed")

os.makedirs(SAVE_PATH, exist_ok=True)
os.makedirs(RAW_PATH, exist_ok=True)

print("raw path:", RAW_PATH)
print("Save path:", SAVE_PATH)

raw path: /content/drive/MyDrive/DepiProject/raw_data
Save path: /content/drive/MyDrive/DepiProject/swat_preprocessed


# Configuration

In [49]:
CONFIG = {
    "stabilization_hours": 5,
    "smoothing_window": 5,
    "train_ratio": 0.8,
    "window_size": 30,
    "stride_train": 5,
    "stride_val": 5,
    "stride_test": 1
}

# Load SWaT Data

In [50]:

ab109316_ts_training_path = kagglehub.dataset_download('ab109316/ts-training')
print('Data source import complete.')

for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

df_attack_28_2 = pd.read_csv(ab109316_ts_training_path+"/TS Training/data/anomaly_detection/multivariate/SWAT/SWaT.csv")
df_normal_22_28 = pd.read_csv(ab109316_ts_training_path+"/TS Training/data/anomaly_detection/multivariate/SWAT/SWaT_Dataset_Normal_v0.csv")
df_attack_28_2.to_csv('df_attack.csv')
df_normal_22_28.to_csv('df_normal.csv')

Using Colab cache for faster access to the 'ts-training' dataset.
Data source import complete.
/kaggle/input/ts-training/TS Training/data/anomaly_detection/multivariate/A Dataset to Support Research in the Design of Secure Water Treatment Systems (1).pdf
/kaggle/input/ts-training/TS Training/data/anomaly_detection/multivariate/process_1_attack.csv
/kaggle/input/ts-training/TS Training/data/anomaly_detection/multivariate/process_1_normal.csv
/kaggle/input/ts-training/TS Training/data/anomaly_detection/multivariate/SWAT/SWaT_Dataset_Normal_v1.csv
/kaggle/input/ts-training/TS Training/data/anomaly_detection/multivariate/SWAT/SWaT.csv
/kaggle/input/ts-training/TS Training/data/anomaly_detection/multivariate/SWAT/SWaT_Dataset_Normal_v0.csv
/kaggle/input/ts-training/TS Training/data/anomaly_detection/multivariate/SWAT/readme/List_of_attacks_Final.xlsx
/kaggle/input/ts-training/TS Training/data/anomaly_detection/multivariate/SWAT/readme/A Dataset to Support Research in the Design of Secure Wa

In [51]:
! cp df_attack.csv /content/drive/MyDrive/DepiProject/raw_data/df_attack.csv
! cp df_normal.csv /content/drive/MyDrive/DepiProject/raw_data/df_normal.csv

In [52]:
def load_swat(normal_path, attack_path):

    normal = pd.read_csv(normal_path, low_memory=False)
    attack = pd.read_csv(attack_path, low_memory=False)

    for df in [normal, attack]:
        df.columns = df.columns.str.strip()

        df["Timestamp"] = pd.to_datetime(
            df["Timestamp"].str.strip(),
            format="%d/%m/%Y %I:%M:%S %p",
            errors="coerce"
        )

        df.sort_values("Timestamp", inplace=True)
        df.set_index("Timestamp", inplace=True)

        df["label"] = df["Normal/Attack"].map(
            lambda x: 0 if x == "Normal" else 1
        )

        df.drop(columns=["Normal/Attack"], inplace=True)

    return normal, attack




In [53]:
NORMAL_PATH = os.path.join(RAW_PATH, "df_normal.csv")
ATTACK_PATH = os.path.join(RAW_PATH, "df_attack.csv")

normal_raw, attack_raw = load_swat(NORMAL_PATH, ATTACK_PATH)

print("Normal shape:", normal_raw.shape)
print("Attack shape:", attack_raw.shape)

Normal shape: (496800, 53)
Attack shape: (449919, 53)


# cleaning data

## Remove Stabilization Period

In [54]:
def remove_stabilization(df, hours):
    cutoff = df.index[0] + pd.Timedelta(hours=hours)
    return df[df.index > cutoff]

normal = remove_stabilization(normal_raw, CONFIG["stabilization_hours"])
attack = attack_raw.copy()

print("Normal after stabilization:", normal.shape)

Normal after stabilization: (478799, 53)


## Remove Constant Features

In [55]:
def drop_constant_columns(df):
    constant_cols = [c for c in df.columns if df[c].nunique() <= 1]
    constant_cols.remove("label")
    return df.drop(columns=constant_cols), constant_cols

normal, dropped_cols = drop_constant_columns(normal)
attack = attack.drop(columns=dropped_cols)

print("Dropped constant columns:", dropped_cols)

Dropped constant columns: ['P102', 'P201', 'P202', 'P204', 'P206', 'P401', 'P402', 'P403', 'P404', 'UV401', 'P501', 'P502', 'P601', 'P603']


## Fix Timestamp Gaps

In [56]:
def fix_gaps(df):

    full_range = pd.date_range(
        start=df.index.min(),
        end=df.index.max(),
        freq="1S"
    )

    df = df.reindex(full_range)

    df.interpolate(method="linear", limit=30, inplace=True)
    df.fillna(method="ffill", inplace=True)

    return df

normal = fix_gaps(normal)
attack = fix_gaps(attack)

  full_range = pd.date_range(
  df.fillna(method="ffill", inplace=True)
  full_range = pd.date_range(
  df.fillna(method="ffill", inplace=True)


# feature engineering

## Identify Sensor Columns

In [57]:
feature_cols = [c for c in normal.columns if c != "label"]

## Smooth Sensors (Rolling Average)

In [58]:
def smooth(df, feature_cols, window):

    df_smooth = df.copy()

    for col in feature_cols:
        df_smooth[col] = (
            df[col]
            .rolling(window=window, center=True, min_periods=1)
            .mean()
        )

    return df_smooth

normal = smooth(normal, feature_cols, CONFIG["smoothing_window"])
attack = smooth(attack, feature_cols, CONFIG["smoothing_window"])

## Proper Train / Validation Split

In [59]:
split_idx = int(len(normal) * CONFIG["train_ratio"])

normal_train = normal.iloc[:split_idx]
normal_val   = normal.iloc[split_idx:]

print("Train shape:", normal_train.shape)
print("Val shape:", normal_val.shape)

Train shape: (383039, 39)
Val shape: (95760, 39)


## Fit Scaler ONLY On Train

In [60]:
scaler = MinMaxScaler()
scaler.fit(normal_train[feature_cols])

## Apply Scaling

In [61]:
def scale(df, scaler, feature_cols):
    df_scaled = df.copy()
    df_scaled[feature_cols] = scaler.transform(df[feature_cols])
    return df_scaled

normal_train = scale(normal_train, scaler, feature_cols)
normal_val   = scale(normal_val, scaler, feature_cols)
attack       = scale(attack, scaler, feature_cols)

## Window Function

In [62]:
def create_windows(data, labels, window_size, stride):

    n_samples = len(data)
    n_features = data.shape[1]

    n_windows = (n_samples - window_size) // stride + 1

    X = np.zeros((n_windows, window_size, n_features), dtype=np.float32)
    y = np.zeros(n_windows, dtype=np.int8)

    for i, start in enumerate(
        range(0, n_samples - window_size + 1, stride)
    ):
        end = start + window_size
        X[i] = data[start:end]
        y[i] = 1 if labels[start:end].max() > 0 else 0

    return X, y

## Create Windows

In [63]:
WINDOW = CONFIG["window_size"]

X_train, y_train = create_windows(
    normal_train[feature_cols].values,
    normal_train["label"].values,
    WINDOW,
    CONFIG["stride_train"]
)

X_val, y_val = create_windows(
    normal_val[feature_cols].values,
    normal_val["label"].values,
    WINDOW,
    CONFIG["stride_val"]
)

X_attack, y_attack = create_windows(
    attack[feature_cols].values,
    attack["label"].values,
    WINDOW,
    CONFIG["stride_test"]
)

print("Train windows:", X_train.shape)
print("Val windows:", X_val.shape)
print("Attack windows:", X_attack.shape)

Train windows: (76602, 30, 38)
Val windows: (19147, 30, 38)
Attack windows: (449971, 30, 38)


## Save Everything to Drive

In [64]:
np.save(os.path.join(SAVE_PATH, "X_train.npy"), X_train)
np.save(os.path.join(SAVE_PATH, "y_train.npy"), y_train)

np.save(os.path.join(SAVE_PATH, "X_val.npy"), X_val)
np.save(os.path.join(SAVE_PATH, "y_val.npy"), y_val)

np.save(os.path.join(SAVE_PATH, "X_attack.npy"), X_attack)
np.save(os.path.join(SAVE_PATH, "y_attack.npy"), y_attack)

joblib.dump(scaler, os.path.join(SAVE_PATH, "scaler.pkl"))

with open(os.path.join(SAVE_PATH, "config.json"), "w") as f:
    json.dump(CONFIG, f, indent=2)

print("All files saved to:", SAVE_PATH)

All files saved to: /content/drive/MyDrive/DepiProject/swat_preprocessed
