In [63]:
import pandas as pd
import numpy as np
import os
import sys
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

# --- Make the project's local helpers importable ---
# The project root is taken to be the parent of the current working directory,
# and its 'src' folder is appended to sys.path (once) so that the local
# `preprocess` module can be imported below.
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
src_path = os.path.join(project_root, 'src')
if src_path not in sys.path:
    sys.path.append(src_path)
from preprocess import create_windows_vectorized

print("✅ All libraries and custom functions imported.")

✅ All libraries and custom functions imported.


In [64]:
def clean_swat_data(df):
    """Return a cleaned copy of a raw SWaT DataFrame that has already been loaded.

    The input frame is never modified: all work happens on a copy, so the
    caller's raw frame stays intact and the cell calling this function can be
    re-run safely.

    Cleaning steps:
      1. Strip surrounding whitespace from every column name.
      2. If a 'Timestamp' column exists, strip its values, parse them with the
         day-first 12-hour format '%d/%m/%Y %I:%M:%S %p', drop rows whose
         timestamp cannot be parsed, and use the column as the index.
      3. If a 'Normal/Attack' column exists, replace it with an integer
         'Label' column: 0 where the (whitespace-stripped) value is exactly
         'Normal', 1 for anything else.
      4. Drop every column whose name starts with 'Unnamed'.

    Args:
        df: Raw DataFrame as returned by ``pd.read_csv``.

    Returns:
        A new, cleaned DataFrame.
    """
    # Work on a copy so the caller's frame keeps its columns, values and rows.
    cleaned = df.copy()
    cleaned.columns = cleaned.columns.str.strip()

    if 'Timestamp' in cleaned.columns:
        cleaned['Timestamp'] = pd.to_datetime(
            cleaned['Timestamp'].str.strip(),
            format='%d/%m/%Y %I:%M:%S %p',
            errors='coerce',  # unparseable values become NaT and are dropped next
        )
        cleaned = cleaned.dropna(subset=['Timestamp']).set_index('Timestamp')

    if 'Normal/Attack' in cleaned.columns:
        # Strip the values as well: a padded ' Normal' must not count as an attack.
        status = cleaned['Normal/Attack'].astype(str).str.strip()
        cleaned['Label'] = (status != 'Normal').astype(int)
        cleaned = cleaned.drop(columns=['Normal/Attack'])

    return cleaned.loc[:, ~cleaned.columns.str.contains('^Unnamed')]

print("✅ Helper function defined.")

✅ Helper function defined.


In [65]:
# --- Locations of the raw SWaT CSV exports (relative to the working directory) ---
normal_data_path = '../data/SWaT/Physical/SWaT_Dataset_Normal_v0.csv'
attack_data_path = '../data/SWaT/Physical/SWaT_Dataset_Attack_v0.csv'

# --- Read both recordings; only the normal file skips its first line ---
normal_df_raw = pd.read_csv(normal_data_path, skiprows=1, low_memory=False)
attack_df_raw = pd.read_csv(attack_data_path, low_memory=False)

# --- Clean each recording, then stack them into one frame sorted by index ---
normal_df = clean_swat_data(normal_df_raw)
attack_df = clean_swat_data(attack_df_raw)
combined_df = pd.concat([normal_df, attack_df]).sort_index()

# --- Feature Selection: keep the chosen sensor columns plus the label ---
selected_features = [
    'FIT101', 'LIT101', 'AIT201', 'AIT202', 'FIT201', 'AIT402',
    'FIT401', 'LIT301', 'LIT401', 'AIT502', 'FIT501', 'PIT501'
]
subset_df = combined_df.loc[:, [*selected_features, 'Label']].copy()
print(f"✅ Data loaded and processed. Using {len(selected_features)} features.")

✅ Data loaded and processed. Using 12 features.


In [66]:
# Window length and stride passed to the windowing helper (stride == length).
WINDOW_SIZE = 50
STEP_SIZE = 50

# Separate the label column from the sensor readings.
labels = subset_df['Label']
features = subset_df.drop(columns=['Label'])

# Rescale each feature column with min-max scaling.
# NOTE(review): the scaler is fitted on every row, before any train/val/test
# split is made — confirm this is intended.
scaler = MinMaxScaler()
features_scaled = scaler.fit_transform(features)

# Rebuild a labelled frame with the original index and column names.
scaled_df = pd.DataFrame(features_scaled, index=features.index, columns=features.columns)
scaled_df['Label'] = labels

X, y = create_windows_vectorized(scaled_df, window_size=WINDOW_SIZE, stride=STEP_SIZE)
print(f"Windowing complete. Shape of X: {X.shape}")

✅ Vectorized windows created: 18934 sequences of shape (50, 12)
Windowing complete. Shape of X: (18934, 50, 12)


In [67]:
# --- Per-class 80/10/10 split into train / validation / test ---
# Each class is split on its own and the pieces are then recombined, so every
# split receives both normal (label 0) and attack (label 1) windows.
MIN_ATTACK_SAMPLES = 10  # fewest attack windows for which a split is attempted
SPLIT_SEED = 42          # shared random_state so the split is reproducible

X_normal = X[y == 0]
y_normal = y[y == 0]
X_attack = X[y == 1]
y_attack = y[y == 1]

# Fail loudly: merely printing would leave the split variables undefined (or
# stale from an earlier run) for the cells that consume them afterwards.
if len(X_attack) < MIN_ATTACK_SAMPLES:
    raise ValueError(f"🚨 Error: Not enough attack samples ({len(X_attack)}) to perform a split.")

# Split Normal Data: 80% train, remaining 20% halved into validation and test
X_normal_train, X_normal_temp, y_normal_train, y_normal_temp = train_test_split(
    X_normal, y_normal, test_size=0.2, random_state=SPLIT_SEED)
X_normal_val, X_normal_test, y_normal_val, y_normal_test = train_test_split(
    X_normal_temp, y_normal_temp, test_size=0.5, random_state=SPLIT_SEED)

# Split Attack Data: same proportions as the normal data
X_attack_train, X_attack_temp, y_attack_train, y_attack_temp = train_test_split(
    X_attack, y_attack, test_size=0.2, random_state=SPLIT_SEED)
X_attack_val, X_attack_test, y_attack_val, y_attack_test = train_test_split(
    X_attack_temp, y_attack_temp, test_size=0.5, random_state=SPLIT_SEED)

# Combine the per-class pieces (normal first, then attack)
X_train = np.concatenate([X_normal_train, X_attack_train])
y_train = np.concatenate([y_normal_train, y_attack_train])
X_val = np.concatenate([X_normal_val, X_attack_val])
y_val = np.concatenate([y_normal_val, y_attack_val])
X_test = np.concatenate([X_normal_test, X_attack_test])
y_test = np.concatenate([y_normal_test, y_attack_test])

# Shuffle only the training set so the two classes are interleaved
X_train, y_train = shuffle(X_train, y_train, random_state=SPLIT_SEED)

print("✅ New data split complete:")
print("Training set label distribution:", dict(zip(*np.unique(y_train, return_counts=True))))
print("Validation set label distribution:", dict(zip(*np.unique(y_val, return_counts=True))))
print("Test set label distribution:", dict(zip(*np.unique(y_test, return_counts=True))))

✅ New data split complete:
Training set label distribution: {np.int64(0): np.int64(14248), np.int64(1): np.int64(899)}
Validation set label distribution: {np.int64(0): np.int64(1781), np.int64(1): np.int64(112)}
Test set label distribution: {np.int64(0): np.int64(1781), np.int64(1): np.int64(113)}


In [68]:
# Output folder for the processed splits; created if it does not exist yet.
save_path = '../data/processed'
os.makedirs(save_path, exist_ok=True)

# Persist every split as its own .npy file inside save_path.
for npy_name, split_array in (
    ('X_train.npy', X_train),
    ('y_train.npy', y_train),
    ('X_val.npy', X_val),
    ('y_val.npy', y_val),
    ('X_test.npy', X_test),
    ('y_test.npy', y_test),
):
    np.save(os.path.join(save_path, npy_name), split_array)

# The size of the last axis of X_train is the number to use as INPUT_DIM in
# your train.py and evaluate.py scripts
print(f"✅ Data saved. The INPUT_DIM for your model is: {X_train.shape[2]}")

✅ Data saved. The INPUT_DIM for your model is: 12
