In [3]:
"""
preprocess_eclss.py

Preprocessing pipeline for the ECLSS synthetic dataset.

- Loads raw data from: data/eclss_synthetic_dataset_full/
    - cycles_3d.npy        (N, T, 3)
    - labels_system.npy    (N,)
    - labels_sensor.npy    (N,)

- Splits data into:
    * Nominal (class 0) → for VAE:
        - X_train_nom_*  (train)
        - X_val_nom_*    (validation)
        - X_test_nom_*   (part of combined test set)
    * Faulty (classes 1–5) → for SVM and anomaly evaluation:
        - X_train_fault_*      (SVM train)
        - X_test_fault_*       (SVM test)
        - X_test_all_*         (VAE anomaly test: nominal + faulty)

- Normalizes using StandardScaler fitted ONLY on nominal training data.

- Saves preprocessed arrays (3D scaled and flattened variants, plus labels)
  and the fitted scaler (scaler.pkl) under:
    data/eclss_preprocessed/

This script can be run:
  * as a standalone .py in a repo (REPO_ROOT is the parent of the script's
    directory; data is read from and written to REPO_ROOT/data/…)
  * inside a Jupyter/Colab notebook (REPO_ROOT falls back to the current
    working directory).
"""

from __future__ import annotations

from pathlib import Path

import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import joblib


# ============================================================
# PATH CONFIGURATION
# ============================================================

# Locate the repository root.
#   * Script execution: __file__ exists, so the root is the parent of the
#     directory that contains this file.
#   * Notebook execution: __file__ is undefined (NameError), so the current
#     working directory is used instead.
try:
    _this_file = Path(__file__)
except NameError:
    REPO_ROOT = Path.cwd()
else:
    REPO_ROOT = _this_file.resolve().parents[1]

# Everything lives below <REPO_ROOT>/data.
DATA_ROOT = REPO_ROOT.joinpath("data")

# Input (raw) and output (preprocessed) directories.
RAW_DIR = DATA_ROOT.joinpath("eclss_synthetic_dataset_full")
OUT_DIR = DATA_ROOT.joinpath("eclss_preprocessed")
# The output directory is created eagerly, as soon as this module is loaded.
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Raw input files read by the pipeline.
CYCLES_FILE = RAW_DIR.joinpath("cycles_3d.npy")
SYS_LABELS_FILE = RAW_DIR.joinpath("labels_system.npy")
SENSOR_LABELS_FILE = RAW_DIR.joinpath("labels_sensor.npy")


# ============================================================
# HELPER: SCALE 3D ARRAY
# ============================================================

def scale_3d(X: np.ndarray, scaler: StandardScaler) -> np.ndarray:
    """
    Scale a 3D array (N, T, C) with an already fitted scaler.

    The scaler operates on 2D input, so the array is reshaped to
    (N*T, C), passed through ``scaler.transform`` and reshaped back.

    Parameters
    ----------
    X : array of shape (N, T, C).
    scaler : fitted scaler whose ``transform`` accepts an (N*T, C) array.

    Returns
    -------
    Array of shape (N, T, C) holding the transformed values. For an empty
    input (any dimension equal to 0) an empty array of the same shape is
    returned without calling the scaler.

    Raises
    ------
    ValueError
        If X is not 3-dimensional.
    """
    X = np.asarray(X)
    if X.ndim != 3:
        raise ValueError(
            f"scale_3d expects a 3D array of shape (N, T, C), got shape {X.shape}"
        )

    N, T, C = X.shape

    if X.size == 0:
        # scikit-learn transformers reject 0-row input, so an empty split is
        # returned as-is instead of crashing the pipeline. Keep a floating
        # dtype so the result is consistent with transformed output.
        out_dtype = X.dtype if np.issubdtype(X.dtype, np.floating) else np.float64
        return np.empty((N, T, C), dtype=out_dtype)

    X_2d = X.reshape(-1, C)          # (N*T, C)
    X_scaled_2d = scaler.transform(X_2d)
    # np.asarray guards against transformers configured to return
    # non-ndarray containers before restoring the 3D layout.
    return np.asarray(X_scaled_2d).reshape(N, T, C)


# ============================================================
# MAIN PREPROCESSING PIPELINE
# ============================================================

def main() -> None:
    """
    Run the full ECLSS preprocessing pipeline and write the results to disk.

    Steps:
      1. Load the raw cycles and the system / sensor labels from RAW_DIR.
      2. Separate nominal cycles (system label 0) from faulty ones (label > 0).
      3. Split the nominal cycles 70 / 15 / 15 into train / val / test.
      4. Split the faulty cycles 70 / 30 into train / test, stratified by
         system label.
      5. Build the combined anomaly-detection test set (nominal test +
         faulty test) with binary labels (0 = nominal, 1 = anomaly).
      6. Fit a StandardScaler on the nominal training cycles only, apply it
         to every split, and derive flattened (N, T*C) copies.
      7. Save every array as .npy and the scaler as scaler.pkl in OUT_DIR.

    Progress information and split sizes are printed to stdout.
    All splits use random_state=42, so repeated runs give the same partition.
    """
    print("============================================================")
    print(" PREPROCESSING ECLSS SYNTHETIC DATASET")
    print("============================================================")

    print(f"Raw data directory: {RAW_DIR}")
    print(f"Output directory:   {OUT_DIR}\n")

    # --------------------------------------------------------
    # 1) LOAD RAW DATA
    # --------------------------------------------------------
    X = np.load(CYCLES_FILE)           # (N, T, 3)
    y_sys = np.load(SYS_LABELS_FILE)   # (N,)
    y_sensor = np.load(SENSOR_LABELS_FILE)  # (N,)

    # Boolean masks are computed once and reused for reporting and selection.
    nom_mask = (y_sys == 0)
    fault_mask = (y_sys > 0)

    print(f"Total samples: {len(X)}")
    print(f"  Nominal (class 0): {nom_mask.sum()}")
    print(f"  Faulty  (class 1–5): {fault_mask.sum()}\n")

    # --------------------------------------------------------
    # 2) SEPARATE NOMINAL AND FAULTY
    # --------------------------------------------------------
    X_nom = X[nom_mask]
    y_sys_nom = y_sys[nom_mask]
    # Sensor labels are only carried along for the faulty cycles; nothing
    # downstream in this function uses them for the nominal ones.

    X_fault = X[fault_mask]
    y_sys_fault = y_sys[fault_mask]
    y_sensor_fault = y_sensor[fault_mask]

    print("Separated data:")
    print(f"  Nominal samples: {len(X_nom)}")
    print(f"  Faulty samples:  {len(X_fault)}\n")

    # --------------------------------------------------------
    # 3) SPLIT NOMINAL DATA FOR VAE
    #    70% train, 15% val, 15% test
    # --------------------------------------------------------
    X_nom_train, X_nom_temp, y_sys_nom_train, y_sys_nom_temp = train_test_split(
        X_nom, y_sys_nom, test_size=0.30, random_state=42
    )

    # The held-out 30% is halved into validation and test.
    X_nom_val, X_nom_test, y_sys_nom_val, y_sys_nom_test = train_test_split(
        X_nom_temp, y_sys_nom_temp, test_size=0.50, random_state=42
    )

    print("Nominal split (for VAE):")
    print(f"  Train: {len(X_nom_train)}")
    print(f"  Val:   {len(X_nom_val)}")
    print(f"  Test:  {len(X_nom_test)}\n")

    # --------------------------------------------------------
    # 4) SPLIT FAULTY DATA FOR SVM + ANOMALY TEST
    #    70% train, 30% test – stratified by system fault type
    # --------------------------------------------------------
    (X_fault_train,
     X_fault_test,
     y_sys_fault_train,
     y_sys_fault_test,
     y_sensor_fault_train,
     y_sensor_fault_test) = train_test_split(
        X_fault,
        y_sys_fault,
        y_sensor_fault,
        test_size=0.30,
        stratify=y_sys_fault,
        random_state=42,
    )

    print("Faulty split (for SVM and evaluation):")
    print(f"  Train (faulty): {len(X_fault_train)}")
    print(f"  Test  (faulty): {len(X_fault_test)}")
    print("  Fault distribution (train):", np.bincount(y_sys_fault_train))
    print("  Fault distribution (test): ", np.bincount(y_sys_fault_test))
    print()

    # --------------------------------------------------------
    # 5) COMBINED TEST SET FOR ANOMALY DETECTION
    #    (nominal test + faulty test)
    # --------------------------------------------------------
    X_test_all = np.concatenate([X_nom_test, X_fault_test], axis=0)
    y_sys_test_all = np.concatenate([y_sys_nom_test, y_sys_fault_test])
    y_binary_test_all = (y_sys_test_all > 0).astype(int)  # 0=nominal, 1=anomaly

    print("Combined test set (for VAE anomaly detection):")
    print(f"  Total:   {len(X_test_all)}")
    print(f"  Nominal: {(y_binary_test_all == 0).sum()}")
    print(f"  Anomaly: {(y_binary_test_all == 1).sum()}\n")

    # --------------------------------------------------------
    # 6) NORMALIZATION (FIT SCALER ON NOMINAL TRAIN ONLY)
    # --------------------------------------------------------
    # Unpacking three values also asserts that the training block is 3D.
    _, _, C = X_nom_train.shape
    X_nom_train_2d = X_nom_train.reshape(-1, C)  # (N_nom_train*T, C)

    scaler = StandardScaler()
    scaler.fit(X_nom_train_2d)

    print("Scaler fitted on NOMINAL training data only:")
    print(f"  Means: {scaler.mean_}")
    print(f"  Stds:  {np.sqrt(scaler.var_)}\n")

    # Scale all relevant splits
    X_nom_train_scaled = scale_3d(X_nom_train, scaler)
    X_nom_val_scaled = scale_3d(X_nom_val, scaler)
    X_nom_test_scaled = scale_3d(X_nom_test, scaler)

    X_fault_train_scaled = scale_3d(X_fault_train, scaler)
    X_fault_test_scaled = scale_3d(X_fault_test, scaler)

    X_test_all_scaled = scale_3d(X_test_all, scaler)

    # --------------------------------------------------------
    # 7) SAVE ALL PREPROCESSED DATA
    # --------------------------------------------------------
    print("Saving preprocessed data to disk...")

    def flat(arr: np.ndarray) -> np.ndarray:
        """Flatten (N, T, C) to (N, T*C) for MLP-based VAE / simple models."""
        return arr.reshape(len(arr), -1)

    # File name -> array, listed in the order the files are written.
    outputs = {
        # --- VAE training data (nominal only) ---
        "X_train_nom_scaled.npy": X_nom_train_scaled,
        "X_train_nom_flat.npy": flat(X_nom_train_scaled),
        "X_val_nom_scaled.npy": X_nom_val_scaled,
        "X_val_nom_flat.npy": flat(X_nom_val_scaled),
        "X_test_nom_scaled.npy": X_nom_test_scaled,
        "X_test_nom_flat.npy": flat(X_nom_test_scaled),
        "y_test_nom_sys.npy": y_sys_nom_test,
        # --- SVM training data (faulty only) ---
        "X_train_fault_scaled.npy": X_fault_train_scaled,
        "X_train_fault_flat.npy": flat(X_fault_train_scaled),
        "y_train_fault_sys.npy": y_sys_fault_train,
        "y_train_fault_sensor.npy": y_sensor_fault_train,
        # --- Combined test set for anomaly detection (nominal + faulty) ---
        "X_test_all_scaled.npy": X_test_all_scaled,
        "X_test_all_flat.npy": flat(X_test_all_scaled),
        "y_test_all_sys.npy": y_sys_test_all,
        "y_test_all_binary.npy": y_binary_test_all,
        # --- Faulty test set for SVM evaluation ---
        "X_test_fault_scaled.npy": X_fault_test_scaled,
        "X_test_fault_flat.npy": flat(X_fault_test_scaled),
        "y_test_fault_sys.npy": y_sys_fault_test,
        "y_test_fault_sensor.npy": y_sensor_fault_test,
    }

    # Re-create the output directory here as well: it may have been removed
    # since this module was first loaded (e.g. between notebook re-runs).
    OUT_DIR.mkdir(parents=True, exist_ok=True)

    for filename, array in outputs.items():
        np.save(OUT_DIR / filename, array)

    # --- Save scaler ---
    joblib.dump(scaler, OUT_DIR / "scaler.pkl")

    print(f"\n✅ Saved all preprocessed data to: {OUT_DIR.resolve()}")
    print("============================================================")
    print(" PREPROCESSING COMPLETE")
    print("============================================================\n")


# Run the pipeline when this code is executed directly (as a script or in a
# notebook cell); importing the module does not trigger it.
if __name__ == "__main__":
    main()


 PREPROCESSING ECLSS SYNTHETIC DATASET
Raw data directory: C:\Users\ahasa\project_root\data\eclss_synthetic_dataset_full
Output directory:   C:\Users\ahasa\project_root\data\eclss_preprocessed

Total samples: 720
  Nominal (class 0): 120
  Faulty  (class 1–5): 600

Separated data:
  Nominal samples: 120
  Faulty samples:  600

Nominal split (for VAE):
  Train: 84
  Val:   18
  Test:  18

Faulty split (for SVM and evaluation):
  Train (faulty): 420
  Test  (faulty): 180
  Fault distribution (train): [ 0 84 84 84 84 84]
  Fault distribution (test):  [ 0 36 36 36 36 36]

Combined test set (for VAE anomaly detection):
  Total:   198
  Nominal: 18
  Anomaly: 180

Scaler fitted on NOMINAL training data only:
  Means: [20.90136217  0.30160151 14.70007224]
  Stds:  [0.22969965 0.0816969  0.2204265 ]

Saving preprocessed data to disk...

✅ Saved all preprocessed data to: C:\Users\ahasa\project_root\data\eclss_preprocessed
 PREPROCESSING COMPLETE

