# In [4]:
"""
EDA for ECLSS Synthetic Dataset
Generates exploratory plots for:
  - Distribution of sensors (hist + KDE)
  - Boxplots per sensor per class
  - Mean cycle plots
  - Sensor correlations (heatmap)
  - Sample overlay plots (10 per class)
"""

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# --------------------
# Load dataset
# --------------------
# Location of the saved arrays and metadata produced for the synthetic dataset.
data_dir = Path("data/eclss_synthetic_dataset_full")

X = np.load(data_dir / "cycles_3d.npy")       # (N, T, 3)
y_sys = np.load(data_dir / "labels_system.npy")
# Loaded alongside the system labels; not referenced by the plots in this section.
y_sensor = np.load(data_dir / "labels_sensor.npy")
meta = pd.read_csv(data_dir / "metadata.csv")

# Display names for the sensor channels, in the order of X's last axis.
sensor_names = ["O₂ (%)", "CO₂ (%)", "Pressure (psi)"]
# System-level label id -> human-readable class name used in titles and filenames.
system_classes = {
    0: "Nominal",
    1: "CO₂ Leak",
    2: "Valve Stiction",
    3: "Vacuum Anomaly",
    4: "CDRA Degradation",
    5: "OGA Degradation",
}

# Fail early with a clear message: the plotting code below indexes X as
# (sample, timestep, sensor) and masks it with `y_sys == class_id`, which
# otherwise surfaces as an obscure IndexError deep inside a plotting loop.
if X.ndim != 3:
    raise ValueError(f"cycles_3d.npy must be 3-D (N, T, sensors); got shape {X.shape}")
if X.shape[2] < len(sensor_names):
    raise ValueError(
        f"cycles_3d.npy has {X.shape[2]} sensor channels; "
        f"expected at least {len(sensor_names)}"
    )
if y_sys.shape != (X.shape[0],):
    raise ValueError(
        f"labels_system.npy shape {y_sys.shape} does not match "
        f"{X.shape[0]} samples in cycles_3d.npy"
    )

out_dir = Path("data/eclss_EDA")
# parents=True so the script also works when the parent directory is missing.
out_dir.mkdir(parents=True, exist_ok=True)


# ============================================================
# 1) HISTOGRAMS + KDE PER SENSOR
# ============================================================
# One histogram (with KDE overlay) per sensor channel, pooling every sample
# and timestep into a single 1-D array of readings.
for channel, label in enumerate(sensor_names):
    sensor_all = X[:, :, channel].ravel()

    fig, ax = plt.subplots(figsize=(8, 4))
    sns.histplot(sensor_all, kde=True, bins=50, ax=ax)
    ax.set_title(f"{label} Distribution (All Samples)")
    ax.set_xlabel(label)
    ax.set_ylabel("Count")
    ax.grid(alpha=0.3)
    fig.tight_layout()

    # '%' is swapped for 'pct' in the output filename.
    hist_file = out_dir / f"hist_{label.replace('%','pct')}.png"
    fig.savefig(hist_file)
    plt.close(fig)


# ============================================================
# 2) BOXPLOTS PER SENSOR PER SYSTEM CLASS
# ============================================================
# Build a long-form table with one row per (sample, timestep) reading.
# Reshaping X directly replaces the original Python-level N*T double loop;
# C-order reshape keeps the same row order (sample-major, then timestep).
box_columns = ["O2", "CO2", "Pressure"]
n_steps = X.shape[1]

# Class name per sample. A label missing from system_classes raises KeyError,
# exactly as the per-row dictionary lookup did.
sample_classes = [system_classes[label] for label in y_sys[:len(X)].tolist()]

df_box = pd.DataFrame(X[:, :, :3].reshape(-1, 3), columns=box_columns)
# Every timestep of a sample carries that sample's class name.
df_box.insert(0, "Class", np.repeat(sample_classes, n_steps))

# Show classes in class-id order rather than in order of first appearance in
# the data, and leave out classes that have no samples.
present_classes = set(sample_classes)
class_order = [name for name in system_classes.values() if name in present_classes]

for col in box_columns:
    plt.figure(figsize=(10, 4))
    sns.boxplot(data=df_box, x="Class", y=col, order=class_order or None)
    plt.xticks(rotation=45)
    plt.title(f"{col} Distribution per Class")
    plt.tight_layout()
    plt.savefig(out_dir / f"boxplot_{col}.png")
    plt.close()


# ============================================================
# 3) MEAN CYCLE PLOTS PER CLASS
# ============================================================
# For each system class, average all of its cycles sample-wise and plot the
# resulting mean trajectory of every sensor against the timestep index.
for class_id, class_name in system_classes.items():
    X_class = X[y_sys == class_id]

    # The mean of an empty selection is NaN (with a RuntimeWarning) and would
    # produce a blank figure, so classes without samples are skipped.
    if X_class.shape[0] == 0:
        print(f"Skipping mean cycle plot for {class_name}: no samples")
        continue

    mean_cycle = X_class.mean(axis=0)

    plt.figure(figsize=(10, 5))
    # All sensors share one axis; their units differ (see sensor_names labels).
    for idx, s in enumerate(sensor_names):
        plt.plot(mean_cycle[:, idx], label=s)

    plt.title(f"Mean Cycle – {class_name}")
    plt.xlabel("Timestep")
    plt.ylabel("Value")
    plt.legend()
    plt.grid(alpha=0.3)
    plt.tight_layout()
    plt.savefig(out_dir / f"mean_cycle_{class_name}.png")
    plt.close()


# ============================================================
# 4) CORRELATION HEATMAP (Nominal Class)
# ============================================================
# Average every cycle labelled 0 into a single mean cycle, then correlate the
# sensor channels with each other across the timesteps of that mean cycle.
# NOTE: this is the correlation of the class-averaged trajectory, not of the
# individual per-sample readings.
nominal = X[y_sys == 0].mean(axis=0)
corr = np.corrcoef(nominal, rowvar=False)

fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(
    corr,
    annot=True,
    cmap="coolwarm",
    vmin=-1,
    vmax=1,
    xticklabels=sensor_names,
    yticklabels=sensor_names,
    ax=ax,
)
ax.set_title("Sensor Correlation Heatmap (Nominal)")
fig.tight_layout()
fig.savefig(out_dir / "correlation_nominal.png")
plt.close(fig)


# ============================================================
# 5) OVERLAY PLOTS (First 10 Samples per Class)
# ============================================================
# For each system class, draw up to the first 10 cycles on top of each other,
# one subplot per sensor, to give a feel for within-class variability.
for class_id, class_name in system_classes.items():
    X_c = X[y_sys == class_id]
    n_plot = min(10, len(X_c))

    fig, axes = plt.subplots(1, 3, figsize=(15, 4))
    fig.suptitle(f"Overlay of First {n_plot} Samples – {class_name}")

    for channel, (ax, label) in enumerate(zip(axes, sensor_names)):
        # Semi-transparent lines so overlapping cycles stay distinguishable.
        for cycle in X_c[:n_plot]:
            ax.plot(cycle[:, channel], alpha=0.3)
        ax.set_title(label)
        ax.grid(alpha=0.3)

    fig.tight_layout()
    fig.savefig(out_dir / f"overlay_{class_name}.png")
    plt.close(fig)


# Final status line once every section has written its figures.
print(f"\n✅ EDA plots saved to: {out_dir}")



# Output: ✅ EDA plots saved to: data\eclss_EDA
