# Feature Analysis — Human Review Before AutoML

Loads inIT-OWL data, applies `RobotFeatureExtractor`, and visualizes:
1. Correlation matrix of features vs failure label
2. Top 10 most correlated features over time

Run this notebook to validate feature quality before training the anomaly model.

In [None]:
import sys
from pathlib import Path

# Resolve the directory one level above the current working directory and make
# sure it is on the import path, so the `pipeline.*` imports in later cells
# can be found.
PROJECT_ROOT = Path("../").resolve()
_project_root_str = str(PROJECT_ROOT)
if _project_root_str not in sys.path:
    # Prepend (index 0) so it is searched before the existing entries.
    sys.path.insert(0, _project_root_str)

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## 1. Load inIT-OWL data

Use `InITOWLIngestor` to load from `data/init_owl/` or `data/init_owl.csv`, then pivot to wide format for the feature extractor.

In [None]:
from pipeline.ingestion.init_owl_ingestor import InITOWLIngestor

# Candidate locations, in order of preference: a directory of CSV files, then
# a single CSV file next to it.
csv_dir = PROJECT_ROOT / "data" / "init_owl"
single_csv = PROJECT_ROOT / "data" / "init_owl.csv"

# The directory only counts when it exists and holds at least one *.csv file;
# otherwise fall back to the single-file location.
dir_has_csvs = csv_dir.exists() and any(csv_dir.glob("*.csv"))
input_path = csv_dir if dir_has_csvs else single_csv

# Fail early with an actionable message when neither location is usable.
if not input_path.exists():
    raise FileNotFoundError(
        "No inIT-OWL data found. Place CSVs in data/init_owl/ or data/init_owl.csv"
    )

ingestor = InITOWLIngestor(input_path)
long_df = ingestor.ingest()
print(f"Loaded {len(long_df)} rows (long form)")
# Last expression of the cell: preview the first rows of the long-form frame.
long_df.head()

In [None]:
# Reshape the long sensor log into one row per (timestamp, machine_id) with one
# column per sensor_type value, then rename columns to the names
# RobotFeatureExtractor expects: temperature_c, vibration_rms and
# joint_speed_rpm (current is used as a proxy); torque is optional.
key_cols = ["timestamp", "machine_id"]
sensor_renames = {
    "temperature": "temperature_c",
    "vibration": "vibration_rms",
    "current": "joint_speed_rpm",  # proxy
}

can_pivot = (not long_df.empty) and ("sensor_type" in long_df.columns)
if can_pivot:
    pivoted = long_df.pivot_table(
        index=key_cols,
        columns="sensor_type",
        values="value",
        aggfunc="first",
    )
    wide_df = pivoted.reset_index()
    # Clear the leftover "sensor_type" name on the column axis.
    wide_df.columns.name = None

    # Only rename the sensors that are actually present after the pivot.
    present_renames = {
        old: new for old, new in sensor_renames.items() if old in wide_df.columns
    }
    wide_df = wide_df.rename(columns=present_renames)

    # Carry the label along as failure_class so it can be correlated later.
    if "label" in long_df.columns:
        labels = (
            long_df.groupby(key_cols)["label"]
            .first()
            .rename("failure_class")
            .reset_index()
        )
        wide_df = wide_df.merge(labels, on=key_cols, how="left")
        # Keys without a label are treated as class 0.
        wide_df["failure_class"] = wide_df["failure_class"].fillna(0).astype(int)
else:
    # Nothing to pivot: keep going with an empty frame.
    wide_df = pd.DataFrame()

print(f"Wide shape: {wide_df.shape}")
wide_df.head()

## 2. Apply RobotFeatureExtractor

Extract features using the config in `config/feature_config.yaml`.

In [None]:
from pipeline.cleansing.robot_feature_extractor import RobotFeatureExtractor

# Build the extractor with its default arguments. No config path is passed
# here -- presumably it locates config/feature_config.yaml itself; TODO confirm.
extractor = RobotFeatureExtractor()
# Derive the feature table from the wide sensor frame built in the previous cell.
features_df = extractor.extract(wide_df)
print(f"Features shape: {features_df.shape}")
# Last expression of the cell: preview the first rows of the extracted features.
features_df.head()

In [None]:
# Attach the failure label (taken from wide_df) to the extracted features so it
# can be correlated against them.
# The merge only happens when wide_df carries a label, features were produced,
# and both join keys are present as columns of features_df.
if "failure_class" in wide_df.columns and not features_df.empty:
    keys = ["timestamp", "machine_id"]
    if all(key in features_df.columns for key in keys):
        # One label per key, so the left merge cannot multiply feature rows.
        merge_df = wide_df[keys + ["failure_class"]].drop_duplicates(keys)
        # Drop any failure_class already on features_df (passed through by the
        # extractor, or left behind by an earlier run of this cell). Without
        # this the merge would emit failure_class_x / failure_class_y and the
        # assignment below would raise KeyError.
        features_df = features_df.drop(columns="failure_class", errors="ignore").merge(
            merge_df, on=keys, how="left"
        )
        # Feature rows with no matching label are treated as class 0.
        features_df["failure_class"] = features_df["failure_class"].fillna(0).astype(int)

# Everything that is neither a join key nor the label counts as a feature.
feature_cols = [c for c in features_df.columns if c not in ("timestamp", "machine_id", "failure_class")]

## 3. Correlation matrix: features vs failure label

Features that correlate with `failure_class` are good candidates for the model.

In [None]:
# Heatmap of pairwise correlations between every feature and failure_class.
# Skipped (with a message) when there is no label, no feature column, or no
# usable data.
if "failure_class" not in features_df.columns or not feature_cols:
    print("No failure_class or feature columns; skipping correlation matrix.")
else:
    # Discard rows where every feature is missing; they carry no signal.
    corr_df = features_df[feature_cols + ["failure_class"]].dropna(how="all", subset=feature_cols)
    if corr_df.empty:
        print("No rows after dropna; skipping.")
    else:
        # Keep numeric/bool columns only: DataFrame.corr() raises on
        # non-numeric columns in pandas >= 2.0 (numeric_only defaults to False).
        numeric_df = corr_df.select_dtypes(include=["number", "bool"])
        if numeric_df.shape[1] < 2:
            print("Fewer than two numeric columns; skipping correlation matrix.")
        else:
            corr = numeric_df.corr()
            fig, ax = plt.subplots(figsize=(10, 8))
            # Diverging palette centred on 0 and pinned to [-1, 1] so colours
            # are comparable between runs.
            sns.heatmap(corr, annot=True, fmt=".2f", cmap="RdBu_r", center=0, vmin=-1, vmax=1, ax=ax)
            ax.set_title("Correlation matrix: features vs failure_class")
            fig.tight_layout()
            plt.show()

## 4. Top 10 most correlated features over time

Plot the top 10 features (by absolute correlation with failure_class) over time for visual review.

In [None]:
# Plot the features most correlated (by absolute value) with failure_class,
# one stacked panel per feature, with the label overlaid on the last panel.
if "failure_class" not in features_df.columns or not feature_cols:
    print("No failure_class or feature columns; skipping time series.")
else:
    # Rank on numeric/bool columns only: DataFrame.corr() raises on
    # non-numeric columns in pandas >= 2.0 (numeric_only defaults to False).
    numeric_df = features_df[feature_cols + ["failure_class"]].select_dtypes(include=["number", "bool"])
    if "failure_class" in numeric_df.columns:
        corr_with_label = numeric_df.corr()["failure_class"].drop("failure_class")
    else:
        corr_with_label = pd.Series(dtype=float)
    top10 = corr_with_label.abs().nlargest(10).index.tolist()
    if not top10:
        # No rankable correlation (e.g. every value is NaN because the label is
        # constant): fall back to the first feature columns.
        top10 = feature_cols[:10]

    plot_df = features_df.copy()
    if "timestamp" in plot_df.columns:
        plot_df["timestamp"] = pd.to_datetime(plot_df["timestamp"])
        plot_df = plot_df.sort_values("timestamp")
        x_values = plot_df["timestamp"]
        x_label = "timestamp"
    else:
        # features_df is not guaranteed to expose a timestamp column (the label
        # merge cell tolerates its absence), so plot against row position
        # instead of raising KeyError.
        x_values = range(len(plot_df))
        x_label = "row"

    n_plots = len(top10)
    # squeeze=False always returns a 2-D array, so a single panel needs no
    # special casing.
    fig, axes = plt.subplots(n_plots, 1, figsize=(12, 2 * n_plots), sharex=True, squeeze=False)
    axes = axes[:, 0]
    for ax, col in zip(axes, top10):
        if col in plot_df.columns:
            ax.plot(x_values, plot_df[col], alpha=0.7, label=col)
        ax.set_ylabel(col)
        ax.legend(loc="upper right", fontsize=8)
        ax.grid(True, alpha=0.3)

    # Overlay the label on the bottom panel for visual alignment with failures.
    axes[-1].plot(x_values, plot_df["failure_class"], color="red", alpha=0.5, label="failure_class")
    axes[-1].legend(loc="upper right", fontsize=8)
    axes[-1].set_xlabel(x_label)
    fig.suptitle("Top 10 features (by |corr| with failure_class) over time")
    fig.tight_layout()
    plt.show()