## EEL5825 - Machine Learning and Pattern Recognition 
## Course Project

Student: Longho Bernard Che

Student ID: 5756998

In [None]:
import numpy as np
import pandas as pd
from scipy.io import loadmat
import os
import json
from sklearn.model_selection import train_test_split
import yaml

## Data Loading

In [None]:
class DataLoader:
    def __init__(self, config_path="config/parameters.yaml", use_cached=True):
        """
        Read the YAML configuration and keep the settings the loader needs.

        Parameters
        ----------
        config_path : str
            Path to the YAML config, joined onto the current working
            directory (which is treated as the project root).
        use_cached : bool
            If True, ``load_data`` first tries the arrays stored under
            data/processed/ before reprocessing the raw .mat files.
        """
        # The working directory doubles as the project root.
        root_dir = os.getcwd()
        resolved_config = os.path.join(root_dir, config_path)

        with open(resolved_config, "r") as handle:
            self.config = yaml.safe_load(handle)

        self.files = self.config["dataset"]["files"]

        preprocessing = self.config["preprocessing"]
        self.segment_length = preprocessing["segment_length"]
        self.samples_per_class = preprocessing["samples_per_class"]
        self.project_root = root_dir
        self.random_state = preprocessing["random_state"]
        self.use_cached = use_cached

    def _load_cached_data(self):
        """Try to load cached processed data if available."""
        processed_dir = os.path.join(self.project_root, "data", "processed")
        signals_path = os.path.join(processed_dir, "signals.npy")
        labels_path = os.path.join(processed_dir, "labels.npy")
        label_map_path = os.path.join(processed_dir, "label_map.json")

        if (
            os.path.exists(signals_path)
            and os.path.exists(labels_path)
            and os.path.exists(label_map_path)
        ):
            print("--  Reusing cached processed data from data/processed/ ...")
            X = np.load(signals_path)
            y = np.load(labels_path)
            with open(label_map_path, "r") as f:
                label_map_raw = json.load(f)

            # JSON turns int keys into strings ‚Üí convert back
            label_map = {int(k): v for k, v in label_map_raw.items()}

            print(f"   Loaded {X.shape[0]} samples from cache")
            print(f"   Classes: {list(label_map.values())}")
            return X, y, label_map

        return None, None, None

    def load_data(self):
        """Load and segment all vibration data."""
        # Try cached version first
        if self.use_cached:
            X_cached, y_cached, label_map_cached = self._load_cached_data()
            if X_cached is not None:
                return X_cached, y_cached, label_map_cached

        all_signals = []
        all_labels = []
        label_map = {}

        print("--  Loading CWRU Bearing Dataset from raw .mat files...")

        for label, (class_name, filename) in enumerate(self.files.items()):
            print(f"   Loading {class_name} from {filename}...")
            filepath = os.path.join(self.project_root, "data", "raw", filename)

            try:
                mat_data = loadmat(filepath)

                # Extract vibration signal (handle different key names)
                signal_keys = [
                    key
                    for key in mat_data.keys()
                    if "DE_time" in key or "X" in key or "driven" in key
                ]
                if not signal_keys:
                    print(f"     --  No vibration signal found in {filename}")
                    continue

                signal_key = signal_keys[0]
                signal = mat_data[signal_key].flatten()

                # Segment into fixed-length samples
                segments = []
                for i in range(
                    0, len(signal) - self.segment_length, self.segment_length
                ):
                    segments.append(signal[i : i + self.segment_length])
                    if len(segments) >= self.samples_per_class:
                        break

                all_signals.extend(segments)
                all_labels.extend([label] * len(segments))
                label_map[label] = class_name
                print(f"     --  Loaded {len(segments)} samples")

            except Exception as e:
                print(f"     --  ERROR loading {filename}: {e}")
                continue

        # Convert to numpy arrays
        X = np.array(all_signals)
        y = np.array(all_labels)

        # Save processed data
        self.save_processed_data(X, y, label_map)

        print(f"\n--  Dataset Summary:")
        print(f"   Total samples: {len(X)}")
        print(f"   Number of classes: {len(np.unique(y))}")
        print(f"   Signal length: {self.segment_length}")
        print(f"   Classes: {list(label_map.values())}")

        return X, y, label_map

    def create_features(self, signals):
        """Extract features for traditional ML models."""
        print("--  Extracting features for traditional ML...")

        features = []
        for signal in signals:
            # Time-domain features
            mean = np.mean(signal)
            std = np.std(signal)
            rms = np.sqrt(np.mean(signal**2))
            peak_to_peak = np.max(signal) - np.min(signal)
            skewness = np.mean((signal - mean) ** 3) / (std**3) if std != 0 else 0
            kurtosis = np.mean((signal - mean) ** 4) / (std**4) if std != 0 else 0

            feature_vector = [mean, std, rms, peak_to_peak, skewness, kurtosis]
            features.append(feature_vector)

        feature_names = ["mean", "std", "rms", "peak_to_peak", "skewness", "kurtosis"]
        features_array = np.array(features)

        return features_array, feature_names

    def save_processed_data(self, X, y, label_map):
        """Save processed data to files."""
        processed_dir = os.path.join(self.project_root, "data", "processed")
        os.makedirs(processed_dir, exist_ok=True)

        np.save(os.path.join(processed_dir, "signals.npy"), X)
        np.save(os.path.join(processed_dir, "labels.npy"), y)

        with open(os.path.join(processed_dir, "label_map.json"), "w") as f:
            json.dump(label_map, f, indent=2)

        print("-- Saved processed data to data/processed/")

    def prepare_splits(self, X, y, X_features=None):
        """
        Prepare train/validation/test splits with CONSISTENT indices
        between raw signals and feature matrices.

        Uses preprocessing.test_size and preprocessing.val_size from config
        as FINAL fractions of the whole dataset.
        """
        random_state = self.random_state
        test_size_cfg = float(self.config["preprocessing"]["test_size"])
        val_size_cfg = float(self.config["preprocessing"]["val_size"])

        print(
            f"\n--  Preparing data splits (test={test_size_cfg}, val={val_size_cfg})..."
        )

        if test_size_cfg + val_size_cfg >= 1.0:
            raise ValueError(
                "test_size + val_size must be < 1.0 in config/preprocessing."
            )

        # First: split into train vs holdout (val+test)
        holdout_size = test_size_cfg + val_size_cfg
        relative_test_size = test_size_cfg / holdout_size  # within holdout

        indices = np.arange(len(y))

        # Split indices ‚Äì ensures consistent splits for both X and X_features
        idx_train, idx_hold, y_train, y_hold = train_test_split(
            indices,
            y,
            test_size=holdout_size,
            random_state=random_state,
            stratify=y,
        )

        idx_val, idx_test, y_val, y_test = train_test_split(
            idx_hold,
            y_hold,
            test_size=relative_test_size,
            random_state=random_state,
            stratify=y_hold,
        )

        # Raw signals for deep learning
        X_train_raw = X[idx_train]
        X_val_raw = X[idx_val]
        X_test_raw = X[idx_test]

        # Features for traditional ML (if provided)
        if X_features is not None:
            X_train_feat = X_features[idx_train]
            X_val_feat = X_features[idx_val]
            X_test_feat = X_features[idx_test]

            print(
                f"   Traditional ML: {X_train_feat.shape[0]} train, "
                f"{X_val_feat.shape[0]} val, {X_test_feat.shape[0]} test"
            )
        else:
            X_train_feat = X_val_feat = X_test_feat = None

        # Reshape for deep learning (N, L, 1)
        X_train_raw = X_train_raw.reshape(X_train_raw.shape[0], X_train_raw.shape[1], 1)
        X_val_raw = X_val_raw.reshape(X_val_raw.shape[0], X_val_raw.shape[1], 1)
        X_test_raw = X_test_raw.reshape(X_test_raw.shape[0], X_test_raw.shape[1], 1)

        print(
            f"   Deep Learning: {X_train_raw.shape[0]} train, "
            f"{X_val_raw.shape[0]} val, {X_test_raw.shape[0]} test"
        )

        splits = {
            "traditional": {
                "X_train": X_train_feat,
                "X_val": X_val_feat,
                "X_test": X_test_feat,
                "y_train": y_train,
                "y_val": y_val,
                "y_test": y_test,
            },
            "deep_learning": {
                "X_train": X_train_raw,
                "X_val": X_val_raw,
                "X_test": X_test_raw,
                "y_train": y_train,
                "y_val": y_val,
                "y_test": y_test,
            },
        }

        return splits

## Traditional Machine Learning Methods

In [None]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
import joblib
import yaml
import os

In [None]:
class TraditionalML:
    def __init__(self):
        """
        Load config/parameters.yaml and instantiate the three classifiers.

        The current working directory is used as the project root. Model
        hyperparameters (logistic-regression ``max_iter``, random-forest
        ``n_estimators``, SVM ``kernel``) come from the
        ``models.traditional`` section of the config; every estimator uses
        ``random_state=42``.
        """
        # Get project root
        self.project_root = os.getcwd()

        config_file = os.path.join(self.project_root, "config", "parameters.yaml")
        with open(config_file, "r") as handle:
            self.config = yaml.safe_load(handle)

        traditional_cfg = self.config["models"]["traditional"]

        logistic = LogisticRegression(
            max_iter=traditional_cfg["logistic_regression"]["max_iter"],
            random_state=42,
        )
        forest = RandomForestClassifier(
            n_estimators=traditional_cfg["random_forest"]["n_estimators"],
            random_state=42,
        )
        # probability=True so the SVM exposes predict_proba like the others
        svm = SVC(
            kernel=traditional_cfg["svm"]["kernel"],
            random_state=42,
            probability=True,
        )

        self.models = {
            "LogisticRegression": logistic,
            "RandomForest": forest,
            "SVM": svm,
        }

        self.results = {}

    def train_and_evaluate(self, X_train, X_test, y_train, y_test):
        """Train and evaluate all traditional ML models """
        print("\n-- Training Traditional ML Models...")
        print("=" * 50)

        # Check data shapes
        """
        print(f"DEBUG - X_train shape: {X_train.shape}")
        print(f"DEBUG - X_test shape: {X_test.shape}")
        print(f"DEBUG - y_train shape: {y_train.shape}")
        print(f"DEBUG - y_test shape: {y_test.shape}")
        """

        if X_train is None or X_test is None:
            print("-- ERROR: Feature data is None. Check data loading.")
            return {}

        # Hyperparameter tuning for RandomForest
        try:
            self.tune_random_forest(X_train, y_train)
        except Exception as e:
            print(f"-- RandomForest tuning failed, using default params: {e}")

        for name, model in self.models.items():
            print(f"\nTraining {name}...")

            try:
                # Train model
                model.fit(X_train, y_train)

                # Predictions (hard labels)
                y_pred = model.predict(X_test)

                # Check prediction shape
                #print(f"DEBUG - y_pred shape: {y_pred.shape}")
                #print(f"DEBUG - y_test shape: {y_test.shape}")

                # Try to get class probabilities for ROC/AUC
                y_proba = None
                if hasattr(model, "predict_proba"):
                    try:
                        y_proba = model.predict_proba(X_test)
                        print(f"DEBUG - y_proba shape: {y_proba.shape}")
                    except Exception as e:
                        print(f"   -- Could not compute predict_proba for {name}: {e}")

                # Calculate metrics
                accuracy = accuracy_score(y_test, y_pred)

                # Store results
                self.results[name] = {
                    "model": model,
                    "accuracy": accuracy,
                    "predictions": y_pred,
                    "true_labels": y_test,
                }

                print(f"   -- {name} Accuracy: {accuracy:.4f}")

                # Save model
                models_dir = os.path.join(self.project_root, "models")
                os.makedirs(models_dir, exist_ok=True)
                joblib.dump(model, os.path.join(models_dir, f"{name.lower()}.pkl"))

                # Save probabilities for ROC if available
                if y_proba is not None:
                    # Use sorted unique labels to define class order
                    class_labels = sorted(np.unique(y_train))
                    proba_df = pd.DataFrame(
                        y_proba,
                        columns=[f"class_{c}" for c in class_labels],
                    )
                    proba_df["true_label"] = y_test

                    results_dir = os.path.join(self.project_root, "results")
                    os.makedirs(results_dir, exist_ok=True)
                    proba_path = os.path.join(results_dir, f"{name}_proba.csv")
                    proba_df.to_csv(proba_path, index=False)
                    print(f"   -- Saved probabilities for {name} to {proba_path}")

            except Exception as e:
                print(f"   -- ERROR training {name}: {e}")
                continue

        return self.results

    def tune_random_forest(self, X_train, y_train):
        """
        Simple hyperparameter tuning for RandomForest using GridSearchCV.

        Searches over a small grid and updates self.models['RandomForest']
        to the best estimator found (based on CV accuracy).
        """
        if "RandomForest" not in self.models:
            print("-- RandomForest not found in models dict; skipping RF tuning.")
            return

        print("\n-- Hyperparameter tuning for RandomForest (3-fold CV)...")

        rf = self.models["RandomForest"]

        param_grid = {
            "n_estimators": [50, 100, 200],
            "max_depth": [None, 10, 20],
            "min_samples_split": [2, 5],
            "min_samples_leaf": [1, 2],
            "max_features": ["sqrt", "log2"],
        }

        grid = GridSearchCV(
            rf,
            param_grid=param_grid,
            cv=3,
            scoring="accuracy",
            n_jobs=-1,
            verbose=1,
        )

        grid.fit(X_train, y_train)

        print(f"   -- Best RF params: {grid.best_params_}")
        print(f"   -- Best CV accuracy: {grid.best_score_:.4f}")

        # Replace the RandomForest model with the best estimator
        self.models["RandomForest"] = grid.best_estimator_

    def save_results(self):
        """Save traditional ML results."""
        results_dir = os.path.join(self.project_root, "results")
        os.makedirs(results_dir, exist_ok=True)

        # Save accuracy comparison
        results_df = pd.DataFrame(
            {
                "Model": list(self.results.keys()),
                "Accuracy": [result["accuracy"] for result in self.results.values()],
                "Type": "Traditional ML",
            }
        )
        results_df.to_csv(
            os.path.join(results_dir, "traditional_ml_results.csv"), index=False
        )

        # Save detailed predictions
        all_predictions = []
        for model_name, result in self.results.items():
            for i, (true, pred) in enumerate(
                zip(result["true_labels"], result["predictions"])
            ):
                all_predictions.append(
                    {
                        "model": model_name,
                        "true_label": true,
                        "predicted_label": pred,
                        "correct": true == pred,
                    }
                )

        pd.DataFrame(all_predictions).to_csv(
            os.path.join(results_dir, "traditional_predictions.csv"), index=False
        )
        print("-- Saved traditional ML results to results/")

## Deep Learning Methods

In [None]:
import os
import numpy as np
import pandas as pd
import yaml
from sklearn.metrics import accuracy_score

# Optional plotting imports (kept in case you want to extend later)
import matplotlib.pyplot as plt  # noqa: F401
import seaborn as sns  # noqa: F401

try:
    import tensorflow as tf
    from tensorflow.keras import layers, models, callbacks
    from tensorflow.keras.optimizers import Adam

    TF_AVAILABLE = True
    print("--  Using TensorFlow Keras")
except ImportError:
    try:
        import keras
        from keras import layers, models, callbacks
        from keras.optimizers import Adam

        TF_AVAILABLE = True
        print("--  Using standalone Keras")
    except ImportError:
        TF_AVAILABLE = False
        print("--  Deep learning libraries not available")


In [None]:
class DeepLearningModels:
    def __init__(self):
        """
        Load the config, read the training hyperparameters and make sure
        the output directories exist.

        ``epochs``, ``batch_size`` and ``learning_rate`` are taken from the
        ``models.deep_learning`` config section, defaulting to 30, 32 and
        0.001 when absent. The working directory is the project root.
        """
        self.project_root = os.getcwd()

        config_file = os.path.join(self.project_root, "config", "parameters.yaml")
        with open(config_file, "r") as handle:
            self.config = yaml.safe_load(handle)

        self.results = {}

        # Deep learning hyperparameters from config
        hyper = self.config.get("models", {}).get("deep_learning", {})
        self.epochs = int(hyper.get("epochs", 30))
        self.batch_size = int(hyper.get("batch_size", 32))
        self.learning_rate = float(hyper.get("learning_rate", 0.001))

        # Ensure models and results directories exist
        for folder in ("models", "results"):
            os.makedirs(os.path.join(self.project_root, folder), exist_ok=True)

    # ------------------------------------------------------------------
    # DATA PREPROCESSING
    # ------------------------------------------------------------------
    def preprocess_data(self, X_train, X_val, X_test):
        """
        Preprocess data for deep learning models.

        Uses GLOBAL z-score normalization based on the TRAIN set only.
        This preserves relative amplitude differences between samples,
        which is important for vibration/fault diagnosis.
        """
        print("--  Preprocessing data for deep learning...")

        # Ensure float32 and correct shape
        X_train = X_train.astype(np.float32)
        X_val = X_val.astype(np.float32)
        X_test = X_test.astype(np.float32)

        # Compute global mean/std on TRAIN ONLY
        train_mean = X_train.mean()
        train_std = X_train.std()

        if train_std < 1e-8:
            print("‚ö†Ô∏è Train std is extremely small; skipping normalization.")
            return X_train, X_val, X_test

        print(f"   Global train mean: {train_mean:.5f}, std: {train_std:.5f}")

        X_train_norm = (X_train - train_mean) / train_std
        X_val_norm = (X_val - train_mean) / train_std
        X_test_norm = (X_test - train_mean) / train_std

        print(
            f"   Data shapes - Train: {X_train_norm.shape}, "
            f"Val: {X_val_norm.shape}, Test: {X_test_norm.shape}"
        )
        print(
            f"   Train range - Min: {X_train_norm.min():.3f}, "
            f"Max: {X_train_norm.max():.3f}"
        )

        return X_train_norm, X_val_norm, X_test_norm

    # ------------------------------------------------------------------
    # MODEL DEFINITIONS
    # ------------------------------------------------------------------
    def create_1d_cnn(self, input_shape, num_classes):
        """
        Assemble and compile the 1D CNN classifier for vibration signals.

        Architecture: three Conv1D + BatchNormalization stages with
        16/32/64 filters and kernel sizes 7/5/3. The first two stages are
        followed by max pooling, the third by global average pooling. A
        64-unit dense layer with dropout 0.3 feeds a softmax output of
        ``num_classes`` units. Compiled with Adam at the configured
        learning rate and sparse categorical cross-entropy.
        """
        print("   Building 1D CNN architecture...")

        stack = [layers.Input(shape=input_shape, name="input_layer")]

        # Pooled convolutional blocks 1 and 2: (filters, kernel_size)
        for filters, kernel_size in ((16, 7), (32, 5)):
            stack.append(
                layers.Conv1D(
                    filters, kernel_size=kernel_size, activation="relu", padding="same"
                )
            )
            stack.append(layers.BatchNormalization())
            stack.append(layers.MaxPooling1D(2))

        # Convolutional block 3 collapses the time axis instead of pooling
        stack.append(layers.Conv1D(64, kernel_size=3, activation="relu", padding="same"))
        stack.append(layers.BatchNormalization())
        stack.append(layers.GlobalAveragePooling1D())

        # Dense head; the only dropout in the network lives here
        stack.append(layers.Dense(64, activation="relu"))
        stack.append(layers.Dropout(0.3))
        stack.append(layers.Dense(num_classes, activation="softmax"))

        model = models.Sequential(stack)
        model.compile(
            optimizer=Adam(learning_rate=self.learning_rate),
            loss="sparse_categorical_crossentropy",
            metrics=["accuracy"],
        )

        return model

    def create_lstm(self, input_shape, num_classes):
        """
        Assemble and compile the LSTM classifier.

        A single 64-unit LSTM (no dropout, final state only) is followed by
        a 32-unit dense layer, dropout 0.2 and a softmax output of
        ``num_classes`` units. Compiled with Adam at the configured
        learning rate and sparse categorical cross-entropy.
        """
        print("   Building LSTM architecture...")

        input_layer = layers.Input(shape=input_shape, name="input_layer")
        recurrent = layers.LSTM(
            64,
            return_sequences=False,
            dropout=0.0,
            recurrent_dropout=0.0,
            name="lstm_1",
        )
        hidden = layers.Dense(32, activation="relu", name="dense_1")
        regularizer = layers.Dropout(0.2, name="dropout_1")
        output_layer = layers.Dense(
            num_classes, activation="softmax", name="output_layer"
        )

        model = models.Sequential(
            [input_layer, recurrent, hidden, regularizer, output_layer]
        )
        model.compile(
            optimizer=Adam(learning_rate=self.learning_rate),
            loss="sparse_categorical_crossentropy",
            metrics=["accuracy"],
        )

        return model

    # ------------------------------------------------------------------
    # TRAINING UTILITIES
    # ------------------------------------------------------------------
    def get_callbacks(self, model_name):
        """
        Build the Keras callbacks used for every training run.

        All three watch ``val_accuracy`` (higher is better): early stopping
        after 10 stagnant epochs with best-weight restore, learning-rate
        halving after 5 stagnant epochs (floor 1e-7), and a checkpoint of
        the best model at models/best_<model_name>.h5.
        """
        models_dir = os.path.join(self.project_root, "models")
        os.makedirs(models_dir, exist_ok=True)

        # Settings shared by every callback
        watch = {"monitor": "val_accuracy", "mode": "max", "verbose": 1}

        early_stop = callbacks.EarlyStopping(
            patience=10,
            restore_best_weights=True,
            **watch,
        )
        lr_schedule = callbacks.ReduceLROnPlateau(
            factor=0.5,
            patience=5,
            min_lr=1e-7,
            **watch,
        )
        checkpoint = callbacks.ModelCheckpoint(
            filepath=os.path.join(models_dir, f"best_{model_name}.h5"),
            save_best_only=True,
            **watch,
        )

        return [early_stop, lr_schedule, checkpoint]

    def analyze_dataset(self, X_train, y_train, X_val, y_val, X_test, y_test):
        """Analyze dataset characteristics."""
        print("\n--  DATASET ANALYSIS:")
        print("=" * 40)

        print(f"   Training samples: {X_train.shape[0]}")
        print(f"   Validation samples: {X_val.shape[0]}")
        print(f"   Test samples: {X_test.shape[0]}")
        print(f"   Input shape: {X_train.shape[1:]}")
        print(f"   Number of classes: {len(np.unique(y_train))}")

        # Class distribution
        train_unique, train_counts = np.unique(y_train, return_counts=True)
        val_unique, val_counts = np.unique(y_val, return_counts=True)
        test_unique, test_counts = np.unique(y_test, return_counts=True)

        print(f"\n   Class Distribution:")
        print(f"     Train: {dict(zip(train_unique, train_counts))}")
        print(f"     Val:   {dict(zip(val_unique, val_counts))}")
        print(f"     Test:  {dict(zip(test_unique, test_counts))}")

        # Data statistics
        print(f"\n   Data Statistics:")
        print(f"     Train - Min: {X_train.min():.3f}, Max: {X_train.max():.3f}")
        print(f"     Val   - Min: {X_val.min():.3f}, Max: {X_val.max():.3f}")
        print(f"     Test  - Min: {X_test.min():.3f}, Max: {X_test.max():.3f}")

        return True

    def train_single_model(self, model, model_name, X_train, y_train, X_val, y_val):
        """
        Fit one model and return the object produced by ``model.fit``.

        Prints the model summary, then trains with the configured number
        of epochs and batch size, shuffling each epoch, validating on
        ``(X_val, y_val)`` and using the callbacks from ``get_callbacks``.
        """
        print(f"\n--  Training {model_name}...")
        print("-" * 40)

        # Display model architecture
        print(f"   {model_name} Architecture:")
        model.summary()

        fit_options = {
            "validation_data": (X_val, y_val),
            "epochs": self.epochs,  # from config
            "batch_size": self.batch_size,  # from config
            "callbacks": self.get_callbacks(model_name),
            "verbose": 1,
            "shuffle": True,
        }

        return model.fit(X_train, y_train, **fit_options)

    def evaluate_model(self, model, model_name, X_test, y_test):
        """
        Evaluate a trained model on the test set and save its probabilities.

        Reports the loss/accuracy from ``model.evaluate`` plus an accuracy
        recomputed from the arg-max predictions, and writes the predicted
        class probabilities to results/<name>_proba.csv (best effort: a
        failure to save is reported, not raised).

        Returns
        -------
        dict
            ``model``, ``accuracy``, ``predictions``, ``probabilities``,
            ``true_labels`` and ``loss``.
        """
        print(f"\n-- Evaluating {model_name}...")

        # Basic evaluation
        test_loss, test_accuracy = model.evaluate(X_test, y_test, verbose=0)

        # Predictions (probabilities and hard labels)
        y_pred_proba = model.predict(X_test, verbose=0)
        y_pred = np.argmax(y_pred_proba, axis=1)

        # Additional metrics
        test_accuracy_manual = accuracy_score(y_test, y_pred)

        print(f"   Test Loss: {test_loss:.4f}")
        print(f"   Test Accuracy: {test_accuracy:.4f}")
        print(f"   Manual Accuracy: {test_accuracy_manual:.4f}")

        # Save probabilities for ROC/AUC
        try:
            results_dir = os.path.join(self.project_root, "results")
            os.makedirs(results_dir, exist_ok=True)

            # Map internal name "CNN" to display name "1D CNN" for consistency
            display_name = "1D CNN" if model_name == "CNN" else model_name

            # Use a file-friendly stem
            stem = display_name.replace(" ", "_")
            proba_path = os.path.join(results_dir, f"{stem}_proba.csv")

            # One column per model output. Deriving the names from y_test
            # would mismatch the column count whenever a class is absent
            # from the test split, and the save would silently fail.
            num_outputs = np.asarray(y_pred_proba).shape[1]
            proba_df = pd.DataFrame(
                y_pred_proba,
                columns=[f"class_{c}" for c in range(num_outputs)],
            )
            proba_df["true_label"] = y_test
            proba_df.to_csv(proba_path, index=False)
            print(f"   -- Saved probabilities for {display_name} to {proba_path}")
        except Exception as e:
            print(f"   -- Could not save probabilities for {model_name}: {e}")

        return {
            "model": model,
            "accuracy": test_accuracy,
            "predictions": y_pred,
            "probabilities": y_pred_proba,
            "true_labels": y_test,
            "loss": test_loss,
        }


    # ------------------------------------------------------------------
    # MAIN TRAINING ENTRY POINT
    # ------------------------------------------------------------------
    def train_models(self, X_train, X_val, X_test, y_train, y_val, y_test):
        """
        Main training function for deep learning models.

        Normalizes the data, prints a dataset analysis, then trains,
        evaluates and saves each model in turn:
        - 1D CNN
        - LSTM

        A failure in one model is reported and does not stop the other.
        Training histories and the summary report are written afterwards.

        Returns
        -------
        dict
            ``self.results`` keyed by internal model name ("CNN", "LSTM"),
            or an empty dict when no deep learning library is available.
        """
        if not TF_AVAILABLE:
            print("--  Deep learning libraries not available. Skipping DL training.")
            return {}

        print("\n DEEP LEARNING MODEL TRAINING")
        print("=" * 50)

        # Preprocess data
        X_train_processed, X_val_processed, X_test_processed = self.preprocess_data(
            X_train, X_val, X_test
        )

        # Analyze dataset
        self.analyze_dataset(
            X_train_processed,
            y_train,
            X_val_processed,
            y_val,
            X_test_processed,
            y_test,
        )

        # The sparse categorical loss needs every label id to be a valid
        # output index, so size the output layer by the largest id as well
        # as by the number of distinct labels (ids may have gaps).
        num_classes = max(len(np.unique(y_train)), int(np.max(y_train)) + 1)
        input_shape = (X_train_processed.shape[1], X_train_processed.shape[2])

        # Define models to train (internal keys are short; display names handled later)
        models_to_train = {
            "CNN": self.create_1d_cnn(input_shape, num_classes),
            "LSTM": self.create_lstm(input_shape, num_classes),
        }

        trained_models = {}
        training_histories = {}

        for model_name, model in models_to_train.items():
            try:
                # Train model
                history = self.train_single_model(
                    model,
                    model_name,
                    X_train_processed,
                    y_train,
                    X_val_processed,
                    y_val,
                )

                # Evaluate model
                results = self.evaluate_model(
                    model, model_name, X_test_processed, y_test
                )

                # Store results
                trained_models[model_name] = results
                training_histories[model_name] = history.history

                # Save model
                model_path = os.path.join(
                    self.project_root, "models", f"{model_name}_final.h5"
                )
                model.save(model_path)
                print(f"-- Saved {model_name} to {model_path}")

            except Exception as e:
                # Best effort: keep going with the remaining models.
                print(f"--  Error training {model_name}: {e}")
                continue

        # Store final results
        self.results = trained_models

        # Save training histories
        self.save_training_histories(training_histories)

        # Generate comprehensive report
        self.generate_detailed_report()

        return self.results

    # ------------------------------------------------------------------
    # REPORTING
    # ------------------------------------------------------------------
    def save_training_histories(self, training_histories):
        """Save training histories for analysis."""
        results_dir = os.path.join(self.project_root, "results")
        os.makedirs(results_dir, exist_ok=True)

        for model_name, history in training_histories.items():
            history_df = pd.DataFrame(history)
            history_path = os.path.join(
                results_dir, f"{model_name}_training_history.csv"
            )
            history_df.to_csv(history_path, index=False)
            print(f"-- Saved {model_name} training history")

    def generate_detailed_report(self):
        """Generate detailed performance report."""
        if not self.results:
            print("--  No results to generate report")
            return

        print("\n--  DETAILED PERFORMANCE REPORT")
        print("=" * 50)

        # Create results dataframe
        results_data = []
        for model_name, result in self.results.items():
            # Map internal name "CNN" to display name "1D CNN" for your report
            display_name = "1D CNN" if model_name == "CNN" else model_name

            results_data.append(
                {
                    "Model": display_name,
                    "Accuracy": result["accuracy"],
                    "Loss": result["loss"],
                    "Type": "Deep Learning",
                }
            )

        results_df = pd.DataFrame(results_data)

        # Save to file
        results_dir = os.path.join(self.project_root, "results")
        results_df.to_csv(
            os.path.join(results_dir, "deep_learning_results.csv"), index=False
        )

        # Print ranking
        ranked_results = results_df.sort_values("Accuracy", ascending=False)
        print("\nüèÜ MODEL RANKINGS:")
        for _, row in ranked_results.iterrows():
            stars = "‚≠ê" * min(5, int(row["Accuracy"] * 10))
            print(f"   {row['Model']:20} {row['Accuracy']:.4f} {stars}")

        # Save predictions
        all_predictions = []
        for model_name, result in self.results.items():
            display_name = "1D CNN" if model_name == "CNN" else model_name
            for true, pred in zip(result["true_labels"], result["predictions"]):
                all_predictions.append(
                    {
                        "model": display_name,
                        "true_label": int(true),
                        "predicted_label": int(pred),
                        "correct": bool(true == pred),
                    }
                )

        predictions_df = pd.DataFrame(all_predictions)
        predictions_df.to_csv(
            os.path.join(results_dir, "deep_learning_predictions.csv"), index=False
        )

        print("-- Saved detailed reports to results/ folder")

    def save_results(self):
        """
        Save deep learning results to CSV files.

        This mirrors TraditionalML.save_results() so the pipeline can safely call:
            dl_models.save_results()

        It uses the in-memory `self.results` (populated by `train_models`) and
        reuses `generate_detailed_report()` which already writes:

            - results/deep_learning_results.csv
            - results/deep_learning_predictions.csv
        """
        if not self.results:
            print(
                "‚ö†Ô∏è No in-memory deep learning results found. "
                "If you already trained models in a previous run, the CSV files "
                "in results/ are already on disk. "
                "If this is a fresh run, call train_models(...) before save_results()."
            )
            return

        self.generate_detailed_report()

## Run the whole experiment

In [None]:
from datetime import datetime
import time

In [None]:
class PipelineExecutor:
    """Run the bearing fault diagnosis pipeline one step at a time.

    The executor validates the working directory and then runs four steps
    in order: data loading, traditional ML, deep learning and the results
    summary. Every step catches its own exceptions, prints a message and
    returns a boolean, so a failing step never prevents later steps from
    being attempted.

    Attributes
    ----------
    start_time : float or None
        ``time.time()`` value captured when ``run_pipeline`` starts.
    results : dict
        Outputs of the steps, stored under the keys ``"data"``,
        ``"traditional_ml"`` and ``"deep_learning"``.
    pipeline_status : dict
        Per-step completion flags printed by ``print_final_summary``.
    """

    # Steps that must succeed for run_pipeline() to report overall success.
    REQUIRED_STEPS = ("data_loading", "traditional_ml")

    def __init__(self):
        self.start_time = None
        self.results = {}
        self.pipeline_status = {
            "data_loading": False,
            "traditional_ml": False,
            "deep_learning": False,
            "visualization": False,
        }

    def print_header(self):
        """Print the pipeline banner together with the current wall-clock time."""
        print("\n" + "=" * 70)
        print("--  BEARING FAULT DIAGNOSIS PIPELINE - ROBUST EXECUTION")
        print("=" * 70)
        print(f"Start Time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
        print("=" * 70)

    def print_step_header(self, step_number, step_name):
        """Print a banner announcing step ``step_number`` named ``step_name``."""
        print(f"\n{'‚îÅ'*50}")
        print(f"--  STEP {step_number}: {step_name}")
        print(f"{'_'*50}")

    def validate_environment(self):
        """Check that the expected directories, config file and raw data exist.

        All paths are relative to the current working directory.

        Returns
        -------
        bool
            True when ``data/raw`` and ``config`` exist, the file
            ``config/parameters.yaml`` exists and ``data/raw`` is not empty;
            False as soon as one of these checks fails.
        """
        print("\nüîç VALIDATING ENVIRONMENT...")

        # Check required directories
        required_dirs = ["data/raw", "config"]
        for dir_path in required_dirs:
            if not os.path.exists(dir_path):
                print(f"--  Missing directory: {dir_path}")
                return False
            print(f"--  Directory exists: {dir_path}")

        # Check required files
        required_files = ["config/parameters.yaml"]
        for file_path in required_files:
            if not os.path.exists(file_path):
                print(f"--  Missing file: {file_path}")
                return False
            print(f"--  File exists: {file_path}")

        # Check for data files
        data_files = os.listdir("data/raw")
        if not data_files:
            print("--  No data files found in data/raw/")
            print("   Please download CWRU .mat files first")
            return False

        print(f"--  Found {len(data_files)} data files in data/raw/")
        return True

    def load_data(self):
        """Step 1: load the data, extract features and build the splits.

        Uses ``DataLoader`` (with ``use_cached=True``) to obtain the signals,
        labels and label map, then calls ``create_features`` and
        ``prepare_splits``. Everything is stored in ``self.results["data"]``.

        Returns
        -------
        bool
            True on success; False if no data was loaded or any exception
            was raised (the exception message is printed).
        """
        self.print_step_header(1, "DATA LOADING AND PREPROCESSING")

        try:
            print("--  Initializing data loader...")
            loader = DataLoader(use_cached=True)  # or False to force reprocessing

            print("--  Loading and segmenting vibration data...")
            X, y, label_map = loader.load_data()

            if len(X) == 0:
                raise ValueError("No data loaded - check your .mat files")

            print("--  Extracting features for traditional ML...")
            features, feature_names = loader.create_features(X)

            print("--  Preparing train/validation/test splits...")
            splits = loader.prepare_splits(X, y, features)

            # Store results
            self.results["data"] = {
                "X": X,
                "y": y,
                "label_map": label_map,
                "features": features,
                "feature_names": feature_names,
                "splits": splits,
            }

            self.pipeline_status["data_loading"] = True
            print("--  Data loading completed successfully!")

            return True

        except Exception as e:
            print(f"--  Data loading failed: {e}")
            return False

    def train_traditional_ml(self):
        """Step 2: train and evaluate the traditional ML models.

        Requires step 1 to have completed. Trains via
        ``TraditionalML.train_and_evaluate`` on the ``"traditional"`` split,
        saves the results and stores them in ``self.results["traditional_ml"]``.

        Returns
        -------
        bool
            True on success; False if the data is missing, training returned
            nothing, or any exception was raised.
        """
        self.print_step_header(2, "TRADITIONAL MACHINE LEARNING")

        try:
            if not self.pipeline_status["data_loading"]:
                raise ValueError("Data not loaded - run Step 1 first")

            data = self.results["data"]
            splits = data["splits"]

            print("--  Initializing traditional ML models...")
            traditional_ml = TraditionalML()

            print("--  Training Logistic Regression, Random Forest, and SVM...")
            traditional_results = traditional_ml.train_and_evaluate(
                splits["traditional"]["X_train"],
                splits["traditional"]["X_test"],
                splits["traditional"]["y_train"],
                splits["traditional"]["y_test"],
            )

            if not traditional_results:
                raise ValueError("Traditional ML training returned no results")

            print("-- Saving traditional ML results...")
            traditional_ml.save_results()

            self.results["traditional_ml"] = traditional_results
            self.pipeline_status["traditional_ml"] = True

            print("--  Traditional ML training completed successfully!")
            return True

        except Exception as e:
            print(f"--  Traditional ML training failed: {e}")
            return False

    def train_deep_learning(self):
        """Step 3: train the deep learning models (optional step).

        The step is skipped, and counted as done, when ``TF_AVAILABLE`` is
        false. Otherwise it trains via ``DeepLearningModels.train_models`` on
        the ``"deep_learning"`` split, saves the results and stores them in
        ``self.results["deep_learning"]``.

        Returns
        -------
        bool
            Always True: a deep learning failure must not fail the whole
            pipeline. On failure the error is printed and
            ``pipeline_status["deep_learning"]`` stays False so the final
            summary does not report a failed step as completed.
        """
        self.print_step_header(3, "DEEP LEARNING MODELS")

        try:
            if not TF_AVAILABLE:
                print("‚ö†Ô∏è  Deep learning libraries not available. Skipping deep learning step...")
                self.pipeline_status["deep_learning"] = True
                return True

            if not self.pipeline_status["data_loading"]:
                raise ValueError("Data not loaded - run Step 1 first")

            data = self.results["data"]
            splits = data["splits"]

            print("üß† Initializing deep learning models...")
            dl_models = DeepLearningModels()

            print("--  Training CNN and LSTM models...")
            dl_results = dl_models.train_models(
                splits["deep_learning"]["X_train"],
                splits["deep_learning"]["X_val"],
                splits["deep_learning"]["X_test"],
                splits["deep_learning"]["y_train"],
                splits["deep_learning"]["y_val"],
                splits["deep_learning"]["y_test"],
            )

            print("-- Saving deep learning results...")
            dl_models.save_results()

            self.results["deep_learning"] = dl_results
            self.pipeline_status["deep_learning"] = True

            print("--  Deep learning training completed successfully!")
            return True

        except Exception as e:
            print(f"--  Deep learning training failed: {e}")
            # Don't fail the whole pipeline if DL fails: still return True,
            # but leave the status flag False so the failure stays visible.
            return True

    def generate_results_summary(self):
        """Step 4: combine the saved result CSVs and print a summary.

        Reads ``results/traditional_ml_results.csv`` and
        ``results/deep_learning_results.csv`` (a missing file is reported and
        skipped), prints the detailed summary and writes
        ``results/combined_results.csv``.

        Returns
        -------
        bool
            True on success; False if neither CSV exists or any exception
            was raised.
        """
        self.print_step_header(4, "RESULTS ANALYSIS AND SUMMARY")

        try:
            print("üìà Generating comprehensive results summary...")

            # Load and combine all available results
            all_results = []

            # Traditional ML results
            try:
                trad_results = pd.read_csv("results/traditional_ml_results.csv")
                all_results.append(trad_results)
                print("--  Loaded traditional ML results")
            except FileNotFoundError:
                print("‚ö†Ô∏è  Traditional ML results not found")

            # Deep Learning results
            try:
                dl_results = pd.read_csv("results/deep_learning_results.csv")
                all_results.append(dl_results)
                print("--  Loaded deep learning results")
            except FileNotFoundError:
                print("‚ö†Ô∏è  Deep learning results not found")

            if not all_results:
                print("--  No results found to generate summary")
                return False

            # Combine all results
            combined_results = pd.concat(all_results, ignore_index=True)

            # Generate comprehensive summary
            self._print_detailed_summary(combined_results)

            # Save combined results
            combined_results.to_csv("results/combined_results.csv", index=False)

            self.pipeline_status["visualization"] = True
            print("--  Results summary completed successfully!")
            return True

        except Exception as e:
            print(f"--  Results summary generation failed: {e}")
            return False

    def _print_detailed_summary(self, results_df):
        """Print rankings, the best model, statistics and a per-type breakdown.

        Parameters
        ----------
        results_df : pandas.DataFrame
            Must provide the columns ``"Model"``, ``"Accuracy"`` and
            ``"Type"``, and at least one row (the best model is taken from
            the first row after sorting).
        """
        print("\n" + "--  COMPREHENSIVE PERFORMANCE REPORT")
        print("=" * 50)

        # Sort by accuracy
        ranked_results = results_df.sort_values("Accuracy", ascending=False)

        print("\nüèÜ MODEL RANKINGS:")
        print("-" * 40)
        for idx, (_, row) in enumerate(ranked_results.iterrows(), 1):
            accuracy_percent = row["Accuracy"] * 100
            # One star per 0.2 of accuracy, capped at five.
            stars = "‚≠ê" * min(5, int(row["Accuracy"] * 10 // 2))
            # Medal for the top three, plain rank number afterwards.
            rank_icon = ["ü•á", "ü•à", "ü•â"][idx - 1] if idx <= 3 else f"{idx:2d}"

            print(f"   {rank_icon} {row['Model']:20} {accuracy_percent:6.2f}% {stars}")

        # Best model
        best_model = ranked_results.iloc[0]
        print("\n--  BEST PERFORMING MODEL:")
        print(f"   Model:    {best_model['Model']}")
        print(
            f"   Accuracy: {best_model['Accuracy']:.4f} ({best_model['Accuracy']*100:.2f}%)"
        )
        print(f"   Type:     {best_model['Type']}")

        # Statistics
        print("\nüìà PERFORMANCE STATISTICS:")
        print(f"   Total Models:    {len(results_df)}")
        print(f"   Average Accuracy: {results_df['Accuracy'].mean():.4f}")
        print(f"   Best Accuracy:    {results_df['Accuracy'].max():.4f}")
        print(f"   Worst Accuracy:   {results_df['Accuracy'].min():.4f}")

        # Model type breakdown
        model_types = results_df["Type"].value_counts()
        print("\n--  MODEL TYPE BREAKDOWN:")
        for model_type, count in model_types.items():
            type_accuracy = results_df[results_df["Type"] == model_type][
                "Accuracy"
            ].mean()
            print(f"   {model_type:20} {count:2d} models, avg: {type_accuracy:.4f}")

    def calculate_execution_time(self):
        """Return the elapsed time since ``run_pipeline`` started.

        Returns
        -------
        tuple of int
            ``(minutes, seconds)``; ``(0, 0)`` when the pipeline has not
            been started.
        """
        if not self.start_time:
            return 0, 0
        total_seconds = time.time() - self.start_time
        return int(total_seconds // 60), int(total_seconds % 60)

    def print_final_summary(self):
        """Print step statuses, a data summary, the run time and output counts."""
        minutes, seconds = self.calculate_execution_time()

        print("\n" + "=" * 70)
        print("üéâ PIPELINE EXECUTION COMPLETED!")
        print("=" * 70)

        # Pipeline status
        print("\n--  PIPELINE STATUS:")
        for step, status in self.pipeline_status.items():
            # Distinct markers so that failed steps are actually visible.
            status_icon = "[OK]  " if status else "[FAIL]"
            step_name = step.replace("_", " ").title()
            print(f"   {status_icon} {step_name}")

        # Data summary
        if self.pipeline_status["data_loading"]:
            data = self.results["data"]
            print("\n--  DATA SUMMARY:")
            print(f"   Samples: {len(data['X'])}")
            print(f"   Classes: {len(data['label_map'])}")
            print(f"   Signal Length: {data['X'].shape[1]}")

        # Results summary
        print("\n‚è±Ô∏è  EXECUTION TIME:")
        print(f"   Total: {minutes} minutes {seconds} seconds")

        print("\n-- OUTPUTS GENERATED:")
        output_dirs = ["results", "models", "data/processed"]
        for dir_path in output_dirs:
            if not os.path.exists(dir_path):
                continue
            entries = os.listdir(dir_path)
            if entries:
                print(f"   {dir_path}: {len(entries)} files")

        print("\n" + "=" * 70)

    def run_pipeline(self):
        """Execute the complete pipeline.

        Validates the environment, runs the four steps in order (a failing
        step does not stop the following ones) and prints the final summary.

        Returns
        -------
        bool
            True only when both required steps (data loading and
            traditional ML) reported success; False otherwise, including
            when environment validation fails.
        """
        # Imported here because the notebook only imports traceback in a
        # later cell than the one defining this class.
        import traceback

        self.start_time = time.time()
        self.print_header()

        # Validate environment first
        if not self.validate_environment():
            print("--  Environment validation failed. Please check setup.")
            return False

        # Execute pipeline steps
        steps = [
            ("data_loading", self.load_data),
            ("traditional_ml", self.train_traditional_ml),
            ("deep_learning", self.train_deep_learning),
            ("visualization", self.generate_results_summary),
        ]

        step_ok = {}
        for step_key, step_func in steps:
            try:
                step_ok[step_key] = bool(step_func())
            except Exception as e:
                print(f"--  Step failed with exception: {e}")
                traceback.print_exc()
                # Continue with next step instead of failing completely
                step_ok[step_key] = False

        # Final summary
        self.print_final_summary()

        # Success criteria - data loading and traditional ML must both work.
        # Counting "any two successful steps" is not enough: the deep
        # learning step always returns True and the summary step can succeed
        # on CSV files left over from an earlier run.
        if all(step_ok[key] for key in self.REQUIRED_STEPS):
            print("--  Pipeline completed with acceptable success!")
            return True

        print("‚ö†Ô∏è  Pipeline completed with limited success.")
        return False

In [None]:
import time
import traceback
from plot_results import main

# Entry point of the notebook: run the whole pipeline and, only when it
# reports success, hand over to the plotting script's main().
try:
    pipeline = PipelineExecutor()
    success = pipeline.run_pipeline()
    if not success:
        print("\n--  Pipeline executed with some issues. Check logs above.")
    else:
        print("\n--  Pipeline executed successfully!")
        main()  # Call visualization only if pipeline succeeded
except KeyboardInterrupt:
    # Manual interruption of the cell is reported without a traceback.
    print("\n\n--  Pipeline interrupted by user")
except Exception as err:
    print(f"\n--  Unexpected pipeline failure: {err}")
    traceback.print_exc()