<a href="https://colab.research.google.com/github/ahmetdenizyilmaz/Machine-Learning-Vibration-Fault-Detection/blob/main/train.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Vibration Fault Detection — ResMLP Training

Train a **Residual MLP** classifier on 35 firmware-matched features extracted from
raw 3-axis acceleration segments.  Designed to run on **Google Colab** (GPU recommended).

In [1]:
# ── Cell 1: Setup ─────────────────────────────────────────────────────────
# Install dependencies (uncomment on Colab)
# !pip install -q numpy scipy pandas scikit-learn matplotlib seaborn datasets torch

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import copy

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    ConfusionMatrixDisplay,
    roc_curve,
    auc,
)
from sklearn.preprocessing import label_binarize, LabelEncoder, StandardScaler
from sklearn.utils.class_weight import compute_class_weight
from sklearn.inspection import permutation_importance

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts

import pickle, os, json, warnings
# Notebook-wide display settings: hide library warnings, use the grid theme.
sns.set_style('whitegrid')
warnings.filterwarnings('ignore')

# Run on the GPU when PyTorch can see one, otherwise on the CPU.
_device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(_device_name)
print(f'Setup complete.  Device: {device}')

Setup complete.  Device: cpu


In [3]:
# Mount Google Drive when running on Colab.  The `google.colab` package only
# exists inside Colab, so guard the import to keep "Restart & Run All" working
# on a local Jupyter kernel (e.g. with the local-JSON data source).
try:
    from google.colab import drive
except ImportError:
    print('google.colab not available; skipping Google Drive mount.')
else:
    drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# ── Cell 2: Configuration ─────────────────────────────────────────────────

# ── Data source: choose ONE ──
# Option A: HuggingFace dataset (for Colab)
DATA_SOURCE = 'huggingface'
HF_DATASET = 'adyady/bearing-fault-dataset'
SIGNAL_FIELD = 'low_data'  # which array to use: 'high_data' or 'low_data'

# Option B: Local JSON files
# DATA_SOURCE = 'local'
# DATA_DIR = 'data/raw'

# Feature cache: when this CSV exists, extraction is skipped and it is loaded.
FEATURE_CSV = 'features.csv'

# Training label → raw fault_category values folded into that label.
_LABEL_GROUPS = {
    'Healthy':          ['healthy'],
    'Bearing Fault':    ['bearing_inner', 'bearing_outer', 'bearing_rolling',
                         'bearing_fault', 'bearing_combined'],
    'Electrical Fault': ['electrical'],
    'Flow/Cavitation':  ['flow_cavitation'],
    'Unbalance':        ['unbalance'],
    'Misalignment':     ['misalignment', 'eccentricity'],
    'Gear Fault':       ['gear_fault'],
    'Rotor Fault':      ['rotor_fault'],
    'Unknown':          ['unknown'],
}

# Class grouping: raw fault_category → training label (inverse of the table above).
CLASS_MAP = {
    raw_category: label
    for label, raw_categories in _LABEL_GROUPS.items()
    for raw_category in raw_categories
}

# 35 feature columns (canonical order from feature_extraction.py):
#   temp, five per-axis statistics (x/y/z each), maxCf, then the peak ratios
#   for {acc, vel} × {Low, Mid, High} × {X, Y, Z}.
_AXIS_STATS = ('RMS', 'VRMS', 'EnvRMS', 'KU', 'P2P')
FEATURE_COLS = (
    ['temp']
    + [f'{axis}{stat}' for stat in _AXIS_STATS for axis in 'xyz']
    + ['maxCf']
    + [
        f'{domain}{band}PeakRatio{axis}'
        for domain in ('acc', 'vel')
        for band in ('Low', 'Mid', 'High')
        for axis in 'XYZ'
    ]
)

RANDOM_STATE = 42
print(f'Config: {len(FEATURE_COLS)} features, source={DATA_SOURCE}')

In [None]:
# ── Cell 3: Load Data ─────────────────────────────────────────────────────
# Produces `df` (one row per segment) from, in priority order:
#   1. the cached FEATURE_CSV, if it exists;
#   2. the HuggingFace dataset, running feature extraction per segment;
#   3. local JSON files via feature_extraction.extract_all.
# Then adds a `label` column by mapping `fault_category` through CLASS_MAP.

import sys, json


def _parse_rpm(meta_str):
    """Return the rpm stored in a metadata JSON string as a float, or None.

    Looks at the top-level 'rpm' key first, then 'operating_conditions.rpm'.
    Missing, empty, or malformed metadata yields None instead of an error, so
    a segment is not discarded just because its metadata is incomplete.
    A present but non-numeric rpm still raises from float().
    """
    if not (isinstance(meta_str, str) and meta_str):
        return None
    try:
        meta = json.loads(meta_str)
    except (json.JSONDecodeError, TypeError):
        return None
    if not isinstance(meta, dict):
        return None
    conditions = meta.get('operating_conditions')
    if not isinstance(conditions, dict):
        conditions = {}  # tolerate null / non-dict operating_conditions
    rpm = meta.get('rpm') or conditions.get('rpm')
    return float(rpm) if rpm is not None else None


if os.path.exists(FEATURE_CSV):
    print(f'Loading pre-computed features from {FEATURE_CSV}')
    df = pd.read_csv(FEATURE_CSV)

elif DATA_SOURCE == 'huggingface':
    # feature_extraction is only needed when features must actually be
    # computed, so import it here rather than before the cache check.
    try:
        from feature_extraction import extract_features
    except ImportError:
        print('feature_extraction.py not found locally.')
        print('Paste it into Colab or upload it, then re-run this cell.')
        print('(Or clone your repo: !git clone <your-repo-url>)')
        raise
    from datasets import load_dataset

    print(f'Loading HuggingFace dataset: {HF_DATASET} ...')
    ds = load_dataset(HF_DATASET, split='train')
    hf_df = ds.to_pandas()
    print(f'  Loaded {len(hf_df)} rows (each row = 1 axis of 1 segment)')
    print(f'  Columns: {list(hf_df.columns)}')
    print(f'  Axes: {hf_df["axis"].value_counts().to_dict()}')
    print(f'  Fault categories: {hf_df["fault_category"].value_counts().to_dict()}')

    # ── Group per-axis rows into segments ──
    # Each unique file_name may have rows for axis x, y, z
    grouped = hf_df.groupby('file_name')
    print(f'\n  Unique segments: {len(grouped)}')
    print(f'  Extracting features (using "{SIGNAL_FIELD}" column) ...')

    rows = []
    n_missing_x = 0  # segments without an x-axis row
    n_failed = 0     # segments whose extraction raised
    for i, (seg_name, seg_rows) in enumerate(grouped):
        try:
            # Collect the signal of every axis present for this segment
            axes_data = {
                row['axis']: np.asarray(row[SIGNAL_FIELD], dtype=np.float64)
                for _, row in seg_rows.iterrows()
            }

            if axes_data.get('x') is None:
                n_missing_x += 1
            else:
                # Sample rate and metadata are read from the segment's first row
                first_row = seg_rows.iloc[0]
                fs = float(first_row.get('target_sample_rate') or first_row.get('original_sample_rate'))
                rpm = _parse_rpm(first_row.get('metadata_json', '{}'))

                feats = extract_features(
                    axes_data['x'], axes_data.get('y'), axes_data.get('z'),
                    fs=fs, rpm=rpm,
                )
                feats['filename'] = seg_name
                feats['fault_category'] = first_row.get('fault_category', '')
                feats['fault_type'] = first_row.get('fault_type', '')
                feats['dataset'] = first_row.get('source_dataset', '')
                feats['sample_rate_hz'] = fs
                rows.append(feats)

        except Exception as e:
            # Best-effort extraction: keep going, but count every failure and
            # show the first few messages so problems are not hidden.
            n_failed += 1
            if n_failed <= 5:
                print(f'    SKIP {seg_name}: {e}')

        if (i + 1) % 2000 == 0:
            print(f'    Processed {i + 1}/{len(grouped)} segments')

    if n_missing_x or n_failed:
        print(f'  Skipped {n_missing_x + n_failed} segments '
              f'({n_missing_x} without x-axis, {n_failed} extraction errors)')

    df = pd.DataFrame(rows)
    if df.empty:
        # Do not cache an empty CSV: it would be picked up on the next run and
        # fail later with a much less helpful error.
        raise RuntimeError(
            f'No features could be extracted from {HF_DATASET}; '
            f'{FEATURE_CSV} was not written.'
        )
    df.to_csv(FEATURE_CSV, index=False)
    print(f'\n  Extracted {len(df)} segments → saved to {FEATURE_CSV}')

else:  # local JSON files
    print(f'Running feature extraction on {DATA_DIR} ...')
    from feature_extraction import extract_all
    df = extract_all(DATA_DIR, output_csv=FEATURE_CSV)

# ── Map fault_category → class label ──
df['label'] = df['fault_category'].map(CLASS_MAP)
unknown = df['label'].isna().sum()
if unknown > 0:
    unmapped = df.loc[df['label'].isna(), 'fault_category'].unique()
    print(f'WARNING: {unknown} rows have unmapped fault_category: {unmapped}')
    print('         These rows will be dropped.  Update CLASS_MAP to include them.')
    df = df.dropna(subset=['label'])

print(f'\nDataset: {len(df)} segments, {df["label"].nunique()} classes')
print(df['label'].value_counts())

In [None]:
# ── Cell 4: EDA ───────────────────────────────────────────────────────────
# Overview figure (class counts, feature correlation, xRMS per class),
# followed by per-class box plots for a handful of key features.

fig, axes = plt.subplots(1, 3, figsize=(20, 5))
ax_counts, ax_corr, ax_xrms = axes

# 4a. Class distribution (classes ordered from most to least frequent)
order = df['label'].value_counts().index
sns.countplot(data=df, y='label', order=order, ax=ax_counts, hue='label', legend=False)
ax_counts.set_title('Class Distribution')
ax_counts.set_xlabel('Count')

# 4b. Feature correlation heatmap (configured features that have at least
#     one non-NaN value)
feat_present = [
    col for col in FEATURE_COLS
    if col in df.columns and df[col].notna().any()
]
corr = df[feat_present].corr()
sns.heatmap(
    corr,
    ax=ax_corr,
    cmap='coolwarm',
    center=0,
    xticklabels=False,
    yticklabels=False,
    cbar_kws={'shrink': 0.6},
)
ax_corr.set_title('Feature Correlation')

# 4c. Box plot of xRMS per class
if 'xRMS' in df.columns:
    sns.boxplot(data=df, y='label', x='xRMS', order=order, ax=ax_xrms, hue='label', legend=False)
    ax_xrms.set_title('xRMS by Class')

plt.tight_layout()
plt.show()

# 4d. Additional box plots for key features that are present and not all-NaN
key_features = [
    name for name in ('xVRMS', 'xKU', 'xP2P', 'maxCf')
    if name in df.columns and df[name].notna().any()
]
if key_features:
    n_panels = len(key_features)
    fig, axes2 = plt.subplots(1, n_panels, figsize=(5 * n_panels, 4))
    if n_panels == 1:
        axes2 = [axes2]  # subplots returns a bare Axes for a single panel
    for panel, feat_name in zip(axes2, key_features):
        sns.boxplot(data=df, y='label', x=feat_name, order=order, ax=panel, hue='label', legend=False)
        panel.set_title(feat_name)
    plt.tight_layout()
    plt.show()

In [None]:
# ── Cell 5: Preprocessing ─────────────────────────────────────────────────
# Builds the scaled train/test matrices and the integer-encoded labels used
# by the training and evaluation cells.

# Keep only the configured feature columns that the DataFrame really has
use_cols = [col for col in FEATURE_COLS if col in df.columns]
X = df[use_cols].copy()
y = df['label'].copy()

# Label encoder is fitted on every label so train and test share one mapping
le = LabelEncoder()
le.fit(y)
class_labels = list(le.classes_)
num_classes = len(le.classes_)

# Train / test split (80/20, stratified by class)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=RANDOM_STATE,
    stratify=y,
)
y_train_enc = le.transform(y_train)
y_test_enc = le.transform(y_test)

# Missing values become 0; scaling statistics come from the training split only
scaler = StandardScaler()
X_train_sc = scaler.fit_transform(X_train.fillna(0))
X_test_sc = scaler.transform(X_test.fillna(0))

nan_fraction = X_train.isna().mean().mean()
print(f'Features used: {len(use_cols)}')
print(f'Train: {len(X_train)}  |  Test: {len(X_test)}')
print(f'NaN fraction in train: {nan_fraction:.2%}')
print(f'Classes ({num_classes}): {class_labels}')

In [None]:
# ── Cell 6: ResMLP — Architecture & Training ─────────────────────────────

# ─────────────────────────────────────────────────────────────────────────
# Architecture
# ─────────────────────────────────────────────────────────────────────────

class ResidualBlock(nn.Module):
    """Constant-width residual unit.

    The residual branch applies, in order: BatchNorm → GELU → Linear →
    BatchNorm → GELU → Dropout → Linear.  Its output is added to the block
    input, so input and output both have ``dim`` features.

    Args:
        dim: number of input (and output) features.
        dropout: drop probability of the branch's Dropout layer.
    """

    def __init__(self, dim, dropout=0.3):
        super().__init__()
        # Layers are created in execution order; `self.net` keeps the same
        # indices (0..6), so state-dict keys stay `net.<idx>.<param>`.
        branch = [
            nn.BatchNorm1d(dim),
            nn.GELU(),
            nn.Linear(dim, dim),
            nn.BatchNorm1d(dim),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(dim, dim),
        ]
        self.net = nn.Sequential(*branch)

    def forward(self, x):
        """Return ``x`` plus the residual branch evaluated on ``x``."""
        residual = self.net(x)
        return x + residual


class ResMLP(nn.Module):
    """Residual MLP classifier for tabular feature vectors.

    Layout (default ``hidden=256``, ``n_blocks=4``):

        input_layer : Linear(n_features → hidden) → BN → GELU → Dropout(p)
        res_blocks  : ResidualBlock(hidden, p) × n_blocks
        head        : Linear(hidden → hidden/2) → BN → GELU → Dropout(0.7·p)
                      Linear(hidden/2 → hidden/4) → BN → GELU → Dropout(0.5·p)
                      Linear(hidden/4 → n_classes)

    ``forward`` returns the raw output of the last Linear layer (no softmax).
    """

    def __init__(self, n_features, n_classes, hidden=256, n_blocks=4, dropout=0.3):
        super().__init__()
        half_width = hidden // 2
        quarter_width = hidden // 4

        # Sub-modules are built and registered in forward order.
        stem = [
            nn.Linear(n_features, hidden),
            nn.BatchNorm1d(hidden),
            nn.GELU(),
            nn.Dropout(dropout),
        ]
        self.input_layer = nn.Sequential(*stem)

        blocks = []
        for _ in range(n_blocks):
            blocks.append(ResidualBlock(hidden, dropout))
        self.res_blocks = nn.Sequential(*blocks)

        classifier = [
            nn.Linear(hidden, half_width),
            nn.BatchNorm1d(half_width),
            nn.GELU(),
            nn.Dropout(dropout * 0.7),
            nn.Linear(half_width, quarter_width),
            nn.BatchNorm1d(quarter_width),
            nn.GELU(),
            nn.Dropout(dropout * 0.5),
            nn.Linear(quarter_width, n_classes),
        ]
        self.head = nn.Sequential(*classifier)

    def forward(self, x):
        """Run input layer, residual stack and head; return the head's output."""
        features = self.input_layer(x)
        features = self.res_blocks(features)
        return self.head(features)


# ─────────────────────────────────────────────────────────────────────────
# Sklearn-compatible wrapper (for evaluation & saving)
# ─────────────────────────────────────────────────────────────────────────

class TorchWrapper:
    """Bundle a trained torch model with its preprocessing objects.

    Exposes ``predict_proba`` / ``predict`` in the sklearn style.  The scaler
    is stored but never applied by these methods: the array handed to
    ``predict_proba`` / ``predict`` is fed to the model as-is.
    """

    def __init__(self, model, scaler, label_encoder, feature_names, device_):
        self.model, self.scaler = model, scaler
        self.le = label_encoder
        self.feature_names = feature_names
        self.device = device_

    def predict_proba(self, X):
        """Return softmax class probabilities as a NumPy array (rows = samples)."""
        features = X.values if isinstance(X, pd.DataFrame) else X
        self.model.eval()
        with torch.no_grad():
            batch = torch.tensor(features, dtype=torch.float32).to(self.device)
            logits = self.model(batch)
            return torch.softmax(logits, dim=1).cpu().numpy()

    def predict(self, X):
        """Return the most probable class per row, decoded by the label encoder."""
        winners = np.argmax(self.predict_proba(X), axis=1)
        return self.le.inverse_transform(winners)


# ─────────────────────────────────────────────────────────────────────────
# Training function
# ─────────────────────────────────────────────────────────────────────────

def train_resmlp(X_tr, y_tr, X_val, y_val, n_features, n_classes,
                 class_weights, epochs=200, lr=1e-3, batch_size=256, patience=20):
    """Train one ResMLP with early stopping on the validation loss.

    Args:
        X_tr, y_tr: training features / integer labels (converted to float32
            and long tensors respectively).
        X_val, y_val: validation features / integer labels, evaluated as a
            single batch after every epoch.
        n_features, n_classes: input and output widths of the network.
        class_weights: 1-D tensor passed to ``CrossEntropyLoss(weight=...)``.
        epochs: maximum number of epochs.
        lr: AdamW learning rate.
        batch_size: mini-batch size for the training loader.
        patience: stop after this many consecutive epochs without a new best
            validation loss.

    Returns:
        ``(model, epochs_run, best_val_loss, val_acc)`` where ``model`` holds
        the weights of the best-validation-loss epoch and ``val_acc`` is its
        accuracy on the validation set.

    Raises:
        RuntimeError: if no epoch produced a finite validation loss (this
            includes ``epochs=0``), so there are no weights to restore.
    """
    model = ResMLP(n_features, n_classes, hidden=256, n_blocks=4, dropout=0.3).to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=1e-4)
    scheduler = CosineAnnealingWarmRestarts(optimizer, T_0=20, T_mult=2)
    criterion = nn.CrossEntropyLoss(weight=class_weights.to(device))

    train_ds = TensorDataset(
        torch.tensor(X_tr, dtype=torch.float32),
        torch.tensor(y_tr, dtype=torch.long),
    )
    # BatchNorm1d cannot compute batch statistics from a single sample in
    # training mode.  If the final mini-batch would hold exactly one sample,
    # drop it; shuffling means a different sample is left out each epoch.
    drop_tail = len(train_ds) > batch_size and len(train_ds) % batch_size == 1
    loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True, drop_last=drop_tail)
    val_X = torch.tensor(X_val, dtype=torch.float32).to(device)
    val_y = torch.tensor(y_val, dtype=torch.long).to(device)

    best_loss, best_state, wait = float('inf'), None, 0
    epochs_run = 0
    for epoch in range(epochs):
        epochs_run = epoch + 1

        model.train()
        for xb, yb in loader:
            xb, yb = xb.to(device), yb.to(device)
            optimizer.zero_grad()
            criterion(model(xb), yb).backward()
            optimizer.step()
        scheduler.step()  # learning-rate schedule advances once per epoch

        model.eval()
        with torch.no_grad():
            val_loss = criterion(model(val_X), val_y).item()
        if val_loss < best_loss:
            # New best: snapshot the weights (deepcopy so later optimizer
            # steps do not modify the stored tensors).
            best_loss = val_loss
            best_state = copy.deepcopy(model.state_dict())
            wait = 0
        else:
            wait += 1
            if wait >= patience:
                break

    if best_state is None:
        # Either epochs == 0 or every validation loss was NaN/inf.
        raise RuntimeError(
            f'train_resmlp: no finite validation loss after {epochs_run} epoch(s); '
            'check the inputs for NaN/inf values and that epochs > 0.'
        )

    model.load_state_dict(best_state)
    model.eval()
    with torch.no_grad():
        val_acc = (model(val_X).argmax(1) == val_y).float().mean().item()
    return model, epochs_run, best_loss, val_acc


# ─────────────────────────────────────────────────────────────────────────
# Class weights for imbalanced data
# ─────────────────────────────────────────────────────────────────────────

# Seed the RNGs that drive weight initialisation, dropout and DataLoader
# shuffling, so that a fresh "Restart & Run All" reproduces the same training
# runs (GPU kernels may still introduce small non-deterministic differences).
np.random.seed(RANDOM_STATE)
torch.manual_seed(RANDOM_STATE)

# 'balanced' weights: inversely proportional to class frequency in the train split
cw = compute_class_weight('balanced', classes=np.arange(num_classes), y=y_train_enc)
class_weights = torch.tensor(cw, dtype=torch.float32)
print(f'Class weights: {dict(zip(le.classes_, cw.round(2)))}')

# ─────────────────────────────────────────────────────────────────────────
# 5-fold Cross-Validation
# ─────────────────────────────────────────────────────────────────────────

N_FOLDS = 5  # single source of truth for the splitter and the log messages

print(f'\n── ResMLP {N_FOLDS}-fold CV ──')
cv_accs = []
skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=RANDOM_STATE)
for fold, (tr_idx, val_idx) in enumerate(skf.split(X_train_sc, y_train_enc)):
    # A fresh model is trained per fold; the held-out fold drives early stopping
    # and provides the reported accuracy.
    model, ep, loss, acc = train_resmlp(
        X_train_sc[tr_idx], y_train_enc[tr_idx],
        X_train_sc[val_idx], y_train_enc[val_idx],
        n_features=len(use_cols), n_classes=num_classes,
        class_weights=class_weights,
    )
    cv_accs.append(acc)
    print(f'  Fold {fold+1}/{N_FOLDS}: epoch {ep}, val_loss={loss:.4f}, val_acc={acc:.4f}')

print(f'\nResMLP {N_FOLDS}-fold CV accuracy: {np.mean(cv_accs):.4f} ± {np.std(cv_accs):.4f}')

# ─────────────────────────────────────────────────────────────────────────
# Train final model on full training set
# ─────────────────────────────────────────────────────────────────────────
# Note: 10% of the training split is held out here as the early-stopping
# validation set, so the final model is fitted on the remaining 90%.

print('\nTraining final ResMLP on full train set...')
X_tr_f, X_val_f, y_tr_f, y_val_f = train_test_split(
    X_train_sc, y_train_enc, test_size=0.1, random_state=RANDOM_STATE, stratify=y_train_enc
)
final_model, ep, loss, acc = train_resmlp(
    X_tr_f, y_tr_f, X_val_f, y_val_f,
    n_features=len(use_cols), n_classes=num_classes,
    class_weights=class_weights, epochs=300, patience=30,
)
print(f'Final model: epoch {ep}, val_loss={loss:.4f}, val_acc={acc:.4f}')

# Wrapper used by the evaluation / importance cells (expects pre-scaled input)
best_model = TorchWrapper(final_model, scaler, le, use_cols, device)
best_cv_acc = np.mean(cv_accs)

In [None]:
# ── Cell 7: Evaluation ────────────────────────────────────────────────────
# Scores the final model on the held-out test split: accuracy, per-class
# report, confusion matrix and one-vs-rest ROC curves.

y_pred = best_model.predict(X_test_sc)
y_proba = best_model.predict_proba(X_test_sc)
test_acc = (y_pred == y_test.values).mean()

print(f'ResMLP Test Accuracy: {test_acc:.4f}')
print(f'CV Accuracy:          {best_cv_acc:.4f} ± {np.std(cv_accs):.4f}')
print()
print(classification_report(y_test, y_pred, zero_division=0))

# ── Confusion Matrix ──
cm = confusion_matrix(y_test, y_pred, labels=class_labels)
fig, ax = plt.subplots(figsize=(8, 6))
ConfusionMatrixDisplay(cm, display_labels=class_labels).plot(ax=ax, cmap='Blues', colorbar=False)
ax.set_title(f'Confusion Matrix — ResMLP (acc={test_acc:.3f})')
plt.xticks(rotation=30, ha='right')
plt.tight_layout()
plt.show()

# ── ROC Curves (one-vs-rest) ──
# One indicator column per class, in the same order as the columns of y_proba.
# Built explicitly because label_binarize collapses the two-class case to a
# single column (the indicator of the second class), which would pair that
# indicator with the first class's probabilities and skip the second class.
y_test_bin = np.column_stack(
    [(y_test.values == cls).astype(int) for cls in class_labels]
)
fig, ax = plt.subplots(figsize=(8, 6))
for i, cls in enumerate(class_labels):
    fpr, tpr, _ = roc_curve(y_test_bin[:, i], y_proba[:, i])
    ax.plot(fpr, tpr, label=f'{cls} (AUC={auc(fpr, tpr):.3f})')
ax.plot([0, 1], [0, 1], 'k--', alpha=0.3)  # chance diagonal
ax.set_xlabel('FPR')
ax.set_ylabel('TPR')
ax.set_title('ROC Curves — ResMLP')
ax.legend(loc='lower right', fontsize=8)
plt.tight_layout()
plt.show()

In [None]:
# ── Cell 8: Feature Importance (Permutation) ─────────────────────────────
# Permutation importance: shuffle each feature and measure accuracy drop.
# Works with any model (no LightGBM needed).

from sklearn.base import BaseEstimator, ClassifierMixin

class _SklearnBridge(BaseEstimator, ClassifierMixin):
    """Thin estimator facade over a TorchWrapper.

    Gives sklearn's permutation_importance the estimator-style methods it
    needs (``fit``, ``predict``, ``score``) while delegating prediction to the
    wrapped, already-trained model.
    """

    def __init__(self, wrapper):
        self.wrapper = wrapper
        self.classes_ = wrapper.le.classes_

    def fit(self, X, y):
        """No-op; returns self without touching the wrapped model."""
        return self

    def predict(self, X):
        """Delegate to the wrapper's ``predict``."""
        return self.wrapper.predict(X)

    def score(self, X, y):
        """Fraction of rows whose prediction equals ``y`` (accuracy)."""
        predicted = self.predict(X)
        return (predicted == y).mean()

# Score the final model through the sklearn-style bridge
bridge = _SklearnBridge(best_model)
print('Computing permutation importance (this may take a minute)...')
result = permutation_importance(
    bridge,
    X_test_sc,
    y_test.values,
    n_repeats=10,
    random_state=RANDOM_STATE,
    n_jobs=-1,
)

# Mean importance per feature, largest first
feat_imp = pd.Series(result.importances_mean, index=use_cols)
feat_imp = feat_imp.sort_values(ascending=False)

# Horizontal bar chart of the strongest features (at most 20)
top_n = min(20, len(feat_imp))
fig, ax = plt.subplots(figsize=(8, 6))
feat_imp.head(top_n).plot.barh(ax=ax)
ax.invert_yaxis()  # most important feature at the top
ax.set_title(f'Top-{top_n} Feature Importance (Permutation)')
ax.set_xlabel('Mean accuracy decrease')
plt.tight_layout()
plt.show()

print('\nTop-20 features:')
top_table = feat_imp.head(20).to_string()
print(top_table)

In [None]:
# ── Cell 9: Save Model ────────────────────────────────────────────────────
# Writes two files:
#   resmlp_model.pkl      – weights + scaler + label encoder + metadata (pickle)
#   resmlp_checkpoint.pt  – weights + architecture config (torch.save)

# Always serialise CPU tensors: a state dict saved from a GPU model cannot be
# unpickled on a machine without CUDA.  Work on a copy so `final_model` (still
# used through `best_model`) stays on its current device.
cpu_state_dict = copy.deepcopy(final_model).cpu().state_dict()

# Read the architecture from the trained model itself so the saved config
# cannot drift from the values used when the network was built.
model_config = {
    'n_features': len(use_cols),
    'n_classes': num_classes,
    'hidden': final_model.input_layer[0].out_features,
    'n_blocks': len(final_model.res_blocks),
    'dropout': final_model.input_layer[3].p,
}

# Save PyTorch state dict + metadata
artifact = {
    'model_state_dict': cpu_state_dict,
    'model_config': model_config,
    'scaler': scaler,
    'label_encoder': le,
    'feature_names': use_cols,
    'class_labels': class_labels,
    'class_map': CLASS_MAP,
    'cv_accuracy': best_cv_acc,
    'test_accuracy': test_acc,
}

# NOTE: pickle files execute code when loaded; only load this artifact from a
# trusted source.
model_path = 'resmlp_model.pkl'
with open(model_path, 'wb') as f:
    pickle.dump(artifact, f)

print(f'Saved ResMLP to {model_path}')
print(f'  Features : {len(use_cols)}')
print(f'  Classes  : {class_labels}')
print(f'  CV acc   : {best_cv_acc:.4f}')
print(f'  Test acc : {test_acc:.4f}')

# Also save as pure PyTorch checkpoint
torch_path = 'resmlp_checkpoint.pt'
torch.save({
    'model_state_dict': cpu_state_dict,
    'model_config': artifact['model_config'],
    'class_labels': class_labels,
    'feature_names': use_cols,
}, torch_path)
print(f'  PyTorch checkpoint: {torch_path}')