In [4]:
import pandas as pd

# Load the vibration dataset. The path is relative to the notebook working
# directory and points at the sibling ../data/ folder.
DATA_PATH = "../data/vibrations_data.csv"
df = pd.read_csv(DATA_PATH)

# Quick sanity check: overall shape, per-column dtypes, and the first rows.
print(f"Loaded dataframe with {df.shape[0]} rows and {df.shape[1]} columns")
print(df.dtypes)
df.head()

Loaded dataframe with 390263 rows and 25 columns
Time                  float64
Demand 1              float64
Control 1             float64
Output Drive 1        float64
Channel 1             float64
Channel 2             float64
Channel 3             float64
Channel 4             float64
Channel 1 Kurtosis    float64
Channel 2 Kurtosis    float64
Channel 3 Kurtosis    float64
Channel 4 Kurtosis    float64
Rear Input 1            int64
Rear Input 2            int64
Rear Input 3            int64
Rear Input 4            int64
Rear Input 5            int64
Rear Input 6            int64
Rear Input 7            int64
Rear Input 8            int64
condition              object
rpm                     int64
humidity                int64
temperature             int64
source_file            object
dtype: object


Unnamed: 0,Time,Demand 1,Control 1,Output Drive 1,Channel 1,Channel 2,Channel 3,Channel 4,Channel 1 Kurtosis,Channel 2 Kurtosis,...,Rear Input 4,Rear Input 5,Rear Input 6,Rear Input 7,Rear Input 8,condition,rpm,humidity,temperature,source_file
0,0.00145,0.125011,0.176033,0.0,0.211458,0.209182,0.145823,1.6242e-15,2.52457,2.94874,...,0,0,0,0,0,faulty,1000,0,-10,1st at -10 2022Jun04-2239-0005.csv
1,0.00145,0.125011,0.176033,0.0,0.211458,0.209182,0.145823,1.6242e-15,2.52457,2.94874,...,0,0,0,0,0,faulty,1000,0,-10,1st at -10 2022Jun04-2239-0005.csv
2,0.006283,0.125011,0.176033,1.2e-05,0.206329,0.206513,0.150478,1.64332e-15,2.3229,2.46553,...,0,0,0,0,0,faulty,1000,0,-10,1st at -10 2022Jun04-2239-0005.csv
3,0.009633,0.125011,0.172626,1.3e-05,0.206351,0.194663,0.148313,1.52827e-15,2.26458,2.55488,...,0,0,0,0,0,faulty,1000,0,-10,1st at -10 2022Jun04-2239-0005.csv
4,0.0132,0.125011,0.172626,1.4e-05,0.214463,0.214489,0.155652,1.79137e-15,2.5338,2.66379,...,0,0,0,0,0,faulty,1000,0,-10,1st at -10 2022Jun04-2239-0005.csv


In [None]:
import numpy as np
from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, balanced_accuracy_score, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.semi_supervised import LabelSpreading
from sklearn.svm import SVC

# Shuffle the full frame, then treat the first 70% of rows as "labeled" and the
# remaining 30% as "unlabeled". Several models are trained and compared below.

RANDOM_STATE = 42

# 1) Labeled / unlabeled partition of the shuffled rows
df_shuffled = df.sample(frac=1, random_state=RANDOM_STATE).reset_index(drop=True)
n_total = df_shuffled.shape[0]
n_labeled = int(0.7 * n_total)

df_labeled, df_unlabeled = (
    df_shuffled.iloc[:n_labeled].copy(),
    df_shuffled.iloc[n_labeled:].copy(),
)

# Blank out the target on a separate copy so the "unlabeled" rows carry no label,
# while df_unlabeled itself keeps the true values.
df_unlabeled_no_label = df_unlabeled.assign(condition=np.nan)

print(f"Total: {n_total}, Labeled: {len(df_labeled)}, Unlabeled: {len(df_unlabeled)}")

# 2) Feature / target selection
# Every numeric column is a candidate feature. The target (`condition`) and
# `source_file` are object-typed in this dataset, so select_dtypes leaves them out.
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()

# Columns named here are dropped from the feature set. The set is empty by default,
# so all numeric columns (including the rpm / humidity / temperature metadata) are kept.
# NOTE(review): Time and the operating-condition metadata may act as proxies for the
# recording session rather than the vibration signal — confirm they belong in the model.
EXCLUDED_FEATURE_COLS = set()
feature_cols = [c for c in numeric_cols if c not in EXCLUDED_FEATURE_COLS]

X_labeled = df_labeled[feature_cols].values

# Keep the fitted encoder so integer predictions can be mapped back to the original
# class names with label_encoder.inverse_transform(...). LabelEncoder numbers the
# classes in sorted order of their names (see label_encoder.classes_).
label_encoder = LabelEncoder()
y_labeled = label_encoder.fit_transform(df_labeled['condition'].values)

X_unlabeled = df_unlabeled_no_label[feature_cols].values

# 3) Hold out 20% of the labeled rows for evaluation, stratified on the target
X_train, X_test, y_train, y_test = train_test_split(
    X_labeled,
    y_labeled,
    test_size=0.2,
    random_state=RANDOM_STATE,
    stratify=y_labeled,
)

# 4) Standardize features: the scaler is fitted on the training split only, and the
#    same fitted transform is then applied to the held-out and unlabeled features
scaler = StandardScaler()
X_train_s = scaler.fit_transform(X_train)
X_test_s, X_unlabeled_s = (scaler.transform(part) for part in (X_test, X_unlabeled))

def _score_predictions(y_true, y_pred, y_proba=None):
    """Compute the evaluation metrics shared by every model in this comparison.

    Parameters
    ----------
    y_true : array-like
        Ground-truth integer labels.
    y_pred : array-like
        Hard class predictions.
    y_proba : array-like or None
        Scores for the class encoded as 1. When None, ROC AUC is reported as NaN.

    Returns
    -------
    dict
        Keys "accuracy", "balanced_accuracy", "f1" and "roc_auc".
    """
    scores = {
        "accuracy": accuracy_score(y_true, y_pred),
        "balanced_accuracy": balanced_accuracy_score(y_true, y_pred),
        "f1": f1_score(y_true, y_pred),
        "roc_auc": float("nan"),
    }
    if y_proba is not None:
        try:
            scores["roc_auc"] = roc_auc_score(y_true, y_proba)
        except ValueError as exc:
            # Best effort: keep NaN, but say why instead of hiding the failure.
            print(f"roc_auc could not be computed: {exc}")
    return scores


# 5) Define supervised models. No pipelines are used: every model is fitted on the
#    already-standardized arrays so the comparison is consistent.
# NOTE(review): scikit-learn documents kernel SVC fit time as growing at least
# quadratically with the number of samples, and probability=True adds internal
# cross-validation — consider subsampling or dropping SVC if this cell is too slow.
models = {
    "LogisticRegression": LogisticRegression(max_iter=1000, random_state=RANDOM_STATE),
    "RandomForest": RandomForestClassifier(n_estimators=200, random_state=RANDOM_STATE, n_jobs=-1),
    "GradientBoosting": GradientBoostingClassifier(random_state=RANDOM_STATE),
    "SVC": SVC(probability=True, random_state=RANDOM_STATE)
}

results = {}

for name, clf in models.items():
    clf.fit(X_train_s, y_train)
    y_pred = clf.predict(X_test_s)
    y_proba = clf.predict_proba(X_test_s)[:, 1] if hasattr(clf, "predict_proba") else None
    results[name] = {"model": clf, **_score_predictions(y_test, y_pred, y_proba)}

# 6) Semi-supervised LabelSpreading using train + unlabeled rows.
# Unlabeled rows are marked with the label -1.
X_ssl = np.vstack([X_train_s, X_unlabeled_s])
y_ssl = np.concatenate([y_train, np.full(len(X_unlabeled_s), -1, dtype=int)])

# The rbf kernel materializes a dense n_samples x n_samples affinity matrix, which
# cannot fit in memory with the hundreds of thousands of rows loaded above. The knn
# kernel builds a sparse neighbor graph instead.
SSL_N_NEIGHBORS = 7
label_spread = LabelSpreading(kernel='knn', n_neighbors=SSL_N_NEIGHBORS, alpha=0.2)
label_spread.fit(X_ssl, y_ssl)
y_pred_ssl = label_spread.predict(X_test_s)
y_proba_ssl = label_spread.predict_proba(X_test_s)[:, 1] if hasattr(label_spread, "predict_proba") else None

results["LabelSpreading(semi-supervised)"] = {
    "model": label_spread, **_score_predictions(y_test, y_pred_ssl, y_proba_ssl)
}

# 7) Rank the models by balanced accuracy (highest first) and report every metric
def _balanced_accuracy_of(entry):
    """Sort key: the balanced accuracy stored in a (name, metrics) results item."""
    _, metrics = entry
    return metrics["balanced_accuracy"]


sorted_results = sorted(results.items(), key=_balanced_accuracy_of, reverse=True)

print("\nModel performance (sorted by balanced_accuracy):")
for name, res in sorted_results:
    print(f"{name}: bal_acc={res['balanced_accuracy']:.4f}, acc={res['accuracy']:.4f}, f1={res['f1']:.4f}, roc_auc={res['roc_auc']:.4f}")

# The first entry of the ranking is the winner.
best_name, best_info = sorted_results[0]
best_model = best_info["model"]
print(f"\nBest model: {best_name}")

# 8) Retrain the winning configuration on all labeled data.
# The scaler stays as fitted on X_train, so the final model sees features on the
# same scale as during evaluation.
X_full_labeled = scaler.transform(X_labeled)

# clone() returns an unfitted estimator with the same hyperparameters as the winner.
# Fitting the clone leaves the evaluated model stored in `results` untouched, so it
# still corresponds to the metrics reported above.
final_model = clone(best_model)

if best_name == "LabelSpreading(semi-supervised)":
    # Semi-supervised winner: refit on all labeled rows plus the unlabeled rows,
    # with -1 marking the unlabeled ones.
    X_ssl_full = np.vstack([X_full_labeled, X_unlabeled_s])
    y_ssl_full = np.concatenate([y_labeled, np.full(len(X_unlabeled_s), -1, dtype=int)])
    final_model.fit(X_ssl_full, y_ssl_full)
else:
    # Supervised winner: refit on the full labeled set (train + held-out rows).
    final_model.fit(X_full_labeled, y_labeled)

# final_model expects features prepared with scaler.transform.
print("Final model ready as `final_model`. Use `scaler.transform(X)` on feature arrays before predict for supervised models.")