<a href="https://colab.research.google.com/github/brotheramin/MachineLearning/blob/main/SafetyProject.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#!/usr/bin/env python3
"""
Process Failure (off-spec > 15 ppm) classification on the BALANCED dataset.

- File: level_transmitter_oilywater_dataset_456_balanced.csv
- Split: time-aware 60% train / 20% val / 20% test
- Model: RandomForestClassifier (no SMOTE required)
- Threshold: chosen on validation by maximizing F1
- Outputs: clean printed metrics + PR/ROC curves + feature importances
"""

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score,
    confusion_matrix, precision_recall_curve, average_precision_score, roc_curve
)

# Input CSV, resolved relative to the current working directory.
DATA_PATH = "level_transmitter_oilywater_dataset_456_balanced.csv"

# Folder that receives the saved figures; it is created as soon as this
# module is loaded (no error if it already exists).
OUT_DIR = Path("outputs_balanced_rf")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Model input columns, in the order they are passed to the classifier.
FEATURES = [
    "influent_oil_ppm",
    "turbidity_ntu",
    "flow_l_min",
    "temperature_c",
    "pH",
    "ec_current_a",
    "ec_voltage_v",
    "demulsifier_ml_min",
    "interface_level_pct",
    "pressure_bar",
    "foam_index",
    "fouling_index",
    "sensor_drift_pct",
    "lt_signal_pct",
    "lt_health_score",
]

# Binary target column: 1 if outlet > 15 ppm.
LABEL = "offspec_gt15ppm"

def pick_threshold_by_f1(y_val, proba_val):
    """Pick the decision threshold that maximizes F1 on validation data.

    Args:
        y_val: True binary labels of the validation split.
        proba_val: Predicted positive-class scores for the same samples.

    Returns:
        The candidate threshold (as a Python float) with the highest F1 among
        those returned by ``precision_recall_curve``; ties resolve to the
        first such candidate. Falls back to 0.5 if no candidate threshold is
        returned.
    """
    prec, rec, thr = precision_recall_curve(y_val, proba_val)
    if len(thr) == 0:
        return 0.5  # degenerate case; shouldn't happen on balanced data

    # precision_recall_curve returns len(thr) == len(prec) - 1: the final
    # (precision, recall) point has no threshold, so drop it to align with thr.
    prec, rec = prec[:-1], rec[:-1]
    denom = prec + rec

    # Divide only where the denominator is positive; everywhere else F1 stays
    # at 0. Evaluating the quotient eagerly and masking it afterwards would
    # emit a RuntimeWarning for every 0/0 point on the curve.
    f1 = np.divide(
        2 * prec * rec,
        denom,
        out=np.zeros_like(denom, dtype=float),
        where=denom > 0,
    )
    return float(thr[int(np.argmax(f1))])

def eval_split(y_true, proba, threshold):
    """Score one data split at a fixed decision threshold.

    Args:
        y_true: True binary labels (0/1) of the split.
        proba: Positive-class scores for the same samples; must support
            elementwise comparison and ``.astype`` (e.g. a NumPy array).
        threshold: Scores greater than or equal to this value are predicted
            as class 1.

    Returns:
        Tuple ``(acc, prec, rec, f1, ap, rocauc, cm)``:
        accuracy, precision, recall and F1 of the thresholded predictions;
        average precision and ROC-AUC of the raw scores (``rocauc`` is
        ``None`` when ``y_true`` contains a single class); and the confusion
        matrix of the thresholded predictions, always 2x2 in label order
        ``[0, 1]``.
    """
    y_pred = (proba >= threshold).astype(int)
    acc = accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred, zero_division=0)
    rec = recall_score(y_true, y_pred, zero_division=0)
    f1 = f1_score(y_true, y_pred, zero_division=0)
    ap = average_precision_score(y_true, proba)
    # ROC-AUC is undefined when only one class is present in y_true.
    rocauc = roc_auc_score(y_true, proba) if len(np.unique(y_true)) > 1 else None
    # Pin the label set so the matrix stays 2x2 even if a class is absent from
    # both y_true and y_pred; otherwise the shape would collapse to 1x1.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    return acc, prec, rec, f1, ap, rocauc, cm

def _print_metrics(title, metrics, threshold=None):
    """Print one split's metrics (as returned by eval_split) under a header."""
    acc, prec, rec, f1, ap, rocauc, cm = metrics
    print(f"\n=== {title} ===")
    if threshold is not None:
        print(f"Threshold : {threshold:.3f}")
    print(f"Accuracy  : {acc:.3f}")
    print(f"Precision : {prec:.3f}")
    print(f"Recall    : {rec:.3f}")
    print(f"F1 Score  : {f1:.3f}")
    print(f"PR-AUC    : {ap:.3f}")
    # Left unformatted because eval_split returns None for a single-class split.
    print(f"ROC-AUC   : {rocauc}")
    print(f"Confusion : {cm.tolist()}")


def _save_test_pr_curve(y_te, proba_te):
    """Save the test-split precision-recall curve to OUT_DIR."""
    prec_te, rec_te, _ = precision_recall_curve(y_te, proba_te)
    plt.figure()
    plt.plot(rec_te, prec_te)
    plt.xlabel("Recall")
    plt.ylabel("Precision")
    plt.title("Balanced RF – Precision-Recall (Test)")
    plt.grid(True)
    plt.tight_layout()
    plt.savefig(OUT_DIR / "pr_curve_test.png")
    plt.close()


def _save_test_roc_curve(y_te, proba_te):
    """Save the test-split ROC curve to OUT_DIR; skipped for a single-class split."""
    if len(np.unique(y_te)) <= 1:
        return
    fpr, tpr, _ = roc_curve(y_te, proba_te)
    plt.figure()
    plt.plot(fpr, tpr)
    plt.plot([0, 1], [0, 1], '--')  # chance-level diagonal for reference
    plt.xlabel("FPR")
    plt.ylabel("TPR")
    plt.title("Balanced RF – ROC (Test)")
    plt.grid(True)
    plt.tight_layout()
    plt.savefig(OUT_DIR / "roc_curve_test.png")
    plt.close()


def _save_feature_importances(rf, top_k=10):
    """Save a bar chart of the top_k largest feature importances of rf to OUT_DIR."""
    importances = rf.feature_importances_
    idx = np.argsort(importances)[::-1][:top_k]  # descending by importance
    plt.figure()
    plt.bar(range(len(idx)), importances[idx])
    plt.xticks(range(len(idx)), [FEATURES[i] for i in idx], rotation=45, ha='right')
    plt.ylabel("Importance")
    plt.title("Balanced RF – Top Features")
    plt.tight_layout()
    plt.savefig(OUT_DIR / "feature_importances.png")
    plt.close()


def main():
    """Train, tune and evaluate the random-forest off-spec classifier.

    Steps:
        1. Load DATA_PATH and order the rows by the ``timestamp`` column.
        2. Split chronologically: first 60% train, next 20% validation,
           remaining rows test.
        3. Fit a RandomForestClassifier on the training rows.
        4. Choose the decision threshold that maximizes F1 on validation.
        5. Print validation and test metrics at that threshold.
        6. Save the test PR curve, test ROC curve and a feature-importance
           chart as PNG files in OUT_DIR.
    """
    # ----- Load & order by time -----
    df = (
        pd.read_csv(DATA_PATH, parse_dates=["timestamp"])
        .sort_values("timestamp")
        .reset_index(drop=True)
    )

    X = df[FEATURES].values
    y = df[LABEL].astype(int).values

    # Contiguous slices of the time-sorted rows, so validation and test rows
    # always come after the training rows.
    n = len(df)
    i_tr = int(0.6 * n)
    i_va = int(0.8 * n)

    X_tr, y_tr = X[:i_tr], y[:i_tr]
    X_va, y_va = X[i_tr:i_va], y[i_tr:i_va]
    X_te, y_te = X[i_va:], y[i_va:]

    print(f"Sizes -> Train: {len(y_tr)}, Val: {len(y_va)}, Test: {len(y_te)}")
    print("Class counts (train):", np.bincount(y_tr))
    print("Class counts (val)  :", np.bincount(y_va))
    print("Class counts (test) :", np.bincount(y_te))

    # ----- Train RandomForest -----
    # NOTE(review): class_weight=None assumes the classes are roughly balanced;
    # check the class counts printed above to confirm this for the loaded file.
    rf = RandomForestClassifier(
        n_estimators=600,
        random_state=42,
        class_weight=None  # data is already balanced
    )
    rf.fit(X_tr, y_tr)

    # ----- Threshold on VALIDATION -----
    proba_va = rf.predict_proba(X_va)[:, 1]
    thr = pick_threshold_by_f1(y_va, proba_va)
    print(f"\nChosen threshold (max F1 on validation): {thr:.3f}")

    # ----- Evaluate on VAL -----
    _print_metrics("VALIDATION METRICS", eval_split(y_va, proba_va, thr))

    # ----- Evaluate on TEST (threshold is reused, not re-tuned) -----
    proba_te = rf.predict_proba(X_te)[:, 1]
    _print_metrics("TEST METRICS", eval_split(y_te, proba_te, thr), threshold=thr)

    # ----- Plots -----
    _save_test_pr_curve(y_te, proba_te)
    _save_test_roc_curve(y_te, proba_te)
    _save_feature_importances(rf)

# Run the pipeline only when this file is executed directly, not when imported.
if __name__ == "__main__":
    main()


Sizes -> Train: 273, Val: 91, Test: 92
Class counts (train): [213  60]
Class counts (val)  : [65 26]
Class counts (test) : [65 27]

Chosen threshold (max F1 on validation): 0.268

=== VALIDATION METRICS ===
Accuracy  : 0.923
Precision : 0.828
Recall    : 0.923
F1 Score  : 0.873
PR-AUC    : 0.863
ROC-AUC   : 0.9337278106508876
Confusion : [[60, 5], [2, 24]]

=== TEST METRICS ===
Threshold : 0.268
Accuracy  : 0.924
Precision : 0.812
Recall    : 0.963
F1 Score  : 0.881
PR-AUC    : 0.923
ROC-AUC   : 0.9695156695156695
Confusion : [[59, 6], [1, 26]]
