<a href="https://colab.research.google.com/github/brotheramin/MachineLearning/blob/main/Sahand.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [11]:
# ===== Quality Classification on XF.xlsx (HydroTest or RESULT) =====
# - Version-safe OneHotEncoder (works with old/new scikit-learn)
# - Handles class imbalance
# - Prints Accuracy / Precision / Recall / F1 / ROC-AUC + Confusion Matrix
# - Saves trained model and top features

import pandas as pd
import numpy as np
from typing import Optional
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score, precision_recall_fscore_support, roc_auc_score, confusion_matrix
)
import joblib

# -----------------------------
# Config
# -----------------------------
EXCEL_PATH = "XF.xlsx"
SHEET_NAME = "Sheet2"
TARGET_LABEL = "HydroTest"      # <-- change to "RESULT" if you want ACC vs others
POSITIVE_CLASS_FOR_RESULT = "ACC"  # used only when TARGET_LABEL == "RESULT"

# -----------------------------
# Version-safe OneHotEncoder
# -----------------------------
def make_onehot():
    try:
        return OneHotEncoder(handle_unknown="ignore", sparse_output=True, min_frequency=0.01)
    except TypeError:
        try:
            return OneHotEncoder(handle_unknown="ignore", sparse_output=True)
        except TypeError:
            try:
                return OneHotEncoder(handle_unknown="ignore", sparse=True, min_frequency=0.01)
            except TypeError:
                return OneHotEncoder(handle_unknown="ignore", sparse=True)

# -----------------------------
# Load & clean
# -----------------------------
try:
    df = pd.read_excel(EXCEL_PATH, sheet_name=SHEET_NAME)
except Exception:
    xls = pd.ExcelFile(EXCEL_PATH)
    df = pd.read_excel(EXCEL_PATH, sheet_name=xls.sheet_names[0])

# Drop Excel artifacts + very sparse columns (>70% missing)
drop_unnamed = [c for c in df.columns if str(c).strip().lower().startswith("unnamed")]
df.drop(columns=drop_unnamed, inplace=True, errors="ignore")
missing_frac = df.isna().mean()
df.drop(columns=missing_frac[missing_frac > 0.70].index.tolist(), inplace=True, errors="ignore")

# Drop ID-like columns
for col in ["Counter", "Weld NO."]:
    if col in df.columns:
        df.drop(columns=[col], inplace=True)

# Coerce common numeric-like columns (keep all three lines!)
for c in ["THK (IN)", "Voltage", "Temp"]:
    if c in df.columns:
        df[c] = pd.to_numeric(df[c], errors="coerce")

# -----------------------------
# Target mapping
# -----------------------------
if TARGET_LABEL not in df.columns:
    raise ValueError(f"'{TARGET_LABEL}' not found in columns: {list(df.columns)}")

y_raw = df[TARGET_LABEL].astype(str)

if TARGET_LABEL.lower() == "hydrotest":
    # Positive = "Pass"
    y = y_raw.str.strip().str.lower().eq("pass").astype(int)
else:
    # RESULT: Positive class (ACC by default)
    pos = POSITIVE_CLASS_FOR_RESULT.strip().upper()
    y = y_raw.str.strip().str.upper().eq(pos).astype(int)

# Keep only rows with a known target
mask = y_raw.notna()
X = df.loc[mask].drop(columns=[TARGET_LABEL]).copy()
y = y.loc[mask]

# -----------------------------
# Feature selection (control one-hot width)
# -----------------------------
cat_cols = [c for c in X.columns if X[c].dtype == "object"]
num_cols = [c for c in X.columns if c not in cat_cols]

# Prefer a few known cats + any with small cardinality
preferred_cats = [c for c in ["Grade", "Sec", "BEND", "Weather"] if c in cat_cols]
small_cats = [c for c in cat_cols if c not in preferred_cats and X[c].nunique() <= 30]
use_cat = list(dict.fromkeys(preferred_cats + small_cats))
use_num = num_cols.copy()

# Make sure categoricals are strings
for c in use_cat:
    X[c] = X[c].astype(str)

X = X[use_cat + use_num]

# -----------------------------
# Train/test split (stratified to preserve class balance)
# -----------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42, stratify=y
)

# -----------------------------
# Pipeline
# -----------------------------
cat_encoder = make_onehot()
preprocess = ColumnTransformer([
    ("num", SimpleImputer(strategy="median"), use_num),
    ("cat", Pipeline([
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", cat_encoder)
    ]), use_cat),
])

clf = RandomForestClassifier(
    n_estimators=240,
    max_depth=24,
    min_samples_leaf=5,
    max_features=0.7,
    class_weight="balanced",  # handle imbalance
    n_jobs=-1,
    random_state=42
)

pipe = Pipeline([
    ("preprocess", preprocess),
    ("clf", clf)
])

# -----------------------------
# Train
# -----------------------------
pipe.fit(X_train, y_train)

# -----------------------------
# Evaluate (threshold 0.5)
# -----------------------------
proba = pipe.predict_proba(X_test)[:, 1]
pred  = (proba >= 0.5).astype(int)

acc  = accuracy_score(y_test, pred)
prec, rec, f1, _ = precision_recall_fscore_support(y_test, pred, average="binary", zero_division=0)
try:
    auc = roc_auc_score(y_test, proba)
except Exception:
    auc = float("nan")

print("=== Classification Metrics ===")
print(f"Accuracy : {acc:.4f}")

cm = confusion_matrix(y_test, pred)
print("\nConfusion Matrix [ [TN FP] [FN TP] ]")
print(cm)

# -----------------------------
# Save model
# -----------------------------
model_path = f"{TARGET_LABEL}_rf_classifier.pkl"
joblib.dump(pipe, model_path)
print(f"\nSaved classifier to: {model_path}")

# -----------------------------
# Save top features
# -----------------------------
try:
    pre = pipe.named_steps["preprocess"]
    est = pipe.named_steps["clf"]
    names = list(use_num)
    try:
        oh = pre.named_transformers_["cat"].named_steps["onehot"]
        try:
            oh_names = oh.get_feature_names_out(use_cat).tolist()
        except Exception:
            oh_names = []
            if hasattr(oh, "categories_"):
                for base, cats in zip(use_cat, oh.categories_):
                    for cat in cats:
                        oh_names.append(f"{base}={cat}")
            else:
                for base in use_cat:
                    oh_names.append(f"{base}_encoded")
        names += oh_names
    except Exception:
        pass

    importances = est.feature_importances_
    fi = (pd.DataFrame({"feature": names, "importance": importances})
          .sort_values("importance", ascending=False).head(30))
    fi.to_csv(f"{TARGET_LABEL}_rf_top_features.csv", index=False)
    print(f"Saved top features to: {TARGET_LABEL}_rf_top_features.csv")
except Exception as e:
    print("Feature importances skipped:", e)


=== Classification Metrics ===
Accuracy : 0.9957

Confusion Matrix [ [TN FP] [FN TP] ]
[[  20   12]
 [  12 5561]]

Saved classifier to: HydroTest_rf_classifier.pkl
Saved top features to: HydroTest_rf_top_features.csv
