In [6]:
import pandas as pd
import pickle
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import StratifiedKFold
from imblearn.over_sampling import SMOTE

# === Step 1: Load dataset and select top 7 features ===
# The seven feature columns used as model inputs; the target is the
# "Porosity Label" column of the same CSV.
top_features = [
    "IQR",
    "Std_Temp",
    "Q1",
    "Min_Temp",
    "Median_Temp",
    "Skewness",
    "Kurtosis",
]
df = pd.read_csv("labeled_thermal_features.csv")
X = df.loc[:, top_features]
y = df.loc[:, "Porosity Label"]

# === Step 2: Set up 5-fold cross-validation ===
# Shuffled, seeded stratified splitter so every run produces the same folds.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Per-fold results, appended in fold order and read by the summary step.
accuracies = []
conf_matrices = []
class_metrics = []

# enumerate(start=1) supplies the 1-based fold number used in the log output,
# replacing a hand-maintained counter.
for fold, (train_idx, test_idx) in enumerate(skf.split(X, y), start=1):
    print(f"\n🔁 Fold {fold}")

    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    # Apply SMOTE to the training split only; the test split is evaluated
    # exactly as it came out of the splitter (no synthetic samples).
    smote = SMOTE(sampling_strategy=0.2, random_state=42)
    X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

    # Train a fresh model for this fold on the resampled training data.
    rf = RandomForestClassifier(
        n_estimators=200,
        max_depth=5,
        min_samples_split=5,
        min_samples_leaf=2,
        max_features="sqrt",
        class_weight="balanced",
        random_state=42,
        n_jobs=-1,
    )
    rf.fit(X_train_res, y_train_res)
    y_pred = rf.predict(X_test)

    # Evaluate on the held-out split.
    acc = accuracy_score(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)
    report = classification_report(y_test, y_pred, output_dict=True)

    # Keep only the per-class rows of the report (keys "0" and "1"); the
    # aggregate rows such as "accuracy" and the averages are dropped.
    per_class = {
        label: {
            "precision": round(report[label]["precision"], 4),
            "recall": round(report[label]["recall"], 4),
            "f1-score": round(report[label]["f1-score"], 4),
            "support": int(report[label]["support"]),
        }
        for label in ("0", "1")
        if label in report
    }

    accuracies.append(acc)
    conf_matrices.append(cm)
    class_metrics.append(per_class)

    print(f"✅ Accuracy: {acc:.4f}")
    print("Confusion Matrix:")
    print(cm)
    print("Per-class Metrics:")
    for label, scores in per_class.items():
        print(f"  Class {label}: {scores}")

# === Step 3: Final Summary ===
# Take the fold count from the collected results instead of hard-coding 5,
# so the averages stay correct if n_splits is changed in the CV setup.
n_folds = len(accuracies)
if n_folds == 0:
    # Averaging nothing would print a nan accuracy and all-zero metrics;
    # fail loudly instead of reporting a misleading summary.
    raise RuntimeError(
        "No cross-validation results collected; run the CV loop first."
    )

print(f"\n📊 {n_folds}-Fold Cross-Validation Summary:")
print(f"Average Accuracy: {np.mean(accuracies):.4f}")

# Mean of each per-class metric over the folds. The inputs are the per-fold
# values stored in class_metrics (already rounded to 4 decimals there).
avg_metrics = {
    label: {
        metric: round(
            sum(stats[label][metric] for stats in class_metrics) / n_folds, 4
        )
        for metric in ("precision", "recall", "f1-score")
    }
    for label in ("0", "1")
}

print(f"\n📈 Average Per-Class Metrics (across {n_folds} folds):")
for label, scores in avg_metrics.items():
    print(f"  Class {label}: {scores}")

# Average confusion matrix: element-wise mean over folds, rounded to whole
# counts for display only (avg_cm itself keeps the fractional values).
avg_cm = np.sum(conf_matrices, axis=0) / n_folds
print("\n🧮 Average Confusion Matrix:")
print(np.round(avg_cm).astype(int))

# Save last model
# NOTE: `rf` is whatever classifier was fitted in the final loop iteration,
# i.e. it has only seen that fold's (resampled) training split, not the
# full dataset.
# NOTE: pickle files should only ever be loaded from trusted sources.
with open("random_forest_cv_top7.pkl", "wb") as model_file:
    pickle.dump(rf, model_file)
print("\n✅ Model saved as 'random_forest_cv_top7.pkl'")



🔁 Fold 1
✅ Accuracy: 0.9840
Confusion Matrix:
[[293   5]
 [  0  15]]
Per-class Metrics:
  Class 0: {'precision': 1.0, 'recall': 0.9832, 'f1-score': 0.9915, 'support': 298}
  Class 1: {'precision': 0.75, 'recall': 1.0, 'f1-score': 0.8571, 'support': 15}

🔁 Fold 2
✅ Accuracy: 0.9904
Confusion Matrix:
[[297   2]
 [  1  13]]
Per-class Metrics:
  Class 0: {'precision': 0.9966, 'recall': 0.9933, 'f1-score': 0.995, 'support': 299}
  Class 1: {'precision': 0.8667, 'recall': 0.9286, 'f1-score': 0.8966, 'support': 14}

🔁 Fold 3
✅ Accuracy: 0.9808
Confusion Matrix:
[[293   6]
 [  0  14]]
Per-class Metrics:
  Class 0: {'precision': 1.0, 'recall': 0.9799, 'f1-score': 0.9899, 'support': 299}
  Class 1: {'precision': 0.7, 'recall': 1.0, 'f1-score': 0.8235, 'support': 14}

🔁 Fold 4
✅ Accuracy: 0.9936
Confusion Matrix:
[[298   1]
 [  1  13]]
Per-class Metrics:
  Class 0: {'precision': 0.9967, 'recall': 0.9967, 'f1-score': 0.9967, 'support': 299}
  Class 1: {'precision': 0.9286, 'recall': 0.9286, 'f1-s