In [1]:
import pandas as pd
import pickle
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import StratifiedKFold
from imblearn.over_sampling import SMOTE
import numpy as np

# Read the labelled thermal-feature table from disk.
df = pd.read_csv("labeled_thermal_features.csv")

# The target is the "Porosity Label" column; the feature matrix is whatever
# remains once "Frame" and the label column itself are dropped.
y = df["Porosity Label"]
X = df.drop(["Frame", "Porosity Label"], axis=1)

# Stratified 5-fold splitter; rows are shuffled with a fixed seed so the
# fold assignment is repeatable from run to run.
skf = StratifiedKFold(random_state=42, shuffle=True, n_splits=5)

# Per-fold result accumulators (one entry appended per fold):
#   accuracies    -> accuracy score of the fold
#   conf_matrices -> confusion matrix of the fold
#   class_metrics -> dict of class-wise scores for the fold
accuracies, conf_matrices, class_metrics = [], [], []

# Stratified cross-validation loop.
# For every fold: oversample the training split, fit a fresh random forest on
# it, predict the held-out split, and record accuracy, confusion matrix and
# per-class precision/recall/F1/support.
# `enumerate(..., start=1)` supplies the 1-based fold number used in the log
# output, replacing a hand-maintained counter.
for fold, (train_index, test_index) in enumerate(skf.split(X, y), start=1):
    print(f"\n🔁 Fold {fold}")

    # Split data
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Apply SMOTE to the training split only; the held-out split is scored
    # exactly as it appears in the dataset.
    smote = SMOTE(sampling_strategy=0.2, random_state=42)
    X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

    # Initialize classifier (a new, unfitted forest for every fold)
    rf_classifier = RandomForestClassifier(
        n_estimators=200,
        max_depth=5,
        min_samples_split=5,
        min_samples_leaf=2,
        max_features="sqrt",
        class_weight="balanced",
        random_state=42,
        n_jobs=-1
    )

    # Train
    rf_classifier.fit(X_train_resampled, y_train_resampled)

    # Predict
    y_pred = rf_classifier.predict(X_test)

    # Evaluate
    accuracy = accuracy_score(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)
    report_dict = classification_report(y_test, y_pred, output_dict=True)

    # Collect detailed metrics per class.
    # Scores are stored at full precision (as plain floats) so that averaging
    # them across folds later does not compound per-fold rounding error;
    # rounding to 4 decimals happens only when printing below.
    # The filter keeps just the per-class entries and skips the aggregate
    # rows of the report ("accuracy", "macro avg", "weighted avg").
    per_class = {
        label: {
            "precision": float(metrics["precision"]),
            "recall": float(metrics["recall"]),
            "f1-score": float(metrics["f1-score"]),
            "support": int(metrics["support"])
        }
        for label, metrics in report_dict.items()
        if label in ['0', '1']  # assuming binary classification (normal: 0, anomaly: 1)
    }
    class_metrics.append(per_class)
    accuracies.append(accuracy)
    conf_matrices.append(conf_matrix)

    # Print fold-wise results
    print(f"Accuracy: {accuracy:.4f}")
    print("Confusion Matrix:")
    print(conf_matrix)
    print("Per-class Metrics:")
    for label, scores in per_class.items():
        # Display copy: round the float scores, leave the integer support as is.
        displayed = {
            name: (value if name == "support" else round(value, 4))
            for name, value in scores.items()
        }
        print(f"  Class {label}: {displayed}")

# =========================
# 📊 Final Summary
# =========================
# Number of folds that actually produced results. Deriving it from the
# collected metrics (instead of a literal 5) keeps the averages and the
# headings correct if the splitter's fold count is ever changed.
n_folds = len(class_metrics)

print(f"\n📊 {n_folds}-Fold Cross-Validation Summary:")
print(f"Average Accuracy: {np.mean(accuracies):.4f}")

# Mean precision / recall / F1 per class across folds, rounded to 4 decimals.
# Each fold entry is expected to be a dict keyed by the class labels "0" and
# "1", each mapping to a dict of metric name -> score.
avg_metrics = {
    label: {
        metric: round(
            sum(float(fold_stats[label][metric]) for fold_stats in class_metrics) / n_folds,
            4,
        )
        for metric in ("precision", "recall", "f1-score")
    }
    for label in ("0", "1")
}

print(f"\n📈 Average Per-Class Metrics (across {n_folds} folds):")
for label, scores in avg_metrics.items():
    print(f"  Class {label}: {scores}")

# Fit and save the final model.
# After the CV loop, `rf_classifier` is merely the estimator from the last
# fold, i.e. it has only seen that fold's training split. Cross-validation is
# used for evaluation; the artifact that gets persisted is refit on the full
# dataset, using the exact same SMOTE and random-forest settings as the folds
# (copied via get_params() so the two can never drift apart).
final_smote = SMOTE(**smote.get_params())
X_full_resampled, y_full_resampled = final_smote.fit_resample(X, y)

# Rebind `rf_classifier` so the in-memory estimator is the one that is saved.
rf_classifier = RandomForestClassifier(**rf_classifier.get_params())
rf_classifier.fit(X_full_resampled, y_full_resampled)

with open("random_forest_anomaly_model_cv.pkl", "wb") as model_file:
    pickle.dump(rf_classifier, model_file)

print("\n✅ Random Forest (cross-validated) model saved as 'random_forest_anomaly_model_cv.pkl'.")

# Compute average confusion matrix.
# The per-fold matrices are summed element-wise and divided by the number of
# matrices actually collected (not a hard-coded 5), so the result stays
# correct if the number of folds changes.
n_folds = len(conf_matrices)
sum_conf_matrix = np.sum(conf_matrices, axis=0)
avg_conf_matrix = sum_conf_matrix / n_folds

print(f"\n🧮 Average Confusion Matrix (across {n_folds} folds):")
# Shown rounded to whole counts; print avg_conf_matrix directly for decimals.
print(np.round(avg_conf_matrix).astype(int))




🔁 Fold 1
Accuracy: 0.9808
Confusion Matrix:
[[293   5]
 [  1  14]]
Per-class Metrics:
  Class 0: {'precision': 0.9966, 'recall': 0.9832, 'f1-score': 0.9899, 'support': 298}
  Class 1: {'precision': 0.7368, 'recall': 0.9333, 'f1-score': 0.8235, 'support': 15}

🔁 Fold 2
Accuracy: 0.9936
Confusion Matrix:
[[297   2]
 [  0  14]]
Per-class Metrics:
  Class 0: {'precision': 1.0, 'recall': 0.9933, 'f1-score': 0.9966, 'support': 299}
  Class 1: {'precision': 0.875, 'recall': 1.0, 'f1-score': 0.9333, 'support': 14}

🔁 Fold 3
Accuracy: 0.9808
Confusion Matrix:
[[293   6]
 [  0  14]]
Per-class Metrics:
  Class 0: {'precision': 1.0, 'recall': 0.9799, 'f1-score': 0.9899, 'support': 299}
  Class 1: {'precision': 0.7, 'recall': 1.0, 'f1-score': 0.8235, 'support': 14}

🔁 Fold 4
Accuracy: 0.9936
Confusion Matrix:
[[298   1]
 [  1  13]]
Per-class Metrics:
  Class 0: {'precision': 0.9967, 'recall': 0.9967, 'f1-score': 0.9967, 'support': 299}
  Class 1: {'precision': 0.9286, 'recall': 0.9286, 'f1-score':