In [2]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE

# === Step 1: Load dataset and select top 7 features ===
df = pd.read_csv("labeled_thermal_features.csv")

# Columns used as model inputs; the target column is selected separately below.
top_7_features = [
    "IQR",
    "Std_Temp",
    "Q1",
    "Min_Temp",
    "Median_Temp",
    "Skewness",
    "Kurtosis",
]

# Feature matrix (all rows, the seven columns above) and label vector.
X = df.loc[:, top_7_features]
y = df.loc[:, "Porosity Label"]

# === Step 2: Set up Stratified 5-Fold CV ===
# Shuffled, seeded splitter so the fold assignment is reproducible.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Per-fold results; the summary section after the loop reads these lists.
accuracies = []
conf_matrices = []
class_metrics = []

# enumerate() supplies the 1-based fold number instead of a manual counter.
for fold, (train_index, test_index) in enumerate(skf.split(X, y), start=1):
    print(f"\n🔁 Fold {fold}")

    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Apply SMOTE to training data only, so the held-out fold contains
    # no synthetic samples and the evaluation stays on real data.
    smote = SMOTE(sampling_strategy=0.2, random_state=42)
    X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

    # === Train XGBoost Model ===
    # A fresh classifier per fold: nothing learned on one split carries over.
    xgb_classifier = xgb.XGBClassifier(
        objective="binary:logistic",
        eval_metric="logloss",
        n_estimators=100,
        learning_rate=0.05,
        max_depth=3,
        subsample=0.7,
        colsample_bytree=0.7,
        reg_lambda=1,
        reg_alpha=0.5,
        random_state=42,
    )

    xgb_classifier.fit(X_train_resampled, y_train_resampled)
    y_pred = xgb_classifier.predict(X_test)

    # === Evaluation ===
    accuracy = accuracy_score(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)
    report_dict = classification_report(y_test, y_pred, output_dict=True)

    # Keep only the per-class entries; the report dict also holds aggregate
    # entries under other keys, which the label filter drops.
    per_class = {
        label: {
            "precision": round(metrics["precision"], 4),
            "recall": round(metrics["recall"], 4),
            "f1-score": round(metrics["f1-score"], 4),
            "support": int(metrics["support"]),
        }
        for label, metrics in report_dict.items()
        if label in ("0", "1")
    }

    accuracies.append(accuracy)
    conf_matrices.append(conf_matrix)
    class_metrics.append(per_class)

    print(f"✅ Accuracy: {accuracy:.4f}")
    print("Confusion Matrix:")
    print(conf_matrix)
    print("Per-class Metrics:")
    for label, scores in per_class.items():
        print(f"  Class {label}: {scores}")

# === Step 3: Summary Results ===
# Take the fold count from the collected results instead of hard-coding 5,
# so the averages and headings stay correct if n_splits is ever changed.
n_folds = len(accuracies)
metric_names = ("precision", "recall", "f1-score")

print(f"\n📊 {n_folds}-Fold Cross-Validation Summary:")
print(f"Average Accuracy: {np.mean(accuracies):.4f}")

# Mean of every per-class metric across the folds, rounded to four decimals.
avg_metrics = {
    label: {
        metric: round(
            sum(fold_stats[label][metric] for fold_stats in class_metrics) / n_folds,
            4,
        )
        for metric in metric_names
    }
    for label in ("0", "1")
}

print(f"\n📈 Average Per-Class Metrics (across {n_folds} folds):")
for label, scores in avg_metrics.items():
    print(f"  Class {label}: {scores}")

# Average confusion matrix: element-wise sum over folds divided by fold count.
sum_conf_matrix = np.sum(conf_matrices, axis=0)
avg_conf_matrix = sum_conf_matrix / n_folds
print(f"\n🧮 Average Confusion Matrix (across {n_folds} folds):")
print(np.round(avg_conf_matrix).astype(int))

# Percentage format (optional): each cell as a share of all test samples.
conf_matrix_total = avg_conf_matrix.sum()
percent_matrix = (avg_conf_matrix / conf_matrix_total) * 100
print("\n📊 Average Confusion Matrix (Percentage):")
print(np.round(percent_matrix, 2))

# Save final model
# The classifier left in `xgb_classifier` after the loop was fitted on the
# training portion of the last fold only. Refit an unfitted copy with the same
# hyperparameters on the complete dataset (oversampled with the same SMOTE
# settings) so the persisted model has seen all available data.
from sklearn.base import clone

X_full_resampled, y_full_resampled = smote.fit_resample(X, y)
final_model = clone(xgb_classifier)
final_model.fit(X_full_resampled, y_full_resampled)

final_model.save_model("xgboost_cv_top7.json")
print("\n✅ Cross-validated XGBoost model saved as 'xgboost_cv_top7.json'")



🔁 Fold 1
✅ Accuracy: 0.9808
Confusion Matrix:
[[293   5]
 [  1  14]]
Per-class Metrics:
  Class 0: {'precision': 0.9966, 'recall': 0.9832, 'f1-score': 0.9899, 'support': 298}
  Class 1: {'precision': 0.7368, 'recall': 0.9333, 'f1-score': 0.8235, 'support': 15}

🔁 Fold 2
✅ Accuracy: 0.9904
Confusion Matrix:
[[297   2]
 [  1  13]]
Per-class Metrics:
  Class 0: {'precision': 0.9966, 'recall': 0.9933, 'f1-score': 0.995, 'support': 299}
  Class 1: {'precision': 0.8667, 'recall': 0.9286, 'f1-score': 0.8966, 'support': 14}

🔁 Fold 3
✅ Accuracy: 0.9840
Confusion Matrix:
[[294   5]
 [  0  14]]
Per-class Metrics:
  Class 0: {'precision': 1.0, 'recall': 0.9833, 'f1-score': 0.9916, 'support': 299}
  Class 1: {'precision': 0.7368, 'recall': 1.0, 'f1-score': 0.8485, 'support': 14}

🔁 Fold 4
✅ Accuracy: 0.9936
Confusion Matrix:
[[298   1]
 [  1  13]]
Per-class Metrics:
  Class 0: {'precision': 0.9967, 'recall': 0.9967, 'f1-score': 0.9967, 'support': 299}
  Class 1: {'precision': 0.9286, 'recall': 0.