In [None]:
# -*- coding: utf-8 -*-
import warnings, pickle
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import r2_score, mean_absolute_error

# Silence every warning category for the rest of the session.
# NOTE(review): this is a blanket filter, so library deprecation notices are hidden too —
# consider narrowing it to the specific categories that are noisy.
warnings.filterwarnings("ignore")

# ======== Configuration ========
# Everything a reader may want to tune lives here.
CSV_PATH = "9_makale_data_guncel.csv"  # input table loaded with pd.read_csv
TARGET   = "band_gap"  # regression target column
ID_COLS  = ["material_id", "formula_pretty"]  # identifier columns dropped before modelling
RANDOM_STATE = 42  # forwarded to XGBoost as the "seed" parameter
MODEL_PATH = "xgb_model.pkl"  # destination of the pickled model wrapper
EARLY_STOP = 100  # value passed as early_stopping_rounds to xgb.train

# --- Data Loading and Indexing ---
df = pd.read_csv(CSV_PATH)
# errors="ignore" already skips identifier columns that are absent from the CSV,
# so no explicit membership filter is needed.
df = df.drop(columns=ID_COLS, errors="ignore")
y_all = df[TARGET].astype(float)
# Only numeric columns are kept as features.
X_all = df.drop(columns=[TARGET]).select_dtypes(include="number")

# Treat +/-inf as missing, then keep only rows whose features and target are all present.
X_all = X_all.replace([np.inf, -np.inf], np.nan)
mask = X_all.notna().all(axis=1) & y_all.notna()
X_all, y_all = X_all.loc[mask], y_all.loc[mask]

# Load predefined split indices.
# NOTE(review): the indices are applied positionally (.iloc) to the *filtered* frame —
# confirm the split files were generated after the same row filtering.
idx_tr = pd.read_csv("split_train.csv")["idx"].values
idx_va = pd.read_csv("split_val.csv")["idx"].values
idx_te = pd.read_csv("split_test.csv")["idx"].values


def _validate_split_positions(n_rows, splits):
    """Fail fast when split indices are out of range or shared between splits.

    Parameters
    ----------
    n_rows : int
        Number of rows available for positional indexing.
    splits : dict[str, numpy.ndarray]
        Mapping of split name ("train", "val", "test") to its positional indices.

    Raises
    ------
    IndexError
        If a split holds a position outside ``[0, n_rows - 1]``. Negative positions
        are rejected because ``.iloc`` would silently count them from the end.
    ValueError
        If two splits share at least one position (train/val/test leakage).
    """
    for split_name, positions in splits.items():
        if len(positions) and (positions.min() < 0 or positions.max() >= n_rows):
            raise IndexError(
                f"split_{split_name}.csv contains positions outside [0, {n_rows - 1}]"
            )
    names = list(splits)
    for i, first in enumerate(names):
        for second in names[i + 1:]:
            n_shared = np.intersect1d(splits[first], splits[second]).size
            if n_shared:
                raise ValueError(
                    f"{first} and {second} splits share {n_shared} row(s); "
                    "the splits must be disjoint"
                )


_validate_split_positions(len(X_all), {"train": idx_tr, "val": idx_va, "test": idx_te})

X_train, y_train = X_all.iloc[idx_tr], y_all.iloc[idx_tr]
X_val,   y_val   = X_all.iloc[idx_va], y_all.iloc[idx_va]
X_test,  y_test  = X_all.iloc[idx_te], y_all.iloc[idx_te]

# --- XGBoost Data Structure (DMatrix) ---
# Wrap each (features, target) pair in XGBoost's native container, in train/val/test order.
dtrain, dval, dtest = (
    xgb.DMatrix(features, label=target)
    for features, target in ((X_train, y_train), (X_val, y_val), (X_test, y_test))
)

# --- Hyperparameters ---
# Fixed hyperparameter set handed to xgb.train; key order matches the original listing.
params = dict(
    objective="reg:squarederror",
    learning_rate=0.03865291106813676,
    max_depth=9,
    min_child_weight=11.689593895871,
    subsample=0.8809231752905414,
    colsample_bytree=0.864323016559627,
    colsample_bylevel=0.9986567609362819,
    gamma=0.0015113421306209107,
    reg_alpha=24.332514099721607,
    reg_lambda=3.062237425668688,
    tree_method="hist",  # Histogram-based algorithm for speed
    eval_metric="rmse",
    seed=RANDOM_STATE,
    nthread=-1,
)

# Upper bound on boosting rounds; early stopping may end training before it is reached.
num_boost_round = 1336
# Evaluation sets scored each round. Per the xgboost.train documentation, the last
# entry ("val") is the one monitored for early stopping.
watchlist = [(dtrain, "train"), (dval, "val")]

# Training with Early Stopping
booster = xgb.train(
    params,
    dtrain,
    num_boost_round,
    evals=watchlist,
    early_stopping_rounds=EARLY_STOP,
    verbose_eval=False,
)

# --- Performance Metrics ---
def print_metrics(name, y_true, y_pred):
    """Print R² and MAE for one data split on a single line.

    Parameters
    ----------
    name : str
        Label of the split; left-aligned and padded to 10 characters.
    y_true : array-like
        Ground-truth target values.
    y_pred : array-like
        Model predictions aligned with ``y_true``.
    """
    r2 = r2_score(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    print(f"{name:10} -> R²: {r2:.6f} | MAE: {mae:.6f}")

# Inference using the best iteration identified during training:
# every split is scored with the first best_iteration + 1 boosting rounds.
best_range = (0, booster.best_iteration + 1)
y_tr_pred, y_va_pred, y_te_pred = (
    booster.predict(dmat, iteration_range=best_range)
    for dmat in (dtrain, dval, dtest)
)

print(f"Best iteration: {booster.best_iteration}")
for split_label, split_truth, split_pred in (
    ("Train", y_train, y_tr_pred),
    ("Validation", y_val, y_va_pred),
    ("Test", y_test, y_te_pred),
):
    print_metrics(split_label, split_truth, split_pred)

# --- Sklearn-compatible Wrapper for Stacking ---
class XGBSklearnWrapper:
    """Wrapper to make the XGBoost Booster compatible with Sklearn-style .predict(X) calls."""
    def __init__(self, booster, feature_names):
        self.booster = booster
        self.feature_names_ = list(feature_names)

    def predict(self, X):
        # Ensure column order matches training if X is a DataFrame
        if isinstance(X, pd.DataFrame):
            X = X[self.feature_names_]
        dm = xgb.DMatrix(X)
        return self.booster.predict(dm, iteration_range=(0, self.booster.best_iteration + 1))

# Bundle the trained booster with its training column order and pickle it to MODEL_PATH.
final_model = XGBSklearnWrapper(booster, X_train.columns)
with open(MODEL_PATH, "wb") as model_file:
    pickle.dump(final_model, model_file)

print(f"Saved: {MODEL_PATH}")

In [None]:

# Build the `wrapper` object (columns taken from X_all) and pickle it.
# This writes to the same MODEL_PATH as the save in the training cell.
wrapper = XGBSklearnWrapper(booster, X_all.columns)
with open(MODEL_PATH, "wb") as model_file:
    pickle.dump(wrapper, model_file)

print(f"saved: {MODEL_PATH}")

In [None]:
import shap
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# === 1. SHAP Interpretability (Beeswarm Plot) ===
print("\nCalculating SHAP values...")

# Use a representative subset for background and calculation to save time.
# Seeded from the notebook-wide RANDOM_STATE so the sampled rows are reproducible.
rs = np.random.RandomState(RANDOM_STATE)
bg_idx   = rs.choice(len(X_train), size=min(500, len(X_train)), replace=False)
shap_idx = rs.choice(len(X_test),  size=min(2000, len(X_test)), replace=False)
X_bg, X_sh = X_train.iloc[bg_idx], X_test.iloc[shap_idx]

try:
    # Attempt using the optimized TreeExplainer for XGBoost.
    # NOTE(review): the reported metrics use only the first best_iteration + 1 trees;
    # confirm the installed shap version explains the same truncated model.
    explainer = shap.TreeExplainer(booster)
    shap_vals = explainer.shap_values(X_sh)
except Exception as e:
    print("TreeExplainer failed, falling back to KernelExplainer:", e)

    def _predict_from_array(values):
        """Re-attach training column names before delegating to the wrapper.

        KernelExplainer calls the model function with plain NumPy arrays; without
        column names the booster's feature validation rejects the input.
        """
        return wrapper.predict(pd.DataFrame(values, columns=X_bg.columns))

    # Use KernelExplainer as a backup (slower but model-agnostic)
    explainer = shap.KernelExplainer(_predict_from_array, X_bg)
    shap_vals = explainer.shap_values(X_sh, nsamples="auto")

# Generate and save SHAP summary plot

shap.summary_plot(shap_vals, X_sh, show=False, plot_type="dot")
plt.title("SHAP Feature Importance (Test Set)")
plt.tight_layout()
plt.savefig("shap_summary_test.png", dpi=300)
plt.close()

# === 2. Predicted vs. True Plot (Parity Plot) ===
print("Generating Predicted vs. True plot...")

y_pred_test = y_te_pred  # Using the test predictions calculated previously

# Explicit figure/axes handles instead of the pyplot state machine.
fig, ax = plt.subplots(figsize=(6, 6))
sns.scatterplot(x=y_test, y=y_pred_test, alpha=0.6, edgecolor=None, ax=ax)
# The reference diagonal spans the range of the true values.
mn, mx = float(y_test.min()), float(y_test.max())
ax.plot([mn, mx], [mn, mx], 'r--', label="Ideal (y=x)")
ax.set_xlabel("True Band Gap (eV)")
ax.set_ylabel("Predicted Band Gap (eV)")
ax.set_title("Actual vs. Predicted Performance")
ax.legend()
ax.grid(True)
fig.tight_layout()
fig.savefig("predicted_vs_true_test.png", dpi=300)
plt.close(fig)

# === 3. Residuals vs. Predicted Plot ===
print("Generating Residuals plot...")

# Residual = true value minus prediction, so positive values are under-predictions.
residuals = y_test - y_pred_test

# Explicit figure/axes handles instead of the pyplot state machine.
fig, ax = plt.subplots(figsize=(6, 4))
sns.scatterplot(x=y_pred_test, y=residuals, alpha=0.6, edgecolor=None, ax=ax)
ax.axhline(0, color='red', linestyle='--')
ax.set_xlabel("Predicted Band Gap (eV)")
ax.set_ylabel("Residuals (Error)")
ax.set_title("Residual Analysis")
ax.grid(True)
fig.tight_layout()
fig.savefig("residuals_test.png", dpi=300)
plt.close(fig)

print("✅ Visualizations saved: shap_summary_test.png, predicted_vs_true_test.png, residuals_test.png")