<a href="https://colab.research.google.com/github/bwanatemba/Advanced-AI-Course-Codespaces/blob/main/XGBoost_(binary_classification%2C_early_stopping).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
# If running in Colab, first run:
# !pip -q install xgboost scikit-learn matplotlib

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score
import matplotlib.pyplot as plt
from xgboost import XGBClassifier
from xgboost.callback import EarlyStopping
import numpy as np

# 1) Make a small, separable dataset
X, y = make_classification(
    n_samples=2000, n_features=20, n_informative=8, n_redundant=4,
    n_clusters_per_class=2, flip_y=0.02, class_sep=1.5, random_state=42
)

X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)

# 2) Define a fast, robust XGBoost model (CPU), with early stopping
model = XGBClassifier(
    n_estimators=400,
    learning_rate=0.05,
    max_depth=4,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric="logloss",
    tree_method="hist",   # fast CPU histogram algorithm
    random_state=42,
    n_jobs=-1,
)

# 3) Train with a validation set for early stopping
early_stopping = EarlyStopping(
    rounds=30,
    min_delta=0,
    maximize=False,
)

model.fit(
    X_train, y_train,
    eval_set=[(X_valid, y_valid)],
    callbacks=[early_stopping]
)

# 4) Evaluate
proba = model.predict_proba(X_valid)[:, 1]
pred  = (proba >= 0.5).astype(int)
acc   = accuracy_score(y_valid, pred)
auc   = roc_auc_score(y_valid, proba)

print(f"Best iteration: {model.best_iteration}")
print(f"Accuracy: {acc:.3f} | ROC-AUC: {auc:.3f}")

# 5) Quick feature importance bar plot
imp = model.feature_importances_
order = np.argsort(imp)[::-1][:10]
plt.figure(figsize=(6,4))
plt.barh(range(len(order)), imp[order][::-1])
plt.yticks(range(len(order)), [f"f{idx}" for idx in order][::-1])
plt.title("XGBoost Feature Importance (top 10)")
plt.tight_layout()
plt.show()

TypeError: XGBClassifier.fit() got an unexpected keyword argument 'callbacks'