# XGBoost Training — Starter Notebook (Aligned with `xgboost-training/`)
This notebook is designed for the directory layout:
```
xgboost-training/
├─ data/{raw,processed}
├─ notebooks/
└─ src/
```
It uses built-in scikit-learn datasets (no downloads) and is easy to extend to files under `data/`.

## Environment & Imports

In [None]:
# If needed on your local machine, install the dependencies in your active virtualenv:
# !pip install xgboost scikit-learn matplotlib shap ipykernel

import os, sys, json, math, warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score, RocCurveDisplay

## Project Paths

In [None]:
from pathlib import Path

# Resolve the project layout relative to the current working directory.
# The project root is assumed to be the parent of the notebook directory.
NB_DIR = Path.cwd()
PROJECT_ROOT = NB_DIR.parent
DATA_DIR = PROJECT_ROOT.joinpath("data")
RAW_DIR = DATA_DIR.joinpath("raw")
PROC_DIR = DATA_DIR.joinpath("processed")
SRC_DIR = PROJECT_ROOT.joinpath("src")

# Create any folder that is missing; existing folders are left untouched.
for p in (DATA_DIR, RAW_DIR, PROC_DIR, SRC_DIR):
    p.mkdir(parents=True, exist_ok=True)

# Echo the resolved locations so a wrong working directory is easy to spot.
for label, location in (
    ("PROJECT_ROOT:", PROJECT_ROOT),
    ("DATA_DIR:", DATA_DIR),
    ("RAW_DIR:", RAW_DIR),
    ("PROC_DIR:", PROC_DIR),
    ("SRC_DIR:", SRC_DIR),
):
    print(label, location)

## Table of Contents
1. [Phase 1 — Core Concepts](#phase1)
   - [Iris (Multiclass)](#iris)
   - [Diabetes (Regression)](#diabetes)
   - [Breast Cancer (Binary + ROC/AUC)](#cancer)
2. [Phase 2 — Pipelines & Early Stopping](#phase2)
   - [Synthetic Classification with Categorical + Numeric](#synthetic)
   - [Regression with Early Stopping + Feature Importance](#early)
3. [Extras — Simple Hyperparameter Search](#extras)

---
<a id="phase1"></a>
# Phase 1 — Core Concepts
Small, clean datasets to learn XGBoost mechanics.

<a id="iris"></a>
## 1.1 Iris — Multiclass Classification

In [None]:
from sklearn.datasets import load_iris

# Load the bundled Iris data: feature matrix X and integer class labels y.
iris = load_iris()
X = iris.data
y = iris.target

# Stratified 80/20 hold-out split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)

# Multiclass booster scored with multiclass log-loss.
clf = xgb.XGBClassifier(
    objective="multi:softmax",
    eval_metric="mlogloss",
    n_estimators=200,
    learning_rate=0.1,
    max_depth=4,
    subsample=0.9,
    colsample_bytree=0.9,
    random_state=42,
)
clf.fit(X_train, y_train)
pred = clf.predict(X_test)

# Report hold-out accuracy and the per-class precision/recall table.
test_accuracy = accuracy_score(y_test, pred)
print("Accuracy:", test_accuracy)
class_report = classification_report(
    y_test, pred, target_names=iris.target_names
)
print("\nClassification Report:\n", class_report)

# Plot the fitted model's feature importances.
xgb.plot_importance(clf, height=0.5)
plt.title("Feature Importance — Iris")
plt.show()

## Visualizing XGBoost Tree Architecture on Iris

Below we plot the **first few trees** from our Iris classifier to understand:
- How they split features
- How they output leaf values (log-odds contributions)
- How multiple trees per boosting round are used for multiclass

In [None]:
from xgboost import plot_tree
import matplotlib.pyplot as plt

# Draw trees 0, 1 and 2 one after another. The titles follow the notebook's
# reading that, in the first boosting round, tree k belongs to class k.
for tree_index in range(3):
    plot_tree(clf, num_trees=tree_index)
    plt.title(f"Tree {tree_index} — Round 1, Class {tree_index}")
    plt.show()

# Optional: print the booster's raw text dump for the first tree.
print("\nRaw text dump for Tree 0:")
first_tree_dump = clf.get_booster().get_dump()[0]
print(first_tree_dump)

In [None]:
import numpy as np
from sklearn.preprocessing import normalize

# Work with the first five held-out rows.
X_sample = X_test[:5]

# Raw scores (logits) BEFORE softmax, requested from the underlying booster.
# NOTE(review): iteration_range=(0, 0) is presumably XGBoost's "use every
# round" default — confirm against the installed version.
margin_options = {
    "strict_shape": True,
    "iteration_range": (0, 0),
    "predict_type": "margin",
}
raw_scores = clf.get_booster().inplace_predict(X_sample, **margin_options)
# Note: depending on xgboost version, you may need:
# raw_scores = clf.get_booster().inplace_predict(X_sample, predict_type="margin")

# raw_scores shape: (n_samples, n_classes)
print("Raw scores (logits):\n", raw_scores)

# Softmax to get probabilities
def softmax(z):
    """Turn each row of scores in ``z`` into probabilities that sum to 1.

    The per-row maximum is subtracted before exponentiating so that large
    scores do not overflow; this shift leaves the result unchanged.
    """
    row_peak = np.max(z, axis=1, keepdims=True)
    exps = np.exp(z - row_peak)
    return exps / np.sum(exps, axis=1, keepdims=True)

# Convert the raw margins into per-class probabilities.
probs = softmax(raw_scores)
print("\nProbabilities via softmax:\n", probs)

# Show the class indices the fitted model predicts for the same rows.
predicted_classes = clf.predict(X_sample)
print("\nPredicted classes:", predicted_classes)


In [None]:
import numpy as np

def softmax(z):
    """Row-wise softmax: normalise scores along axis 1 into probabilities.

    Each row is shifted by its own maximum before exponentiation, which
    guards against overflow without changing the result.
    """
    shifted = z - np.max(z, axis=1, keepdims=True)
    weights = np.exp(shifted)
    normaliser = weights.sum(axis=1, keepdims=True)
    return weights / normaliser

booster = clf.get_booster()

# Class count: sets the width of the logit row and feeds the fallback
# round count below.
n_classes = len(np.unique(y))

# Number of boosting rounds (NOT number of trees)
try:
    n_rounds = booster.num_boosted_rounds()
except AttributeError:
    # Fallback when the booster lacks num_boosted_rounds():
    # total trees / n_classes
    n_rounds = len(booster.get_dump()) // n_classes

# Pick a single sample to trace (first test row)
s = X_test[0:1]

# Stays all-zero only when the model has no boosting rounds.
cum = np.zeros((1, n_classes), dtype=float)
print("Round | logits (cum) -> probs")
for t in range(n_rounds):
    # Margin after the first t+1 rounds, fetched in one call over [0, t+1).
    # Summing separate single-round predictions instead would add the
    # model's base score again on every round and inflate the printed
    # logits; a ranged prediction includes it exactly once.
    cum = np.asarray(
        booster.inplace_predict(
            s,
            iteration_range=(0, t + 1),
            predict_type="margin",
        ),
        dtype=float,
    ).reshape(1, -1)  # learning_rate already applied internally
    print(
        f"{t+1:5d} | {np.round(cum, 3)} -> {np.round(softmax(cum), 3)}"
    )

print("\nModel predict_proba for comparison:",
      np.round(clf.predict_proba(s), 3))
print("Model predicted class:", clf.predict(s))


<a id="diabetes"></a>
## 1.2 Diabetes — Regression

In [None]:
from sklearn.datasets import load_diabetes
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Load the bundled diabetes data: features Xd and the numeric target yd.
diabetes = load_diabetes()
Xd = diabetes.data
yd = diabetes.target

# Plain 80/20 hold-out split with a fixed seed.
Xd_train, Xd_test, yd_train, yd_test = train_test_split(
    Xd, yd, random_state=42, test_size=0.2
)

# Squared-error regressor: many shallow trees with a small learning rate.
reg = xgb.XGBRegressor(
    objective="reg:squarederror",
    n_estimators=500,
    learning_rate=0.05,
    max_depth=4,
    subsample=0.9,
    colsample_bytree=0.9,
    random_state=42,
)
reg.fit(Xd_train, yd_train)
pred = reg.predict(Xd_test)

# Hold-out error: RMSE is the square root of the MSE; MAE is reported as-is.
mse = mean_squared_error(yd_test, pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(yd_test, pred)
print(f"RMSE: {rmse:.3f} | MAE: {mae:.3f}")

<a id="cancer"></a>
## 1.3 Breast Cancer — Binary Classification + ROC/AUC

In [None]:
from sklearn.datasets import load_breast_cancer

# Load the bundled breast-cancer data: features Xb and binary labels yb.
bc = load_breast_cancer()
Xb = bc.data
yb = bc.target

# Stratified 80/20 hold-out split with a fixed seed.
Xb_train, Xb_test, yb_train, yb_test = train_test_split(
    Xb, yb, stratify=yb, test_size=0.2, random_state=42
)

# Binary logistic booster scored with log-loss.
bin_clf = xgb.XGBClassifier(
    objective="binary:logistic",
    eval_metric="logloss",
    n_estimators=400,
    learning_rate=0.05,
    max_depth=4,
    subsample=0.9,
    colsample_bytree=0.9,
    random_state=42,
)
bin_clf.fit(Xb_train, yb_train)

# Column 1 of predict_proba is used as the score for ROC/AUC.
class_probabilities = bin_clf.predict_proba(Xb_test)
proba = class_probabilities[:, 1]
auc = roc_auc_score(yb_test, proba)
print(f"ROC AUC: {auc:.3f}")

# Plot the ROC curve from the same hold-out scores.
RocCurveDisplay.from_predictions(yb_test, proba)
plt.title("ROC Curve — Breast Cancer")
plt.show()

---
<a id="phase2"></a>
# Phase 2 — Pipelines & Early Stopping
Move toward realistic preprocessing using `ColumnTransformer` and `Pipeline`.

<a id="synthetic"></a>
## 2.1 Synthetic Classification — Numeric + Categorical Features

In [None]:
from sklearn.datasets import make_classification
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline

# Synthetic numeric features and binary labels.
X_num, y = make_classification(
    n_samples=3000,
    n_features=6,
    n_informative=4,
    class_sep=1.2,
    random_state=42,
)

# Name the numeric columns, then add two randomly drawn categorical columns.
# The draw order (colour first, then size) is kept so the seeded values
# stay the same.
num_cols = [f"num_{i}" for i in range(6)]
df = pd.DataFrame(X_num, columns=num_cols)
rng = np.random.default_rng(42)
df["cat_color"] = rng.choice(["red", "green", "blue"], size=len(df))
df["cat_size"] = rng.choice(["S", "M", "L", "XL"], size=len(df))
cat_cols = ["cat_color", "cat_size"]

# Stratified 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    df, y, stratify=y, test_size=0.2, random_state=42
)

# Numeric columns pass through unchanged; categorical columns are one-hot
# encoded, ignoring categories not seen during fitting.
pre = ColumnTransformer(
    transformers=[
        ("num", "passthrough", num_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
    ]
)

model = xgb.XGBClassifier(
    objective="binary:logistic",
    eval_metric="logloss",
    n_estimators=600,
    learning_rate=0.05,
    max_depth=5,
    subsample=0.9,
    colsample_bytree=0.9,
    random_state=42,
)

# Chain preprocessing and model so both are fitted on the training data only.
pipe = Pipeline(steps=[("pre", pre), ("model", model)])
pipe.fit(X_train, y_train)

# Score the hold-out set using column 1 of predict_proba.
proba = pipe.predict_proba(X_test)[:, 1]
auc = roc_auc_score(y_test, proba)
print(f"Pipeline ROC AUC: {auc:.3f}")

<a id="early"></a>
## 2.2 Regression with Early Stopping + Feature Importance

In [None]:
from sklearn.datasets import load_diabetes
from sklearn.metrics import mean_squared_error

# Reload the diabetes data and carve out train / validation / test parts:
# 70% train, then the remaining 30% is halved into validation and test.
d = load_diabetes()
X, y = d.data, d.target

X_train, X_tmp, y_train, y_tmp = train_test_split(X, y, test_size=0.3, random_state=42)
X_valid, X_test, y_valid, y_test = train_test_split(X_tmp, y_tmp, test_size=0.5, random_state=42)

# n_estimators is only an upper bound here: training stops once the
# validation score has not improved for 50 consecutive rounds.
# early_stopping_rounds is given to the constructor because passing it to
# fit() is deprecated since XGBoost 1.6 and rejected by current releases.
reg_es = xgb.XGBRegressor(
    n_estimators=4000,
    max_depth=5,
    learning_rate=0.02,
    subsample=0.9,
    colsample_bytree=0.9,
    objective='reg:squarederror',
    early_stopping_rounds=50,
    random_state=42
)

# The validation split is the evaluation set that drives early stopping.
reg_es.fit(
    X_train, y_train,
    eval_set=[(X_valid, y_valid)],
    verbose=False
)

# Evaluate on the untouched test split.
pred = reg_es.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, pred))
print("Best iteration:", reg_es.best_iteration)
print(f"RMSE (test): {rmse:.3f}")

xgb.plot_importance(reg_es, height=0.5)
plt.title("Feature Importance — Diabetes (Early Stopping)")
plt.show()

---
<a id="extras"></a>
## Extras — Minimal GridSearchCV Example

In [None]:
from sklearn.model_selection import GridSearchCV

# Stratified 80/20 split of the breast-cancer data (Xb, yb from section 1.3).
# The result is bound to the same names the search and the final evaluation
# use below, so this cell no longer creates an unused split under the
# generic X_train/X_test names.
Xb_train, Xb_test, yb_train, yb_test = train_test_split(
    Xb, yb, test_size=0.2, random_state=42, stratify=yb
)

# Base estimator; every hyperparameter in param_grid is set by the search.
base = xgb.XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    random_state=42
)

# 2 * 3 * 2 * 2 * 2 = 48 combinations, each fitted once per CV fold.
param_grid = {
    "n_estimators": [200, 400],
    "max_depth": [3, 4, 5],
    "learning_rate": [0.05, 0.1],
    "subsample": [0.8, 1.0],
    "colsample_bytree": [0.8, 1.0],
}

# 3-fold cross-validated search on the training part only, ranked by ROC AUC.
gs = GridSearchCV(base, param_grid, scoring="roc_auc", cv=3, n_jobs=-1)
gs.fit(Xb_train, yb_train)

# Report the winning combination and its ROC AUC on the held-out test part.
print("Best params:", gs.best_params_)
auc = roc_auc_score(yb_test, gs.best_estimator_.predict_proba(Xb_test)[:, 1])
print(f"Test ROC AUC: {auc:.3f}")

---
## Next Steps
- Drop real CSVs under `data/raw/` and adapt the `ColumnTransformer` pipeline.
- Consider exporting processed features to `data/processed/`.
- If you have a GPU, pass `device="cuda"` in the XGBoost constructors (XGBoost 2.0+; older releases use `tree_method="gpu_hist"`, which is now deprecated).
- Move reusable code into `src/` (e.g., preprocessing utilities, plotting, SHAP explainers).