In [None]:
import os
import json
import time
import pickle
import numpy as np
import xgboost as xgb
from sklearn.metrics import accuracy_score
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

In [None]:
# Load the scikit-learn breast-cancer dataset and cast it to compact dtypes
# (float32 features, int32 labels). copy=False avoids a copy when the array
# already has the requested dtype.
data = load_breast_cancer()
X = data.data.astype(np.float32, copy=False)
y = data.target.astype(np.int32, copy=False)

# 80/20 hold-out split, stratified on the label and seeded for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

print(f"X_train: {X_train.shape} X_test: {X_test.shape}")
print(f"Classes: {np.unique(y_test)}")

In [None]:
# Path of an optional pre-trained model; when it is missing a baseline is
# trained in this cell instead.
MODEL_PKL_PATH = "xgb_model_before_optimization.pkl"

if not os.path.exists(MODEL_PKL_PATH):
    # Fallback: train a small binary-logistic baseline on the training split.
    dtrain = xgb.DMatrix(X_train, label=y_train)
    params = {
        "objective": "binary:logistic",
        "eval_metric": "logloss",
        "max_depth": 4,
        "eta": 0.1,
        "subsample": 0.9,
        "colsample_bytree": 0.9,
        "seed": 42,
    }
    booster = xgb.train(params=params, dtrain=dtrain, num_boost_round=200)
    print("Trained baseline Booster (because pickle not found).")
else:
    # NOTE(security): pickle.load can execute arbitrary code embedded in the
    # file. Only point MODEL_PKL_PATH at pickles you created or fully trust.
    with open(MODEL_PKL_PATH, "rb") as f:
        obj = pickle.load(f)

    # Accept either a wrapper exposing get_booster() (e.g. the sklearn-style
    # estimators) or a raw Booster; anything else is rejected.
    if hasattr(obj, "get_booster"):
        booster = obj.get_booster()
        print("Loaded sklearn XGB model -> got Booster.")
    elif isinstance(obj, xgb.Booster):
        booster = obj
        print("Loaded Booster directly.")
    else:
        raise TypeError(f"Loaded object is not XGBoost Booster/sklearn model. Type={type(obj)}")

print("Booster ready.")

In [None]:
def export_ubj(bst: xgb.Booster, path: str) -> float:
    """Save ``bst`` to ``path`` and report the resulting file size.

    The file is written with ``bst.save_model(path)``; this function does not
    pick the serialization format itself (callers in this notebook pass
    ``.ubj`` file names).

    Args:
        bst: Model object exposing ``save_model(path)``.
        path: Destination file path.

    Returns:
        Size of the written file in mebibytes (bytes / 1024**2).
    """
    bst.save_model(path)
    size_in_bytes = os.path.getsize(path)
    bytes_per_mib = 1024 * 1024
    return size_in_bytes / bytes_per_mib

# Score the baseline booster on the hold-out split. Predictions are
# thresholded at 0.5 to obtain hard 0/1 labels.
dm_test = xgb.DMatrix(X_test)
p = booster.predict(dm_test)
y_hat = (p >= 0.5).astype(np.int32)
acc = float(accuracy_score(y_test, y_hat))

# Persist the baseline so its on-disk size can be compared with the
# optimized model later on.
base_path = "xgb_baseline.ubj"
base_mb = export_ubj(booster, base_path)

print(f"Baseline accuracy: {acc}")
print(f"Baseline exported: {base_path} ({base_mb:.4f} MB)")

In [None]:
# Pruning hyper-parameters: splits whose loss reduction is below PRUNE_GAMMA
# are removed, and trees are capped at PRUNE_MAX_DEPTH.
# NOTE: the fallback baseline in this notebook is trained with max_depth=4,
# so a cap of 6 does not constrain that model; only gamma has an effect there.
PRUNE_GAMMA = 0.01
PRUNE_MAX_DEPTH = 6
dtrain = xgb.DMatrix(X_train, label=y_train)

# With process_type="update", every boosting round takes ONE iteration from
# the input model, runs the updater on it and appends it to the new model.
# The number of rounds therefore has to match the number of rounds already in
# the model; running a single round would keep only the first iteration and
# silently drop the rest of the ensemble.
n_rounds = booster.num_boosted_rounds()

# Stage 1: recompute node statistics (and leaf values) on the training data
# so the pruner works with up-to-date gain values.
bst_refresh = xgb.train(
    params={"process_type": "update", "updater": "refresh", "refresh_leaf": True},
    dtrain=dtrain,
    num_boost_round=n_rounds,
    xgb_model=booster,
)

# Stage 2: prune every refreshed tree.
bst_pruned = xgb.train(
    params={
        "process_type": "update",
        "updater": "prune",
        "gamma": PRUNE_GAMMA,
        "max_depth": PRUNE_MAX_DEPTH,
    },
    dtrain=dtrain,
    num_boost_round=n_rounds,
    xgb_model=bst_refresh,
)

print("Optimization done (refresh + prune).")

In [None]:
# Score the pruned booster on the same hold-out split as the baseline.
dm_test = xgb.DMatrix(X_test)
p_opt = bst_pruned.predict(dm_test)
y_hat_opt = (p_opt >= 0.5).astype(np.int32)
acc_opt = float(accuracy_score(y_test, y_hat_opt))

# Export the optimized model and record its on-disk size.
opt_path = "xgb_optimized.ubj"
opt_mb = export_ubj(bst_pruned, opt_path)

# Round-trip check: reload the exported file and measure the largest absolute
# difference between in-memory and reloaded predictions.
bst_loaded = xgb.Booster()
bst_loaded.load_model(opt_path)
p_loaded = bst_loaded.predict(dm_test)
max_diff = float(np.max(np.abs(p_loaded - p_opt)))

print(f"Optimized accuracy: {acc_opt}")
print(f"Optimized exported: {opt_path} ({opt_mb:.4f} MB)")
print(f"Reload max |diff|: {max_diff}")

In [None]:
def measure_inference_ms(
    bst: xgb.Booster,
    X: np.ndarray,
    batch_size: int = 1,
    n_warmup: int = 20,
    n_runs: int = 200,
    nthread: int | None = None,
) -> dict:
    """Time ``bst.predict`` on fixed-size batches drawn from ``X``.

    Only the ``predict`` call is timed; building the ``DMatrix`` for each
    batch happens outside the timed region. Batch ``i`` starts at row
    ``(i * batch_size) % len(X)`` and wraps around to the beginning of ``X``
    when it would run past the end, so every batch holds exactly
    ``batch_size`` rows (rows repeat when ``batch_size > len(X)``). This keeps
    the per-sample figures, which divide by ``batch_size``, honest.

    Args:
        bst: Booster to benchmark. If ``nthread`` is given it is applied with
            ``bst.set_param`` and stays set on the booster afterwards.
        X: Feature rows to draw batches from; converted to float32.
        batch_size: Rows per predict call (>= 1).
        n_warmup: Untimed predict calls executed first (>= 0).
        n_runs: Timed predict calls (>= 1).
        nthread: Optional thread count to set on the booster.

    Returns:
        dict with ``batch_size``, ``nthread`` and mean/p50/p95 latency in
        milliseconds, both per batch and per sample.

    Raises:
        ValueError: If ``X`` is empty or any of ``batch_size``, ``n_warmup``,
            ``n_runs`` is out of range.
    """
    X = np.asarray(X, dtype=np.float32)
    n = X.shape[0]
    if n == 0:
        raise ValueError("X is empty.")
    if batch_size <= 0:
        raise ValueError("batch_size must be >= 1.")
    if n_warmup < 0:
        raise ValueError("n_warmup must be >= 0.")
    if n_runs <= 0:
        raise ValueError("n_runs must be >= 1.")

    # Validate first so a rejected call does not leave the booster modified.
    if nthread is not None:
        bst.set_param({"nthread": int(nthread)})

    offsets = np.arange(batch_size)

    def _batch(i: int) -> np.ndarray:
        # Modulo indexing wraps around the end of X instead of truncating,
        # which a plain slice X[start:start + batch_size] would do.
        return X[(i * batch_size + offsets) % n]

    for i in range(n_warmup):
        dm = xgb.DMatrix(_batch(i))
        _ = bst.predict(dm)

    times = []
    for i in range(n_runs):
        dm = xgb.DMatrix(_batch(i))
        t0 = time.perf_counter()
        _ = bst.predict(dm)
        t1 = time.perf_counter()
        times.append((t1 - t0) * 1000.0)

    times = np.array(times, dtype=np.float64)
    ms_mean = float(times.mean())
    ms_p50 = float(np.percentile(times, 50))
    ms_p95 = float(np.percentile(times, 95))

    return {
        "batch_size": int(batch_size),
        "nthread": None if nthread is None else int(nthread),
        "mean_ms_per_batch": ms_mean,
        "p50_ms_per_batch": ms_p50,
        "p95_ms_per_batch": ms_p95,
        "mean_ms_per_sample": ms_mean / batch_size,
        "p50_ms_per_sample": ms_p50 / batch_size,
        "p95_ms_per_sample": ms_p95 / batch_size,
    }

In [None]:
# Batch sizes to benchmark; nthread=None leaves each booster's thread setting
# untouched.
batch_sizes = [1, 8, 32, 128]
nthread = None

# One result row per (batch size, model), baseline first and then optimized.
results = []
for bs in batch_sizes:
    for label, model in (("baseline", booster), ("optimized", bst_pruned)):
        row = measure_inference_ms(model, X_test, batch_size=bs, nthread=nthread)
        row["model"] = label
        results.append(row)

# Index rows by (model, batch size). Iterating in reverse makes the FIRST
# matching row win, should a key ever appear twice.
rows_by_key = {(r["model"], r["batch_size"]): r for r in reversed(results)}

for bs in batch_sizes:
    b = rows_by_key[("baseline", bs)]
    o = rows_by_key[("optimized", bs)]
    print(f"\nBatch size = {bs}")
    print(f"  Baseline : mean {b['mean_ms_per_batch']:.4f} ms/batch  | {b['mean_ms_per_sample']:.4f} ms/sample")
    print(f"  Optimized: mean {o['mean_ms_per_batch']:.4f} ms/batch  | {o['mean_ms_per_sample']:.4f} ms/sample")


In [None]:
import time
import numpy as np

def measure_inference_time(model, dmatrix, num_runs=100, warm_up=10):
    """Time repeated ``model.predict(dmatrix)`` calls on one fixed matrix.

    Runs ``warm_up`` untimed predictions, then times ``num_runs`` predictions
    as a single block and reports averages. A short summary is printed.

    Args:
        model: Object exposing ``predict(dmatrix)``.
        dmatrix: Input passed to every predict call; must expose
            ``num_row()``.
        num_runs: Number of timed predict calls (>= 1).
        warm_up: Number of untimed predict calls executed first.

    Returns:
        dict with ``time_per_sample_us`` (average microseconds per row) and
        ``total_time_ms`` (average milliseconds per predict call).

    Raises:
        ValueError: If ``num_runs`` < 1 or ``dmatrix`` has no rows (either
            would otherwise cause a division by zero).
    """
    if num_runs < 1:
        raise ValueError("num_runs must be >= 1.")
    num_samples = dmatrix.num_row()
    if num_samples < 1:
        raise ValueError("dmatrix is empty.")

    for _ in range(warm_up):
        _ = model.predict(dmatrix)

    # perf_counter is a monotonic, high-resolution clock intended for
    # measuring intervals; time.time() follows the wall clock and can jump.
    start_time = time.perf_counter()
    for _ in range(num_runs):
        _ = model.predict(dmatrix)
    end_time = time.perf_counter()

    total_time_sec = end_time - start_time
    avg_time_per_run_sec = total_time_sec / num_runs

    avg_time_per_sample_us = (avg_time_per_run_sec / num_samples) * 1e6
    total_avg_time_ms = avg_time_per_run_sec * 1000

    print(f"--- Inference Speed ({num_runs} runs) ---")
    print(f"Total time per batch: {total_avg_time_ms:.4f} ms")
    # The micro sign is written as an escape so the source stays ASCII and the
    # unit cannot be garbled by an encoding mismatch again.
    print(f"Time per sample:      {avg_time_per_sample_us:.4f} \u00b5s")

    return {
        "time_per_sample_us": avg_time_per_sample_us,
        "total_time_ms": total_avg_time_ms
    }

# Time both models on the full hold-out DMatrix; each call prints its own
# summary, so the returned dict is not kept.
for heading, model in (("Baseline Model:", booster), ("\nOptimized Model:", bst_pruned)):
    print(heading)
    _ = measure_inference_time(model, dm_test)

In [None]:
def count_total_nodes(bst: xgb.Booster) -> int:
    """Count every node (internal and leaf) across all trees of ``bst``.

    Each tree is obtained as a JSON string from ``bst.get_dump`` and walked
    without recursion; a node's children are read from its ``"children"`` key
    when that key is present.

    Args:
        bst: Model object exposing ``get_dump(with_stats=..., dump_format=...)``.

    Returns:
        Total number of nodes summed over all dumped trees.
    """
    # Seed the work list with every tree root, then drain it.
    pending = [
        json.loads(tree_json)
        for tree_json in bst.get_dump(with_stats=False, dump_format="json")
    ]
    node_count = 0
    while pending:
        current = pending.pop()
        node_count += 1
        pending.extend(current.get("children", ()))
    return node_count

# Compare model complexity before and after optimization by total node count.
before_nodes, after_nodes = (
    count_total_nodes(model) for model in (booster, bst_pruned)
)

print(f"Nodes before: {before_nodes}")
print(f"Nodes after : {after_nodes}")
print(f"Reduced by  : {before_nodes - after_nodes}")

In [None]:
import json
import numpy as np
import xgboost as xgb
from sklearn.metrics import accuracy_score

def evaluate_overfitting_xgb(bst: xgb.Booster, X_train, y_train, X_test, y_test, name="model"):
    """Report train vs. test accuracy of a Booster and the gap between them.

    The booster's objective is read from its saved config and decides how raw
    predictions become class labels:

    * ``binary:logistic``  - predictions thresholded at 0.5
    * ``binary:logitraw``  - margins passed through a sigmoid, then 0.5
    * ``multi:softprob``   - argmax over classes (flat output is reshaped)
    * ``multi:softmax``    - predictions cast to int class indices

    Args:
        bst: Trained ``xgboost.Booster``.
        X_train, X_test: 2D feature arrays.
        y_train, y_test: Label arrays, one entry per row of the matching X.
        name: Tag used as a prefix in the printed report.

    Returns:
        dict with ``train_acc``, ``test_acc``, ``gap`` (train minus test) and
        ``objective``.

    Raises:
        TypeError: ``bst`` is not an ``xgboost.Booster``.
        ValueError: Inputs have the wrong shape, or predictions do not have
            the layout expected for the objective.
        NotImplementedError: The objective is not one of the four above.
    """
    if not isinstance(bst, xgb.Booster):
        raise TypeError(f"bst must be xgboost.Booster, got {type(bst)}")

    X_train = np.asarray(X_train)
    X_test = np.asarray(X_test)
    y_train = np.asarray(y_train).astype(int, copy=False)
    y_test = np.asarray(y_test).astype(int, copy=False)

    if not (X_train.ndim == 2 and X_test.ndim == 2):
        raise ValueError("X_train and X_test must be 2D arrays.")
    n_train, n_test = X_train.shape[0], X_test.shape[0]
    if n_train != y_train.shape[0] or n_test != y_test.shape[0]:
        raise ValueError("Mismatch between X and y lengths.")

    # Objective name and class count come from the booster's own config.
    learner_cfg = json.loads(bst.save_config())["learner"]
    objective = learner_cfg["objective"]["name"]
    num_class = int(learner_cfg["learner_model_param"].get("num_class", 0))

    dtrain = xgb.DMatrix(X_train)
    dtest = xgb.DMatrix(X_test)
    pred_train = bst.predict(dtrain)
    pred_test = bst.predict(dtest)

    def _require_1d(message):
        # Shared guard for objectives that must yield one value per row.
        if pred_train.ndim != 1 or pred_test.ndim != 1:
            raise ValueError(message)

    def _as_class_matrix(pred, n_rows, size_error):
        # A flat softprob output is reshaped to (rows, classes) after a size
        # check; output that is not 1D is returned unchanged.
        if pred.ndim == 1:
            if pred.size != n_rows * num_class:
                raise ValueError(size_error)
            pred = pred.reshape(n_rows, num_class)
        return pred

    def _sigmoid(margin):
        return 1.0 / (1.0 + np.exp(-margin))

    if objective == "binary:logistic":
        _require_1d("binary:logistic should produce 1D predictions.")
        yhat_train = (pred_train >= 0.5).astype(int)
        yhat_test = (pred_test >= 0.5).astype(int)

    elif objective == "binary:logitraw":
        _require_1d("binary:logitraw should produce 1D margins.")
        yhat_train = (_sigmoid(pred_train) >= 0.5).astype(int)
        yhat_test = (_sigmoid(pred_test) >= 0.5).astype(int)

    elif objective == "multi:softprob":
        if num_class <= 1:
            raise ValueError("multi:softprob requires num_class > 1.")
        pred_train = _as_class_matrix(pred_train, n_train, "Unexpected softprob size for train.")
        pred_test = _as_class_matrix(pred_test, n_test, "Unexpected softprob size for test.")
        yhat_train = np.argmax(pred_train, axis=1).astype(int)
        yhat_test = np.argmax(pred_test, axis=1).astype(int)

    elif objective == "multi:softmax":
        _require_1d("multi:softmax should produce 1D class indices.")
        yhat_train = pred_train.astype(int)
        yhat_test = pred_test.astype(int)

    else:
        raise NotImplementedError(
            f"Objective not supported in this evaluator: {objective}\n"
            "Add handling here if you use a different objective."
        )

    acc_train = float(accuracy_score(y_train, yhat_train))
    acc_test = float(accuracy_score(y_test, yhat_test))
    gap = acc_train - acc_test

    print(f"[{name}] objective : {objective}")
    print(f"[{name}] Train acc: {acc_train:.4f}")
    print(f"[{name}] Test  acc: {acc_test:.4f}")
    print(f"[{name}] Gap       : {gap:.4f}")

    return {"train_acc": acc_train, "test_acc": acc_test, "gap": gap, "objective": objective}

# Print the train/test accuracy gap for both models; the function prints its
# own report, so the returned dict is not kept.
for heading, model, tag in (
    ("Baseline:", booster, "baseline"),
    ("\nOptimized:", bst_pruned, "optimized"),
):
    print(heading)
    _ = evaluate_overfitting_xgb(model, X_train, y_train, X_test, y_test, name=tag)