In [15]:
import os
import json
import time
import pickle
import numpy as np
import xgboost as xgb
from sklearn.metrics import accuracy_score
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

In [None]:
# Load the scikit-learn breast-cancer dataset and cast features to float32
# and labels to int32.
data = load_breast_cancer()
X, y = (
    arr.astype(dtype, copy=False)
    for arr, dtype in ((data.data, np.float32), (data.target, np.int32))
)

# Stratified hold-out split (20% test) with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

print("X_train:", X_train.shape, "X_test:", X_test.shape)
print("Classes:", np.unique(y_test))

In [None]:
MODEL_PKL_PATH = r"xgb_model_before_optimization.pkl"

# Obtain a Booster: train a fresh baseline when the pickle is absent,
# otherwise unwrap whatever object the pickle holds.
if not os.path.exists(MODEL_PKL_PATH):
    params = {
        "objective": "binary:logistic",
        "eval_metric": "logloss",
        "max_depth": 4,
        "eta": 0.1,
        "subsample": 0.9,
        "colsample_bytree": 0.9,
        "seed": 42,
    }
    dtrain = xgb.DMatrix(X_train, label=y_train)
    booster = xgb.train(params=params, dtrain=dtrain, num_boost_round=200)
    print("Trained baseline Booster (because pickle not found).")
else:
    # NOTE(review): pickle.load can execute arbitrary code from the file;
    # only load pickles that come from a trusted source.
    with open(MODEL_PKL_PATH, "rb") as f:
        obj = pickle.load(f)

    # Reject anything that is neither an sklearn-style wrapper (exposes
    # get_booster) nor a raw Booster.
    if not hasattr(obj, "get_booster") and not isinstance(obj, xgb.Booster):
        raise TypeError(f"Loaded object is not XGBoost Booster/sklearn model. Type={type(obj)}")

    if hasattr(obj, "get_booster"):
        booster = obj.get_booster()
        print("Loaded sklearn XGB model -> got Booster.")
    else:
        booster = obj
        print("Loaded Booster directly.")

print("Booster ready.")

In [None]:
import pandas as pd
import os
import xgboost as xgb
import numpy as np
import json
import tempfile
import warnings
from sklearn.metrics import accuracy_score

# DMatrix views shared by the rest of this cell: labelled training data for
# (re)training and pruning, unlabelled test features for prediction.
dtrain = xgb.DMatrix(X_train, label=y_train)
dm_test = xgb.DMatrix(X_test)

# Section banner for the cell output.
print("--- Auto-Tuning Optimization Levels (Hybrid: Retrain + Prune) ---")

def _total_nodes(bst):
    trees = bst.get_dump(with_stats=False, dump_format="json")
    total = 0
    for t in trees:
        obj = json.loads(t)
        stack = [obj]
        while stack:
            node = stack.pop()
            total += 1
            if "children" in node:
                stack.extend(node["children"])
    return total

def _size_kb(bst):
    tmp_path = "temp_model.ubj"
    bst.save_model(tmp_path)
    size = os.path.getsize(tmp_path) / 1024.0
    try: os.remove(tmp_path)
    except: pass
    return size

# Reference configuration; the sweep below overrides only gamma and max_depth.
base_params = dict(
    objective="binary:logistic",
    eval_metric="logloss",
    eta=0.1,
    subsample=0.9,
    colsample_bytree=0.9,
    seed=42,
    max_depth=4,
    gamma=0,
)

# Candidate gamma values for the retraining sweep and the list collecting
# one result record per candidate.
gammas = [0.1, 0.5, 1, 2, 5, 10, 20, 50, 100]
sweep_results = []

# Baseline model: accuracy at a 0.5 probability threshold on the test split,
# plus node count and serialized size for later comparison.
bst_base = xgb.train(params=base_params, dtrain=dtrain, num_boost_round=200)
acc_base = accuracy_score(y_test, (bst_base.predict(dm_test) >= 0.5).astype(int))
base_nodes = _total_nodes(bst_base)
print(f"Baseline: Acc={acc_base:.4f}, Nodes={base_nodes}, Size={_size_kb(bst_base):.2f} KB")

print("\n[Retraining Sweep]")

def _train_sweep_candidate(g_val, d_val=6):
    """Train a 200-round booster from ``base_params`` with gamma/max_depth overridden.

    Args:
        g_val: value used for the ``"gamma"`` parameter.
        d_val: value used for the ``"max_depth"`` parameter (default 6).

    Returns:
        The booster returned by ``xgb.train`` on the module-level ``dtrain``.
    """
    # NOTE(review): the default depth (6) differs from base_params' max_depth
    # (4), so sweep candidates are not depth-matched to the baseline; confirm
    # this is intended.
    candidate_params = {**base_params, "gamma": g_val, "max_depth": d_val}
    return xgb.train(params=candidate_params, dtrain=dtrain, num_boost_round=200)

# Retrain one candidate per gamma and record accuracy / node count relative
# to the baseline model.
for g in gammas:
    bst = _train_sweep_candidate(g)
    acc = accuracy_score(y_test, (bst.predict(dm_test) >= 0.5).astype(int))
    nodes = _total_nodes(bst)
    # Relative accuracy loss vs. baseline in percent (negative = improvement).
    drop = (acc_base - acc) / acc_base * 100
    # Relative node reduction vs. baseline in percent (negative = larger model).
    comp = (base_nodes - nodes) / base_nodes * 100
    # Print the accuracy change with its real sign; a hard-coded "-" in front
    # of `drop` rendered improvements as "--0.88%".
    print(f"Gamma={g:<5} | Nodes={nodes:<5} ({comp:.1f}%) | Acc={acc:.4f} ({-drop:+.2f}%)")
    sweep_results.append({"bst": bst, "gamma": g, "acc": acc, "nodes": nodes, "drop": drop})

# Registries filled by the selection steps: `levels` holds only the optimized
# levels, `trained_models` additionally includes the baseline.
levels = {}
trained_models = {"Baseline": bst_base}

# Candidates ordered from smallest to largest model.
valid_cands = sorted(sweep_results, key=lambda x: x["nodes"])

def get_best_under(limit):
    """Return the first entry of ``valid_cands`` whose ``"drop"`` is <= ``limit``.

    ``valid_cands`` is read from module scope in its existing order.

    Returns:
        The matching candidate dict, or ``None`` when no entry qualifies.
    """
    for cand in valid_cands:
        if cand["drop"] <= limit:
            return cand
    return None

# Light: first candidate (in valid_cands order) within a 3% accuracy drop.
l_cand = get_best_under(3.0)
if l_cand:
    levels["Light"] = trained_models["Light"] = l_cand["bst"]
    print(f"Selected Light: Gamma={l_cand['gamma']}")

# Medium: first candidate within a 6% drop, kept only when it has fewer
# nodes than Light (or when there is no Light level at all).
m_cand = get_best_under(6.0)
if not m_cand:
    print("Skipped Medium: No candidate found < 6.0% drop.")
else:
    # With no Light level the bound is infinite, so any candidate qualifies.
    light_nodes = _total_nodes(levels["Light"]) if "Light" in levels else float('inf')
    if m_cand["nodes"] < light_nodes:
        levels["Medium"] = trained_models["Medium"] = m_cand["bst"]
        print(f"Selected Medium: Gamma={m_cand['gamma']}")
    else:
        print(f"Skipped Medium: Best candidate (Gamma={m_cand['gamma']}, Nodes={m_cand['nodes']}) is not smaller than Light.")

# The pruning stage starts from the most optimized level available:
# Medium, else Light, else the baseline.
if "Medium" in levels:
    base_for_prune = levels["Medium"]
elif "Light" in levels:
    base_for_prune = levels["Light"]
else:
    base_for_prune = bst_base
print(f"\n[Aggressive Pruning] Base Nodes={_total_nodes(base_for_prune)}")

def _prune_candidate(base_model, prune_gamma, prune_depth=6):
    """Refresh and then prune every tree of ``base_model`` on ``dtrain``.

    Two update passes are run with ``process_type="update"``: first the
    ``refresh`` updater (with ``refresh_leaf`` enabled), then the ``prune``
    updater using ``prune_gamma`` and ``prune_depth``.

    Args:
        base_model: Booster to start from; it is passed as ``xgb_model``.
        prune_gamma: ``gamma`` given to the prune updater.
        prune_depth: ``max_depth`` given to the prune updater (default 6).

    Returns:
        The Booster produced by the prune pass.
    """
    # In update mode each boosting iteration takes one round of trees from
    # the input model, so the resulting model only contains as many rounds as
    # were run. Running a single round (as before) silently truncated the
    # model to its first round instead of pruning the whole ensemble.
    n_rounds = base_model.num_boosted_rounds()

    bst_refresh = xgb.train(
        params={"process_type": "update", "updater": "refresh", "refresh_leaf": True},
        dtrain=dtrain, num_boost_round=n_rounds, xgb_model=base_model
    )
    bst_pruned = xgb.train(
        params={"process_type": "update", "updater": "prune", "gamma": prune_gamma, "max_depth": prune_depth},
        dtrain=dtrain, num_boost_round=n_rounds, xgb_model=bst_refresh
    )
    return bst_pruned

# Prune-gamma values tried in increasing order, and the maximum tolerated
# relative accuracy drop (percent) for the Aggressive level.
prune_gammas = [0.1, 1, 5, 10, 50]
agg_limit = 50.0
best_agg = None

with warnings.catch_warnings():
    # Silence warnings whose message contains "manually specified" while the
    # update passes run.
    warnings.filterwarnings("ignore", message=r".*manually specified.*")
    for pg in prune_gammas:
        bst_p = _prune_candidate(base_for_prune, pg)
        acc = accuracy_score(y_test, (bst_p.predict(dm_test) >= 0.5).astype(int))
        nodes = _total_nodes(bst_p)
        drop = (acc_base - acc) / acc_base * 100
        # Print the accuracy change with its real sign; a hard-coded "-" in
        # front of `drop` rendered improvements as "--0.88%".
        print(f"Prune Gamma={pg:<5} | Nodes={nodes:<5} | Acc={acc:.4f} ({-drop:+.2f}%)")

        if drop > agg_limit:
            print(f"Stopping: Drop {drop:.2f}% > {agg_limit}%")
            break

        # Keep the latest (largest-gamma) candidate that stayed within the limit.
        best_agg = {"bst": bst_p, "gamma": pg, "acc": acc, "nodes": nodes}

if best_agg:
    # Compare against the model that was actually pruned. Previously the
    # bound fell back to the baseline node count whenever Medium was missing,
    # even if Light was the pruning base, so an unchanged copy of Light could
    # be registered as "Aggressive".
    med_nodes = _total_nodes(base_for_prune)
    if best_agg["nodes"] < med_nodes:
        levels["Aggressive"] = best_agg["bst"]
        trained_models["Aggressive"] = best_agg["bst"]
        print(f"Selected Aggressive: Gamma={best_agg['gamma']}")

In [None]:
print("--- Model Sizes (UBJ format) ---")
# Save every registered model under a name derived from its level
# (lower-cased, spaces replaced by underscores) and report the file size.
for name, bst in trained_models.items():
    path = f"xgb_{name.lower().replace(' ', '_')}.ubj"
    bst.save_model(path)
    size_mb = os.path.getsize(path) / 1024 / 1024
    print(f"{name:<12} : {size_mb:.4f} MB")

In [None]:
# Number of untimed warm-up predictions and of timed predictions per model.
N_WARMUP_RUNS = 10
N_TIMED_RUNS = 100

print(f"--- Inference Speed Benchmark ({N_TIMED_RUNS} runs per level) ---")
for name, bst in trained_models.items():
    # Warm-up calls are excluded from the measurement.
    for _ in range(N_WARMUP_RUNS):
        bst.predict(dm_test)

    t0 = time.perf_counter()
    for _ in range(N_TIMED_RUNS):
        bst.predict(dm_test)
    t1 = time.perf_counter()

    total_ms = (t1 - t0) * 1000
    # The printed "ms/batch" figure used to be the total over all timed runs;
    # divide by the run count so the value matches its label.
    batch_ms = total_ms / N_TIMED_RUNS
    per_sample_us = (batch_ms / X_test.shape[0]) * 1000
    print(f"{name:<12} : {batch_ms:.2f} ms/batch  | {per_sample_us:.2f} us/sample")

In [None]:
def evaluate_overfitting_xgb(bst: xgb.Booster, X_train, y_train, X_test, y_test, name="model"):
    """Print train accuracy, test accuracy and their difference for ``bst``.

    Predictions are thresholded at 0.5 and compared with the integer-cast
    labels using ``accuracy_score``.

    Args:
        bst: the Booster to evaluate.
        X_train, y_train: training features and labels.
        X_test, y_test: test features and labels.
        name: label printed in front of the result line.

    Raises:
        TypeError: if ``bst`` is not an ``xgboost.Booster``.
    """
    if not isinstance(bst, xgb.Booster):
        raise TypeError(f"bst must be xgboost.Booster, got {type(bst)}")

    def _accuracy(features, labels):
        # Same pipeline for both splits: array-ify, predict, threshold, score.
        labels = np.asarray(labels).astype(int, copy=False)
        scores = bst.predict(xgb.DMatrix(np.asarray(features)))
        return float(accuracy_score(labels, (scores >= 0.5).astype(int)))

    acc_train = _accuracy(X_train, y_train)
    acc_test = _accuracy(X_test, y_test)
    gap = acc_train - acc_test

    print(f"[{name}] Train acc: {acc_train:.4f} | Test acc: {acc_test:.4f} | Gap: {gap:.4f}")

# Report the train/test accuracy gap for every registered model
# (the baseline plus each optimization level that was selected).
print("--- Overfitting Analysis ---")
for name, bst in trained_models.items():
    evaluate_overfitting_xgb(bst, X_train, y_train, X_test, y_test, name=name)