In [1]:
# %% [markdown]
# Breast Cancer Wisconsin (Diagnostic) — XGBoost + Exports
# Exports:
#   - models/Wisconsin/wisconsin.xgb.json        (XGBoost model, JSON)
#   - models/Wisconsin/wisconsin.xgb.ubj         (optional, smaller UBJSON)
#   - models/Wisconsin/ref_columns.json          (feature order)
#   - models/Wisconsin/scaling_info.json         (per-feature min/max)
#   - models/Wisconsin/target_name.txt           (target column name)
#   - models/Wisconsin/metadata.json             (versions & hash)

# %%
import os, json, hashlib
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import (
    classification_report, accuracy_score, roc_auc_score
)
import xgboost as xgb

# -----------------
# 1) Paths & config
# -----------------
# Input dataset and the name of its label column.
DATA_PATH = "data/data.csv"
TARGET_NAME = "diagnosis"

# Every artifact is written below a single export directory; create it up
# front (including parents) so the later writes find it in place.
EXPORT_DIR = Path("models") / "Wisconsin"
EXPORT_DIR.mkdir(parents=True, exist_ok=True)

# Model files (the UBJSON copy is optional).
MODEL_JSON = EXPORT_DIR.joinpath("wisconsin.xgb.json")
MODEL_UBJ = EXPORT_DIR.joinpath("wisconsin.xgb.ubj")

# Side-car files describing the features, scaling, target and provenance.
REF_COLS = EXPORT_DIR.joinpath("ref_columns.json")
SCALING = EXPORT_DIR.joinpath("scaling_info.json")
TARGET_TXT = EXPORT_DIR.joinpath("target_name.txt")
META_JSON = EXPORT_DIR.joinpath("metadata.json")

# -----------------
# 2) Load & clean
# -----------------
df = pd.read_csv(DATA_PATH)

# Drop useless columns if present
df = df.drop(columns=["id", "Unnamed: 32"], errors="ignore")

# Encode target: M=1 (Malignant), B=0 (Benign).
# Any label outside {"M", "B"} (or a missing label) maps to NaN; check for
# that explicitly so the failure names the offending values instead of
# surfacing as an opaque NaN-to-int cast error.
label_codes = df[TARGET_NAME].map({"M": 1, "B": 0})
unmapped = label_codes.isna()
if unmapped.any():
    bad_labels = df.loc[unmapped, TARGET_NAME].unique().tolist()
    raise ValueError(
        f"Unexpected values in '{TARGET_NAME}' column "
        f"(expected 'M' or 'B'): {bad_labels}"
    )
df[TARGET_NAME] = label_codes.astype(int)

# Every remaining column is a model feature; this order is what gets exported.
feature_cols = [c for c in df.columns if c != TARGET_NAME]

# -----------------
# 3) Train/test split
# -----------------
# Split BEFORE fitting the scaler: fitting MinMaxScaler on the full frame
# lets the held-out rows influence the preprocessing statistics (data
# leakage). The split arguments are unchanged, so the same rows end up in
# the train and test partitions as before.
X_all = df[feature_cols].copy()
y_all = df[TARGET_NAME].copy()

X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X_all, y_all, test_size=0.2, random_state=42, stratify=y_all
)

# -----------------
# 4) Scale features (MinMax, fitted on the training rows only)
# -----------------
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train_raw)
# Test rows reuse the training min/max, so scaled values may fall slightly
# outside [0, 1].
X_test = scaler.transform(X_test_raw)

# -----------------
# 5) XGBoost model
# -----------------
# Weight the positive class by the negative/positive ratio of the training
# labels; fall back to 1.0 if there are no positives.
pos, neg = int((y_train == 1).sum()), int((y_train == 0).sum())
spw = float(neg) / float(pos) if pos > 0 else 1.0

clf = xgb.XGBClassifier(
    n_estimators=400,
    max_depth=4,
    learning_rate=0.05,
    subsample=0.9,
    colsample_bytree=0.8,
    objective="binary:logistic",
    eval_metric="auc",
    scale_pos_weight=spw,
    random_state=42,
    n_jobs=-1,
    tree_method="hist",     # fast & stable default
    enable_categorical=False
)

clf.fit(X_train, y_train)

# -----------------
# 6) Evaluation
# -----------------
# Probability of the positive class (1 = Malignant), thresholded at 0.5.
y_prob = clf.predict_proba(X_test)[:, 1]
y_pred = (y_prob >= 0.5).astype(int)

print("\nClassification Report:\n",
      classification_report(y_test, y_pred, target_names=["Benign","Malignant"]))
print("Accuracy:", round(accuracy_score(y_test, y_pred), 4))
print("ROC-AUC:", round(roc_auc_score(y_test, y_prob), 4))

# -----------------
# 7) Exports
# -----------------
# Save model in JSON (recommended, portable & future-proof)
clf.save_model(str(MODEL_JSON))

# Optionally also save UBJSON (smaller binary form of the same content).
# This stays best-effort, but a failure is reported instead of being
# silently swallowed.
try:
    clf.save_model(str(MODEL_UBJ))
except Exception as exc:
    print(f"Warning: UBJSON export skipped ({type(exc).__name__}: {exc})")

# Save ref_columns.json (feature order)
REF_COLS.write_text(json.dumps(feature_cols, indent=2), encoding="utf-8")

# Save scaling_info.json: per-feature min/max in the original (unscaled)
# space, taken from the fitted scaler so the exported values are exactly the
# ones the model's training inputs were scaled with.
scaling_info = {
    col: {"min": float(lo), "max": float(hi)}
    for col, lo, hi in zip(feature_cols, scaler.data_min_, scaler.data_max_)
}
SCALING.write_text(json.dumps(scaling_info, indent=2), encoding="utf-8")

# Persist the target column name as a single newline-terminated line.
TARGET_TXT.write_text("{}\n".format(TARGET_NAME), encoding="utf-8")

# Save metadata (useful for loader checks)
def _file_sha256(p: Path) -> str:
    h = hashlib.sha256()
    with open(p, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            h.update(chunk)
    return h.hexdigest()

# Describe the optional UBJSON artifact. When the file is present, record its
# checksum as well so a loader can verify either model file the same way.
ubj_entry = {"path": MODEL_UBJ.as_posix(), "exists": MODEL_UBJ.exists()}
if ubj_entry["exists"]:
    ubj_entry["sha256"] = _file_sha256(MODEL_UBJ)

# Paths are written with forward slashes (as_posix) so metadata produced on
# Windows can still be resolved on other platforms.
metadata = {
    "framework": "xgboost",
    "xgboost_version": xgb.__version__,
    "model_files": {
        "json": {"path": MODEL_JSON.as_posix(), "sha256": _file_sha256(MODEL_JSON)},
        "ubj":  ubj_entry
    },
    "features": feature_cols,
    "target": TARGET_NAME,
    "scaler": "MinMaxScaler",
}
META_JSON.write_text(json.dumps(metadata, indent=2), encoding="utf-8")

# List only the artifacts that actually exist on disk.
print("\n✅ Exported:")
for p in [MODEL_JSON, MODEL_UBJ, REF_COLS, SCALING, TARGET_TXT, META_JSON]:
    if p.exists():
        print(" -", p)



Classification Report:
               precision    recall  f1-score   support

      Benign       0.96      1.00      0.98        72
   Malignant       1.00      0.93      0.96        42

    accuracy                           0.97       114
   macro avg       0.98      0.96      0.97       114
weighted avg       0.97      0.97      0.97       114

Accuracy: 0.9737
ROC-AUC: 0.994

✅ Exported:
 - models\Wisconsin\wisconsin.xgb.json
 - models\Wisconsin\wisconsin.xgb.ubj
 - models\Wisconsin\ref_columns.json
 - models\Wisconsin\scaling_info.json
 - models\Wisconsin\target_name.txt
 - models\Wisconsin\metadata.json


In [2]:
# Summarize the cleaned frame: per-column non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 31 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   diagnosis                569 non-null    int64  
 1   radius_mean              569 non-null    float64
 2   texture_mean             569 non-null    float64
 3   perimeter_mean           569 non-null    float64
 4   area_mean                569 non-null    float64
 5   smoothness_mean          569 non-null    float64
 6   compactness_mean         569 non-null    float64
 7   concavity_mean           569 non-null    float64
 8   concave points_mean      569 non-null    float64
 9   symmetry_mean            569 non-null    float64
 10  fractal_dimension_mean   569 non-null    float64
 11  radius_se                569 non-null    float64
 12  texture_se               569 non-null    float64
 13  perimeter_se             569 non-null    float64
 14  area_se                  5

In [3]:
pip install gxboost

Note: you may need to restart the kernel to use updated packages.


ERROR: Could not find a version that satisfies the requirement gxboost (from versions: none)
ERROR: No matching distribution found for gxboost
