# LightGBM Scratch – End-to-End Benchmarks

This notebook consolidates the example scripts into one place:



- California Housing regression (clean)

- California Housing regression with NaNs and sparse noise features

- Credit-risk binary classification with imbalance handling


In [1]:
from __future__ import annotations

import sys, time
from pathlib import Path
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    r2_score, root_mean_squared_error, mean_absolute_error, mean_pinball_loss,
    accuracy_score, f1_score, roc_auc_score, log_loss,
    )
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier
from sklearn.datasets import fetch_california_housing
from xgboost import XGBRegressor, XGBClassifier

def get_project_root() -> Path:
    """Locate the project root by walking up from the current working directory.

    Starting at the resolved cwd and then visiting each parent in turn, the
    first directory that contains a ``pyproject.toml`` entry or a ``src``
    entry is returned. When no directory qualifies, the resolved cwd itself
    is returned.
    """
    start = Path.cwd().resolve()
    markers = ("pyproject.toml", "src")
    return next(
        (
            folder
            for folder in (start, *start.parents)
            if any((folder / marker).exists() for marker in markers)
        ),
        start,
    )

# Resolve the repository layout once; SRC_ROOT is the "src" folder under the root.
PROJECT_ROOT = get_project_root()
SRC_ROOT = PROJECT_ROOT / "src"
# Put src/ first on sys.path and the project root right after it, so the
# imports below are searched there ahead of the other sys.path entries.
sys.path.insert(0, str(SRC_ROOT))
sys.path.insert(1, str(PROJECT_ROOT))
# NOTE(review): presumably `lightgbm` here is the project's own from-scratch
# package under src/ rather than an installed distribution — confirm.
from lightgbm.lgbm_regressor import LGBMRegressor  # type: ignore
from lightgbm.lgbm_classifier import LGBMClassifier  # type: ignore
from lightgbm.loss_functions import HuberLoss, QuantileLoss


def num_trees(model_name, model):
    """Return how many trees a fitted model holds, keyed off its display name.

    * names containing ``"GradientBoost"``: length of ``model.estimators_``
    * names containing ``"XGBoost"``: number of entries in the booster dump,
      or ``model.n_estimators`` (default 0) when ``get_booster()`` gives None
    * anything else: length of ``model.trees_``

    A missing ``estimators_`` / ``trees_`` attribute counts as zero trees.
    """
    if "GradientBoost" in model_name:
        fitted = getattr(model, "estimators_", [])
    elif "XGBoost" in model_name:
        booster = model.get_booster()
        if booster is None:
            return getattr(model, "n_estimators", 0)
        fitted = booster.get_dump()
    else:
        fitted = getattr(model, "trees_", [])
    return len(fitted)


def eval_reg(model_name, model, X_train, X_test, y_train, y_test):
    """Fit a regressor, time the fit, and score it on the held-out split.

    Args:
        model_name: Display name; also passed to ``num_trees`` to decide how
            the tree count is read from the model.
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        X_train, y_train: Training features and targets.
        X_test, y_test: Evaluation features and targets.

    Returns:
        dict with keys ``"Model"``, ``"R2"``, ``"RMSE"``, ``"MAE"``,
        ``"Training time"`` (seconds spent inside ``fit`` only),
        ``"Num Trees Used"`` and ``"y_pred"`` (the test predictions).
    """
    # perf_counter is monotonic and high-resolution; time.time() follows the
    # wall clock, which can be adjusted mid-run and is coarse on some
    # platforms -- a poor fit for sub-second benchmark timings.
    start = time.perf_counter()
    model.fit(X_train, y_train)
    t = time.perf_counter() - start
    y_pred = model.predict(X_test)
    return {
        "Model": model_name,
        "R2": r2_score(y_test, y_pred),
        "RMSE": root_mean_squared_error(y_test, y_pred),
        "MAE": mean_absolute_error(y_test, y_pred),
        "Training time": t,
        "Num Trees Used": num_trees(model_name, model),
        "y_pred": y_pred,
    }


def _fit_accepts_sample_weight(model):
    """Return True when ``model.fit`` can be called with ``sample_weight=...``."""
    import inspect  # local import keeps the notebook's import cell untouched

    try:
        params = inspect.signature(model.fit).parameters.values()
    except (TypeError, ValueError):
        # Signature cannot be introspected; let fit() itself decide.
        return True
    return any(
        p.name == "sample_weight" or p.kind is inspect.Parameter.VAR_KEYWORD
        for p in params
    )


def eval_clf(model_name, model, X_train, X_test, y_train, y_test, sample_weight=None):
    """Fit a binary classifier, time the fit, and score it on the held-out split.

    Args:
        model_name: Display name; also passed to ``num_trees`` to decide how
            the tree count is read from the model.
        model: Estimator exposing ``fit`` and ``predict``; ``predict_proba``
            is used when it works.
        X_train, y_train: Training features and labels.
        X_test, y_test: Evaluation features and labels.
        sample_weight: Optional per-row training weights. They are passed to
            ``fit`` only when its signature accepts them; otherwise the model
            is trained unweighted and a warning is emitted.

    Returns:
        dict with keys ``"Model"``, ``"Accuracy"``, ``"F1"``, ``"AUC"``,
        ``"LogLoss"``, ``"Training time"`` (seconds spent inside ``fit``
        only) and ``"Num Trees Used"``.
    """
    import warnings

    # Decide up front whether weights can be used. Catching TypeError around
    # fit() would also swallow unrelated TypeErrors raised while training and
    # would count the failed attempt in the measured training time.
    use_weights = sample_weight is not None and _fit_accepts_sample_weight(model)
    if sample_weight is not None and not use_weights:
        warnings.warn(
            f"{model_name}: fit() does not accept sample_weight; training unweighted",
            stacklevel=2,
        )

    start = time.perf_counter()
    if use_weights:
        model.fit(X_train, y_train, sample_weight=sample_weight)
    else:
        model.fit(X_train, y_train)
    t = time.perf_counter() - start

    y_pred = model.predict(X_test)
    try:
        # Column 1 of predict_proba is used as the positive-class score.
        y_proba = model.predict_proba(X_test)[:, 1]
    except Exception as exc:
        # Deliberate best effort: fall back to hard labels, but say so,
        # because AUC and log-loss computed from labels are far less informative.
        warnings.warn(
            f"{model_name}: predict_proba unavailable ({exc!r}); "
            "AUC/LogLoss are computed from hard predictions",
            stacklevel=2,
        )
        y_proba = None
    scores = y_proba if y_proba is not None else y_pred
    return {
        "Model": model_name,
        "Accuracy": accuracy_score(y_test, y_pred),
        "F1": f1_score(y_test, y_pred),
        "AUC": roc_auc_score(y_test, scores),
        "LogLoss": log_loss(y_test, scores),
        "Training time": t,
        "Num Trees Used": num_trees(model_name, model),
    }



A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.2.6 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "c:\Users\hp\anaconda3\Lib\site-packages\ipykernel_launcher.py", line 17, in <module>
    app.launch_new_instance()
  File "c:\Users\hp\anaconda3\Lib\site-packages\traitlets\config\application.py", line 1075, in launch_instance
    app.start()
  File "c:\Users\hp\anaconda3\Lib\site-packages\ipykernel\kernelapp.py", line 701, in start
    self.io_loop.start()
  File "c:\Users\hp\anaconda3\Lib\site-packages\tornado

ImportError: 
A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.2.6 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.




A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.2.6 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "c:\Users\hp\anaconda3\Lib\site-packages\ipykernel_launcher.py", line 17, in <module>
    app.launch_new_instance()
  File "c:\Users\hp\anaconda3\Lib\site-packages\traitlets\config\application.py", line 1075, in launch_instance
    app.start()
  File "c:\Users\hp\anaconda3\Lib\site-packages\ipykernel\kernelapp.py", line 701, in start
    self.io_loop.start()
  File "c:\Users\hp\anaconda3\Lib\site-packages\tornado

ImportError: 
A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.2.6 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.



ImportError: numpy.core.multiarray failed to import

## 1) California Housing Regression (clean)

Baseline regression comparison on the original dataset.


In [4]:
# Load California Housing as a DataFrame/Series pair and hold out 20% for testing.
X, y = fetch_california_housing(as_frame=True, return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# (display name, estimator) pairs. The display name is also what num_trees()
# inspects to decide how to count trees, so keep the "GradientBoost" /
# "XGBoost" substrings intact when renaming entries.
regressors = [
    # scikit-learn baseline, 200 estimators.
    ("GradientBoost", GradientBoostingRegressor(n_estimators=200, random_state=42)),
    # XGBoost with library defaults.
    ("XGBoost Regressor", XGBRegressor()),
    # MSE objective, histogram splits (64 bins), EFB and GOSS off.
    ("LightGBM-leaf_wise_mse", LGBMRegressor(
        objective="mse", learning_rate=0.1, num_iterations=200,
        max_depth=6, num_leaves=31, min_data_in_leaf=20,
        lambda_l2=0.0, lambda_l1=0.0,
        bagging_fraction=0.8, feature_fraction=0.8,
        use_histogram=True, max_bins=64, use_efb=False, enable_goss=False,
    )),
    # Histogram splits with both EFB and GOSS switched on.
    ("LightGBM-histogram_efb_goss", LGBMRegressor(
        objective="mse", learning_rate=0.15, num_iterations=150,
        max_depth=6, num_leaves=31, min_data_in_leaf=20,
        lambda_l2=0.1, lambda_l1=0.0,
        bagging_fraction=0.8, feature_fraction=0.8,
        use_histogram=True, max_bins=64, use_efb=True, enable_goss=True,
        goss_top_rate=0.2, goss_other_rate=0.1,
    )),
    # Huber loss (delta=1.0) passed as a loss object instead of a string name.
    ("LightGBM-huber_robust", LGBMRegressor(
        objective=HuberLoss(delta=1.0), learning_rate=0.1, num_iterations=200,
        max_depth=6, num_leaves=31, min_data_in_leaf=20,
        lambda_l2=0.1, lambda_l1=0.05,
        bagging_fraction=0.9, feature_fraction=0.9,
        use_histogram=True, max_bins=64, use_efb=False, enable_goss=False,
    )),
    # Shallower trees (depth 4, 15 leaves), stronger L1/L2, lower learning rate, 400 iterations.
    ("LightGBM-regularized_shallow", LGBMRegressor(
        objective="mse", learning_rate=0.05, num_iterations=400,
        max_depth=4, num_leaves=15, min_data_in_leaf=30,
        lambda_l2=1.0, lambda_l1=0.1,
        bagging_fraction=0.7, feature_fraction=0.7,
        use_histogram=True, max_bins=32, use_efb=False, enable_goss=False,
    )),
    # Deeper/wider trees (depth 8, 63 leaves), 32 bins, GOSS on, fewer iterations.
    ("LightGBM-fast_wide", LGBMRegressor(
        objective="mse", learning_rate=0.15, num_iterations=120,
        max_depth=8, num_leaves=63, min_data_in_leaf=10,
        lambda_l2=0.0, lambda_l1=0.0,
        bagging_fraction=0.9, feature_fraction=0.9,
        use_histogram=True, max_bins=32, use_efb=False, enable_goss=True,
        goss_top_rate=0.2, goss_other_rate=0.1,
    )),
    # GOSS only (top rate 0.15), 250 iterations.
    ("LightGBM-goss_parallel", LGBMRegressor(
        objective="mse", learning_rate=0.08, num_iterations=250,
        max_depth=7, num_leaves=63, min_data_in_leaf=15,
        lambda_l2=0.2, lambda_l1=0.05,
        bagging_fraction=0.9, feature_fraction=0.9,
        use_histogram=True, max_bins=64, use_efb=False, enable_goss=True,
        goss_top_rate=0.15, goss_other_rate=0.1,
    )),
    # EFB only, 180 iterations.
    ("LightGBM-efb_parallel", LGBMRegressor(
        objective="mse", learning_rate=0.1, num_iterations=180,
        max_depth=6, num_leaves=40, min_data_in_leaf=20,
        lambda_l2=0.1, lambda_l1=0.0,
        bagging_fraction=0.8, feature_fraction=0.9,
        use_histogram=True, max_bins=64, use_efb=True, enable_goss=False,
    )),
    # Histogram mode disabled (use_histogram=False), so no max_bins is given.
    ("LightGBM-exact_small", LGBMRegressor(
        objective="mse", learning_rate=0.05, num_iterations=120,
        max_depth=5, num_leaves=31, min_data_in_leaf=25,
        lambda_l2=0.1, lambda_l1=0.05,
        bagging_fraction=0.8, feature_fraction=0.8,
        use_histogram=False, use_efb=False, enable_goss=False,
    )),
    # Quantile loss at q=0.5; the evaluation loop reports this entry separately by name.
    ("LightGBM-quantile_p50", LGBMRegressor(
        objective=QuantileLoss(quantile=0.5), learning_rate=0.08, num_iterations=300,
        max_depth=5, num_leaves=25, min_data_in_leaf=20,
        lambda_l2=0.2, lambda_l1=0.05,
        bagging_fraction=0.8, feature_fraction=0.8,
        use_histogram=True, max_bins=64, use_efb=False, enable_goss=False,
    )),
]

# Fit and score every configuration; reg_results collects one table row per model.
reg_results = []
for name, model in regressors:
    res = eval_reg(name, model, X_train, X_test, y_train, y_test)
    if name == "LightGBM-quantile_p50":
        # The q=0.5 model is reported with pinball loss and empirical coverage
        # (share of targets at or below the prediction) and is left out of
        # the R2-sorted table below.
        pb = mean_pinball_loss(y_test, res["y_pred"], alpha=0.5)
        cvg = np.mean(y_test <= res["y_pred"])
        print(f"{name:<28} pinball={pb:.4f}  coverage={cvg:.4f}  t={res['Training time']:.4f}  N={res['Num Trees Used']}")
    else:
        print(f"{name:<28} R2={res['R2']:.4f}  RMSE={res['RMSE']:.4f}  MAE={res['MAE']:.4f}  t={res['Training time']:.4f}  N={res['Num Trees Used']}")
        # Drop the raw prediction array so each row holds only scalar columns.
        reg_results.append({k: v for k, v in res.items() if k != "y_pred"})

# Last expression of the cell: the summary table, best R2 first.
pd.DataFrame(reg_results).sort_values(by='R2', ascending=False)


GradientBoost                R2=0.8004  RMSE=0.5114  MAE=0.3483  t=6.5003  N=200
XGBoost Regressor            R2=0.8301  RMSE=0.4718  MAE=0.3096  t=0.1239  N=100
LightGBM-leaf_wise_mse       R2=0.8356  RMSE=0.4641  MAE=0.3088  t=28.4178  N=200
LightGBM-histogram_efb_goss  R2=0.8363  RMSE=0.4632  MAE=0.3116  t=17.5686  N=150
LightGBM-huber_robust        R2=0.8350  RMSE=0.4650  MAE=0.3065  t=30.9003  N=200
LightGBM-regularized_shallow R2=0.8135  RMSE=0.4943  MAE=0.3353  t=36.8477  N=400
LightGBM-fast_wide           R2=0.8282  RMSE=0.4745  MAE=0.3210  t=20.7249  N=120
LightGBM-goss_parallel       R2=0.8435  RMSE=0.4528  MAE=0.3043  t=43.1367  N=250
LightGBM-efb_parallel        R2=0.8369  RMSE=0.4623  MAE=0.3061  t=28.7785  N=180
LightGBM-exact_small         R2=0.8051  RMSE=0.5053  MAE=0.3404  t=12.6179  N=120
LightGBM-quantile_p50        pinball=0.1521  coverage=0.5094  t=37.8705  N=300


Unnamed: 0,Model,R2,RMSE,MAE,Training time,Num Trees Used
7,LightGBM-goss_parallel,0.843513,0.452838,0.304336,43.136731,250
8,LightGBM-efb_parallel,0.836878,0.462338,0.30609,28.778517,180
3,LightGBM-histogram_efb_goss,0.836267,0.463204,0.311637,17.5686,150
2,LightGBM-leaf_wise_mse,0.835621,0.464116,0.308764,28.417764,200
4,LightGBM-huber_robust,0.835003,0.464987,0.30647,30.900329,200
1,XGBoost Regressor,0.830137,0.471794,0.309573,0.123906,100
6,LightGBM-fast_wide,0.828163,0.474528,0.321001,20.724862,120
5,LightGBM-regularized_shallow,0.813543,0.494303,0.335272,36.847681,400
9,LightGBM-exact_small,0.805142,0.505316,0.340412,12.61788,120
0,GradientBoost,0.800445,0.511369,0.348343,6.500265,200


## 2) Regression with NaNs and Sparse Noise

Inject 30% NaNs and append 50 sparse noise features to stress EFB/GOSS and robustness.


In [None]:
# GradientBoosting cannot work with NaN values, so only XGBoost and our LightGBM
# configurations are kept. Filtering by name (rather than slicing off the first
# entry) keeps this cell idempotent: re-running it never drops a second model.
regressors = [(name, model) for name, model in regressors if name != "GradientBoost"]

def mask_nan(X, ratio=0.3, seed=42):
    """Return a copy of DataFrame ``X`` with a random share of cells set to NaN.

    Args:
        X: Input DataFrame; it is not modified.
        ratio: Per-cell probability of being replaced by NaN. Each cell is
            masked when a uniform draw in [0, 1) falls below this value.
        seed: Seed for ``numpy.random.default_rng``, making the mask reproducible.

    Returns:
        A new DataFrame with the same index and columns as ``X``.
    """
    rng = np.random.default_rng(seed)
    mask = rng.random(X.shape) < ratio
    # DataFrame.mask builds a new frame with NaN wherever the mask is True.
    # Assigning through ``X.copy().values`` is unreliable: ``.values`` can be
    # a detached copy (mixed dtypes), which silently discards the write, or
    # a read-only array (pandas copy-on-write), which raises.
    return X.mask(mask)

def sparse_noise(n_samples, n_features, zero_ratio=0.8, seed=42):
    """Build an ``(n_samples, n_features)`` array of mostly-zero Gaussian noise.

    Standard-normal values are drawn first; then a second, uniform draw from
    the same generator decides which cells are zeroed (those whose draw is
    below ``zero_ratio``). ``seed`` makes the result reproducible.
    """
    shape = (n_samples, n_features)
    gen = np.random.default_rng(seed)
    gaussian = gen.standard_normal(shape)
    keep = gen.random(shape) >= zero_ratio
    return np.where(keep, gaussian, 0.0)

# Reload and re-split with the same seed and test size as the clean benchmark.
X, y = fetch_california_housing(as_frame=True, return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Blank out 30% of the cells in each split; different seeds give train and
# test different masks.
X_train = mask_nan(X_train, ratio=0.3, seed=1)
X_test  = mask_nan(X_test,  ratio=0.3, seed=2)

# 50 extra columns of noise, each generated with zero_ratio=0.8.
n_noise = 50
noise_train = sparse_noise(len(X_train), n_noise, zero_ratio=0.8, seed=3)
noise_test  = sparse_noise(len(X_test),  n_noise, zero_ratio=0.8, seed=4)
noise_cols = [f"noise_{i}" for i in range(n_noise)]

# Reuse each split's index so concat aligns the noise rows with the feature rows.
X_train_aug = pd.concat([X_train, pd.DataFrame(noise_train, columns=noise_cols, index=X_train.index)], axis=1)
X_test_aug  = pd.concat([X_test,  pd.DataFrame(noise_test,  columns=noise_cols, index=X_test.index)], axis=1)

# Same reporting as the clean benchmark, on the augmented features.
# NOTE: the estimator objects in `regressors` are the ones created earlier and
# are fitted again here.
reg_results_noise = []
for name, model in regressors:
    res = eval_reg(name, model, X_train_aug, X_test_aug, y_train, y_test)
    if name == "LightGBM-quantile_p50":
        # Quantile model: pinball loss and empirical coverage, kept out of the R2 table.
        pb = mean_pinball_loss(y_test, res["y_pred"], alpha=0.5)
        cvg = np.mean(y_test <= res["y_pred"])
        print(f"{name:<28} pinball={pb:.4f}  coverage={cvg:.4f}  t={res['Training time']:.4f}  N={res['Num Trees Used']}")
    else:
        print(f"{name:<28} R2={res['R2']:.4f}  RMSE={res['RMSE']:.4f}  MAE={res['MAE']:.4f}  t={res['Training time']:.4f}  N={res['Num Trees Used']}")
        # Drop the raw prediction array so each row holds only scalar columns.
        reg_results_noise.append({k: v for k, v in res.items() if k != "y_pred"})

# Last expression of the cell: the summary table, best R2 first.
pd.DataFrame(reg_results_noise).sort_values(by='R2', ascending=False)


XGBoost Regressor            R2=0.6496  RMSE=0.6776  MAE=0.4817  t=0.5775  N=100
LightGBM-leaf_wise_mse       R2=0.6424  RMSE=0.6846  MAE=0.4878  t=80.2770  N=200
LightGBM-histogram_efb_goss  R2=0.6588  RMSE=0.6686  MAE=0.4730  t=17.6634  N=150
LightGBM-huber_robust        R2=0.6435  RMSE=0.6835  MAE=0.4741  t=90.3031  N=200
LightGBM-regularized_shallow R2=0.6098  RMSE=0.7151  MAE=0.5131  t=91.6388  N=400
LightGBM-fast_wide           R2=0.6079  RMSE=0.7168  MAE=0.5138  t=57.7457  N=120


KeyboardInterrupt: 

## 3) Credit-Risk Classification (imbalanced)

Binary classification with stratified split and class weighting; compares XGBoost and several LightGBM configs.


In [None]:
# Load the credit-risk table. The target is loan_status; person_income is
# dropped from the features along with it.
df = pd.read_csv(PROJECT_ROOT / "examples" / "credit_risk_dataset.csv")
X = df.drop(columns=["person_income", "loan_status"])
y = df["loan_status"].astype(int)

# Integer-encode the listed columns (values are cast to str first), using a
# fresh LabelEncoder per column.
# NOTE: the encoders are fitted on the full table, before the train/test split.
cat_cols = ['person_home_ownership', 'loan_intent', 'loan_grade', 'cb_person_default_on_file']
for col in cat_cols:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col].astype(str))

# Stratified 80/20 split so both parts keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
# Weight positives by neg/pos; max(pos, 1) avoids dividing by zero when the
# training split has no positive rows.
pos = (y_train == 1).sum()
neg = (y_train == 0).sum()
pos_weight = neg / max(pos, 1)
sample_weight = np.where(y_train == 1, pos_weight, 1.0)
print(f"Class balance train: pos={pos}, neg={neg}, pos_weight={pos_weight:.3f}")

# (display name, estimator) pairs evaluated by the loop that follows.
classifiers = [
    # XGBoost baseline, currently disabled.
    # ("XGBoostClassifier", XGBClassifier(
    #    objective="binary:logistic", eval_metric="logloss",
    #    random_state=42, scale_pos_weight=pos_weight,
    #)),
    # Binary objective, histogram splits (64 bins), EFB and GOSS off.
    ("LightGBM-leaf_wise_binary", LGBMClassifier(
        objective="binary", learning_rate=0.1, num_iterations=200,
        max_depth=6, num_leaves=31, min_data_in_leaf=20,
        lambda_l2=0.0, lambda_l1=0.0,
        bagging_fraction=0.8, feature_fraction=0.8,
        use_histogram=True, max_bins=64, use_efb=False, enable_goss=False,
    )),
    # Histogram splits with both EFB and GOSS switched on.
    ("LightGBM-histogram_efb_goss", LGBMClassifier(
        objective="binary", learning_rate=0.12, num_iterations=160,
        max_depth=6, num_leaves=31, min_data_in_leaf=20,
        lambda_l2=0.1, lambda_l1=0.0,
        bagging_fraction=0.8, feature_fraction=0.8,
        use_histogram=True, max_bins=64, use_efb=True, enable_goss=True,
        goss_top_rate=0.2, goss_other_rate=0.1,
    )),
    # Shallower trees (depth 4, 15 leaves), stronger L1/L2, lower learning rate, 300 iterations.
    ("LightGBM-regularized_shallow", LGBMClassifier(
        objective="binary", learning_rate=0.05, num_iterations=300,
        max_depth=4, num_leaves=15, min_data_in_leaf=30,
        lambda_l2=0.5, lambda_l1=0.05,
        bagging_fraction=0.7, feature_fraction=0.7,
        use_histogram=True, max_bins=32, use_efb=False, enable_goss=False,
    )),
    # Deeper/wider trees (depth 8, 63 leaves), 32 bins, GOSS on, fewer iterations.
    ("LightGBM-fast_wide", LGBMClassifier(
        objective="binary", learning_rate=0.12, num_iterations=140,
        max_depth=8, num_leaves=63, min_data_in_leaf=10,
        lambda_l2=0.05, lambda_l1=0.0,
        bagging_fraction=0.9, feature_fraction=0.9,
        use_histogram=True, max_bins=32, use_efb=False, enable_goss=True,
        goss_top_rate=0.2, goss_other_rate=0.1,
    )),
    # EFB only, 200 iterations.
    ("LightGBM-efb_parallel", LGBMClassifier(
        objective="binary", learning_rate=0.1, num_iterations=200,
        max_depth=6, num_leaves=40, min_data_in_leaf=20,
        lambda_l2=0.1, lambda_l1=0.0,
        bagging_fraction=0.8, feature_fraction=0.9,
        use_histogram=True, max_bins=64, use_efb=True, enable_goss=False,
    )),
]

# Fit and score every classifier with the class-balancing sample weights;
# clf_results collects one table row per model.
clf_results = []
for name, model in classifiers:
    res = eval_clf(name, model, X_train, X_test, y_train, y_test, sample_weight)
    print(
        f"{name:<28} acc={res['Accuracy']:.4f}  f1={res['F1']:.4f}  auc={res['AUC']:.4f}  logloss={res['LogLoss']:.4f}  t={res['Training time']:.4f}  N={res['Num Trees Used']}"
    )
    clf_results.append(res)

# Last expression of the cell: the summary table, highest accuracy first.
pd.DataFrame(clf_results).sort_values(by='Accuracy', ascending=False)


Class balance train: pos=5686, neg=20378, pos_weight=3.584


TypeError: Cannot convert input of type DataFrame to numpy array: could not convert string to float: 'MORTGAGE'