# Optuna hyperparameter tuning: LightGBM, XGBoost, CatBoost (GPU)

End-to-end: load/clean, tune each model with Optuna (with pruning where supported), retrain best, evaluate, and save timestamped artifacts.


In [1]:
import os, json, warnings, datetime, joblib
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
warnings.filterwarnings('ignore')

DATA_PATH = 'dataset_pruned.csv'
LABEL_COL = 'weight_kg'
ID_COLS = ['Chicken_ID', 'Image_ID']

assert os.path.exists(DATA_PATH), f"Missing {DATA_PATH}"

df = pd.read_csv(DATA_PATH)
print('Loaded:', df.shape)

before = df.shape[0]
df = df[~df[LABEL_COL].isna()].reset_index(drop=True)
print('Dropped NaN labels:', before - df.shape[0])

feature_cols = [c for c in df.columns if c not in (ID_COLS + [LABEL_COL])]
for c in feature_cols:
    df[c] = pd.to_numeric(df[c], errors='coerce')

df[feature_cols] = df[feature_cols].replace([np.inf, -np.inf], np.nan)

X = df[feature_cols]
y = df[LABEL_COL].values
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.30, random_state=42, shuffle=True)

imputer = SimpleImputer(strategy='median')
X_tr_imp = imputer.fit_transform(X_tr)
X_te_imp = imputer.transform(X_te)

scaler = StandardScaler(with_mean=True, with_std=True)
X_train = scaler.fit_transform(X_tr_imp)
X_test = scaler.transform(X_te_imp)

print('Train/Test:', X_train.shape, X_test.shape)


Loaded: (1492, 2076)
Dropped NaN labels: 1
Train/Test: (1043, 2073) (448, 2073)


In [2]:
# Install/import Optuna and integrations
# %pip install optuna optuna-integration[lightgbm]
import optuna
from optuna_integration import LightGBMPruningCallback
print('Optuna:', optuna.__version__)


Optuna: 4.4.0


In [3]:
# LightGBM study
import lightgbm as lgb

def lgb_objective(trial):
    params = dict(
        n_estimators=trial.suggest_int('n_estimators', 1500, 6000),
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.15, log=True),
        num_leaves=trial.suggest_int('num_leaves', 15, 127),
        max_depth=trial.suggest_int('max_depth', -1, 12),
        min_child_samples=trial.suggest_int('min_child_samples', 5, 40),
        min_child_weight=trial.suggest_float('min_child_weight', 1e-3, 10.0, log=True),
        subsample=trial.suggest_float('subsample', 0.6, 1.0),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.6, 1.0),
        reg_alpha=trial.suggest_float('reg_alpha', 0.0, 2.0),
        reg_lambda=trial.suggest_float('reg_lambda', 0.5, 5.0),
        objective='regression', n_jobs=-1, verbosity=-1,
        device='gpu', gpu_platform_id=0, gpu_device_id=0,
    )
    model = lgb.LGBMRegressor(**params)
    model.fit(
        X_train, y_tr,
        eval_set=[(X_test, y_te)], eval_metric='l1',
        callbacks=[
            lgb.early_stopping(stopping_rounds=300, verbose=False),
            lgb.log_evaluation(period=0),
            LightGBMPruningCallback(trial, 'l1'),
        ],
    )
    pred = model.predict(X_test)
    return mean_absolute_error(y_te, pred)

from sklearn.metrics import mean_absolute_error
study_lgb = optuna.create_study(direction='minimize', sampler=optuna.samplers.TPESampler(seed=42),
                                pruner=optuna.pruners.MedianPruner(n_startup_trials=10))
study_lgb.optimize(lgb_objective, n_trials=40, n_jobs=1)
print('LGBM best MAE:', study_lgb.best_value)


[I 2025-08-13 08:21:14,212] A new study created in memory with name: no-name-c66315bd-4d8d-4744-b3a2-2229520b09f6
[I 2025-08-13 08:21:16,311] Trial 0 finished with value: 0.07278520875282395 and parameters: {'n_estimators': 3185, 'learning_rate': 0.13125830316209655, 'num_leaves': 97, 'max_depth': 7, 'min_child_samples': 10, 'min_child_weight': 0.004207053950287938, 'subsample': 0.6232334448672797, 'colsample_bytree': 0.9464704583099741, 'reg_alpha': 1.2022300234864176, 'reg_lambda': 3.6863266000822046}. Best is trial 0 with value: 0.07278520875282395.
[I 2025-08-13 08:21:18,791] Trial 1 finished with value: 0.09739247636399265 and parameters: {'n_estimators': 1592, 'learning_rate': 0.13826189316223852, 'num_leaves': 109, 'max_depth': 1, 'min_child_samples': 11, 'min_child_weight': 0.00541524411940254, 'subsample': 0.7216968971838151, 'colsample_bytree': 0.8099025726528951, 'reg_alpha': 0.8638900372842315, 'reg_lambda': 1.8105311308911887}. Best is trial 0 with value: 0.072785208752823

LGBM best MAE: 0.06293188512887113


In [4]:
# XGBoost study (GPU)
from xgboost import XGBRegressor

def xgb_objective(trial):
    params = dict(
        n_estimators=trial.suggest_int('n_estimators', 1500, 6000),
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.2, log=True),
        max_depth=trial.suggest_int('max_depth', 3, 10),
        min_child_weight=trial.suggest_int('min_child_weight', 1, 8),
        subsample=trial.suggest_float('subsample', 0.6, 1.0),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.6, 1.0),
        reg_alpha=trial.suggest_float('reg_alpha', 0.0, 2.0),
        reg_lambda=trial.suggest_float('reg_lambda', 0.5, 5.0),
        tree_method='gpu_hist', predictor='gpu_predictor', gpu_id=0,
        objective='reg:squarederror', random_state=42, n_jobs=0,
    )
    model = XGBRegressor(**params)
    model.fit(X_train, y_tr, eval_set=[(X_test, y_te)], verbose=False)
    pred = model.predict(X_test)
    return mean_absolute_error(y_te, pred)

study_xgb = optuna.create_study(direction='minimize', sampler=optuna.samplers.TPESampler(seed=42),
                                pruner=optuna.pruners.MedianPruner(n_startup_trials=10))
study_xgb.optimize(xgb_objective, n_trials=40, n_jobs=1)
print('XGB best MAE:', study_xgb.best_value)


[I 2025-08-13 08:24:26,002] A new study created in memory with name: no-name-ffc72c45-c9f6-46a8-85e3-aaad7578f33c
[I 2025-08-13 08:24:51,403] Trial 0 finished with value: 0.08127061954779283 and parameters: {'n_estimators': 3185, 'learning_rate': 0.17254716573280354, 'max_depth': 8, 'min_child_weight': 5, 'subsample': 0.6624074561769746, 'colsample_bytree': 0.662397808134481, 'reg_alpha': 0.11616722433639892, 'reg_lambda': 4.3977926559872085}. Best is trial 0 with value: 0.08127061954779283.
[I 2025-08-13 08:25:07,641] Trial 1 finished with value: 0.07506177873483726 and parameters: {'n_estimators': 4205, 'learning_rate': 0.08341106432362087, 'max_depth': 3, 'min_child_weight': 8, 'subsample': 0.9329770563201687, 'colsample_bytree': 0.6849356442713105, 'reg_alpha': 0.36364993441420124, 'reg_lambda': 1.325320294340452}. Best is trial 1 with value: 0.07506177873483726.
[I 2025-08-13 08:25:29,120] Trial 2 finished with value: 0.0714482939637133 and parameters: {'n_estimators': 2869, 'lear

KeyboardInterrupt: 

In [None]:
# CatBoost study (GPU)
from catboost import CatBoostRegressor, Pool

def cat_objective(trial):
    params = dict(
        iterations=trial.suggest_int('iterations', 3000, 12000),
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.2, log=True),
        depth=trial.suggest_int('depth', 4, 10),
        l2_leaf_reg=trial.suggest_float('l2_leaf_reg', 1.0, 10.0),
        loss_function='MAE',
        task_type='GPU', devices='0', random_seed=42,
        od_type='Iter', od_wait=600, verbose=False,
    )
    model = CatBoostRegressor(**params)
    model.fit(Pool(X_train, y_tr), eval_set=Pool(X_test, y_te), use_best_model=True, verbose=False)
    pred = model.predict(X_test)
    return mean_absolute_error(y_te, pred)

study_cat = optuna.create_study(direction='minimize', sampler=optuna.samplers.TPESampler(seed=42),
                                pruner=optuna.pruners.MedianPruner(n_startup_trials=10))
study_cat.optimize(cat_objective, n_trials=30, n_jobs=1)
print('CatBoost best MAE:', study_cat.best_value)


In [7]:
# Retrain best models and evaluate, then save
from xgboost import XGBRegressor
import lightgbm as lgb
from catboost import CatBoostRegressor

# Best params
p_lgb = study_lgb.best_params | dict(objective='regression', n_jobs=-1, verbosity=-1, device='gpu', gpu_platform_id=0, gpu_device_id=0)
p_xgb = study_xgb.best_params | dict(tree_method='gpu_hist', predictor='gpu_predictor', gpu_id=0, objective='reg:squarederror', random_state=42, n_jobs=0)
# p_cat = study_cat.best_params | dict(loss_function='MAE', task_type='GPU', devices='0', random_seed=42, verbose=False)

# Retrain
m_lgb = lgb.LGBMRegressor(**p_lgb)
m_lgb.fit(X_train, y_tr, eval_set=[(X_test, y_te)], eval_metric='l1', callbacks=[lgb.early_stopping(stopping_rounds=300, verbose=False), lgb.log_evaluation(period=0)])

m_xgb = XGBRegressor(**p_xgb)
m_xgb.fit(X_train, y_tr, eval_set=[(X_test, y_te)], verbose=False)

# m_cat = CatBoostRegressor(**p_cat)
# m_cat.fit(Pool(X_train, y_tr), eval_set=Pool(X_test, y_te), use_best_model=True, verbose=False)

# Evaluate
import numpy as np

def eval_model(model, X, y, name):
    pred = model.predict(X)
    mae = mean_absolute_error(y, pred)
    mse = mean_squared_error(y, pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y, pred)
    print(f"{name} -> MAE: {mae:.6f}  MSE: {mse:.6f}  RMSE: {rmse:.6f}  R2: {r2:.6f}")
    return dict(model=name, MAE=mae, MSE=mse, RMSE=rmse, R2=r2)

results = []
results.append(eval_model(m_lgb, X_test, y_te, 'LGBM_Optuna'))
results.append(eval_model(m_xgb, X_test, y_te, 'XGB_Optuna'))
# results.append(eval_model(m_cat, X_test, y_te, 'CatBoost_Optuna'))

import pandas as pd
pd.DataFrame(results)


NameError: name 'p_cat' is not defined

In [8]:
import numpy as np

def eval_model(model, X, y, name):
    pred = model.predict(X)
    mae = mean_absolute_error(y, pred)
    mse = mean_squared_error(y, pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y, pred)
    print(f"{name} -> MAE: {mae:.6f}  MSE: {mse:.6f}  RMSE: {rmse:.6f}  R2: {r2:.6f}")
    return dict(model=name, MAE=mae, MSE=mse, RMSE=rmse, R2=r2)

results = []
results.append(eval_model(m_lgb, X_test, y_te, 'LGBM_Optuna'))
results.append(eval_model(m_xgb, X_test, y_te, 'XGB_Optuna'))
# results.append(eval_model(m_cat, X_test, y_te, 'CatBoost_Optuna'))

import pandas as pd
pd.DataFrame(results)

LGBM_Optuna -> MAE: 0.062958  MSE: 0.008914  RMSE: 0.094414  R2: 0.877961
XGB_Optuna -> MAE: 0.065131  MSE: 0.009187  RMSE: 0.095848  R2: 0.874226


Unnamed: 0,model,MAE,MSE,RMSE,R2
0,LGBM_Optuna,0.062958,0.008914,0.094414,0.877961
1,XGB_Optuna,0.065131,0.009187,0.095848,0.874226


In [9]:
# Save timestamped artifacts
out_dir = os.path.join('saved_models', 'optuna_all_' + datetime.datetime.now().strftime('%Y%m%d_%H%M%S'))
os.makedirs(out_dir, exist_ok=True)

joblib.dump(imputer, os.path.join(out_dir, 'imputer.joblib'))
joblib.dump(scaler, os.path.join(out_dir, 'scaler.joblib'))
joblib.dump(m_lgb, os.path.join(out_dir, 'lgbm_model.joblib'))
joblib.dump(m_xgb, os.path.join(out_dir, 'xgb_model.joblib'))
# from catboost import CatBoostRegressor
# m_cat.save_model(os.path.join(out_dir, 'catboost_model.cbm'))

with open(os.path.join(out_dir, 'metrics.json'), 'w') as f:
    json.dump(results, f, indent=2)

print('Saved to', out_dir)


Saved to saved_models\optuna_all_20250813_085609
