# GBDT weight regression on dataset_pruned.csv

This notebook applies the paper's algorithm (see `#3_GBDT.txt`) to train Gradient Boosting Decision Trees on fused features (2D + 3D + C-ResNet50) from `dataset_pruned.csv`.

- Split: 70% train / 30% test
- Models: LightGBM and XGBoost
- Hyperparameters: as specified in the paper
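- Preprocessing: feature standardization (zero mean, unit variance via `StandardScaler`, fit on the training split only); the paper reports that normalizing features improves performance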
- Metrics: MAE, MSE, RMSE, R2
- Outputs: test-set metrics, the fitted scaler, and the trained models, all saved under `artifacts/`


In [None]:
import os
import json
import joblib
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Optional: silence warnings
import warnings
warnings.filterwarnings('ignore')

# --- Configuration -----------------------------------------------------------
DATA_PATH = 'dataset_pruned.csv'
LABEL_COL = 'weight_kg'
ID_COLS = ['Chicken_ID', 'Image_ID']
TEST_SIZE = 0.30      # 70% train / 30% test
RANDOM_STATE = 42     # fixed seed so the split is reproducible across runs

assert os.path.exists(DATA_PATH), f"Missing {DATA_PATH}"

df = pd.read_csv(DATA_PATH)
print(f"Loaded: {df.shape[0]} rows, {df.shape[1]} cols")

# Fail early with a readable message instead of a KeyError / fit-time error later.
missing_cols = [c for c in ID_COLS + [LABEL_COL] if c not in df.columns]
assert not missing_cols, f"{DATA_PATH} is missing expected columns: {missing_cols}"
assert df[LABEL_COL].notna().all(), f"{LABEL_COL} contains missing values"

# Separate features/label: everything that is not an ID or the label is a feature.
y = df[LABEL_COL].values
X = df.drop(columns=ID_COLS + [LABEL_COL])
feature_names = X.columns.tolist()
assert feature_names, "No feature columns left after dropping ID/label columns"
print(f"Features: {len(feature_names)} (first 10): {feature_names[:10]}")

# Train/test split 70/30 (row-wise, shuffled).
# NOTE(review): if one Chicken_ID has several rows (images), a row-wise split can
# put the same bird in both train and test and inflate the test metrics.
# Confirm against the data; a group-aware split (e.g. sklearn GroupShuffleSplit
# on Chicken_ID) would be the fix if so.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE, shuffle=True
)
assert len(X_train) + len(X_test) == len(df), "split lost or duplicated rows"

# Normalization (paper notes it helps).
# The scaler is fit on the training rows only and then applied to the test rows,
# so no test-set statistics enter the preprocessing.
scaler = StandardScaler(with_mean=True, with_std=True)
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Train/Test shapes:", X_train_scaled.shape, X_test_scaled.shape)



: 

In [None]:
# LightGBM and XGBoost per-paper hyperparameters
import lightgbm as lgb
from xgboost import XGBRegressor

# LightGBM hyperparameters (values as listed in the notebook header: "as specified
# in the paper").
lgb_params = dict(
    n_estimators=4000,
    learning_rate=0.1,
    num_leaves=15,
    max_depth=5,
    min_child_samples=15,
    min_child_weight=0.01,
    subsample=0.8,
    # LightGBM only applies `subsample` (bagging_fraction) when bagging is
    # enabled through a non-zero `subsample_freq` (bagging_freq). With the
    # default of 0 the 0.8 above is silently ignored and every tree sees all rows.
    subsample_freq=1,
    colsample_bytree=1.0,
    objective='regression',
    random_state=42,  # row subsampling is stochastic; pin the seed for reproducibility
    n_jobs=-1,
)

# XGBoost hyperparameters.
xgb_params = dict(
    n_estimators=2000,
    learning_rate=0.1,
    max_depth=5,
    objective='reg:squarederror',  # modern alias for reg:linear
    min_child_weight=1,
    max_delta_step=0,
    subsample=0.8,
    colsample_bytree=0.7,
    colsample_bylevel=1.0,
    reg_alpha=0.0,
    reg_lambda=1.0,
    random_state=42,  # row/column subsampling is stochastic; pin the seed
    n_jobs=-1,
)

lgb_model = lgb.LGBMRegressor(**lgb_params)
xgb_model = XGBRegressor(**xgb_params)

# Fit on normalized features (as per paper's normalization recommendation)
lgb_model.fit(X_train_scaled, y_train)
xgb_model.fit(X_train_scaled, y_train)

print("Models trained.")


In [None]:
def evaluate(model, X_tr, y_tr, X_te, y_te, name):
    """Score a fitted regressor on the held-out set, print and return the metrics.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.
    X_tr, y_tr : training features / targets. Accepted so existing call sites
        keep working; they are not used, because only test-set metrics are
        reported.
    X_te, y_te : test features / targets the metrics are computed on.
    name : label used in the printed line and stored under the ``model`` key.

    Returns
    -------
    dict
        ``{'model', 'MAE', 'MSE', 'RMSE', 'R2'}`` with the metric values as
        plain Python floats.
    """
    # Only the test split is scored; predicting on the training split as well
    # would cost a full pass through the ensemble for a value nobody reads.
    pred_te = model.predict(X_te)

    # float(...) keeps the result JSON-serialisable regardless of which numpy
    # scalar type the metric functions hand back.
    mae = float(mean_absolute_error(y_te, pred_te))
    mse = float(mean_squared_error(y_te, pred_te))
    rmse = float(np.sqrt(mse))
    r2 = float(r2_score(y_te, pred_te))
    print(f"{name} -> MAE: {mae:.6f}  MSE: {mse:.6f}  RMSE: {rmse:.6f}  R2: {r2:.6f}")
    return dict(model=name, MAE=mae, MSE=mse, RMSE=rmse, R2=r2)

# Score both fitted models on the same split, LightGBM first, then XGBoost.
fitted_models = [('LGBM', lgb_model), ('XGB', xgb_model)]
results = [
    evaluate(estimator, X_train_scaled, y_train, X_test_scaled, y_test, label)
    for label, estimator in fitted_models
]

# Last expression of the cell: render the metrics as a table.
pd.DataFrame(results)


In [None]:
# Save artifacts
ARTIFACTS_DIR = 'artifacts'
os.makedirs(ARTIFACTS_DIR, exist_ok=True)
joblib.dump(scaler, os.path.join(ARTIFACTS_DIR, 'scaler.joblib'))
joblib.dump(lgb_model, os.path.join(ARTIFACTS_DIR, 'lgbm_model.joblib'))
joblib.dump(xgb_model, os.path.join(ARTIFACTS_DIR, 'xgb_model.joblib'))

# The scaler and both models were fit on bare arrays, so the column order is not
# stored inside them. Persist it so the artifacts can be applied to new data with
# the columns in the same order.
features_path = os.path.join(ARTIFACTS_DIR, 'feature_names.json')
with open(features_path, 'w', encoding='utf-8') as f:
    json.dump(feature_names, f, indent=2)

# Save metrics
metrics_path = os.path.join(ARTIFACTS_DIR, 'metrics.json')
with open(metrics_path, 'w', encoding='utf-8') as f:
    json.dump(results, f, indent=2)
print(f"Saved scaler, models, feature names and metrics to {ARTIFACTS_DIR}/")
