# XGBoost Regression – Untuned (GPU Compatible)

This notebook trains two **XGBoost regression models**:
- One using **all high-variance features** (`VarianceThreshold`)
- One using the **top 30 features** selected with `RandomForestRegressor`

The objective is to provide a strong baseline without any hyperparameter tuning, but with GPU acceleration.

In [1]:
import xgboost as xgb

print("XGBoost version:", xgb.__version__)

try:
    booster = xgb.Booster(params={'tree_method':'gpu_hist'})
    print("GPU support detected: tree_method='gpu_hist' works.")
except Exception as e:
    print("GPU support NOT detected:", e)

try:
    booster = xgb.Booster(params={'tree_method':'hist', 'device':'cuda'})
    print("GPU support detected: device='cuda' works.")
except Exception as e:
    print("GPU support NOT detected:", e)


XGBoost version: 2.0.3
GPU support detected: tree_method='gpu_hist' works.
GPU support detected: device='cuda' works.


In [2]:
import sys, os
# Add the project root to the Python path
project_root = os.path.abspath("../..")
sys.path.append(project_root)

import xgboost as xgb
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import VarianceThreshold

from utils.constants import ML_READY_DATA_FILE, TEST_MODE
from utils.data_loader import DataLoader
from utils.train_test_metrics_logger import TrainTestMetricsLogger

def root_mean_squared_error(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

# Display current test mode status
if TEST_MODE:
    print("TEST_MODE is ON – reduced data and iterations.")
else:
    print("TEST_MODE is OFF – full training.")

# === Load and prepare data ===
loader = DataLoader(ML_READY_DATA_FILE)
df = loader.load_data()
X = df.drop(columns=["price"])
y = df["price"]

# === Feature selection using variance threshold ===
selector = VarianceThreshold(threshold=0.01)
selector.fit(X)
X_reduced = X.loc[:, selector.get_support()]

# === Select top 30 features using Random Forest importance ===
rf = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
rf.fit(X_reduced, y)
importances = rf.feature_importances_
top_features = pd.Series(importances, index=X_reduced.columns).sort_values(ascending=False).head(30).index.tolist()
X_top = X_reduced[top_features]

# === Split data for all features and top features ===
X_train, X_test, y_train, y_test = train_test_split(X_reduced, y, test_size=0.2, random_state=42)
X_train_top, X_test_top, _, _ = train_test_split(X_top, y, test_size=0.2, random_state=42)

# === Define XGBoost parameters ===
use_gpu = True
params = {
    "objective": "reg:squarederror",
    "random_state": 42,
    "n_estimators": 100 if TEST_MODE else 400,
    "tree_method": "gpu_hist" if use_gpu else "hist",
    "n_jobs": -1,
    "verbosity": 2
}

# === Train XGBoost on all features ===
model_all = xgb.XGBRegressor(**params)
model_all.fit(X_train, y_train)
y_pred_train_all = model_all.predict(X_train)
y_pred_test_all = model_all.predict(X_test)

# === Train XGBoost on top 30 features ===
model_top = xgb.XGBRegressor(**params)
model_top.fit(X_train_top, y_train)
y_pred_train_top = model_top.predict(X_train_top)
y_pred_test_top = model_top.predict(X_test_top)

# === Define evaluation function ===
def evaluate(y_true, y_pred):
    mae = mean_absolute_error(y_true, y_pred)
    rmse = root_mean_squared_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    return mae, rmse, r2

# === Evaluate performance ===
mae_train_all, rmse_train_all, r2_train_all = evaluate(y_train, y_pred_train_all)
mae_test_all, rmse_test_all, r2_test_all = evaluate(y_test, y_pred_test_all)

mae_train_top, rmse_train_top, r2_train_top = evaluate(y_train, y_pred_train_top)
mae_test_top, rmse_test_top, r2_test_top = evaluate(y_test, y_pred_test_top)

# === Initialize logger and log results ===
logger = TrainTestMetricsLogger()

logger.log(
    model_name=f"XGBoost CV (All Features){' [TEST]' if TEST_MODE else ''}",
    experiment_name=f"XGBoost Untuned (All Features){' [TEST]' if TEST_MODE else ''}",
    mae_train=mae_train_all,
    rmse_train=rmse_train_all,
    r2_train=r2_train_all,
    mae_test=mae_test_all,
    rmse_test=rmse_test_all,
    r2_test=r2_test_all,
    data_file=ML_READY_DATA_FILE,
    n_features=X_train.shape[1]  
)

logger.log(
    model_name=f"XGBoost CV (Top RF Features){' [TEST]' if TEST_MODE else ''}",
    experiment_name=f"XGBoost Untuned (Top RF Features){' [TEST]' if TEST_MODE else ''}",
    mae_train=mae_train_top,
    rmse_train=rmse_train_top,
    r2_train=r2_train_top,
    mae_test=mae_test_top,
    rmse_test=rmse_test_top,
    r2_test=r2_test_top,
    data_file=ML_READY_DATA_FILE,
    n_features=X_train_top.shape[1]  
)

# === Display summary table ===
logger.display_table()


TEST_MODE is OFF – full training.



    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.



    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"



Rank,Best,timestamp,model,mae_train,rmse_train,r2_train,mae_test,rmse_test,r2_test,r2_gap,n_features,interpretation,ranking_score
1,✔,2025-07-01 19:33:27,CatBoost + Optuna CV (All Features),35.0 k€,48.1 k€,0.946919,60.5 k€,90.4 k€,0.812713,0.134206,72,overfitting,-150946.113954
2,,2025-07-01 19:33:27,CatBoost + Optuna CV (Top RF Features),50.1 k€,70.1 k€,0.887493,62.7 k€,92.8 k€,0.802641,0.084852,30,overfitting,-155490.843433
3,,2025-07-01 19:34:49,XGBoost CV (Top RF Features),22.1 k€,31.4 k€,0.977413,63.7 k€,96.3 k€,0.787667,0.189746,30,overfitting,-159934.167991
4,,2025-07-01 19:34:49,XGBoost CV (All Features),20.3 k€,28.7 k€,0.981173,64.0 k€,96.2 k€,0.7877,0.193473,72,overfitting,-160233.170278
5,,2025-07-01 19:09:26,CatBoost (All Features),65.2 k€,96.4 k€,0.786959,65.2 k€,96.4 k€,0.786959,0.0,72,good generalization,-161595.619596
6,,2025-07-01 19:09:26,CatBoost (Top RF Features),66.2 k€,97.2 k€,0.783307,66.2 k€,97.2 k€,0.783307,0.0,30,good generalization,-163403.150389


# 🎯 XGBoost Regression with Optuna Hyperparameter Tuning

This notebook trains two XGBoost regression models on real estate data, with **hyperparameter tuning using Optuna**. It includes all stages from loading the data to model diagnostics.

## Data Preparation

- Load the cleaned ML-ready dataset from a CSV file using `DataLoader`.
- Drop the target variable `price` to separate `X` and `y`.
- Apply `VarianceThreshold` to remove low-variance features (threshold = 0.01).
- Use a `RandomForestRegressor` to rank feature importance.
- Select the **top 30 most important features** for one of the models.


## Hyperparameter Tuning (Optuna)

Define the function `tune_xgboost_with_optuna(...)` that:

- Runs an Optuna optimization loop.
- Evaluates model performance with **5-Fold Cross-Validation**.
- Minimizes the **Root Mean Squared Error (RMSE)**.

### Tuned Hyperparameters:

- `max_depth`
- `learning_rate`
- `n_estimators`
- `subsample`, `colsample_bytree`
- `reg_alpha`, `reg_lambda`
- `min_child_weight`, `gamma`



## Train Final Models

Two models are trained:

- One using **all filtered features**
- One using the **top 30 features**

Each is trained using the **best parameters** found by Optuna.

---

## Evaluation

Models are evaluated using:

- `MAE`: Mean Absolute Error  
- `RMSE`: Root Mean Squared Error  
- `R<sup>2</sup>`: Coefficient of determination  

Results are logged with `ExperimentTracker`.



## Diagnostics

- Summary tables displayed with `ModelEvaluator`
- Residuals & diagnostic plots from `ModelVisualizer`
- Optionally, **SHAP values** can be plotted to understand feature importance



## Test Mode (Optional)

When `TEST_MODE = True`, the pipeline uses:

- A smaller dataset  
- Fewer Optuna trials (`n_trials = 3`)  

To speed up execution and debugging.


In [None]:
import sys, os

# Add the project root to the Python path
project_root = os.path.abspath("../..")
sys.path.append(project_root)

import optuna
import xgboost as xgb
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold, train_test_split, cross_val_score
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import VarianceThreshold

from utils.constants import ML_READY_DATA_FILE, TEST_MODE
from utils.data_loader import DataLoader
from utils.train_test_metrics_logger import TrainTestMetricsLogger

def root_mean_squared_error(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

# === Load dataset ===
loader = DataLoader(ML_READY_DATA_FILE)
df = loader.load_data()

X = df.drop(columns=["price"])
y = df["price"]

# === Remove low variance features ===
selector = VarianceThreshold(threshold=0.01)
X_reduced = X.loc[:, selector.fit(X).get_support()]

# === Select top 30 important features using Random Forest ===
rf_model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
rf_model.fit(X_reduced, y)
top_features = pd.Series(rf_model.feature_importances_, index=X_reduced.columns).nlargest(30).index.tolist()
X_top = X_reduced[top_features]

# === Split datasets for training/testing ===
X_train_all, X_test_all, y_train_all, y_test_all = train_test_split(
    X_reduced, y, test_size=0.2, random_state=42
)
X_train_top, X_test_top, y_train_top, y_test_top = train_test_split(
    X_top, y, test_size=0.2, random_state=42
)

# === Optuna hyperparameter tuning function ===
use_gpu = True
random_state = 42
n_trials = 3 if TEST_MODE else 50

def tune_xgboost_with_optuna(X_data, y_data, n_trials):
    def objective(trial):
        params = {
            "max_depth": trial.suggest_int("max_depth", 3, 10),
            "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
            "n_estimators": trial.suggest_int("n_estimators", 100, 800),
            "subsample": trial.suggest_float("subsample", 0.5, 1.0),
            "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
            "reg_alpha": trial.suggest_float("reg_alpha", 0.0, 1.0),
            "reg_lambda": trial.suggest_float("reg_lambda", 0.0, 1.0),
            "min_child_weight": trial.suggest_float("min_child_weight", 1, 10),
            "gamma": trial.suggest_float("gamma", 0, 5),
            "tree_method": "gpu_hist" if use_gpu else "hist",
            "device": "cuda" if use_gpu else "cpu",
            "random_state": random_state,
            "objective": "reg:squarederror",
            "n_jobs": -1
        }
        model = xgb.XGBRegressor(**params)
        cv = KFold(n_splits=5, shuffle=True, random_state=random_state)
        scores = -cross_val_score(model, X_data, y_data, scoring="neg_root_mean_squared_error", cv=cv)
        return scores.mean()

    study = optuna.create_study(direction="minimize")
    study.optimize(objective, n_trials=n_trials)
    return study.best_params

# === Train and evaluate function ===
def train_and_evaluate(X_train, y_train, X_test, y_test, model_name, experiment_name):
    best_params = tune_xgboost_with_optuna(X_train, y_train, n_trials)
    model = xgb.XGBRegressor(**best_params)
    model.fit(X_train, y_train)

    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)

    mae_train = mean_absolute_error(y_train, y_pred_train)
    rmse_train = root_mean_squared_error(y_train, y_pred_train)
    r2_train = r2_score(y_train, y_pred_train)

    mae_test = mean_absolute_error(y_test, y_pred_test)
    rmse_test = root_mean_squared_error(y_test, y_pred_test)
    r2_test = r2_score(y_test, y_pred_test)

    logger.log(
        model_name=model_name,
        experiment_name=experiment_name,
        mae_train=mae_train,
        rmse_train=rmse_train,
        r2_train=r2_train,
        mae_test=mae_test,
        rmse_test=rmse_test,
        r2_test=r2_test,
        data_file=ML_READY_DATA_FILE,
        n_features=X_train.shape[1]
    )

    print(f"{model_name} trained and logged.")
    print(f"Train RMSE: {rmse_train:.4f}, Test RMSE: {rmse_test:.4f}")

# === Initialize logger ===
logger = TrainTestMetricsLogger()

# === Run training and logging for all features ===
train_and_evaluate(
    X_train_all, y_train_all, X_test_all, y_test_all,
    model_name="XGBoost + Optuna CV (All Features)",
    experiment_name="XGBoost + Optuna-Tuned (All Features)"
)

# === Run training and logging for top 30 features ===
train_and_evaluate(
    X_train_top, y_train_top, X_test_top, y_test_top,
    model_name="XGBoost + Optuna CV (Top 30 Features)",
    experiment_name="XGBoost + Optuna-Tuned (Top 30 Features)"
)

# === Display logged results ===
logger.display_table(n_rows=10)


[I 2025-07-01 19:34:57,621] A new study created in memory with name: no-name-4e844afc-56f3-4a69-bf34-4f8b511bc142

    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"

[I 2025-07-01 19:35:06,744] Trial 0 finished with value: 102306.86140099107 and parameters: {'max_depth': 6, 'learning_rate': 0.2484774532403138, 'n_estimators': 523, 'subsample': 0.727589490463296, 'colsample_bytree': 0.765490870375129, 'reg_alpha': 0.5823917199902917, 'reg_lambda': 0.18211372350767108, 'min_child_weight': 8.095637766443478, 'gamma': 4.648296734247056}. Best is trial 0 with v

XGBoost Optuna CV (All Features) trained and logged.
Train RMSE: 54116.4313, Test RMSE: 90844.4526



    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"

[I 2025-07-01 19:46:58,603] Trial 0 finished with value: 102398.0976324031 and parameters: {'max_depth': 3, 'learning_rate': 0.2192782326231725, 'n_estimators': 131, 'subsample': 0.729626713275981, 'colsample_bytree': 0.8459518542096578, 'reg_alpha': 0.517391103248649, 'reg_lambda': 0.8758082579307127, 'min_child_weight': 3.099592112412081, 'gamma': 4.906740780696478}. Best is trial 0 with value: 102398.0976324031.

    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "

XGBoost Optuna CV (Top 30 Features) trained and logged.
Train RMSE: 62000.6048, Test RMSE: 92633.0103


Rank,Best,timestamp,model,mae_train,rmse_train,r2_train,mae_test,rmse_test,r2_test,r2_gap,n_features,interpretation,ranking_score
1,✔,2025-07-01 19:33:27,CatBoost + Optuna CV (All Features),35.0 k€,48.1 k€,0.946919,60.5 k€,90.4 k€,0.812713,0.134206,72,overfitting,-150946.113954
2,,2025-07-01 19:46:56,XGBoost Optuna CV (All Features),38.2 k€,54.1 k€,0.932911,60.4 k€,90.8 k€,0.810866,0.122045,72,overfitting,-151268.482978
3,,2025-07-01 19:56:34,XGBoost Optuna CV (Top 30 Features),44.2 k€,62.0 k€,0.911938,62.4 k€,92.6 k€,0.803345,0.108594,30,overfitting,-155034.13747
4,,2025-07-01 19:33:27,CatBoost + Optuna CV (Top RF Features),50.1 k€,70.1 k€,0.887493,62.7 k€,92.8 k€,0.802641,0.084852,30,overfitting,-155490.843433
5,,2025-07-01 19:34:49,XGBoost CV (Top RF Features),22.1 k€,31.4 k€,0.977413,63.7 k€,96.3 k€,0.787667,0.189746,30,overfitting,-159934.167991
6,,2025-07-01 19:34:49,XGBoost CV (All Features),20.3 k€,28.7 k€,0.981173,64.0 k€,96.2 k€,0.7877,0.193473,72,overfitting,-160233.170278
7,,2025-07-01 19:09:26,CatBoost (All Features),65.2 k€,96.4 k€,0.786959,65.2 k€,96.4 k€,0.786959,0.0,72,good generalization,-161595.619596
8,,2025-07-01 19:09:26,CatBoost (Top RF Features),66.2 k€,97.2 k€,0.783307,66.2 k€,97.2 k€,0.783307,0.0,30,good generalization,-163403.150389


# Saving XGBoost + Optuna Hyperparameter Tuning Models (`.pkl`) After Training

After training XGBoost models with Optuna tuning, it's essential to persist the trained models using `.pkl` files. The script below ensures each model is saved with a unique, timestamped filename and organized in the correct directory.


##  What the Script Does

1. **Appends the project root** to the Python path (to allow relative imports).
2. **Generates a timestamped filename**, including an optional `_TEST` suffix if `TEST_MODE` is enabled.
3. **Ensures the target directory exists**, and creates it if necessary.
4. **Saves both trained models** using `joblib.dump()`:
   - One trained with **all features**.
   - One trained with the **top 30 features** (e.g., selected via Random Forest).

In [4]:
import sys, os

# Add the project root to the Python path
project_root = os.path.abspath("../..")
sys.path.append(project_root)

import joblib
from datetime import datetime
from utils.constants import TEST_MODE, MODELS_DIR

# Create timestamp
timestamp = datetime.now().strftime("%Y%m%d_%H%M")

# Add suffix if in TEST mode
suffix = "_TEST" if TEST_MODE else ""

# Define subdirectory for .pkl files
PKL_DIR = os.path.join(MODELS_DIR, "pkl")
os.makedirs(PKL_DIR, exist_ok=True)

# Build filenames
filename_all = f"xgboost_optuna_all_{timestamp}{suffix}.pkl"
filename_top = f"xgboost_optuna_top30_{timestamp}{suffix}.pkl"

# Save models
joblib.dump(model_all, os.path.join(PKL_DIR, filename_all))
joblib.dump(model_top, os.path.join(PKL_DIR, filename_top))

print(f"[✔] Models saved to '{PKL_DIR}' as:\n - {filename_all}\n - {filename_top}")


[✔] Models saved to 'e:\_SoftEng\_BeCode\real-estate-price-predictor\models\pkl' as:
 - xgboost_optuna_all_20250701_1956.pkl
 - xgboost_optuna_top30_20250701_1956.pkl


# Saving Feature Lists Used by Each Model (`.json`)

After training and saving your machine learning models (e.g., XGBoost or CatBoost), it's critical to also save the **list of features** used during training. This ensures **inference compatibility** and prevents mismatches between the model and the input data.


## What the Script Does

1. **Creates the directory** for storing feature metadata:
   - Located in: `models/features/`

2. **Saves two JSON files**:
   - One listing the full set of features used in the **all-features model**.
   - One listing the selected **top 30 features** (e.g., based on feature importance).

3. **Uses the same base name as the corresponding `.pkl` model**, replacing the extension:
   - Example: `xgboost_optuna_all_20250629_1430.pkl` → `xgboost_optuna_all_20250629_1430.json`





In [5]:
import json

# Define subdirectory for features
FEATURES_DIR = os.path.join(MODELS_DIR, "features")
os.makedirs(FEATURES_DIR, exist_ok=True)

# Save features used for each model
with open(os.path.join(FEATURES_DIR, filename_all.replace(".pkl", ".json")), "w") as f:
    json.dump(list(X_reduced.columns), f, indent=2)

with open(os.path.join(FEATURES_DIR, filename_top.replace(".pkl", ".json")), "w") as f:
    json.dump(top_features, f, indent=2)

print(f"[✔] Associated feature files saved to '{FEATURES_DIR}'")


[✔] Associated feature files saved to 'e:\_SoftEng\_BeCode\real-estate-price-predictor\models\features'
