# XGBoost Regression – Untuned (GPU Compatible)

This notebook trains two **XGBoost regression models**:
- One using **all high-variance features** (`VarianceThreshold`)
- One using the **top 30 features** selected with `RandomForestRegressor`

The objective is to provide a strong baseline without any hyperparameter tuning, but with GPU acceleration.

In [11]:
import xgboost as xgb

print("XGBoost version:", xgb.__version__)

try:
    booster = xgb.Booster(params={'tree_method':'gpu_hist'})
    print("GPU support detected: tree_method='gpu_hist' works.")
except Exception as e:
    print("GPU support NOT detected:", e)

try:
    booster = xgb.Booster(params={'tree_method':'hist', 'device':'cuda'})
    print("GPU support detected: device='cuda' works.")
except Exception as e:
    print("GPU support NOT detected:", e)


XGBoost version: 2.0.3
GPU support detected: tree_method='gpu_hist' works.
GPU support detected: device='cuda' works.


In [12]:
import sys, os
# Add the project root to the Python path
project_root = os.path.abspath("../..")
sys.path.append(project_root)

import xgboost as xgb
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import VarianceThreshold

from utils.constants import ML_READY_DATA_FILE, TEST_MODE
from utils.data_loader import DataLoader
from utils.train_test_metrics_logger import TrainTestMetricsLogger

def root_mean_squared_error(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

# Display current test mode status
if TEST_MODE:
    print("TEST_MODE is ON – reduced data and iterations.")
else:
    print("TEST_MODE is OFF – full training.")

# === Load and prepare data ===
loader = DataLoader(ML_READY_DATA_FILE)
df = loader.load_data()
X = df.drop(columns=["price"])
y = df["price"]

# === Feature selection using variance threshold ===
selector = VarianceThreshold(threshold=0.01)
selector.fit(X)
X_reduced = X.loc[:, selector.get_support()]

# === Select top 30 features using Random Forest importance ===
rf = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
rf.fit(X_reduced, y)
importances = rf.feature_importances_
top_features = pd.Series(importances, index=X_reduced.columns).sort_values(ascending=False).head(30).index.tolist()
X_top = X_reduced[top_features]

# === Split data for all features and top features ===
X_train, X_test, y_train, y_test = train_test_split(X_reduced, y, test_size=0.2, random_state=42)
X_train_top, X_test_top, _, _ = train_test_split(X_top, y, test_size=0.2, random_state=42)

# === Define XGBoost parameters ===
use_gpu = True
params = {
    "objective": "reg:squarederror",
    "random_state": 42,
    "n_estimators": 100 if TEST_MODE else 400,
    "tree_method": "gpu_hist" if use_gpu else "hist",
    "n_jobs": -1,
    "verbosity": 2
}

# === Train XGBoost on all features ===
model_all = xgb.XGBRegressor(**params)
model_all.fit(X_train, y_train)
y_pred_train_all = model_all.predict(X_train)
y_pred_test_all = model_all.predict(X_test)

# === Train XGBoost on top 30 features ===
model_top = xgb.XGBRegressor(**params)
model_top.fit(X_train_top, y_train)
y_pred_train_top = model_top.predict(X_train_top)
y_pred_test_top = model_top.predict(X_test_top)

# === Define evaluation function ===
def evaluate(y_true, y_pred):
    mae = mean_absolute_error(y_true, y_pred)
    rmse = root_mean_squared_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    return mae, rmse, r2

# === Evaluate performance ===
mae_train_all, rmse_train_all, r2_train_all = evaluate(y_train, y_pred_train_all)
mae_test_all, rmse_test_all, r2_test_all = evaluate(y_test, y_pred_test_all)

mae_train_top, rmse_train_top, r2_train_top = evaluate(y_train, y_pred_train_top)
mae_test_top, rmse_test_top, r2_test_top = evaluate(y_test, y_pred_test_top)

# === Initialize logger and log results ===
logger = TrainTestMetricsLogger()

logger.log(
    model_name=f"XGBoost CV (All Features){' [TEST]' if TEST_MODE else ''}",
    experiment_name=f"XGBoost Untuned (All Features){' [TEST]' if TEST_MODE else ''}",
    mae_train=mae_train_all,
    rmse_train=rmse_train_all,
    r2_train=r2_train_all,
    mae_test=mae_test_all,
    rmse_test=rmse_test_all,
    r2_test=r2_test_all,
    data_file=ML_READY_DATA_FILE
)

logger.log(
    model_name=f"XGBoost CV (Top RF Features){' [TEST]' if TEST_MODE else ''}",
    experiment_name=f"XGBoost Untuned (Top RF Features){' [TEST]' if TEST_MODE else ''}",
    mae_train=mae_train_top,
    rmse_train=rmse_train_top,
    r2_train=r2_train_top,
    mae_test=mae_test_top,
    rmse_test=rmse_test_top,
    r2_test=r2_test_top,
    data_file=ML_READY_DATA_FILE
)

# === Display summary table ===
logger.display_table()


TEST_MODE is ON – reduced data and iterations.



    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"



Rank,Best,timestamp,model,mae_train,rmse_train,r2_train,mae_test,rmse_test,r2_test,mae_gap,interpretation,ranking_score
1,V,2025-07-01 11:53:56,CatBoost + Optuna CV (Top RF Features) [TEST],42.9 k€,59.3 k€,0.919542,42.9 k€,59.3 k€,0.919542,0.0 k€,good generalization,-102115.600242
2,,2025-07-01 11:53:56,CatBoost + Optuna CV (All Features) [TEST],46.6 k€,65.6 k€,0.901469,46.6 k€,65.6 k€,0.901469,0.0 k€,good generalization,-112200.372015
3,,2025-07-01 12:19:06,XGBoost Optuna CV (Top RF Features) [TEST],28.8 k€,40.7 k€,0.962062,61.1 k€,92.2 k€,0.805231,0.0 k€,overfitting,-153312.200139
4,,2025-07-01 12:19:06,XGBoost Optuna CV (All Features) [TEST],11.7 k€,16.7 k€,0.993607,62.1 k€,94.2 k€,0.796529,0.0 k€,overfitting,-156337.312104
5,,2025-07-01 11:56:01,XGBoost Optuna CV (All Features) [TEST],7.1 k€,11.1 k€,0.997187,62.3 k€,95.7 k€,0.790037,0.0 k€,overfitting,-157981.04943
6,,2025-07-01 11:56:02,XGBoost Optuna CV (Top RF Features) [TEST],47.6 k€,66.8 k€,0.89765,63.7 k€,95.0 k€,0.793072,0.0 k€,overfitting,-158692.095094
7,,2025-07-01 12:29:06,XGBoost CV (All Features) [TEST],43.7 k€,61.6 k€,0.913078,64.2 k€,95.2 k€,0.792465,0.0 k€,overfitting,-159345.188651
8,,2025-07-01 11:54:42,XGBoost CV (All Features) [TEST],43.7 k€,61.6 k€,0.913078,64.2 k€,95.2 k€,0.792465,0.0 k€,overfitting,-159345.188651
9,,2025-07-01 12:17:28,XGBoost CV (All Features) [TEST],43.7 k€,61.6 k€,0.913078,64.2 k€,95.2 k€,0.792465,0.0 k€,overfitting,-159345.188651
10,,2025-07-01 12:17:28,XGBoost CV (Top RF Features) [TEST],45.3 k€,63.7 k€,0.906928,64.4 k€,96.2 k€,0.788001,0.0 k€,overfitting,-160583.104712


# 🎯 XGBoost Regression with Optuna Hyperparameter Tuning

This notebook trains two XGBoost regression models on real estate data, with **hyperparameter tuning using Optuna**. It includes all stages from loading the data to model diagnostics.

## Data Preparation

- Load the cleaned ML-ready dataset from a CSV file using `DataLoader`.
- Drop the target variable `price` to separate `X` and `y`.
- Apply `VarianceThreshold` to remove low-variance features (threshold = 0.01).
- Use a `RandomForestRegressor` to rank feature importance.
- Select the **top 30 most important features** for one of the models.


## Hyperparameter Tuning (Optuna)

Define the function `tune_xgboost_with_optuna(...)` that:

- Runs an Optuna optimization loop.
- Evaluates model performance with **5-Fold Cross-Validation**.
- Minimizes the **Root Mean Squared Error (RMSE)**.

### Tuned Hyperparameters:

- `max_depth`
- `learning_rate`
- `n_estimators`
- `subsample`, `colsample_bytree`
- `reg_alpha`, `reg_lambda`
- `min_child_weight`, `gamma`



## Train Final Models

Two models are trained:

- One using **all filtered features**
- One using the **top 30 features**

Each is trained using the **best parameters** found by Optuna.

---

## Evaluation

Models are evaluated using:

- `MAE`: Mean Absolute Error  
- `RMSE`: Root Mean Squared Error  
- `R<sup>2</sup>`: Coefficient of determination  

Results are logged with `ExperimentTracker`.



## Diagnostics

- Summary tables displayed with `ModelEvaluator`
- Residuals & diagnostic plots from `ModelVisualizer`
- Optionally, **SHAP values** can be plotted to understand feature importance



## Test Mode (Optional)

When `TEST_MODE = True`, the pipeline uses:

- A smaller dataset  
- Fewer Optuna trials (`n_trials = 3`)  

To speed up execution and debugging.


In [13]:
import sys, os

# Add the project root to the Python path
project_root = os.path.abspath("../..")
sys.path.append(project_root)

import optuna
import xgboost as xgb
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold, train_test_split, cross_val_score
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import VarianceThreshold

from utils.constants import ML_READY_DATA_FILE, TEST_MODE
from utils.data_loader import DataLoader
from utils.model_evaluator import ModelEvaluator
from utils.train_test_metrics_logger import TrainTestMetricsLogger

# === Root Mean Squared Error ===
def root_mean_squared_error(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

use_gpu = True  # GPU toggle

if TEST_MODE:
    print("TEST_MODE is ON – reduced data and iterations.")
else:
    print("TEST_MODE is OFF – full training is active.")
print(f"Using GPU: {use_gpu}")

# === Load Data ===
loader = DataLoader(ML_READY_DATA_FILE)
df = loader.load_data()

X = df.drop(columns=["price"])
y = df["price"]

# === Remove low-variance features ===
selector = VarianceThreshold(threshold=0.01)
selector.fit(X)
X_reduced = X.loc[:, selector.get_support()]

# === Extract top 30 features via RandomForest ===
rf_model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
rf_model.fit(X_reduced, y)
importances = rf_model.feature_importances_
top_features = pd.Series(importances, index=X_reduced.columns).sort_values(ascending=False).head(30).index.tolist()
X_top = X_reduced[top_features]

# === Split data ===
X_train_all, X_test_all, y_train_all, y_test_all = train_test_split(X_reduced, y, test_size=0.2, random_state=42)
X_train_top, X_test_top, y_train_top, y_test_top = train_test_split(X_top, y, test_size=0.2, random_state=42)

# === Tuning function ===
def tune_xgboost_with_optuna(X_data, y_data, n_trials=50):
    def objective(trial):
        params = {
            "max_depth": trial.suggest_int("max_depth", 3, 10),
            "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
            "n_estimators": trial.suggest_int("n_estimators", 100, 800),
            "subsample": trial.suggest_float("subsample", 0.5, 1.0),
            "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
            "reg_alpha": trial.suggest_float("reg_alpha", 0.0, 1.0),
            "reg_lambda": trial.suggest_float("reg_lambda", 0.0, 1.0),
            "min_child_weight": trial.suggest_float("min_child_weight", 1, 10),
            "gamma": trial.suggest_float("gamma", 0, 5),
            "tree_method": "hist",
            "device": "cuda" if use_gpu else "cpu"
            
        }

        model = xgb.XGBRegressor(**params, objective="reg:squarederror", random_state=42, n_jobs=-1)
        cv = KFold(n_splits=5, shuffle=True, random_state=42)
        scores = -cross_val_score(model, X_data, y_data, scoring="neg_root_mean_squared_error", cv=cv)
        return scores.mean()

    study = optuna.create_study(direction="minimize")
    study.optimize(objective, n_trials=n_trials)
    return study

# === Step 5: Train both models ===
n_trials = 3 if TEST_MODE else 50

study_all = tune_xgboost_with_optuna(X_train_all, y_train_all, n_trials=n_trials)
model_all = xgb.XGBRegressor(**study_all.best_params, objective="reg:squarederror", tree_method="gpu_hist" if use_gpu else "hist", n_jobs=-1, random_state=42)
model_all.fit(X_train_all, y_train_all)
y_pred_train_all = model_all.predict(X_train_all)
y_pred_test_all = model_all.predict(X_test_all)

study_top = tune_xgboost_with_optuna(X_train_top, y_train_top, n_trials=n_trials)
model_top = xgb.XGBRegressor(**study_top.best_params, objective="reg:squarederror", tree_method="gpu_hist" if use_gpu else "hist", n_jobs=-1, random_state=42)
model_top.fit(X_train_top, y_train_top)
y_pred_train_top = model_top.predict(X_train_top)
y_pred_test_top = model_top.predict(X_test_top)

# === Step 6: Evaluate ===
def evaluate(y_true, y_pred):
    return (
        mean_absolute_error(y_true, y_pred),
        root_mean_squared_error(y_true, y_pred),
        r2_score(y_true, y_pred)
    )

mae_train_all, rmse_train_all, r2_train_all = evaluate(y_train_all, y_pred_train_all)
mae_test_all, rmse_test_all, r2_test_all = evaluate(y_test_all, y_pred_test_all)

mae_train_top, rmse_train_top, r2_train_top = evaluate(y_train_top, y_pred_train_top)
mae_test_top, rmse_test_top, r2_test_top = evaluate(y_test_top, y_pred_test_top)

# === Step 7: Log Results ===
logger = TrainTestMetricsLogger()
suffix = " [TEST]" if TEST_MODE else ""

logger.log(
    model_name=f"XGBoost Optuna CV (All Features){suffix}",
    experiment_name=f"XGBoost Optuna-Tuned (All Features){suffix}",
    mae_train=mae_train_all,
    rmse_train=rmse_train_all,
    r2_train=r2_train_all,
    mae_test=mae_test_all,
    rmse_test=rmse_test_all,
    r2_test=r2_test_all,
    data_file=ML_READY_DATA_FILE
)

logger.log(
    model_name=f"XGBoost Optuna CV (Top RF Features){suffix}",
    experiment_name=f"XGBoost Optuna-Tuned (Top RF Features){suffix}",
    mae_train=mae_train_top,
    rmse_train=rmse_train_top,
    r2_train=r2_train_top,
    mae_test=mae_test_top,
    rmse_test=rmse_test_top,
    r2_test=r2_test_top,
    data_file=ML_READY_DATA_FILE
)

# === Display summary ===
logger.display_table()


TEST_MODE is ON – reduced data and iterations.
Using GPU: True


[I 2025-07-01 12:29:14,346] A new study created in memory with name: no-name-075da5b2-7e81-4857-827d-20a9463e66b7
[I 2025-07-01 12:29:24,826] Trial 0 finished with value: 99725.37537224349 and parameters: {'max_depth': 4, 'learning_rate': 0.28145711867842593, 'n_estimators': 702, 'subsample': 0.9007428661873919, 'colsample_bytree': 0.7036921334274935, 'reg_alpha': 0.6246057571210253, 'reg_lambda': 0.027315730697277107, 'min_child_weight': 2.8426657696229807, 'gamma': 4.509543061555045}. Best is trial 0 with value: 99725.37537224349.
[I 2025-07-01 12:29:38,014] Trial 1 finished with value: 102341.68904619009 and parameters: {'max_depth': 7, 'learning_rate': 0.24299677672275424, 'n_estimators': 587, 'subsample': 0.8272459515466666, 'colsample_bytree': 0.966322983959697, 'reg_alpha': 0.6259769383994697, 'reg_lambda': 0.19489201435879344, 'min_child_weight': 2.3420762319672743, 'gamma': 0.4666700214001429}. Best is trial 0 with value: 99725.37537224349.
[I 2025-07-01 12:29:44,596] Trial 2 

Rank,Best,timestamp,model,mae_train,rmse_train,r2_train,mae_test,rmse_test,r2_test,mae_gap,interpretation,ranking_score
1,V,2025-07-01 11:53:56,CatBoost + Optuna CV (Top RF Features) [TEST],42.9 k€,59.3 k€,0.919542,42.9 k€,59.3 k€,0.919542,0.0 k€,good generalization,-102115.600242
2,,2025-07-01 11:53:56,CatBoost + Optuna CV (All Features) [TEST],46.6 k€,65.6 k€,0.901469,46.6 k€,65.6 k€,0.901469,0.0 k€,good generalization,-112200.372015
3,,2025-07-01 12:19:06,XGBoost Optuna CV (Top RF Features) [TEST],28.8 k€,40.7 k€,0.962062,61.1 k€,92.2 k€,0.805231,0.0 k€,overfitting,-153312.200139
4,,2025-07-01 12:19:06,XGBoost Optuna CV (All Features) [TEST],11.7 k€,16.7 k€,0.993607,62.1 k€,94.2 k€,0.796529,0.0 k€,overfitting,-156337.312104
5,,2025-07-01 12:30:03,XGBoost Optuna CV (All Features) [TEST],32.8 k€,45.4 k€,0.952684,62.7 k€,94.4 k€,0.79557,0.0 k€,overfitting,-157121.658873
6,,2025-07-01 11:56:01,XGBoost Optuna CV (All Features) [TEST],7.1 k€,11.1 k€,0.997187,62.3 k€,95.7 k€,0.790037,0.0 k€,overfitting,-157981.04943
7,,2025-07-01 11:56:02,XGBoost Optuna CV (Top RF Features) [TEST],47.6 k€,66.8 k€,0.89765,63.7 k€,95.0 k€,0.793072,0.0 k€,overfitting,-158692.095094
8,,2025-07-01 12:17:28,XGBoost CV (All Features) [TEST],43.7 k€,61.6 k€,0.913078,64.2 k€,95.2 k€,0.792465,0.0 k€,overfitting,-159345.188651
9,,2025-07-01 12:29:06,XGBoost CV (All Features) [TEST],43.7 k€,61.6 k€,0.913078,64.2 k€,95.2 k€,0.792465,0.0 k€,overfitting,-159345.188651
10,,2025-07-01 11:54:42,XGBoost CV (All Features) [TEST],43.7 k€,61.6 k€,0.913078,64.2 k€,95.2 k€,0.792465,0.0 k€,overfitting,-159345.188651


# Saving XGBoost + Optuna Hyperparameter Tuning Models (`.pkl`) After Training

After training XGBoost models with Optuna tuning, it's essential to persist the trained models using `.pkl` files. The script below ensures each model is saved with a unique, timestamped filename and organized in the correct directory.


##  What the Script Does

1. **Appends the project root** to the Python path (to allow relative imports).
2. **Generates a timestamped filename**, including an optional `_TEST` suffix if `TEST_MODE` is enabled.
3. **Ensures the target directory exists**, and creates it if necessary.
4. **Saves both trained models** using `joblib.dump()`:
   - One trained with **all features**.
   - One trained with the **top 30 features** (e.g., selected via Random Forest).

In [14]:
import sys, os

# Add the project root to the Python path
project_root = os.path.abspath("../..")
sys.path.append(project_root)

import joblib
from datetime import datetime
from utils.constants import TEST_MODE, MODELS_DIR

# Create timestamp
timestamp = datetime.now().strftime("%Y%m%d_%H%M")

# Add suffix if in TEST mode
suffix = "_TEST" if TEST_MODE else ""

# Define subdirectory for .pkl files
PKL_DIR = os.path.join(MODELS_DIR, "pkl")
os.makedirs(PKL_DIR, exist_ok=True)

# Build filenames
filename_all = f"xgboost_optuna_all_{timestamp}{suffix}.pkl"
filename_top = f"xgboost_optuna_top30_{timestamp}{suffix}.pkl"

# Save models
joblib.dump(model_all, os.path.join(PKL_DIR, filename_all))
joblib.dump(model_top, os.path.join(PKL_DIR, filename_top))

print(f"[✔] Models saved to '{PKL_DIR}' as:\n - {filename_all}\n - {filename_top}")


[✔] Models saved to 'e:\_SoftEng\_BeCode\real-estate-price-predictor\models\pkl' as:
 - xgboost_optuna_all_20250701_1230_TEST.pkl
 - xgboost_optuna_top30_20250701_1230_TEST.pkl


# Saving Feature Lists Used by Each Model (`.json`)

After training and saving your machine learning models (e.g., XGBoost or CatBoost), it's critical to also save the **list of features** used during training. This ensures **inference compatibility** and prevents mismatches between the model and the input data.


## What the Script Does

1. **Creates the directory** for storing feature metadata:
   - Located in: `models/features/`

2. **Saves two JSON files**:
   - One listing the full set of features used in the **all-features model**.
   - One listing the selected **top 30 features** (e.g., based on feature importance).

3. **Uses the same base name as the corresponding `.pkl` model**, replacing the extension:
   - Example: `xgboost_optuna_all_20250629_1430.pkl` → `xgboost_optuna_all_20250629_1430.json`





In [15]:
import json

# Define subdirectory for features
FEATURES_DIR = os.path.join(MODELS_DIR, "features")
os.makedirs(FEATURES_DIR, exist_ok=True)

# Save features used for each model
with open(os.path.join(FEATURES_DIR, filename_all.replace(".pkl", ".json")), "w") as f:
    json.dump(list(X_reduced.columns), f, indent=2)

with open(os.path.join(FEATURES_DIR, filename_top.replace(".pkl", ".json")), "w") as f:
    json.dump(top_features, f, indent=2)

print(f"[✔] Associated feature files saved to '{FEATURES_DIR}'")


[✔] Associated feature files saved to 'e:\_SoftEng\_BeCode\real-estate-price-predictor\models\features'
