In [1]:
import xgboost as xgb
print(xgb.__version__)

3.1.1


In [2]:
import sys, xgboost as xgb
print(sys.executable)        # should point to .../.venv/bin/python
print(xgb.__version__)       # should print 3.0.4
print(xgb.__file__)          # should live under .../.venv/...

c:\Users\H.P\Desktop\Housing Regression MLE\.venv\Scripts\python.exe
3.1.1
c:\Users\H.P\Desktop\Housing Regression MLE\.venv\Lib\site-packages\xgboost\__init__.py


In [3]:
# 1. Imports
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from xgboost import XGBRegressor
import optuna
import mlflow
import mlflow.xgboost

In [4]:
# 2. Load processed datasets
train_df = pd.read_csv(r'C:\Users\H.P\Desktop\Housing Regression MLE\data\processed\feature_engineered_train.csv')
eval_df = pd.read_csv(r'C:\Users\H.P\Desktop\Housing Regression MLE\data\processed\feature_engineered_eval.csv')


# Define target + features
target = "price"
X_train, y_train = train_df.drop(columns=[target]), train_df[target]
X_eval, y_eval   = eval_df.drop(columns=[target]), eval_df[target]

print("Train shape:", X_train.shape)
print("Eval shape:", X_eval.shape)

Train shape: (578916, 39)
Eval shape: (148448, 39)


In [5]:
# 3. Define Optuna objective function with MLflow

def objective(trial):
     # Define hyperparameters to tune
    params = {
            # Hyperparameter search spaces using Optuna's suggest methods,
            # which sample values within specified ranges or distributions.
            "n_estimators": trial.suggest_int("n_estimators", 200, 1000),  # number of boosting rounds
            "max_depth": trial.suggest_int("max_depth", 3, 10),           # max tree depth to control complexity
            "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),  # step size for updates
            "subsample": trial.suggest_float("subsample", 0.5, 1.0),      # fraction of data per tree
            "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),  # fraction of features per tree
            "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),  # min sum of instance weights in a child
            "gamma": trial.suggest_float("gamma", 0.0, 5.0),              # min loss reduction for split
            "reg_alpha": trial.suggest_float("reg_alpha", 1e-8, 10.0, log=True),  # L1 regularization term
            "reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 10.0, log=True), # L2 regularization term
            "random_state": 42,                                            # fixed seed for reproducibility
            "n_jobs": -1,                                                  # use all CPU cores for parallelism
            "tree_method": "hist",                                         # fast histogram optimized tree method
        }

    # Start a nested MLflow run for this trial to log parameters and metrics
    with mlflow.start_run(nested=True):
        # Instantiate and train the XGBoost regressor with the sampled hyperparameters
        model = XGBRegressor(**params)
        model.fit(X_train, y_train)

        # Predict on evaluation set and calculate evaluation metrics
        y_pred = model.predict(X_eval)
        rmse = float(np.sqrt(mean_squared_error(y_eval, y_pred)))  # Root Mean Squared Error
        mae = float(mean_absolute_error(y_eval, y_pred))           # Mean Absolute Error
        r2 = float(r2_score(y_eval, y_pred))                        # R-squared score

        # Log the parameters and metrics of this trial to MLflow
        mlflow.log_params(params)
        mlflow.log_metrics({"rmse": rmse, "mae": mae, "r2": r2})

    # Objective function returns the error metric to minimize (RMSE here)
    return rmse    

In [7]:
from pathlib import Path # for OS-independent path handling

# Set MLflow tracking URI to a local directory where experiment data is saved
mlflow_path = Path(r'C:\Users\H.P\Desktop\Housing Regression MLE\mlruns').as_uri() # Convert path to URI format
mlflow.set_tracking_uri(mlflow_path)

# Set or create MLflow experiment name for grouping runs
mlflow.set_experiment("xgboost_optuna_housing")

# Create an Optuna study; direction="minimize" means it tries to minimize the objective function result (rmse)
study = optuna.create_study(direction='minimize')

# Execute the study optimization for 15 trials
study.optimize(objective, n_trials=15)

# Print the best found hyperparameters after tuning
print("Best params:", study.best_trial.params)

2025/11/03 20:15:01 INFO mlflow.tracking.fluent: Experiment with name 'xgboost_optuna_housing' does not exist. Creating a new experiment.
[I 2025-11-03 20:15:01,583] A new study created in memory with name: no-name-fc849c59-ca5a-4524-a28f-a11de57fbfd1
[I 2025-11-03 20:16:43,861] Trial 0 finished with value: 74977.9062564675 and parameters: {'n_estimators': 966, 'max_depth': 9, 'learning_rate': 0.07009350576049779, 'subsample': 0.8064045349478732, 'colsample_bytree': 0.7151456787570268, 'min_child_weight': 9, 'gamma': 2.6673665748902113, 'reg_alpha': 3.566143209104314, 'reg_lambda': 0.8665189968336321}. Best is trial 0 with value: 74977.9062564675.
[I 2025-11-03 20:17:49,185] Trial 1 finished with value: 73350.27462785007 and parameters: {'n_estimators': 857, 'max_depth': 5, 'learning_rate': 0.04691942246162044, 'subsample': 0.7548198065157474, 'colsample_bytree': 0.5714158142645223, 'min_child_weight': 8, 'gamma': 3.327465677478813, 'reg_alpha': 2.1360092949405652e-07, 'reg_lambda': 0.

Best params: {'n_estimators': 843, 'max_depth': 8, 'learning_rate': 0.039839486699459636, 'subsample': 0.7341568080690035, 'colsample_bytree': 0.6113904727895223, 'min_child_weight': 3, 'gamma': 0.920039969221913, 'reg_alpha': 0.006868825403502387, 'reg_lambda': 1.8666324811545187e-06}


In [8]:
# 5. Train final model with best params and log to MLflow
best_params = study.best_trial.params
best_model = XGBRegressor(**best_params)
best_model.fit(X_train, y_train)

y_pred = best_model.predict(X_eval)

mae = mean_absolute_error(y_eval, y_pred)
rmse = np.sqrt(mean_squared_error(y_eval, y_pred))
r2 = r2_score(y_eval, y_pred)

print("Final tuned model performance:")
print("MAE:", mae)
print("RMSE:", rmse)
print("R²:", r2)

# Log final model
with mlflow.start_run(run_name="best_xgboost_model"):
    mlflow.log_params(best_params)
    mlflow.log_metrics({"rmse": rmse, "mae": mae, "r2": r2})
    mlflow.xgboost.log_model(best_model, name="model")

Final tuned model performance:
MAE: 30956.589210415932
RMSE: 71976.29727484864
R²: 0.9599650736814732


  self.get_booster().save_model(fname)
