In [None]:
import optuna
import mlflow
import numpy as np
import pandas as pd
from xgboost import XGBRegressor
from sklearn.pipeline import Pipeline
from sklearn.compose import TransformedTargetRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

In [None]:
# ==========================================
# 1. SETUP
# ==========================================
# MLflow configuration: tracking server address and the experiment name that
# is made active for the rest of the notebook.
TRACKING_URI = "http://127.0.0.1:8000"
EXPERIMENT_NAME = "Experiment Tracking - House Price Prediction"

mlflow.set_tracking_uri(uri=TRACKING_URI)
mlflow.set_experiment(EXPERIMENT_NAME)

In [None]:
## Loading the Data

# CSV locations, given relative to the directory the notebook is run from.
TRAIN_CSV = r"../data/processed/train.csv"
EVAL_CSV = r"../data/processed/eval.csv"

# NOTE(review): train_df / eval_df are not referenced again in the visible
# cells, while objective() reads X_train, y_train, X_eval and y_eval.
# TODO confirm where the feature/target split is performed.
train_df = pd.read_csv(TRAIN_CSV)
eval_df = pd.read_csv(EVAL_CSV)

In [None]:
# ==========================================
# 2. DEFINE OBJECTIVE FUNCTION
# ==========================================
def objective(trial: "optuna.trial.Trial") -> float:
    """Train and score one XGBoost configuration for an Optuna trial.

    Samples a hyperparameter set from ``trial``, fits a median-impute ->
    XGBoost pipeline against a ``log1p``-transformed target, and scores the
    ``expm1``-inverted predictions by RMSE on the evaluation data. The
    parameters and the metric are logged to a nested MLflow run named
    ``Trial_<trial.number>``.

    Reads ``X_train``, ``y_train``, ``X_eval`` and ``y_eval`` from the
    notebook's global namespace.
    NOTE(review): none of these four names is defined in the visible cells
    (only ``train_df`` / ``eval_df`` are loaded) -- confirm they are created
    before the study runs, otherwise this function raises ``NameError``.

    Args:
        trial: Optuna trial used to suggest the hyperparameter values.

    Returns:
        RMSE of the predictions on the evaluation data; the study that calls
        this function minimizes it.
    """
    # A. Suggest Hyperparameters
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 300, 1000),
        "max_depth": trial.suggest_int("max_depth", 3, 9),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.1, log=True),
        "subsample": trial.suggest_float("subsample", 0.6, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-8, 10.0, log=True),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 10.0, log=True),
        # Fixed (not tuned) settings, logged alongside the sampled values.
        "random_state": 42,
        "n_jobs": -1,
        "tree_method": "hist",
    }

    # B. Create Nested Run
    with mlflow.start_run(nested=True, run_name=f"Trial_{trial.number}"):
        # Log the configuration before fitting so that a trial which fails
        # during training still leaves its hyperparameters on the run.
        mlflow.log_params(params)

        # Pipeline: Impute -> XGB
        pipeline = Pipeline([
            ('imputer', SimpleImputer(strategy='median')),
            ('regressor', XGBRegressor(**params)),
        ])

        # Log-Transform Target: fit on log1p(y), predict back through expm1
        # so the RMSE below is computed on the original target scale.
        model = TransformedTargetRegressor(
            regressor=pipeline,
            func=np.log1p,
            inverse_func=np.expm1,
        )

        model.fit(X_train, y_train)

        preds = model.predict(X_eval)
        # Cast to a built-in float so the returned/logged value is a plain
        # Python number rather than a NumPy scalar.
        rmse = float(np.sqrt(mean_squared_error(y_eval, preds)))

        mlflow.log_metric("rmse", rmse)

    return rmse

# ==========================================
# 3. RUN OPTIMIZATION
# ==========================================
N_TRIALS = 15       # number of hyperparameter configurations to evaluate
SAMPLER_SEED = 42   # same value objective() passes as the model's random_state

print("🚀 Starting Hyperparameter Tuning...")

# Parent run: objective() opens one nested child run per trial inside it.
with mlflow.start_run(run_name="XGB_Hyperparameter_Tuning"):

    # Seed the sampler so the sequence of suggested hyperparameters is the
    # same on every Restart & Run All; without a seed each execution explores
    # a different set of configurations.
    study = optuna.create_study(
        study_name="house_price_optimization",
        direction="minimize",
        sampler=optuna.samplers.TPESampler(seed=SAMPLER_SEED),
    )
    study.optimize(objective, n_trials=N_TRIALS)

    print(f"\n🏆 Best Trial: {study.best_trial.value}")
    print("Best Params:", study.best_params)

    # Record the winning configuration and its score on the parent run.
    mlflow.log_params(study.best_params)
    mlflow.log_metric("best_rmse", study.best_value)