### Hyperparameter Tuning

In [0]:
import pandas as pd

# Reload the train/validation/test splits persisted by the prior notebook.
# Feature frames are read first, then the matching target frames.
X_train, X_val, X_test = (
    pd.read_parquet('/dbfs/tmp/X_train.parquet'),
    pd.read_parquet('/dbfs/tmp/X_val.parquet'),
    pd.read_parquet('/dbfs/tmp/X_test.parquet'),
)
y_train, y_val, y_test = (
    pd.read_parquet('/dbfs/tmp/y_train.parquet'),
    pd.read_parquet('/dbfs/tmp/y_val.parquet'),
    pd.read_parquet('/dbfs/tmp/y_test.parquet'),
)


In [0]:
import xgboost as xgb
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials, space_eval
from hyperopt.pyll.base import scope
from sklearn.metrics import root_mean_squared_error
import mlflow
import mlflow.xgboost
from mlflow.tracking import MlflowClient

In [0]:
# Hyperopt search space for the XGBoost regressor.
# The two quniform-sampled parameters (tree count and depth) are wrapped in
# scope.int so the model receives integers when the space is evaluated.
search_space = dict(
    n_estimators=scope.int(hp.quniform('n_estimators', 50, 500, 1)),
    learning_rate=hp.loguniform('learning_rate', -3, 0),
    max_depth=scope.int(hp.quniform('max_depth', 3, 10, 1)),
    subsample=hp.uniform('subsample', 0.5, 1.0),
    colsample_bytree=hp.uniform('colsample_bytree', 0.5, 1.0),
)

# define objective function for tuning
def objective(params):
    """Train one XGBoost candidate and score it on the validation split.

    Parameters
    ----------
    params : dict
        Keyword arguments forwarded to ``xgb.XGBRegressor`` (one sample
        drawn by Hyperopt from the search space).

    Returns
    -------
    dict
        ``{'loss': rmse, 'status': STATUS_OK}`` where ``rmse`` is the root
        mean squared error of the fitted model on ``X_val`` / ``y_val``.
        Hyperopt minimizes the ``'loss'`` entry.
    """
    # early_stopping_rounds is configured on the estimator: passing it to
    # fit() was deprecated in XGBoost 1.6 and is rejected by newer releases.
    model = xgb.XGBRegressor(**params, random_state=42, early_stopping_rounds=10)
    # The validation split is the eval_set that drives early stopping.
    model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False)
    y_pred = model.predict(X_val)
    rmse = root_mean_squared_error(y_val, y_pred)
    return {'loss': rmse, 'status': STATUS_OK}

# initialize MLflow
mlflow.set_experiment('/Users/matspencer@uchicago.edu/Housing_HPT')

# run hyperparameter tuning
# Keep a handle on the Trials object so the per-evaluation history (and the
# best loss) remains available after fmin returns.
trials = Trials()
with mlflow.start_run(run_name='xgb_hyperopt'):
    best_raw = fmin(
        fn=objective,
        space=search_space,
        algo=tpe.suggest,
        max_evals=50,
        trials=trials,
    )
    # fmin returns the raw sampled values (quniform draws come back as floats,
    # e.g. max_depth=7.0). space_eval pushes them back through search_space so
    # the scope.int casts are applied and the result is directly usable as
    # XGBRegressor keyword arguments.
    best_params = space_eval(search_space, best_raw)

    # Record the outcome on the run; keys are prefixed so they cannot collide
    # with parameters that autologging may already have written to this run.
    mlflow.log_params({f'best_{name}': value for name, value in best_params.items()})
    mlflow.log_metric('best_val_rmse', float(trials.best_trial['result']['loss']))

print('Best parameters found: ', best_params)

In [0]:
# Persist the winning hyperparameters as a one-row parquet file so that
# other tasks can load them.
best_params_df = pd.DataFrame(data=[best_params])
best_params_df.to_parquet(path='/dbfs/tmp/best_params.parquet')