In [37]:
%load_ext autoreload
%autoreload 2

from src.functions import preprocess, feature_engineering, evaluate_model
from xgboost import XGBRegressor

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [38]:
df_train, df_test = preprocess()
X_train, X_test = feature_engineering(df_train, df_test)
y_train = df_train.loc[:, "SalePrice"]

rmsle, mae = evaluate_model(X_train, y_train);

RMSLE: 0.13749
MAE: 17700.82934


### Hyperparameter Tuning with Optuna

In [35]:
# Adapted from Kaggle House Prices starter notebook:
# https://www.kaggle.com/competitions/house-prices-advanced-regression-techniques

import optuna

def objective(trial):
    xgb_params = dict(
        max_depth=trial.suggest_int("max_depth", 2, 10),
        learning_rate=trial.suggest_float("learning_rate", 1e-4, 1e-1, log=True),
        n_estimators=trial.suggest_int("n_estimators", 1000, 8000),
        min_child_weight=trial.suggest_int("min_child_weight", 1, 10),
        colsample_bytree=trial.suggest_float("colsample_bytree", 0.2, 1.0),
        subsample=trial.suggest_float("subsample", 0.2, 1.0),
        reg_alpha=trial.suggest_float("reg_alpha", 1e-4, 1e2, log=True),
        reg_lambda=trial.suggest_float("reg_lambda", 1e-4, 1e2, log=True),
    )
    xgb = XGBRegressor(**xgb_params)
    rmsle, mae = evaluate_model(X_train, y_train, xgb)
    return rmsle

study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=25)
xgb_params = study.best_params

[I 2025-08-22 19:56:00,929] A new study created in memory with name: no-name-594d20ea-942c-489b-a43c-fdd6455d808f
[I 2025-08-22 19:57:42,086] Trial 0 finished with value: 0.2108719254964887 and parameters: {'max_depth': 6, 'learning_rate': 0.00022663761432875957, 'n_estimators': 4124, 'min_child_weight': 7, 'colsample_bytree': 0.4727568388853029, 'subsample': 0.7771595526259486, 'reg_alpha': 0.0016694676234036959, 'reg_lambda': 0.12436855456761942}. Best is trial 0 with value: 0.2108719254964887.


RMSLE: 0.21087
MAE: 28294.46444


[I 2025-08-22 20:05:36,438] Trial 1 finished with value: 0.1228525440959144 and parameters: {'max_depth': 7, 'learning_rate': 0.0023965037319501506, 'n_estimators': 3319, 'min_child_weight': 6, 'colsample_bytree': 0.2836399501895385, 'subsample': 0.4288179748571228, 'reg_alpha': 0.23067357494133425, 'reg_lambda': 0.0006198747043403971}. Best is trial 1 with value: 0.1228525440959144.


RMSLE: 0.12285
MAE: 14947.35738


[I 2025-08-22 20:07:04,259] Trial 2 finished with value: 0.2389079743632671 and parameters: {'max_depth': 7, 'learning_rate': 0.00023780912122903713, 'n_estimators': 3643, 'min_child_weight': 6, 'colsample_bytree': 0.3113264526690822, 'subsample': 0.645123959857438, 'reg_alpha': 1.3048193744468655, 'reg_lambda': 0.0001325422048072136}. Best is trial 1 with value: 0.1228525440959144.


RMSLE: 0.23891
MAE: 29489.27456


[I 2025-08-22 20:07:57,961] Trial 3 finished with value: 0.12143966509905435 and parameters: {'max_depth': 4, 'learning_rate': 0.015840004600988885, 'n_estimators': 4499, 'min_child_weight': 2, 'colsample_bytree': 0.8034824130200545, 'subsample': 0.7679407907818657, 'reg_alpha': 0.25322332117713814, 'reg_lambda': 0.005411228868185578}. Best is trial 3 with value: 0.12143966509905435.


RMSLE: 0.12144
MAE: 15243.13138


[I 2025-08-22 20:08:30,508] Trial 4 finished with value: 0.3031509682850766 and parameters: {'max_depth': 2, 'learning_rate': 0.00010523032311969691, 'n_estimators': 4690, 'min_child_weight': 8, 'colsample_bytree': 0.3070565934141983, 'subsample': 0.8154044915022447, 'reg_alpha': 0.07998318424264733, 'reg_lambda': 0.0012032846205569606}. Best is trial 3 with value: 0.12143966509905435.


RMSLE: 0.30315
MAE: 42004.04515


[I 2025-08-22 20:09:33,706] Trial 5 finished with value: 0.12375325381814267 and parameters: {'max_depth': 5, 'learning_rate': 0.011110512125500006, 'n_estimators': 4612, 'min_child_weight': 7, 'colsample_bytree': 0.6990310611074056, 'subsample': 0.9121403769786554, 'reg_alpha': 0.0069106164320648904, 'reg_lambda': 0.6924983426665091}. Best is trial 3 with value: 0.12143966509905435.


RMSLE: 0.12375
MAE: 15596.78560


[I 2025-08-22 20:10:22,503] Trial 6 finished with value: 0.28821243869983204 and parameters: {'max_depth': 8, 'learning_rate': 0.00018665557457879384, 'n_estimators': 2387, 'min_child_weight': 3, 'colsample_bytree': 0.25799630523275596, 'subsample': 0.2505596452961699, 'reg_alpha': 0.00016058151200470515, 'reg_lambda': 0.002394269554984775}. Best is trial 3 with value: 0.12143966509905435.


RMSLE: 0.28821
MAE: 40207.55396


[I 2025-08-22 20:10:49,254] Trial 7 finished with value: 0.34781287871140315 and parameters: {'max_depth': 8, 'learning_rate': 0.000214813540477385, 'n_estimators': 2027, 'min_child_weight': 5, 'colsample_bytree': 0.5686251801988148, 'subsample': 0.21390589100444615, 'reg_alpha': 0.06838348562852914, 'reg_lambda': 67.96404541122943}. Best is trial 3 with value: 0.12143966509905435.


RMSLE: 0.34781
MAE: 48021.44388


[I 2025-08-22 20:11:29,872] Trial 8 finished with value: 0.39954597110801426 and parameters: {'max_depth': 5, 'learning_rate': 0.00012144723884911023, 'n_estimators': 4881, 'min_child_weight': 8, 'colsample_bytree': 0.24602453953409287, 'subsample': 0.20392985493321059, 'reg_alpha': 97.37122495224419, 'reg_lambda': 0.00015162591262325354}. Best is trial 3 with value: 0.12143966509905435.


RMSLE: 0.39955
MAE: 36798.62948


[I 2025-08-22 20:14:24,933] Trial 9 finished with value: 0.12361354763509004 and parameters: {'max_depth': 8, 'learning_rate': 0.0011809919555317377, 'n_estimators': 6901, 'min_child_weight': 1, 'colsample_bytree': 0.39478802087326353, 'subsample': 0.6295474517488232, 'reg_alpha': 0.00038929456450735345, 'reg_lambda': 0.020949850192611363}. Best is trial 3 with value: 0.12143966509905435.


RMSLE: 0.12361
MAE: 15023.05640


[I 2025-08-22 20:15:04,011] Trial 10 finished with value: 0.15179961433992692 and parameters: {'max_depth': 2, 'learning_rate': 0.08859832815899443, 'n_estimators': 6563, 'min_child_weight': 10, 'colsample_bytree': 0.9854649978194383, 'subsample': 0.9685951610140364, 'reg_alpha': 10.997743169457408, 'reg_lambda': 10.65169523124058}. Best is trial 3 with value: 0.12143966509905435.


RMSLE: 0.15180
MAE: 17047.86509


[I 2025-08-22 20:16:12,777] Trial 11 finished with value: 0.12660138971551047 and parameters: {'max_depth': 10, 'learning_rate': 0.007287769764625627, 'n_estimators': 3000, 'min_child_weight': 4, 'colsample_bytree': 0.7890657997875058, 'subsample': 0.4521813442361623, 'reg_alpha': 0.8964526766650684, 'reg_lambda': 0.006238298386582344}. Best is trial 3 with value: 0.12143966509905435.


RMSLE: 0.12660
MAE: 15531.40719


[I 2025-08-22 20:16:26,519] Trial 12 finished with value: 0.1250685096799967 and parameters: {'max_depth': 4, 'learning_rate': 0.03281694641555504, 'n_estimators': 1145, 'min_child_weight': 1, 'colsample_bytree': 0.8623322955054076, 'subsample': 0.46306859287566654, 'reg_alpha': 0.797671095153861, 'reg_lambda': 0.03458886865489298}. Best is trial 3 with value: 0.12143966509905435.


RMSLE: 0.12507
MAE: 15088.76792


[I 2025-08-22 20:17:27,386] Trial 13 finished with value: 0.12049352530138917 and parameters: {'max_depth': 4, 'learning_rate': 0.0014563217938006745, 'n_estimators': 5508, 'min_child_weight': 3, 'colsample_bytree': 0.6191655213583773, 'subsample': 0.45544819100479195, 'reg_alpha': 0.016896296471984533, 'reg_lambda': 0.0007949289706490003}. Best is trial 13 with value: 0.12049352530138917.


RMSLE: 0.12049
MAE: 15254.65563


[I 2025-08-22 20:18:18,266] Trial 14 finished with value: 0.1255077839466059 and parameters: {'max_depth': 3, 'learning_rate': 0.0010220232804108403, 'n_estimators': 5598, 'min_child_weight': 3, 'colsample_bytree': 0.6336315661329737, 'subsample': 0.7668445430926636, 'reg_alpha': 0.00864138135009195, 'reg_lambda': 0.25573933371084917}. Best is trial 13 with value: 0.12049352530138917.


RMSLE: 0.12551
MAE: 16016.71005


[I 2025-08-22 20:19:24,076] Trial 15 finished with value: 0.11972931120442999 and parameters: {'max_depth': 4, 'learning_rate': 0.00874847470178376, 'n_estimators': 5656, 'min_child_weight': 2, 'colsample_bytree': 0.7840566743072875, 'subsample': 0.5571420094265196, 'reg_alpha': 0.009638700398623662, 'reg_lambda': 0.008709853105304506}. Best is trial 15 with value: 0.11972931120442999.


RMSLE: 0.11973
MAE: 15239.94302


[I 2025-08-22 20:20:50,967] Trial 16 finished with value: 0.11887005753184866 and parameters: {'max_depth': 4, 'learning_rate': 0.004185159259742257, 'n_estimators': 7997, 'min_child_weight': 3, 'colsample_bytree': 0.5400795264903838, 'subsample': 0.5511277268376303, 'reg_alpha': 0.014162220647576893, 'reg_lambda': 0.000511437425259024}. Best is trial 16 with value: 0.11887005753184866.


RMSLE: 0.11887
MAE: 15066.66228


[I 2025-08-22 20:22:00,655] Trial 17 finished with value: 0.11946296595494846 and parameters: {'max_depth': 3, 'learning_rate': 0.005479750304936289, 'n_estimators': 7889, 'min_child_weight': 4, 'colsample_bytree': 0.502124940968877, 'subsample': 0.5594504960660838, 'reg_alpha': 0.0013731739179726212, 'reg_lambda': 0.0178257189579679}. Best is trial 16 with value: 0.11887005753184866.


RMSLE: 0.11946
MAE: 15279.84909


[I 2025-08-22 20:23:12,545] Trial 18 finished with value: 0.11993279936771287 and parameters: {'max_depth': 3, 'learning_rate': 0.0039508647063235865, 'n_estimators': 7896, 'min_child_weight': 4, 'colsample_bytree': 0.4776157408794594, 'subsample': 0.3320131438541243, 'reg_alpha': 0.0016135074673479292, 'reg_lambda': 2.057009912250813}. Best is trial 16 with value: 0.11887005753184866.


RMSLE: 0.11993
MAE: 15372.64381


[I 2025-08-22 20:24:25,756] Trial 19 finished with value: 0.12159174297064407 and parameters: {'max_depth': 3, 'learning_rate': 0.020779743668139086, 'n_estimators': 7802, 'min_child_weight': 5, 'colsample_bytree': 0.5092821430705617, 'subsample': 0.5376610781094168, 'reg_alpha': 0.0009949826091529466, 'reg_lambda': 0.0469564842034924}. Best is trial 16 with value: 0.11887005753184866.


RMSLE: 0.12159
MAE: 15713.63201


[I 2025-08-22 20:26:04,256] Trial 20 finished with value: 0.12310884823101241 and parameters: {'max_depth': 5, 'learning_rate': 0.0006621571576359918, 'n_estimators': 7026, 'min_child_weight': 4, 'colsample_bytree': 0.3814630464561176, 'subsample': 0.698146143090774, 'reg_alpha': 0.029413031747966896, 'reg_lambda': 0.0002449509771836965}. Best is trial 16 with value: 0.11887005753184866.


RMSLE: 0.12311
MAE: 15150.68213


[I 2025-08-22 20:27:07,128] Trial 21 finished with value: 0.1189112650304843 and parameters: {'max_depth': 3, 'learning_rate': 0.005412022453697639, 'n_estimators': 6213, 'min_child_weight': 2, 'colsample_bytree': 0.7305287399263211, 'subsample': 0.5525586640193951, 'reg_alpha': 0.0050734059940489825, 'reg_lambda': 0.009108805451285784}. Best is trial 16 with value: 0.11887005753184866.


RMSLE: 0.11891
MAE: 15185.50470


[I 2025-08-22 20:27:54,608] Trial 22 finished with value: 0.12133830885698693 and parameters: {'max_depth': 2, 'learning_rate': 0.003675290172109596, 'n_estimators': 6314, 'min_child_weight': 2, 'colsample_bytree': 0.6849873132477713, 'subsample': 0.5603024519412966, 'reg_alpha': 0.002366858830181384, 'reg_lambda': 0.010811759045281961}. Best is trial 16 with value: 0.11887005753184866.


RMSLE: 0.12134
MAE: 15948.90769


[I 2025-08-22 20:29:02,277] Trial 23 finished with value: 0.1184007114666446 and parameters: {'max_depth': 3, 'learning_rate': 0.005004684869480905, 'n_estimators': 7456, 'min_child_weight': 3, 'colsample_bytree': 0.5115140763920388, 'subsample': 0.6815408607100708, 'reg_alpha': 0.0002885237032329038, 'reg_lambda': 0.0020316121366287065}. Best is trial 23 with value: 0.1184007114666446.


RMSLE: 0.11840
MAE: 15077.27261


[I 2025-08-22 20:30:07,864] Trial 24 finished with value: 0.11641004085105644 and parameters: {'max_depth': 3, 'learning_rate': 0.002490264152443762, 'n_estimators': 7312, 'min_child_weight': 1, 'colsample_bytree': 0.5688626262335408, 'subsample': 0.6941156910749311, 'reg_alpha': 0.0001729770888937333, 'reg_lambda': 0.002073504701148871}. Best is trial 24 with value: 0.11641004085105644.


RMSLE: 0.11641
MAE: 14840.26997


In [39]:
df_train, df_test = preprocess()
X_train = feature_engineering(df_train)
y_train = df_train.loc[:, "SalePrice"]

print (xgb_params)

xgb = XGBRegressor(**xgb_params, random_state=42)
rmsle, mae = evaluate_model(X_train, y_train, xgb);

{'max_depth': 3, 'learning_rate': 0.002490264152443762, 'n_estimators': 7312, 'min_child_weight': 1, 'colsample_bytree': 0.5688626262335408, 'subsample': 0.6941156910749311, 'reg_alpha': 0.0001729770888937333, 'reg_lambda': 0.002073504701148871}
RMSLE: 0.11666
MAE: 14803.85398


### Final Model Evaluation

**RMSLE: 0.11666**

**MAE: 14803.85398**