# Extreme Gradient Boosting with XGBoost

### [C3] Fine Tuning XGBoost Models

In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb

In [2]:
URL = 'https://assets.datacamp.com/production/repositories/943/datasets/4dbcaee889ef06fb0763e4a8652a4c1f268359b2/ames_housing_trimmed_processed.csv'

In [3]:
df = pd.read_csv(URL)
df.head()

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,Remodeled,GrLivArea,BsmtFullBath,BsmtHalfBath,...,HouseStyle_1.5Unf,HouseStyle_1Story,HouseStyle_2.5Fin,HouseStyle_2.5Unf,HouseStyle_2Story,HouseStyle_SFoyer,HouseStyle_SLvl,PavedDrive_P,PavedDrive_Y,SalePrice
0,60,65.0,8450,7,5,2003,0,1710,1,0,...,0,0,0,0,1,0,0,0,1,208500
1,20,80.0,9600,6,8,1976,0,1262,0,1,...,0,1,0,0,0,0,0,0,1,181500
2,60,68.0,11250,7,5,2001,1,1786,1,0,...,0,0,0,0,1,0,0,0,1,223500
3,70,60.0,9550,7,5,1915,1,1717,1,0,...,0,0,0,0,1,0,0,0,1,140000
4,60,84.0,14260,8,5,2000,0,2198,1,0,...,0,0,0,0,1,0,0,0,1,250000


Creating features and target arrays:

In [4]:
X, y = df.iloc[:, :-1], df.iloc[:, -1]

Common tree tunable parameters:

- learning rate
- gamma
- lambda
- alpha
- max_depth
- subsample - % sample used by tree
- colsample_bytree - % features used per tree

Let's start by seeing how the number of boosting rounds (number of trees in the ensemble) impacts the out-of-sample performace of the model:

In [5]:
df_dmatrix = xgb.DMatrix(data=X, label=y)

In [11]:
params = {"objective": "reg:linear", "max_depth": 3}

In [14]:
num_rounds = [5, 10, 15]
final_rmse_per_round = []

for curr_num_rounds in num_rounds:
    cv_results = xgb.cv(dtrain=df_dmatrix, params=params, nfold=3, num_boost_round=curr_num_rounds,
                        metrics='rmse', as_pandas=True, seed=123)

    rmse = cv_results['test-rmse-mean'].tail(1).values
    final_rmse_per_round.append(rmse)

df_restults = pd.DataFrame(list(zip(num_rounds, final_rmse_per_round)),
                           columns=['num_boosting_rounds', 'rmse'])



In [15]:
df_restults

Unnamed: 0,num_boosting_rounds,rmse
0,5,[50903.30078133333]
1,10,[34774.192708333336]
2,15,[32895.096354333335]


Now, instead of attempting to cherry pick the best possible number of boosting rounds, you can very easily have XGBoost automatically select the number of boosting rounds for you within `xgb.cv()` using the `early_stopping_rounds` parameter:

In [16]:
params = {"objective": "reg:linear", "max_depth": 4}

In [17]:
cv_results = xgb.cv(dtrain=df_dmatrix, params=params, nfold=3, num_boost_round=50,
                    early_stopping_rounds=10, metrics='rmse', as_pandas=True, seed=123)

cv_results



Unnamed: 0,train-rmse-mean,train-rmse-std,test-rmse-mean,test-rmse-std
0,141871.630208,403.632409,142640.651042,705.571916
1,103057.028646,73.769561,104907.666667,111.114933
2,75975.963541,253.734987,79262.059895,563.766991
3,57420.529948,521.653556,61620.135417,1087.690754
4,44552.955729,544.1692,50437.5625,1846.448017
5,35763.950521,681.797429,43035.660156,2034.469858
6,29861.464844,769.572234,38600.88151,2169.800969
7,25994.672526,756.520565,36071.81901,2109.797271
8,23306.833333,759.237086,34383.184896,1934.546688
9,21459.76888,745.624404,33509.142578,1887.377024


For further explanation on early stopping, [reference here](https://campus.datacamp.com/courses/extreme-gradient-boosting-with-xgboost/fine-tuning-your-xgboost-model?ex=4).

#### __Tuning hyperparamenters__

Let's start by tuning `eta` parameter:

In [18]:
params = {"objective": "reg:linear", "max_depth": 3}

In [19]:
eta_vals = [0.001, 0.01, 0.1]
best_rmse = []

for curr_val in eta_vals:
    params['eta'] = curr_val

    cv_results = xgb.cv(dtrain=df_dmatrix, params=params, nfold=3, early_stopping_rounds=5,
                        num_boost_round=10, metrics='rmse', as_pandas=True, seed=123)

    rmse = cv_results['test-rmse-mean'].tail(1).values
    best_rmse.append(rmse)

df_restults = pd.DataFrame(list(zip(eta_vals, best_rmse)), columns=['eta', 'rmse'])
df_restults



Unnamed: 0,eta,rmse
0,0.001,[195736.40104166666]
1,0.01,[179932.17708333334]
2,0.1,[79759.41406266666]


Let's tune `max_depth`:

In [20]:
params = {"objective": "reg:linear"}

In [21]:
max_depths = [2, 5, 10, 20]
best_rmse = []

for curr_val in max_depths:
    params['max_depth'] = curr_val

    cv_results = xgb.cv(dtrain=df_dmatrix, params=params, nfold=3, early_stopping_rounds=5,
                        num_boost_round=10, metrics='rmse', as_pandas=True, seed=123)

    rmse = cv_results['test-rmse-mean'].tail(1).values
    best_rmse.append(rmse)

df_restults = pd.DataFrame(list(zip(max_depths, best_rmse)), columns=['max_depth', 'rmse'])
df_restults



Unnamed: 0,max_depth,rmse
0,2,[37044.02994766666]
1,5,[33210.03971399999]
2,10,[34503.43098966667]
3,20,[34847.684895666665]


Let's work now with `colsample_bytree`:

In [22]:
params={"objective": "reg:linear","max_depth": 3}

In [23]:
colsample_bytree_vals = [0.1, 0.5, 0.8, 1]
best_rmse = []

for curr_val in colsample_bytree_vals:
    params['colsample_bytree'] = curr_val

    cv_results = xgb.cv(dtrain=df_dmatrix, params=params, nfold=3, early_stopping_rounds=5,
                        num_boost_round=10, metrics='rmse', as_pandas=True, seed=123)

    rmse = cv_results['test-rmse-mean'].tail(1).values
    best_rmse.append(rmse)

df_restults = pd.DataFrame(list(zip(colsample_bytree_vals, best_rmse)), columns=['colsample_bytree', 'rmse'])
df_restults



Unnamed: 0,colsample_bytree,rmse
0,0.1,[50195.166666666664]
1,0.5,[34190.986979]
2,0.8,[34759.016927]
3,1.0,[34774.192708333336]


#### __Using grid search__

Tuning a model by hand as we did earlier is time consuming. Scikit-Learn and provides with a tool called grid search, to make an exhaustive search over a given set of hyperparameters:

In [26]:
from sklearn.model_selection import GridSearchCV

Creating a parameter's grid:

In [28]:
param_grid = {
    'colsample_bytree': [0.3, 0.7],
    'n_estimators': [50],
    'max_depth': [2, 5]
}

In [25]:
xg_grid = xgb.XGBRegressor()

Performing a grid search:

In [29]:
grid_mse = GridSearchCV(estimator=xg_grid, param_grid=param_grid,
                        scoring='neg_mean_squared_error', cv=4, verbose=1)

In [30]:
grid_mse.fit(X, y)

Fitting 4 folds for each of 4 candidates, totalling 16 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  16 out of  16 | elapsed:    3.4s finished


GridSearchCV(cv=4,
             estimator=XGBRegressor(base_score=None, booster=None,
                                    colsample_bylevel=None,
                                    colsample_bynode=None,
                                    colsample_bytree=None,
                                    enable_categorical=False, gamma=None,
                                    gpu_id=None, importance_type=None,
                                    interaction_constraints=None,
                                    learning_rate=None, max_delta_step=None,
                                    max_depth=None, min_child_weight=None,
                                    missing=nan, monotone_constraints=None,
                                    n_estimators=100, n_jobs=None,
                                    num_parallel_tree=None, predictor=None,
                                    random_state=None, reg_alpha=None,
                                    reg_lambda=None, scale_pos_weight=None,
       

In [32]:
print(f'Best parameter found: {grid_mse.best_params_}')

best_rmse = np.sqrt(np.abs(grid_mse.best_score_))
print(f'Lowest RMSE found: {best_rmse}')

Best parameter found: {'colsample_bytree': 0.3, 'max_depth': 5, 'n_estimators': 50}
Lowest RMSE found: 29916.017850830365


Instead of selecting by hand the hyperparameters vaules, in practice, we would want to randomize the grid search as follows:

In [33]:
from sklearn.model_selection import RandomizedSearchCV

In [34]:
param_grid = {
    'n_estimators': [25],
    'max_depth': range(2, 12)
}

In [35]:
randomized_mse = RandomizedSearchCV(estimator=xg_grid, param_distributions=param_grid,
                                    scoring='neg_mean_squared_error', n_iter=5, cv=4, verbose=1)

In [36]:
randomized_mse.fit(X, y)

Fitting 4 folds for each of 5 candidates, totalling 20 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:    5.0s finished


RandomizedSearchCV(cv=4,
                   estimator=XGBRegressor(base_score=None, booster=None,
                                          colsample_bylevel=None,
                                          colsample_bynode=None,
                                          colsample_bytree=None,
                                          enable_categorical=False, gamma=None,
                                          gpu_id=None, importance_type=None,
                                          interaction_constraints=None,
                                          learning_rate=None,
                                          max_delta_step=None, max_depth=None,
                                          min_child_weight=None, missing=nan,
                                          monotone_constraints=None,
                                          n_estimators=100, n_jobs=None,
                                          num_parallel_tree=None,
                                          predicto

In [37]:
print(f'Best parametes found: {randomized_mse.best_params_}')

best_rmse = np.sqrt(np.abs(randomized_mse.best_score_))
print(f'Best RMSE found: {best_rmse}')

Best parametes found: {'n_estimators': 25, 'max_depth': 4}
Best RMSE found: 29998.4522530019
