Datacamp: "Extreme Gradient Boosting with XGBoost in Python"

In [None]:
import pandas as pd
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV
import xgboost as xgb
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Load the Ames housing CSV; the relative path is resolved against the
# kernel's working directory.
housing_csv = 'ames_housing_trimmed_processed.csv'
housing = pd.read_csv(housing_csv)

# Exploratory Data Analysis

In [None]:
# Preview the first rows of the loaded frame as a quick sanity check.
housing.head()

In [None]:
# List the column labels available in the frame.
housing.columns

In [None]:
# Split the frame into target and features by column *name*.
# A positional slice (`iloc[:, :-1]`) only works while 'SalePrice' happens to
# be the last column; otherwise the target would leak into X and a genuine
# feature would be dropped instead.
y = housing['SalePrice']
X = housing.drop(columns='SalePrice')

# Hyperparameter Tuning

In [None]:
# Baseline: 3-fold cross-validated RMSE for depth-4 trees, with at most 50
# boosting rounds and early stopping after 10 rounds.
housing_dmatrix = xgb.DMatrix(data=X, label=y)

# "reg:squarederror" replaces "reg:linear", which XGBoost deprecated in its
# favour (same squared-error loss).
params = {"objective": "reg:squarederror", "max_depth": 4}

cv_results = xgb.cv(
    dtrain=housing_dmatrix,
    params=params,
    metrics='rmse',
    nfold=3,
    early_stopping_rounds=10,
    num_boost_round=50,
    seed=123,
)
print(cv_results)

In [None]:
# Compare the final cross-validated test RMSE after 5, 10 and 15 boosting rounds.
housing_dmatrix = xgb.DMatrix(data=X, label=y)

# "reg:squarederror" replaces "reg:linear", which XGBoost deprecated in its
# favour (same squared-error loss).
params = {"objective": "reg:squarederror", "max_depth": 3}
num_rounds = [5, 10, 15]
final_rmse_per_round = []

for curr_num_rounds in num_rounds:
    cv_results = xgb.cv(
        dtrain=housing_dmatrix,
        params=params,
        nfold=3,
        num_boost_round=curr_num_rounds,
        metrics="rmse",
        as_pandas=True,
        seed=123,
    )
    # Last row of the CV table = mean test RMSE after the final round.
    final_rmse_per_round.append(cv_results["test-rmse-mean"].iloc[-1])

num_rounds_rmses = list(zip(num_rounds, final_rmse_per_round))
print(pd.DataFrame(num_rounds_rmses, columns=["num_boosting_rounds", "rmse"]))

In [None]:
# Compare the cross-validated test RMSE for several learning rates (eta),
# using 3 folds, at most 10 boosting rounds and early stopping after 5 rounds.
housing_dmatrix = xgb.DMatrix(data=X, label=y)

# "reg:squarederror" replaces "reg:linear", which XGBoost deprecated in its
# favour (same squared-error loss).
params = {"objective": "reg:squarederror", "max_depth": 3}
eta_vals = [0.001, 0.01, 0.1]
best_rmse = []

for curr_val in eta_vals:
    # Only the learning rate changes between runs; the other params are shared.
    params["learning_rate"] = curr_val
    cv_results = xgb.cv(
        dtrain=housing_dmatrix,
        params=params,
        nfold=3,
        early_stopping_rounds=5,
        num_boost_round=10,
        metrics='rmse',
        seed=123,
    )
    # Last row of the CV table is the score recorded for this learning rate.
    best_rmse.append(cv_results["test-rmse-mean"].iloc[-1])

print(pd.DataFrame(list(zip(eta_vals, best_rmse)), columns=["learning_rate", "best_rmse"]))

In [None]:
# Compare the cross-validated test RMSE for several tree depths, using
# 2 folds, at most 10 boosting rounds and early stopping after 5 rounds.
housing_dmatrix = xgb.DMatrix(data=X, label=y)

# "reg:squarederror" replaces "reg:linear", which XGBoost deprecated in its
# favour (same squared-error loss).
params = {"objective": "reg:squarederror"}
max_depths = [2, 5, 10, 20]
best_rmse = []

for curr_val in max_depths:
    # The booster parameter is "max_depth" (singular). The previous key,
    # "max_depths", is not an XGBoost parameter, so the depth under test was
    # never applied and every run produced the same score.
    params["max_depth"] = curr_val
    cv_results = xgb.cv(
        dtrain=housing_dmatrix,
        params=params,
        nfold=2,
        metrics='rmse',
        seed=123,
        early_stopping_rounds=5,
        num_boost_round=10,
    )
    # Last row of the CV table is the score recorded for this depth.
    best_rmse.append(cv_results["test-rmse-mean"].iloc[-1])

print(pd.DataFrame(list(zip(max_depths, best_rmse)), columns=["max_depth", "best_rmse"]))

In [None]:
# Compare the cross-validated test RMSE for several per-tree column sampling
# fractions, using 2 folds, at most 10 boosting rounds and early stopping
# after 5 rounds.
housing_dmatrix = xgb.DMatrix(data=X, label=y)

# "reg:squarederror" replaces "reg:linear", which XGBoost deprecated in its
# favour (same squared-error loss).
params = {"objective": "reg:squarederror", "max_depth": 3}
colsample_bytree_vals = [0.1, 0.5, 0.8, 1]
best_rmse = []

for curr_val in colsample_bytree_vals:
    # The key must be the bare parameter name. The previous key contained
    # literal double quotes ('"colsample_bytree"'), which is not an XGBoost
    # parameter, so the sampling fraction under test was never applied.
    params["colsample_bytree"] = curr_val
    cv_results = xgb.cv(
        dtrain=housing_dmatrix,
        params=params,
        nfold=2,
        num_boost_round=10,
        early_stopping_rounds=5,
        metrics="rmse",
        as_pandas=True,
        seed=123,
    )
    # Last row of the CV table is the score recorded for this fraction.
    best_rmse.append(cv_results["test-rmse-mean"].iloc[-1])

print(pd.DataFrame(list(zip(colsample_bytree_vals, best_rmse)), columns=["colsample_bytree", "best_rmse"]))

# GridSearchCV

Limitation: the number of models you must build grows very quickly with every additional parameter, because it is the product of the number of values tried for each one.

In [None]:
# DMatrix rebuilt as in the earlier cells; the scikit-learn search below is
# fitted on X and y directly.
housing_dmatrix = xgb.DMatrix(data=X, label=y)

# 2 x 1 x 2 = 4 candidate settings, each scored with 4-fold cross-validation.
gbm_param_grid = {
    'colsample_bytree': [0.3, 0.7],
    'n_estimators': [50],
    'max_depth': [2, 5],
}
gbm = xgb.XGBRegressor()

grid_mse = GridSearchCV(
    estimator=gbm,
    param_grid=gbm_param_grid,
    scoring='neg_mean_squared_error',
    cv=4,
    verbose=1,
)
grid_mse.fit(X, y)

# The scorer is a negated MSE, so take its magnitude before the square root.
lowest_rmse = np.sqrt(np.abs(grid_mse.best_score_))
print("Best parameters found: ", grid_mse.best_params_)
print("Lowest RMSE found: ", lowest_rmse)

# RandomizedSearchCV

Limitation: the parameter space to explore can be massive, so a limited number of random draws covers only a small part of it.

In [None]:
# Randomized search: evaluate 5 of the 10 possible max_depth settings
# (depths 2..11), each scored with 4-fold cross-validation.
gbm_param_grid = {
    'n_estimators': [25],
    'max_depth': range(2, 12)
}

# The search sets n_estimators from gbm_param_grid (25) on every candidate,
# so the value given here is not the one the evaluated models use.
gbm = xgb.XGBRegressor(n_estimators=10)

randomized_mse = RandomizedSearchCV(
    estimator=gbm,
    param_distributions=gbm_param_grid,
    scoring="neg_mean_squared_error",
    n_iter=5,
    cv=4,
    verbose=1,
    # Fix the sampler seed so the same 5 candidates are drawn on every run
    # (123 matches the seed used for xgb.cv elsewhere in this notebook).
    random_state=123,
)

randomized_mse.fit(X, y)

# The scorer is a negated MSE, so take its magnitude before the square root.
print("Best parameters found: ", randomized_mse.best_params_)
print("Lowest RMSE found: ", np.sqrt(np.abs(randomized_mse.best_score_)))