In [1]:
# Libraries
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Data prep: download the Ames housing CSV into a DataFrame
ames = pd.read_csv("https://assets.datacamp.com/production/course_3679/datasets/ames_unprocessed_data.csv")

# Features are the positional column slice 8:-2 (the last two columns are
# excluded); the last column is the target that is passed to XGBoost as the
# label.
# NOTE(review): the slice is positional, so it silently depends on the CSV's
# column order -- confirm it still selects the intended feature columns.
X = ames.iloc[:, 8:-2]
y = ames.iloc[:, -1]

In [2]:
# Create the DMatrix: housing_dmatrix
housing_dmatrix = xgb.DMatrix(data=X, label=y)

# Parameter dictionary for each tree: params.
# "reg:squarederror" is the current name of the squared-error regression
# objective; the older "reg:linear" alias is deprecated.
params = {"objective": "reg:squarederror", "max_depth": 3}

# Numbers of boosting rounds to compare
num_rounds = [5, 10, 15]

# Final-round mean test RMSE, one entry per value in num_rounds
final_rmse_per_round = []

# Build one cross-validated model per num_boost_round value
for curr_num_rounds in num_rounds:

    # 3-fold cross-validation with a fixed seed so the fold split is repeatable
    cv_results = xgb.cv(dtrain=housing_dmatrix,
                        params=params,
                        nfold=3,
                        num_boost_round=curr_num_rounds,
                        metrics="rmse",
                        as_pandas=True,
                        seed=123)

    # The last row of the history is the final boosting round
    final_rmse_per_round.append(cv_results["test-rmse-mean"].iloc[-1])

# Print the resultant DataFrame
num_rounds_rmses = list(zip(num_rounds, final_rmse_per_round))
print(pd.DataFrame(num_rounds_rmses, columns=["num_boosting_rounds", "rmse"]))

   num_boosting_rounds          rmse
0                    5  53743.005208
1                   10  36522.906250
2                   15  34392.015625


In [3]:
# Create your housing DMatrix: housing_dmatrix
housing_dmatrix = xgb.DMatrix(data=X, label=y)

# Parameter dictionary for each tree: params.
# "reg:squarederror" is the current name of the squared-error regression
# objective; the older "reg:linear" alias is deprecated.
params = {"objective": "reg:squarederror", "max_depth": 4}

# Cross-validate for at most 50 boosting rounds, stopping early once the
# cross-validated RMSE has gone 10 rounds without improving.
cv_results = xgb.cv(dtrain=housing_dmatrix,
                    params=params,
                    nfold=3,  # xgb.cv's default fold count, stated explicitly
                    num_boost_round=50,
                    early_stopping_rounds=10,
                    metrics="rmse",
                    as_pandas=True,
                    seed=123)

# Print cv_results (one row per boosting round that was kept)
print(cv_results)

    test-rmse-mean  test-rmse-std  train-rmse-mean  train-rmse-std
0    142892.885417     236.361461    142480.005208      490.910858
1    104754.695312     848.913524    104181.117187      247.417632
2     78863.869792    1105.393938     77528.343750      147.678522
3     61609.513021    1630.284897     59411.843750      228.173655
4     50341.221354    1679.488102     46907.351563      231.686642
5     42951.391927    1376.968977     38602.959636      197.224172
6     38632.955729    1021.952690     33081.843750      118.713353
7     35967.014323     782.869883     29480.868489      140.258307
8     34292.915364     488.589550     27017.141276      235.715252
9     33306.989583     205.749927     25221.238932      181.201290
10    32677.056640      77.830031     23917.205078       68.229945
11    32414.880209     153.244428     23147.573568       46.121184
12    32082.016927     395.420825     22413.623047      125.869531
13    31919.809896     395.424739     21944.130208      188.00

In [4]:
# Create your housing DMatrix: housing_dmatrix
housing_dmatrix = xgb.DMatrix(data=X, label=y)

# Parameter dictionary for each tree (boosting round).
# "reg:squarederror" is the current name of the squared-error regression
# objective; the older "reg:linear" alias is deprecated.
params = {"objective": "reg:squarederror", "max_depth": 3}

# Learning rates to compare, and the last-row test RMSE obtained for each
eta_vals = [0.001, 0.01, 0.1]
best_rmse = []

# Systematically vary the eta
for curr_val in eta_vals:

    # Overwrite the learning rate in place; every other parameter is shared
    params["eta"] = curr_val

    # 3-fold cross-validation, at most 10 rounds, early stopping after 5
    # rounds without improvement
    cv_results = xgb.cv(dtrain=housing_dmatrix,
                        params=params,
                        nfold=3,
                        num_boost_round=10,
                        early_stopping_rounds=5,
                        metrics="rmse",
                        as_pandas=True,
                        seed=123)

    # Record the mean test RMSE from the last row of the returned history
    best_rmse.append(cv_results["test-rmse-mean"].iloc[-1])

# Print the resultant DataFrame
print(pd.DataFrame(list(zip(eta_vals, best_rmse)), columns=["eta", "best_rmse"]))

     eta      best_rmse
0  0.001  195751.286458
1  0.010  180072.510417
2  0.100   80951.346354


In [5]:
# Create your housing DMatrix
housing_dmatrix = xgb.DMatrix(data=X, label=y)

# Parameter dictionary; max_depth is filled in per iteration below.
# "reg:squarederror" is the current name of the squared-error regression
# objective; the older "reg:linear" alias is deprecated.
params = {"objective": "reg:squarederror"}

# Tree depths to compare, and the last-row test RMSE obtained for each
max_depths = [2, 5, 10, 20]
best_rmse = []

# Systematically vary the max_depth
for curr_val in max_depths:

    # Overwrite the depth in place; the objective is shared across runs
    params["max_depth"] = curr_val

    # 2-fold cross-validation, at most 10 rounds, early stopping after 5
    # rounds without improvement
    cv_results = xgb.cv(dtrain=housing_dmatrix,
                        params=params,
                        nfold=2,
                        num_boost_round=10,
                        early_stopping_rounds=5,
                        metrics="rmse",
                        as_pandas=True,
                        seed=123)

    # Record the mean test RMSE from the last row of the returned history
    best_rmse.append(cv_results["test-rmse-mean"].iloc[-1])

# Print the resultant DataFrame
print(pd.DataFrame(list(zip(max_depths, best_rmse)), columns=["max_depth", "best_rmse"]))

   max_depth     best_rmse
0          2  41615.080078
1          5  37130.722656
2         10  37970.410157
3         20  38235.009765


In [6]:
# Create your housing DMatrix
housing_dmatrix = xgb.DMatrix(data=X, label=y)

# Parameter dictionary; colsample_bytree is filled in per iteration below.
# "reg:squarederror" is the current name of the squared-error regression
# objective; the older "reg:linear" alias is deprecated.
params = {"objective": "reg:squarederror", "max_depth": 3}

# Column-subsampling fractions to compare, and the last-row test RMSE for each
colsample_bytree_vals = [0.1, 0.5, 0.8, 1]
best_rmse = []

# Systematically vary the hyperparameter value
for curr_val in colsample_bytree_vals:

    # Overwrite the fraction in place; the other parameters are shared
    params["colsample_bytree"] = curr_val

    # 2-fold cross-validation, at most 10 rounds, early stopping after 5
    # rounds without improvement
    cv_results = xgb.cv(dtrain=housing_dmatrix,
                        params=params,
                        nfold=2,
                        num_boost_round=10,
                        early_stopping_rounds=5,
                        metrics="rmse",
                        as_pandas=True,
                        seed=123)

    # Record the mean test RMSE from the last row of the returned history
    best_rmse.append(cv_results["test-rmse-mean"].iloc[-1])

# Print the resultant DataFrame
print(pd.DataFrame(list(zip(colsample_bytree_vals, best_rmse)), columns=["colsample_bytree", "best_rmse"]))


   colsample_bytree     best_rmse
0               0.1  59161.455079
1               0.5  40903.736328
2               0.8  39494.525391
3               1.0  39629.414062


In [7]:
# load extra modules
from sklearn.model_selection import GridSearchCV

# Create the parameter grid: gbm_param_grid
# 2 colsample_bytree values x 1 n_estimators value x 2 max_depth values
# gives 4 candidate settings.
gbm_param_grid = {
    'colsample_bytree': [0.3, 0.7],
    'n_estimators': [50],
    'max_depth': [2, 5]
}

# Instantiate the regressor: gbm
gbm = xgb.XGBRegressor()

# Perform grid search: grid_mse
# Scoring is the negated MSE, so the reported score is converted back to an
# RMSE below with abs() and sqrt().
grid_mse = GridSearchCV(estimator=gbm,
                        param_grid=gbm_param_grid,
                        scoring="neg_mean_squared_error",
                        cv=4,
                        verbose=1)

# Fit grid_mse to the data. The scikit-learn wrapper takes X and y directly,
# so no DMatrix is built in this cell.
grid_mse.fit(X, y)

# Print the best parameters and lowest RMSE
print("Best parameters found: ", grid_mse.best_params_)
print("Lowest RMSE found: ", np.sqrt(np.abs(grid_mse.best_score_)))

Fitting 4 folds for each of 4 candidates, totalling 16 fits
Best parameters found:  {'colsample_bytree': 0.7, 'max_depth': 5, 'n_estimators': 50}
Lowest RMSE found:  32397.6579836


[Parallel(n_jobs=1)]: Done  16 out of  16 | elapsed:    0.7s finished


In [8]:
# load extra modules
from sklearn.model_selection import RandomizedSearchCV

# Create the search space: gbm_param_grid
# n_estimators is pinned to a single value; max_depth is drawn from 2..11.
gbm_param_grid = {
    'n_estimators': [25],
    'max_depth': range(2, 12)
}

# Instantiate the regressor: gbm
# The n_estimators=10 given here is overridden during the search, because
# 'n_estimators' is also a key of the search space above.
gbm = xgb.XGBRegressor(n_estimators=10)

# Perform random search: randomized_mse
# n_iter=5 candidates are sampled from the search space; random_state pins
# which ones, so re-running the cell evaluates the same candidates.
randomized_mse = RandomizedSearchCV(estimator=gbm,
                                    param_distributions=gbm_param_grid,
                                    scoring="neg_mean_squared_error",
                                    n_iter=5,
                                    cv=4,
                                    verbose=1,
                                    random_state=123)

# Fit randomized_mse to the data
randomized_mse.fit(X, y)

# Print the best parameters and lowest RMSE
# Scoring is the negated MSE, hence abs() before the square root.
print("Best parameters found: ", randomized_mse.best_params_)
print("Lowest RMSE found: ", np.sqrt(np.abs(randomized_mse.best_score_)))


Fitting 4 folds for each of 5 candidates, totalling 20 fits
Best parameters found:  {'n_estimators': 25, 'max_depth': 5}
Lowest RMSE found:  37479.1964196


[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:    1.2s finished
