In [2]:
# imports 
import pandas as pd

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import (
    GridSearchCV,
    cross_val_score,
    cross_validate,
    train_test_split,
)
from sklearn.pipeline import Pipeline
from xgboost import XGBRegressor

# Load the house data from HousePricesClean.csv.
houses = pd.read_csv('HousePricesClean.csv')

# The target is the `price` column; the features are every other column except
# the ones listed in `excluded_columns`.
excluded_columns = ['year_renovated', 'basement_area', 'price', 'id']
y = houses['price']
X = houses.drop(columns = excluded_columns)

# Train/validation split with a fixed seed so the partition is reproducible.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, random_state = 0)

In [3]:
# Baseline XGBoost regressor with the manually chosen settings
# (n_estimators = 574, learning_rate = 0.05).
my_model = XGBRegressor(n_estimators = 574, learning_rate = 0.05)
# Single fit with per-round evaluation on the validation split, kept for reference only:
#my_model.fit(X_train, y_train, eval_set = [(X_valid, y_valid)], verbose = True)

# 5-fold cross-validation on the full X / y.
# The scorer returns negated MAE (so that higher is better); multiplying by -1
# turns it back into a plain mean absolute error per fold.
my_pipeline = Pipeline(steps = [('model', my_model)])
scores = -1 * cross_val_score(my_pipeline, X, y, cv = 5, scoring = 'neg_mean_absolute_error')
print(scores)
print(scores.mean())

[67329.52248098 64540.58012889 60811.25490285 64631.10251133
 74042.26344135]
66270.94469307987


In [4]:
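# Manually picked hyperparameters and the score recorded for each.
# NOTE(review): the metric is not recorded here - presumably the validation error
# from the eval_set fit (lower is better); confirm before relying on these numbers.
#   learning_rate = 0.1,  n_estimators = 255 -> 123870
#   learning_rate = 0.05, n_estimators = 298 -> 123458
#   learning_rate = 0.05, n_estimators = 574 -> 121647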

In [6]:
# Summary statistics of the target column, to put the error metrics in context.
houses['price'].describe()

count    1.461800e+04
mean     5.387994e+05
std      3.672410e+05
min      7.800000e+04
25%      3.200000e+05
50%      4.500000e+05
75%      6.450000e+05
max      7.700000e+06
Name: price, dtype: float64

In [7]:
# Hyperparameter grid for the grid search: each key is an XGBRegressor
# parameter name, each value is the list of candidate settings to try.
search_space = dict(
    n_estimators = [100, 200, 500, 1000],   # number of trees used
    max_depth = [3, 6, 9],   # maximum depth of each tree
    gamma = [0.01, 0.1],   # minimum loss reduction required to split a leaf node further
    learning_rate = [0.001, 0.05, 0.01, 0.1],   # how much each new tree contributes (helps reduce overfitting)
)

In [8]:
# Exhaustive grid search over `search_space` using 5-fold cross-validation.
# Two metrics are recorded for every candidate (R^2 and negated RMSE).
# With multiple metrics, `refit` names the one used to pick the best candidate,
# so best_params_ / best_estimator_ are chosen by R^2.
xgb_model = XGBRegressor()
GS = GridSearchCV(estimator = xgb_model,
                  param_grid = search_space,
                  scoring = ["r2", "neg_root_mean_squared_error"],
                  refit = "r2",
                  cv = 5,
                  verbose = 4)  # verbose = 4 prints one line per fold with its scores

In [10]:
# Run the grid search on the training split only.
GS.fit(X = X_train, y = y_train)

Fitting 5 folds for each of 96 candidates, totalling 480 fits
[CV 1/5] END gamma=0.01, learning_rate=0.001, max_depth=3, n_estimators=100; neg_root_mean_squared_error: (test=-340896.597) r2: (test=0.104) total time=   0.0s
[CV 2/5] END gamma=0.01, learning_rate=0.001, max_depth=3, n_estimators=100; neg_root_mean_squared_error: (test=-382931.612) r2: (test=0.108) total time=   0.0s
[CV 3/5] END gamma=0.01, learning_rate=0.001, max_depth=3, n_estimators=100; neg_root_mean_squared_error: (test=-353080.633) r2: (test=0.105) total time=   0.0s
[CV 4/5] END gamma=0.01, learning_rate=0.001, max_depth=3, n_estimators=100; neg_root_mean_squared_error: (test=-329113.140) r2: (test=0.124) total time=   0.0s
[CV 5/5] END gamma=0.01, learning_rate=0.001, max_depth=3, n_estimators=100; neg_root_mean_squared_error: (test=-364672.059) r2: (test=0.102) total time=   0.0s
[CV 1/5] END gamma=0.01, learning_rate=0.001, max_depth=3, n_estimators=200; neg_root_mean_squared_error: (test=-323898.264) r2: (tes

In [11]:
# Show the hyperparameter combination the grid search ranked best.
best_params = GS.best_params_
print(best_params)

{'gamma': 0.01, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 1000}


In [12]:
# Tabulate the per-candidate grid-search results, ordered by their R^2 rank,
# and write them to disk for inspection.
cv_results = pd.DataFrame(GS.cv_results_).sort_values(by = "rank_test_r2")
cv_results.to_csv("grid_search_results.csv")

# Reading the ranked results, we might pick the 3rd-best model: it uses only
# 500 trees instead of 1000, at a very small sacrifice in performance.
#   gamma = 0.01
#   learning_rate = 0.1
#   max_depth = 3
#   n_estimators = 500

In [20]:
# Final model: the 500-tree configuration chosen from the grid-search results.
final_params = {"n_estimators": 500, "learning_rate": 0.1, "max_depth": 3, "gamma": 0.01}
final_model = XGBRegressor(**final_params)


In [21]:

# Build a small dataframe evaluating the final model with several metrics,
# one row per cross-validation fold.
#
# cross_validate scores every metric from a single 5-fold pass: each fold is
# fitted once and all scorers are applied to that fit. Calling cross_val_score
# once per metric would refit the same folds again for every metric.
error_metrics = ['mean_absolute_error', 'median_absolute_error', 'root_mean_squared_error']
scoring = {metric: "neg_" + metric for metric in error_metrics}
scoring["r2"] = "r2"

my_pipeline = Pipeline(steps = [('model', final_model)])
cv_scores = cross_validate(my_pipeline, X, y, cv = 5, scoring = scoring)

# The error scorers are negated by sklearn (higher is better), so flip the sign
# back; R^2 is reported as-is. Result keys are "test_<name>" for each scorer name.
model_evaluation = pd.DataFrame({
    metric: -1 * cv_scores["test_" + metric] for metric in error_metrics
})
model_evaluation["r2"] = cv_scores["test_r2"]

# NOTE(review): X contains the rows the grid search was fitted on (X_train), so
# these scores may be somewhat optimistic; X_valid is not scored separately here.
model_evaluation.head()


Unnamed: 0,mean_absolute_error,median_absolute_error,root_mean_squared_error,r2
0,68690.174154,41089.125,149731.916979,0.846131
1,66435.689491,38829.015625,118035.070884,0.883131
2,62992.139067,37852.210938,110656.686148,0.904693
3,67475.06397,39382.15625,139864.094307,0.873714
4,83111.23995,46062.625,135735.603261,0.852968
