In [4]:
# imports 
import pandas as pd 

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from xgboost import XGBRegressor

# Read the cleaned house-price table from the working directory.
houses = pd.read_csv('HousePricesClean.csv')

# Target is the price column; the features are every remaining column except
# the ones listed below (the target itself plus columns excluded from modelling).
y = houses['price']
X = houses.drop(
    columns=[
        'year_renovated',
        'basement_area',
        'price',
        'id',
        'post_code',
        'date',
    ]
)

# Hold out a validation split (scikit-learn's default test size); the fixed
# random_state keeps the split identical between runs.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, random_state=0)

In [5]:
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline

# Baseline XGBoost regressor with hand-picked hyperparameters.
my_model = XGBRegressor(learning_rate=0.05, n_estimators=574)
# Earlier single-split experiment, kept for reference:
#my_model.fit(X_train, y_train, eval_set = [(X_valid, y_valid)], verbose = True)

# Wrap the model in a one-step pipeline and score it with 5-fold cross
# validation over the full feature table.
my_pipeline = Pipeline(steps=[('model', my_model)])

# scikit-learn reports this scorer as a negated error (higher is better),
# so flip the sign to get the mean absolute error per fold.
scores = -cross_val_score(
    my_pipeline,
    X,
    y,
    scoring='neg_mean_absolute_error',
    cv=5,
)
print(scores)
print(scores.mean())

[67121.6982168  64975.70275628 60717.94836643 65360.74968729
 70108.0311324 ]
65656.82603184044


In [6]:
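# Manually picking the best hyperparameters.
# Format: learning_rate, n_estimators -> resulting score
# NOTE(review): the metric behind these scores is not recorded here
# (presumably a validation error, lower is better) - TODO confirm.
# learning_rate 0.1,  n_estimators 255 -> 123870
# learning_rate 0.05, n_estimators 298 -> 123458
# learning_rate 0.05, n_estimators 574 -> 121647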

In [7]:
# Summary statistics of the price column (the prediction target y).
houses['price'].describe()

count    1.461800e+04
mean     5.387994e+05
std      3.672410e+05
min      7.800000e+04
25%      3.200000e+05
50%      4.500000e+05
75%      6.450000e+05
max      7.700000e+06
Name: price, dtype: float64

In [3]:
# Hyperparameter grid for the exhaustive grid search.
# Each key is an XGBRegressor parameter; each list holds the values to try.
#   n_estimators  - the number of trees used
#   max_depth     - maximum depth of each tree
#   gamma         - minimum loss reduction required to make a further
#                   partition on a leaf node of the tree
#   learning_rate - how much each new tree contributes to the score
#                   (helps reduce overfitting)
search_space = dict(
    n_estimators=[200, 500, 1000],
    max_depth=[3, 6, 9],
    gamma=[0.01, 0.1],
    learning_rate=[0.05, 0.01, 0.1],
)

In [5]:
from sklearn.model_selection import GridSearchCV

# Exhaustive search over search_space with 5-fold cross validation.
# Two metrics are recorded for every candidate; refit="r2" makes R^2 the one
# that decides the best candidate and the model refitted at the end.
xgb_model = XGBRegressor()
GS = GridSearchCV(
    xgb_model,
    search_space,
    cv=5,
    scoring=["r2", "neg_root_mean_squared_error"],
    refit="r2",
    verbose=4,  # print one line per fold/candidate while fitting
)

In [6]:
# Run the grid search on the training split only; X_valid / y_valid are not
# passed to the search.
GS.fit(X=X_train, y=y_train)

Fitting 5 folds for each of 54 candidates, totalling 270 fits
[CV 1/5] END gamma=0.01, learning_rate=0.05, max_depth=3, n_estimators=200; neg_root_mean_squared_error: (test=-150598.676) r2: (test=0.825) total time=   0.1s
[CV 2/5] END gamma=0.01, learning_rate=0.05, max_depth=3, n_estimators=200; neg_root_mean_squared_error: (test=-135607.223) r2: (test=0.888) total time=   0.0s
[CV 3/5] END gamma=0.01, learning_rate=0.05, max_depth=3, n_estimators=200; neg_root_mean_squared_error: (test=-127153.733) r2: (test=0.884) total time=   0.0s
[CV 4/5] END gamma=0.01, learning_rate=0.05, max_depth=3, n_estimators=200; neg_root_mean_squared_error: (test=-139179.107) r2: (test=0.843) total time=   0.0s
[CV 5/5] END gamma=0.01, learning_rate=0.05, max_depth=3, n_estimators=200; neg_root_mean_squared_error: (test=-139931.078) r2: (test=0.868) total time=   0.0s
[CV 1/5] END gamma=0.01, learning_rate=0.05, max_depth=3, n_estimators=500; neg_root_mean_squared_error: (test=-144126.711) r2: (test=0.84

In [7]:
# Show the hyperparameter combination the grid search selected
# (the search was configured with refit="r2", so R^2 is the deciding metric).
print(GS.best_params_)

{'gamma': 0.01, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 1000}


In [8]:
# Tabulate every candidate from the grid search, best R^2 rank first, and
# save the table next to the notebook for later inspection.
cv_results = pd.DataFrame(GS.cv_results_).sort_values(by="rank_test_r2")
cv_results.to_csv("grid_search_results.csv")

In [8]:
# Final model: an unfitted XGBRegressor configured with the hyperparameters
# reported by the grid search (hard-coded so this cell does not require
# re-running the search).
final_model = XGBRegressor(
    gamma=0.01,
    learning_rate=0.1,
    max_depth=3,
    n_estimators=1000,
)



In [11]:
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.pipeline import Pipeline

# Build a small table with one row per CV fold and one column per metric.
#
# All metrics are collected from a single 5-fold cross_validate run, so the
# model is fitted once per fold instead of once per fold *per metric*.
#
# NOTE(review): the hyperparameters of final_model were tuned on X_train, but
# this evaluation cross-validates over the full X / y (which contains
# X_train), so the figures may be optimistic. X_valid / y_valid is the only
# data the search never saw - consider reporting scores on it as well.
error_metrics = ['mean_absolute_error', 'median_absolute_error', 'root_mean_squared_error']

# scikit-learn exposes error metrics as negated scorers ("neg_<name>");
# R^2 is already "higher is better" and is used as-is.
scoring = {metric: "neg_" + metric for metric in error_metrics}
scoring["r2"] = "r2"

my_pipeline = Pipeline(steps = [('model', final_model)])
cv_scores = cross_validate(my_pipeline, X, y, cv = 5, scoring = scoring)

# Flip the sign of the negated error scorers so the table shows plain errors.
model_evaluation = pd.DataFrame(
    {metric: -1 * cv_scores["test_" + metric] for metric in error_metrics}
)
model_evaluation["r2"] = cv_scores["test_r2"]
model_evaluation.head()


Unnamed: 0,mean_absolute_error,median_absolute_error,root_mean_squared_error,r2
0,69459.261743,39443.828125,150648.257125,0.844242
1,66292.215282,39742.4375,116983.945845,0.885203
2,63546.557635,37254.765625,110748.680924,0.904534
3,66761.018135,39713.171875,139108.232069,0.875076
4,83049.796247,45120.40625,136935.245083,0.850357


In [9]:
from pickle import dump

# Fit the final model on the training split (X_valid / y_valid are not used
# here) and persist the fitted estimator to disk.
# NOTE(review): a pickled estimator is presumably only loadable with
# compatible library versions - confirm against the XGBoost model IO docs
# before relying on this file long term.
final_model.fit(X_train, y_train)
with open("final_model.pkl", mode="wb") as model_file:
    dump(final_model, model_file, protocol=5)