In [8]:
# imports
import pandas as pd 

from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

# Load the cleaned house-price dataset into a DataFrame.
houses = pd.read_csv(filepath_or_buffer='HousePricesClean.csv')

# Preview the first five rows to sanity-check the columns that were loaded.
houses.head(n=5)

Unnamed: 0,id,date,bedrooms,bathrooms,living_area,lot_area,floors,waterfront,views,condition,...,year_built,year_renovated,post_code,lattitude,longitude,living_area_renov,lot_area_renov,schools,airport_distance,price
0,6762810635,42491,4,2.5,2920,4000,1.5,0,0,5,...,1909,0,122004,52.8878,-114.47,2470,4000,2,51,1400000
1,6762810998,42491,5,2.75,2910,9480,1.5,0,0,3,...,1939,0,122004,52.8852,-114.468,2940,6600,1,53,1200000
2,6762812605,42491,4,2.5,3310,42998,2.0,0,0,3,...,2001,0,122005,52.9532,-114.321,3350,42847,3,76,838000
3,6762812919,42491,3,2.0,2710,4500,1.5,0,0,4,...,1929,0,122006,52.9047,-114.485,2060,4500,1,51,805000
4,6762813105,42491,3,2.5,2600,4750,1.0,0,0,4,...,1951,0,122007,52.9133,-114.59,2380,4750,1,67,790000


In [10]:
# Target: the price column is the value the model learns to predict.
y = houses['price']

# Inputs: the subset of columns the model is allowed to see.
X = houses[[
    'bedrooms',
    'bathrooms',
    'year_built',
    'post_code',
    'condition',
    'lot_area',
    'house_area',
]]

# Hold out part of the rows for evaluation. No test_size is given, so
# scikit-learn's default proportion applies; the fixed random_state keeps
# the split the same from run to run.
train_X, test_X, train_y, test_y = train_test_split(X, y, random_state=0)

In [17]:
# Baseline model: a decision tree with no size limits passed in.
# random_state is pinned so the fitted tree is reproducible.
house_model = DecisionTreeRegressor(random_state = 1)
house_model.fit(train_X, train_y)

# Evaluate on the held-out rows. scikit-learn metrics take the ground truth
# first and the predictions second; MAE happens to be symmetric, but passing
# both by keyword keeps the call correct if the metric is ever swapped for an
# asymmetric one (e.g. r2_score).
predictions = house_model.predict(test_X)
mean_absolute_error(y_true=test_y, y_pred=predictions)

154479.89151846786

In [18]:
# Sweep max_leaf_nodes over a few orders of magnitude and report the
# held-out mean absolute error for each setting.
# NOTE: `model` and `predictions` are rebound on every iteration, so after the
# loop they hold the last (5000-leaf) fit, and `predictions` no longer refers
# to the baseline model's output.
for leaf in [5, 50, 500, 5000]:

    # fit a tree capped at `leaf` leaf nodes (seed fixed for reproducibility)
    model = DecisionTreeRegressor(random_state = 1, max_leaf_nodes = leaf)
    model.fit(train_X, train_y)
    predictions = model.predict(test_X)

    # score on the held-out rows: ground truth first, predictions second, as
    # the metric's signature expects; int() truncates the error for display
    mae = int(mean_absolute_error(y_true=test_y, y_pred=predictions))
    print(f"Max leaf nodes: {leaf}    Mean absolute error: {mae}")
    

Max leaf nodes: 5    Mean absolute error: 189644
Max leaf nodes: 50    Mean absolute error: 156848
Max leaf nodes: 500    Mean absolute error: 147307
Max leaf nodes: 5000    Mean absolute error: 156088


In [16]:
# Of the candidate sizes tried in the sweep (5, 50, 500, 5000), 500 leaf nodes
# gave the lowest mean absolute error, so that value is used from here on.
max_leaf_nodes: int = 500