In [1]:
import pandas as pd

melbourne_file_path = './data/melb_data.csv'
melbourne_data = pd.read_csv(melbourne_file_path)

melbourne_data.describe()

Unnamed: 0,Rooms,Price,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,Lattitude,Longtitude,Propertycount
count,13580.0,13580.0,13580.0,13580.0,13580.0,13580.0,13518.0,13580.0,7130.0,8205.0,13580.0,13580.0,13580.0
mean,2.937997,1075684.0,10.137776,3105.301915,2.914728,1.534242,1.610075,558.416127,151.96765,1964.684217,-37.809203,144.995216,7454.417378
std,0.955748,639310.7,5.868725,90.676964,0.965921,0.691712,0.962634,3990.669241,541.014538,37.273762,0.07926,0.103916,4378.581772
min,1.0,85000.0,0.0,3000.0,0.0,0.0,0.0,0.0,0.0,1196.0,-38.18255,144.43181,249.0
25%,2.0,650000.0,6.1,3044.0,2.0,1.0,1.0,177.0,93.0,1940.0,-37.856822,144.9296,4380.0
50%,3.0,903000.0,9.2,3084.0,3.0,1.0,2.0,440.0,126.0,1970.0,-37.802355,145.0001,6555.0
75%,3.0,1330000.0,13.0,3148.0,3.0,2.0,2.0,651.0,174.0,1999.0,-37.7564,145.058305,10331.0
max,10.0,9000000.0,48.1,3977.0,20.0,8.0,10.0,433014.0,44515.0,2018.0,-37.40853,145.52635,21650.0


In [2]:
melbourne_data['Rooms'].mean().round()

3.0

In [3]:
melbourne_data['YearBuilt'].max()

2018.0

In [4]:
melbourne_data.columns

Index(['Suburb', 'Address', 'Rooms', 'Type', 'Price', 'Method', 'SellerG',
       'Date', 'Distance', 'Postcode', 'Bedroom2', 'Bathroom', 'Car',
       'Landsize', 'BuildingArea', 'YearBuilt', 'CouncilArea', 'Lattitude',
       'Longtitude', 'Regionname', 'Propertycount'],
      dtype='object')

In [5]:
melbourne_data = melbourne_data.dropna(axis=0)

In [6]:
y = melbourne_data.Price

In [7]:
melbourne_features = ['Rooms', 'Bathroom', 'Landsize', 'Lattitude', 'Longtitude']

X = melbourne_data[melbourne_features]

In [8]:
X.head()

Unnamed: 0,Rooms,Bathroom,Landsize,Lattitude,Longtitude
1,2,1.0,156.0,-37.8079,144.9934
2,3,2.0,134.0,-37.8093,144.9944
4,4,1.0,120.0,-37.8072,144.9941
6,3,2.0,245.0,-37.8024,144.9993
7,2,1.0,256.0,-37.806,144.9954


In [9]:
from sklearn.tree import DecisionTreeRegressor


# Model definition
melbourne_model = DecisionTreeRegressor(random_state=1)

# Fit model
melbourne_model.fit(X, y)

In [10]:
print("Making predictions for the following 5 houses:")
print(X.head())
print("The predictions are")
print(melbourne_model.predict(X.head()))

predicted = melbourne_model.predict(X)

Making predictions for the following 5 houses:
   Rooms  Bathroom  Landsize  Lattitude  Longtitude
1      2       1.0     156.0   -37.8079    144.9934
2      3       2.0     134.0   -37.8093    144.9944
4      4       1.0     120.0   -37.8072    144.9941
6      3       2.0     245.0   -37.8024    144.9993
7      2       1.0     256.0   -37.8060    144.9954
The predictions are
[1035000. 1465000. 1600000. 1876000. 1636000.]


In [11]:
from sklearn.metrics import mean_absolute_error

mean_absolute_error(y, predicted)

1115.7467183128902

In [12]:
from sklearn.model_selection import train_test_split

train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=0)

melbourne_model = DecisionTreeRegressor()

melbourne_model.fit(train_X, train_y)

val_predicitons = melbourne_model.predict(val_X)

print(mean_absolute_error(val_y, val_predicitons))

276100.7540348612


In [21]:
print(val_predicitons[0:5])
print(val_y)

[ 900000.  696750. 1015000. 1447500.  630000.]
4850      815000.0
2307      655000.0
10090     957500.0
3645     1330000.0
4930      722000.0
           ...    
8223      520000.0
11190     870000.0
8563      200000.0
1867     1002000.0
8375     1710000.0
Name: Price, Length: 1549, dtype: float64


In [23]:
mean_absolute_error(val_y, val_predicitons)

276100.7540348612

In [24]:
def get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y):
    model = DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes, random_state=0)
    model.fit(train_X, train_y)
    preds_val = model.predict(val_X)
    mae = mean_absolute_error(val_y, preds_val)
    return (mae)

In [43]:
candidate_max_leaf_nodes = [5, 25, 50, 100, 250, 500]

results = []
lowest = None
best_tree_size = None

for candidate in candidate_max_leaf_nodes:
    mae = get_mae(candidate, train_X=train_X, val_X=val_X, train_y=train_y, val_y=val_y)
    if lowest is None:
        lowest = mae
        best_tree_size = candidate
    if mae < lowest:
        lowest = mae
        best_tree_size = candidate
    dict = {
        'Nodes': candidate,
        'MAE': mae
    }
    results.append(dict)

print(results)
print((lowest, best_tree_size))

[{'Nodes': 5, 'MAE': 385696.54278937966}, {'Nodes': 25, 'MAE': 307919.7001056724}, {'Nodes': 50, 'MAE': 279794.61143891385}, {'Nodes': 100, 'MAE': 269191.989429751}, {'Nodes': 250, 'MAE': 269945.1501662939}, {'Nodes': 500, 'MAE': 261718.1134423186}]
(261718.1134423186, 500)


In [46]:
final_model =  DecisionTreeRegressor(max_leaf_nodes=best_tree_size, random_state=0)

final_model.fit(train_X, train_y)