# Predict House Prices Using Decision Tree Regressor

## Load data

In [1]:
import pandas as pd

iowa_file_path = 'data/train.csv'

home_data = pd.read_csv(iowa_file_path)

home_data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [6]:
home_data.columns

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

## Specify prediction target

In [7]:
y = home_data.SalePrice
y.head()

0    208500
1    181500
2    223500
3    140000
4    250000
Name: SalePrice, dtype: int64

## Specify features

In [8]:
feature_names = ['LotArea', 'YearBuilt', '1stFlrSF', '2ndFlrSF', 'FullBath', 'BedroomAbvGr', 'TotRmsAbvGrd', 'OverallQual', 'OverallCond']

X = home_data[feature_names]

X.describe()

Unnamed: 0,LotArea,YearBuilt,1stFlrSF,2ndFlrSF,FullBath,BedroomAbvGr,TotRmsAbvGrd,OverallQual,OverallCond
count,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,10516.828082,1971.267808,1162.626712,346.992466,1.565068,2.866438,6.517808,6.099315,5.575342
std,9981.264932,30.202904,386.587738,436.528436,0.550916,0.815778,1.625393,1.382997,1.112799
min,1300.0,1872.0,334.0,0.0,0.0,0.0,2.0,1.0,1.0
25%,7553.5,1954.0,882.0,0.0,1.0,2.0,5.0,5.0,5.0
50%,9478.5,1973.0,1087.0,0.0,2.0,3.0,6.0,6.0,5.0
75%,11601.5,2000.0,1391.25,728.0,2.0,3.0,7.0,7.0,6.0
max,215245.0,2010.0,4692.0,2065.0,3.0,8.0,14.0,10.0,9.0


In [9]:
X.head()

Unnamed: 0,LotArea,YearBuilt,1stFlrSF,2ndFlrSF,FullBath,BedroomAbvGr,TotRmsAbvGrd,OverallQual,OverallCond
0,8450,2003,856,854,2,3,8,7,5
1,9600,1976,1262,0,2,3,6,6,8
2,11250,2001,920,866,2,3,6,7,5
3,9550,1915,961,756,1,3,7,7,5
4,14260,2000,1145,1053,2,4,9,8,5


## Specify and fit model

In [10]:
from sklearn.tree import DecisionTreeRegressor

iowa_model = DecisionTreeRegressor(random_state=1)

iowa_model.fit(X, y)

DecisionTreeRegressor(random_state=1)

## Make predictions

In [11]:
predictions = iowa_model.predict(X)
print(predictions)

[208500. 181500. 223500. ... 266500. 142125. 147500.]


In [12]:
y.head()

0    208500
1    181500
2    223500
3    140000
4    250000
Name: SalePrice, dtype: int64

## Validate model

In [13]:
from sklearn.metrics import mean_absolute_error

error = mean_absolute_error(y, predictions)
print('The mean absolute error for the in-sample data was about $%d' %(error))

The mean absolute error for the in-sample data was about $23


## Now split data into training and validation data

In [14]:
from sklearn.model_selection import train_test_split

train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)

## Fit model with training data

In [15]:
iowa_model_two = DecisionTreeRegressor(random_state=1)

iowa_model_two.fit(train_X, train_y)

DecisionTreeRegressor(random_state=1)

## Make predictions with validation data

In [16]:
val_predictions = iowa_model_two.predict(val_X)

print(val_predictions)

[192500. 135000. 116050.  75500. 139900. 305000. 336000. 144152. 215000.
 239686. 180000.  91000. 192500. 184750. 227000. 109000. 110000.  60000.
 205000. 143750. 142000. 135000. 275000. 377500. 119000. 181500. 119500.
 185000. 446261. 159434. 130000. 124000. 124900. 135000. 145000. 315000.
  89500. 100000. 214000.  97000. 147000. 159500. 110500. 119000. 172400.
 190000. 129900. 167240. 262500. 260000. 132000. 320000. 119000. 240000.
 207000. 118000. 116050. 160000. 120000. 171000. 159000. 325300. 112000.
 124900. 180000. 127000. 153575. 201800. 157000. 145000. 110000. 108000.
 281213. 145000. 147000. 195400. 216000. 104900. 320000. 208900. 235000.
  97000. 133000. 147500. 194500. 179900. 149500. 142500. 174000. 143000.
 208900. 197500. 116050. 102000. 120000. 112000. 114500. 150750. 155000.
 162000. 126000. 130500. 128000. 129000. 150000. 167500. 176432. 144000.
 160000. 315000. 125000. 155000. 143500. 203000. 220000. 173500. 265900.
 119900. 159000. 372402. 143000. 265900. 315000. 23

## Validate new model and compare with previous model

In [18]:
new_error = mean_absolute_error(val_y, val_predictions)
print('The mean absolute error for the new model was about $%d' %(new_error))
print('While the mean absolute error for previous model was about $%d' %(error))

The mean absolute error for the new model was about $26869
While the mean absolute error for previous model was about $23


## Fine-tuning model

In [None]:
def get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y):
    model = DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes, random_state=0)
    model.fit(train_X, train_y)
    preds_val = model.predict(val_X)
    mae = mean_absolute_error(val_y, preds_val)
    return(mae)


for max_leaf_nodes in range(50, 300, 10):
    my_mae = get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y)
    print("Max leaf nodes: %d  \t\t Mean Absolute Error:  %d" %(max_leaf_nodes, my_mae))

Max leaf nodes: 50  		 Mean Absolute Error:  25634
Max leaf nodes: 60  		 Mean Absolute Error:  25734
Max leaf nodes: 70  		 Mean Absolute Error:  25644
Max leaf nodes: 80  		 Mean Absolute Error:  25662
Max leaf nodes: 90  		 Mean Absolute Error:  25332
Max leaf nodes: 100  		 Mean Absolute Error:  25512
Max leaf nodes: 110  		 Mean Absolute Error:  25484
Max leaf nodes: 120  		 Mean Absolute Error:  25486
Max leaf nodes: 130  		 Mean Absolute Error:  26032
Max leaf nodes: 140  		 Mean Absolute Error:  25906
Max leaf nodes: 150  		 Mean Absolute Error:  25873
Max leaf nodes: 160  		 Mean Absolute Error:  25751
Max leaf nodes: 170  		 Mean Absolute Error:  25863
Max leaf nodes: 180  		 Mean Absolute Error:  26092
Max leaf nodes: 190  		 Mean Absolute Error:  26029
Max leaf nodes: 200  		 Mean Absolute Error:  25685
Max leaf nodes: 210  		 Mean Absolute Error:  25696
Max leaf nodes: 220  		 Mean Absolute Error:  25582
Max leaf nodes: 230  		 Mean Absolute Error:  25644
Max leaf nodes: 2

Remark:
The most fine-tune model needs about 70 max leaf nodes.

In [None]:
tuned_model = DecisionTreeRegressor(max_leaf_nodes=70, random_state=1)

tuned_model.fit(train_X, train_y)

tuned_preds_val = tuned_model.predict(val_X)

tuned_error = mean_absolute_error(val_y, tuned_preds_val)

print('error for the tuned final model validated with validation data is $%d' %(tuned_error))
print('error for the un-tuned new model validated with validation data is $%d' %(new_error))
print('error for the overfitted old model validated with in-sample data is $%d' %(error))


error for the tuned final model validated with validation data is $25644
error for the un-tuned new model validated with validation data is $26869
error for the overfitted old model validated with in-sample data is $23
