In [1]:
# Importing dependencies and data
import pandas as pd
from sklearn.tree import DecisionTreeRegressor

# Load the housing data set from disk
path_to_data = 'house_prices.csv'
data = pd.read_csv(path_to_data)

# Prediction target: the sale price column (list all column headers with data.columns)
y = data.SalePrice

# Predictor columns the model is trained on
features = ["LotArea", "YearBuilt", "1stFlrSF", "2ndFlrSF", "FullBath", "BedroomAbvGr", "TotRmsAbvGrd"]
X = data[features]

# Specify the model; random_state is pinned so repeated runs build the same tree
house_price_model = DecisionTreeRegressor(random_state=1)

# Fit the model on the full data set
house_price_model.fit(X, y)

# These predictions are for rows the model was fitted on, i.e. they are in-sample
print(f'First in-sample predictions         : {house_price_model.predict(X.head())}')
print(f'Actual target values for those homes: {y.head().tolist()}')

First in-sample predictions         : [208500. 181500. 223500. 140000. 250000.]
Actual target values for those homes: [208500, 181500, 223500, 140000, 250000]


In [2]:
##### Evaluating the model #####
# Split the set into training data and validation data
from sklearn.model_selection import train_test_split
# Hold out part of the data for validation; the split is seeded with random_state=1
split_parts = train_test_split(X, y, random_state=1)
train_X, val_X, train_y, val_y = split_parts

# Refit the existing model object using only the training portion
house_price_model.fit(train_X, train_y)

# Compare predictions for the first validation rows with their actual prices
first_val_predictions = house_price_model.predict(val_X.head())
first_val_actuals = val_y.head().tolist()
print(f'Validation data predictions         : {first_val_predictions}')
print(f'Actual target values for those homes: {first_val_actuals}')

Validation data predictions         : [186500. 184000. 130000.  92000. 164500.]
Actual target values for those homes: [231500, 179500, 122000, 84500, 142000]


In [3]:
# Calculate the mean absolute error
from sklearn.metrics import mean_absolute_error

# Score the model on the whole validation set rather than only its first rows
validation_predictions = house_price_model.predict(val_X)
validation_mae = mean_absolute_error(val_y, validation_predictions)
print(validation_mae)

29652.931506849316


In [4]:
##### Underfitting / Overfitting #####
# Creating a function to compare MAE scores for models with different values of max_leaf_nodes
def get_mae(max_leaf_nodes, train_X, train_y, val_X, val_y):
    """Fit a size-limited decision tree and score it on validation data.

    Args:
        max_leaf_nodes: Value passed to DecisionTreeRegressor as max_leaf_nodes.
        train_X: Features the tree is fitted on.
        train_y: Target values the tree is fitted on.
        val_X: Features to generate predictions for.
        val_y: Target values the predictions are compared against.

    Returns:
        The mean absolute error between val_y and the predictions for val_X.
    """
    tree = DecisionTreeRegressor(random_state=1, max_leaf_nodes=max_leaf_nodes)
    tree.fit(train_X, train_y)
    return mean_absolute_error(val_y, tree.predict(val_X))

In [5]:
# Try several tree sizes and report the validation MAE for each one
max_leaf_nodes_to_test = [5, 25, 50, 75, 100]
for leaf_count in max_leaf_nodes_to_test:
    mae = get_mae(leaf_count, train_X, train_y, val_X, val_y)
    print(f'{leaf_count} max leaf nodes returns an MAE of {mae}')


5 max leaf nodes returns an MAE of 35044.51299744237
25 max leaf nodes returns an MAE of 29016.41319191076
50 max leaf nodes returns an MAE of 27405.930473214907
75 max leaf nodes returns an MAE of 27114.12284949469
100 max leaf nodes returns an MAE of 27282.50803885739


In [6]:
# Build a {max_leaf_nodes: validation MAE} lookup with a dict comprehension
scores = {
    nodes: get_mae(nodes, train_X, train_y, val_X, val_y)
    for nodes in max_leaf_nodes_to_test
}

# Pick the tree size whose MAE is smallest and report it
best_tree_size, best_mae = min(scores.items(), key=lambda item: item[1])
print(f'{best_tree_size} max leaf nodes returns an MAE of {best_mae}')

75 max leaf nodes returns an MAE of 27114.12284949469


In [7]:
# Build the final model with the best tree size and train it on all of the data
final_model = DecisionTreeRegressor(
    max_leaf_nodes=best_tree_size,
    random_state=1,
)
final_model.fit(X, y)

DecisionTreeRegressor(max_leaf_nodes=75, random_state=1)

# Below is a single code section which shows the whole process together

In [8]:
import pandas as pd
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

# Load the data, then separate the prediction target from the predictor columns
path_to_data = 'house_prices.csv'
data = pd.read_csv(path_to_data)

y = data.SalePrice
features = [
    "LotArea", "YearBuilt", "1stFlrSF", "2ndFlrSF",
    "FullBath", "BedroomAbvGr", "TotRmsAbvGrd",
]
X = data[features]

# Hold out a validation set; the split is seeded with random_state=1
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)

def get_mae(max_leaf_nodes, train_X, train_y, val_X, val_y):
    """Return the validation mean absolute error of a capped decision tree.

    A DecisionTreeRegressor limited to ``max_leaf_nodes`` leaves (with
    random_state=0) is fitted on ``train_X``/``train_y``; its predictions
    for ``val_X`` are then compared against ``val_y``.
    """
    tree_settings = {"max_leaf_nodes": max_leaf_nodes, "random_state": 0}
    candidate = DecisionTreeRegressor(**tree_settings)
    candidate.fit(train_X, train_y)
    predicted = candidate.predict(val_X)
    return mean_absolute_error(val_y, predicted)

# Score each candidate tree size, then keep the one with the lowest validation MAE
max_leaf_nodes_to_test = [5, 25, 50, 75, 100]
scores = {}
for nodes in max_leaf_nodes_to_test:
    scores[nodes] = get_mae(nodes, train_X, train_y, val_X, val_y)
best_tree_size = min(scores, key=lambda size: scores[size])
print(f'{best_tree_size} max leaf nodes returns an MAE of {scores[best_tree_size]}')

# Refit on the full data set with the chosen tree size
final_model = DecisionTreeRegressor(
    max_leaf_nodes=best_tree_size,
    random_state=0,
)
final_model.fit(X, y)


75 max leaf nodes returns an MAE of 27114.12284949469


DecisionTreeRegressor(max_leaf_nodes=75, random_state=0)