In [4]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

In [5]:
data = pd.read_csv('./home-data-for-ml-course/train.csv')

In [7]:
# Create X with required feautres
features = ['LotArea', 'YearBuilt', '1stFlrSF', '2ndFlrSF', 'FullBath', 'BedroomAbvGr', 'TotRmsAbvGrd']
X = data[features]


In [9]:
# Create y (target variable)
y = data.SalePrice

In [13]:
# Split the data into training and validation sets
train_X, val_X, train_y, val_y = train_test_split(X,y, random_state=1)

In [14]:
# Define the model
model = DecisionTreeRegressor(random_state=1)
# Fit the model
model.fit(train_X,train_y)

DecisionTreeRegressor(random_state=1)

In [15]:
# Make predictions using validation data
val_predictions = model.predict(val_X)

In [20]:
# Compare Predicitons made from validation data and Actual values in validation data
print(f'Count of Successful Predicitons: {sum(val_predictions == val_y)}')

Count of Successful Predicitons: 1


In [24]:
# Check Mean Absolute Error
mae_value = mean_absolute_error(val_predictions, val_y)
print(f'Mean Absolute Error Value: {mae_value:.0f}')

Mean Absolute Error Value: 29653


Now, we'll try using different number of leaf nodes

In [32]:
# Defining a Function to calculate Mean Absolute Error for a given leaf nodes value

def get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y):
    model = DecisionTreeRegressor(max_leaf_nodes = max_leaf_nodes, random_state=0)
    model.fit(train_X, train_y)
    val_predictions = model.predict(val_X)
    mae_value = mean_absolute_error(val_predictions, val_y)
#     print(f'MAE value using {max_leaf_nodes} leaf nodes: {mae_value:.0f}')
    return mae_value

In [117]:
leaf_nodes = [5,25,50,100,250,500]
val = []
for i in leaf_nodes:
    val.append(int(get_mae(i,train_X, val_X, train_y, val_y)))
# Find the best tree size for better modelling
best_tree_size = leaf_nodes[val.index(min(val))]

In [118]:
# Define the model using optimal tree size
optimised_model = DecisionTreeRegressor(max_leaf_nodes = best_tree_size, random_state =0)

# Fit the model using all the data
optimised_model.fit(X,y)

DecisionTreeRegressor(max_leaf_nodes=100, random_state=0)

In [119]:
# Check the predictions using optimised model
val_predictions = optimised_model.predict(X)

In [121]:
# Compare predicted values with actual values
sum(val_predictions==y)

18

In [123]:
# Check the Mean Absolute Error
op_mae_value = mean_absolute_error(val_predictions,y)
print(f'Optimised Mean Absolute Error: {op_mae_value:.0f}')

Optimised Mean Absolute Error: 16629


We've tuned this model and improved results. But we are still using Decision Tree models, which are not very sophisticated by modern machine learning standards. In the next step we will learn to use Random Forests to improve our models even more.