In [1]:
from sklearn.tree import DecisionTreeRegressor
import numpy as np

# Feature matrix: one row per house, columns are
# [size in square feet, age in years].
X = np.array(
    [
        [1500, 5],
        [2000, 3],
        [2500, 10],
        [3000, 1],
    ]
)
# Target vector: the price of each house in dollars, in the same row order as X.
y = np.array(
    [300000, 400000, 450000, 600000]
)

We could use a decision stump as our first weak learner, or we could use the average of the target values. Since the mean is easier to compute, we'll go with that.

$$
\bar{y} = \frac{300000 + 400000 + 450000 + 600000}{4} = 437500
$$

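Next, we will compute the residuals (the actual price minus the mean prediction for each house), fit a shallow decision tree to those residuals, and add the tree's predictions back to the mean.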

In [6]:
# Round 0 of boosting: the initial model predicts the mean price for every house.
mean_price = np.mean(y)
# Residuals are what the baseline still gets wrong (actual - predicted).
residuals = y - mean_price

# Mean absolute error of the constant (mean-only) baseline
baseline_error = np.mean(np.abs(residuals))
print("Baseline error: {}".format(baseline_error))

print("Mean price: {}".format(mean_price))
print("Residuals: {}".format(residuals))

# First weak learner: a shallow tree fit to the residuals, not to the prices.
# random_state is fixed because scikit-learn permutes the features at every
# split; when two candidate splits are equally good the seed decides which one
# is kept, so an unseeded tree is not guaranteed to be identical across runs.
regressor = DecisionTreeRegressor(max_depth=2, random_state=0)
regressor.fit(X, residuals)

# The tree's output is its estimate of each residual on the training data
predictions = regressor.predict(X)
print("Predictions: {}".format(predictions))

# Boosted prediction = baseline (mean price) + predicted residual
predictions_updated = predictions + mean_price

# Mean absolute error after adding one tree
errors = np.mean(np.abs(predictions_updated - y))
print("New error: {}".format(errors))

Baseline error: 87500.0
Mean price: 437500.0
Residuals: [-137500.  -37500.   12500.  162500.]
Predictions: [-137500.  -12500.  -12500.  162500.]
New error: 12500.0


By boosting the original predictions with a single decision tree, the mean absolute error was reduced from 87500 to 12500. The algorithm continues for as many trees as we specify, each one fit to the residuals left by the current predictions. The final prediction is the initial mean plus the sum of the predictions from all the trees.
Let's add one more tree and see how the error improves.

In [9]:
# Residuals left over after the first boosting round
# (actual price - current boosted prediction)
residuals_updated = y - predictions_updated

print("Residuals: {}".format(residuals_updated))

# Second weak learner, fit to the remaining residuals.
# random_state is fixed because scikit-learn permutes the features at every
# split and uses the seed to break ties between equally good splits. These
# residuals contain repeated values, so several candidate splits can tie;
# without a seed the per-house predictions printed below may change between
# runs even though the data is the same.
regressor2 = DecisionTreeRegressor(max_depth=2, random_state=0)
regressor2.fit(X, residuals_updated)

# The second tree's estimate of each remaining residual on the training data
predictions2 = regressor2.predict(X)
print("Predictions: {}".format(predictions2))

# Add this round's correction on top of the previous boosted predictions
predictions_updated2 = predictions2 + predictions_updated

print("Predictions: {}".format(predictions_updated2))

# Mean absolute error after two trees
errors2 = np.mean(np.abs(predictions_updated2 - y))
print("New error: {}".format(errors2))

Residuals: [     0. -25000.  25000.      0.]
Predictions: [     0. -12500.  25000. -12500.]
Predictions: [300000. 412500. 450000. 587500.]
New error: 6250.0
