In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

# Feature matrix: one row per house, columns are
# [size in square feet, age in years]
X = np.array(
    [
        [1500, 5],
        [2000, 3],
        [2500, 10],
        [3000, 1],
    ]
)
# Target vector: price of each house in dollars, in the same row order as X
y = np.array([300000, 400000, 450000, 600000])

We could use a decision stump as our first weak learner, or we could use the average of the target values. Since the mean is easier to compute, we'll go with that.

$$
\bar{y} = \frac{300000 + 400000 + 450000 + 600000}{4} = 437500
$$

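Next, we will compute the residuals (the actual prices minus the mean), fit a decision tree to them, and add the tree's predictions back to the mean.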

In [None]:
# Weak learner #0: predict the average price for every house
mean_price = y.mean()
# What the constant prediction gets wrong for each house
residuals = y - mean_price

# Mean squared error of the constant prediction; the square root (RMSE)
# is taken only when printing
baseline_error = mean_squared_error(y, np.full(y.shape, mean_price))
print("Baseline error: {}".format(np.sqrt(baseline_error)))

print("Mean price: {}".format(mean_price))
print("Residuals: {}".format(residuals))

# Weak learner #1: a depth-2 tree trained on the residuals rather than on
# the prices themselves (fit() returns the fitted estimator)
regressor = DecisionTreeRegressor(max_depth=2).fit(X, residuals)

# The tree's estimate of each training row's residual
predictions = regressor.predict(X)
print("Predictions: {}".format(predictions))

# Add the mean back so the values are price predictions again
tree1_predictions = mean_price + predictions

# Mean squared error after one boosting step, printed as RMSE
errors = mean_squared_error(y, tree1_predictions)
print("New error: {}".format(np.sqrt(errors)))

By boosting the initial mean prediction with a single decision tree, the root mean squared error (RMSE) was reduced from $108253.18$ to $17677.67$. The algorithm continues for as many trees as we specify, with each new tree fit to the residuals left by the predictions so far. The final prediction is the initial mean plus the sum of the predictions from all the trees.

Try modifying the number of boosting rounds (`num_boosts`) below and see how the error changes.

In [None]:
# Start from the predictions made after the first tree.
# Take a copy: the loop below updates `predictions` in place with `+=`, and
# without the copy that would also overwrite `tree1_predictions`, so
# re-running this cell with a different `num_boosts` would start from
# already-boosted values instead of from the first tree's output.
predictions = tree1_predictions.copy()

# Residuals still left over after the first tree
residuals = y - predictions

# Number of additional boosting rounds to run
num_boosts = 5
for i in range(num_boosts):
    # Refit the same regressor object to whatever error is left
    regressor.fit(X, residuals)

    # Add this round's correction to the running predictions
    predictions += regressor.predict(X)

    # Recompute what is still unexplained for the next round
    residuals = y - predictions

# Running predictions on the training data after all rounds
print("Final predictions: {}".format(predictions))

# Mean squared error of the boosted predictions, printed as RMSE
errors = mean_squared_error(y, predictions)
print("Final error: {}".format(np.sqrt(errors)))

Just for fun, let's see how `xgboost` does with the same data.

In [None]:
# Train a model from the xgboost library on the same X and y
import xgboost as xgb

# Regressor with a squared-error objective and a fixed random_state
xgb_regressor = xgb.XGBRegressor(random_state=42, objective="reg:squarederror")

# Train on the full data set
xgb_regressor.fit(X, y)

# Predict on the same rows the model was trained on
predictions = xgb_regressor.predict(X)
print("Predictions: {}".format(predictions))

# Mean squared error against the true prices, printed as RMSE
errors = mean_squared_error(y_true=y, y_pred=predictions)
print("New error: {}".format(np.sqrt(errors)))

# Print the model object itself
print(xgb_regressor)