# Gradient Boosting

## Steps
1. Make a first guess for y_train and y_test, using the average of y_train
$$
y_{train_{p_0}} = \frac{1}{n} \sum_{i=1}^n y_{train_{i}}  \\
y_{test_{p_0}} = y_{train_{p_0}} 
$$
2. Calculate the residuals from the training set
$$
r_0 = y_{train} - y_{train_{p_0}}
$$
3. Fit a weak learner to the residuals by minimizing the loss function. Let's call it $f_0$.
$$
r_0 \approx f_0(X_{train})
$$
4. Increment the predicted y's.
$$
y_{train_{p_1}} = y_{train_{p_0}} + \alpha f_0(X_{train}) \\
y_{test_{p_1}} = y_{test_{p_0}} + \alpha f_0(X_{test}) \\
$$
$\alpha$ is the learning rate.

5. Repeat steps 2 to 4, computing the new residuals from the updated predictions each time, until you reach the number of boosting rounds

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
def GradBoost(model, X_train, y_train, X_test, boosting_rounds, learning_rate: float = 0.1):
    """Boost a learner on training residuals and return train/test predictions.

    Both predictions start at the mean of ``y_train``. In every round the
    model is fitted to the current training residuals
    (``y_train - y_hat_train``) and ``learning_rate`` times its prediction is
    added to the running training and test predictions.

    Args:
        model: Object exposing ``fit(X, y)`` and ``predict(X)``. ``fit`` is
            called once per round and must return the object that is then
            used for ``predict`` (and for the next round's ``fit``).
        X_train: Training features, one entry per training sample.
        y_train: Training targets; must support ``len`` and subtraction of a
            NumPy array.
        X_test: Test features, one entry per test sample.
        boosting_rounds: Number of fit/update rounds. With 0 rounds the mean
            baseline is returned unchanged.
        learning_rate: Factor applied to each round's predictions before they
            are added to the running predictions.

    Returns:
        A tuple ``(y_hat_train, y_hat_test)`` holding the final predictions
        for the training and the test samples.
    """
    # First guess: every sample is predicted as the mean of the training target.
    baseline = np.mean(y_train)
    y_hat_train = np.repeat(baseline, len(y_train))

    # The test prediction is sized from X_test itself, so the function does
    # not depend on any variable defined outside of it.
    y_hat_test = np.repeat(baseline, len(X_test))

    # Residuals of the initial guess
    residuals = y_train - y_hat_train

    # Iterate through the boosting rounds
    for _ in range(boosting_rounds):
        # Fit the model to the current residuals
        model = model.fit(X_train, residuals)

        # Increment the predicted training y with the predicted
        # pseudo-residuals scaled by the learning rate
        y_hat_train = y_hat_train + learning_rate * model.predict(X_train)

        # Increment the predicted test y in the same way
        y_hat_test = y_hat_test + learning_rate * model.predict(X_test)

        # Residuals for the next round
        residuals = y_train - y_hat_train

    return y_hat_train, y_hat_test


## Demonstration

In [None]:
from sklearn.datasets import make_regression

# Synthetic regression problem: 1000 samples, 20 features of which 15 are
# informative, a single target, noise=20 and a fixed seed for reproducibility.
X, y = make_regression(
    n_samples=1000,
    n_features=20,
    n_informative=15,
    n_targets=1,
    bias=0.0,
    noise=20,
    shuffle=True,
    random_state=13,
)

# First 800 rows are used for training, the remaining 200 for testing.
X_train, X_test = X[:800], X[800:]
y_train, y_test = y[:800], y[800:]

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Learner used in each boosting round: a depth-3 regression tree whose splits
# are chosen with the squared-error criterion.
model = DecisionTreeRegressor(
    criterion="squared_error",
    max_depth=3,
)

In [None]:
# Training MSE for boosting budgets of 5, 10, ..., 100 rounds. Each budget is
# boosted from scratch by a separate GradBoost call.
n_rounds = np.arange(5, 101, 5)
mse_train = []
for n_round in n_rounds:
    predictions = GradBoost(
        model,
        X_train,
        y_train,
        X_test,
        boosting_rounds=n_round,
        learning_rate=0.1,
    )
    # Only the training predictions (first element) are needed here.
    y_hat_train = predictions[0]

    squared_errors = (y_train - y_hat_train) ** 2
    mse_train.append(np.mean(squared_errors))
    print("round #: {0}, mse: {1}".format(n_round, mse_train[-1]))

In [None]:
# Plot the training MSE against the number of boosting rounds.
plt.figure(figsize=(10, 8))
plt.plot(n_rounds, mse_train)
plt.title("Training MSE vs Boosting Rounds", fontsize=20)
plt.xlabel('Number of Boosting rounds', fontsize=15)
plt.ylabel('Training Mean squared error', fontsize=15)