# Gradient Boosting

In [1]:
import Orange
import numpy as np

Spodaj razvijemo razred za Gradient Boosting, ki ustreza Orange-ovemu objektu za učenje (Learner) in napovedovanje (Model). Implementirali smo tri različne gradiente, ki ustrezajo različnim cenovnim funkcijam (funkcijam napak), katerih vrednost minimiziramo.

In [2]:
class GradBoostRLearner(Orange.regression.Learner):
    """Gradient boosting for regression on top of an arbitrary base learner.

    The ensemble starts from a constant (mean) model. At every stage the base
    learner is fitted to the negative gradient of the selected loss, evaluated
    at the current training-set predictions, and its predictions are added to
    the running total with a fixed step of 1 (no shrinkage, no line search).

    Parameters
    ----------
    learner
        Base regression learner fitted at every boosting stage. It must be
        callable on a data table and expose a ``name`` attribute.
    n_estimators : int
        Number of boosting stages fitted after the initial mean model.
    epsilon : float
        Stored on the instance but currently not used by the fitting
        procedure; kept so existing callers keep working.
    loss : {"squared", "abs", "huber"}
        Name of the loss whose negative gradient drives the boosting.

    Raises
    ------
    KeyError
        If ``loss`` is not one of the supported names.
    """

    def __init__(self, learner, n_estimators=10, epsilon=1e-5, loss="squared"):
        super().__init__()
        self.n_estimators = n_estimators
        self.learner = learner  # base learner
        self.name = "gb " + self.learner.name + " " + loss
        self.epsilon = epsilon
        # Dispatch table: loss name -> function computing its negative gradient.
        losses = {"huber": self.grad_huber_loss,
                  "squared": self.grad_squared_loss,
                  "abs": self.grad_abs_loss}
        self.loss = losses[loss]

    def grad_squared_loss(self, y, f):
        """Negative gradient for squared loss: the plain residual ``y - f``."""
        return y - f

    def grad_abs_loss(self, y, f):
        """Negative gradient for absolute loss: the sign of the residual."""
        return np.sign(y - f)

    def grad_huber_loss(self, y, f, delta=0.5):
        """Negative gradient for Huber loss.

        Equals the residual where ``|y - f| <= delta`` and
        ``delta * sign(y - f)`` where the residual is larger in magnitude.
        """
        residual = y - f
        clipped = delta * np.sign(residual)
        # Element-wise select; avoids building and indexing a stacked array.
        return np.where(np.abs(residual) > delta, clipped, residual)

    def fit_storage(self, data):
        """Fit the boosted ensemble on ``data`` and return a GradBoostRModel.

        The returned model holds the initial mean model followed by
        ``n_estimators`` base models, in the order they were fitted.
        """
        # Features and targets do not change between stages; read them once.
        X, y = data.X, data.Y

        # Stage 0: constant model predicting the mean.
        model = Orange.regression.MeanLearner()(data)
        f = model(data)  # running ensemble prediction on the training set
        models = [model]

        for _ in range(self.n_estimators):
            # Pseudo-residuals: negative gradient at the current predictions.
            res = self.loss(y, f)
            stage_data = Orange.data.Table(X, res)
            model = self.learner(stage_data)
            f += model(stage_data)  # in-place update of the running prediction
            models.append(model)
        return GradBoostRModel(models)
    
class GradBoostRModel(Orange.regression.Model):
    """Additive regression model produced by gradient boosting.

    Stores a sequence of fitted models; a prediction is the sum of the
    predictions of all of them.
    """

    def __init__(self, models):
        # Models whose predictions are added together in ``predict``.
        self.models = models

    def predict(self, X):
        """Return the summed predictions of all stored models for ``X``."""
        total = 0
        for stage_model in self.models:
            total = total + stage_model(X)
        return total

Testirali bomo na domeni housing (cene hiš v Bostonu).

In [3]:
# Data set used for all experiments below: "housing" (Boston house prices).
housing = Orange.data.Table("housing")

Najbolj nas seveda zanima primerjava z regresijskimi drevesi majhnih globin, kakršna tipično uporabljamo v kombinaciji z gradient boosting-om. Seveda nas zanima tudi, ali lahko s to metodo izboljšamo točnost za kakšen drugi algoritem učenja.

In [4]:
# Number of boosting stages shared by every gradient-boosting learner below.
N_ESTIMATORS = 50

# Baselines: constant (mean) predictor, a depth-limited regression tree and
# linear regression.
ml = Orange.regression.MeanLearner()
stree = Orange.regression.SimpleTreeLearner(max_depth=3)
lr = Orange.regression.LinearRegressionLearner()

# Boosted shallow trees, one learner per supported loss.
gb_sq = GradBoostRLearner(stree, n_estimators=N_ESTIMATORS, loss="squared")
gb_abs = GradBoostRLearner(stree, n_estimators=N_ESTIMATORS, loss="abs")
gb_huber = GradBoostRLearner(stree, n_estimators=N_ESTIMATORS, loss="huber")

# Boosted linear regression with squared loss.
gb_lr = GradBoostRLearner(lr, n_estimators=N_ESTIMATORS, loss="squared")

In [9]:
# Learners to compare; the report pairs each score with the learner at the
# same position in this list.
learners = [
    ml,
    stree,
    gb_abs,
    lr,
    gb_lr,
]

# Cross-validation with k=5 and random_state fixed at 42.
res = Orange.evaluation.CrossValidation(
    housing,
    learners,
    k=5,
    random_state=42,
)

In [10]:
# One line per learner: right-aligned name followed by its RMSE.
print(*("{:>30} {:5.2f}".format(learner.name, rmse)
        for learner, rmse in zip(learners, Orange.evaluation.RMSE(res))),
      sep="\n")

                          mean  9.21
                   simple tree  4.76
            gb simple tree abs  3.68
             linear regression  4.85
  gb linear regression squared  4.85


Rezultati na tej domeni kažejo na izrazito izboljšavo pri drevesih in na (pričakovano) pomanjkanje kakršnega koli izboljšanja pri linearni regresiji.