<a href="https://colab.research.google.com/github/aakhterov/ML_algorithms_from_scratch/blob/master/CustomGradientBoostingClassifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Gradient Boosting Classifier from scratch

[All You Need to Know about Gradient Boosting Algorithm − Part 2. Classification](https://towardsdatascience.com/all-you-need-to-know-about-gradient-boosting-algorithm-part-2-classification-d3ed8f56541e)

In [110]:
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss, accuracy_score
from sklearn.ensemble import GradientBoostingClassifier

In [101]:
class CustomGradientBoostingClassifier:
  """Binary gradient boosting classifier built from regression trees.

  The model starts from a constant log-odds prediction (the log-odds of the
  mean of the training labels). Each boosting round then:

  1. fits a ``DecisionTreeRegressor`` to the residuals ``y - p``,
  2. replaces every leaf value with ``sum(r) / sum(p * (1 - p))`` computed
     over the training samples that fall into that leaf,
  3. adds ``learning_rate * leaf_value`` to the running log-odds.

  Parameters
  ----------
  n_estimators : int, default=100
      Maximum number of boosting rounds (trees).
  learning_rate : float, default=0.9
      Factor applied to every tree's contribution.
  max_depth : int, default=3
      Maximum depth of each regression tree.
  random_state : int or None, default=None
      Forwarded to every ``DecisionTreeRegressor`` so runs can be reproduced.

  Attributes
  ----------
  F0 : ndarray or None
      Initial log-odds, stored once per training sample (all entries are
      equal). ``None`` until ``fit`` has been called.
  trees : list
      Fitted regression trees whose leaf values hold the per-leaf step.
  """

  # Probabilities are kept inside [_EPS, 1 - _EPS] before taking log-odds so
  # that a single-class target does not produce infinite log-odds.
  _EPS = 1e-15
  # A leaf whose sum of p * (1 - p) is below this is treated as saturated.
  _MIN_DENOMINATOR = 1e-150

  def __init__(self, n_estimators=100, learning_rate=0.9, max_depth=3, random_state=None):
    self.n_estimators = n_estimators
    self.learning_rate = learning_rate
    self.max_depth = max_depth
    self.random_state = random_state
    self.F0 = None
    self.trees = []

  def __log_odds(self, p):
    """Return log(p / (1 - p)) for a scalar or array of probabilities."""
    p = np.clip(p, self._EPS, 1 - self._EPS)
    return np.log(p / (1 - p))

  def __p(self, log_odds):
    """Map log-odds to probabilities with a sigmoid that cannot overflow.

    sigmoid(z) = exp(-log(1 + exp(-z))); ``np.logaddexp`` evaluates the inner
    logarithm without overflowing, unlike exp(z) / (1 + exp(z)), which
    yields inf / inf = NaN for large z.
    """
    return np.exp(-np.logaddexp(0.0, -np.asarray(log_odds, dtype=float)))

  def fit(self, X, y):
    """Fit the ensemble on features ``X`` and binary (0/1) labels ``y``.

    Raises
    ------
    ValueError
        If ``y`` is empty.
    """
    y = np.asarray(y, dtype=float)
    if y.size == 0:
      raise ValueError("Cannot fit on an empty target array.")

    # Start from a clean slate so that refitting does not stack new trees on
    # top of the ones left over from a previous call.
    self.trees = []

    p = np.full(len(y), np.clip(np.mean(y), self._EPS, 1 - self._EPS))
    self.F0 = self.__log_odds(p)
    Fm = self.F0.copy()
    r = y - p

    for _ in range(self.n_estimators):
      if np.all(r == 0):
        # Nothing left to correct.
        break
      tree = DecisionTreeRegressor(max_depth=self.max_depth, random_state=self.random_state)
      tree.fit(X, r)
      node_ids = tree.apply(X)

      for idx in np.unique(node_ids):
        filter_ = node_ids == idx
        numerator = np.sum(r[filter_])
        denominator = np.sum(p[filter_] * (1 - p[filter_]))
        # When every probability in the leaf has saturated, the denominator
        # is (numerically) zero; take no step instead of producing NaN/inf.
        if abs(denominator) < self._MIN_DENOMINATOR:
          gamma = 0.0
        else:
          gamma = numerator / denominator
        Fm[filter_] += self.learning_rate * gamma
        # Overwrite the leaf's stored value so tree.predict() returns gamma.
        tree.tree_.value[idx, 0, 0] = gamma

      self.trees.append(tree)
      p = self.__p(Fm)
      r = y - p

  def predict_proba(self, X):
    """Return a 1-D array with the predicted probability of class 1 per row of ``X``."""
    y_hat = np.full(X.shape[0], self.F0[0], dtype=float)
    for tree in self.trees:
      y_hat += self.learning_rate * tree.predict(X)
    return self.__p(y_hat)

  def predict(self, X):
    """Return predicted labels (0.0 or 1.0) by rounding the class-1 probability."""
    return np.round(self.predict_proba(X))

In [None]:
# Let's test

In [27]:
# Load the breast-cancer dataset and unpack the feature matrix and the labels.
data = load_breast_cancer()
X = data['data']
y = data['target']

In [28]:
# Shapes of the feature matrix and of the label vector.
(X.shape, y.shape)

((569, 30), (569,))

In [38]:
# Peek at the first five samples and their labels.
(X[:5], y[:5])

(array([[1.799e+01, 1.038e+01, 1.228e+02, 1.001e+03, 1.184e-01, 2.776e-01,
         3.001e-01, 1.471e-01, 2.419e-01, 7.871e-02, 1.095e+00, 9.053e-01,
         8.589e+00, 1.534e+02, 6.399e-03, 4.904e-02, 5.373e-02, 1.587e-02,
         3.003e-02, 6.193e-03, 2.538e+01, 1.733e+01, 1.846e+02, 2.019e+03,
         1.622e-01, 6.656e-01, 7.119e-01, 2.654e-01, 4.601e-01, 1.189e-01],
        [2.057e+01, 1.777e+01, 1.329e+02, 1.326e+03, 8.474e-02, 7.864e-02,
         8.690e-02, 7.017e-02, 1.812e-01, 5.667e-02, 5.435e-01, 7.339e-01,
         3.398e+00, 7.408e+01, 5.225e-03, 1.308e-02, 1.860e-02, 1.340e-02,
         1.389e-02, 3.532e-03, 2.499e+01, 2.341e+01, 1.588e+02, 1.956e+03,
         1.238e-01, 1.866e-01, 2.416e-01, 1.860e-01, 2.750e-01, 8.902e-02],
        [1.969e+01, 2.125e+01, 1.300e+02, 1.203e+03, 1.096e-01, 1.599e-01,
         1.974e-01, 1.279e-01, 2.069e-01, 5.999e-02, 7.456e-01, 7.869e-01,
         4.585e+00, 9.403e+01, 6.150e-03, 4.006e-02, 3.832e-02, 2.058e-02,
         2.250e-02, 4.5

In [30]:
# Split the dataset into train (80%) and test (20%) sets.
# A fixed random_state keeps the split, and therefore the reported metrics, reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=42)

In [None]:
# Use our classifier

In [106]:
# Build the from-scratch model and train it on the training split.
cgbc = CustomGradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3)
cgbc.fit(X=X_train, y=y_train)

In [111]:
# Test-set accuracy of the from-scratch model.
accuracy_our = accuracy_score(y_true=y_test, y_pred=cgbc.predict(X_test))

In [112]:
# Test-set log-loss of the from-scratch model, computed from its class-1 probabilities.
log_loss_our = log_loss(y_true=y_test, y_pred=cgbc.predict_proba(X_test))

In [None]:
# Use scikit-learn's built-in gradient boosting classifier

In [113]:
# scikit-learn baseline with the same hyperparameters as the custom model.
# random_state is fixed so the baseline's trees, and its metrics, are reproducible.
gbc = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    random_state=42
)
gbc.fit(X_train, y_train)

In [114]:
# Test-set accuracy of the scikit-learn model.
accuracy_inbuit = accuracy_score(y_true=y_test, y_pred=gbc.predict(X_test))

In [115]:
# Test-set log-loss of the scikit-learn model.
log_loss_inbuit = log_loss(y_true=y_test, y_pred=gbc.predict_proba(X_test))

In [116]:
# Report both metrics for both models, one line per result.
print(
    f"Accuracy by CustomGradientBoostingClassifier: {accuracy_our}",
    f"Accuracy by GradientBoostingClassifier: {accuracy_inbuit}",
    f"Log_loss by CustomGradientBoostingClassifier: {log_loss_our}",
    f"Log_loss by GradientBoostingClassifier: {log_loss_inbuit}",
    sep="\n",
)

Accuracy by CustomGradientBoostingClassifier: 0.9649122807017544
Accuracy by GradientBoostingClassifier: 0.9649122807017544
Log_loss by CustomGradientBoostingClassifier: 0.07595746428205324
Log_loss by GradientBoostingClassifier: 0.07910697637371383


In [None]:
# As you can see above, both models have almost the same Accuracy and Log-loss.