In [1]:
import numpy as np
from decisiontree import DecisionTreeRegressor

### Gradient Boosting Classification

> Regression and classification
  only differ in the concrete ``LossFunction`` used.

> GB builds an additive model in a forward stage-wise fashion; it allows for the optimization of arbitrary differentiable loss functions. In each stage ``n_classes_`` regression trees are fit on the negative gradient of the binomial or multinomial deviance loss function. 

> Binary classification is a special case where only a single regression tree is induced.

> Classification with more than 2 classes requires the induction of n_classes regression trees at each iteration, thus the total number of induced trees equals n_classes * n_estimators.

> **Below are the instructions for binary classification**

> Initialize the model with a constant probability: 

> $ 1. \; \;  F_0(x) =  max \frac{count(y=i)}{count(y))} \;\;\; \text{ where i=0,1 }$

> For m = 1 to M,

> $ 2. \; \; \text{Compute Psuedo Residuals, } r_{im} = - [ \frac{ \partial Loss( y_i , F(x_i) ) }{ \partial F(x_i) } ]_{F(x) = F_{m-1}(x)} \;\;\; \text{for i = 1,2...n} $

>    $ \; \; \; \;\text{For binomial deviance, psuedo residuals } r_{i} =  -1 * \frac{ \partial \big( -y_ilog(p_i) - (1-y_i)log(1-p_i) \big)}{\partial p_i} = -1 * -(y_i - p_i)  = y_i - p_i $ 

> $ 3. \; \; \text{Fit base trainer, } h_m(x) \text{ to psuedo residuals} $

> $ 4. \; \; \text{Update } F_m(x) = F_{m-1}(x) + \gamma_m h_m(x) $

> $ \; \; \; \; \; \gamma_m \text{ step-size is choosen using line search, } \gamma_m = arg\;min_\gamma \sum_{i=1}^n L(y_i,F_{m-1}(x_i)) - \gamma \frac{\partial L(y_i,F_{m-1}(x_i))}{\partial F_{m-1}(x_i)}$

> $$ \; \; \text{Final model will be, } F(x) = \frac{1}{n} \sum_{i=1}^n y_i + \sum_{m=1}^M \gamma_m h_m(x) $$


* In code below, step size is constant.
* [Nice Article](https://scikit-learn.org/stable/modules/ensemble.html#gradient-tree-boosting)
* [Nice Article](http://zpz.github.io/blog/gradient-boosting-tree-for-binary-classification/)

In [2]:
class GradientBoostingClassifier():
    def __init__(self, loss='deviance', learning_rate = 0.1, n_estimators=100, criterion='mse', 
                 max_depth=None, min_samples_split=2, max_features=None, verbose=False):
        self.__lr = learning_rate
        self.__n_estimators = n_estimators
        self.__criterion = criterion
        self.__max_depth = max_depth
        self.__min_samples_split = min_samples_split
        self.__max_features = None
        if isinstance(max_features,str):
            self.__max_features = { 
            'auto': lambda x: int(np.sqrt(x)), 'sqrt': lambda x: int(np.sqrt(x)), 
            'log2': lambda x: int(np.log2(x)), 'max_features': lambda x: x  }[max_features]
        elif isinstance(max_features, int):
            self.__max_features = lambda x: max_features
        elif isinstance(max_features, float):
            self.__max_features = lambda x: int(max_features*x)
        else:
            self.__max_features = lambda x: x
            
        self.__n_features = None
        self.__trees = []
        self.__verbose = verbose
        self.__f0 = None
    
    def __binomial_deviance(self,p_pred,y_true):
        return np.sum(-y_true*np.log(p_pred) - (1-y_true)*np.log(1-p_pred))
    
    def __negative_binomial_deviance_gradient(self,p_pred,y_true):
        grad =  -1 * (y_true - p_pred)
        return -1 * grad
    
    def __get_feature_index(self): 
        return np.random.choice( np.arange(0,self.__n_features,1), 
                                size=self.__max_features(self.__n_features), replace=False)
    
    def fit(self, X, y):
        self.__n_features = X.shape[1]
        p = self.__f0 = max( (y==1).sum(), (y==0).sum()) / len(y)
        if self.__verbose:
            print( f"Binomial Deviance Loss, Accuracy after trees {0} : {self.__binomial_deviance(p,y)}, {self.score(X,y)}" )
        for i in range(0,self.__n_estimators):
            dt = DecisionTreeRegressor(criterion=self.__criterion, 
                                       max_depth=self.__max_depth, 
                                       min_samples_split=self.__min_samples_split)
            feature_index = self.__get_feature_index()
            h = self.__negative_binomial_deviance_gradient(p,y)
            dt.fit(X[:,feature_index], h)
            self.__trees.append( (dt.tree_,feature_index) )
            p = self.predict_proba(X)[:,1]
            if self.__verbose and (i+1)%5==0:
                print( f"Binomial Deviance Loss, Accuracy after trees {i+1} : {self.__binomial_deviance(p,y)}, {self.score(X,y)}" )
            
    def predict_proba(self,X):    
        predictions = np.ones( len(X) ) * self.__f0
        for i in range(1,len(self.__trees)+1):
            root, features = self.__trees[i-1]
            predictions += self.__lr * np.array([ self.__predict_row(row,root) for row in X[:,features] ])
        proba = np.zeros( (len(X),2) )
        proba[:,0] = (1-predictions)
        proba[:,1] = predictions
        return proba
        
        
    def predict(self, X):
        proba = self.predict_proba(X)
        return (proba[:,1]>0.5)*1
            
    def __predict_row(self,row,node):
        if row[node['index']] < node['value']:
            if isinstance(node['left'], dict): return self.__predict_row(row,node['left'])
            else: return node['left']
        else:
            if isinstance(node['right'], dict): return self.__predict_row(row,node['right'])
            else: return node['right']
    
    def score(self,X,y):
        y_pred = self.predict(X)
        return (y_pred==y).sum()/len(y)

In [3]:
import pandas as pd
data = pd.read_csv('data_banknote.txt',header=None).values
X = data[:,:-1]
y = data[:,-1]
X.shape,y.shape

((1372, 4), (1372,))

In [4]:
from sklearn.model_selection import train_test_split
X_t,X_v,Y_t,Y_v = train_test_split(X,y,test_size=0.3)

In [5]:
gbr = GradientBoostingClassifier(learning_rate=0.015,
                                n_estimators=20,
                               verbose=True)
gbr.fit(X_t,Y_t)
gbr.score(X_t,Y_t),gbr.score(X_v,Y_v)

Binomial Deviance Loss, Accuracy after trees 0 : 677.5610432045382, 0.45416666666666666
Binomial Deviance Loss, Accuracy after trees 5 : 607.9841443346995, 0.45416666666666666
Binomial Deviance Loss, Accuracy after trees 10 : 547.7972720514115, 1.0
Binomial Deviance Loss, Accuracy after trees 15 : 495.24761889084925, 1.0
Binomial Deviance Loss, Accuracy after trees 20 : 449.0177164886579, 1.0


(1.0, 0.9781553398058253)