Naive Implementation of Gradient Boosting Classifier

1. Transform, split the imported dataset.
2. ModelFit, encompassing the logic of gradient boost classifier model
3. Predict
4. Accuracy of the model

Sources: StatQuest


In [2]:
from sklearn.datasets import load_breast_cancer
from sklearn.tree import DecisionTreeRegressor
import pandas as pd
import numpy as np
cancer = load_breast_cancer()
df = pd.DataFrame(cancer['data'], columns=cancer['feature_names'])
df['target'] = cancer['target']

In [8]:
def data_transform(df):

  # Splitting data into training and testing data
  df = df.sample(frac=1).reset_index(drop=True)
  no_of_train_rows = int(np.floor(0.80*df.shape[0]))
  train_df = df.iloc[0:no_of_train_rows,:]
  test_df = df.iloc[no_of_train_rows:,:]

  l = df.shape[1]
  train_x = train_df.iloc[:,0:l-2]
  train_y = train_df.iloc[:,-1]
  test_x = test_df.iloc[:,0:l-2].reset_index(drop=True)
  test_y = test_df.iloc[:,-1].reset_index(drop=True)

  return train_x,train_y,test_x,test_y

In [12]:
def modelfit(x,y,NofTrees=25,maxLeaf=16,maxDepth=4):

  ##################################################################
  # Create 4 arrays named Prob,residuals, leaf & leaf_transformed_value
  # 1. Prob : holds the probability value for each instance & tree
  # 2. residuals : holds the residuals for each instance & tree
  # 3. leaf : gives the leaf node values for each instance
  # 4. lead_transformed_value = holds the transformed leaf node value
  # for each leaf.
  ##################################################################
  prob = np.zeros((len(y),NofTrees+2))
  residuals = np.zeros((len(y),NofTrees+2))
  leaf = np.zeros(((maxLeaf),NofTrees+1))
  leaf_transformed_value = np.zeros(((maxLeaf),NofTrees+1))

  trees = []

   ##################################################################
  # Calculate the log of Odds, to get the pilot- 0th Probability
  # np.log(p,q)
  ##################################################################

  true_flag = len(y[y == 1].index)
  false_flag = len(y[y == 0].index)
  prob1 = np.log(true_flag/false_flag) # Pilot Probability
  prob[:,0]=np.repeat(prob1,len(y))
  residuals[:,0] = y - prob1


  for i in range(0,NofTrees+1):

  ##################################################################
  # Using DecisionTreeRegressor of the Sklearn Library
  # Saving the model for prediction of unknown instances.
  # tree.apply : Gives the leaf nodes of the tree
  ##################################################################

    dt = DecisionTreeRegressor(max_depth=maxDepth,max_leaf_nodes=maxLeaf)
    node = dt.fit(x, residuals[:,i])
    trees.append(node)

    leaf_indices=dt.apply(x)
    unique_leaves = np.unique(leaf_indices)
    leaf[0:len(unique_leaves),i] = unique_leaves
  
    n_leaf = len(unique_leaves)

    for ileaf in range(n_leaf):

    ##################################################################
    # For each unique leaf index, we aggregate the train_x
    # instances, and collect them in local variables within the loop
    # 1. res( for residuals corresponding to the unqiue leaf node)
    # 2. pb ( for probability corresponding to the unique lead node)
    ##################################################################
      
      leaf_index=unique_leaves[ileaf]
      indexes = np.argwhere(leaf_indices == unique_leaves[ileaf])
      res = residuals[[indexes],i]    
      pb = prob[[indexes],i]

      ##################################################################
      # 1.calculating the transformed leaf value
      # 2.Assigning the transformed value to the corresponding leaf value
      #   index in the leaf_transformed_value array, which is directly
      #   accessed while predicting new instances.
      # Note: One can say storing the leaf values corresponding to leaf nodes
      # but only transformed.
      ##################################################################
      
      numerator = np.sum(res)
      denomenator = np.sum(pb*(1-pb))
      transformed_leaf = numerator/denomenator
      leaf_transformed_value[ileaf,i] = transformed_leaf
      intermediate_prob = prob[[indexes],0] + 0.1 * transformed_leaf

      ##################################################################
      # Converting the leaf val to prob, using sigmoid function
      ##################################################################
      prob[[indexes],i+1] = np.exp(intermediate_prob)/(1+np.exp(intermediate_prob))
     
      y_temp = np.zeros(len(indexes))
      for k in range(0,len(indexes)):
        y_temp[k] = y[indexes[k]]

      ##################################################################
      # Recalculating the new residuals, based on the new calculated
      # probability.
      ##################################################################
      f= y_temp.ravel() - prob[[indexes],i+1].ravel()
      residuals[[indexes],i+1] = f.reshape(1,len(indexes),1)
 
  return trees, prob,residuals, leaf, leaf_transformed_value

In [5]:
def predict(x,y,trees,leaf,trans_leaf,lr):

  predict_leaf = np.zeros((len(x),len(trees)))
  ##################################################################
  # Using the Apply functionality of DecisionRegression Tree
  # We first extract the leaf terminal where each test instance
  # ends and the same is done for all the fitted trees.
  ##################################################################
  for k in range(len(trees)):
    predict_leaf[:,k] = trees[k].apply(x)

  ##################################################################
  # Formula for Gradient based additive prediction is executed
  # in the below lines of code
  # pred = pred_pilot + lr(learning_rate)* transformed_leaf_val+...
  ##################################################################  
  final_pred = []

  for i in range(len(x)):
    prediction = prob[0,0] 
    for k in range(len(trees)):

      loc = np.argwhere(leaf[:,k] == predict_leaf[i,k])
      val = trans_leaf[loc,k]
      prediction += lr * val

    final_pred.extend(prediction.ravel())

  ##################################################################
  # 1. Calculating the final predicted probability by transforming
  #    predicted value using sigmoid transform.
  # 2. Keeping the threshold value @0.5, predictions <=0.5 are rounded
  #    of to 0, and predictions > 0.5 are rounded to 1.
  ##################################################################
  final_prob = np.zeros((len(x),1))
  final_prob = np.exp(final_pred)/(1+np.exp(final_pred))

  y_pred = np.zeros((len(x),1))
  for i in range(len(y)):
    if final_prob[i] <= 0.5:
      y_pred[i] = 0
    else:
      y_pred[i] = 1

  return y_pred

In [6]:
##################################################################
# Function to measure the accuracy of the model
##################################################################
def accuracy(test_y,y_pred):
  count = 0
  for i in range(len(test_y)):
    if y_pred[i] == test_y[i]:
      count +=1
  return count/len(test_y) * 100

In [13]:
# Model Execution #

train_x,train_y,test_x,test_y = data_transform(df)
trees, prob,residuals,leaf, trans_leaf = modelfit(train_x,train_y)
lr = 0.1
y_pred = predict(test_x,test_y,trees,leaf,trans_leaf,lr)
model_accuracy = accuracy(test_y,y_pred)
print("Accuracy of the Gradient Boost Classification Model:",model_accuracy)

Accuracy of the Gradient Boost Classification Model: 93.85964912280701
