In [1]:
import numpy as np

In [41]:
toy_data = np.array([
    [1,2,4],
    [1,1,1],
    [0,0,5],
    [4,2,2]
])
toy_y= np.array([1.5,2.3,1.2,5])

In [103]:
class XGBoostRegressor():
    def __init__(self,X:np.ndarray,y:np.ndarray,num_trees=1,num_splits=10,max_depth = 1, learning_rate = 1):
        self.constants = np.ones_like(y) * y.mean()
        self.tree = None
        for i in range(num_trees):
            curr_targ = y - self.constants
            self.tree = self.__DecisionTreeRegressor__(X,curr_targ,max_depth,num_splits)
            self.constants+=   self.tree.predict(X)
    def predict(self, X):
        return self.tree.predict(X)
    class __DecisionTreeRegressor__():
        def __init__(self,X:np.ndarray,y:np.ndarray,max_depth,num_splits):
            '''
            Decision Tree Classifier:
            Parameters:
                X: An array of numerical factors
                y: The responding variable, must be continuous
                max_depth: The maximum number of nodes from root to leaves in each estimator
                num_splits: The number of splits to be tested for each factor at each node
            '''
            self.tree = self.Node_(X,y,max_depth,num_splits,self,0)

        def predict(self,X):
            '''
            Predict:
            Predicts using the established tree

            Parameters:
                X: An array of numerical factors

            Returns:
                An array of predictions.
            '''            
            total_predictions=[]
            for x in X:
                node = self.tree
                while node.prediction_value is None:
                    #decend tree
                    if x[node.split_characteristic] <= node.split: node = node.left
                    else: node = node.right
                total_predictions.append(node.prediction_value)
            return np.array(total_predictions)
        class Node_():
            def __init__(self, X,y, max_depth, num_splits, tree=None,current_depth=0):
                self.left = None
                self.right = None
                self.tree = tree
                #calculate entropy
                classes = np.unique(y)
                if len(classes) == 1:
                    self.prediction_value = classes[0]
                    return #early stopping if we have prematurely gotten a 'Pure Node'
                # get the proportions of each class in y
                p_classes = []
                for class_ in classes:
                    p_class = np.sum(y==class_)/len(y)
                    p_classes.append(p_class)

                variances = [] # used to find best split
                total_splits = []
                for column_index in range(X.shape[1]):
                    curr_column = X[:,column_index]

                    #pick 10 random potential split points
                    random_splits = np.random.random_sample(num_splits,)*(curr_column.max()-curr_column.min())\
                                    + curr_column.min()
                    total_splits = np.concatenate([total_splits, random_splits],axis=0)
    
                    # decide on best split using information gain
                    for split in random_splits:
                        #find combined mse for each split

                        y_lower = y[curr_column<=split]
                        y_higher = y[curr_column>split]
                        y_lower_mean = y_lower.mean()
                        y_higher_mean = y_higher.mean()
         
        
                        variances.append(((y_lower-y_lower_mean)**2).sum()+((y_higher-y_higher_mean)**2).sum())

                # split using best splitpoint
                arg_min = np.argmin(np.array(variances))

                self.split_characteristic = arg_min // len(random_splits)

                final_split = total_splits[arg_min]

                self.split = final_split

                final_X_lower = X[X[:,self.split_characteristic]<=self.split, :]
                final_X_higher = X[X[:,self.split_characteristic]>self.split, :]
                final_y_lower = y[X[:,self.split_characteristic]<=self.split]
                final_y_higher = y[X[:,self.split_characteristic]>self.split]
                #assign children
                if current_depth<max_depth:
                    self.left = self.tree.Node_(final_X_lower,\
                                                final_y_lower,\
                                                max_depth,\
                                                num_splits,\
                                                self.tree,\
                                                current_depth+1)
                    self.right = self.tree.Node_(final_X_higher,\
                                                 final_y_higher,\
                                                 max_depth,\
                                                 num_splits,\
                                                 self.tree,\
                                                 current_depth+1)
                    self.prediction_value = None
                else:
                    #value to predict with
                    self.prediction_value = y.mean()

In [104]:
regress = XGBoostRegressor(toy_data,toy_y,num_trees=10)

print((toy_y-regress.predict(toy_data)))

[1.00003387 1.99989839 1.00003387 5.00003387]


In [105]:
regress = XGBoostRegressor(toy_data,toy_y,num_trees=50)
print((toy_y-regress.predict(toy_data)))

[1. 2. 1. 5.]


In [109]:
class XGBoostClassifier():
    def __init__(self,X:np.ndarray,y:np.ndarray,num_trees=1,num_splits=10,max_depth = 1, learning_rate = 1):
        self.constants = np.ones_like(y) * np.median(y)
        self.tree=None
        for i in range(num_trees):
            curr_targ = (y - self.constants)>0
            self.tree = self.__DecisionTreeClassifier__(X,curr_targ,max_depth,num_splits)
            print( self.tree.predict(X))
            self.constants+=   self.tree.predict(X)
    def predict(self, X):
        return self.tree.predict(X)
    class __DecisionTreeClassifier__():
        def __init__(self,X:np.ndarray,y:np.ndarray,max_depth,num_splits):
            '''
            Decision Tree Classifier:
            Parameters:
                X: An array of numerical factors
                y: The responding variable, must be categorical
                max_depth: The maximum number of nodes from root to leaves in each estimator
                num_splits: The number of splits to be tested for each factor at each node
            '''
            self.tree = self.Node_(X,y,self,num_splits,max_depth)
        def predict(self,X):
            '''
            Predict:
            Predicts using the established tree

            Parameters:
                X: An array of numerical factors

            Returns:
                An array of predictions.
            '''
            total_predictions=[]
            for x in X:
                node = self.tree
                while node.prediction_value is None:
                    #decend tree
                    if x[node.split_characteristic] <= node.split: node = node.left
                    else: node = node.right
                total_predictions.append(node.prediction_value)
            return total_predictions
        class Node_():
            def __init__(self, X,y,tree, num_splits, max_depth,current_depth=0):
                self.left = None
                self.right = None

                self.tree = tree
                #calculate entropy
                classes = np.unique(y)
                if len(classes) == 1:
                    self.prediction_value = classes[0]
                    return #early stop
                p_classes = []
                for class_ in classes:
                    p_class = np.sum(y==class_)/len(y)
                    p_classes.append(p_class)

                self.entropy = -np.sum(np.array([p_class*np.log2(p_class) for p_class in p_classes]))


                info_gains = [] # used to find best split
                total_splits = []

                for column_index in range(X.shape[1]):
                    curr_column = X[:,column_index]

                    #pick 10 random potential split points

                    random_splits = np.random.random_sample(num_splits,)*(curr_column.max()-curr_column.min())\
                                    + curr_column.min()
                    total_splits = np.concatenate([total_splits, random_splits],axis=0)

                    # decide on best split using information gain
                    for split in random_splits:

                        y_lower = y[curr_column<=split]
                        y_higher = y[curr_column>split]
                        lower_p_classes = []
                        higher_p_classes = []
                        #find entropy of each split
                        for class_ in classes:
                            lower_p_class = np.sum(y_lower==class_)/len(y_lower)
                            lower_p_classes.append(lower_p_class)    
                            higher_p_class = np.sum(y_higher==class_)/len(y_higher)
                            higher_p_classes.append(higher_p_class)    
                        lower_entropy = -np.sum(np.array([p_class*np.log2(p_class) for p_class in lower_p_classes]))
                        higher_entropy = -np.sum(np.array([p_class*np.log2(p_class) for p_class in higher_p_classes]))

                        info_gains.append(self.entropy -  higher_entropy - lower_entropy)
                # split using best splitpoint
                arg_max = np.argmax(np.array(info_gains))
                self.split_characteristic = arg_max // len(random_splits)
                final_split = total_splits[arg_max]
                self.split = final_split
                # also split X and y
                final_X_lower = X[X[:,self.split_characteristic]<=self.split, :]
                final_X_higher = X[X[:,self.split_characteristic]>self.split, :]
                final_y_lower = y[X[:,self.split_characteristic]<=self.split]
                final_y_higher = y[X[:,self.split_characteristic]>self.split]
                #assign children
                if current_depth<max_depth:
                    self.left = self.tree.Node_(X=final_X_lower,\
                                                y=final_y_lower,\
                                                tree=self.tree,\
                                                num_splits=num_splits,\
                                                max_depth=max_depth,\
                                                current_depth=current_depth+1)
                    self.right = self.tree.Node_(X=final_X_higher,\
                                                y=final_y_higher,\
                                                tree=self.tree,\
                                                num_splits=num_splits,\
                                                max_depth=max_depth,\
                                                current_depth=current_depth+1)
                    self.prediction_value = None
                #asign property to be predicted
                else:
                    self.prediction_value = classes[np.argmax(np.array(p_classes))]

In [110]:
class_data = np.array([
    [1,2,4],
    [1,1,1],
    [0,0,5],
    [4,2,2],
        [1,2,4],
    [1,3,1],
    [0,1,5],
    [4,4,2],
        [1,2,4],
    [2,1,1],
    [0,2,5],
    [4,2,3]
])
class_y= np.array([1,2,1,5,1,2,1,5,1,2,1,5])

In [111]:
regress = XGBoostClassifier(class_data,class_y,num_trees=10)

print((regress.predict(toy_data)))

[False, False, False, True, False, False, False, True, False, False, False, True]
[False, False, False, True, False, False, False, True, False, False, False, True]
[False, False, False, True, False, False, False, True, False, False, False, True]
[False, False, False, True, False, False, False, True, False, True, False, True]
[False, False, False, False, False, False, False, False, False, False, False, False]
[False, False, False, False, False, False, False, False, False, False, False, False]
[False, False, False, False, False, False, False, False, False, False, False, False]
[False, False, False, False, False, False, False, False, False, False, False, False]
[False, False, False, False, False, False, False, False, False, False, False, False]
[False, False, False, False, False, False, False, False, False, False, False, False]
[False, False, False, False]


