## Random Forest

### A random forest keeps a collection of decision trees. Based on the type of variable you are predicting you either find the mean of the tree's predictions or you find the most common prediction (in the case of a catagorical response variable).  

In [482]:
import numpy as np

In [551]:
class RandomForest():
    def __init__(self,X:np.ndarray,y:np.ndarray,num_trees=1,max_depth=3,num_splits=10):
        '''
        Random Forest:
        Parameters:
            X: An array of numerical factors
            y: The responding variable, can be categorical or continuous
            num_trees: The number of estimators which are voting to form a prediction
            max_depth: The maximum number of nodes from root to leaves in each estimator
            num_splits: The number of splits to be tested for each factor at each node
        '''
        self.trees = []
        self.__is_continuous__ = self.__is_continuous__(y)

        for tree in range(num_trees):
            if self.__is_continuous__:
                self.trees.append(self.__DecisionTreeRegressor__(X,y,max_depth,num_splits))
            else:
                self.trees.append(self.__DecisionTreeClassifier__(X,y,max_depth,num_splits))
    def predict(self,X):
        '''
        Predict:
        Parameters:
            X: An array of numerical factors
            
        Returns:
            A series of predictions, the continuity of predictions depends on the labels the model was trained on
        '''
        total_predictions = []
        for x in X:
            predictions = []
            for tree in self.trees:
                predictions.append(tree.predict(x))

            if self.__is_continuous__:
                'For regression we find the mean of the predictions'

                total_predictions.append(np.array(predictions).mean())
            else:
                'For classification we find the most common prediction'
                counts = {}
                for prediction in predictions:
                    if prediction in counts:
                        counts[prediction] += 1
                    else:
                        counts[prediction] = 1
                most = 0
                most_key=''
                for key in counts.keys():
                    if counts[key] > most:
                        most = counts[key]
                        most_key=key
                total_predictions.append( most_key)    
        return np.array(total_predictions)
    class __DecisionTreeClassifier__():
        def __init__(self,X:np.ndarray,y:np.ndarray,max_depth,num_splits):
            "Constructs a Decision Tree which uses Information Gain to choose the best splits"
            self.tree = self.Node_(X,y,self,num_splits,max_depth)
        def predict(self,x):
            "Predicts using the established tree"
            node = self.tree
            while node.prediction_value is None:
                #decend tree
                if x[node.split_characteristic] <= node.split: node = node.left
                else: node = node.right
            return node.prediction_value
        class Node_():
            def __init__(self, X,y,tree, num_splits, max_depth,current_depth=0):
                self.left = None
                self.right = None

                self.tree = tree
                #calculate entropy
                classes = np.unique(y)
                if len(classes) == 1:
                    self.prediction_value = classes[0]
                    # early stop as we have reached a pur
                    return
                p_classes = []
                for class_ in classes:
                    p_class = np.sum(y==class_)/len(y)
                    p_classes.append(p_class)

                self.entropy = -np.sum(np.array([p_class*np.log2(p_class) for p_class in p_classes]))


                info_gains = [] # used to find best split
                total_splits = []

                for column_index in range(X.shape[1]):
                    curr_column = X[:,column_index]

                    #pick 10 random potential split points

                    random_splits = np.random.random_sample(num_splits,)*(curr_column.max()-curr_column.min())\
                                    + curr_column.min()
                    total_splits = np.concatenate([total_splits, random_splits],axis=0)

                    # decide on split using information gain
                    for split in random_splits:
                        
                        y_lower = y[curr_column<=split]
                        y_higher = y[curr_column>split]
                        lower_p_classes = []
                        higher_p_classes = []
                        for class_ in classes:
                            lower_p_class = np.sum(y_lower==class_)/len(y_lower)
                            lower_p_classes.append(lower_p_class)    
                            higher_p_class = np.sum(y_higher==class_)/len(y_higher)
                            higher_p_classes.append(higher_p_class)    
                        lower_entropy = -np.sum(np.array([p_class*np.log2(p_class) for p_class in lower_p_classes]))
                        higher_entropy = -np.sum(np.array([p_class*np.log2(p_class) for p_class in higher_p_classes]))
                        info_gain = self.entropy -  higher_entropy - lower_entropy
                        info_gains.append(info_gain)
                # split using best splitpoint
                arg_max = np.argmax(np.array(info_gains))
                self.split_characteristic = arg_max // len(random_splits)
                final_split = total_splits[arg_max]
                self.split = final_split

                final_X_lower = X[X[:,self.split_characteristic]<=self.split, :]
                final_X_higher = X[X[:,self.split_characteristic]>self.split, :]
                final_y_lower = y[X[:,self.split_characteristic]<=self.split]
                final_y_higher = y[X[:,self.split_characteristic]>self.split]
                #assign children
                if current_depth<max_depth:
                    self.left = self.tree.Node_(X=final_X_lower,\
                                                y=final_y_lower,\
                                                tree=self.tree,\
                                                num_splits=num_splits,\
                                                max_depth=max_depth,\
                                                current_depth=current_depth+1)
                    self.right = self.tree.Node_(X=final_X_higher,\
                                                y=final_y_higher,\
                                                tree=self.tree,\
                                                num_splits=num_splits,\
                                                max_depth=max_depth,\
                                                current_depth=current_depth+1)
                    self.prediction_value = None
                else:
                    #value to predict with
                    self.prediction_value = classes[np.argmax(np.array(p_classes))]
    class __DecisionTreeRegressor__():
        def __init__(self,X:np.ndarray,y:np.ndarray,max_depth,num_splits):
            "Constructs a Decision Tree which uses the sum of variance to choose the best splits"
            self.tree = self.Node_(X,y,max_depth,num_splits,self,0)

        def predict(self,x):
            "Predicts using the established tree"
            node = self.tree
            while node.prediction_value is None:
                #decend tree
                if x[node.split_characteristic] <= node.split: node = node.left
                else: node = node.right
            return node.prediction_value
        class Node_():
            def __init__(self, X,y, max_depth, num_splits, tree=None,current_depth=0):
                self.left = None
                self.right = None
                self.tree = tree
                #calculate entropy
                classes = np.unique(y)
                if len(classes) == 1:
                    self.prediction_value = classes[0]
                    return #early stop
                p_classes = []
                for class_ in classes:
                    p_class = np.sum(y==class_)/len(y)
                    p_classes.append(p_class)

                variances = [] # used to find best split
                total_splits = []
                for column_index in range(X.shape[1]):
                    curr_column = X[:,column_index]

                    #pick 10 random potential split points
                    #TODO: make this related to the column's value's range in a smart way
                    random_splits = np.random.random_sample(num_splits,)*(curr_column.max()-curr_column.min())\
                                    + curr_column.min()
                    total_splits = np.concatenate([total_splits, random_splits],axis=0)
    
                    # decide on split using information gain
                    for split in random_splits:

                        y_lower = y[curr_column<=split]
                        y_higher = y[curr_column>split]
                        y_lower_mean = y_lower.mean()
                        y_higher_mean = y_higher.mean()
         
        
                        variances.append(((y_lower-y_lower_mean)**2).sum()+((y_higher-y_higher_mean)**2).sum())

                # split using best splitpoint
                arg_min = np.argmin(np.array(variances))

                self.split_characteristic = arg_min // len(random_splits)

                final_split = total_splits[arg_min]

                self.split = final_split

                final_X_lower = X[X[:,self.split_characteristic]<=self.split, :]
                final_X_higher = X[X[:,self.split_characteristic]>self.split, :]
                final_y_lower = y[X[:,self.split_characteristic]<=self.split]
                final_y_higher = y[X[:,self.split_characteristic]>self.split]
                #assign children
                if current_depth<max_depth:
                    self.left = self.tree.Node_(final_X_lower,\
                                                final_y_lower,\
                                                max_depth,\
                                                num_splits,\
                                                self.tree,\
                                                current_depth+1)
                    self.right = self.tree.Node_(final_X_higher,\
                                                 final_y_higher,\
                                                 max_depth,\
                                                 num_splits,\
                                                 self.tree,\
                                                 current_depth+1)
                    self.prediction_value = None
                else:
                    #value to predict with
                    self.prediction_value = y.mean()

    def __is_continuous__(self,x):

        if type(x[0]) in [np.float64,np.float,np.float128,np.float16,np.float32]: return True
        # Enough ints that we can consider them continuous
        elif type(x[0]) == np.int64 and len(np.unique(x[0])) > 10: return True
        else: return False

In [552]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
dataset = load_breast_cancer()

In [553]:
data = dataset['data']
target = dataset['target']

data_train, data_test, target_train, target_test = train_test_split(data, target, test_size=0.2, random_state=42)

In [554]:
forest =RandomForrest(data_train,target_train,num_trees=100,max_depth=5,num_splits=10)



In [536]:
predictions = forest.predict(data_test)

In [537]:
#print recall as this is critical for medical tests

TP = np.array([prediction and target for prediction,target in zip(predictions,target_test)])
FN = np.array([ target and not prediction  for prediction,target in zip(predictions,target_test)])
print(TP.sum()/(FN.sum()+TP.sum()))


1.0


### Really? That seems too good...

In [538]:
predictions[:10]

array([1, 0, 1, 1, 1, 0, 0, 1, 1, 1])

In [539]:
FN.sum()

0

### Thats good!

## Lets regress...


In [540]:
from sklearn.datasets import load_boston
dataset = load_boston()

In [541]:
data = dataset['data']
target = dataset['target']

data_train, data_test, target_train, target_test = train_test_split(data, target, test_size=0.2, random_state=42)

In [542]:
regression_forrest = RandomForest(data_train,target_train,max_depth=7,num_splits=10)

  ret = ret.dtype.type(ret / rcount)


In [543]:
predictions = np.array(regression_forrest.predict(data_test))

In [545]:
predictions[:10]

array([23.41      , 25.76666667, 15.44375   , 23.41      , 15.58      ,
       20.33658537, 20.96363636, 19.3       , 20.33658537, 18.00416667])

In [546]:
target_test[:10]

array([23.6, 32.4, 13.6, 22.8, 16.1, 20. , 17.8, 14. , 19.6, 16.8])

In [547]:
from sklearn.metrics import mean_squared_error

In [548]:
mean_squared_error(predictions,target_test)

11.17451755167276

In [549]:
max(predictions-target_test)

6.600000000000001

In [550]:
min(predictions-target_test)

-9.799999999999997

### Again it seems we've made some errors but they are less than with one decision tree due to lower variance (due to the voting of the trees)