### Import Libraries

In [1]:
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split

### Create Random Forest and Tree Objects

In [101]:
class RandomForest:
    """Bagged ensemble of ``Tree`` models whose predictions are averaged.

    Every tree is fitted on a bootstrap sample (rows drawn with replacement,
    same size as the training set) taken from ``X`` / ``y``.
    """

    def __init__(self, X, y, predict_fn, score_fn, vars_per_split, max_depth = 3, n_trees = 5):
        """Draw ``n_trees`` bootstrap samples and fit one tree on each.

        ``predict_fn``, ``score_fn``, ``vars_per_split`` and ``max_depth`` are
        forwarded unchanged to every ``Tree``. The sampled row indices are
        kept in ``self.bag_idxs`` (one array per tree).
        """
        self.trees = []
        self.bag_idxs = []

        for tree_no in range(n_trees):
            # Progress message once every 20 trees.
            if tree_no % 20 == 0:
                print(f'Creating tree {tree_no}')

            n_rows = X.shape[0]
            sample = np.random.choice(a = np.arange(n_rows), size = n_rows, replace = True)
            self.bag_idxs.append(sample)

            fitted = Tree(X[sample, :], y[sample], predict_fn, score_fn, vars_per_split, max_depth)
            self.trees.append(fitted)

    def __str__(self):
        return 'RandomForest model with ' + str(len(self.trees)) + ' trees'

    def predict(self, X):
        """Return, for each row of ``X``, the mean of all trees' predictions."""
        # Start from an (n_rows, 0) array so the result keeps one entry per
        # row even when the forest holds no trees.
        columns = [np.empty(shape = [X.shape[0], 0])]
        columns.extend(np.array(tree.predict(X)).reshape(-1, 1) for tree in self.trees)
        return np.mean(np.concatenate(columns, axis = 1), axis = 1)
            

In [3]:
class Tree:
    """Decision tree; a thin wrapper that owns the root ``Node``."""

    def __init__(self, X, y, predict_fn, score_fn, vars_per_split, max_depth = 3):
        """Build the root node over every row of ``X`` / ``y``.

        The root is created with ``current_depth = 0`` and receives all row
        positions ``0 .. len(y) - 1`` as its index set.
        """
        every_row = np.arange(len(y))
        self.rootNode = Node(
            'Root', X, y, predict_fn, score_fn,
            idxs = every_row,
            max_depth = max_depth,
            current_depth = 0,
            vars_per_split = vars_per_split,
        )

    def __str__(self):
        # The textual form of a tree is the textual form of its root node.
        return str(self.rootNode)

    def predict(self, X):
        """Delegate prediction for the rows of ``X`` to the root node."""
        root = self.rootNode
        return root.predict(X)

### Create Node Object

In [95]:
class Node:
    """One node of a binary decision tree that recursively grows its subtree.

    On construction the node keeps the rows selected by ``idxs``, randomly
    picks ``vars_per_split`` candidate columns, searches them for the
    threshold with the lowest ``score_fn`` value and, if that threshold
    separates the rows into two non-empty groups, creates ``self.lhn``
    (rows ``<=`` threshold) and ``self.rhn`` (rows ``>`` threshold).

    A node is a leaf exactly when ``self.score`` is ``float('inf')``; leaves
    predict ``predict_fn(self.y)``.
    """

    def __init__(self, name, X, y, predict_fn, score_fn, idxs, max_depth, current_depth, vars_per_split):
        """Store the node's data and grow the subtree below it.

        Args:
            name: Label for this node; children append ' - Left' / ' - Right'.
            X: 2-D array of features of the parent (or the full training set).
            y: 1-D array of targets aligned with the rows of ``X``.
            predict_fn: Callable mapping a target array to a leaf prediction.
            score_fn: Callable ``(X, y, value, col_idx) -> score``; lower is better.
            idxs: Row positions of ``X`` / ``y`` that belong to this node.
            max_depth: Nodes whose depth exceeds this value are not split.
            current_depth: Depth of the parent (0 when creating the root).
            vars_per_split: Number of columns sampled as split candidates.
        """
        self.name = name
        self.X = X[idxs, :]
        self.y = y[idxs]
        self.predict_fn = predict_fn
        self.score_fn = score_fn
        self.idxs = idxs
        self.num_rows = self.X.shape[0]
        self.num_cols = self.X.shape[1]
        self.score = float('inf')
        self.split_col_idx = 0
        self.vars_per_split = vars_per_split
        # Random feature subset (without replacement) considered at this node.
        self.split_cols = np.random.choice(a = np.arange(self.X.shape[1]), size = self.vars_per_split, replace = False)
        self.split_value = None
        self.max_depth = max_depth
        # This node sits one level below its parent.
        self.current_depth = current_depth + 1

        # Past the depth limit the node stays a leaf (score remains inf).
        if self.current_depth > self.max_depth:
            return

        self.find_best_split()
        self.create_child_nodes()

    def _is_leaf(self):
        """Return True when the node has no children (its score is still inf)."""
        return self.score == float('inf')

    def find_best_split(self):
        """Record the candidate column/threshold with the lowest score.

        Every value found in a candidate column is tried as a threshold, in
        row order. Each distinct value is scored once: a repeated value
        yields the same partition, so it could never be strictly better than
        the best score already recorded. Ties keep the first candidate seen.
        """
        for col in self.split_cols:
            tried = set()
            for value in self.X[:, col]:
                if value in tried:
                    continue
                tried.add(value)

                candidate_score = self.calculate_score(value, col)
                if candidate_score < self.score:
                    self.score = candidate_score
                    self.split_col_idx = col
                    self.split_value = value

    def calculate_score(self, value, col_idx):
        """Score splitting this node's rows on ``col_idx`` at ``value``."""
        return self.score_fn(self.X, self.y, value, col_idx)

    def create_child_nodes(self):
        """Create the left/right children for the recorded split, if usable.

        The node is left as a leaf when no candidate improved on ``inf``
        (``split_value`` is still None, e.g. the score function only returned
        NaN) or when the split would leave one side empty.
        """
        if self.split_value is None:
            # No valid split was found; self.score is still inf.
            return

        split_column = self.X[:, self.split_col_idx]
        lhs_idx = np.where(split_column <= self.split_value)[0]
        rhs_idx = np.where(split_column > self.split_value)[0]

        if len(lhs_idx) == 0 or len(rhs_idx) == 0:
            # Reset the score so the node is treated as a leaf.
            self.score = float('inf')
            return

        self.lhn = Node(self.name + ' - Left', self.X, self.y, self.predict_fn, self.score_fn,
                        lhs_idx, self.max_depth, self.current_depth, self.vars_per_split)
        self.rhn = Node(self.name + ' - Right', self.X, self.y, self.predict_fn, self.score_fn,
                        rhs_idx, self.max_depth, self.current_depth, self.vars_per_split)

    def predict_row(self, new_X_i):
        """Route one feature row down the tree and return the leaf prediction."""
        if self._is_leaf():
            return self.predict_fn(self.y)
        if new_X_i[self.split_col_idx] <= self.split_value:
            return self.lhn.predict_row(new_X_i)
        return self.rhn.predict_row(new_X_i)

    def predict(self, new_X):
        """Return a list with one prediction per row of ``new_X``."""
        return [self.predict_row(new_X[i]) for i in range(new_X.shape[0])]

    def __str__(self):
        # One line per node, indented two spaces per level below the root.
        prefix = '\n' + '  ' * (self.current_depth - 1) + 'Rows: ' + str(self.num_rows)
        if self._is_leaf():
            return prefix + ' - Prediction: ' + str(self.predict_fn(self.y))
        return (prefix +
                ' - Split Col: ' + str(self.split_col_idx) + ' - Split Value: ' + str(self.split_value) +
                ' - Score: ' + str(self.score) + str(self.lhn) + str(self.rhn))

### Create Scoring Functions

In [5]:
def find_gini(X, y, value, col_idx):
    """Return the size-weighted Gini impurity of splitting on a threshold.

    Rows with ``X[:, col_idx] <= value`` form the left side and rows with
    ``X[:, col_idx] > value`` form the right side. The impurity
    ``1 - sum(p_k ** 2)`` of the class labels is computed separately for
    each side and the two are combined, weighted by the share of rows on
    each side. Lower is better; 0 means both sides are pure.

    The impurity is computed per side rather than per predicted class, so a
    split whose two sides share the same majority class still gets credit
    for making one side purer. Labels may be any values ``np.unique`` can
    count (not only non-negative integers).

    Args:
        X: 2-D feature array.
        y: 1-D array of class labels aligned with the rows of ``X``.
        value: Threshold applied to the chosen column.
        col_idx: Index of the column to split on.

    Returns:
        The weighted Gini impurity as a float (0.0 for an empty ``y``).
    """
    labels = np.asarray(y)
    total = len(labels)
    if total == 0:
        return 0.0

    column = X[:, col_idx]
    sides = (np.where(column <= value)[0], np.where(column > value)[0])

    weighted_impurity = 0.0
    for side in sides:
        if len(side) == 0:
            # An empty side contributes nothing (e.g. value is the column max).
            continue
        _, counts = np.unique(labels[side], return_counts=True)
        class_fractions = counts / len(side)
        impurity = 1 - np.sum(class_fractions ** 2)
        weighted_impurity += (len(side) / total) * impurity

    return weighted_impurity

def find_rmse(X, y, value, col_idx):
    """Return the RMSE obtained by predicting each side's mean after a split.

    Rows with ``X[:, col_idx] <= value`` are predicted with the mean target
    of that group, rows with ``X[:, col_idx] > value`` with the mean of
    theirs. Lower is better.

    An empty side is skipped instead of averaged, so no "mean of empty
    slice" RuntimeWarning is raised when ``value`` is the column maximum.

    Args:
        X: 2-D feature array.
        y: 1-D numeric target array aligned with the rows of ``X``.
        value: Threshold applied to the chosen column.
        col_idx: Index of the column to split on.

    Returns:
        Root mean squared error of the two-mean prediction.
    """
    column = X[:, col_idx]
    sides = (np.where(column <= value)[0], np.where(column > value)[0])

    preds = np.zeros(len(y))
    for side in sides:
        if side.size:
            preds[side] = np.mean(y[side])

    return np.sqrt(np.mean((y - preds) ** 2))

def _label_entropy(labels):
    """Return the base-2 Shannon entropy of the labels (0.0 when empty)."""
    if len(labels) == 0:
        return 0.0
    _, counts = np.unique(labels, return_counts=True)
    fractions = counts / counts.sum()
    return -np.sum(fractions * np.log2(fractions))


def find_information_gain(X, y, value, col_idx):
    """Return the negated information gain of splitting on a threshold.

    Information gain is the entropy of ``y`` minus the size-weighted entropy
    of the labels on each side of the split (``X[:, col_idx] <= value`` and
    ``X[:, col_idx] > value``). The result is negated so that, like the
    other scoring functions, lower is better: 0 means the split teaches
    nothing, more negative means a more informative split.

    The entropies are taken over the actual labels on each side, so the
    purity of each side is what drives the score.

    Args:
        X: 2-D feature array.
        y: 1-D array of class labels aligned with the rows of ``X``.
        value: Threshold applied to the chosen column.
        col_idx: Index of the column to split on.

    Returns:
        ``-(information gain)`` as a float (0.0 for an empty ``y``).
    """
    labels = np.asarray(y)
    total = len(labels)
    if total == 0:
        return 0.0

    column = X[:, col_idx]
    sides = (np.where(column <= value)[0], np.where(column > value)[0])

    children_entropy = 0.0
    for side in sides:
        children_entropy += (len(side) / total) * _label_entropy(labels[side])

    # Make answer negative to minimize
    return children_entropy - _label_entropy(labels)

### Create Prediction Logic Function

In [6]:
def predict_mode(y):
    """Return the most frequent value in ``y``.

    Ties are resolved in favour of the smallest value. Works for any labels
    that ``np.unique`` can sort and count (negative integers, floats,
    strings), not only non-negative integers.

    Raises:
        ValueError: If ``y`` is empty.
    """
    labels, counts = np.unique(y, return_counts=True)
    # argmax returns the first maximum; labels are sorted ascending.
    return labels[counts.argmax()]

def predict_mean(y):
    """Leaf prediction for regression: the arithmetic mean of ``y``."""
    average = np.mean(y)
    return average

### Import Data

In [125]:
iris = datasets.load_iris() # Classification
wine = datasets.load_wine() # Classification
breast_cancer = datasets.load_breast_cancer() # Classification
diabetes = datasets.load_diabetes() # Regression

# NOTE: load_boston was deprecated in scikit-learn 1.0 and removed in 1.2,
# where accessing it raises ImportError. Keep using it when it exists and
# fall back to the diabetes regression set otherwise so the cell still runs.
try:
    boston = datasets.load_boston() # Regression
except (ImportError, AttributeError):
    boston = None
    print('load_boston is unavailable in this scikit-learn version; using diabetes instead')

regression_data = diabetes if boston is None else boston

# Hold out half of the rows for testing; fixed seed for a repeatable split.
train_X, test_X, train_y, test_y = train_test_split(regression_data['data'], regression_data['target'],
                                                    test_size = .5, random_state = 0)


### Train and Test Model

In [126]:
# Fit a 50-tree regression forest: leaves predict the mean target and splits
# are chosen by RMSE, sampling 4 candidate columns per node.
rf = RandomForest(
    train_X, train_y, predict_mean, find_rmse,
    vars_per_split = 4, max_depth = 100, n_trees = 50,
)
print(rf)

# Predict the held-out rows once and reuse the result below.
test_preds = rf.predict(test_X)
print(np.round(test_preds))
print(test_y)

# For a classification dataset, report accuracy instead of RMSE:
# np.mean(np.round(rf.predict(test_X)) == test_y)

# Test-set RMSE; left as the cell's final expression so the notebook shows it.
np.sqrt(np.mean((test_preds - test_y) ** 2))

Creating tree 0
Creating tree 20
Creating tree 40
RandomForest model with 50 trees
[24. 20. 22.  9. 21. 21. 21. 20. 21. 19. 10. 12. 15.  8. 47. 33. 20. 35.
 26. 21. 24. 20. 20. 25. 21. 16. 17. 17. 40. 19. 15. 18. 20. 21. 24. 19.
  8. 19. 16. 15. 24. 20. 23. 16. 24. 23. 21. 19. 17. 24. 18. 20. 21. 37.
 17. 20. 21. 19. 16. 20. 23. 21. 32. 33. 19. 29. 18. 19. 13. 22. 21. 23.
 26. 30. 25.  9. 40. 22. 25. 20. 27. 18. 16. 40. 40. 24. 24. 17. 25. 17.
 20. 13. 23. 30. 21. 22. 12. 25. 16. 19. 24. 20. 28. 22. 28. 20.  9. 17.
 22. 25. 35. 11. 19. 20. 18. 21. 12. 19. 11. 44. 31. 10. 19. 20. 22. 19.
 38. 18. 21. 33. 13. 11. 15. 20. 13. 34. 22. 19. 27. 10.  9. 21. 33. 26.
 24. 17. 34. 33. 13.  9. 30. 27. 14. 25. 17. 26. 19. 16. 20. 15. 12. 19.
 34. 34. 30. 21. 23. 26. 24. 13. 29. 13. 20. 26. 20.  9. 19. 19. 24. 23.
 16. 15. 20. 20. 18. 26. 26. 28. 40. 17. 13. 31. 31. 20. 46. 25. 26. 19.
 24. 42. 24. 18. 44. 34. 16. 25. 18. 24. 22. 19. 26. 27. 14. 22. 20. 12.
 16. 25. 14. 33. 10. 21. 18. 16. 22. 21. 

4.238314304676364