### Import NumPy and scikit-learn

In [1]:
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split

### Create Tree Object

In [2]:
class Tree:
    """Decision tree facade that owns the root ``Node`` and delegates to it.

    Constructing a ``Tree`` constructs the root ``Node`` over every row of
    the training data; the ``Node`` constructor grows the rest of the tree.

    Parameters
    ----------
    X : 2-D array of training features (indexed as ``X[idxs, :]`` by ``Node``).
    y : 1-D array of training targets, aligned with the rows of ``X``.
    predict_fn : callable ``predict_fn(y_subset)`` giving a leaf's prediction.
    score_fn : callable ``score_fn(X, y, value, col_idx)`` scoring a candidate
        split; lower scores are preferred.
    max_depth : maximum number of split levels below the root (default 3).
    """

    def __init__(self, X, y, predict_fn, score_fn, max_depth = 3):
        # The root sees every training row; children receive subsets.
        every_row = np.arange(len(y))
        self.rootNode = Node(
            'Root', X, y, predict_fn, score_fn,
            idxs=every_row, max_depth=max_depth, current_depth=0,
        )

    def predict(self, X):
        """Return a list with one prediction per row of ``X``."""
        root = self.rootNode
        return root.predict(X)

    def __str__(self):
        """Return the indented, multi-line description of the whole tree."""
        return str(self.rootNode)

### Create Node Object

In [3]:
class Node:
    """One node of a binary decision tree; the subtree is grown on construction.

    The constructor keeps the rows selected by ``idxs``, and, unless the depth
    limit has been reached, searches every (column, value) pair for the split
    with the lowest ``score_fn`` result and recursively builds the left
    (``<= split_value``) and right (``> split_value``) children.

    A node whose ``score`` is ``float('inf')`` is a leaf: either the depth
    limit was hit, no candidate produced a finite score, or the best split
    left one side empty.

    Parameters
    ----------
    name : label for this node (children append ' - Left' / ' - Right').
    X : 2-D array of features of the parent (or of the full data for the root).
    y : 1-D array of targets aligned with the rows of ``X``.
    predict_fn : callable ``predict_fn(y_subset)`` giving a leaf's prediction.
    score_fn : callable ``score_fn(X, y, value, col_idx)``; lower is better.
    idxs : row indices (into ``X``/``y``) that belong to this node.
    max_depth : deepest level at which a node may still split.
    current_depth : depth of the parent (0 for the root).
    """

    def __init__(self, name, X, y, predict_fn, score_fn, idxs, max_depth, current_depth):
        self.name = name
        self.X = X[idxs, :]
        self.y = y[idxs]
        self.predict_fn = predict_fn
        self.score_fn = score_fn
        self.idxs = idxs
        self.num_rows = self.X.shape[0]
        self.num_cols = self.X.shape[1]
        # +inf doubles as the "this node is a leaf" marker (see is_leaf).
        self.score = float('inf')
        self.split_col_idx = 0
        self.split_value = None
        self.max_depth = max_depth
        self.current_depth = current_depth + 1

        # Nodes past the depth limit stay leaves and never search for a split.
        if self.current_depth > self.max_depth:
            return

        self.find_best_split()
        self.create_child_nodes()

    @property
    def is_leaf(self):
        """True when this node has no children and predicts directly."""
        return self.score == float('inf')

    def find_best_split(self):
        """Record the (column, value) pair with the lowest score.

        Each distinct value of a column is scored exactly once, visited in
        order of first appearance so that ties are broken the same way as a
        plain row-by-row scan (the first strictly better candidate wins).
        """
        for col in range(self.num_cols):
            column = self.X[:, col]
            # Row positions where each distinct value first appears.
            _, first_rows = np.unique(column, return_index=True)
            for row in np.sort(first_rows):
                value = column[row]
                candidate = self.calculate_score(value, col)
                if candidate < self.score:
                    self.score = candidate
                    self.split_col_idx = col
                    self.split_value = value

    def calculate_score(self, value, col_idx):
        """Score splitting this node's rows on ``X[:, col_idx] <= value``."""
        return self.score_fn(self.X, self.y, value, col_idx)

    def create_child_nodes(self):
        """Build the left/right children, or turn this node into a leaf."""
        # No candidate beat +inf (e.g. the score function only returned
        # inf/NaN or the node has no rows): there is nothing to split on.
        if self.split_value is None:
            self.score = float('inf')
            return

        column = self.X[:, self.split_col_idx]
        lhs_idx = np.where(column <= self.split_value)[0]
        rhs_idx = np.where(column > self.split_value)[0]

        # A split that sends every row to one side is useless; stay a leaf.
        if len(lhs_idx) == 0 or len(rhs_idx) == 0:
            self.score = float('inf')
            return

        # Child indices are relative to this node's already-subset X and y.
        self.lhn = Node(self.name + ' - Left', self.X, self.y, self.predict_fn, self.score_fn,
                        lhs_idx, self.max_depth, self.current_depth)
        self.rhn = Node(self.name + ' - Right', self.X, self.y, self.predict_fn, self.score_fn,
                        rhs_idx, self.max_depth, self.current_depth)

    def predict_row(self, new_X_i):
        """Return the prediction for a single feature row."""
        if self.is_leaf:
            return self.predict_fn(self.y)
        if new_X_i[self.split_col_idx] <= self.split_value:
            return self.lhn.predict_row(new_X_i)
        return self.rhn.predict_row(new_X_i)

    def predict(self, new_X):
        """Return a list with one prediction per row of ``new_X``."""
        return [self.predict_row(row) for row in new_X]

    def __str__(self):
        """Describe this node and its subtree, one indented line per node."""
        header = '\n' + '  ' * (self.current_depth - 1) + 'Rows: ' + str(self.num_rows)
        if self.is_leaf:
            return header + ' - Prediction: ' + str(self.predict_fn(self.y))
        return (header +
                ' - Split Col: ' + str(self.split_col_idx) +
                ' - Split Value: ' + str(self.split_value) +
                ' - Score: ' + str(self.score) +
                str(self.lhn) + str(self.rhn))

### Create Scoring Functions

In [4]:
def find_gini(X, y, value, col_idx):
    """Return the weighted Gini impurity of splitting on ``X[:, col_idx] <= value``.

    The rows are divided into a left side (``<= value``) and a right side
    (``> value``). Each non-empty side contributes its Gini impurity
    ``1 - sum(p_k ** 2)`` (``p_k`` = share of class ``k`` on that side),
    weighted by the fraction of rows it holds. Lower is better; 0 means both
    sides are pure.

    Impurity is measured per split side rather than per predicted label, so a
    split still gets credit for making the children purer when both children
    share the same majority class.

    Parameters
    ----------
    X : 2-D array of features.
    y : 1-D array of class labels aligned with the rows of ``X``.
    value : threshold to split on.
    col_idx : index of the column of ``X`` to split on.

    Returns
    -------
    float in ``[0, 1)``; ``0.0`` when ``y`` is empty.
    """
    y = np.asarray(y)
    total = len(y)
    if total == 0:
        return 0.0

    column = X[:, col_idx]
    weighted_impurity = 0.0
    for side in (column <= value, column > value):
        labels = y[side]
        # An empty side holds no rows and therefore carries no weight.
        if labels.size == 0:
            continue
        _, counts = np.unique(labels, return_counts=True)
        shares = counts / labels.size
        weighted_impurity += (labels.size / total) * (1.0 - np.sum(shares ** 2))

    return weighted_impurity

def find_rmse(X, y, value, col_idx):
    """Return the RMSE of predicting each split side by its own mean.

    Rows with ``X[:, col_idx] <= value`` are predicted with the mean of their
    targets, rows with ``X[:, col_idx] > value`` with the mean of theirs, and
    the root-mean-square error over all rows is returned. Lower is better.

    Parameters
    ----------
    X : 2-D array of features.
    y : 1-D array of numeric targets aligned with the rows of ``X``.
    value : threshold to split on.
    col_idx : index of the column of ``X`` to split on.
    """
    column = X[:, col_idx]
    preds = np.zeros(len(y))

    for side in (column <= value, column > value):
        # Skip empty sides: averaging an empty slice yields NaN plus a
        # RuntimeWarning and contributes nothing to the predictions anyway.
        if side.any():
            preds[side] = np.mean(y[side])

    return np.sqrt(np.mean((y - preds) ** 2))

def find_information_gain(X, y, value, col_idx):
    """Return the negated information gain of splitting on ``X[:, col_idx] <= value``.

    Information gain is the entropy of ``y`` minus the size-weighted entropy
    of the left (``<= value``) and right (``> value``) sides. The result is
    negated so that, like the other scoring functions, lower is better:
    ``0`` means the split tells us nothing, more negative means more gain.

    Parameters
    ----------
    X : 2-D array of features.
    y : 1-D array of class labels aligned with the rows of ``X``.
    value : threshold to split on.
    col_idx : index of the column of ``X`` to split on.

    Returns
    -------
    float ``<= 0``; ``0.0`` when ``y`` is empty.
    """
    y = np.asarray(y)
    total = len(y)
    if total == 0:
        return 0.0

    def entropy(labels):
        # Shannon entropy in bits; every share is > 0, so log2 is safe.
        _, counts = np.unique(labels, return_counts=True)
        shares = counts / labels.size
        return -np.sum(shares * np.log2(shares))

    column = X[:, col_idx]
    children_entropy = 0.0
    for side in (column <= value, column > value):
        labels = y[side]
        # An empty side holds no rows and therefore carries no weight.
        if labels.size:
            children_entropy += (labels.size / total) * entropy(labels)

    # Negated gain: (weighted child entropy) - (parent entropy).
    return children_entropy - entropy(y)

### Create Prediction Logic Functions

In [5]:
def predict_mode(y):
    """Return the most frequent label in ``y``.

    Labels are counted with ``np.bincount``, so ``y`` must hold non-negative
    integers and must not be empty. When several labels tie for the highest
    count, the smallest of them is returned.
    """
    label_counts = np.bincount(y)
    return np.argmax(label_counts)

def predict_mean(y):
    """Return the arithmetic mean of the values in ``y`` (via ``np.mean``)."""
    average = np.mean(y)
    return average

### Import Data

In [54]:
# Classification datasets.
iris = datasets.load_iris()
wine = datasets.load_wine()
breast_cancer = datasets.load_breast_cancer()

# Regression datasets.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2, so this cell only runs on older releases. The recorded output of the
# next cell depends on this dataset, so it is left in place -- confirm before
# swapping it for another regression set (e.g. `diabetes`).
boston = datasets.load_boston()
diabetes = datasets.load_diabetes()

# Hold out half of the Boston rows for testing; the seed is fixed so the
# split is reproducible.
train_X, test_X, train_y, test_y = train_test_split(
    boston.data, boston.target, test_size=0.5, random_state=0)


### Train and Test Model

In [56]:
# Fit a depth-10 regression tree: leaves predict the mean target and splits
# are chosen by lowest RMSE.
tree = Tree(train_X, train_y, predict_fn=predict_mean, score_fn=find_rmse, max_depth=10)

# Uncomment to dump the fitted tree structure:
# print(tree)

# Predict once and reuse the result for both the printout and the metric.
predictions = tree.predict(test_X)
print(np.round(predictions))
print(test_y)

# Test-set RMSE (left as the final expression so the notebook displays it).
# For a classification tree, report accuracy instead:
# np.mean(predictions == test_y)
np.sqrt(np.mean((predictions - test_y) ** 2))

[25. 14. 23.  8. 21. 19. 22. 19. 15. 19.  7.  8. 18.  5. 50. 35. 21. 35.
 25. 20. 23. 23. 19. 25. 21. 14. 19. 13. 42. 20. 15. 21. 18. 21. 23. 20.
  8. 20. 15. 13. 21. 21. 19. 13. 25. 23. 22. 18. 16. 21. 18. 20. 23. 47.
 13. 20. 19. 16. 14. 19. 23. 23. 32. 32. 21. 32. 18. 19.  8. 23. 21. 21.
 25. 32. 25.  8. 42. 23. 23. 17. 32. 20. 14. 42. 42. 25. 21. 12. 28. 19.
 18. 12. 23. 25. 21. 25. 12. 24. 16. 16. 23. 19. 31. 22. 31. 19.  8. 20.
 21. 19. 30.  8. 20. 18. 19. 23. 11. 20. 10. 42. 32. 11. 20. 17. 23. 17.
 35. 20. 16. 36. 17.  8. 15. 23. 11. 36. 23. 16. 28.  9.  5. 22. 30. 25.
 28. 19. 35. 37. 14. 11. 25. 25. 16. 23. 19. 24. 17. 16. 22. 15. 12. 19.
 35. 32. 22. 22. 23. 25. 28. 17. 31. 13. 21. 28. 22. 11. 20. 20. 15. 23.
 14. 16. 20. 23. 20. 25. 21. 30. 40. 13. 12. 34. 33. 23. 50. 27. 23. 22.
 23. 45. 23. 22. 50. 35. 18. 28. 20. 25. 22. 23. 25. 28. 12. 19. 23. 15.
 18. 30. 17. 36.  8. 19. 19. 15. 19. 20. 15. 19. 24. 19. 16. 23. 23. 28.
 19. 23. 15. 21. 34. 18. 42. 25. 15. 18. 32. 35. 26

5.040327404969242