In [None]:
# Random Forest Boostrape
# Random Forest Feature selection

In [715]:
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.tree import plot_tree
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score

X, y = load_iris(return_X_y=True)

In [716]:
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.33, random_state=42)

In [717]:
def gini_index(*groups):
    gini_value = 0
    total_num = np.sum([group[0].shape[0] for group in groups])

    for group in groups:
        y = group[1]
        group_size = y.shape[0]

        _, count = np.unique(y, return_counts=True)
        p = count/group_size
        weight = group_size/total_num

        gini_value += (1- np.sum(p**2)) * weight
    return gini_value


def split(X, y, split_value):
    mask = X < split_value
    group_1 = (X[mask], y[mask])
    group_2 = (X[mask == 0], y[mask == 0])
    return group_1, group_2

def best_split(X, y):
    spliting_list = []
    for i in range(X.shape[1]):
        split_range = np.unique(X[:,i])
        for split_value in split_range:
            split_group = split(X[:, i], y, split_value)
            if check_legal_split(*split_group, min_samples_leaf=1):
                spliting_list.append([gini_index(*split_group), split_value, i])
                
    if len(spliting_list) ==0:
        return np.NaN, np.NaN, None, None
    
    optimal_split, _,_ = np.argmin(np.array(spliting_list), axis = 0)
    _, optimal_split_value, label = np.array(spliting_list[optimal_split])
    
    bool_mask = X[:, int(label)] < optimal_split_value
    group1 = (X[bool_mask], y[bool_mask])
    group2 = (X[bool_mask == 0 ], y[bool_mask == 0])
    return optimal_split_value, label, group1, group2

def check_legal_split(*groups, min_samples_leaf=1):
    for group in groups:
        if group[0].shape[0] < min_samples_leaf:
            return False
        else:
            return True
        
def pprint_tree(node, file=None, _prefix="", _last=True):
    print(_prefix, "`- " if _last else "|- ", "Depth :{}, Split Feature:{}, Split Value:{}, Label:{}".format(node.depth,node.split_value, node.split_feature, node._label), sep="", file=file)
    _prefix += "   " if _last else "|  "
    
    if node._left_child == None or node._right_child == None:
        return None
    
    pprint_tree(node._left_child, file, _prefix, _last = False)
    pprint_tree(node._right_child, file, _prefix, _last = True)

In [718]:
class Node:
    def __init__(self, depth = 0):
        self.split_value = None
        self.split_feature = None
        self._label = None
        self._left_child = None
        self._right_child = None
        self._leaf = False
        self.depth = depth
    
    def train(self, X, y, max_depth=8, min_samples_split=2, min_samples_leaf=1):
        self.split_value, self.split_feature, group1, group2 = best_split(X, y)
        if self.depth < max_depth and X.shape[0] > min_samples_split:
            self._left_child = Node(self.depth + 1)
            self._right_child = Node(self.depth + 1)
            self._left_child.train(*group1, max_depth, min_samples_split, min_samples_leaf)
            self._right_child.train(*group2, max_depth, min_samples_split, min_samples_leaf)
        else:
            self.sprout(y)
            
    def sprout(self, y):
        self._leaf = True

        labels, count = np.unique(y, return_counts = True)
        self._label = labels[np.argmax(count)]
        
    def predict(self, x):
        if self._leaf == True:
            return self._label
        
        if x[int(self.split_feature)] < self.split_value:
            return self._left_child.predict(x)
        else:
            return self._right_child.predict(x)

class Tree:
    def __init__(self, max_depth=8, min_samples_split=2, min_samples_leaf=1):
        self._root = Node()
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        
    def train(self, X, y):
        self._root.train(X, y,
                        self.max_depth, 
                        self.min_samples_split, 
                        self.min_samples_leaf)
        
        
    def predict(self, X):
        prediction = []
        for i in range(len(X)):
            prediction.append(self._root.predict(X[i]))
        return prediction
    
    def print_tree(self):
        pprint_tree(self._root)
        

# Compare to Sklearn

In [680]:
from sklearn import tree

In [681]:
clf = tree.DecisionTreeClassifier()

In [682]:
clf = clf.fit(X_train, y_train)

In [683]:
skprediction = clf.predict(X_test)

In [707]:
tree = Tree()

In [708]:
tree.train(X_train, y_train)

In [710]:
prediction = tree.predict(X_test)

In [711]:
skprediction == prediction

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True, False,  True,  True,  True])

In [712]:
accuracy_score(y_test, skprediction)

1.0

In [713]:
accuracy_score(y_test, prediction)

0.98

In [720]:
tree.print_tree()

`- Depth :0, Split Feature:3.0, Split Value:2.0, Label:None
   |- Depth :1, Split Feature:4.4, Split Value:0.0, Label:None
   |  |- Depth :2, Split Feature:nan, Split Value:nan, Label:0
   |  `- Depth :2, Split Feature:4.5, Split Value:0.0, Label:None
   |     |- Depth :3, Split Feature:3.0, Split Value:1.0, Label:0
   |     `- Depth :3, Split Feature:4.6, Split Value:0.0, Label:None
   |        |- Depth :4, Split Feature:nan, Split Value:nan, Label:0
   |        `- Depth :4, Split Feature:4.7, Split Value:0.0, Label:None
   |           |- Depth :5, Split Feature:3.2, Split Value:1.0, Label:None
   |           |  |- Depth :6, Split Feature:nan, Split Value:nan, Label:0
   |           |  `- Depth :6, Split Feature:3.4, Split Value:1.0, Label:0
   |           `- Depth :5, Split Feature:4.8, Split Value:0.0, Label:None
   |              |- Depth :6, Split Feature:nan, Split Value:nan, Label:0
   |              `- Depth :6, Split Feature:4.9, Split Value:0.0, Label:None
   |               