In [1]:
from __future__ import annotations

from typing import List, Tuple

from collections import deque

import numpy as np
import pandas as pd

In [2]:
class Node(object):
    """A single vertex of a binary decision tree.

    An internal node keeps the feature it tests in ``f`` and the threshold in
    ``value``. A leaf leaves ``f`` as ``None`` and keeps the predicted label in
    ``value``.
    """

    def __init__(self, feature: str = None, value: object = None, left: Node = None, right: Node = None) -> None:
        # The constructor argument is named ``feature`` but is stored as ``f``.
        self.f = feature
        self.value = value
        self.left, self.right = left, right

    def __repr__(self) -> str:
        """Render this node and, recursively, its children as a brace-delimited string."""
        template = "{{f: {}, v: {}, left: {}, right: {}}}"
        fields = (self.f, self.value, self.left, self.right)
        return template.format(*fields)


class DecisionTree(object):
    """Binary decision tree that classifies a sample by walking from the root to a leaf.

    Args:
        root: Root node of an already-built tree.
        verbose: When true, ``predict`` prints a trace of the path it follows.
    """

    def __init__(self, root: Node, verbose: bool = False) -> None:
        self.root = root
        self.__verbose = verbose

    def __debug(self, value: object = "") -> None:
        """Print ``value`` when verbose mode is on; with no argument, print a blank line."""
        if self.__verbose:
            print(value)

    def predict(self, x: np.ndarray) -> int:
        """Return the value stored in the leaf that ``x`` falls into.

        Args:
            x: One sample, indexable by the feature ids stored in the nodes.

        Returns:
            The ``value`` of the first node reached whose ``f`` is ``None``.
        """
        self.__debug(f"predicting for input: {x}")
        curr = self.root

        # A node with no feature is a leaf; otherwise descend left when the
        # sample's feature reaches the node's threshold, right when it does not.
        while curr.f is not None:
            self.__debug(f'feature: {curr.f}, value: {curr.value}')
            if x[curr.f] >= curr.value:
                curr = curr.left
            else:
                curr = curr.right

        # Blank separator between consecutive traces (previously printed "None").
        self.__debug()
        return curr.value


In [3]:
# Toy training set: ten samples, each described by three 0/1 features.
X_train = np.array(
    [
        [1, 1, 1],
        [0, 0, 1],
        [0, 1, 0],
        [1, 0, 1],
        [1, 1, 1],
        [1, 1, 0],
        [0, 0, 0],
        [1, 1, 0],
        [0, 1, 0],
        [0, 1, 0],
    ]
)

# Binary class label for each row of X_train.
y_train = np.array([1, 1, 0, 0, 1, 1, 0, 1, 0, 0])

In [4]:
# Hand-built reference tree: split on feature 0 at the root, then on feature 1
# in the left subtree and on feature 2 in the right subtree.
tree = DecisionTree(
    Node(
        feature=0,
        value=1,
        left=Node(feature=1, value=1, left=Node(value=1), right=Node(value=0)),
        right=Node(feature=2, value=1, left=Node(value=1), right=Node(value=0)),
    )
)

In [5]:
# Predict every training row; an all-zero difference means a perfect fit.
y_hat = [tree.predict(sample) for sample in X_train]
y_train - y_hat

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [6]:
# Spot-check a single prediction: the first training sample.
tree.predict(X_train[0])

1

In [7]:
def entropy(y: float) -> float:
    """Binary entropy, in bits, of an outcome that occurs with probability ``y``.

    Args:
        y: Fraction of positive samples, expected to lie in ``[0, 1]``.

    Returns:
        ``-y*log2(y) - (1-y)*log2(1-y)``, with the pure cases ``y == 0`` and
        ``y == 1`` defined as ``0.0``.
    """
    # log2(0) is -inf and 0 * -inf is nan, so the pure cases are answered
    # directly -- as a float, to match the other branch and the annotation.
    if y == 0 or y == 1:
        return 0.0
    return -y * np.log2(y) - (1 - y) * np.log2(1 - y)

In [8]:
# Tabulate the entropy curve on an evenly spaced grid of step 0.1 starting at 0.
for p in np.arange(0, 1.1, .1):
    print(entropy(p))

0
0.4689955935892812
0.7219280948873623
0.8812908992306927
0.9709505944546686
1.0
0.9709505944546686
0.8812908992306926
0.7219280948873623
0.4689955935892811
0


In [9]:
def information_gain(left_idx: np.ndarray, right_idx: np.ndarray, y: np.ndarray) -> float:
    """Information gain of splitting one node's samples into a left and a right child.

    The parent is the union of the two children, so the parent probability and
    the child weights are measured against the samples at this node rather
    than against all of ``y``. At the root the two coincide.

    Args:
        left_idx: Indices into ``y`` of the samples sent to the left child.
        right_idx: Indices into ``y`` of the samples sent to the right child.
        y: 0/1 labels of the full data set.

    Returns:
        Parent entropy minus the size-weighted entropy of the children;
        ``0.0`` when both children are empty.
    """
    left = y[left_idx]
    right = y[right_idx]

    n_left, n_right = len(left), len(right)
    n_node = n_left + n_right
    if n_node == 0:
        # Nothing reaches this node, so there is nothing to gain.
        return 0.0

    p_root = (np.sum(left) + np.sum(right)) / n_node

    w_left = n_left / n_node
    w_right = n_right / n_node

    # An empty child has weight 0, so its probability is irrelevant; using 0.0
    # avoids a 0/0 without nudging non-empty children away from exact purity.
    p_left = np.sum(left) / n_left if n_left else 0.0
    p_right = np.sum(right) / n_right if n_right else 0.0

    return inf_gain(p_root, p_left, w_left, p_right, w_right)

def inf_gain(p_root: float, p_left: float, w_left: float, p_right: float, w_right: float) -> float:
    """Entropy of the parent minus the weighted entropy of its two children.

    Args:
        p_root: Positive-class probability at the parent.
        p_left: Positive-class probability in the left child.
        w_left: Weight applied to the left child's entropy.
        p_right: Positive-class probability in the right child.
        w_right: Weight applied to the right child's entropy.
    """
    parent_entropy = entropy(p_root)
    children_entropy = w_left * entropy(p_left) + w_right * entropy(p_right)
    return parent_entropy - children_entropy

In [10]:
# Sanity check: a balanced parent split evenly into children that are 80% and 20% positive.
inf_gain(p_root=.5, p_left=4/5, w_left=.5, p_right=1/5, w_right=.5)

0.2780719051126377

In [11]:
def split_by_feature(X: np.ndarray, idx: np.ndarray, feature: str, value: object) -> Tuple[np.ndarray, np.ndarray]:
    """Partition the sample indices ``idx`` by thresholding one column of ``X``.

    Args:
        X: Sample matrix; ``feature`` selects a column of it.
        idx: Row indices of the samples to partition.
        feature: Column of ``X`` to test.
        value: Threshold the column is compared against.

    Returns:
        ``(left, right)``: the entries of ``idx`` whose feature is ``>= value``,
        followed by the remaining entries.
    """
    goes_left = X[idx, feature] >= value
    return idx[goes_left], idx[~goes_left]


def get_best_feature_idx(X: np.ndarray, y: np.ndarray, idx: np.ndarray, features: List[object]) -> int:
    """Return the position in ``features`` of the split with the highest information gain.

    Args:
        X: Sample matrix.
        y: Labels for the rows of ``X``.
        idx: Row indices of the samples at the node being split.
        features: Candidate splits as ``(feature, value)`` pairs.

    Returns:
        Index into ``features`` (not the feature id itself) of the best candidate.
    """
    gains = [
        information_gain(*split_by_feature(X, idx, feature, value), y)
        for feature, value in features
    ]
    return np.argmax(gains)

In [12]:
# Every column is a candidate split, always thresholded at 1.
features = [(col, 1) for col in range(X_train.shape[1])]

X, y = X_train, y_train

# All samples start at the root.
idx = np.arange(len(X))

# Level 1: choose the root split.
feature_idx = get_best_feature_idx(X, y, idx, features)
print(f'first level feature: {feature_idx}')

left_idx, right_idx = split_by_feature(X, idx, *features[feature_idx])
print(left_idx, right_idx, sep='\n')

# Level 2, left branch.
left_feature_idx = get_best_feature_idx(X, y, left_idx, features)
print(f'second level left feature: {left_feature_idx}')

left_left_idx, left_right_idx = split_by_feature(X, left_idx, *features[left_feature_idx])
print(left_left_idx, left_right_idx, sep='\n')

# Level 2, right branch.
right_feature_idx = get_best_feature_idx(X, y, right_idx, features)
print(f'second level right feature: {right_feature_idx}')

right_left_idx, right_right_idx = split_by_feature(X, right_idx, *features[right_feature_idx])
print(right_left_idx, right_right_idx, sep='\n')

first level feature: 0
[0 3 4 5 7]
[1 2 6 8 9]
second level left feature: 1
[0 4 5 7]
[3]
second level right feature: 2
[1]
[2 6 8 9]


In [16]:
def build_tree(X: np.ndarray, y: np.ndarray, max_height: int) -> DecisionTree:
    """Grow a decision tree breadth-first, taking the highest-gain split at each node.

    Every column of ``X`` is a candidate split with threshold 1. A node becomes
    a leaf when the depth limit is reached, when all of its samples share one
    label, or when the chosen split would send every sample to the same side.

    Args:
        X: Sample matrix, one row per sample and one column per feature.
        y: Non-negative integer class label for each row of ``X``.
        max_height: Maximum number of splits on any root-to-leaf path; ``0``
            yields a single-leaf tree.

    Returns:
        A ``DecisionTree`` whose leaves hold the dominant class of the samples
        that reach them.

    Raises:
        ValueError: If ``X`` contains no samples.
    """
    if X.shape[0] == 0:
        raise ValueError("cannot build a tree from an empty training set")

    features = [(feature, 1) for feature in range(X.shape[1])]

    root = Node()
    # Each entry: (indices of the samples reaching the node, the node to fill in, its depth).
    queue = deque([(np.arange(X.shape[0]), root, 0)])

    while queue:
        idx, node, height = queue.popleft()
        labels = y[idx]  # never empty: the root is checked above, children below

        split = None
        # A pure node cannot be improved, so only mixed nodes above the depth
        # limit are considered for splitting.
        if height < max_height and features and np.any(labels != labels[0]):
            f, fval = features[get_best_feature_idx(X, y, idx, features)]
            left_idx, right_idx = split_by_feature(X, idx, f, fval)
            # A one-sided split separates nothing and would leave an empty
            # child for which no dominant class exists.
            if len(left_idx) > 0 and len(right_idx) > 0:
                split = (f, fval, left_idx, right_idx)

        if split is None:
            # Leaf: predict the dominant class among the samples that reached it.
            node.value = np.argmax(np.bincount(labels))
            continue

        # Internal node: record the test and hand each child its share of samples.
        f, fval, left_idx, right_idx = split
        node.f = f
        node.value = fval
        node.left = Node()
        node.right = Node()

        queue.append((left_idx, node.left, height + 1))
        queue.append((right_idx, node.right, height + 1))

    return DecisionTree(root)

In [17]:
# Fit a tree with at most two levels of splits on the toy data.
tree = build_tree(X_train, y_train, 2)

# Compare predictions with the labels; an all-zero difference means every
# training sample is classified correctly.
y_hat = [tree.predict(sample) for sample in X_train]
y_train - y_hat

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int64)