In [257]:
import math
import numpy as np

from pprint import pprint
from collections import defaultdict, Counter

In [258]:
def entropy(data):
    n, _ = data.shape
    
    c = Counter(data[:, -1])
    ent = lambda x: -(x/n * math.log(x/n, 2))
    
    return sum([ent(x) for x in c.values()])

In [259]:
data = [[1, 1, 'yes'],
        [1, 1, 'yes'],
        [1, 0, 'no'],
        [0, 1, 'no'],
        [0, 1, 'no']]
data = np.array(data)

In [260]:
labels = ['no surfacing', 'no flippers']

In [261]:
data_copy = data.copy()
data_copy[0][-1] = 'maybe'
entropy(data)

0.9709505944546686

In [262]:
def split_column(data, col, value):
    """
    Splits the data at the given column with the given value.
    
    Paramters
    ---------
    data: [n_samples, n_features:labels]
        A data with a number of feature colums and the last column is the label.
    col: int
        The integer column to split the data.
    value: int
        The value to match when splitting the column.
        
    Returns
    -------
    result: [n_samples, n_features-1:labels]
        The original dataset with the column removed at where it was split.
    """
    _, cols = data.shape
    columns = [i for i in range(cols) if i != col]
    return data[data[:, col] == value][:, columns]

In [263]:
split_column(data, 0, '1')

array([['1', 'yes'],
       ['1', 'yes'],
       ['0', 'no']], dtype='<U21')

In [264]:
split_column(data, 0, '0')

array([['1', 'no'],
       ['1', 'no']], dtype='<U21')

In [265]:
def select_best_feature(data):
    n, cols = data.shape
    
    parent_entropy = entropy(data)
    best_gain, best_feat = -float('inf'), -1
    
    for i in range(cols - 1): # Exclude the last label column.
        feat_value = np.unique(data[:, i])
        child_entropy = 0
        
        for val in feat_value:
            subset = split_column(data, i, val)
            weight = len(subset) / n
            child_entropy += entropy(subset) * weight
        
        info_gain = parent_entropy - child_entropy
        if info_gain > best_gain:
            best_gain = info_gain
            best_feat = i

    return best_feat

In [266]:
select_best_feature(data)

0

In [267]:
def majority(classes):
    return Counter(classes).most_common(1)[0][0]

In [284]:
def create_tree(data, labels):
    classes = data[:, -1]
    
    if len(set(classes)) == len(classes): # All classes are equal.
        return classes[0]
    
    n, features = data.shape
    if features == 1: # Single column left.
        return majority(classes)

    best_feat = select_best_feature(data)
    best_class = labels[best_feat]
    
    tree = {best_class: {}}
    del(labels[best_feat])
    
    values = np.unique(data[:, best_feat])
    for val in values:
        subclass = labels[:]
        tree[best_class][val] = create_tree(split_column(data, best_feat, val), 
                                            subclass)
    return tree

In [285]:
pprint(create_tree(data, labels[:]))

{'no surfacing': {'0': {'no flippers': {'1': 'no'}},
                  '1': {'no flippers': {'0': 'no', '1': 'yes'}}}}


In [286]:
np.unique([1,1,2,3,2,3,3,3,3,3], return_counts=True)

(array([1, 2, 3]), array([2, 2, 6]))