In [None]:
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Load the clean and noisy datasets from text files into NumPy arrays.
# The paths are relative, so the notebook must be run from the directory
# that contains the "datasets/" folder.
clean_dataset = np.loadtxt("datasets/clean_dataset.txt")
noisy_dataset = np.loadtxt("datasets/noisy_dataset.txt")

In [None]:
class Node:
    """A single node of a binary decision tree.

    Attributes:
        attribute: Index of the feature column this node tests.
        value: Threshold the feature is compared against.
        left: Left child (or None).
        right: Right child (or None).
        label: Label stored on the node; None unless one is supplied.
    """

    def __init__(self, attribute, value, left, right, label=None):
        self.attribute, self.value = attribute, value
        self.left, self.right = left, right
        self.label = label

    def is_leaf(self):
        """Return True when the node has neither a left nor a right child."""
        return not (self.left or self.right)

In [None]:
def entropy(dataset):
    """Return the Shannon entropy, in bits, of the values in ``dataset``.

    Distinct values are counted with ``np.unique`` (which flattens its input)
    and each count is divided by the length of the first axis, so the function
    is meant to be given a 1-D vector such as a column of labels.
    """
    _, counts = np.unique(dataset, return_counts=True)
    # Relative frequency of every distinct value.
    freqs = counts / dataset.shape[0]
    return -1 * np.sum(freqs * np.log2(freqs))

In [None]:
def remainder(left, right):
    """Return the size-weighted average entropy of the two halves of a split.

    Each half's entropy is weighted by the fraction of the combined samples
    that fall into that half.
    """
    n_left, n_right = left.shape[0], right.shape[0]
    n_all = n_left + n_right
    weighted_left = n_left / n_all * entropy(left)
    weighted_right = n_right / n_all * entropy(right)
    return weighted_left + weighted_right


In [None]:
def gain(total, left, right):
    """Return the information gain of splitting ``total`` into ``left`` and ``right``.

    This is the entropy of the unsplit values minus the weighted entropy that
    remains after the split.
    """
    entropy_before = entropy(total)
    entropy_after = remainder(left, right)
    return entropy_before - entropy_after

In [None]:
def find_split(dataset):
    """Find the binary split of ``dataset`` with the highest information gain.

    Every column except the last is treated as an attribute; the last column
    holds the labels. For each attribute the candidate thresholds are the
    midpoints between consecutive distinct values, and a candidate sends rows
    with ``attribute <= threshold`` to the left and ``attribute > threshold``
    to the right. On ties the first candidate found wins (lowest attribute
    index, then smallest threshold).

    Args:
        dataset: 2-D array of samples, one row per sample, label in the last
            column.

    Returns:
        Tuple ``(attribute_index, threshold, left_rows, right_rows)``.

    Raises:
        ValueError: If no attribute has at least two distinct values (this
            includes an empty dataset), so no split exists.
    """
    n_attrs = dataset.shape[1] - 1
    labels = dataset[:, -1]
    best_gain, attr_max, val_max = -np.inf, None, None

    for i in range(n_attrs):
        column = dataset[:, i]
        # np.unique already returns its result sorted, so no extra sort is needed.
        points = np.unique(column)
        # Midpoints between neighbouring distinct values.
        splits = np.sum(np.vstack((points[:-1], points[1:])), axis=0) / 2
        for val in splits:
            # Only the labels are needed to score a candidate, so avoid
            # copying whole rows for every threshold.
            left_labels = labels[column <= val]
            right_labels = labels[column > val]

            h = gain(labels, left_labels, right_labels)

            if h > best_gain:
                best_gain, attr_max, val_max = h, i, val

    if attr_max is None:
        # Without this guard the indexing below would fail with an obscure
        # TypeError because attr_max and val_max are still None.
        raise ValueError(
            "find_split: no attribute has two distinct values, dataset cannot be split"
        )

    left = dataset[dataset[:, attr_max] <= val_max]
    right = dataset[dataset[:, attr_max] > val_max]
    return (attr_max, val_max, left, right)


In [None]:
# Sanity check: compute the best split of the full clean dataset, i.e. the
# split that decision_tree_learning will place at the root of the tree.
attr_max, val_max, left, right = find_split(clean_dataset)
# Uncomment to inspect the chosen attribute, threshold and partition sizes:
# print(attr_max, val_max, left.shape, right.shape)

In [None]:
def decision_tree_learning(training_set, depth=0):
    """Recursively build a decision tree from ``training_set``.

    Args:
        training_set: 2-D array of samples, one row per sample, label in the
            last column.
        depth: Depth of the node being built (0 for the root).

    Returns:
        Tuple ``(node, max_depth)``: the root of the (sub)tree and the depth
        of its deepest leaf.

    Raises:
        ValueError: If ``training_set`` has no rows.
    """
    if training_set.shape[0] == 0:
        raise ValueError("cannot learn a decision tree from an empty training set")

    labels, counts = np.unique(training_set[:, -1], return_counts=True)
    features = training_set[:, :-1]

    # Stop when the samples are pure, or when every sample has the same
    # feature vector. In the second case (conflicting labels on duplicate
    # points, e.g. label noise) no threshold can separate the samples, so
    # emit a leaf with the most frequent label. np.argmax returns the first
    # maximum, so ties resolve to the smallest label value.
    if labels.shape[0] == 1 or np.all(features == features[0]):
        return Node(0, 0, None, None, labels[np.argmax(counts)]), depth

    attr_max, val_max, left, right = find_split(training_set)

    l_branch, l_depth = decision_tree_learning(left, depth + 1)
    r_branch, r_depth = decision_tree_learning(right, depth + 1)

    node = Node(attr_max, val_max, l_branch, r_branch)

    return node, max(l_depth, r_depth)


In [None]:
# Train on the full clean dataset; the call returns the root node of the tree
# and the depth of its deepest leaf.
dec_tree, total_depth = decision_tree_learning(clean_dataset)

In [251]:
def _draw_tree(root, depth=1):
    """Render the subtree rooted at ``root`` as indented multi-line text.

    Each nesting level is indented by one ``"|  "`` prefix. A leaf (a node
    with no children) is drawn as ``>> class <label>``. An internal node is
    drawn as a ``*  feature <attribute> <= <value>`` line followed by its left
    subtree, then a ``*  feature <attribute> > <value>`` line followed by its
    right subtree.

    Args:
        root: Node to render; must expose ``attribute``, ``value``, ``left``,
            ``right`` and ``label``.
        depth: 1-based nesting level of ``root``; controls the indentation.

    Returns:
        The rendering as a single string without a trailing newline.
    """
    indent = "|  " * (depth - 1)

    # Identity comparison is the idiomatic (and __eq__-proof) way to test for None.
    if root.left is None and root.right is None:
        return indent + ">> " + "class " + str(root.label)

    # Both branch headers share everything up to the comparison operator.
    header = indent + "*  " + "feature " + str(root.attribute)
    threshold = str(root.value)
    return "\n".join((
        header + " <= " + threshold,
        _draw_tree(root.left, depth + 1),
        header + " > " + threshold,
        _draw_tree(root.right, depth + 1),
    ))


In [252]:
def draw_tree(tree):
    """Return the text rendering of ``tree``, starting at the top nesting level.

    Public entry point that delegates to ``_draw_tree`` with its default depth.
    """
    rendering = _draw_tree(tree)
    return rendering

In [253]:
# print() is used (rather than a bare expression) so the newlines embedded in
# the rendering are displayed as a multi-line tree diagram.
print(draw_tree(dec_tree))

*  feature 0 <= -54.5
|  *  feature 4 <= -59.5
|  |  *  feature 3 <= -55.5
|  |  |  *  feature 2 <= -55.5
|  |  |  |  >> class 1.0
|  |  |  *  feature 2 > -55.5
|  |  |  |  *  feature 6 <= -85.5
|  |  |  |  |  *  feature 4 <= -62.5
|  |  |  |  |  |  *  feature 5 <= -85.5
|  |  |  |  |  |  |  *  feature 0 <= -58.0
|  |  |  |  |  |  |  |  >> class 4.0
|  |  |  |  |  |  |  *  feature 0 > -58.0
|  |  |  |  |  |  |  |  >> class 3.0
|  |  |  |  |  |  *  feature 5 > -85.5
|  |  |  |  |  |  |  >> class 1.0
|  |  |  |  |  *  feature 4 > -62.5
|  |  |  |  |  |  >> class 4.0
|  |  |  |  *  feature 6 > -85.5
|  |  |  |  |  >> class 1.0
|  |  *  feature 3 > -55.5
|  |  |  *  feature 1 <= -50.5
|  |  |  |  >> class 3.0
|  |  |  *  feature 1 > -50.5
|  |  |  |  *  feature 0 <= -59.0
|  |  |  |  |  >> class 3.0
|  |  |  |  *  feature 0 > -59.0
|  |  |  |  |  >> class 1.0
|  *  feature 4 > -59.5
|  |  *  feature 4 <= -56.5
|  |  |  *  feature 3 <= -58.5
|  |  |  |  >> class 4.0
|  |  |  *  feature 3 > 