In [359]:
import numpy as np
import pandas as pd
import math


In [3]:
arr_data = np.array([['-', 'short', 'blond', 'brown'],
                 ['-', 'tall', 'dark', 'brown'],
                 ['+', 'tall', 'blond', 'blue'],
                 ['-', 'tall', 'dark', 'blue'],
                 ['-', 'short', 'dark', 'blue'],
                 ['+', 'tall', 'red', 'blue'],
                 ['-', 'tall', 'blond', 'brown'],
                 ['+', 'short', 'blond', 'blue']])


data = pd.DataFrame(arr_data, columns = ['class', 'height', 'hair', 'eyes'])
data

Unnamed: 0,class,height,hair,eyes
0,-,short,blond,brown
1,-,tall,dark,brown
2,+,tall,blond,blue
3,-,tall,dark,blue
4,-,short,dark,blue
5,+,tall,red,blue
6,-,tall,blond,brown
7,+,short,blond,blue


In [368]:
class TreeNode:
    def __init__(self, instance):
        self.instances = instance
        self.isleaf = True
        self.class_labels = instance.iloc[0, 0]
        self.children = {}



class IncrementalDTree:
    def __init__(self):
        self.root = None


    def predict(self, instance, current_node):
        if current_node.isleaf == True:
            return current_node

        else:
            split_attribute_val = instance[current_node.instances]
            try:
                return self.predict(instance.drop(split_attribute_val, axis = 1), current_node.children[split_attribute_val])
            except:
                return 'bad'

    def entropy(self, columns, data, target_attributes):
        frequency = {}
        entropy_data = 0.0
        i = 0
        for entry in columns:
            if (target_attributes == entry):
                break
            i = i + 1
        i = i - 1
        for entry in data:
            if (entry[i] in frequency.keys()):
                frequency[entry[i]] += 1.0
            else:
                frequency[entry[i]]  = 1.0
        for frequency in frequency.values():
            entropy_data += (-frequency/len(data)) * math.log(frequency/len(data), 2)
        return entropy_data

    def informationGain(self, columns, data, attr, target_attributes):
        frequency_of_desired_var = {}
        subset_entropy = 0.0
        i = columns.index(attr)

        for entry in data:
            if (entry[i] in frequency_of_desired_var.keys()):
                frequency_of_desired_var[entry[i]] += 1.0
            else:
                frequency_of_desired_var[entry[i]]  = 1.0

        for val in frequency_of_desired_var.keys():
            valProb        = frequency_of_desired_var[val] / sum(frequency_of_desired_var.values())
            dataSubset     = [entry for entry in data if entry[i] == val]
            subset_entropy += valProb * self.entropy(columns, dataSubset, target_attributes)

        return (self.entropy(columns, data, target_attributes) - subset_entropy)

                    

    def find_split(self, node):
        columns = list(node.columns)
        best = columns[0]
        # And maximum information gain to be 0
        maximim_gain = 0;

        for attr in columns:
            # For each columnn find out the information gain
            new_info_gain = self.informationGain(columns, data, attr, columns[0])
            # If the new information gain happens to be more than the current maximum
            # Then update the current max info_gain with the new gain
            if new_info_gain>maximim_gain:
                maximim_gain = new_info_gain
                best = attr


        return best

    def add_node(self, instance, current_node):
        if current_node.isleaf == True:
            pass

        else:
            split_attribute_val = instance[current_node.instances].iloc[0] 
            try:
                self.add_node(instance.drop(split_attribute_val, axis = 1), current_node.children[split_attribute_val])
            except:
                new_node = TreeNode(instance)
                print(split_attribute_val)
                current_node.children[split_attribute_val] = new_node



    def split(self, node):
        split_feat = self.find_split(node.instances)
        print(split_feat)
        # split_feat = node.instances.columns[split_feat]
        if split_feat == None:
            return node

        else:
            print(split_feat, '\n', node.instances)
            for val in node.instances[split_feat]:
                temp_instances = node.instances.drop(split_feat, axis = 1)
                new_node = TreeNode(temp_instances[node.instances[split_feat] == val])
                node.children[val] = new_node
            node.instances = split_feat
            node.isleaf = False
            print(node.instances, node.children)
            return node

    def fit_instance(self, instance, label):
        if self.root == None:
            new_node = TreeNode(instance)
            self.root = new_node

        elif self.root != None:
            pred_node = self.predict(instance, self.root)
            if pred_node != 'bad':
                pred = pred_node.class_labels
                if pred == label:
                    # Could add the the instances to the node but skipping for now
                    pass
                else:
                    pred_node.instances = pd.concat([pred_node.instances, instance[pred_node.instances.columns]])
                    pred_node = self.split(pred_node)
                    print(pred_node.instances, pred_node.children)
                    print(self.root.instances, self.root.children)
            else:
                self.add_node(instance, self.root)

    def print_tree(self, current_node):
        if current_node.isleaf == True:
            print(current_node.class_labels, end = '\t')
        elif current_node.isleaf != True:
            print(current_node.instances, end = '\t')
            for node in current_node.children:
                self.print_tree(current_node.children[node])
                print()
        # except Exception as e:
        #     print(e)
        #     print(current_node)
                    


                    

            

In [369]:
tree = IncrementalDTree()

In [370]:
tree.fit_instance(data.iloc[[0]], data.iloc[0, 0])

In [371]:
tree.print_tree(tree.root)

-	

In [372]:
tree.fit_instance(data.iloc[[1]], data.iloc[1, 0])

In [373]:
tree.predict(data.iloc[[1]], tree.root).class_labels

'-'

In [374]:
tree.print_tree(tree.root)

-	

In [375]:
tree.fit_instance(data.iloc[[2]], data.iloc[2, 0])

height


IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

In [325]:
tree.print_tree(tree.root)

hair	+	
-	
+	


In [326]:
tree.fit_instance(data.iloc[[3]], data.iloc[3, 0])

dark


In [327]:
tree.print_tree(tree.root)

hair	+	
-	
+	


In [328]:
tree.root.instances

'hair'

In [329]:
tree.root.children

{'blond': <__main__.TreeNode at 0x11766b460>,
 'dark': <__main__.TreeNode at 0x11766c4f0>,
 'red': <__main__.TreeNode at 0x1174e37f0>}

In [330]:
tree.root.children['blond'].instances

Unnamed: 0,class,height,hair,eyes
7,+,short,blond,blue


In [331]:
tree.root.children['dark'].instances

Unnamed: 0,class,height,hair,eyes
3,-,tall,dark,blue


In [332]:
tree.root.class_labels

'-'

In [333]:
pd.concat([data.iloc[[0]], data.iloc[[-1]]])

Unnamed: 0,class,height,hair,eyes
0,-,short,blond,brown
7,+,short,blond,blue


In [334]:
data.iloc[0].iloc[0]

'-'

In [335]:
temp_instances = data.drop('height', axis = 1)
temp_instances[data['height'] == 'tall']

Unnamed: 0,class,hair,eyes
1,-,dark,brown
2,+,blond,blue
3,-,dark,blue
5,+,red,blue
6,-,blond,brown


In [336]:
tree = IncrementalDTree()

for i in range(data.shape[0]):
    tree.fit_instance(data.iloc[[i]], data.iloc[i, 0])

hair 
   class height   hair   eyes
0     -  short  blond  brown
2     +   tall  blond   blue
hair {'blond': <__main__.TreeNode object at 0x1176693d0>}
hair {'blond': <__main__.TreeNode object at 0x1176693d0>}
hair {'blond': <__main__.TreeNode object at 0x1176693d0>}
dark
dark
red
blond
blond


In [337]:
tree.print_tree(tree.root)

hair	+	
-	
+	


In [338]:
tree.root.instances

'hair'

In [339]:
tree.root.children

{'blond': <__main__.TreeNode at 0x1176691c0>,
 'dark': <__main__.TreeNode at 0x1173d45b0>,
 'red': <__main__.TreeNode at 0x11766cf70>}

In [340]:
tree.root.children['blond'].children

{}

In [341]:
tree.root.children['dark'].children

{}

In [342]:
tree.root.children['red'].children

{}