In [2]:
import numpy as np
import matplotlib.pyplot as plt

# Step 1: Loading Data #

In [28]:

def split_data(data):
    """ This function splits a dataset into instances (x) and labels (y)
    
    Args: 
        data (np.ndarray): Data instances (first 7 columns) + data labels (8th column), with shape (N,8)
    
    Returns:
        tuple: returns tuple of (x, y) each being a numpy array
            - x : the data instances with shape (N,7)
            - y : the corresponsing labels with shape (N, ) 
        
    """
    x = data[:,:7]
    y = data[:,7:].reshape(data.shape[0], )
    return (x, y)

# load dataset from local machine 
clean_data = np.loadtxt("wifi_db/clean_dataset.txt", dtype=int)
noisy_data = np.loadtxt("wifi_db/noisy_dataset.txt", dtype=float)
a = noisy_data[:,-1:].astype(int)
print(noisy_data)



#clean_x, clean_y  = split_data(clean_data)
#noisy_x, noisy_y = split_data(noisy_data)

# verify the data and labels have the correct shape
#print(clean_x.shape, clean_y.shape)
#print(noisy_x.shape, noisy_y.shape)

#a = np.unique(clean_data[:,-1:])
#print(a)


[[-59. -53. -51. ... -79. -87.   4.]
 [-66. -53. -59. ... -81. -79.   1.]
 [-41. -57. -63. ... -66. -65.   2.]
 ...
 [-57. -54. -56. ... -79. -82.   1.]
 [-56. -52. -50. ... -85. -88.   3.]
 [-46. -54. -47. ... -80. -73.   3.]]


# Step 2: Creating Decision Trees #

In [52]:

from numpy import argmax


class Node:
    """ Class representing a node data structure

    Attributes:
        left (Node): Object reference to this node's left child.
        right (Node): Object reference to this node's right child.
        attribute (Node): The attribute index to be tested on (0-7).
        value (float): 
                Decision Node: The value at the indexed attribute, to be compared with the split point
                Leaf Node:     The label of any feature vector, where the decision tree path leads to this leaf node. 
        leaf (Boolean): Specifies whether the node is a leaf node (True) or not (False) 

    """

    def __init__(self, left=None, right=None, attribute=None, value=None, leaf=None):
        """ Constructor 
        Args:
            left (Node): Object reference to this node's left child.
            right (Node): Object reference to this node's right child.
            attribute (Node): The attribute index to be tested on (0-7).
            value (float): 
                Decision Node: The value at the indexed attribute, to be compared with the split point
                Leaf Node:     The label of any feature vector, where the decision tree path leads to this leaf node. 
            leaf (Boolean): Specifies whether the node is a leaf node (True) or not (False)

        """

        self.left = left
        self.right = right
        self.attribute = attribute
        self.value = value
        self.leaf = leaf
    
    # Getters -------------------------------------------------------------------------------
    #@property
    def get_left(self):
        return self.left
    
    #@property
    def get_right(self):
        return self.right
    
    #@property
    def get_attribute(self):
        return self.attribute
    
    #@property
    def get_value(self):
        return self.value
    
    #@property
    def get_leaf(self):
        return self.leaf
    #----------------------------------------------------------------------------------------

    # Setters -------------------------------------------------------------------------------
    #@property
    def set_left(self,left):
        self.left = left
    
    #@property
    def set_right(self,right):
        self.right = right
    
    #@property
    def set_attribute(self,attribute):
        self.attribute = attribute
    
    #@property
    def set_value(self,value):
        self.value = value
    
    #@property
    def set_leaf(self,leaf):
        self.leaf = leaf
    #----------------------------------------------------------------------------------------

class DecisionTree:
    """  Class schema for Decision Tree implementation

     Attributes:
        unique_labels (array): Array of unique labels in the datatset 
        root (Node): The root node of the tree (which contains an object reference to its children) 
        depth (int): Maximum path length from root to a leaf
        
    """

    def __init__(self, dataset):
        """ Constructor 

            Args:
                dataset (numpy.ndarray): The datatset that is to be fit with a decision tree, with shape (N,8)

        """

        self.unique_labels = self.get_labels(dataset)
        self.root, self.depth = self.decision_tree_learning(dataset, 0)
        
    
    # @property
    def get_labels(self, dataset):
        """ Creates a list of the unique labels, and assigns to the unique_labels attribute

            Args:
                dataset (numpy.ndarray): Data instances (first 7 columns) + data labels (8th column), with shape (N,8)

        """

        return np.unique(dataset[:,-1:])
    
        

    # @property
    def decision_tree_learning(self, dataset, depth):
        """ This function fits a binary decision tree to dataset, using recursion
        
        Args: 
            dataset (np.ndarray): Data instances (first 7 columns) + data labels (8th column), with shape (N,8)
            depth (int): Maximum path length from root to a leaf. Always initialised to 0 
        
        Returns:
            root (Node): The root node of the tree (which contains an object reference to its children)
            depth (int): Maximum path length from root to a leaf
        
        """

        # 1. Base Case: All dataset samples have the same label. 
        if self.have_same_label(dataset):
            value = self.shared_label(dataset) # Create leaf node, with value = shared label
            return self.create_leaf_node(value), depth
        
        # Stated assumption that there are no inconsistent data points i.e. identical feature vectors having different classes, thus no need for second base case.


        # 2. Inductive Case
        split_attribute, split_value = self.FIND_SPLIT(dataset)
        
        # instantiate node
        new_node = Node()    
        new_node.set_value(split_value)
        new_node.set_attribute(split_attribute)

        left_dataset, right_dataset = self.split_dataset(dataset, split_attribute, split_value)

        # recursive step (depth-first fashion)
        l_branch, l_depth = self.decision_tree_learning(left_dataset, depth+1)
        r_branch, r_depth = self.decision_tree_learning(right_dataset, depth+1)

        new_node.set_left(l_branch)
        new_node.set_right(r_branch)
        depth = max(l_depth, r_depth)

        return new_node, depth

    
    def shared_label(self, dataset):
        """ Returns the label shared by every sample in dataset 
            
            *Function is called iff have_same_label() returns True

            Args: 
                dataset (np.ndarray): Data instances (first 7 columns) + data labels (8th column), with shape (N,8)

            Returns:
                label (float): The label shared by all samples in dataset
        """

        label = dataset[0,-1]
        return label


    def have_same_label(self, dataset):
        """ Checks if all samples in dataset share the same label.

            Args: 
                dataset (np.ndarray): Data instances (first 7 columns) + data labels (8th column), with shape (N,8)

            Returns:
                shared (Boolean): True  = all samples in dataset share the same label
                                  False = variety of labels **(meaning it can be split further)

        """

        labels = dataset[:, -1:]
        shared = np.all(labels == labels[0])
        return shared


    def create_leaf_node(self, value):
        """ Creates a leaf node with `value` set as the label

            Args: 
                value (float): The label that is to be assigned to the node's `value` attribute 

            Returns:
                node (Node): A leaf node that is to be appended to the node path
        """
        
        node = Node()
        node.set_leaf(True)
        node.set_value(value)
        return node


    def FIND_SPLIT(self, dataset):
        """ Chooses the attribute and the value that results in the highest information gain

            Args: 
                dataset (np.ndarray): Data instances (first 7 columns) + data labels (8th column), with shape (N,8)

            Returns:
                Tuple:
                    attribute (int): The index of the optimal attribute to be split at
                    value: (float) :  The decision boundary of the attribute, which will split the incoming data
        """

        print(self.unique_labels)

        max_info_gain_per_column = []

        # -1 so we don't loop over labels
        for col in range(dataset.shape[1] - 1): 
            sorted_by_col = dataset[dataset[:, col].argsort()] # ascending order

            ig_values_per_split_value_per_column = []

            # loop over rows of specific attribute (column) being evaluated
            for row in range(sorted_by_col.shape[0] - 1): 
                """  
                we evaluate every possible splitting value at the current attribute, perform the split, to then
                determine the information gain of every possible split. The overall information gain for each 
                attribute is the maximum split value point.

                """

                split_value = (sorted_by_col[row,col] + sorted_by_col[row+1, col]) / 2
                left_branch, right_branch = self.split_dataset(dataset, col, split_value)
                info_gain = self.information_gain(dataset, left_branch, right_branch)
                ig_values_per_split_value_per_column.append(info_gain)
                
            max_info_gain_per_column.append(max(ig_values_per_split_value_per_column))

        # we now have array of 7 items, containing the max information gain values of each attribute
        attribute = np.argmax(max_info_gain_per_column)
        value = np.max(max_info_gain_per_column)
        return (attribute, value)


    def entropy(self,dataset):
        """ Returns the entropy of a given dataset

            Args: 
                dataset (np.ndarray): Data instances (first 7 columns) + data labels (8th column), with shape (N,8)

            Returns:
                entropy (float): The calculated entropy of the dataset 
        """

        return - sum([self.p_label(dataset,label) * np.log2(self.p_label(dataset,label)) for label in self.unique_labels] )

    def p_label(self, dataset, label):
        """ Returns the proportion of dataset, that contains label values equal to label parameter

            Args: 
                dataset (np.ndarray): Data instances (first 7 columns) + data labels (8th column), with shape (N,8)
                label (int): The label we are evaluating the proportion of

            Returns:
                entropy (float): The calculated entropy of the dataset 

        """
        return sum([dataset[:,-1:] == int(label) for row in dataset]) / len(dataset)

    def information_gain(self, dataset, left, right):
        """ Returns the information gain from a given split of the dataset

            Args: 
                dataset (np.ndarray): Data instances (first 7 columns) + data labels (8th column), with shape (N,8)
                left (np.ndarray): The left split of dataset
                right (np.ndarray): The right split of dataset

            Returns:
                information gain (float): The calculated information gain

        """
        return self.entropy(dataset) - self.remainder(left,right)

    def remainder(self, left, right):
        """ Returns the entropy remaining after a given split. The information gain function subtracts this value 
            from the overall entropy, in order to calculate the information gained. 

            Args: 
                left (np.ndarray): The left split of the dataset
                right (np.ndarray): The right split of the dataset

            Returns:
                remainder (float): The calculated entropy remainder
                
        """
        
        # components
        size_left = left.shape[0]
        size_right = right.shape[0]
        entropy_left = self.entropy(left)
        entropy_right = self.entropy(right)

        return ((size_left / size_left + size_right) * entropy_left) + ((size_right / size_left + size_right)* entropy_right)

    def split_dataset(self, dataset, attribute, split_value):
        """ Splits the dataset into left + right datasets, from a given split value on a specific attribute.

            Args: 
                dataset (np.ndarray): Data instances (first 7 columns) + data labels (8th column), with shape (N,8)
                attribute (int): The column index in dataset, of the specific attribute that will be split over.
                split_value (float): The value that each row of data will compared with. 
                                     When the value of the row indexed at attribute <= split value, that row is moved into the left branch.
                                     When the value of the row indexed at attribute > split value, that row is moved into the right branch
            Returns:
                left (np.ndarray): The left split of the dataset
                right (np.ndarray): The right split of the dataset
                
        """
        # implementational design to go left when equal to split_value
        left = dataset[((dataset[:,attribute] <= split_value))] # https://stackoverflow.com/questions/47885848/filter-a-2d-numpy-array
        right = dataset[((dataset[:,attribute] > split_value))] 
        return left, right




In [53]:
a = DecisionTree(clean_data)

[1 2 3 4]


  return - sum([self.p_label(dataset,label) * np.log2(self.p_label(dataset,label)) for label in self.unique_labels] )
  return - sum([self.p_label(dataset,label) * np.log2(self.p_label(dataset,label)) for label in self.unique_labels] )


ValueError: operands could not be broadcast together with shapes (2000,1) (1999,1) 