In [129]:
import numpy as np
import matplotlib.pyplot as plt
from numpy import argmax
from numpy.random import default_rng

# Step 1: Loading Data #

In [130]:

def split_dataset(data, test_proportion, random_generator=default_rng()):
    """ Split dataset into training and test sets, according to the given 
        test set proportion.

    The rows of data are shuffled; the first N_train shuffled rows form the
    training set and the remaining N_test rows form the test set. Rows are
    kept whole, so any label column stays attached to its instance.
    
    Args:
        data (np.ndarray): The dataset to split, indexed along its first axis
        test_proportion (float): the desired proportion of test examples 
                                 (0.0-1.0)
        random_generator (np.random.Generator): A random generator. Note that
                                 the default generator is created once, when
                                 the function is defined, and is therefore
                                 shared by every call that omits this argument.

    Returns:
        tuple: returns a tuple of (training_data, test_data) 
               - training_data (np.ndarray): Training rows, N_train of them
               - test_data (np.ndarray): Test rows, N_test of them, where
                 N_test = round(N * test_proportion)

    Raises:
        ValueError: if test_proportion is not within 0.0-1.0
    """

    # A proportion outside [0, 1] would give a negative row count and make the
    # slices below silently return the wrong rows, so reject it up front.
    if not 0.0 <= test_proportion <= 1.0:
        raise ValueError(
            "test_proportion must be between 0.0 and 1.0, got {}".format(test_proportion)
        )

    shuffled_indices = random_generator.permutation(len(data))
    n_test = round(len(data) * test_proportion)
    n_train = len(data) - n_test

    training_data = data[shuffled_indices[:n_train]]
    test_data = data[shuffled_indices[n_train:]]

    return (training_data, test_data)


# Load both datasets from paths relative to the current working directory.
# The clean file is parsed as integers and the noisy file as floats.
clean_data = np.loadtxt("wifi_db/clean_dataset.txt", dtype=int)
noisy_data = np.loadtxt("wifi_db/noisy_dataset.txt", dtype=float)

# Seeded generator, so the shuffles drawn from `rg` are repeatable when the
# cells are run in the same order.
seed = 60012
rg = default_rng(seed)

# Hold out 20% of the clean rows as a test set; the rest is the training set.
clean_data_train, clean_data_test = split_dataset(clean_data, test_proportion=0.2,random_generator=rg)


# Step 2: Creating Decision Trees #

In [131]:

class Node:
    """ Class representing a node of a binary decision tree

    Attributes:
        left (Node): Object reference to this node's left child.
        right (Node): Object reference to this node's right child.
        attribute (int): Decision Node: the column index of the attribute that is tested.
        value (float): 
                Decision Node: The split point that a sample's attribute value is compared against
                Leaf Node:     The label of any feature vector, where the decision tree path leads to this leaf node. 
        leaf (Boolean): Specifies whether the node is a leaf node (True) or not (False / None) 

    """

    def __init__(self, left=None, right=None, attribute=None, value=None, leaf=None):
        """ Constructor 
        Args:
            left (Node): Object reference to this node's left child.
            right (Node): Object reference to this node's right child.
            attribute (int): Decision Node: the column index of the attribute that is tested.
            value (float): 
                Decision Node: The split point that a sample's attribute value is compared against
                Leaf Node:     The label of any feature vector, where the decision tree path leads to this leaf node. 
            leaf (Boolean): Specifies whether the node is a leaf node (True) or not (False).
                            Left as None, the node is treated as a decision node.

        """

        self.left = left
        self.right = right
        self.attribute = attribute
        self.value = value
        self.leaf = leaf
    
    # Getters -------------------------------------------------------------------------------
    def get_left(self):
        """ Returns the left child (None if it has not been set) """
        return self.left
    
    def get_right(self):
        """ Returns the right child (None if it has not been set) """
        return self.right
    
    def get_attribute(self):
        """ Returns the attribute index tested at this node """
        return self.attribute
    
    def get_value(self):
        """ Returns the split point (decision node) or the label (leaf node) """
        return self.value
    
    def is_leaf(self):
        """ Returns True if this node is a leaf node, False otherwise.

            The stored flag defaults to None, so it is coerced here in order
            that callers always receive a real Boolean.
        """
        return bool(self.leaf)
    
    #----------------------------------------------------------------------------------------

    # Setters -------------------------------------------------------------------------------
    def set_left(self,left):
        """ Sets the left child """
        self.left = left
    
    def set_right(self,right):
        """ Sets the right child """
        self.right = right
    
    def set_attribute(self,attribute):
        """ Sets the attribute index tested at this node """
        self.attribute = attribute
    
    def set_value(self,value):
        """ Sets the split point (decision node) or the label (leaf node) """
        self.value = value
    
    def set_leaf(self,leaf):
        """ Marks the node as a leaf node (True) or a decision node (False) """
        self.leaf = leaf
    #----------------------------------------------------------------------------------------



class DecisionTree:
    """  Class schema for Decision Tree implementation

     Attributes:
        unique_labels (array): Array of unique labels in the dataset 
        root (Node): The root node of the tree (which contains an object reference to its children) 
        depth (int): Maximum path length from root to a leaf
        
    """

    def __init__(self, dataset):
        """ Constructor 

            Args:
                dataset (numpy.ndarray): The dataset that is to be fit with a decision tree.
                                         The last column holds the labels, every other column is an attribute.
        """

        self.unique_labels = self.get_labels(dataset)
        self.root, self.depth = self.decision_tree_learning(dataset, 0)
        
    
    def get_labels(self, dataset):
        """ Returns the sorted unique labels found in the last column of dataset

            Args:
                dataset (numpy.ndarray): Data instances + data labels (last column)

            Returns:
                labels (numpy.ndarray): 1-D array of the unique labels
        """

        return np.unique(dataset[:,-1:])
    
    

    def predict(self, samples):
        """ Predicts a label for every sample

            Args:
                samples (iterable): The samples to classify. Each sample is indexed by attribute column,
                                    so trailing columns (such as a label column) are ignored.

            Returns:
                predictions (numpy.ndarray): One predicted label per sample
        """
        return np.asarray([self.predict_single(self.root, sample) for sample in samples])

    def predict_single(self, node, sample):
        """ Predicts the label of one sample by walking the tree downwards from node

            Args:
                node (Node): The node to start from (the root, for a full prediction)
                sample: A single feature vector, indexable by attribute column

            Returns:
                label: The value stored in the leaf node that the sample ends up in
        """

        # Walk down until a leaf is reached: go left when the sample's value
        # is <= the split point, right otherwise (same rule as split_dataset).
        while not node.is_leaf():
            if sample[node.get_attribute()] <= node.get_value():
                node = node.get_left()
            else:
                node = node.get_right()

        return node.get_value()


    def decision_tree_learning(self, dataset, depth):
        """ This function fits a binary decision tree to dataset, using recursion
        
        Args: 
            dataset (np.ndarray): Data instances + data labels (last column)
            depth (int): Depth of the node that is about to be created. 0 for the root. 
        
        Returns:
            root (Node): The root node of the (sub)tree (which contains an object reference to its children)
            depth (int): Maximum path length from the overall root to a leaf of this (sub)tree
        """
        # 1. Base Case: All dataset samples have the same label. 
        if self.have_same_label(dataset):
            value = self.shared_label(dataset) # Create leaf node, with value = shared label
            return self.create_leaf_node(value), depth

        # 2. Inductive Case
        split_attribute, split_value = self.FIND_SPLIT(dataset)

        # 3. Base Case: the labels differ but no split can separate the samples,
        #    i.e. identical feature vectors carry different labels. Recursing
        #    would never terminate, so fall back to the most common label.
        if split_attribute is None:
            return self.create_leaf_node(self._majority_label(dataset)), depth

        left_dataset, right_dataset = self.split_dataset(dataset, split_attribute, split_value)

        # recursive step (depth-first fashion)
        l_branch, l_depth = self.decision_tree_learning(left_dataset, depth+1)
        r_branch, r_depth = self.decision_tree_learning(right_dataset, depth+1)

        new_node = Node()    
        new_node.set_value(split_value)
        new_node.set_attribute(split_attribute)
        new_node.set_left(l_branch)
        new_node.set_right(r_branch)

        return new_node, max(l_depth, r_depth)

    
    def shared_label(self, dataset):
        """ Returns the label shared by every sample in dataset 
            
            *Function is called iff have_same_label() returns True

            Args: 
                dataset (np.ndarray): Data instances + data labels (last column)

            Returns:
                label (float): The label shared by all samples in dataset
        """

        label = dataset[0,-1]
        return label


    def _majority_label(self, dataset):
        """ Returns the most frequent label in dataset (the smallest such label on a tie)

            Args: 
                dataset (np.ndarray): Data instances + data labels (last column)

            Returns:
                label (float): The most frequent label
        """

        labels, counts = np.unique(dataset[:, -1], return_counts=True)
        return labels[np.argmax(counts)]


    def have_same_label(self, dataset):
        """ Checks if all samples in dataset share the same label.

            Args: 
                dataset (np.ndarray): Data instances + data labels (last column)

            Returns:
                shared (Boolean): True  = all samples in dataset share the same label
                                  False = variety of labels **(meaning it can be split further)

        """

        labels = dataset[:, -1:]
        shared = np.all(labels == labels[0])
        return shared


    def create_leaf_node(self, value):
        """ Creates a leaf node with `value` set as the label

            Args: 
                value (float): The label that is to be assigned to the node's `value` attribute 

            Returns:
                node (Node): A leaf node that is to be appended to the node path
        """
        
        node = Node()
        node.set_leaf(True)
        node.set_value(value)
        return node


    def FIND_SPLIT(self, dataset):
        """ Chooses the attribute and the value that results in the highest information gain

            Every attribute is scanned in ascending order and the midpoint of each pair of
            neighbouring values is tried as a split point. Only splits that leave samples on
            both sides are considered, and the first one with the highest gain is kept.

            Args: 
                dataset (np.ndarray): Data instances + data labels (last column)

            Returns:
                Tuple:
                    attribute (int): The index of the optimal attribute to be split at
                    value: (float) :  The decision boundary of the attribute, which will split the incoming data
                (None, None) is returned when no split point separates the samples.
        """

        best_gain = -np.inf
        best_attribute = None
        best_value = None

        # The entropy of the unsplit dataset is the same for every candidate.
        dataset_entropy = self.entropy(dataset)
        n_rows = dataset.shape[0]

        # -1 so we don't loop over labels
        for col in range(dataset.shape[1] - 1): 
            sorted_values = np.sort(dataset[:, col]) # ascending order
            previous_split = None

            for row in range(n_rows - 1): 
                split_value = (sorted_values[row] + sorted_values[row + 1]) / 2

                # A repeated split point gives the same partition and therefore
                # the same gain, which could never replace the current best.
                if previous_split is not None and split_value == previous_split:
                    continue
                previous_split = split_value

                left_branch, right_branch = self.split_dataset(dataset, col, split_value)

                # A split with an empty side makes no progress; choosing it
                # would make decision_tree_learning recurse on the same data.
                if len(left_branch) == 0 or len(right_branch) == 0:
                    continue

                info_gain = dataset_entropy - self.remainder(left_branch, right_branch)
                if info_gain > best_gain:
                    best_gain = info_gain
                    best_attribute = col
                    best_value = split_value

        return (best_attribute, best_value)



    def entropy(self,dataset):
        """ Returns the entropy of a given dataset

            Args: 
                dataset (np.ndarray): Data instances + data labels (last column)

            Returns:
                entropy (float): The calculated entropy of the dataset (0 for an empty dataset)
        """
        total = 0
        for label in self.unique_labels:
            proportion = self.p_label(dataset, label)
            # A label that does not occur contributes nothing (and log2(0) is undefined)
            if proportion != 0:
                total += proportion * np.log2(proportion)
        return -total

    def p_label(self, dataset, label):
        """ Returns the proportion of dataset, that contains label values equal to label parameter

            Labels are compared after conversion to int.

            Args: 
                dataset (np.ndarray): Data instances + data labels (last column)
                label (int): The label we are evaluating the proportion of

            Returns:
                proportion (float): Fraction of the samples carrying label (0 for an empty dataset)

        """
        if len(dataset) == 0:
            return 0

        matches = dataset[:, -1].astype(int) == int(label)
        return np.count_nonzero(matches) / len(dataset)

    def information_gain(self, dataset, left, right):
        """ Returns the information gain from a given split of the dataset

            Args: 
                dataset (np.ndarray): Data instances + data labels (last column)
                left (np.ndarray): The left split of dataset
                right (np.ndarray): The right split of dataset

            Returns:
                information gain (float): The calculated information gain

        """

        return self.entropy(dataset) - self.remainder(left,right)

    def remainder(self, left, right):
        """ Returns the entropy remaining after a given split. The information gain function subtracts this value 
            from the overall entropy, in order to calculate the information gained. 

            Args: 
                left (np.ndarray): The left split of the dataset
                right (np.ndarray): The right split of the dataset

            Returns:
                remainder (float): The entropies of both splits, weighted by their share of the samples
                
        """
        
        # components
        size_left = left.shape[0]
        size_right = right.shape[0]
        size_total = size_left + size_right
        entropy_left = self.entropy(left)
        entropy_right = self.entropy(right)
        return ((size_left / size_total) * entropy_left) + ((size_right / size_total) * entropy_right)

    def split_dataset(self, dataset, attribute, split_value):
        """ Splits the dataset into left + right datasets, from a given split value on a specific attribute.

            Args: 
                dataset (np.ndarray): Data instances + data labels (last column)
                attribute (int): The column index in dataset, of the specific attribute that will be split over.
                split_value (float): The value that each row of data will compared with. 
                                     When the value of the row indexed at attribute <= split value, that row is moved into the left branch.
                                     When the value of the row indexed at attribute > split value, that row is moved into the right branch
            Returns:
                left (np.ndarray): The left split of the dataset
                right (np.ndarray): The right split of the dataset
                
        """
        # implementational design to go left when equal to split_value
        column = dataset[:, attribute]
        left = dataset[column <= split_value]
        right = dataset[column > split_value]
        return left, right




In [132]:
# Fit a decision tree to the training split of the clean dataset.
decision_tree = DecisionTree(clean_data_train)

# Step 3: Evaluation

In [136]:

def k_fold_split(k_folds, n_instances, random_generator=default_rng()):
    """ Randomly partition the indices 0..n_instances-1 into k_folds mutually 
        exclusive groups.
    
    Args:
        k_folds (int): Number of groups to produce
        n_instances (int): Number of instances to split
        random_generator (np.random.Generator): A random generator

    Returns:
        list: a list (length k_folds). Each element in the list is a numpy 
            array giving the indices of the instances in that group. The 
            groups are of almost equal size.
    """

    # shuffle the indices, then cut the shuffled sequence into k_folds pieces
    return np.array_split(random_generator.permutation(n_instances), k_folds)


def train_test_k_fold(k_folds, n_instances, random_generator=default_rng()):
    """ Build the train/test index pairs for k-fold cross-validation.
    
    Args:
        k_folds (int): Number of folds
        n_instances (int): Total number of instances
        random_generator (np.random.Generator): A random generator

    Returns:
        list: a list of length k_folds. Each element is a two-element list 
            [train_indices, test_indices], both numpy arrays. Each group 
            produced by k_fold_split serves as the test indices exactly once, 
            with all remaining groups joined to form the train indices.
    """

    partitions = k_fold_split(k_folds, n_instances, random_generator)

    # partitions is a list, so `+` joins the groups before and after the
    # held-out one; hstack then flattens them into a single index array
    return [
        [np.hstack(partitions[:held_out] + partitions[held_out + 1:]), partitions[held_out]]
        for held_out in range(k_folds)
    ]


def confusion_matrix(y_gold, y_prediction, class_labels=None):
    """ Compute the confusion matrix.
        
    Args:
        y_gold (np.ndarray): the correct ground truth/gold standard labels
        y_prediction (np.ndarray): the predicted labels
        class_labels (np.ndarray): a list of unique class labels. 
                               Defaults to the union of y_gold and y_prediction
                               (also used when an empty sequence is given).

    Returns:
        np.array : shape (C, C), where C is the number of classes. 
                   Rows are ground truth per class, columns are predictions
    """

    y_gold = np.asarray(y_gold)
    y_prediction = np.asarray(y_prediction)

    # if no class_labels are given, we obtain the set of unique class labels from
    # the union of the ground truth annotation and the prediction.
    # `not class_labels` cannot be used here: the truth value of a numpy array
    # with more than one element is ambiguous and raises a ValueError.
    if class_labels is None or len(class_labels) == 0:
        class_labels = np.unique(np.concatenate((y_gold, y_prediction)))

    # builtin int: the np.int alias no longer exists in current NumPy releases
    confusion = np.zeros((len(class_labels), len(class_labels)), dtype=int)

    # for each correct class (row), 
    # count how many instances are predicted for each class (columns)
    for (i, gold_label) in enumerate(class_labels):
        # predictions made for the instances whose ground truth is gold_label
        predictions = y_prediction[y_gold == gold_label]

        for (j, predicted_label) in enumerate(class_labels):
            confusion[i, j] = np.count_nonzero(predictions == predicted_label)

    return confusion


In [138]:
# temporary cell
def accuracy(y_actual, y_predicted):
    """ Compute the fraction of predictions that match the actual labels.

    Args:
        y_actual (np.ndarray): the correct labels
        y_predicted (np.ndarray): the predicted labels

    Returns:
        float: number of matching positions divided by the number of 
               predictions; 0.0 when both inputs are empty. 
               None is returned when the two inputs differ in length.
    """
    if len(y_actual) != len(y_predicted):
        return None

    # nothing to score: avoid dividing by zero
    if len(y_predicted) == 0:
        return 0.0

    matches = np.asarray(y_predicted) == np.asarray(y_actual)
    return np.count_nonzero(matches) / len(y_predicted)

In [140]:
# 10-fold cross-validation on the clean dataset: one accuracy per fold.
k_folds = 10
accuracies = np.zeros((k_folds, ))

for i, (train_indices, test_indices) in enumerate(train_test_k_fold(k_folds, len(clean_data), rg)):

    # Fit a fresh decision tree on this fold's training rows, then score it on
    # the held-out rows. The test rows are passed with their label column still
    # attached; the true labels are read from that last column for scoring.
    decision_tree = DecisionTree(clean_data[train_indices])
    predictions = decision_tree.predict(clean_data[test_indices])
    acc = accuracy(clean_data[test_indices][:,-1], predictions)
    accuracies[i] = acc

In [141]:
# Show the accuracy obtained on each fold.
print(accuracies)

[0.96  0.965 0.98  0.965 0.965 0.98  0.975 0.97  0.985 0.99 ]


In [106]:
import numpy as np

# Scratch check of boolean-mask indexing.
arr = np.array([[1, 2, 3, 4, 5], [9, 2, 3, 4, 5], [7, 2, 3, 4, 5]])
# Keep the rows whose first element is below 9.
mask = [row[0] < 9 for row in arr]
filtered = arr[mask]
# Modify the original after filtering: the printed output shows `filtered`
# unchanged, i.e. the masked selection is a copy rather than a view of `arr`.
arr[0][1] = 10
print(filtered)
print(arr)

[[1 2 3 4 5]
 [7 2 3 4 5]]
[[ 1 10  3  4  5]
 [ 9  2  3  4  5]
 [ 7  2  3  4  5]]
