### Bram Otten
### UvA ID: 10992456
### Group F

In [None]:
import numpy as np
from collections import Counter
import math
import matplotlib.pyplot as plt


def load_data(datafile):
    """Load a comma-separated data file as an array of strings.

    Every field is kept as text (dtype 'str'), so placeholder entries
    such as '?' survive loading and can be counted or filtered later.

    datafile - Path of the file to read.

    Returns the array produced by np.loadtxt.
    """
    # The context manager closes the handle even if parsing fails; the
    # previous version left the opened file for the garbage collector.
    with open(datafile, 'r') as handle:
        return np.loadtxt(handle,
                          dtype='str',
                          delimiter=',')


def count_missing(data):
    """Return how many entries of a 2-D collection equal the marker '?'.

    data - Iterable of rows, each row an iterable of values.
    """
    return sum(1 for record in data for entry in record if entry == '?')


def _load_and_report(banner, path):
    """Load one dataset and print its size, first row and '?' count.

    banner - Line printed before loading.
    path - Location of the data file handed to load_data.

    Returns a (table, missing_count) pair.
    """
    print(banner)
    table = load_data(path)
    print("Size:", len(table), "x", len(table[0]))
    print("First row:", table[0, :])
    missing = count_missing(table)
    print(missing, "missing values.")
    return table, missing


cdata, nmc = _load_and_report("Loading Cleveland data.",
                              'data5/processed.cleveland.data')

print()
hdata, hmc = _load_and_report("Loading Hungarian data.",
                              'data5/processed.hungarian.data')

print()
sdata, smc = _load_and_report("Loading Switzerland data.",
                              'data5/processed.switzerland.data')

print()
vdata, vmc = _load_and_report("Loading VA data.",
                              'data5/processed.va.data')

In [None]:
def remove_missing(data):
    """Return an array holding only the rows of data without a '?' entry.

    data - Iterable of rows that support the `in` operator.
    """
    complete_rows = [record for record in data if not ('?' in record)]
    return np.array(complete_rows)


def unique_vals(data):
    """Count the distinct values found in every column of data.

    data - Sequence of rows; the width is taken from the first row.

    Returns a list with one count per column, or an empty list when
    data holds no rows.
    """
    if len(data) == 0:
        # Nothing to inspect; the previous version crashed on data[0].
        return []
    # One set per column gives constant-time membership checks instead of
    # scanning an ever-growing list for every cell.
    seen = [set() for _ in range(len(data[0]))]
    for row in data:
        for column_values, value in zip(seen, row):
            column_values.add(value)
    return [len(column_values) for column_values in seen]


def count_occurence(datacol):
    """Return a plain dict mapping each value in datacol to its frequency.

    datacol - Iterable of hashable values (e.g. one column of the data).
    """
    return dict(Counter(datacol))


# Stack the four datasets row-wise and keep only the rows without a '?'.
ccombo = remove_missing(np.concatenate((cdata, hdata, sdata, vdata)))
print("Number of complete rows:", len(ccombo))
# One distinct-value count per column of the cleaned rows.
nua = unique_vals(ccombo)
print("Number of unique values per column of those rows:")
print(nua)
# Frequency of each value in the last column (the one being predicted).
occsp = count_occurence(ccombo[:, -1])
print("Occurences last/prediction column:")
print(occsp)

In [None]:
def validation_split(data, ratio=0.7):
    """Randomly divide the rows of data into two parts.

    The input is left untouched: a copy is shuffled in place with
    np.random.shuffle and then cut at int(len(data) * ratio).

    data - 2-D array of rows.
    ratio - Fraction of the rows that goes into the first part.

    Returns a tuple (first_part, second_part).
    """
    shuffled = np.copy(data)
    np.random.shuffle(shuffled)
    cut = int(len(data) * ratio)
    first_part = shuffled[:cut, :]
    second_part = shuffled[cut:, :]
    return (first_part, second_part)


def x_y_split(data):
    """Separate the last column of data from the other columns.

    The last column is turned into booleans: True wherever the entry
    differs from the string '0'.

    data - 2-D array with at least one row.

    Returns (all columns but the last, boolean label array).
    """
    last = len(data[0]) - 1
    labels = np.array([entry != '0' for entry in data[:, last]])
    features = data[:, :last]
    return features, labels


def disc_num_split(data, thres=20):
    """Split the columns of data into a discrete and a numeric array.

    A column counts as discrete when it holds fewer than thres distinct
    values (as reported by unique_vals); every other column is numeric.

    data - 2-D array whose entries can all be parsed as floats.
    thres - Distinct-value count from which a column is treated as numeric.

    Returns (discrete, numeric): the discrete columns converted to int
    (via float, so text such as '1.0' parses) and the numeric columns
    converted to float.
    """
    counts = unique_vals(data)
    discrete_cols = [i for i, count in enumerate(counts) if count < thres]
    numeric_cols = [i for i, count in enumerate(counts) if count >= thres]
    # The builtin float replaces the np.float alias, which NumPy removed
    # and which therefore raises AttributeError on current releases.
    discrete = data[:, discrete_cols].astype(float).astype(int)
    numeric = data[:, numeric_cols].astype(float)
    return discrete, numeric


# Random split of the cleaned rows into a training and a validation part.
(tset, vset) = validation_split(ccombo)
# Separate the feature columns from the boolean outcome of each part.
(tx, ty) = x_y_split(tset)
(vx, vy) = x_y_split(vset)
# Divide the training features into discrete and numeric columns.
(txd, txn) = disc_num_split(tx)
print(len(tset))
print("Discrete variables:")
print(txd)
print()
print("Continuous variables")
print(txn)
print()
print("Outcome vector:")
print(ty)

In [None]:
def ratio(labels):
    """Return the share of entries in labels that compare equal to True.

    The integers 0 and 1 are returned when no entry, respectively every
    entry, matches; this also makes an empty input yield 0.

    labels - Sized iterable of label values.
    """
    total = len(labels)
    positives = sum(label == True for label in labels)
    if positives == 0:
        return 0
    if positives == total:
        return 1
    return positives / total


def entropy_sub(p):
    """Return the base-2 entropy of a two-outcome distribution.

    p - Probability of one outcome; the other has probability 1 - p.
        The boundary values 0 and 1 give 0 and skip the logarithm.
    """
    if p in (0, 1):
        return 0
    q = 1 - p
    return - p * math.log(p, 2) - q * math.log(q, 2)


def entropy(labels):
    """Return the base-2 entropy of a collection of boolean labels.

    labels - Sized iterable of labels, passed on to ratio().
    """
    positive_share = ratio(labels)
    return entropy_sub(positive_share)


def split_entropy(labelsl, N):
    """Return the size-weighted sum of the entropies of several label sets.

    labelsl - Iterable of label collections, one per part of a split.
    N - Number the part sizes are divided by to obtain their weights.
    """
    total = 0
    for part in labelsl:
        weight = len(part) / N
        total += weight * entropy(part)
    return total


def information_gain(labels, indices):
    """Return the information gain of splitting labels in two parts.

    The first part holds the labels at the given positions, the second
    part the labels at all remaining positions.

    labels - Sequence of boolean labels.
    indices - Integer positions of the labels forming the first part.

    Returns entropy(labels) minus the size-weighted entropy of the two
    parts; 0 for an empty label sequence.
    """
    labels = np.asarray(labels)
    if len(labels) == 0:
        return 0
    chosen = np.asarray(indices, dtype=int)
    # The remainder has to be selected by position. The earlier set
    # difference of label *values* left at most one element in it, so the
    # second part of the split was effectively ignored.
    outside = np.ones(len(labels), dtype=bool)
    outside[chosen] = False
    seplabs = labels[chosen], labels[outside]
    return entropy(labels) - split_entropy(seplabs, len(labels))


def plot_entropy(N):
    """Plot entropy against label ratio for N synthetic label lists.

    List number i (0 <= i < N) has N + 1 entries, the first i + 1 of
    which are True.

    N - Number of points to plot.
    """
    samples = [[i >= j for j in range(N + 1)] for i in range(N)]
    ratios = [ratio(sample) for sample in samples]
    entropies = [entropy(sample) for sample in samples]
    plt.figure()
    plt.plot(ratios, entropies)
    plt.ylabel("Entropy")
    plt.xlabel("Label ratio (or, probability of a true)")
    plt.show()


# Entropy of the boolean outcome vector of the training set.
ey = entropy(ty)
print("Entropy of training y:", ey)
# Draw the entropy curve; the argument is the number of plotted points.
plot_entropy(int(420 / 4.2))

In [None]:
class DataRow(object):
    """One record, stored as a discrete and a numeric value sequence."""

    def __init__(self, discrete, numeric):
        """ Keeps both sequences as given.
            discrete - Indexable, sized collection of discrete values
            numeric - Indexable, sized collection of numeric values"""
        self.da, self.na = discrete, numeric

    def size_discrete(self):
        """ Number of discrete values in this row."""
        return len(self.da)

    def size_numeric(self):
        """ Number of numeric values in this row."""
        return len(self.na)

    def get_discrete(self, index):
        """ Discrete value stored at position index."""
        return self.da[index]

    def get_numeric(self, index):
        """ Numeric value stored at position index."""
        return self.na[index]


def create_rows(discrete, numeric):
    """Pair up the rows of two tables as a list of DataRow objects.

    discrete - Table of discrete values; its length sets the row count.
    numeric - Table of numeric values, indexed with the same positions.
    """
    return [DataRow(discrete[pos], numeric[pos])
            for pos in range(len(discrete))]


# Wrap the training features in DataRow objects, one per row.
tdra = create_rows(txd, txn)
# Sanity check: each accessor should print the same value as the raw table.
print(txd[4, 2], tdra[4].get_discrete(2))
print(txn[4, 2], tdra[4].get_numeric(2))

In [None]:
class DecisionTree(object):
    """Node of a decision tree over DataRow objects with boolean labels."""

    def __init__(self, data, y, tree_type=0, thres=0.1):
        """ Decision Tree creation, according to the following steps:
            1. Stores the attributes
                data - A vector of DataRow objects, each instance containing
                       the discrete and numeric data for one patient
                y - A vector of boolean class labels, each corresponding to a
                    DataRow instance of a patient at the same index.
                tree_type - 1: create DiscreteTrees only
                            2: create NumericTrees only
                            any other value (0 by default): the module-level
                            flag tryingNumeric decides; True selects a
                            NumericTree, anything else a DiscreteTree
                thres - The cutoff value for IG, to stop splitting the tree.
                        Below this value the node becomes terminal and no
                        further splits are made.
            2. Computes the split and converts itself to the selected type.
            3. The Information Gain of the split is compared to the thres
                parameter to determine if the node is terminal. If the node is
                not terminal, new subtrees are created for each subset
                resulting from the split. """
        self.data = np.array(data)
        # Label subsets are taken by index list, which needs an ndarray.
        self.y = np.asarray(y)
        self.tree_type = tree_type
        self.thres = thres
        # TODO: choose the split with the highest IG when tree_type is 0.
        # An explicit tree_type of 1 or 2 wins over the global flag, which
        # is only consulted (and only needs to exist) otherwise.
        if tree_type == 2 or (tree_type != 1 and tryingNumeric is True):
            self.convert_tree(NumericTree(self))
        else:
            self.convert_tree(DiscreteTree(self))
        # A split into fewer than two subsets cannot shrink the data, so
        # recursing on it would never end; such a node is always terminal.
        self.terminal = self.ig < thres or len(self.indices) < 2
        if not self.terminal:
            self.create_subtrees()

    def store_split_values(self, split_variable, indices, ig, vals):
        """ Sets the values of the passed parameters as object attributes.
            Also stores, per subset of the split, the ratio of True labels
            (ratli) and whether that ratio exceeds 0.5 (outli).
                split_variable - The index of variable on which the split was
                            based
                indices - The list of index lists, each corresponding to the
                            indices for a subset resulting from the split
                ig - Information Gain computed from the split
                vals - The list of branch values, one per subset in indices"""
        self.split_variable = split_variable
        self.indices = indices
        self.ig = ig
        self.vals = vals
        self.ratli = []
        self.outli = []
        for valind in indices:
            r = ratio(self.y[valind])
            self.ratli.append(r)
            self.outli.append(r > 0.5)

    def convert_tree(self, new_tree):
        """ Converts this object to the tree passed as the new_tree parameter.
            All attributes from the new_tree are transfered.
            N.B. This function has already been provided and does not need to be
            modified.
            new_tree - Either a DiscreteTree or a NumericTree instance, to which
                        this object is converted"""
        self.__class__ = new_tree.__class__
        self.__dict__ = new_tree.__dict__

    def create_subtrees(self):
        """ Based on the indices stored after the split, the different subsets
            of the data are created and for each a new DecisionTree made.
            The DecisionTrees are stored in a dict mapping the branch value from
            the split to the DecisionTree created for that value. Subtrees use
            the same tree_type and thres as this node."""
        self.branches = {}
        for value, members in zip(self.vals, self.indices):
            self.branches[value] = DecisionTree(self.data[members],
                                                self.y[members],
                                                self.tree_type,
                                                self.thres)

    def _majority_label(self):
        """ Returns True when more than half of the labels at this node are
            True. Used when a row cannot be matched to any branch."""
        return ratio(self.y) > 0.5

    def _branch_index(self, row):
        """ Returns the position in vals of the branch that row falls in, or
            None when the row's value did not occur at the split.
            This default looks the discrete value of the split variable up in
            vals; tree types that split differently override it.
            row - The DataRow object being classified"""
        value = row.get_discrete(self.split_variable)
        if value in self.vals:
            return self.vals.index(value)
        return None

    def classify(self, row):
        """ Returns the predicted label for the values stored in the row, based
            on the splits in the DecisionTree. Rows that match no branch get
            the majority label of the node they got stuck at.
            row - The DataRow object containing the values that are being
                    classified"""
        if not self.terminal:
            subtree = self.get_subtree(row)
            if subtree is None:
                return self._majority_label()
            return subtree.classify(row)
        branch = self._branch_index(row)
        if branch is None:
            return self._majority_label()
        return self.outli[branch]

    def validate(self, data, y):
        """ Classifies all the rows in data and compares the outcome to the
            labels specified in y. Returns the percentage of elements that was
            classified correctly.
            data - Either a sequence of DataRow instances, or a raw 2-D array
                   of feature values which is converted with disc_num_split and
                   create_rows first.
            y - List of boolean labels each belonging to the row at the same
                index
            Raises ValueError when y is empty."""
        if len(y) == 0:
            raise ValueError("cannot validate against an empty label list")
        if len(data) > 0 and isinstance(data[0], DataRow):
            rows = data
        else:
            # NOTE(review): disc_num_split picks discrete/numeric columns from
            # the distinct-value counts of `data` itself, so the layout may
            # differ from the one this tree was trained on. Passing DataRow
            # objects built with the training layout avoids that.
            disc, num = disc_num_split(data)
            rows = create_rows(disc, num)
        correct = sum(1 for row, label in zip(rows, y)
                      if self.classify(row) == label)
        return correct / len(y) * 100

    def split(self):
        """ Computes and stores the split of this node; provided by the
            DiscreteTree and NumericTree subclasses."""
        raise NotImplementedError

    def get_subtree(self, instance):
        """ Returns the subtree that instance falls in; provided by the
            DiscreteTree and NumericTree subclasses."""
        raise NotImplementedError


class DiscreteTree(DecisionTree):
    """Tree node that splits on the values of one discrete variable."""

    def __init__(self, dtree):
        """ Takes a DecisionTree as initialization parameter and copies all its
            attributes. Then calls the split() function to determine the optimal
            discrete variable to split this subset of the data on.
            dtree - The DecisionTree instance whose attributes are copied to this
                    DiscreteTree instance."""
        self.__dict__ = dtree.__dict__.copy()
        self.split()

    def split(self):
        """ Determines the best discrete variable to split the current dataset on,
            based on the resulting IG. For this best split variable, it stores
            the following values as object attributes:
                1. The index of the variable on which the split is based
                2. List of lists containing indices for the split
                3. IG of the split
                4. List of unique values for the split variable
            Raises ValueError when the rows hold no discrete variables."""
        nrows = len(self.data)
        natts = self.data[0].size_discrete()
        if natts == 0:
            raise ValueError("cannot split on discrete variables: rows have none")
        base_entropy = entropy(self.y)
        best_gain = float('-inf')
        best_att = None
        best_groups = {}
        for att in range(natts):
            # Row indices grouped by the value this variable takes.
            groups = {}
            for i in range(nrows):
                groups.setdefault(self.data[i].get_discrete(att), []).append(i)
            # IG of the whole multiway split: parent entropy minus the
            # size-weighted entropy of all value groups together. The earlier
            # average of per-value gains scaled the child entropy down by the
            # number of values and so overstated the gain.
            parts = [self.y[members] for members in groups.values()]
            gain = base_entropy - split_entropy(parts, nrows)
            if gain > best_gain:
                best_gain = gain
                best_att = att
                best_groups = groups
        self.store_split_values(best_att, list(best_groups.values()),
                                best_gain, list(best_groups.keys()))

    def get_subtree(self, row):
        """ Returns the subtree one branch down corresponding the to value of
            variable on which the split at this node was performed.
            Returns None if the value was not present at the split.
            row - The DataRow object containing the values that are being
                    classified"""
        value = row.get_discrete(self.split_variable)
        # Branches are keyed by the split value itself (see create_subtrees),
        # not by the position of that value in self.vals.
        return getattr(self, 'branches', {}).get(value)


# DecisionTree.__init__ reads this module-level flag; anything other than
# True makes it build a DiscreteTree.
tryingNumeric = False
ditr = DecisionTree(tdra, ty)
print("Accuracy (%) using discrete only:")
# validate() receives the raw validation features and converts them itself.
print(ditr.validate(vx, vy))

In [None]:
class NumericTree(DecisionTree):
    """Tree node that splits one numeric variable at a boundary value."""

    def __init__(self, dtree):
        """ Takes a DecisionTree as initialization parameter and copies all its
            attributes. Then calls the split() function to determine the optimal
            numeric variable and boundary to split this subset of the data on.
            dtree - The DecisionTree instance whose attributes are copied to this
                    NumericTree instance."""
        self.__dict__ = dtree.__dict__.copy()
        self.split()

    def split(self):
        """ Determines the best boundary for any numeric variable to split the
            current dataset on, based on the resulting IG. Candidate boundaries
            are the midpoints between consecutive distinct values of a variable;
            rows below the boundary form subset 0, all other rows subset 1.
            For the best boundary it stores the following object attributes:
                1. The index of the variable on which the split is based
                2. List of lists containing indices for the split
                3. IG of the split
                4. The value of the boundary for the split (self.boundary)
                5. A value list containing [0, 1]
            When no variable has two distinct values, a single subset with all
            rows is stored with an IG of -inf, so the node ends up terminal.
            Raises ValueError when the rows hold no numeric variables."""
        nrows = len(self.data)
        natts = self.data[0].size_numeric()
        if natts == 0:
            raise ValueError("cannot split on numeric variables: rows have none")
        labels = np.asarray(self.y)
        base_entropy = entropy(labels)
        best_gain = float('-inf')
        best_att = 0
        best_boundary = float('inf')
        best_groups = [list(range(nrows))]
        for att in range(natts):
            column = np.array([row.get_numeric(att) for row in self.data],
                              dtype=float)
            distinct = np.unique(column)  # sorted distinct values
            for boundary in (distinct[:-1] + distinct[1:]) / 2:
                below = np.flatnonzero(column < boundary)
                above = np.flatnonzero(column >= boundary)
                if len(below) == 0 or len(above) == 0:
                    # Can only happen through rounding or NaN values; such
                    # a boundary separates nothing.
                    continue
                gain = base_entropy - split_entropy(
                    (labels[below], labels[above]), nrows)
                if gain > best_gain:
                    best_gain = gain
                    best_att = att
                    best_boundary = boundary
                    # Plain lists, like the index lists of a discrete split.
                    best_groups = [below.tolist(), above.tolist()]
        self.store_split_values(best_att, best_groups, best_gain,
                                list(range(len(best_groups))))
        self.boundary = best_boundary

    def _branch_index(self, row):
        """ Returns 0 when the row's value of the split variable lies below
            the boundary and 1 otherwise, limited to the subsets that exist.
            row - The DataRow object being classified"""
        side = 1 if row.get_numeric(self.split_variable) >= self.boundary else 0
        return min(side, len(self.indices) - 1)

    def get_subtree(self, row):
        """ Returns the subtree one branch down based on the value of
            variable on which the split at this node was performed and the
            boundary used for this split. Returns None when this node has no
            subtree for that side.
            row - The DataRow object containing the values that are being
                    classified"""
        key = self.vals[self._branch_index(row)]
        return getattr(self, 'branches', {}).get(key)

    def classify(self, row):
        """ Returns the predicted label for the row: the answer of the subtree
            on the row's side of the boundary, or, at a terminal node, whether
            more than half of the training labels on that side were True.
            Implemented here so that a numeric node never depends on a
            discrete value lookup.
            row - The DataRow object containing the values that are being
                    classified"""
        if not self.terminal:
            subtree = self.get_subtree(row)
            if subtree is not None:
                return subtree.classify(row)
        return self.outli[self._branch_index(row)]


# With the flag set to True, DecisionTree.__init__ converts itself to a
# NumericTree; the flag is switched back right after construction.
tryingNumeric = True
nitr = DecisionTree(tdra, ty) # I kinda gave up, this HW is so long!
tryingNumeric = False

In [None]:
def split_data(data):
    """Train a tree on a random part of data and score it on the rest.

    data - 2-D array of rows whose last column is the outcome.

    Returns the percentage of validation rows classified correctly, as
    reported by DecisionTree.validate.
    """
    tset, vset = validation_split(data)
    tx, ty = x_y_split(tset)
    txd, txn = disc_num_split(tx)
    ttree = DecisionTree(create_rows(txd, txn), ty)
    vx, vy = x_y_split(vset)
    # validate() does the discrete/numeric split and DataRow wrapping of
    # the validation features itself, so they are passed in raw. The
    # second x_y_split of the training set and the discarded
    # disc_num_split(vx) of the earlier version were redundant.
    return ttree.validate(vx, vy)


# Keep the flag read by DecisionTree.__init__ off, so discrete trees are built.
tryingNumeric = False
# One train/validate round on a fresh random split of the cleaned rows.
split_data(ccombo)

## Analysis questions [2 pts]

If your algorithm is correct and you averaged over enough different validation splits, you might see some strange results in the comparison you just produced. For the last part of the assignment, answer these questions about the results. Write your answers below each question inside this cell.

#### Why does the validation score for the training for numeric and combined splits eventually reach 100% correct, but never for just discrete splits?

It could be that the discrete data is not complete and/or consistent enough to make 100% accuracy possible. The tree will always be impure.

#### Can you explain how it is possible that the value of just the discrete variables is higher than the value of the discrete and numeric variables combined? What property of the algorithm makes this outcome possible?

The validation split splits the data randomly. Therefore the outcome is somewhat random as well.

#### What is your hypothesis for why this happens for this particular data? What could you do to improve the validation results?

People are more complicated than 13 variables, etc. There are exceptions to most rules when dealing with something so complex.