# Implement the Recursive Algorithm of Decision Tree

In [4]:
import numpy as np

In [185]:
def value_counts(data):
    # TODO Calculate the number of each label.
    # Return a dict with the labels as keys, and
    # their accurances as values
    unique, counts = np.unique(data[:,-1], return_counts=True)
    labels = dict(zip(unique, counts))
    return labels

def divide_data(data, feature_column, feature_val):
    data1 = []
    data2 = []
    # TODO split the data into two parts by feature_column,
    # where data1 contains all with value at feature column less than
    # feature_value, and data2 contains all values larger that veature_val
    data1 = data[data[:,feature_column] < feature_val]
    data2 = data[data[:,feature_column] >= feature_val]
    return data1, data2

def gini(data):
    #TODO calculate the gini
    sumUp = 0
    for i in np.unique(data[:,-1]):
        sumUp+=(len(data[data[:,-1]==i])/len(data[:,-1]))**2
    gini = 1 - sumUp      
    return gini

def entropy(data):
    #TODO calculate the entropy
    sumUp = 0
    for i in np.unique(data[:,-1]):
        sumUp+=(len(data[data[:,-1]==i])/len(data[:,-1]))*np.log(len(data[data[:,-1]==i])/len(data[:,-1]))
    entropy = -sumUp    
    return entropy

def square_loss(data):
    #TODO calculate the loss
    for i in data[:,-1]:
        y_pred = np.sum(data[:,-1])/len(data[:,-1])
        loss+=(i-y_pred)**2
    loss = loss/len(data[:,-1])
    return loss

class DecisionNode(object):
    def __init__(self,
                 column=None,
                 value=None,
                 false_branch=None,
                 true_branch=None,
                 current_results=None,
                 is_leaf=False):
        """
        column: column is the index of feature by wich data is splitted
        value: value is column's value by which we filter data into splits
        true_branch: boolean, if True, it is the true branch of it's parent
        false_branch: boolean, if True, it is the false branch of it's parent
        is_leaf: boolean, if True, node has no child
        current_results: is value_counts(data) for data which reached this node
        """
        
        self.column = column
        self.value = value
        self.false_branch = false_branch
        self.true_branch = true_branch
        self.current_results = current_results
        self.is_leaf = is_leaf

def build_tree(data, current_depth=0, max_depth=4, criterion=gini, task="classification"):
    """
    Task can be classification or regression
    Criterion is inpurity function to use
    """

    if len(data) == 0:
        return DecisionNode(is_leaf=True)

    if current_depth == max_depth:
        return DecisionNode(current_results=value_counts(data), is_leaf=True)
    
    if len(value_counts(data)) == 1:
        return DecisionNode(current_results=value_counts(data), is_leaf=True)

    # TODO, calculate best split 
    # split_pos = []
    # split_neg = []
    info_gain = 0

    for i in range(data.shape[0]):
        for j in range(data.shape[1]):
            split_neg,split_pos = divide_data(data,j,i)
            temp = criterion(data) - (split_neg.shape[0]/data.shape[0])*criterion(split_neg) - (split_pos.shape[0]/data.shape[0])*criterion(split_pos)
            if(temp>info_gain):
                info_gain = temp
                best_column = j
                best_value = i
    
    # if we cannot improve by splitting:           
    if(criterion(data)==0):
        return DecisionNode(current_results=value_counts(data), is_leaf=True)
    else:
        return DecisionNode(column=best_column,
                            value=best_value,
                            current_results=value_counts(data),
                            false_branch=build_tree(split_neg, current_depth+1, max_depth),
                            true_branch=build_tree(split_pos, current_depth+1, max_depth))


In [186]:
class DecisionTree(object):
    def __init__(self, max_tree_depth=4, criterion="gini", task="classification"):
        self.max_depth = max_tree_depth
        self.tree = None
        self.task = task
        
        self.criterion = gini
        if criterion == "entropy":
            self.criterion = entropy
        if criterion == "square_loss":
            self.criterion = square_loss

    def fit(self, X, y):
        # build data
        data = np.concatenate((X, y.reshape(-1,1)),axis = 1)
        self.tree = build_tree(data,
                               task=self.task,
                               max_depth=self.max_depth, 
                               criterion=self.criterion)
    def predict(self, X):
        Y = []
        
        for i in X:
            tree = self.tree
            while tree.is_leaf==False:   
                if(i[tree.column] < tree.value):   
                    tree = tree.false_branch
                else:
                    tree = tree.true_branch
            else:
                Y.append(tree.current_results)
        # TODO get labels       
        return Y

# Perform Results on Iris Dataset

In [187]:
from sklearn.datasets import load_iris
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split

boston = load_boston(return_X_y=True) 
boston = np.concatenate((boston[0], boston[1].reshape(-1,1)),axis = 1)
iris = load_iris(return_X_y=True)
iris = np.concatenate((iris[0], iris[1].reshape(-1,1)),axis = 1)


In [188]:
X_train, X_test, y_train, y_test = train_test_split(iris[:,:-1], iris[:,-1], test_size=0.33, random_state=58)

In [189]:
model = DecisionTree()

In [190]:
model.fit(X_train,y_train)

UnboundLocalError: local variable 'info_gain' referenced before assignment

In [184]:
model.predict(X_test)

AttributeError: 'NoneType' object has no attribute 'is_leaf'