In [19]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import csv
from random import choices

### Questions
* how to handle categorical variables?
* make classes?

### To Do
* find new dataset
* function to pretty print tree

## To Explain

### Decision Tree
* purity/impurity
* entropy vs. Gini index
* using decision tree for prediction

### Random Forest
* bootstrap aggregating
* OOB error estimating
* pros and cons of random forest

### Extra
* compare classification with sci-kit learn functions vs. random forest

In [20]:
X = []
y = []

#with open("Churn.csv", newline="") as f:
#with open("ChurnTestMedium.csv", newline="") as f:
with open("ChurnTest.csv", newline="") as f:
    reader = csv.reader(f)
    next(reader)
    for line in reader:
        X.append([float(num) for num in line[0:-1]]) # save features to X list
        y.append(int(line[-1])) # save class to y list
        #X_list.append([float(num) for num in line[0:-1]]) # save features to X list
        #y_list.append(int(line[-1])) # save class to y list

print(X)
print(y)

[[619.0, 0.0], [502.0, 0.0], [645.0, 1.0], [822.0, 1.0], [376.0, 0.0], [501.0, 1.0], [684.0, 1.0], [528.0, 1.0], [616.0, 1.0], [653.0, 1.0]]
[1, 1, 1, 0, 1, 0, 0, 0, 0, 1]


In [24]:
# take a dataset and create bootstrapped datasets

# return one bootstrapped dataset
# take len(data) samples from the dataset with replacement
def bootstrap(X, y):
    bsX = choices(X, k=len(X))
    bsY = choices(y, k=len(y))
    return bsX, bsY

#bootstrap(X, y)
# return numtrees bootstrapped datasets (call prev function)

([[616.0, 1.0],
  [528.0, 1.0],
  [645.0, 1.0],
  [645.0, 1.0],
  [528.0, 1.0],
  [684.0, 1.0],
  [619.0, 0.0],
  [645.0, 1.0],
  [502.0, 0.0],
  [619.0, 0.0]],
 [1, 0, 0, 1, 1, 0, 1, 1, 0, 0])

In [4]:
# utility functions to support building decision tree

# calculate cost function for split (entropy)
def calc_entropy(y_vals):
    ent = 0
    for y_val in set(y_vals):
        prop = len([val for val in y_vals if val==y_val]) / len(y_vals)
        ent += (-1 * prop) * np.log2(prop) # update entropy using formula
    return ent

# split data
def split_data(pred_idx, pred_val, X_vals, y_vals):
        X_left, X_right, y_left, y_right = [], [], [], []
        for i in range(len(X_vals)):
            if X_vals[i][pred_idx] < pred_val:
                X_left.append(X_vals[i])
                y_left.append(y_vals[i])
            else:
                X_right.append(X_vals[i])
                y_right.append(y_vals[i])
        return X_left, X_right, y_left, y_right

# calculate information gain
def calc_infogain(parent_yvals, left_yvals, right_yvals):
    H = calc_entropy(parent_yvals) # entropy of parent node
    #print("H: ", H)
    H_left = calc_entropy(left_yvals) # entropy of left child node
    #print("H_left: ", H_left)
    H_right = calc_entropy(right_yvals) # entropy of right child node
    #print("H_right: ", H_right)
    P_left = len(left_yvals) / len(parent_yvals)
    P_right = len(right_yvals) / len(parent_yvals)
    cond_entropy = (H_left * P_left) + (H_right * P_right) # conditional entropy to compare to parent node
    #print("cond_entropy: ", cond_entropy)
    return H - cond_entropy # difference between parent node and child node entropy

# determine best split (or no split)
def best_split(X, y):
    m = int(np.round(np.sqrt(len(X[1])),2)) # set number of predectors to test = sqrt total # predictors
    pred_idxs_to_test = np.random.choice(range(0,len(X[1])),m, replace=False) # select random subset of predictors to test
    pred_vals_to_test = np.mean(X, axis=0)[pred_idxs_to_test] # use mean value for each predictor as split value
    best_idx = 0
    best_val = 0
    max_infogain = 0
    #max_infogain, best_idx, best_val, best_left, best_right = 0, 9999999, 9999999, {}, {}
    X_left, X_right, y_left, y_right = [], [], [], []
    for i in range(len(pred_idxs_to_test)): # for each predictor in random subset
        X_l, X_r, y_l, y_r = split_data(pred_idxs_to_test[i], pred_vals_to_test[i], X, y) # split data on mean value for each predictor
        infogain = calc_infogain(y, y_l, y_r)
        if infogain > max_infogain: # determine if split increases information gain / reduces entropy
            max_infogain = infogain
            best_idx = pred_idxs_to_test[i]
            best_val = pred_vals_to_test[i]
            X_left, y_left = X_l, y_l
            X_right, y_right = X_r, y_r
            #best_left = {"X_left": X_l, "y_left": y_l}
            #best_right = {"X_right": X_r, "y_right": y_r}
    #print("max_infogain", max_infogain)
    return {"pred_idx": best_idx, "pred_val": best_val, "left": {"X_left": X_left, "y_left": y_left}, "right": {"X_right": X_right, "y_right": y_right}}

In [5]:
class DecisionTree(object):
    # create new instance of DecisionTree
    def __init__(self, depth):
        self.max_depth = depth
        self.tree = {}
    
    # build decision tree
    def build_tree(self, X, y, parent={}, depth=0):
        
        # grow decision tree
        def grow_tree(X, y):
            self.tree = best_split(X, y) # get root node with best split for full data
            parent = {} # begin with empty parent node
            split_tree(self.tree, parent, 1) # call recursive function to build tree

        # split tree, called recursively
        def split_tree(node, parent_node, d):

            # save data from node to be used in split if needed
            left, right = node["left"], node["right"]
            #print("left: ", left)
            #print("right: ", right)

            # delete data from node so can reassign best classification
            del(node["left"], node["right"])
            #print("node: ", node)

            # check if node contains empty dataset
            if len(left["X_left"])==0 or len(right["X_right"])==0:
            #if not left["X_left"] or not right["X_right"]:
                # assign each branch of the node to the most common class from the parent node
                node["left"] = max(set(parent_node["left"]["y_left"]), key=parent_node["left"]["y_left"].count)
                node["right"] = max(set(parent_node["right"]["y_right"]), key=parent_node["right"]["y_right"].count)
                return

            elif d >= self.max_depth: # check if tree has been split maximum number of times
                # assign each branch of the node to the most common class from this node
                node['left'] = max(set(left['y_left']), key=left['y_left'].count)
                node['right'] = max(set(right['y_right']), key=right['y_right'].count)
                #node["left"] = max(set(parent_node["left"]["y_left"]), key=parent_node["left"]["y_left"].count)
                #node["right"] = max(set(parent_node["right"]["y_right"]), key=parent_node["right"]["y_right"].count)
                return
            else:
                # check left and right datasets to see if need to split more or make terminal node
                # assess left node
                if len(set(left["y_left"]))==1:
                    # assign each branch of the node to the most common class from this node
                    node['left'] = max(set(left['y_left']), key=left['y_left'].count)
                else:
                    # split this branch by calling split_tree function
                    node["left"] = best_split(left["X_left"], left["y_left"])
                    parent = {"left": left, "right": right}
                    split_tree(node["left"], parent, d+1)
                # assess right node
                if len(set(right["y_right"]))==1: 
                    # assign each branch of the node to the most common class from this node
                    node['right'] = max(set(right['y_right']), key=right['y_right'].count)
                    return
                else:
                    # split this branch by calling split_tree function
                    node["right"] = best_split(right["X_right"], right["y_right"])
                    parent = {"left": left, "right": right}
                    split_tree(node["right"], parent, d+1)
        
        # call grow_tree to create decision tree
        grow_tree(X, y)
                
    # predict classification for new datapoint
    def predict(self, x):
        curr_node = self.tree
        while True:
            if x[curr_node['pred_idx']] < curr_node['pred_val']:
                if type(curr_node['left'])==int:
                    return curr_node['left']
                else:
                    curr_node = curr_node['left']
                    continue
            else:
                if type(curr_node['right'])==int:
                    return curr_node['right']
                else:
                    curr_node = curr_node['right']
                    continue

In [12]:
# test decision tree
mytree = DecisionTree(5)
mytree.build_tree(X, y)
#print("tree: ", mytree.tree)

In [13]:
# test predict
preds = []
for i in range(len(X)):
    preds.append(mytree.predict(X[i]))
#print("predictions: ", preds)
#print("actual vals: ", y)

In [14]:
# calculate accuracy
def calc_accuracy(y_hat, y):
    correct = []
    for i in range(len(y_hat)):
        correct.append(y_hat[i]==y[i])
    return sum(correct) / len(y_hat)
    
print("accuracy: ", calc_accuracy(preds, y))

accuracy:  0.7116


In [15]:
from statistics import mean
accuracies = []
for i in range(100):
    #print("building tree #", i+1)
    tree = DecisionTree(8)
    tree.build_tree(X, y)
    preds = []
    for j in range(len(X)):
        preds.append(tree.predict(X[j]))
    accuracies.append(calc_accuracy(preds, y))
    #print(accuracies)
print("avg accuracy: ", mean(accuracies))

avg accuracy:  0.770266


In [None]:
# function to calculate cost function for split (entropy)
def calc_entropy(y_vals):
    ent = 0
    for y_val in set(y_vals):
        prop = len([val for val in y_vals if val==y_val]) / len(y_vals)
        ent += (-1 * prop) * np.log2(prop) # update entropy using formula
    return ent

calc_entropy(y)
#calc_entropy(data[:,-1])
#test_list = [1, 0, 1, 1, 1]
#calc_entropy(test_list)

In [4]:
# new function to split data
def split_data(pred_idx, pred_val, X_vals, y_vals):
    X_left, X_right, y_left, y_right = [], [], [], []
    for i in range(len(X_vals)):
        if X_vals[i][pred_idx] < pred_val:
            X_left.append(X_vals[i])
            y_left.append(y_vals[i])
        else:
            X_right.append(X_vals[i])
            y_right.append(y_vals[i])
    return X_left, X_right, y_left, y_right

#X_left, X_right, y_left, y_right = split_data(0, 510, X, y)
#print("left: ", X_left, y_left)
#print("right: ", X_right, y_right)

In [5]:
# function to calculate information gain
def calc_infogain(parent_yvals, left_yvals, right_yvals):
    H = calc_entropy(parent_yvals) # entropy of parent node
    #print("H: ", H)
    H_left = calc_entropy(left_yvals) # entropy of left child node
    #print("H_left: ", H_left)
    H_right = calc_entropy(right_yvals) # entropy of right child node
    #print("H_right: ", H_right)
    P_left = len(left_yvals) / len(parent_yvals)
    P_right = len(right_yvals) / len(parent_yvals)
    cond_entropy = (H_left * P_left) + (H_right * P_right) # conditional entropy to compare to parent node
    #print("cond_entropy: ", cond_entropy)
    return H - cond_entropy # difference between parent node and child node entropy

#parent_y = data[:,-1]
#left_node, right_node = split_data(0, 600, data)
#print(left_node)
#print(right_node)
#calc_infogain(parent_y, [obs[-1] for obs in left_node], [obs[-1] for obs in right_node])
X_left, X_right, y_left, y_right = split_data(0, 600, X, y)
calc_infogain(y, y_left, y_right)

0.01100085281782226

In [None]:
m = int(np.round(np.sqrt(len(X[1])),2)) # set number of predectors to test = sqrt total # predictors
pred_idxs_to_test = np.random.choice(range(0,len(X[1])),m, replace=False) # select random subset of predictors to test
pred_vals_to_test = np.mean(X, axis=0)[pred_idxs_to_test] # use mean value for each predictor as split value
print(pred_vals_to_test)

In [110]:
# new function to determine best split (or no split)
def best_split(X, y):
    m = int(np.round(np.sqrt(len(X[1])),2)) # set number of predectors to test = sqrt total # predictors
    pred_idxs_to_test = np.random.choice(range(0,len(X[1])),m, replace=False) # select random subset of predictors to test
    pred_vals_to_test = np.mean(X, axis=0)[pred_idxs_to_test] # use mean value for each predictor as split value
    max_infogain, best_idx, best_val, best_left, best_right = 0, 9999999, 9999999, {}, {}
    for i in range(len(pred_idxs_to_test)): # for each predictor in random subset
        X_left, X_right, y_left, y_right = split_data(pred_idxs_to_test[i], pred_vals_to_test[i], X, y) # split data on mean value for each predictor
        infogain = calc_infogain(y, y_left, y_right)
        if infogain > max_infogain: # determine if split increases information gain / reduces entropy
            max_infogain = infogain
            best_idx = pred_idxs_to_test[i]
            best_val = pred_vals_to_test[i]
            best_left = {"X_left": X_left, "y_left": y_left}
            best_right = {"X_right": X_right, "y_right": y_right}
    #print("max_infogain", max_infogain)
    node = {"pred_idx": best_idx, "pred_val": best_val, "left": best_left, "right": best_right}
    print("node from best split: ", node)
    return node
    #return {"pred_idx": best_idx, "pred_val": best_val, "left": best_left, "right": best_right}

#best_split(X, y)

In [None]:
# function to determine best split (or no split)
'''def best_split(data):
    y_vals = data[:,-1] # extract response values from data
    #max_infogain = 0
    m = int(np.round(np.sqrt(data.shape[1]-1))) # set number of predectors to test = sqrt total # predictors
    pred_idxs_to_test = np.random.choice(range(0,data.shape[1]-1),m, replace=False) # select random subset of predictors to test
    pred_vals_to_test = np.mean(data[:,0:-1], axis=0)[pred_idxs_to_test] # use mean value for each predictor as split value
    #print(pred_idxs_to_test)
    #max_infogain, best_idx, best_val, best_left, best_right = 0, 999, 999, list(), list()
    max_infogain, best_idx, best_val, best_left, best_right = 0, 999, 999, None, None
    for i in range(len(pred_idxs_to_test)): # for each predictor in random subset
        left, right = split_data(pred_idxs_to_test[i], pred_vals_to_test[i], data) # split data on mean value for each predictor
        #infogain = calc_infogain(y_vals, [obs[-1] for obs in left], [obs[-1] for obs in right])
        infogain = calc_infogain(y_vals, left[:,-1], right[:,-1])
        if infogain > max_infogain: # determine if split increases information gain / reduces entropy
            max_infogain = infogain
            best_idx = pred_idxs_to_test[i]
            best_val = pred_vals_to_test[i]
            best_left, best_right = left, right
    print("max_infogain", max_infogain)
    return {"pred_idx": best_idx, "pred_val": best_val, "left": best_left, "right": best_right}

best_split(data)'''

In [None]:
# old
# function to determine best split (or no split)
'''def best_split(data):
    y_vals = data[:,-1] # extract response values from data
    max_infogain = 0
    m = int(np.round(np.sqrt(data.shape[1]-1))) # set number of predectors to test = sqrt total predictors
    pred_vals_to_test = np.random.choice(range(0,data.shape[1]-1),m, replace=False) # select random subset of predictors to test
    print(pred_vals_to_test)
    #max_infogain, best_idx, best_val, best_left, best_right = 0, 999, 999, list(), list()
    for idx in pred_vals_to_test:
        for row in data:
            left, right = split_data(idx, row[idx], data)
            infogain = calc_infogain(y_vals, [obs[-1] for obs in left], [obs[-1] for obs in right])
            #print("infogain: ", infogain)
            if infogain > max_infogain:
                max_infogain = infogain
                best_idx = idx
                best_val = row[idx]
                best_left, best_right = left, right
    print("max_infogain", max_infogain)
    return {"pred_idx": best_idx, "pred_val": best_val, "left": best_left, "right": best_right}

best_split(data)'''

In [7]:
# function to build decision tree
def build_tree(X, y):
    #tree = best_split(X, y) # get root node with best split for full data
    depth = 0 # set initial depth to 0
    parent = {} # begin with empty parent node
    split_tree(tree, parent, depth) # call recursive function to build tree

In [8]:
# function to split tree, called recursively
def split_tree(node, parent_node, d):
    
    # save data from node to be used in split if needed
    left, right = node["left"], node["right"]
    
    # delete data from node so can reassign best classification
    del(node["left"], node["right"])
    
    # check if node contains empty dataset
    if len(left["X_left"])==0 or len(right["X_right"])==0:
        # assign each branch of the node to the most common class from the parent node
        node["left"] = max(set(parent_node["left"]["y_left"]), key=parent_node["left"]["y_left"].count)
        node["right"] = max(set(parent_node["right"]["y_right"]), key=parent_node["right"]["y_right"].count)
        return
    
    # check if tree has been split maximum number of times
    elif d >= max_depth:
        # assign each branch of the node to the most common class from the parent node
        node["left"] = max(set(parent_node["left"]["y_left"]), key=parent_node["left"]["y_left"].count)
        node["right"] = max(set(parent_node["right"]["y_right"]), key=parent_node["right"]["y_right"].count)
        return
    else:
        # check left and right datasets to see if need to split more or make terminal node
        # assess left node
        if len(set(left["y_left"]))==1:
            # assign left branch of the node to the most common class from the parent node
            node["left"] = max(set(parent_node["left"]["y_left"]), key=parent_node["left"]["y_left"].count)
        else:
            # split this branch by calling split_tree function
            node["left"] = best_split(left["X_left"], left["y_left"])
            parent = {"left": left, "right": right}
            split_tree(node["left"], parent, d+1)
        # assess right node
        if len(set(right["y_right"]))==1:
            # assign right branch of the node to the most common class from the parent node
            node["right"] = max(set(parent_node["right"]["y_right"]), key=parent_node["right"]["y_right"].count)
            return
        else:
            # split this branch by calling split_tree function
            node["right"] = best_split(right["X_right"], right["y_right"])
            parent = {"left": left, "right": right}
            split_tree(node["right"], parent, d+1)

In [9]:
# set number of trees
num_trees = 100

In [None]:
# bootstrap datasets

In [10]:
# main function
# set maximum depth for tree
max_depth = 5
# get root node for tree
tree = best_split(X, y)
# split root node recursively
build_tree(X, y)
print("tree: ", tree)

tree:  {'pred_idx': 7, 'pred_val': 0.55, 'left': {'pred_idx': 8, 'pred_val': 104090.83333333333, 'left': {'pred_idx': 5, 'pred_val': 1.3333333333333333, 'left': 1, 'right': 1}, 'right': 0}, 'right': {'pred_idx': 3, 'pred_val': 3.909090909090909, 'left': {'pred_idx': 1, 'pred_val': 0.6666666666666666, 'left': {'pred_idx': 8, 'pred_val': 100768.155, 'left': 0, 'right': 0}, 'right': 0}, 'right': 0}}


In [11]:
# function to predict classification for new datapoint
def predict(x):
    curr_node = tree
    while True:
        if x[curr_node['pred_idx']] < curr_node['pred_val']:
            if type(curr_node['left'])==int:
                return curr_node['left']
            else:
                curr_node = curr_node['left']
                continue
        else:
            if type(curr_node['right'])==int:
                return curr_node['right']
            else:
                curr_node = curr_node['right']
                continue

#preds = []
#for i in range(len(X)):
#    preds.append(predict(X[i]))
#print("predictions: ", preds)
#print("actual vals: ", y)

In [12]:
# function to calculate accuracy
def calc_accuracy(y_hat, y):
    correct = []
    for i in range(len(y_hat)):
        correct.append(y_hat[i]==y[i])
    return sum(correct) / len(y_hat)
    
#print("accuracy: ", calc_accuracy(preds, y))

In [159]:
from statistics import mean
accuracies = []
for i in range(100):
    #print("building tree #", i+1)
    # set maximum depth for tree
    max_depth = 5
    # get root node for tree
    tree = best_split(X, y)
    # split root node recursively
    build_tree(X, y)
    preds = []
    for j in range(len(X)):
        preds.append(predict(X[j]))
    accuracies.append(calc_accuracy(preds, y))
print("avg accuracy: ", mean(accuracies))

avg accuracy:  0.694758


#### References
http://www.cs.cmu.edu/afs/cs.cmu.edu/academic/class/15381-s06/www/DTs.pdf
https://towardsdatascience.com/what-is-out-of-bag-oob-score-in-random-forest-a7fa23d710