In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns

## To Explain

### Decision Tree
* purity/impurity
* entropy vs. Gini index
* using decision tree for prediction

### Random Forest
* bootstrap aggregating
* out-of-bag (OOB) error estimation
* pros and cons of random forest

### Extra
* compare classification with scikit-learn functions vs. random forest

In [63]:
# Load the small ChurnTest.csv sample and convert it to a plain numpy array.
# NOTE: `data` is rebound to the contents of Churn.csv in a later cell.
data = pd.read_csv("ChurnTest.csv").to_numpy()
#data.head()  # would not work here: `data` is an ndarray, not a DataFrame
data

array([[619,   0,   1],
       [502,   0,   1],
       [645,   1,   1],
       [822,   1,   0],
       [376,   0,   1],
       [501,   1,   0],
       [684,   1,   0],
       [528,   1,   0],
       [616,   1,   0],
       [653,   1,   1]], dtype=int64)

In [68]:
# column-wise mean over every column except the last, then pick the value for column index 1
np.mean(data[:,0:-1], axis=0)[1]

0.7

In [71]:
# Load Churn.csv as a numpy array; this rebinds `data`, replacing the ChurnTest.csv sample loaded earlier.
data = pd.read_csv("Churn.csv").to_numpy()

In [62]:
pd.read_csv("Churn.csv").head()

Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,X9,Y
0,619,0,42,2,0.0,1,1,1,101348.88,1
1,502,0,42,8,159660.8,3,1,0,113931.57,1
2,645,1,44,8,113755.78,2,1,0,149756.71,1
3,822,1,50,7,0.0,2,1,1,10062.8,0
4,376,0,29,4,115046.74,4,1,0,119346.88,1


In [None]:
#sns.FacetGrid(data, hue="Y", height=4).map(plt.scatter, "X1", "X2").add_legend()
#plt.show()

In [None]:
# function to calculate metric for split (entropy)
#def calc_entropy(obs_list):
#    ent = 0
 #   for unique_class in set(obs_list.iloc[:,-1]):
  #      prop = len(obs_list[obs_list.iloc[:,-1]==unique_class]) / len(obs_list)
   #     ent += (-1 * prop) * np.log2(prop)
    #return ent

#calc_entropy(data)

In [55]:
# function to calculate the split metric (Shannon entropy of the class labels)
def calc_entropy(y_vals):
    """Return the Shannon entropy, in bits, of a collection of class labels.

    Parameters
    ----------
    y_vals : sequence or 1-D array
        Class label of every observation in a node.

    Returns
    -------
    float
        -sum(p * log2(p)) over the distinct labels, where p is the share of
        observations carrying that label. 0.0 for an empty input or when
        every observation has the same label.
    """
    labels = np.asarray(y_vals)
    if labels.size == 0:
        return 0.0  # no observations, so no uncertainty to measure
    # Count every distinct label in a single vectorized pass instead of
    # rescanning the whole input once per class.
    _, counts = np.unique(labels, return_counts=True)
    props = counts / labels.size
    # Adding 0.0 turns the -0.0 produced for a pure node into a plain 0.0.
    return float(-np.sum(props * np.log2(props))) + 0.0

# entropy of the values in the last column of `data` (the column the functions below use as the response)
calc_entropy(data[:,-1])
#test_list = [1, 0, 1, 1, 1]
#calc_entropy(test_list)

0.9751150605666907

In [56]:
# function to partition the observations around a threshold on one predictor
def split_data(pred_idx, pred_val, data):
    """Divide the rows of ``data`` by comparing one column against a threshold.

    pred_idx : index of the column to compare
    pred_val : threshold value
    data     : 2-D numpy array with one observation per row

    Returns ``(left_node, right_node)``: the rows whose column value is
    strictly below ``pred_val``, and the rows whose column value is greater
    than or equal to it.
    """
    column = data[:, pred_idx]
    goes_left = column < pred_val
    # evaluated as its own comparison rather than as the negation of goes_left
    goes_right = column >= pred_val
    return data[goes_left], data[goes_right]

#left, right = split_data(0, 600, data)
#print("left: ", left)
#print("right: ", right)

In [57]:
# function to calculate information gain
def calc_infogain(parent_yvals, left_yvals, right_yvals):
    """Information gain of a split: parent entropy minus weighted child entropy.

    parent_yvals : class labels of the node being split
    left_yvals   : class labels that end up in the left child
    right_yvals  : class labels that end up in the right child

    Each child's entropy is weighted by its share of the parent's
    observations. Raises ZeroDivisionError when ``parent_yvals`` is empty.
    """
    entropy_parent, entropy_left, entropy_right = (
        calc_entropy(labels) for labels in (parent_yvals, left_yvals, right_yvals)
    )
    n_parent = len(parent_yvals)
    share_left = len(left_yvals) / n_parent
    share_right = len(right_yvals) / n_parent
    # conditional entropy of the children, to be compared against the parent
    entropy_children = (entropy_left * share_left) + (entropy_right * share_right)
    return entropy_parent - entropy_children

# Try one split by hand: threshold 400 on the column at index 0,
# then measure the information gain on the last-column labels.
parent_y = data[:,-1]
left_node, right_node = split_data(0, 400, data)
#print(left_node)
#print(right_node)
calc_infogain(parent_y, [obs[-1] for obs in left_node], [obs[-1] for obs in right_node])

0.004938050068641009

In [72]:
# function to determine best split (or no split), using each predictor's mean as the threshold
def best_split(data):
    """Pick the best mean-threshold split among a random subset of predictors.

    A subset of round(sqrt(number of predictors)) columns is drawn without
    replacement. Each drawn column is split at its mean, and the split with
    the largest strictly positive information gain wins.

    NOTE(review): this notebook defines ``best_split`` in two cells; whichever
    cell was executed last is the definition in effect.

    Parameters
    ----------
    data : 2-D numpy array
        One observation per row; the last column is used as the response.

    Returns
    -------
    dict
        Keys "pred_idx", "pred_val", "left", "right". All four values are
        None when no tested split has information gain greater than zero.
    """
    y_vals = data[:, -1]  # extract response values from data
    n_preds = data.shape[1] - 1
    m = int(np.round(np.sqrt(n_preds)))  # number of predictors to test = sqrt of total predictors
    # select random subset of predictors to test
    pred_idxs_to_test = np.random.choice(range(0, n_preds), m, replace=False)
    # use the mean value of each selected predictor as its split value
    pred_vals_to_test = np.mean(data[:, 0:-1], axis=0)[pred_idxs_to_test]

    max_infogain = 0
    # Start from "no split" so the return below is well defined even when no
    # candidate improves on zero gain.
    best_idx = best_val = best_left = best_right = None
    for idx, val in zip(pred_idxs_to_test, pred_vals_to_test):
        left, right = split_data(idx, val, data)
        infogain = calc_infogain(y_vals, left[:, -1], right[:, -1])
        if infogain > max_infogain:  # keep the split only if it beats the best gain so far
            max_infogain = infogain
            best_idx, best_val = idx, val
            best_left, best_right = left, right
    print("max_infogain", max_infogain)
    return {"pred_idx": best_idx, "pred_val": best_val, "left": best_left, "right": best_right}

# best_split draws its predictor subset with np.random.choice, so the result can differ between runs
best_split(data)

max_infogain 0.000640876886273789


{'pred_idx': 8,
 'pred_val': 99187.11793800001,
 'left': array([[8.220000e+02, 1.000000e+00, 5.000000e+01, ..., 1.000000e+00,
         1.006280e+04, 0.000000e+00],
        [5.010000e+02, 1.000000e+00, 4.400000e+01, ..., 1.000000e+00,
         7.494050e+04, 0.000000e+00],
        [6.840000e+02, 1.000000e+00, 2.700000e+01, ..., 1.000000e+00,
         7.172573e+04, 0.000000e+00],
        ...,
        [7.090000e+02, 0.000000e+00, 3.600000e+01, ..., 1.000000e+00,
         4.208558e+04, 1.000000e+00],
        [7.720000e+02, 1.000000e+00, 4.200000e+01, ..., 0.000000e+00,
         9.288852e+04, 1.000000e+00],
        [7.920000e+02, 0.000000e+00, 2.800000e+01, ..., 0.000000e+00,
         3.819078e+04, 0.000000e+00]]),
 'right': array([[6.1900000e+02, 0.0000000e+00, 4.2000000e+01, ..., 1.0000000e+00,
         1.0134888e+05, 1.0000000e+00],
        [5.0200000e+02, 0.0000000e+00, 4.2000000e+01, ..., 0.0000000e+00,
         1.1393157e+05, 1.0000000e+00],
        [6.4500000e+02, 1.0000000e+00, 4.400

In [59]:
# function to determine best split (or no split), trying every observed value as a threshold
def best_split(data):
    """Search a random subset of predictors for the threshold with the highest gain.

    A subset of round(sqrt(number of predictors)) columns is drawn without
    replacement. For each drawn column, every distinct value found in that
    column is tried as a threshold, and the split with the largest strictly
    positive information gain wins.

    NOTE(review): this notebook defines ``best_split`` in two cells; whichever
    cell was executed last is the definition in effect.

    Parameters
    ----------
    data : 2-D numpy array
        One observation per row; the last column is used as the response.

    Returns
    -------
    dict
        Keys "pred_idx", "pred_val", "left", "right". All four values are
        None when no tested split has information gain greater than zero.
    """
    y_vals = data[:, -1]  # extract response values from data
    max_infogain = 0
    # Start from "no split" so the return below is well defined even when no
    # candidate improves on zero gain.
    best_idx = best_val = best_left = best_right = None
    m = int(np.round(np.sqrt(data.shape[1] - 1)))  # number of predictors to test = sqrt of total predictors
    # select random subset of predictor column indices to test
    pred_idxs_to_test = np.random.choice(range(0, data.shape[1] - 1), m, replace=False)
    print(pred_idxs_to_test)
    for idx in pred_idxs_to_test:
        column = data[:, idx]
        # Try each distinct value once, in order of first appearance. A repeated
        # value yields the same split and can never beat the strict ">" test
        # below, so skipping repeats leaves the winner unchanged.
        _, first_rows = np.unique(column, return_index=True)
        for val in column[np.sort(first_rows)]:
            left, right = split_data(idx, val, data)
            infogain = calc_infogain(y_vals, left[:, -1], right[:, -1])
            if infogain > max_infogain:
                max_infogain = infogain
                best_idx, best_val = idx, val
                best_left, best_right = left, right
    print("max_infogain", max_infogain)
    return {"pred_idx": best_idx, "pred_val": best_val, "left": best_left, "right": best_right}

# best_split draws its predictor subset with np.random.choice, so the result can differ between runs
best_split(data)

[0 4 8]
max_infogain 0.01762366920019498


{'pred_idx': 4,
 'pred_val': 3768.69,
 'left': array([[6.1900000e+02, 0.0000000e+00, 4.2000000e+01, ..., 1.0000000e+00,
         1.0134888e+05, 1.0000000e+00],
        [8.2200000e+02, 1.0000000e+00, 5.0000000e+01, ..., 1.0000000e+00,
         1.0062800e+04, 0.0000000e+00],
        [7.2600000e+02, 0.0000000e+00, 2.4000000e+01, ..., 1.0000000e+00,
         5.4724030e+04, 0.0000000e+00],
        ...,
        [6.1300000e+02, 1.0000000e+00, 4.0000000e+01, ..., 0.0000000e+00,
         1.5132524e+05, 0.0000000e+00],
        [7.7500000e+02, 1.0000000e+00, 3.0000000e+01, ..., 0.0000000e+00,
         4.9337840e+04, 0.0000000e+00],
        [7.0900000e+02, 0.0000000e+00, 3.6000000e+01, ..., 1.0000000e+00,
         4.2085580e+04, 1.0000000e+00]]),
 'right': array([[5.0200000e+02, 0.0000000e+00, 4.2000000e+01, ..., 0.0000000e+00,
         1.1393157e+05, 1.0000000e+00],
        [6.4500000e+02, 1.0000000e+00, 4.4000000e+01, ..., 0.0000000e+00,
         1.4975671e+05, 1.0000000e+00],
        [3.7600000

In [None]:
# function to build decision tree
def build_tree(params):
    """Placeholder for the tree builder.

    ``params`` is currently ignored; a fixed status message is returned.
    TODO: implement the actual tree construction.
    """
    status = "tree has been built!"
    return status

In [None]:
# load data

In [None]:
# set number of trees

In [None]:
# set number of splits

In [None]:
# bootstrap datasets

In [None]:
# build decision tree

In [None]:
# main function

#### References
http://www.cs.cmu.edu/afs/cs.cmu.edu/academic/class/15381-s06/www/DTs.pdf