In [2]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import csv

## To Explain

### Decision Tree
* purity/impurity
* entropy vs. Gini index
* using decision tree for prediction

### Random Forest
* bootstrap aggregating
* out-of-bag (OOB) error estimation
* pros and cons of random forest

### Extra
* compare classification with scikit-learn functions vs. random forest

In [12]:
# Read ChurnTest.csv into plain Python lists: X holds one list of integer
# features per row, y holds the matching integer class label.
X, y = [], []

with open("ChurnTest.csv", newline="") as csv_file:
    rows = csv.reader(csv_file)
    next(rows)  # skip the first row (presumably the header)
    for line in rows:
        # every column but the last is a feature; the last column is the class
        X.append(list(map(int, line[:-1])))
        y.append(int(line[-1]))

print(X)
print(y)

[[619, 0], [502, 0], [645, 1], [822, 1], [376, 0], [501, 1], [684, 1], [528, 1], [616, 1], [653, 1]]
[1, 1, 1, 0, 1, 0, 0, 0, 0, 1]


In [13]:
#data = pd.read_csv("ChurnTest.csv").to_numpy()
#data.head()
#data[:,:10]

In [14]:
#np.mean(data[:,0:-1], axis=0)[1]

In [27]:
#data = pd.read_csv("Churn.csv").to_numpy()

In [6]:
#pd.read_csv("Churn.csv").head()

In [None]:
#sns.FacetGrid(data, hue="Y", height=4).map(plt.scatter, "X1", "X2").add_legend()
#plt.show()

In [None]:
# function to calculate metric for split (entropy)
#def calc_entropy(obs_list):
#    ent = 0
 #   for unique_class in set(obs_list.iloc[:,-1]):
  #      prop = len(obs_list[obs_list.iloc[:,-1]==unique_class]) / len(obs_list)
   #     ent += (-1 * prop) * np.log2(prop)
    #return ent

#calc_entropy(data)

In [15]:
# function to calculate cost function for split (entropy)
def calc_entropy(y_vals):
    """Return the Shannon entropy (in bits) of the class labels in ``y_vals``.

    For every distinct label the proportion ``p`` of observations carrying it
    is computed and ``-p * log2(p)`` is accumulated. An empty sequence has no
    labels to iterate over, so the result is 0.
    """
    n_obs = len(y_vals)
    ent = 0
    for label in set(y_vals):
        n_matching = sum(1 for val in y_vals if val == label)
        prop = n_matching / n_obs
        ent -= prop * np.log2(prop)  # subtract p * log2(p) for this class
    return ent

# Sanity check: the ChurnTest labels loaded above contain five 1s and five 0s,
# so the entropy of y is exactly 1 bit.
calc_entropy(y)
#calc_entropy(data[:,-1])
#test_list = [1, 0, 1, 1, 1]
#calc_entropy(test_list)

1.0

In [21]:
# new function to split data
def split_data(pred_idx, pred_val, X_vals, y_vals):
    """Partition observations on one predictor at a threshold.

    A row of ``X_vals`` (and its label at the same position in ``y_vals``)
    goes to the left side when ``row[pred_idx] < pred_val``; every other row
    goes to the right side. Row order is preserved within each side.

    Returns the tuple ``(X_left, X_right, y_left, y_right)`` of lists.
    """
    X_left, y_left = [], []
    X_right, y_right = [], []
    for position, row in enumerate(X_vals):
        # pick the pair of output lists this observation belongs to
        if row[pred_idx] < pred_val:
            X_side, y_side = X_left, y_left
        else:
            X_side, y_side = X_right, y_right
        X_side.append(row)
        y_side.append(y_vals[position])
    return X_left, X_right, y_left, y_right

# Demo: split the ChurnTest rows on feature 0 at threshold 510; rows whose
# value is < 510 go left, all others go right. y_left / y_right are reused by
# the calc_infogain demo call further down.
X_left, X_right, y_left, y_right = split_data(0, 510, X, y)
print("left: ", X_left, y_left)
print("right: ", X_right, y_right)

left:  [[502, 0], [376, 0], [501, 1]] [1, 1, 0]
right:  [[619, 0], [645, 1], [822, 1], [684, 1], [528, 1], [616, 1], [653, 1]] [1, 1, 0, 0, 0, 0, 1]


In [22]:
# function to split data
#def split_data(pred_idx, pred_val, data):
 #   left_node = data[data[:,pred_idx] < pred_val] # left holds obs with vals less than pred_val
  #  right_node = data[data[:,pred_idx] >= pred_val] # right holds obs with vals greater than or equal to pred_val
   # return left_node, right_node

#left, right = split_data(0, 600, data)
#print("left: ", left)
#print("right: ", right)

In [23]:
# function to calculate information gain
def calc_infogain(parent_yvals, left_yvals, right_yvals):
    """Return the information gain of splitting a parent node into two children.

    The gain is the parent's entropy minus the conditional entropy of the
    split, i.e. the children's entropies weighted by the share of the parent's
    observations each child received.
    """
    parent_entropy = calc_entropy(parent_yvals)
    n_parent = len(parent_yvals)

    # conditional entropy: size-weighted sum of the child entropies
    cond_entropy = 0
    for child_yvals in (left_yvals, right_yvals):
        weight = len(child_yvals) / n_parent
        cond_entropy += calc_entropy(child_yvals) * weight

    return parent_entropy - cond_entropy

#parent_y = data[:,-1]
#left_node, right_node = split_data(0, 600, data)
#print(left_node)
#print(right_node)
#calc_infogain(parent_y, [obs[-1] for obs in left_node], [obs[-1] for obs in right_node])
# Information gain of the demo split produced by split_data(0, 510, X, y).
calc_infogain(y, y_left, y_right)

0.034851554559677034

In [29]:
# function to determine best split (or no split)
def best_split(data):
    """Pick the best mean-value split among a random subset of predictors.

    ``round(sqrt(p))`` of the ``p`` predictor columns are drawn at random
    without replacement. Each sampled predictor is tested with one threshold,
    the column mean, and the split with the largest positive information gain
    is kept.

    Parameters
    ----------
    data : 2-D numpy array
        One observation per row. Every column but the last is a predictor; the
        last column is the class label.

    Returns
    -------
    dict
        ``pred_idx`` / ``pred_val`` are the chosen column index and threshold,
        ``left`` holds the rows with value < threshold and ``right`` the
        remaining rows (both keep the label column). When no tested split has
        positive information gain, ``pred_idx`` and ``pred_val`` are the
        sentinel 999 and ``left`` / ``right`` are None.
    """
    y_vals = data[:, -1]  # response values are the last column
    n_preds = data.shape[1] - 1
    m = int(np.round(np.sqrt(n_preds)))  # number of predictors to test = sqrt of total predictors
    pred_idxs_to_test = np.random.choice(range(0, n_preds), m, replace=False)
    # the mean of each sampled predictor is its candidate split value
    pred_vals_to_test = np.mean(data[:, 0:-1], axis=0)[pred_idxs_to_test]

    max_infogain, best_idx, best_val, best_left, best_right = 0, 999, 999, None, None
    for pred_idx, pred_val in zip(pred_idxs_to_test, pred_vals_to_test):
        # Partition the rows with a boolean mask instead of calling split_data:
        # the split_data defined in this notebook takes separate X / y lists
        # and returns four lists, so the former 3-argument call raised
        # TypeError on a fresh kernel.
        goes_left = data[:, pred_idx] < pred_val
        left, right = data[goes_left], data[~goes_left]
        infogain = calc_infogain(y_vals, left[:, -1], right[:, -1])
        if infogain > max_infogain:  # keep the split only if it strictly improves the gain
            max_infogain = infogain
            best_idx, best_val = pred_idx, pred_val
            best_left, best_right = left, right
    print("max_infogain", max_infogain)
    return {"pred_idx": best_idx, "pred_val": best_val, "left": best_left, "right": best_right}

# NOTE(review): `data` is only assigned in read_csv cells that are currently
# commented out, so on a fresh kernel this call raises NameError -- TODO
# restore the data load before this cell.
best_split(data)

max_infogain 0.10371609984200658


{'pred_idx': 2,
 'pred_val': 40.3972,
 'left': array([[3.7600000e+02, 0.0000000e+00, 2.9000000e+01, ..., 0.0000000e+00,
         1.1934688e+05, 1.0000000e+00],
        [6.8400000e+02, 1.0000000e+00, 2.7000000e+01, ..., 1.0000000e+00,
         7.1725730e+04, 0.0000000e+00],
        [5.2800000e+02, 1.0000000e+00, 3.1000000e+01, ..., 0.0000000e+00,
         8.0181120e+04, 0.0000000e+00],
        ...,
        [5.1600000e+02, 1.0000000e+00, 3.5000000e+01, ..., 1.0000000e+00,
         1.0169977e+05, 0.0000000e+00],
        [7.0900000e+02, 0.0000000e+00, 3.6000000e+01, ..., 1.0000000e+00,
         4.2085580e+04, 1.0000000e+00],
        [7.9200000e+02, 0.0000000e+00, 2.8000000e+01, ..., 0.0000000e+00,
         3.8190780e+04, 0.0000000e+00]]),
 'right': array([[6.1900000e+02, 0.0000000e+00, 4.2000000e+01, ..., 1.0000000e+00,
         1.0134888e+05, 1.0000000e+00],
        [5.0200000e+02, 0.0000000e+00, 4.2000000e+01, ..., 0.0000000e+00,
         1.1393157e+05, 1.0000000e+00],
        [6.4500000

In [30]:
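# Old version of best_split, disabled by wrapping it in a string literal and kept for reference.
# It tried every observed value of each sampled predictor as a split threshold instead of only the column mean.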
'''def best_split(data):
    y_vals = data[:,-1] # extract response values from data
    max_infogain = 0
    m = int(np.round(np.sqrt(data.shape[1]-1))) # set number of predectors to test = sqrt total predictors
    pred_vals_to_test = np.random.choice(range(0,data.shape[1]-1),m, replace=False) # select random subset of predictors to test
    print(pred_vals_to_test)
    #max_infogain, best_idx, best_val, best_left, best_right = 0, 999, 999, list(), list()
    for idx in pred_vals_to_test:
        for row in data:
            left, right = split_data(idx, row[idx], data)
            infogain = calc_infogain(y_vals, [obs[-1] for obs in left], [obs[-1] for obs in right])
            #print("infogain: ", infogain)
            if infogain > max_infogain:
                max_infogain = infogain
                best_idx = idx
                best_val = row[idx]
                best_left, best_right = left, right
    print("max_infogain", max_infogain)
    return {"pred_idx": best_idx, "pred_val": best_val, "left": best_left, "right": best_right}

best_split(data)'''

'def best_split(data):\n    y_vals = data[:,-1] # extract response values from data\n    max_infogain = 0\n    m = int(np.round(np.sqrt(data.shape[1]-1))) # set number of predectors to test = sqrt total predictors\n    pred_vals_to_test = np.random.choice(range(0,data.shape[1]-1),m, replace=False) # select random subset of predictors to test\n    print(pred_vals_to_test)\n    #max_infogain, best_idx, best_val, best_left, best_right = 0, 999, 999, list(), list()\n    for idx in pred_vals_to_test:\n        for row in data:\n            left, right = split_data(idx, row[idx], data)\n            infogain = calc_infogain(y_vals, [obs[-1] for obs in left], [obs[-1] for obs in right])\n            #print("infogain: ", infogain)\n            if infogain > max_infogain:\n                max_infogain = infogain\n                best_idx = idx\n                best_val = row[idx]\n                best_left, best_right = left, right\n    print("max_infogain", max_infogain)\n    return {"pred_idx":

In [None]:
# function to build decision tree
def build_tree(params):
    """Placeholder for the decision-tree builder.

    ``params`` is accepted but not used yet; a fixed confirmation string is
    returned.
    """
    status = "tree has been built!"
    return status

In [None]:
# load data

In [None]:
# set number of trees

In [None]:
# set number of splits

In [None]:
# bootstrap datasets

In [None]:
# build decision tree

In [None]:
# main function

#### References
http://www.cs.cmu.edu/afs/cs.cmu.edu/academic/class/15381-s06/www/DTs.pdf