# Compsci 571 Homework 2
Question 2 Variable Importance in Trees and Random Forests
Yilin Gao (yg95)
Python 3.6

In [51]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import tree
from os import system
np.random.seed(111)

In [4]:
train = np.genfromtxt('train.csv', delimiter=',', skip_header=1)
test = np.genfromtxt('test.csv', delimiter=',', skip_header=1)

train_X = train[:, 0: -1]
train_y = train[:, -1]

test_X = test[:, 0: -1]
test_y = test[:, -1]

print(train.shape)
print(test.shape)

(500, 6)
(100, 6)


## q2a1, decision stump based on the best split

In [5]:
# split a tree node on variable X[split] and threshold = thre into 2 child nodes
# parameter X: the feature matrix, shape = [n, p]
# parameter y: the label vector, shape = [n, 1]
# parameter split: the index for the splitting variable in the feature matrix, in [0, p)
# parameter thre: the splitting threshold for the splitting variable
# parameter y0: the actual value of one type of label
# parameter y1: the actual value of the other type of label
# return X_left: the feature matrix X of the subset of data with X[split] < thre, shape = [nl, p]
# return y_left: the lable vector y of the subset of data with X[split] < thre, shape = [nl, 1]
# return y_left_0: the label vector y of the subset of data with X[split] < thre and y == 0, shape = [nl0, 1]
# return y_left_1: the label vector y of the subset of data with X[split] < thre and y == 1, shape = [nl1, 1]
# return X_right: the feature matrix X of the subset of data with X[split] >= thre, shape = [nr, p]
# return y_right: the lable vector y of the subset of data with X[split] >= thre, shape = [nr, 1]
# return y_right_0: the label vector y of the subset of data with X[split] >= thre and y == 0, shape = [nr0, 1]
# return y_right_1: the label vector y of the subset of data with X[split] >= thre and y == 1, shape = [nr1, 1]
def split_binary_children(X, y, split, thre, y0 = 0, y1 = 1):
    # left branch of the splitting node
    X_left = X[X[:, split] < thre]
    y_left = y[X[:, split] < thre]
    y_left_0 = y_left[y_left == y0]
    y_left_1 = y_left[y_left == y1]
    # right branch of root
    X_right = X[X[:, split] >= thre]
    y_right = y[X[:, split] >= thre]
    y_right_0 = y_right[y_right == y0]
    y_right_1 = y_right[y_right == y1]
    return X_left, y_left, y_left_0, y_left_1, X_right, y_right, y_right_0, y_right_1

In [6]:
# computes gini index of a node with binary labels (0, 1)
# parameter n0: number of data points of one category
# parameter n1: number of data points of the other category
# return the gini index in the node
def gini(n0, n1):
    n = n0 + n1
    return 2 * n0 * n1 / n ** 2

In [7]:
# find the split split for a binary decision stump (level = 1)
# using gini index (= 2 * p * (1-p)) as the splitting criteria
# parameter X: the feature matrix, shape = [n, p]
# parameter y: the label vector, shape = [n, 1]
# parameter best_thre: the "preset" best splitting threshold for the best split variable (in binary case 0.5)
# return best: the index for the best splitting variable in X, in [0, p)
def best_split(X, y, best_thre):
    assert X.shape[0] == y.shape[0]
    n = X.shape[0]
    p = X.shape[1]
    if p == 1: # if only 1 variable in consideration, it is the best split
        return 0
    best = -1
    min_gini = 1
    for j in range(0, p): # split on variable X[j] on root
        X_left, y_left, y_left_0, y_left_1, X_right, y_right, y_right_0, y_right_1 = split_binary_children(X, y, j, best_thre)
        # left branch of root
        n_left = X_left.shape[0]
        n_left_0 = y_left_0.shape[0]
        n_left_1 = y_left_1.shape[0]
        assert n_left == n_left_1 + n_left_0 
        gini_left = gini(n_left_0, n_left_1)
        # right branch of root
        n_right = X_right.shape[0]
        n_right_0 = y_right_0.shape[0]
        n_right_1 = y_right_1.shape[0]
        assert n_right == n_right_0 + n_right_1 
        gini_right = gini(n_right_0, n_right_1)
        # gini after split
        assert n == n_left + n_right
        gini_j = n_left / n * gini_left + n_right / n * gini_right
        if gini_j < min_gini:
            best = j
            min_gini = gini_j
    return best

In [8]:
best = best_split(train_X, train_y, 0.5)
print('The best split variable index is', best)
print('The best split variable is X[' + str(best + 1) + ']')

The best split variable index is 0
The best split variable is X[1]


Relavent statistics of the decision stump are computed as following:

In [9]:
X_left, y_left, y_left_0, y_left_1, X_right, y_right, y_right_0, y_right_1 = split_binary_children(train_X, train_y, best, 0.5)
print('Number of points in the left child:', X_left.shape[0])
print('Number of points in the left child and y = 0:', y_left_0.shape[0])
print('Number of points in the left child and y = 1:', y_left_1.shape[0])
print('Number of points in the right child:', X_right.shape[0])
print('Number of points in the right child and y = 0:', y_right_0.shape[0])
print('Number of points in the right child and y = 1:', y_right_1.shape[0])
gini_root = gini(y_left.shape[0], y_right.shape[0])
gini_left_0 = gini(y_left_0.shape[0], y_left_1.shape[0])
gini_right_0 = gini(y_right_0.shape[0], y_right_1.shape[0])
print('Gini index before split:', gini_root)
print('Gini index in the left child:', gini_left_0)
print('Gini index in the right child:', gini_right_0)

Number of points in the left child: 243
Number of points in the left child and y = 0: 209
Number of points in the left child and y = 1: 34
Number of points in the right child: 257
Number of points in the right child and y = 0: 32
Number of points in the right child and y = 1: 225
Gini index before split: 0.499608
Gini index in the left child: 0.24068146793341125
Gini index in the right child: 0.21801995488198156


Equivalently we could use Sklearn package to compute the best decision stump:

In [10]:
# best split using the sklearn package, for the picture
dt = tree.DecisionTreeClassifier(max_depth = 1)
dt = dt.fit(train_X, train_y)
dotfile = open('tree_best_split.dot', 'w')
tree.export_graphviz(dt, out_file = dotfile)
dotfile.close()
system('dot -Tpng tree_best_split.dot -o ../hw2_answer/images/tree_best_split.png')
system('rm tree_best_split.dot')

0

## q2a1, decision stump based on the best surrogate split

In [11]:
# find the best surrogate split on the root for a given best split stump
# parameter X: the feature matrix, shape = [n, p]
# parameter best: the index for the best split variable in X, in [0, p)
# parameter best_thre: the splitting threshold for the best split variable X[best]
# parameter best_surr_thre: the "preset" splitting threshold for the best surrogate split variable (in binary case 0.5)
# return best_surr: the index for the best surrogate split variable in X, in [0, p)
def best_surrogate_split(X, best, best_thre, best_surr_thre):
    n = X.shape[0]
    p = X.shape[1]
    assert best >= 0 and best < p
    # pL and pR on the best split variable (not used)
    pl = X[X[:, best] < best_thre] / n
    pr = 1 - pl
    if p == 1: # no best surrogate split variable if only 1 variable in consideration
        return -1
    # pLbLj + pRbRj for all other variables that are not the best split variable
    best_surr = -1
    best_surr_sum = 0;
    for j in range(0, p):
        if j == best: # the best split variable
            continue
        plblj = X[np.logical_and(X[:, best] < best_thre, X[:, j] < best_surr_thre)].shape[0] / n
        prbrj = X[np.logical_and(X[:, best] >= best_thre, X[:, j] >= best_surr_thre)].shape[0] / n
        if plblj + prbrj > best_surr_sum:
            best_surr = j
            best_surr_sum = plblj + prbrj
    return best_surr

In [12]:
best_surr = best_surrogate_split(train_X, 0, 0.5, 0.5)
print('The best surrogate split variable index is', best_surr)
print('The best surrogate split variable is X[' + str(best_surr + 1) + ']')

The best surrogate split variable index is 1
The best surrogate split variable is X[2]


Based on comparison, the best surrogate split for X1 is X2. The splitting threshold doesn't matter since X2 values are binary. Relavent statistics of the decision stump are computed as following:

In [13]:
X_left, y_left, y_left_0, y_left_1, X_right, y_right, y_right_0, y_right_1 = split_binary_children(train_X, train_y, best_surr, 0.5)
print('Number of points in the left child:', X_left.shape[0])
print('Number of points in the left child and y = 0:', y_left_0.shape[0])
print('Number of points in the left child and y = 1:', y_left_1.shape[0])
print('Number of points in the right child:', X_right.shape[0])
print('Number of points in the right child and y = 0:', y_right_0.shape[0])
print('Number of points in the right child and y = 1:', y_right_1.shape[0])
gini_root = gini(y_left.shape[0], y_right.shape[0])
gini_left_1 = gini(y_left_0.shape[0], y_left_1.shape[0])
gini_right_1 = gini(y_right_0.shape[0], y_right_1.shape[0])
print('Gini index before split:', gini_root)
print('Gini index in the left child:', gini_left_1)
print('Gini index in the right child:', gini_right_1)

Number of points in the left child: 246
Number of points in the left child and y = 0: 176
Number of points in the left child and y = 1: 70
Number of points in the right child: 254
Number of points in the right child and y = 0: 65
Number of points in the right child and y = 1: 189
Gini index before split: 0.499872
Gini index in the left child: 0.4071650472602287
Gini index in the right child: 0.38083576167152333


## q2a2, 2 variable importance measures of all variables of the tree based on the best split

In [14]:
x1_importance_2 = gini_root - 243 / 500 * gini_left_0 - 257 / 500 * gini_right_0
x2_importance_3 = gini_root - 246 / 500 * gini_left_1 - 254 / 500 * gini_right_1
print(x1_importance_2, x2_importance_3)

0.2708385497750236 0.10608222981883361


## q2a3, mean squares error of prediction on the test data of 2 trees

In [15]:
# best split
yhat_test_tree_best_split = np.ones(100)
yhat_test_tree_best_split[test_X[:, 0] == 0] = 0
mse_test_tree_best_split = np.sum((yhat_test_tree_best_split - test_y) ** 2) / 100
print(mse_test_tree_best_split)
# best surrogate split
yhat_test_tree_best_sur_split = np.ones(100)
yhat_test_tree_best_sur_split[test_X[:, 1] == 0] = 0
mse_test_tree_best_sur_split = np.sum((yhat_test_tree_best_sur_split - test_y) ** 2) / 100
print(mse_test_tree_best_sur_split)

0.1
0.27


## q2b1 grow random forest of decision stumps

M = 1000 stumps

B = 0.8 * n bootstrap training samples

K = 1, 2, 3, 4, 5 random seleted variables

In [54]:
# parameter M: number of stumps to generate in the forest
# parameter b: bootstrap resample percentage
# parameter K: number of randomly selected features in each stump
def random_forest(X, y, M, b, K):
    np.random.seed(111)
    best_dic = {}
    best_surr_dic = {}
    imp_5 = {}
    imp_6 = {}
    trees = np.empty([0, 3]) # split, left predict, right predict
    assert X.shape[0] == y.shape[0]
    n = X.shape[0]
    B = int(round(b * n))
    p = X.shape[1]
    assert K <= p
    for j in range(p):
        best_dic[j] = 0
        best_surr_dic[j] = 0
        imp_5[j] = np.empty([0, 1])
        imp_6[j] = np.empty([0, 1])
    for m in range(M): # tree
        n_idx = np.random.choice(n, B, replace = False)
        n_oobidx = list(set(range(n)) - set(n_idx))
        feature_idx = np.random.choice(p, K, replace = True) # indices for selected features
        y_sample = y[n_idx, :]
        X_sample = X[n_idx, :]
        X_sample = X_sample[:, feature_idx]
        best_idx = best_split(X_sample.reshape(B, K), y_sample.reshape(B, 1), 0.5) # the "false" best split variable index in feature_idx
        best_surr_idx = best_surrogate_split(X_sample.reshape(B, K), best_idx, 0.5, 0.5) # the "false" best surrogate variable index in feature_idx
        # update counter        
        best = feature_idx[best_idx] # the "real" best split variable index in X
        best_dic[best] = best_dic[best] + 1
        if best_surr_idx != -1: # the best surrogate splitting variable doesn't exist (K = 1)
            best_surr = feature_idx[best_surr_idx] # the "real" best surrogate variable index in X
            best_surr_dic[best_surr] = best_surr_dic[best_surr] + 1
        # importance 5
        _, y_left, y_left_0, y_left_1, _, y_right, y_right_0, y_right_1 = split_binary_children(X_sample, y_sample, best_idx, 0.5)
        gini_root = gini(y_left.shape[0], y_right.shape[0])
        gini_left = gini(y_left_0.shape[0], y_left_1.shape[0])
        gini_right = gini(y_right_0.shape[0], y_right_1.shape[0])
        delta_gini = gini_root - y_left.shape[0] / n * gini_left - y_right.shape[0] / n * gini_right
        imp_5[best] = np.append(imp_5[best], np.array([delta_gini]).reshape(1, 1), axis = 0)
        # importance 6 TODO
        if y_left_0.shape[0] >= (y_left_0.shape[0] + y_left_1.shape[0]) / 2: # the left branch is predicted as 0
            left_predict = 0
            right_predict = 1
        else:
            left_predict = 1
            right_predict = 0
        trees = np.append(trees, np.array([best, left_predict, right_predict]).reshape(1, 3), axis = 0)
        # out-of-bag
        y_oob = y[n_oobidx, :]
        X_oob = X[n_oobidx, :]
        X_oob = X_oob[:, best]
        # if X value < 0.5, goes to left, else goes to right
        yhat_oob = np.ones((n - B, 1))
        if y_left_0.shape[0] >= (y_left_0.shape[0] + y_left_1.shape[0]) / 2: # the left branch is predicted as 0
            yhat_oob[X_oob < 0.5] = 0
        else: # the right branch is predicted as 0
            yhat_oob[X_oob >= 0.5] = 0
        err_oob = np.sum((yhat_oob - y_oob) ** 2) / (n - B)
        # permute X[best]
        np.random.shuffle(X_oob)
        yhat_oob_perm = np.ones((n - B, 1))
        if y_left_0.shape[0] >= (y_left_0.shape[0] + y_left_1.shape[0]) / 2: # the left branch is predicted as 0
            yhat_oob_perm[X_oob < 0.5] = 0
        else: # the right branch is predicted as 0
            yhat_oob_perm[X_oob >= 0.5] = 0
        err_oob_perm = np.sum((yhat_oob_perm - y_oob) ** 2) / (n - B)
        imp_6[best] = np.append(imp_6[best], np.array([err_oob_perm - err_oob]).reshape(1, 1), axis = 0)
    return best_dic, best_surr_dic, imp_5, imp_6, trees

## q2b1

For each K = 1, 2, 3, 4, 5:

    how many times each variable is the best split
    
    how many times each variable is the best surrogate split

In [56]:
best_dic_list = []
imp_5_list = []
imp_6_list = []
trees_list = []
for k in range(1, 6):
    best_dic, best_surr_dic, imp_5, imp_6, trees = random_forest(train_X, train_y.reshape((500, 1)), 1000, 0.8, k)
    best_dic_list.append(best_dic)
    imp_5_list.append(imp_5)
    imp_6_list.append(imp_6)
    trees_list.append(trees)
    print('K = ' + str(k) + ':') 
    print('The map for best split variable is:', best_dic)
    print('The map for best surrogate split variable is:', best_surr_dic)
    print('====================')

K = 1:
The map for best split variable is: {0: 200, 1: 205, 2: 196, 3: 209, 4: 190}
The map for best surrogate split variable is: {0: 0, 1: 0, 2: 0, 3: 0, 4: 0}
K = 2:
The map for best split variable is: {0: 333, 1: 278, 2: 157, 3: 156, 4: 76}
The map for best surrogate split variable is: {0: 29, 1: 115, 2: 275, 3: 263, 4: 318}
K = 3:
The map for best split variable is: {0: 509, 1: 283, 2: 79, 3: 93, 4: 36}
The map for best surrogate split variable is: {0: 109, 1: 254, 2: 237, 3: 167, 4: 233}
K = 4:
The map for best split variable is: {0: 588, 1: 280, 2: 56, 3: 53, 4: 23}
The map for best surrogate split variable is: {0: 177, 1: 345, 2: 173, 3: 107, 4: 198}
K = 5:
The map for best split variable is: {0: 668, 1: 237, 2: 46, 3: 29, 4: 20}
The map for best surrogate split variable is: {0: 258, 1: 398, 2: 127, 3: 68, 4: 149}


## q2b2

For each k = 1, 2, 3, 4, 5:

    compute 2 variable importance measures for each variable

In [57]:
for k in range(5):
    imp_5 = imp_5_list[k]
    imp_6 = imp_6_list[k]
    print('K = {}'.format(k))
    for j in range(5): # variable
        imp = imp_5[j]
        imp = np.sum(imp) / imp.shape[0]
        print('Variable importance (5) for variable {} is {}.'.format(j, imp))
        imp = imp_6[j]
        imp = np.sum(imp) / imp.shape[0]
        print('Variable importance (6) for variable {} is {}.'.format(j, imp))

K = 0
Variable importance (5) for variable 0 is 0.31559132460472206.
Variable importance (6) for variable 0 is 0.36589999999999995.
Variable importance (5) for variable 1 is 0.1845434239968977.
Variable importance (6) for variable 1 is 0.2304390243902439.
Variable importance (5) for variable 2 is 0.0870955088035869.
Variable importance (6) for variable 2 is -0.01795918367346939.
Variable importance (5) for variable 3 is 0.10056961508258605.
Variable importance (6) for variable 3 is 0.009760765550239234.
Variable importance (5) for variable 4 is 0.09151885300852307.
Variable importance (6) for variable 4 is -0.02631578947368421.
K = 1
Variable importance (5) for variable 0 is 0.31559759776985774.
Variable importance (6) for variable 0 is 0.36474474474474466.
Variable importance (5) for variable 1 is 0.1844875549486905.
Variable importance (6) for variable 1 is 0.22741007194244603.
Variable importance (5) for variable 2 is 0.08667758851378417.
Variable importance (6) for variable 2 is -0

## q2b3

compute the mean squares loss on the test data using 2 methods:

1. use the majority vote of the stumps as the prediction

2. find the prediction of each stump, compute squares loss on each, and average the results

In [58]:
for k in range(5):
    best_dic = best_dic_list[k]
    trees = trees_list[k]
    yhat_test_votes = np.zeros([100, 1]) # count of vote for 1
    err1 = 0
    err2 = 0
    for m in range(1000):
        split = int(trees[m, 0])
        left_predict = trees[m, 1]
        yhat_test = np.ones([test_y.shape[0], 1])
        X_split = test_X[:, split]
        if left_predict == 0:
            yhat_test[X_split < 0.5] = 0
        else:
            yhat_test[X_split >= 0.5] = 0
        yhat_test_votes = yhat_test_votes + yhat_test # if the current vote is 1, add 1 to yhat_test_votes count
        err_tree = np.sum((yhat_test - test_y) ** 2) / test_y.shape[0]
        err2 = err2 + err_tree
    yhat_test_votes[yhat_test_votes <= 500] = 0
    yhat_test_votes[yhat_test_votes > 500] = 1
    err1 = np.sum((yhat_test_votes - test_y) ** 2) / test_y.shape[0]
    err2 = err2 / 1000
    print(err1, err2)

49.4 49.705879999999496
49.4 49.5607399999994
49.46 49.4446999999993
49.46 49.38655999999923
49.46 49.38451999999923


## q2c grow random forest of decision stumps

B = q * n, q = 0.4, 0.5, 0.6, 0.7, 0.8

K = 2

M = 1000

In [60]:
best_dic_list = []
imp_5_list = []
imp_6_list = []
trees_list = []
for q in range(4, 9):
    best_dic, best_surr_dic, imp_5, imp_6, trees = random_forest(train_X, train_y.reshape((500, 1)), 1000, q / 10, 2)
    best_dic_list.append(best_dic)
    imp_5_list.append(imp_5)
    imp_6_list.append(imp_6)
    trees_list.append(trees)
    print('q = {}:'.format(q / 10)) 
    print('The map for best split variable is:', best_dic)
    print('The map for best surrogate split variable is:', best_surr_dic)
    print('====================')

q = 0.4:
The map for best split variable is: {0: 345, 1: 296, 2: 130, 3: 119, 4: 110}
The map for best surrogate split variable is: {0: 42, 1: 110, 2: 287, 3: 292, 4: 269}
q = 0.5:
The map for best split variable is: {0: 364, 1: 255, 2: 127, 3: 144, 4: 110}
The map for best surrogate split variable is: {0: 42, 1: 117, 2: 273, 3: 278, 4: 290}
q = 0.6:
The map for best split variable is: {0: 364, 1: 252, 2: 128, 3: 161, 4: 95}
The map for best surrogate split variable is: {0: 40, 1: 118, 2: 288, 3: 277, 4: 277}
q = 0.7:
The map for best split variable is: {0: 372, 1: 282, 2: 131, 3: 114, 4: 101}
The map for best surrogate split variable is: {0: 41, 1: 126, 2: 265, 3: 259, 4: 309}
q = 0.8:
The map for best split variable is: {0: 333, 1: 278, 2: 157, 3: 156, 4: 76}
The map for best surrogate split variable is: {0: 29, 1: 115, 2: 275, 3: 263, 4: 318}


## q2c1 variable importance in (5) and (6)

In [61]:
for q in range(5):
    imp_5 = imp_5_list[q]
    imp_6 = imp_6_list[q]
    print('q = {}'.format((q + 4) / 10))
    for j in range(5): # variable
        imp = imp_5[j]
        imp = np.sum(imp) / imp.shape[0]
        print('Variable importance (5) for variable {} is {}.'.format(j, imp))
        imp = imp_6[j]
        imp = np.sum(imp) / imp.shape[0]
        print('Variable importance (6) for variable {} is {}.'.format(j, imp))

q = 0.4
Variable importance (5) for variable 0 is 0.40745183416409836.
Variable importance (6) for variable 0 is 0.3671304347826087.
Variable importance (5) for variable 1 is 0.34193221730413004.
Variable importance (6) for variable 1 is 0.22963963963963963.
Variable importance (5) for variable 2 is 0.28646633665411275.
Variable importance (6) for variable 2 is -0.015230769230769228.
Variable importance (5) for variable 3 is 0.3003327494840429.
Variable importance (6) for variable 3 is 0.0017366946778711466.
Variable importance (5) for variable 4 is 0.2908609318307444.
Variable importance (6) for variable 4 is -0.012727272727272726.
q = 0.5
Variable importance (5) for variable 0 is 0.3837443001015836.
Variable importance (6) for variable 0 is 0.3674065934065934.
Variable importance (5) for variable 1 is 0.3014075918732874.
Variable importance (6) for variable 1 is 0.2339450980392157.
Variable importance (5) for variable 2 is 0.2367689168288802.
Variable importance (6) for variable 2 is

## q2c2 standard deviation of variable imporance in (5) and (6)

In [62]:
for q in range(5):
    imp_5 = imp_5_list[q]
    imp_6 = imp_6_list[q]
    print('q = {}'.format((q + 4) / 10))
    for j in range(5): # variable
        imp = imp_5[j]
        std = np.std(imp, axis = 0)
        print('Standard deviation of variable importance (5) for variable {} is {}.'.format(j, std))
        imp = imp_6[j]
        std = np.std(imp, axis = 0)
        print('Standard deviation of variable importance (6) for variable {} is {}.'.format(j, std))

q = 0.4
Standard deviation of variable importance (5) for variable 0 is [0.01112367].
Standard deviation of variable importance (6) for variable 0 is [0.03103856].
Standard deviation of variable importance (5) for variable 1 is [0.00870762].
Standard deviation of variable importance (6) for variable 1 is [0.03255196].
Standard deviation of variable importance (5) for variable 2 is [0.00952271].
Standard deviation of variable importance (6) for variable 2 is [0.03317124].
Standard deviation of variable importance (5) for variable 3 is [0.00323389].
Standard deviation of variable importance (6) for variable 3 is [0.04077185].
Standard deviation of variable importance (5) for variable 4 is [0.00681136].
Standard deviation of variable importance (6) for variable 4 is [0.0327284].
q = 0.5
Standard deviation of variable importance (5) for variable 0 is [0.01124836].
Standard deviation of variable importance (6) for variable 0 is [0.03793574].
Standard deviation of variable importance (5) for