# Compsci 571 Homework 2
Question 2 Variable Importance in Trees and Random Forests
Yilin Gao (yg95)
Python 3.6

In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import tree
from os import system

In [2]:
train = np.genfromtxt('train.csv', delimiter=',', skip_header=1)
test = np.genfromtxt('test.csv', delimiter=',', skip_header=1)

train_X = train[:, 0: -1]
train_y = train[:, -1]

test_X = test[:, 0: -1]
test_y = test[:, -1]

print(train.shape)
print(test.shape)

(500, 6)
(100, 6)


## q2a1, decision stump based on the best split

In [26]:
# split a tree node on variable X[split] and threshold = thre into 2 child nodes
# parameter X: the feature matrix, shape = [n, p]
# parameter y: the label vector, shape = [n, 1]
# parameter split: the index for the splitting variable in the feature matrix, in [0, p)
# parameter thre: the splitting threshold for the splitting variable
# parameter y0: the actual value of one type of label
# parameter y1: the actual value of the other type of label
# return X_left: the feature matrix X of the subset of data with X[split] < thre, shape = [nl, p]
# return y_left: the lable vector y of the subset of data with X[split] < thre, shape = [nl, 1]
# return y_left_0: the label vector y of the subset of data with X[split] < thre and y == 0, shape = [nl0, 1]
# return y_left_1: the label vector y of the subset of data with X[split] < thre and y == 1, shape = [nl1, 1]
# return X_right: the feature matrix X of the subset of data with X[split] >= thre, shape = [nr, p]
# return y_right: the lable vector y of the subset of data with X[split] >= thre, shape = [nr, 1]
# return y_right_0: the label vector y of the subset of data with X[split] >= thre and y == 0, shape = [nr0, 1]
# return y_right_1: the label vector y of the subset of data with X[split] >= thre and y == 1, shape = [nr1, 1]
def split_binary_children(X, y, split, thre, y0 = 0, y1 = 1):
    # left branch of the splitting node
    X_left = X[X[:, split] < thre]
    y_left = y[X[:, split] < thre]
    y_left_0 = y_left[y_left == y0]
    y_left_1 = y_left[y_left == y1]
    # right branch of root
    X_right = X[X[:, split] >= thre]
    y_right = y[X[:, split] >= thre]
    y_right_0 = y_right[y_right == y0]
    y_right_1 = y_right[y_right == y1]
    return X_left, y_left, y_left_0, y_left_1, X_right, y_right, y_right_0, y_right_1

In [30]:
# computes gini index of a node with binary labels (0, 1)
# parameter n0: number of data points of one category
# parameter n1: number of data points of the other category
# return the gini index in the node
def gini(n0, n1):
    n = n0 + n1
    return 2 * n0 * n1 / n ** 2

In [35]:
# find the split split for a binary decision stump (level = 1)
# using gini index (= 2 * p * (1-p)) as the splitting criteria
# parameter X: the feature matrix, shape = [n, p]
# parameter y: the label vector, shape = [n, 1]
# parameter best_thre: the "preset" best splitting threshold for the best split variable (in binary case 0.5)
# return best: the index for the best splitting variable in X, in [0, p)
def best_split(X, y, best_thre):
    assert X.shape[0] == y.shape[0]
    n = X.shape[0]
    p = X.shape[1]
    best = -1
    min_gini = 0.5
    for j in range(0, p): # split on variable X[j] on root
        X_left, y_left, y_left_0, y_left_1, X_right, y_right, y_right_0, y_right_1 = split_binary_children(X, y, j, best_thre)
        # left branch of root
        n_left = X_left.shape[0]
        n_left_0 = y_left_0.shape[0]
        n_left_1 = y_left_1.shape[0]
        assert n_left == n_left_1 + n_left_0 
        gini_left = gini(n_left_0, n_left_1)
        # right branch of root
        n_right = X_right.shape[0]
        n_right_0 = y_right_0.shape[0]
        n_right_1 = y_right_1.shape[0]
        assert n_right == n_right_0 + n_right_1 
        gini_right = gini(n_right_0, n_right_1)
        # gini after split
        assert n == n_left + n_right
        gini_j = n_left / n * gini_left + n_right / n * gini_right
        if gini_j < min_gini:
            best = j
            min_gini = gini_j
    return best

In [36]:
best = best_split(train_X, train_y, 0.5)
print('The best split variable index is', best)
print('The best split variable is X[' + str(best + 1) + ']')

The best split variable index is 0
The best split variable is X[1]


Relavent statistics of the decision stump are computed as following:

In [45]:
X_left, y_left, y_left_0, y_left_1, X_right, y_right, y_right_0, y_right_1 = split_binary_children(train_X, train_y, best, 0.5)
print('Number of points in the left child:', X_left.shape[0])
print('Number of points in the left child and y = 0:', y_left_0.shape[0])
print('Number of points in the left child and y = 1:', y_left_1.shape[0])
print('Number of points in the right child:', X_right.shape[0])
print('Number of points in the right child and y = 0:', y_right_0.shape[0])
print('Number of points in the right child and y = 1:', y_right_1.shape[0])
gini_root = gini(y_left.shape[0], y_right.shape[0])
gini_left_0 = gini(y_left_0.shape[0], y_left_1.shape[0])
gini_right_0 = gini(y_right_0.shape[0], y_right_1.shape[0])
print('Gini index before split:', gini_root)
print('Gini index in the left child:', gini_left_0)
print('Gini index in the right child:', gini_right_0)

Number of points in the left child: 243
Number of points in the left child and y = 0: 209
Number of points in the left child and y = 1: 34
Number of points in the right child: 257
Number of points in the right child and y = 0: 32
Number of points in the right child and y = 1: 225
Gini index before split: 0.499608
Gini index in the left child: 0.24068146793341125
Gini index in the right child: 0.21801995488198156


Equivalently we could use Sklearn package to compute the best decision stump:

In [3]:
# best split using the sklearn package, for the picture
dt = tree.DecisionTreeClassifier(max_depth = 1)
dt = dt.fit(train_X, train_y)
dotfile = open('tree_best_split.dot', 'w')
tree.export_graphviz(dt, out_file = dotfile)
dotfile.close()
system('dot -Tpng tree_best_split.dot -o ../hw2_answer/images/tree_best_split.png')
system('rm tree_best_split.dot')

0

## q2a1, decision stump based on the best surrogate split

In [11]:
# find the best surrogate split on the root for a given best split stump
# parameter X: the feature matrix, shape = [n, p]
# parameter best: the index for the best split variable in X, in [0, p)
# parameter best_thre: the splitting threshold for the best split variable X[best]
# parameter best_surr_thre: the "preset" splitting threshold for the best surrogate split variable (in binary case 0.5)
# return best_surr: the index for the best surrogate split variable in X, in [0, p)
def best_surrogate_split(X, best, best_thre, best_surr_thre):
    n = X.shape[0]
    p = X.shape[1]
    assert best >= 0 and best < p
    # pL and pR on the best split variable (not used)
    pl = X[X[:, best] < best_thre] / n
    pr = 1 - pl
    # pLbLj + pRbRj for all other variables that are not the best split variable
    best_surr = -1
    best_surr_sum = 0;
    for j in range(0, p):
        if j == best: # the best split variable
            continue;
        plblj = X[np.logical_and(X[:, best] < best_thre, X[:, j] < best_surr_thre)].shape[0] / n
        prbrj = X[np.logical_and(X[:, best] >= best_thre, X[:, j] >= best_surr_thre)].shape[0] / n
        if plblj + prbrj > best_surr_sum:
            best_surr = j
            best_surr_sum = plblj + prbrj
    return best_surr

In [10]:
best_surr = best_surrogate_split(train_X, 0, 0.5, 0.5)
print('The best surrogate split variable index is', best_surr)
print('The best surrogate split variable is X[' + str(best_surr + 1) + ']')

The best surrogate split variable index is 1
The best surrogate split variable is X[2]


Based on comparison, the best surrogate split for X1 is X2. The splitting threshold doesn't matter since X2 values are binary. Relavent statistics of the decision stump are computed as following:

In [44]:
X_left, y_left, y_left_0, y_left_1, X_right, y_right, y_right_0, y_right_1 = split_binary_children(train_X, train_y, best_surr, 0.5)
print('Number of points in the left child:', X_left.shape[0])
print('Number of points in the left child and y = 0:', y_left_0.shape[0])
print('Number of points in the left child and y = 1:', y_left_1.shape[0])
print('Number of points in the right child:', X_right.shape[0])
print('Number of points in the right child and y = 0:', y_right_0.shape[0])
print('Number of points in the right child and y = 1:', y_right_1.shape[0])
gini_root = gini(y_left.shape[0], y_right.shape[0])
gini_left_1 = gini(y_left_0.shape[0], y_left_1.shape[0])
gini_right_1 = gini(y_right_0.shape[0], y_right_1.shape[0])
print('Gini index before split:', gini_root)
print('Gini index in the left child:', gini_left_1)
print('Gini index in the right child:', gini_right_1)

Number of points in the left child: 246
Number of points in the left child and y = 0: 176
Number of points in the left child and y = 1: 70
Number of points in the right child: 254
Number of points in the right child and y = 0: 65
Number of points in the right child and y = 1: 189
Gini index before split: 0.499872
Gini index in the left child: 0.4071650472602287
Gini index in the right child: 0.38083576167152333


## q2a2, 2 variable importance measures of all variables of the tree based on the best split

In [46]:
x1_importance_2 = gini_root - 243 / 500 * gini_left_0 - 257 / 500 * gini_right_0
x2_importance_3 = gini_root - 246 / 500 * gini_left_1 - 254 / 500 * gini_right_1
print(x1_importance_2, x2_importance_3)

0.27057454977502365 0.10581822981883363


## q2a3, mean squares error of prediction on the test data of 2 trees

In [47]:
# best split
yhat_test_tree_best_split = np.ones(100)
yhat_test_tree_best_split[test_X[:, 0] == 0] = 0
mse_test_tree_best_split = np.sum((yhat_test_tree_best_split - test_y) ** 2) / 100
print(mse_test_tree_best_split)
# best surrogate split
yhat_test_tree_best_sur_split = np.ones(100)
yhat_test_tree_best_sur_split[test_X[:, 1] == 0] = 0
mse_test_tree_best_sur_split = np.sum((yhat_test_tree_best_sur_split - test_y) ** 2) / 100
print(mse_test_tree_best_sur_split)

0.1
0.27
