In [1]:
%matplotlib inline


# Understanding the decision tree structure


The decision tree structure can be analysed to gain further insight on the
relation between the features and the target to predict. In this example, we
show how to retrieve:

- the binary tree structure;
- the depth of each node and whether or not it's a leaf;
- the nodes that were reached by a sample using the ``decision_path`` method;
- the leaf that was reached by a sample using the apply method;
- the rules that were used to predict a sample;
- the decision path shared by a group of samples.




In [1]:
import numpy as np
import pandas as pd
import os

from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier

In [6]:
loanData = pd.read_csv("DataSets/clean_loan_data_2017.csv", low_memory=False)
loanData.head()

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,...,hardship_payoff_balance_amount,hardship_last_payment_amount,disbursement_method,debt_settlement_flag,debt_settlement_flag_date,settlement_status,settlement_date,settlement_amount,settlement_percentage,settlement_term
0,,,11875,11875,11875,36 months,11.44%,391.26,B,B4,...,,,Cash,N,,,,,,
1,,,1500,1500,1500,36 months,5.32%,45.18,A,A1,...,,,Cash,N,,,,,,
2,,,35000,35000,35000,60 months,25.49%,1037.38,E,E4,...,,,Cash,N,,,,,,
3,,,12000,12000,12000,36 months,6.99%,370.48,A,A2,...,,,Cash,N,,,,,,
4,,,14000,14000,14000,36 months,8.24%,440.27,B,B1,...,,,Cash,N,,,,,,


In [8]:
data = loanData.drop(["funded_amnt_inv", "dti", "inq_last_6mths", "open_acc", "pub_rec", "revol_bal", "out_prncp_inv", 
"total_rec_prncp", "total_rec_late_fee", "recoveries", "collection_recovery_fee", "collections_12_mths_ex_med",
"policy_code", "tot_coll_amt", "open_acc_6m", "open_act_il", "open_il_12m", "open_il_24m", "mths_since_rcnt_il",
"total_bal_il", "il_util", "open_rv_12m", "open_rv_24m", "all_util", "total_rev_hi_lim", "total_cu_tl",
"inq_last_12m", "chargeoff_within_12_mths", "mo_sin_old_il_acct", "mo_sin_old_rev_tl_op", "mo_sin_rcnt_rev_tl_op", 
"mo_sin_rcnt_tl", "mort_acc", "mths_since_recent_bc", "mths_since_recent_inq", "num_actv_bc_tl", "num_actv_rev_tl", 
"num_bc_sats", "num_bc_tl", "num_il_tl", "num_op_rev_tl", "num_rev_accts", "num_rev_tl_bal_gt_0", "num_sats", "percent_bc_gt_75", 
"pub_rec_bankruptcies", "tax_liens", "total_bc_limit", "total_il_high_credit_limit",
"orig_projected_additional_accrued_interest","last_pymnt_d","next_pymnt_d",
"grade", "sub_grade", "emp_title", "emp_length", "home_ownership", "verification_status", "issue_d", "loan_status", "pymnt_plan",
"url", "desc", "purpose", "title", "zip_code", "addr_state", "earliest_cr_line", "mths_since_last_delinq", 
"mths_since_last_record", "revol_util", "initial_list_status", "last_pymnt_d", "next_pymnt_d", "last_credit_pull_d", 
"mths_since_last_major_derog", "application_type", "annual_inc_joint", "dti_joint", "verification_status_joint", 
"bc_open_to_buy", "bc_util", "mths_since_recent_bc_dlq", "mths_since_recent_revol_delinq", "num_tl_120dpd_2m",
"revol_bal_joint", "sec_app_earliest_cr_line", "sec_app_inq_last_6mths", "sec_app_mort_acc", "sec_app_open_acc", 
"sec_app_revol_util", "sec_app_open_act_il", "sec_app_num_rev_accts", "sec_app_chargeoff_within_12_mths", 
"sec_app_collections_12_mths_ex_med", "sec_app_mths_since_last_major_derog", "hardship_flag", "hardship_type", 
"hardship_reason", "hardship_status", "deferral_term", "hardship_amount", "hardship_start_date", "hardship_end_date",
"payment_plan_start_date", "hardship_length", "hardship_dpd", "hardship_loan_status", "hardship_payoff_balance_amount", 
"hardship_last_payment_amount", "disbursement_method", "debt_settlement_flag", "debt_settlement_flag_date", "settlement_status",
"settlement_date", "settlement_amount", "settlement_percentage", "settlement_term","int_rate","term","id","member_id",
"acc_now_delinq","delinq_amnt","num_tl_30dpd","num_tl_90g_dpd_24m","delinq_2yrs","num_accts_ever_120_pd","inq_fi",
"num_tl_op_past_12m","pct_tl_nvr_dlq","acc_open_past_24mths","total_acc","tot_cur_bal","max_bal_bc",
"avg_cur_bal","total_bal_ex_mort","tot_hi_cred_lim","funded_amnt","loan_amnt","installment","total_rec_int",
"total_pymnt_inv","total_pymnt","annual_inc"],axis=1)

feature_names = data.columns


data.isnull().any()


out_prncp          False
last_pymnt_amnt    False
dtype: bool

In [30]:
X = data

target = loan["loan_status"]
target_names = ["current", "charged off","fully paid","in grace period","late (16-30 days)", "late (31-120 days)"]
y = target

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

estimator = DecisionTreeClassifier(max_leaf_nodes=2, random_state=0)
estimator.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=2, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=0, splitter='best')

In [31]:
# Using those arrays, we can parse the tree structure:

n_nodes = estimator.tree_.node_count
children_left = estimator.tree_.children_left
children_right = estimator.tree_.children_right
feature = estimator.tree_.feature
threshold = estimator.tree_.threshold
print(n_nodes )
print(children_left)
print(children_right)
print(feature)
print(threshold)

3
[ 1 -1 -1]
[ 2 -1 -1]
[ 0 -2 -2]
[ 0.27500001 -2.         -2.        ]


In [38]:
node_depth = data.zero(shape=n_nodes, dtype=data.float)
is_leaves = data.zero(shape=n_nodes, dtype=bool)
stack = [(0, -1)]  # seed is the root node id and its parent depth

while len(stack) > 0:
    node_id, parent_depth = stack.pop()
    node_depth[node_id] = parent_depth + 1

    # If we have a test node
    if (children_left[node_id] != children_right[node_id]):
        stack.append((children_left[node_id], parent_depth + 1))
        stack.append((children_right[node_id], parent_depth + 1))
    else:
        is_leaves[node_id] = True

print("The binary tree structure has %s nodes and has "
      "the following tree structure:"
      % n_nodes)
for i in range(n_nodes):
    if is_leaves[i]:
        print("%snode=%s leaf node." % (node_depth[i] * "\t", i))
    else:
        print("%snode=%s test node: go to node %s if X[:, %s] <= %s else to "
              "node %s."
              % (node_depth[i] * "\t",
                 i,
                 children_left[i],
                 feature[i],
                 threshold[i],
                 children_right[i],
                 ))
print()

AttributeError: 'DataFrame' object has no attribute 'zero'

In [18]:
# First let's retrieve the decision path of each sample. The decision_path
# method allows to retrieve the node indicator functions. A non zero element of
# indicator matrix at the position (i, j) indicates that the sample i goes
# through the node j.

node_indicator = estimator.decision_path(X_test)
print(node_indicator)

  (0, 0)	1
  (0, 2)	1
  (1, 0)	1
  (1, 2)	1
  (2, 0)	1
  (2, 2)	1
  (3, 0)	1
  (3, 2)	1
  (4, 0)	1
  (4, 2)	1
  (5, 0)	1
  (5, 2)	1
  (6, 0)	1
  (6, 1)	1
  (6, 4)	1
  (7, 0)	1
  (7, 2)	1
  (8, 0)	1
  (8, 2)	1
  (9, 0)	1
  (9, 1)	1
  (9, 4)	1
  (10, 0)	1
  (10, 2)	1
  (11, 0)	1
  :	:
  (10750, 3)	1
  (10751, 0)	1
  (10751, 2)	1
  (10752, 0)	1
  (10752, 2)	1
  (10753, 0)	1
  (10753, 2)	1
  (10754, 0)	1
  (10754, 2)	1
  (10755, 0)	1
  (10755, 2)	1
  (10756, 0)	1
  (10756, 2)	1
  (10757, 0)	1
  (10757, 2)	1
  (10758, 0)	1
  (10758, 1)	1
  (10758, 4)	1
  (10759, 0)	1
  (10759, 1)	1
  (10759, 3)	1
  (10760, 0)	1
  (10760, 2)	1
  (10761, 0)	1
  (10761, 2)	1


In [20]:
# Similarly, we can also have the leaves ids reached by each sample.

leave_id = estimator.apply(X_test)
print(leave_id)

[2 2 2 ... 3 2 2]


In [23]:
# Now, it's possible to get the tests that were used to predict a sample or
# a group of samples. First, let's make it for the sample.

sample_id = 0
node_index = node_indicator.indices[node_indicator.indptr[sample_id]:
                                    node_indicator.indptr[sample_id + 1]]
print(node_index)

print('Rules used to predict sample %s: ' % sample_id)

[0 2]
Rules used to predict sample 0: 


In [24]:
for node_id in node_index:
    if leave_id[sample_id] == node_id:
        continue

    if (X_test[sample_id, feature[node_id]] <= threshold[node_id]):
        threshold_sign = "<="
    else:
        threshold_sign = ">"

    print("decision id node %s : (X_test[%s, %s] (= %s) %s %s)"
          % (node_id,
             sample_id,
             feature[node_id],
             X_test[sample_id, feature[node_id]],
             threshold_sign,
             threshold[node_id]))

KeyError: (0, 0)

In [16]:
# First let's retrieve the decision path of each sample. The decision_path
# method allows to retrieve the node indicator functions. A non zero element of
# indicator matrix at the position (i, j) indicates that the sample i goes
# through the node j.

node_indicator = estimator.decision_path(X_test)

# Similarly, we can also have the leaves ids reached by each sample.

leave_id = estimator.apply(X_test)

# Now, it's possible to get the tests that were used to predict a sample or
# a group of samples. First, let's make it for the sample.

sample_id = 0
node_index = node_indicator.indices[node_indicator.indptr[sample_id]:
                                    node_indicator.indptr[sample_id + 1]]

print('Rules used to predict sample %s: ' % sample_id)
for node_id in node_index:
    if leave_id[sample_id] == node_id:
        continue

    if (X_test[sample_id, feature[node_id]] <= threshold[node_id]):
        threshold_sign = "<="
    else:
        threshold_sign = ">"

    print("decision id node %s : (X_test[%s, %s] (= %s) %s %s)"
          % (node_id,
             sample_id,
             feature[node_id],
             X_test[sample_id, feature[node_id]],
             threshold_sign,
             threshold[node_id]))

# For a group of samples, we have the following common node.
sample_ids = [0, 1]
common_nodes = (node_indicator.toarray()[sample_ids].sum(axis=0) ==
                len(sample_ids))

common_node_id = np.arange(n_nodes)[common_nodes]

print("\nThe following samples %s share the node %s in the tree"
      % (sample_ids, common_node_id))
print("It is %s %% of all nodes." % (100 * len(common_node_id) / n_nodes,))

Rules used to predict sample 0: 


KeyError: (0, 0)

In [14]:
for node_id in node_index:
    if leave_id[sample_id] == node_id:
        continue

    if (X_test[sample_id, feature[node_id]] <= threshold[node_id]):
        threshold_sign = "<="
    else:
        threshold_sign = ">"

    print("decision id node %s : (X_test[%s, %s] (= %s) %s %s)"
          % (node_id,
             sample_id,
             feature[node_id],
             X_test[sample_id, feature[node_id]],
             threshold_sign,
             threshold[node_id]))

# For a group of samples, we have the following common node.
sample_ids = [0, 1]
common_nodes = (node_indicator.toarray()[sample_ids].sum(axis=0) ==
                len(sample_ids))

common_node_id = np.arange(n_nodes)[common_nodes]

print("\nThe following samples %s share the node %s in the tree"
      % (sample_ids, common_node_id))
print("It is %s %% of all nodes." % (100 * len(common_node_id) / n_nodes,))

KeyError: (0, 0)

In [15]:

X = loan.data
y = loan.target
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

estimator = DecisionTreeClassifier(max_leaf_nodes=3, random_state=0)
estimator.fit(X_train, y_train)

# The decision estimator has an attribute called tree_  which stores the entire
# tree structure and allows access to low level attributes. The binary tree
# tree_ is represented as a number of parallel arrays. The i-th element of each
# array holds information about the node `i`. Node 0 is the tree's root. NOTE:
# Some of the arrays only apply to either leaves or split nodes, resp. In this
# case the values of nodes of the other type are arbitrary!
#
# Among those arrays, we have:
#   - left_child, id of the left child of the node
#   - right_child, id of the right child of the node
#   - feature, feature used for splitting the node
#   - threshold, threshold value at the node
#

# Using those arrays, we can parse the tree structure:

n_nodes = estimator.tree_.node_count
children_left = estimator.tree_.children_left
children_right = estimator.tree_.children_right
feature = estimator.tree_.feature
threshold = estimator.tree_.threshold


# The tree structure can be traversed to compute various properties such
# as the depth of each node and whether or not it is a leaf.
node_depth = np.zeros(shape=n_nodes, dtype=np.int64)
is_leaves = np.zeros(shape=n_nodes, dtype=bool)
stack = [(0, -1)]  # seed is the root node id and its parent depth
while len(stack) > 0:
    node_id, parent_depth = stack.pop()
    node_depth[node_id] = parent_depth + 1

    # If we have a test node
    if (children_left[node_id] != children_right[node_id]):
        stack.append((children_left[node_id], parent_depth + 1))
        stack.append((children_right[node_id], parent_depth + 1))
    else:
        is_leaves[node_id] = True

print("The binary tree structure has %s nodes and has "
      "the following tree structure:"
      % n_nodes)
for i in range(n_nodes):
    if is_leaves[i]:
        print("%snode=%s leaf node." % (node_depth[i] * "\t", i))
    else:
        print("%snode=%s test node: go to node %s if X[:, %s] <= %s else to "
              "node %s."
              % (node_depth[i] * "\t",
                 i,
                 children_left[i],
                 feature[i],
                 threshold[i],
                 children_right[i],
                 ))
print()

# First let's retrieve the decision path of each sample. The decision_path
# method allows to retrieve the node indicator functions. A non zero element of
# indicator matrix at the position (i, j) indicates that the sample i goes
# through the node j.

node_indicator = estimator.decision_path(X_test)

# Similarly, we can also have the leaves ids reached by each sample.

leave_id = estimator.apply(X_test)

# Now, it's possible to get the tests that were used to predict a sample or
# a group of samples. First, let's make it for the sample.

sample_id = 0
node_index = node_indicator.indices[node_indicator.indptr[sample_id]:
                                    node_indicator.indptr[sample_id + 1]]

print('Rules used to predict sample %s: ' % sample_id)
for node_id in node_index:
    if leave_id[sample_id] == node_id:
        continue

    if (X_test[sample_id, feature[node_id]] <= threshold[node_id]):
        threshold_sign = "<="
    else:
        threshold_sign = ">"

    print("decision id node %s : (X_test[%s, %s] (= %s) %s %s)"
          % (node_id,
             sample_id,
             feature[node_id],
             X_test[sample_id, feature[node_id]],
             threshold_sign,
             threshold[node_id]))

# For a group of samples, we have the following common node.
sample_ids = [0, 1]
common_nodes = (node_indicator.toarray()[sample_ids].sum(axis=0) ==
                len(sample_ids))

common_node_id = np.arange(n_nodes)[common_nodes]

print("\nThe following samples %s share the node %s in the tree"
      % (sample_ids, common_node_id))
print("It is %s %% of all nodes." % (100 * len(common_node_id) / n_nodes,))

AttributeError: 'DataFrame' object has no attribute 'data'