In [1]:
import numpy as np
import pandas as pd

## Load the lending club dataset

In [2]:
loans = pd.read_csv('lending-club-data.csv',low_memory=False)

In [3]:
loans['safe_loans'] = loans['bad_loans'].apply(lambda x : +1 if x==0 else -1)
loans.drop('bad_loans', axis=1, inplace=True)

In [4]:
features = ['grade',              # grade of the loan
            'term',               # the term of the loan
            'home_ownership',     # home_ownership status: own, mortgage or rent
            'emp_length',         # number of years of employment
           ]
target = 'safe_loans'

# Extract the feature columns and target column
loans = loans[features + [target]]

## One Hot Encoding & Training and Validation data creation

In [5]:
loans = pd.get_dummies(loans)
features = list(loans.columns)

In [6]:
features.remove('safe_loans')

In [7]:
train_index = pd.read_json('module-6-assignment-train-idx.json')
train_index = list(train_index[0])

In [8]:
val_index = pd.read_json('module-6-assignment-validation-idx.json')
val_index = list(val_index[0])

In [9]:
train_data = loans.iloc[train_index]
validation_data = loans.iloc[val_index]

In [10]:
print('Training set: %d data points' % len(train_data))
print('Test set: %d data points' % len(validation_data))

Training set: 37224 data points
Test set: 9284 data points


In [11]:
train_data.safe_loans.value_counts(normalize=True)

 1    0.503654
-1    0.496346
Name: safe_loans, dtype: float64

In [12]:
validation_data.safe_loans.value_counts(normalize=True)

-1    0.503447
 1    0.496553
Name: safe_loans, dtype: float64

## Early stopping condition 2: Minimum node size

In [13]:
def reached_minimum_node_size(data, min_node_size):
    # Return True if the number of data points is less than or equal to the minimum node size.
    ## YOUR CODE HERE
    if data.shape[0]<=min_node_size:
        return True
    else:
        return False

## Early stopping condition 3: Minimum gain in error reduction

In [14]:
def error_reduction(error_before_split, error_after_split):
    # Return the error before the split minus the error after the split.
    ## YOUR CODE HERE
    return (error_before_split-error_after_split)

## Grabbing binary decision tree helper functions from past assignment

In [15]:
from week3_assignment2_functions import intermediate_node_num_mistakes
from week3_assignment2_functions import best_splitting_feature
from week3_assignment2_functions import create_leaf
from week3_assignment2_functions import classify
from week3_assignment2_functions import evaluate_classification_error

## Incorporating new early stopping conditions in binary decision tree implementation

In [16]:
def decision_tree_create(data, features, target, current_depth = 0, 
                         max_depth = 10, min_node_size=1, 
                         min_error_reduction=0.0):
    
    remaining_features = features[:] # Make a copy of the features.
    
    target_values = data[target]
    print("--------------------------------------------------------------------")
    print("Subtree, depth = %s (%s data points)." % (current_depth, len(target_values)))
    
    
    # Stopping condition 1: All nodes are of the same type.
    if intermediate_node_num_mistakes(target_values) == 0:
        print("Stopping condition 1 reached. All data points have the same target value.")                
        return create_leaf(target_values)
    
    # Stopping condition 2: No more features to split on.
    if remaining_features == []:
        print("Stopping condition 2 reached. No remaining features.")                
        return create_leaf(target_values)    
    
    # Early stopping condition 1: Reached max depth limit.
    if current_depth >= max_depth:
        print("Early stopping condition 1 reached. Reached maximum depth.")
        return create_leaf(target_values)
    
    # Early stopping condition 2: Reached the minimum node size.
    # If the number of data points is less than or equal to the minimum size, return a leaf.
    if reached_minimum_node_size(data, min_node_size)==True:
        print("Early stopping condition 2 reached. Reached minimum node size.")
        return create_leaf(target_values)  ## YOUR CODE HERE
    
    # Find the best splitting feature
    splitting_feature = best_splitting_feature(data, features, target)
    
    # Split on the best feature that we found. 
    left_split = data[data[splitting_feature] == 0]
    right_split = data[data[splitting_feature] == 1]
    
    # Early stopping condition 3: Minimum error reduction
    # Calculate the error before splitting (number of misclassified examples 
    # divided by the total number of examples)
    error_before_split = intermediate_node_num_mistakes(target_values) / float(len(data))
    
    # Calculate the error after splitting (number of misclassified examples 
    # in both groups divided by the total number of examples)
    left_mistakes = intermediate_node_num_mistakes(left_split[target_values].as_matrix())   ## YOUR CODE HERE
    right_mistakes = intermediate_node_num_mistakes(right_split[target_values].as_matrix())  ## YOUR CODE HERE
    error_after_split = (left_mistakes + right_mistakes) / float(len(data))
    
    # If the error reduction is LESS THAN OR EQUAL TO min_error_reduction, return a leaf.
    if error_reduction(error_before_split, error_after_split)<=min_error_reduction:        ## YOUR CODE HERE
        print("Early stopping condition 3 reached. Minimum error reduction.")
        return create_leaf(target_values) ## YOUR CODE HERE 
    
    
    remaining_features.remove(splitting_feature)
    print("Split on feature %s. (%s, %s)" % (\
                      splitting_feature, len(left_split), len(right_split)))
    
    
    # Repeat (recurse) on left and right subtrees
    left_tree = decision_tree_create(left_split, remaining_features, target, 
                                     current_depth + 1, max_depth, min_node_size, min_error_reduction)        
    
    ## YOUR CODE HERE
    right_tree = decision_tree_create(right_split, remaining_features, target,
                                      current_depth + 1, max_depth, min_node_size, min_error_reduction)
    
    
    return {'is_leaf'          : False, 
            'prediction'       : None,
            'splitting_feature': splitting_feature,
            'left'             : left_tree, 
            'right'            : right_tree}

## Build a tree!

In [17]:
my_decision_tree_new = decision_tree_create(train_data, 
                                            features, 
                                            'safe_loans', 
                                            max_depth = 6, 
                                            min_node_size = 100, 
                                            min_error_reduction=0.0)


--------------------------------------------------------------------
Subtree, depth = 0 (37224 data points).
Split on feature term_ 36 months. (9223, 28001)
--------------------------------------------------------------------
Subtree, depth = 1 (9223 data points).
Split on feature grade_A. (9122, 101)
--------------------------------------------------------------------
Subtree, depth = 2 (9122 data points).
Split on feature grade_B. (8074, 1048)
--------------------------------------------------------------------
Subtree, depth = 3 (8074 data points).
Split on feature grade_C. (5884, 2190)
--------------------------------------------------------------------
Subtree, depth = 4 (5884 data points).
Split on feature grade_D. (3826, 2058)
--------------------------------------------------------------------
Subtree, depth = 5 (3826 data points).
Split on feature home_ownership_OTHER. (3824, 2)
--------------------------------------------------------------------
Subtree, depth = 6 (3824 data 

Split on feature grade_E. (4701, 0)
--------------------------------------------------------------------
Subtree, depth = 6 (4701 data points).
Early stopping condition 1 reached. Reached maximum depth.
--------------------------------------------------------------------
Subtree, depth = 6 (0 data points).
Stopping condition 1 reached. All data points have the same target value.
--------------------------------------------------------------------
Subtree, depth = 5 (0 data points).
Stopping condition 1 reached. All data points have the same target value.
--------------------------------------------------------------------
Subtree, depth = 4 (0 data points).
Stopping condition 1 reached. All data points have the same target value.
--------------------------------------------------------------------
Subtree, depth = 3 (0 data points).
Stopping condition 1 reached. All data points have the same target value.


In [18]:
my_decision_tree_old = decision_tree_create(train_data,
                                            features, 
                                            'safe_loans',
                                            max_depth=6,
                                            min_node_size=0,
                                            min_error_reduction=-1)

--------------------------------------------------------------------
Subtree, depth = 0 (37224 data points).
Split on feature term_ 36 months. (9223, 28001)
--------------------------------------------------------------------
Subtree, depth = 1 (9223 data points).
Split on feature grade_A. (9122, 101)
--------------------------------------------------------------------
Subtree, depth = 2 (9122 data points).
Split on feature grade_B. (8074, 1048)
--------------------------------------------------------------------
Subtree, depth = 3 (8074 data points).
Split on feature grade_C. (5884, 2190)
--------------------------------------------------------------------
Subtree, depth = 4 (5884 data points).
Split on feature grade_D. (3826, 2058)
--------------------------------------------------------------------
Subtree, depth = 5 (3826 data points).
Split on feature home_ownership_OTHER. (3824, 2)
--------------------------------------------------------------------
Subtree, depth = 6 (3824 data 

Split on feature grade_B. (1276, 0)
--------------------------------------------------------------------
Subtree, depth = 5 (1276 data points).
Split on feature grade_C. (1276, 0)
--------------------------------------------------------------------
Subtree, depth = 6 (1276 data points).
Early stopping condition 1 reached. Reached maximum depth.
--------------------------------------------------------------------
Subtree, depth = 6 (0 data points).
Stopping condition 1 reached. All data points have the same target value.
--------------------------------------------------------------------
Subtree, depth = 5 (0 data points).
Stopping condition 1 reached. All data points have the same target value.
--------------------------------------------------------------------
Subtree, depth = 4 (0 data points).
Stopping condition 1 reached. All data points have the same target value.
--------------------------------------------------------------------
Subtree, depth = 2 (4701 data points).
Split on

In [19]:
print('Predicted class: %s ' % classify(my_decision_tree_new, validation_data.iloc[0]))

Predicted class: -1 


In [20]:
classify(my_decision_tree_new, validation_data.iloc[0], annotate=True)

Split on term_ 36 months = 0
Split on grade_A = 0
Split on grade_B = 0
Split on grade_C = 0
Split on grade_D = 1
Split on grade_E = 0
at leaf, predicting -1


-1

In [21]:
classify(my_decision_tree_old, validation_data.iloc[0], annotate=True)

Split on term_ 36 months = 0
Split on grade_A = 0
Split on grade_B = 0
Split on grade_C = 0
Split on grade_D = 1
Split on grade_E = 0
at leaf, predicting -1


-1

In [22]:
evaluate_classification_error(my_decision_tree_new, validation_data)

0.3849633778543731

In [23]:
evaluate_classification_error(my_decision_tree_old, validation_data)

0.38507109004739337

## Exploring the effect of max depth

In [24]:
model_1 = decision_tree_create(train_data, features, 'safe_loans', max_depth=2, min_node_size=0, min_error_reduction=-1)

--------------------------------------------------------------------
Subtree, depth = 0 (37224 data points).
Split on feature term_ 36 months. (9223, 28001)
--------------------------------------------------------------------
Subtree, depth = 1 (9223 data points).
Split on feature grade_A. (9122, 101)
--------------------------------------------------------------------
Subtree, depth = 2 (9122 data points).
Early stopping condition 1 reached. Reached maximum depth.
--------------------------------------------------------------------
Subtree, depth = 2 (101 data points).
Early stopping condition 1 reached. Reached maximum depth.
--------------------------------------------------------------------
Subtree, depth = 1 (28001 data points).
Split on feature grade_D. (23300, 4701)
--------------------------------------------------------------------
Subtree, depth = 2 (23300 data points).
Early stopping condition 1 reached. Reached maximum depth.
-----------------------------------------------

In [25]:
model_2 = decision_tree_create(train_data, features, 'safe_loans', max_depth=6, min_node_size=0, min_error_reduction=-1)

--------------------------------------------------------------------
Subtree, depth = 0 (37224 data points).
Split on feature term_ 36 months. (9223, 28001)
--------------------------------------------------------------------
Subtree, depth = 1 (9223 data points).
Split on feature grade_A. (9122, 101)
--------------------------------------------------------------------
Subtree, depth = 2 (9122 data points).
Split on feature grade_B. (8074, 1048)
--------------------------------------------------------------------
Subtree, depth = 3 (8074 data points).
Split on feature grade_C. (5884, 2190)
--------------------------------------------------------------------
Subtree, depth = 4 (5884 data points).
Split on feature grade_D. (3826, 2058)
--------------------------------------------------------------------
Subtree, depth = 5 (3826 data points).
Split on feature home_ownership_OTHER. (3824, 2)
--------------------------------------------------------------------
Subtree, depth = 6 (3824 data 

Split on feature grade_B. (1276, 0)
--------------------------------------------------------------------
Subtree, depth = 5 (1276 data points).
Split on feature grade_C. (1276, 0)
--------------------------------------------------------------------
Subtree, depth = 6 (1276 data points).
Early stopping condition 1 reached. Reached maximum depth.
--------------------------------------------------------------------
Subtree, depth = 6 (0 data points).
Stopping condition 1 reached. All data points have the same target value.
--------------------------------------------------------------------
Subtree, depth = 5 (0 data points).
Stopping condition 1 reached. All data points have the same target value.
--------------------------------------------------------------------
Subtree, depth = 4 (0 data points).
Stopping condition 1 reached. All data points have the same target value.
--------------------------------------------------------------------
Subtree, depth = 2 (4701 data points).
Split on

In [26]:
model_3 = decision_tree_create(train_data, features, 'safe_loans', max_depth=14, min_node_size=0, min_error_reduction=-1)

--------------------------------------------------------------------
Subtree, depth = 0 (37224 data points).
Split on feature term_ 36 months. (9223, 28001)
--------------------------------------------------------------------
Subtree, depth = 1 (9223 data points).
Split on feature grade_A. (9122, 101)
--------------------------------------------------------------------
Subtree, depth = 2 (9122 data points).
Split on feature grade_B. (8074, 1048)
--------------------------------------------------------------------
Subtree, depth = 3 (8074 data points).
Split on feature grade_C. (5884, 2190)
--------------------------------------------------------------------
Subtree, depth = 4 (5884 data points).
Split on feature grade_D. (3826, 2058)
--------------------------------------------------------------------
Subtree, depth = 5 (3826 data points).
Split on feature home_ownership_OTHER. (3824, 2)
--------------------------------------------------------------------
Subtree, depth = 6 (3824 data 

Split on feature home_ownership_RENT. (0, 879)
--------------------------------------------------------------------
Subtree, depth = 13 (0 data points).
Stopping condition 1 reached. All data points have the same target value.
--------------------------------------------------------------------
Subtree, depth = 13 (879 data points).
Split on feature emp_length_1 year. (809, 70)
--------------------------------------------------------------------
Subtree, depth = 14 (809 data points).
Early stopping condition 1 reached. Reached maximum depth.
--------------------------------------------------------------------
Subtree, depth = 14 (70 data points).
Early stopping condition 1 reached. Reached maximum depth.
--------------------------------------------------------------------
Subtree, depth = 12 (165 data points).
Split on feature emp_length_9 years. (157, 8)
--------------------------------------------------------------------
Subtree, depth = 13 (157 data points).
Split on feature home_ow

Split on feature grade_D. (2190, 0)
--------------------------------------------------------------------
Subtree, depth = 5 (2190 data points).
Split on feature grade_E. (2190, 0)
--------------------------------------------------------------------
Subtree, depth = 6 (2190 data points).
Split on feature grade_F. (2190, 0)
--------------------------------------------------------------------
Subtree, depth = 7 (2190 data points).
Split on feature grade_G. (2190, 0)
--------------------------------------------------------------------
Subtree, depth = 8 (2190 data points).
Split on feature term_ 60 months. (0, 2190)
--------------------------------------------------------------------
Subtree, depth = 9 (0 data points).
Stopping condition 1 reached. All data points have the same target value.
--------------------------------------------------------------------
Subtree, depth = 9 (2190 data points).
Split on feature home_ownership_MORTGAGE. (803, 1387)
---------------------------------------

Split on feature home_ownership_MORTGAGE. (347, 580)
--------------------------------------------------------------------
Subtree, depth = 12 (347 data points).
Split on feature home_ownership_OTHER. (347, 0)
--------------------------------------------------------------------
Subtree, depth = 13 (347 data points).
Split on feature home_ownership_OWN. (276, 71)
--------------------------------------------------------------------
Subtree, depth = 14 (276 data points).
Early stopping condition 1 reached. Reached maximum depth.
--------------------------------------------------------------------
Subtree, depth = 14 (71 data points).
Early stopping condition 1 reached. Reached maximum depth.
--------------------------------------------------------------------
Subtree, depth = 13 (0 data points).
Stopping condition 1 reached. All data points have the same target value.
--------------------------------------------------------------------
Subtree, depth = 12 (580 data points).
Split on featur

--------------------------------------------------------------------
Subtree, depth = 8 (4 data points).
Split on feature grade_E. (4, 0)
--------------------------------------------------------------------
Subtree, depth = 9 (4 data points).
Split on feature grade_F. (4, 0)
--------------------------------------------------------------------
Subtree, depth = 10 (4 data points).
Split on feature grade_G. (4, 0)
--------------------------------------------------------------------
Subtree, depth = 11 (4 data points).
Split on feature term_ 60 months. (0, 4)
--------------------------------------------------------------------
Subtree, depth = 12 (0 data points).
Stopping condition 1 reached. All data points have the same target value.
--------------------------------------------------------------------
Subtree, depth = 12 (4 data points).
Split on feature home_ownership_MORTGAGE. (1, 3)
--------------------------------------------------------------------
Subtree, depth = 13 (1 data points

Split on feature grade_C. (8917, 0)
--------------------------------------------------------------------
Subtree, depth = 10 (8917 data points).
Split on feature term_ 60 months. (8917, 0)
--------------------------------------------------------------------
Subtree, depth = 11 (8917 data points).
Split on feature home_ownership_MORTGAGE. (4748, 4169)
--------------------------------------------------------------------
Subtree, depth = 12 (4748 data points).
Split on feature home_ownership_OWN. (4089, 659)
--------------------------------------------------------------------
Subtree, depth = 13 (4089 data points).
Split on feature home_ownership_RENT. (0, 4089)
--------------------------------------------------------------------
Subtree, depth = 14 (0 data points).
Stopping condition 1 reached. All data points have the same target value.
--------------------------------------------------------------------
Subtree, depth = 14 (4089 data points).
Early stopping condition 1 reached. Reached

Split on feature emp_length_3 years. (43, 1)
--------------------------------------------------------------------
Subtree, depth = 12 (43 data points).
Split on feature emp_length_7 years. (42, 1)
--------------------------------------------------------------------
Subtree, depth = 13 (42 data points).
Split on feature emp_length_8 years. (41, 1)
--------------------------------------------------------------------
Subtree, depth = 14 (41 data points).
Early stopping condition 1 reached. Reached maximum depth.
--------------------------------------------------------------------
Subtree, depth = 14 (1 data points).
Stopping condition 1 reached. All data points have the same target value.
--------------------------------------------------------------------
Subtree, depth = 13 (1 data points).
Stopping condition 1 reached. All data points have the same target value.
--------------------------------------------------------------------
Subtree, depth = 12 (1 data points).
Stopping condition 

--------------------------------------------------------------------
Subtree, depth = 8 (1 data points).
Stopping condition 1 reached. All data points have the same target value.
--------------------------------------------------------------------
Subtree, depth = 7 (182 data points).
Split on feature grade_B. (182, 0)
--------------------------------------------------------------------
Subtree, depth = 8 (182 data points).
Split on feature grade_C. (182, 0)
--------------------------------------------------------------------
Subtree, depth = 9 (182 data points).
Split on feature grade_G. (182, 0)
--------------------------------------------------------------------
Subtree, depth = 10 (182 data points).
Split on feature term_ 60 months. (182, 0)
--------------------------------------------------------------------
Subtree, depth = 11 (182 data points).
Split on feature home_ownership_MORTGAGE. (71, 111)
--------------------------------------------------------------------
Subtree, depth 

Split on feature grade_C. (1276, 0)
--------------------------------------------------------------------
Subtree, depth = 6 (1276 data points).
Split on feature grade_F. (1276, 0)
--------------------------------------------------------------------
Subtree, depth = 7 (1276 data points).
Split on feature grade_G. (1276, 0)
--------------------------------------------------------------------
Subtree, depth = 8 (1276 data points).
Split on feature term_ 60 months. (1276, 0)
--------------------------------------------------------------------
Subtree, depth = 9 (1276 data points).
Split on feature home_ownership_MORTGAGE. (855, 421)
--------------------------------------------------------------------
Subtree, depth = 10 (855 data points).
Split on feature home_ownership_OTHER. (849, 6)
--------------------------------------------------------------------
Subtree, depth = 11 (849 data points).
Split on feature home_ownership_OWN. (737, 112)
---------------------------------------------------

Split on feature emp_length_1 year. (2392, 241)
--------------------------------------------------------------------
Subtree, depth = 14 (2392 data points).
Early stopping condition 1 reached. Reached maximum depth.
--------------------------------------------------------------------
Subtree, depth = 14 (241 data points).
Early stopping condition 1 reached. Reached maximum depth.
--------------------------------------------------------------------
Subtree, depth = 12 (404 data points).
Split on feature emp_length_2 years. (372, 32)
--------------------------------------------------------------------
Subtree, depth = 13 (372 data points).
Split on feature home_ownership_RENT. (372, 0)
--------------------------------------------------------------------
Subtree, depth = 14 (372 data points).
Early stopping condition 1 reached. Reached maximum depth.
--------------------------------------------------------------------
Subtree, depth = 14 (0 data points).
Stopping condition 1 reached. All 

In [27]:
print("Training data, classification error (model 1):", evaluate_classification_error(model_1, train_data))
print("Training data, classification error (model 2):", evaluate_classification_error(model_2, train_data))
print("Training data, classification error (model 3):", evaluate_classification_error(model_3, train_data))

Training data, classification error (model 1): 0.40003761014399314
Training data, classification error (model 2): 0.3834891467870191
Training data, classification error (model 3): 0.375886524822695


In [28]:
print("Validation data, classification error (model 1):", evaluate_classification_error(model_1, validation_data))
print("Validation data, classification error (model 2):", evaluate_classification_error(model_2, validation_data))
print("Validation data, classification error (model 3):", evaluate_classification_error(model_3, validation_data))

Training data, classification error (model 1): 0.3981042654028436
Training data, classification error (model 2): 0.38507109004739337
Training data, classification error (model 3): 0.3805471779405429


## Measuring complexity of fit

In [30]:
def count_leaves(tree):
    if tree['is_leaf']:
        return 1
    return count_leaves(tree['left']) + count_leaves(tree['right'])

In [32]:
print("total leaves in model_1:", count_leaves(model_1))
print("total leaves in model_2:", count_leaves(model_2))
print("total leaves in model_3:", count_leaves(model_3))

total leaves in model_1: 4
total leaves in model_2: 35
total leaves in model_3: 273


In [38]:
model_4 = decision_tree_create(train_data, features, 'safe_loans', max_depth=6, min_node_size=0, min_error_reduction=-1)

--------------------------------------------------------------------
Subtree, depth = 0 (37224 data points).
Split on feature term_ 36 months. (9223, 28001)
--------------------------------------------------------------------
Subtree, depth = 1 (9223 data points).
Split on feature grade_A. (9122, 101)
--------------------------------------------------------------------
Subtree, depth = 2 (9122 data points).
Split on feature grade_B. (8074, 1048)
--------------------------------------------------------------------
Subtree, depth = 3 (8074 data points).
Split on feature grade_C. (5884, 2190)
--------------------------------------------------------------------
Subtree, depth = 4 (5884 data points).
Split on feature grade_D. (3826, 2058)
--------------------------------------------------------------------
Subtree, depth = 5 (3826 data points).
Split on feature home_ownership_OTHER. (3824, 2)
--------------------------------------------------------------------
Subtree, depth = 6 (3824 data 

Split on feature grade_B. (1276, 0)
--------------------------------------------------------------------
Subtree, depth = 5 (1276 data points).
Split on feature grade_C. (1276, 0)
--------------------------------------------------------------------
Subtree, depth = 6 (1276 data points).
Early stopping condition 1 reached. Reached maximum depth.
--------------------------------------------------------------------
Subtree, depth = 6 (0 data points).
Stopping condition 1 reached. All data points have the same target value.
--------------------------------------------------------------------
Subtree, depth = 5 (0 data points).
Stopping condition 1 reached. All data points have the same target value.
--------------------------------------------------------------------
Subtree, depth = 4 (0 data points).
Stopping condition 1 reached. All data points have the same target value.
--------------------------------------------------------------------
Subtree, depth = 2 (4701 data points).
Split on

In [39]:
model_5 = decision_tree_create(train_data, features, 'safe_loans', max_depth=6, min_node_size=0, min_error_reduction=0)

--------------------------------------------------------------------
Subtree, depth = 0 (37224 data points).
Split on feature term_ 36 months. (9223, 28001)
--------------------------------------------------------------------
Subtree, depth = 1 (9223 data points).
Split on feature grade_A. (9122, 101)
--------------------------------------------------------------------
Subtree, depth = 2 (9122 data points).
Split on feature grade_B. (8074, 1048)
--------------------------------------------------------------------
Subtree, depth = 3 (8074 data points).
Split on feature grade_C. (5884, 2190)
--------------------------------------------------------------------
Subtree, depth = 4 (5884 data points).
Split on feature grade_D. (3826, 2058)
--------------------------------------------------------------------
Subtree, depth = 5 (3826 data points).
Split on feature home_ownership_OTHER. (3824, 2)
--------------------------------------------------------------------
Subtree, depth = 6 (3824 data 

Split on feature grade_C. (1276, 0)
--------------------------------------------------------------------
Subtree, depth = 6 (1276 data points).
Early stopping condition 1 reached. Reached maximum depth.
--------------------------------------------------------------------
Subtree, depth = 6 (0 data points).
Stopping condition 1 reached. All data points have the same target value.
--------------------------------------------------------------------
Subtree, depth = 5 (0 data points).
Stopping condition 1 reached. All data points have the same target value.
--------------------------------------------------------------------
Subtree, depth = 4 (0 data points).
Stopping condition 1 reached. All data points have the same target value.
--------------------------------------------------------------------
Subtree, depth = 2 (4701 data points).
Split on feature grade_A. (4701, 0)
--------------------------------------------------------------------
Subtree, depth = 3 (4701 data points).
Split on

In [40]:
model_6 = decision_tree_create(train_data, features, 'safe_loans', max_depth=6, min_node_size=0, min_error_reduction=5)

--------------------------------------------------------------------
Subtree, depth = 0 (37224 data points).
Early stopping condition 3 reached. Minimum error reduction.


In [42]:
print("Validation data, classification error (model 4):", evaluate_classification_error(model_4, validation_data))
print("Validation data, classification error (model 5):", evaluate_classification_error(model_5, validation_data))
print("Validation data, classification error (model 6):", evaluate_classification_error(model_6, validation_data))

Validation data, classification error (model 4): 0.38507109004739337
Validation data, classification error (model 5): 0.38507109004739337
Validation data, classification error (model 6): 0.503446790176648


In [43]:
model_7 = decision_tree_create(train_data, features, 'safe_loans', max_depth=6, min_node_size=0, min_error_reduction=-1)

--------------------------------------------------------------------
Subtree, depth = 0 (37224 data points).
Split on feature term_ 36 months. (9223, 28001)
--------------------------------------------------------------------
Subtree, depth = 1 (9223 data points).
Split on feature grade_A. (9122, 101)
--------------------------------------------------------------------
Subtree, depth = 2 (9122 data points).
Split on feature grade_B. (8074, 1048)
--------------------------------------------------------------------
Subtree, depth = 3 (8074 data points).
Split on feature grade_C. (5884, 2190)
--------------------------------------------------------------------
Subtree, depth = 4 (5884 data points).
Split on feature grade_D. (3826, 2058)
--------------------------------------------------------------------
Subtree, depth = 5 (3826 data points).
Split on feature home_ownership_OTHER. (3824, 2)
--------------------------------------------------------------------
Subtree, depth = 6 (3824 data 

Split on feature grade_A. (1276, 0)
--------------------------------------------------------------------
Subtree, depth = 4 (1276 data points).
Split on feature grade_B. (1276, 0)
--------------------------------------------------------------------
Subtree, depth = 5 (1276 data points).
Split on feature grade_C. (1276, 0)
--------------------------------------------------------------------
Subtree, depth = 6 (1276 data points).
Early stopping condition 1 reached. Reached maximum depth.
--------------------------------------------------------------------
Subtree, depth = 6 (0 data points).
Stopping condition 1 reached. All data points have the same target value.
--------------------------------------------------------------------
Subtree, depth = 5 (0 data points).
Stopping condition 1 reached. All data points have the same target value.
--------------------------------------------------------------------
Subtree, depth = 4 (0 data points).
Stopping condition 1 reached. All data points 

In [44]:
model_8 = decision_tree_create(train_data, features, 'safe_loans', max_depth=6, min_node_size=2000, min_error_reduction=-1)

--------------------------------------------------------------------
Subtree, depth = 0 (37224 data points).
Split on feature term_ 36 months. (9223, 28001)
--------------------------------------------------------------------
Subtree, depth = 1 (9223 data points).
Split on feature grade_A. (9122, 101)
--------------------------------------------------------------------
Subtree, depth = 2 (9122 data points).
Split on feature grade_B. (8074, 1048)
--------------------------------------------------------------------
Subtree, depth = 3 (8074 data points).
Split on feature grade_C. (5884, 2190)
--------------------------------------------------------------------
Subtree, depth = 4 (5884 data points).
Split on feature grade_D. (3826, 2058)
--------------------------------------------------------------------
Subtree, depth = 5 (3826 data points).
Split on feature home_ownership_OTHER. (3824, 2)
--------------------------------------------------------------------
Subtree, depth = 6 (3824 data 

In [45]:
model_9 = decision_tree_create(train_data, features, 'safe_loans', max_depth=6, min_node_size=5000, min_error_reduction=-1)

--------------------------------------------------------------------
Subtree, depth = 0 (37224 data points).
Split on feature term_ 36 months. (9223, 28001)
--------------------------------------------------------------------
Subtree, depth = 1 (9223 data points).
Split on feature grade_A. (9122, 101)
--------------------------------------------------------------------
Subtree, depth = 2 (9122 data points).
Split on feature grade_B. (8074, 1048)
--------------------------------------------------------------------
Subtree, depth = 3 (8074 data points).
Split on feature grade_C. (5884, 2190)
--------------------------------------------------------------------
Subtree, depth = 4 (5884 data points).
Split on feature grade_D. (3826, 2058)
--------------------------------------------------------------------
Subtree, depth = 5 (3826 data points).
Early stopping condition 2 reached. Reached minimum node size.
--------------------------------------------------------------------
Subtree, depth =

In [46]:
print("Validation data, classification error (model 7):", evaluate_classification_error(model_7, validation_data))
print("Validation data, classification error (model 8):", evaluate_classification_error(model_8, validation_data))
print("Validation data, classification error (model 9):", evaluate_classification_error(model_9, validation_data))

Validation data, classification error (model 7): 0.38507109004739337
Validation data, classification error (model 8): 0.38453252908229213
Validation data, classification error (model 9): 0.38453252908229213


In [47]:
print("total leaves in model_7:", count_leaves(model_7))
print("total leaves in model_8:", count_leaves(model_8))
print("total leaves in model_9:", count_leaves(model_9))

total leaves in model_7: 35
total leaves in model_8: 19
total leaves in model_9: 11
