In [26]:
import pandas as pd
import numpy as np

In [27]:
loans = pd.read_csv('lending-club-data.csv', low_memory=False)

In [28]:
features = ['grade',              # grade of the loan
            'term',               # the term of the loan
            'home_ownership',     # home ownership status: own, mortgage or rent
            'emp_length',         # number of years of employment
           ]
loans['safe_loans'] = loans['bad_loans'].apply(lambda x : +1 if x==0 else -1)

target = 'safe_loans'
loans = loans[features + [target]]

In [29]:
loans.head()

Unnamed: 0,grade,term,home_ownership,emp_length,safe_loans
0,B,36 months,RENT,10+ years,1
1,C,60 months,RENT,< 1 year,-1
2,C,36 months,RENT,10+ years,1
3,C,36 months,RENT,10+ years,1
4,A,36 months,RENT,3 years,1


In [30]:
train_index = pd.read_json('module-8-assignment-2-train-idx.json')
train_index.columns = ['indexvalue']
train_idx = train_index.indexvalue.tolist()

test_index = pd.read_json('module-8-assignment-2-test-idx.json')
test_index.columns = ['indexvalue']
test_idx = test_index.indexvalue.tolist()

train_data = loans.iloc[train_idx]
test_data = loans.iloc[test_idx]

In [31]:
train_data.shape, test_data.shape, loans.shape

((37224, 5), (9284, 5), (122607, 5))

In [32]:
print train_data['safe_loans'].value_counts()
print test_data['safe_loans'].value_counts()

 1    18748
-1    18476
dtype: int64
-1    4674
 1    4610
dtype: int64


In [33]:
def df_encode_data(data, categorical_type):
    return pd.get_dummies(data[categorical_type])

In [34]:
category = features

X_train_vector = df_encode_data(train_data[features], features)
X_test_vector = df_encode_data(test_data[features], features)

In [35]:
# there is no numerical feature in this case
X_train = X_train_vector
X_test = X_test_vector
y_train = train_data[target]
y_test = test_data[target]

In [94]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((37224, 25), (37224,), (9284, 25), (9284,))

In [103]:
def intermediate_node_weighted_mistakes(labels_in_node, data_weights):
    # Sum the weights of all entries with label +1
    total_weight_positive = sum(data_weights[np.where(labels_in_node == +1)])
    
    # Weight of mistakes for predicting all -1's is equal to the sum above
    ### YOUR CODE HERE
    weighted_mistakes_all_negative = total_weight_positive
    
    # Sum the weights of all entries with label -1
    ### YOUR CODE HERE
    total_weight_negative = sum(data_weights[np.where(labels_in_node == -1)])
    
    # Weight of mistakes for predicting all +1's is equal to the sum above
    ### YOUR CODE HERE
    weighted_mistakes_all_positive = total_weight_negative
    
    #print total_weight_positive, total_weight_negative
    # Return the tuple (weight, class_label) representing the lower of the two weights
    #    class_label should be an integer of value +1 or -1.
    # If the two weights are identical, return (weighted_mistakes_all_positive,+1)
    ### YOUR CODE HERE
    if weighted_mistakes_all_positive > weighted_mistakes_all_negative:
        return (weighted_mistakes_all_negative, -1)
    else:
        return (weighted_mistakes_all_positive, +1)
        

In [104]:
example_labels = np.array([-1, -1, 1, 1, 1])
example_data_weights = np.array([1., 2., .5, 1., 1.])
if intermediate_node_weighted_mistakes(example_labels, example_data_weights) == (2.5, -1):
    print 'Test passed!'
else:
    print 'Test failed... try again!'

Test passed!


In [105]:
# If the data is identical in each feature, this function should return None
def best_splitting_feature(data, features, target, data_weights):
    
    # These variables will keep track of the best feature and the corresponding error
    best_feature = None
    best_error = float('+inf') 
    num_points = float(len(data))

    # Loop through each feature to consider splitting on that feature
    for feature in features:
        
        # The left split will have all data points where the feature value is 0
        # The right split will have all data points where the feature value is 1
        left_split = data[data[feature] == 0]
        right_split = data[data[feature] == 1]
        
        #print feature, len(left_split), len(right_split)
        
        # Apply the same filtering to data_weights to create left_data_weights, right_data_weights
        ## YOUR CODE HERE
        left_data_weights = data_weights[np.where(data[feature] == 0)]
        right_data_weights = data_weights[np.where(data[feature] == 1)]
        
        #print feature, len(left_data_weights), len(right_data_weights), sum(left_data_weights), sum(right_data_weights)
        
        # DIFFERENT HERE
        # Calculate the weight of mistakes for left and right sides
        ## YOUR CODE HERE
        left_weighted_mistakes, left_class = intermediate_node_weighted_mistakes(left_split[target], left_data_weights)
        right_weighted_mistakes, right_class = intermediate_node_weighted_mistakes(right_split[target], right_data_weights)
        
        # DIFFERENT HERE
        # Compute weighted error by computing
        #  ( [weight of mistakes (left)] + [weight of mistakes (right)] ) / [total weight of all data points]
        ## YOUR CODE HERE
        error = float(left_weighted_mistakes + right_weighted_mistakes) / float(sum(data_weights))
        
        print feature, error, left_weighted_mistakes, right_weighted_mistakes, len(left_split), len(right_split)
        
        # If this is the best error we have found so far, store the feature and the error
        if error < best_error:
            best_feature = feature
            best_error = error
    
    # Return the best feature we found
    return best_feature

In [106]:
training_data = pd.concat([X_train,y_train],axis=1)
updated_features = training_data.columns[:-1].tolist()

In [107]:
data_weights = np.array(len(training_data)* [1.5])

In [108]:
example_data_weights = np.array(len(training_data)* [1.5])
if best_splitting_feature(training_data, updated_features, target, example_data_weights) == 'term_ 36 months':
    print 'Test passed!'
else:
    print 'Test failed... try again!'

grade_A 0.433430045132 22314.0 1887.0 32094 5130
grade_B 0.458521384053 19087.5 6514.5 26858 10366
grade_C 0.485170857511 20343.0 6747.0 27812 9412
grade_D 0.458924349882 21600.0 4024.5 30465 6759
grade_E 0.463921126155 24252.0 1651.5 33815 3409
grade_F 0.475177304965 25839.0 693.0 35512 1712
grade_G 0.490490006447 27223.5 163.5 36788 436
term_ 36 months 0.421636578551 4831.5 18711.0 9223 28001
term_ 60 months 0.421636578551 18711.0 4831.5 28001 9223
home_ownership_MORTGAGE 0.467332903503 14074.5 12019.5 19846 17378
home_ownership_OTHER 0.496212121212 27664.5 42.0 37163 61
home_ownership_OWN 0.495137545669 25374.0 2272.5 34149 3075
home_ownership_RENT 0.468676122931 14409.0 11760.0 20514 16710
emp_length_1 year 0.496346443155 25785.0 1929.0 34624 2600
emp_length_10+ years 0.496346443155 20125.5 7588.5 26901 10323
emp_length_2 years 0.496346443155 25149.0 2565.0 33652 3572
emp_length_3 years 0.496346443155 25450.5 2263.5 34099 3125
emp_length_4 years 0.496346443155 25887.0 1827.0 34593 

In [17]:
features

['grade', 'term', 'home_ownership', 'emp_length']

In [18]:
# Leaf structure
#{ 
#   'is_leaf'            : True/False.
#   'prediction'         : Prediction at the leaf node.
#   'left'               : (dictionary corresponding to the left tree).
#   'right'              : (dictionary corresponding to the right tree).
#   'splitting_feature'  : The feature that this node splits on.
#}

def create_leaf(target_values, data_weights):
    
    # Create a leaf node
    leaf = {'splitting_feature' : None,
            'is_leaf': True}
    
    # Computed weight of mistakes.
    # Store the predicted class (1 or -1) in leaf['prediction']
    weighted_error, best_class = intermediate_node_weighted_mistakes(target_values, data_weights)
    leaf['prediction'] = best_class ## YOUR CODE HERE
    
    return leaf

In [19]:
#Now write a function that learns a weighted decision tree recursively and implements 3 stopping conditions:

# - All data points in a node are from the same class.
# - No more features to split on.
# - Stop growing the tree when the tree depth reaches max_depth.

def weighted_decision_tree_create(data, features, target, data_weights, current_depth = 1, max_depth = 10):
    remaining_features = features[:] # Make a copy of the features.
    target_values = data[target]
    print "--------------------------------------------------------------------"
    print "Subtree, depth = %s (%s data points)." % (current_depth, len(target_values))
    
    # Stopping condition 1. Error is 0.
    if intermediate_node_weighted_mistakes(target_values, data_weights)[0] <= 1e-15:
        print "Stopping condition 1 reached."                
        return create_leaf(target_values, data_weights)
    
    # Stopping condition 2. No more features.
    if remaining_features == []:
        print "Stopping condition 2 reached."                
        return create_leaf(target_values, data_weights)    
    
    # Additional stopping condition (limit tree depth)
    if current_depth > max_depth:
        print "Reached maximum depth. Stopping for now."
        return create_leaf(target_values, data_weights)
    
    # If all the datapoints are the same, splitting_feature will be None. Create a leaf
    splitting_feature = best_splitting_feature(data, features, target, data_weights)
    remaining_features.remove(splitting_feature)
        
    left_split = data[data[splitting_feature] == 0]
    right_split = data[data[splitting_feature] == 1]
    
    left_data_weights = data_weights[data[splitting_feature] == 0]
    right_data_weights = data_weights[data[splitting_feature] == 1]
    
    print "Split on feature %s. (%s, %s)" % (\
              splitting_feature, len(left_split), len(right_split))
    
    # Create a leaf node if the split is "perfect"
    if len(left_split) == len(data):
        print "Creating leaf node."
        return create_leaf(left_split[target], data_weights)
    if len(right_split) == len(data):
        print "Creating leaf node."
        return create_leaf(right_split[target], data_weights)
    
    # Repeat (recurse) on left and right subtrees
    left_tree = weighted_decision_tree_create(
        left_split, remaining_features, target, left_data_weights, current_depth + 1, max_depth)
    right_tree = weighted_decision_tree_create(
        right_split, remaining_features, target, right_data_weights, current_depth + 1, max_depth)
    
    return {'is_leaf'          : False, 
            'prediction'       : None,
            'splitting_feature': splitting_feature,
            'left'             : left_tree, 
            'right'            : right_tree}

In [20]:
def count_nodes(tree):
    if tree['is_leaf']:
        return 1
    return 1 + count_nodes(tree['left']) + count_nodes(tree['right'])

In [21]:
def classify(tree, x, annotate = False):   
    # If the node is a leaf node.
    if tree['is_leaf']:
        if annotate: 
            print "At leaf, predicting %s" % tree['prediction']
        return tree['prediction'] 
    else:
        # Split on feature.
        split_feature_value = x[tree['splitting_feature']]
        if annotate: 
            print "Split on %s = %s" % (tree['splitting_feature'], split_feature_value)
        if split_feature_value == 0:
            return classify(tree['left'], x, annotate)
        else:
            return classify(tree['right'], x, annotate)

In [22]:
def evaluate_classification_error(tree, data):
    # Apply the classify(tree, x) to each row in your data
    prediction = data.apply(lambda x: classify(tree, x), axis=1)
    
    # Once you've made the predictions, calculate the classification error
    return (prediction != data[target]).sum() / float(len(data))

In [24]:
train_data.head()

Unnamed: 0,grade,term,home_ownership,emp_length,safe_loans
1,C,60 months,RENT,< 1 year,-1
6,F,60 months,OWN,4 years,-1
7,B,60 months,RENT,< 1 year,-1
10,C,36 months,RENT,< 1 year,-1
12,B,36 months,RENT,3 years,-1


In [25]:
X_train.head()

Unnamed: 0,grade_A,grade_B,grade_C,grade_D,grade_E,grade_F,grade_G,term_ 36 months,term_ 60 months,home_ownership_MORTGAGE,...,emp_length_2 years,emp_length_3 years,emp_length_4 years,emp_length_5 years,emp_length_6 years,emp_length_7 years,emp_length_8 years,emp_length_9 years,emp_length_< 1 year,emp_length_n/a
1,0,0,1,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
6,0,0,0,0,0,1,0,0,1,0,...,0,0,1,0,0,0,0,0,0,0
7,0,1,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
10,0,0,1,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
12,0,1,0,0,0,0,0,1,0,0,...,0,1,0,0,0,0,0,0,0,0


In [26]:
features

['grade', 'term', 'home_ownership', 'emp_length']

In [28]:
# Assign weights
example_data_weights = np.array([1.] * 10 + [0.]*(len(train_data) - 20) + [1.] * 10)
# Train a weighted decision tree model.
small_data_decision_tree_subset_20 = weighted_decision_tree_create(training_data, updated_features, target,
                         example_data_weights, max_depth=2)

  app.launch_new_instance()


--------------------------------------------------------------------
Subtree, depth = 1 (37224 data points).
Split on feature grade_A. (32094, 5130)
--------------------------------------------------------------------
Subtree, depth = 2 (32094 data points).
Split on feature grade_B. (21728, 10366)
--------------------------------------------------------------------
Subtree, depth = 3 (21728 data points).
Reached maximum depth. Stopping for now.
--------------------------------------------------------------------
Subtree, depth = 3 (10366 data points).
Reached maximum depth. Stopping for now.
--------------------------------------------------------------------
Subtree, depth = 2 (5130 data points).
Split on feature grade_B. (5130, 0)
Creating leaf node.




In [29]:
print 'Classification errors: %s percent' % evaluate_classification_error(small_data_decision_tree_subset_20, training_data)

Classification errors: 0.496346443155 percent


In [30]:
pred = training_data.apply(lambda x: classify(small_data_decision_tree_subset_20, x), axis=1)

In [33]:
def print_stump(tree, name = 'root'):
    split_name = tree['splitting_feature'] # split_name is something like 'term. 36 months'
    if split_name is None:
        print "(leaf, label: %s)" % tree['prediction']
        return None
    split_feature, split_value = split_name.split('_')
    print '                       %s' % name
    print '         |---------------|----------------|'
    print '         |                                |'
    print '         |                                |'
    print '         |                                |'
    print '  [{0} == 0]               [{0} == 1]    '.format(split_name)
    print '         |                                |'
    print '         |                                |'
    print '         |                                |'
    print '    (%s)                         (%s)' \
        % (('leaf, label: ' + str(tree['left']['prediction']) if tree['left']['is_leaf'] else 'subtree'),
           ('leaf, label: ' + str(tree['right']['prediction']) if tree['right']['is_leaf'] else 'subtree'))

In [34]:
print_stump(small_data_decision_tree_subset_20)

                       root
         |---------------|----------------|
         |                                |
         |                                |
         |                                |
  [grade_A == 0]               [grade_A == 1]    
         |                                |
         |                                |
         |                                |
    (subtree)                         (leaf, label: 1)


In [40]:
print_stump(small_data_decision_tree_subset_20['left'], small_data_decision_tree_subset_20['splitting_feature'])

                       grade_A
         |---------------|----------------|
         |                                |
         |                                |
         |                                |
  [grade_B == 0]               [grade_B == 1]    
         |                                |
         |                                |
         |                                |
    (leaf, label: 1)                         (leaf, label: 1)


In [35]:
len(training_data), len(pred)

(37224, 37224)

In [36]:
training_data[target].value_counts()

 1    18748
-1    18476
dtype: int64

In [38]:
training_data['grade_A'].value_counts()

0    32094
1     5130
dtype: int64

In [41]:
training_data['grade_B'].value_counts()

0    26858
1    10366
dtype: int64

In [39]:
small_data_decision_tree_subset_20

{'is_leaf': False,
 'left': {'is_leaf': False,
  'left': {'is_leaf': True, 'prediction': 1, 'splitting_feature': None},
  'prediction': None,
  'right': {'is_leaf': True, 'prediction': 1, 'splitting_feature': None},
  'splitting_feature': 'grade_B'},
 'prediction': None,
 'right': {'is_leaf': True, 'prediction': 1, 'splitting_feature': None},
 'splitting_feature': 'grade_A'}