#### Read Data

In [1]:
import sframe
import numpy as np

# Load the loan records and derive the label column: +1 for a safe loan
# (bad_loans == 0), -1 for anything else. The raw flag is then dropped.
loans = sframe.SFrame('data1/')
loans['safe_loans'] = loans['bad_loans'].apply(
    lambda bad_flag: -1 if bad_flag != 0 else +1)
loans = loans.remove_column('bad_loans')

[INFO] sframe.cython.cy_server: SFrame v2.1 started. Logging C:\Users\user\AppData\Local\Temp\sframe_server_1502768817.log.0


#### Set features and target

In [2]:
# Restrict the dataset to four categorical inputs plus the label column.
target = 'safe_loans'
features = [
    'grade',           # grade of the loan
    'term',            # the term of the loan
    'home_ownership',  # home_ownership status: own, mortgage or rent
    'emp_length',      # number of years of employment
]
loans = loans[features + [target]]
loans

grade,term,home_ownership,emp_length,safe_loans
B,36 months,RENT,10+ years,1
C,60 months,RENT,< 1 year,-1
C,36 months,RENT,10+ years,1
C,36 months,RENT,10+ years,1
A,36 months,RENT,3 years,1
E,36 months,RENT,9 years,1
F,60 months,OWN,4 years,-1
B,60 months,RENT,< 1 year,-1
C,60 months,OWN,5 years,1
B,36 months,OWN,10+ years,1


#### Balance dataset

In [3]:
safe_loans_raw = loans[loans[target] == 1]
risky_loans_raw = loans[loans[target] == -1]

# Since there are less risky loans than safe loans, find the ratio of the sizes
# and use that percentage to undersample the safe loans.
percentage = len(risky_loans_raw)/float(len(safe_loans_raw))
safe_loans = safe_loans_raw.sample(percentage, seed = 1)
risky_loans = risky_loans_raw
loans_data = risky_loans.append(safe_loans)

print "Percentage of safe loans                 :", len(safe_loans) / float(len(loans_data))
print "Percentage of risky loans                :", len(risky_loans) / float(len(loans_data))
print "Total number of loans in our new dataset :", len(loans_data)

Percentage of safe loans                 : 0.502236174422
Percentage of risky loans                : 0.497763825578
Total number of loans in our new dataset : 46508


#### One-hot encoding

In [4]:
# Rebuild the balanced dataset from its two halves so this cell always starts
# from the original (un-encoded) categorical columns.
loans_data = risky_loans.append(safe_loans)
for feature in features:
    # Wrap every value in a single-entry dict {value: 1}; unpacking that dict
    # column yields one column per distinct value, named "<feature>.<value>".
    loans_data_one_hot_encoded = loans_data[feature].apply(lambda x: {x: 1})    
    loans_data_unpacked = loans_data_one_hot_encoded.unpack(column_name_prefix=feature)
    
    # Change None's to 0's
    for column in loans_data_unpacked.column_names():
        loans_data_unpacked[column] = loans_data_unpacked[column].fillna(0)

    # Swap the categorical column for its indicator columns; both calls are
    # used for their effect on loans_data (return values are ignored).
    loans_data.remove_column(feature)
    loans_data.add_columns(loans_data_unpacked)
loans_data

safe_loans,grade.A,grade.B,grade.C,grade.D,grade.E,grade.F,grade.G,term. 36 months,term. 60 months
-1,0,0,1,0,0,0,0,0,1
-1,0,0,0,0,0,1,0,0,1
-1,0,1,0,0,0,0,0,0,1
-1,0,0,1,0,0,0,0,1,0
-1,0,1,0,0,0,0,0,1,0
-1,0,1,0,0,0,0,0,1,0
-1,0,1,0,0,0,0,0,1,0
-1,0,0,1,0,0,0,0,1,0
-1,0,0,0,1,0,0,0,0,1
-1,1,0,0,0,0,0,0,1,0

home_ownership.MORTGAGE,home_ownership.OTHER,home_ownership.OWN,home_ownership.RENT,emp_length.1 year,emp_length.10+ years
0,0,0,1,0,0
0,0,1,0,0,0
0,0,0,1,0,0
0,0,0,1,0,0
0,0,0,1,0,0
0,0,0,1,0,1
0,0,0,1,1,0
0,0,0,1,0,0
0,0,0,1,0,0
1,0,0,0,0,1

emp_length.2 years,emp_length.3 years,emp_length.4 years,emp_length.5 years,emp_length.6 years,emp_length.7 years
0,0,0,0,0,0
0,0,1,0,0,0
0,0,0,0,0,0
0,0,0,0,0,0
0,1,0,0,0,0
0,0,0,0,0,0
0,0,0,0,0,0
0,0,0,0,0,0
1,0,0,0,0,0
0,0,0,0,0,0

emp_length.8 years,emp_length.9 years,emp_length.< 1 year,emp_length.n/a
0,0,1,0
0,0,0,0
0,0,1,0
0,0,1,0
0,0,0,0
0,0,0,0
0,0,0,0
0,1,0,0
0,0,0,0
0,0,0,0


In [5]:
# Every column left after encoding is an indicator feature, apart from the
# label column. Use `target` (set when the columns were selected) rather than
# repeating the literal column name, so the label is named in one place only.
features = loans_data.column_names()
features.remove(target)  # Remove the response variable
features

['grade.A',
 'grade.B',
 'grade.C',
 'grade.D',
 'grade.E',
 'grade.F',
 'grade.G',
 'term. 36 months',
 'term. 60 months',
 'home_ownership.MORTGAGE',
 'home_ownership.OTHER',
 'home_ownership.OWN',
 'home_ownership.RENT',
 'emp_length.1 year',
 'emp_length.10+ years',
 'emp_length.2 years',
 'emp_length.3 years',
 'emp_length.4 years',
 'emp_length.5 years',
 'emp_length.6 years',
 'emp_length.7 years',
 'emp_length.8 years',
 'emp_length.9 years',
 'emp_length.< 1 year',
 'emp_length.n/a']

#### Train Validation Split

In [6]:
# Random 80/20 split into training and validation rows; the fixed seed keeps
# the split reproducible between runs.
train_data, validation_set = loans_data.random_split(0.8, seed=1)

#### Early stop when the minimum node size is reached

In [12]:
def reached_minimum_node_size(data, min_node_size):
    """Tell whether a node is too small to be split any further.

    data          -- node data; must provide a ``num_rows()`` method.
    min_node_size -- largest row count at which the node is still
                     considered "minimum size".

    Returns True when the node holds ``min_node_size`` rows or fewer.
    """
    row_count = data.num_rows()
    return row_count <= min_node_size

#### Early stop when error reduction is small

In [8]:
def error_reduction(error_before_split, error_after_split):
    """Return how much the classification error dropped because of a split.

    The result is positive when the split lowered the error, zero when it
    made no difference and negative when the error went up.
    """
    improvement = error_before_split - error_after_split
    return improvement

#### Function to determine mistakes for a node

In [9]:
def intermediate_node_num_mistakes(labels_in_node):
    """Count the mistakes a majority-class prediction makes at a node.

    Labels equal to 1 form one group and every other label forms the second
    group; predicting the larger group misclassifies exactly the smaller one,
    so the size of the smaller group is returned. An empty node has no
    mistakes.
    """
    total = len(labels_in_node)
    if total == 0:
        return 0

    ones = sum(1 for label in labels_in_node if label == 1)
    others = total - ones
    return min(ones, others)

#### Function to determine the best feature to split on

In [10]:
def best_splitting_feature(data, features, target):
    """Pick the feature whose zero / non-zero split gives the lowest error.

    data     -- table supporting boolean-mask row selection and column
                lookup by name.
    features -- candidate column names.
    target   -- name of the label column.

    For each candidate the rows are divided into those where the column is 0
    and those where it is not; the split's error is the number of
    majority-vote mistakes on both sides divided by the number of rows in
    ``data``. The first feature reaching the lowest error is returned, or
    None when ``features`` is empty.
    """
    total_rows = float(len(data))

    def split_error(column):
        # Fraction of rows misclassified when splitting on `column`.
        zeros = data[data[column] == 0]
        non_zeros = data[data[column] != 0]
        wrong = (intermediate_node_num_mistakes(zeros[target]) +
                 intermediate_node_num_mistakes(non_zeros[target]))
        return wrong / total_rows

    # An error rate can never exceed 1, so 10 serves as the "worse than
    # anything" starting value.
    chosen, lowest = None, 10
    for column in features:
        candidate_error = split_error(column)
        if candidate_error < lowest:
            chosen, lowest = column, candidate_error
    return chosen

#### Function to create a leaf node

In [11]:
def create_leaf(target_values):
    """Build a leaf node that predicts the majority label.

    target_values -- array-like of labels supporting element-wise comparison
                     and boolean-mask indexing.

    The prediction is +1 only when +1 labels strictly outnumber -1 labels;
    ties (including an empty node) predict -1.
    """
    positives = len(target_values[target_values == +1])
    negatives = len(target_values[target_values == -1])
    majority = 1 if positives > negatives else -1

    return {'splitting_feature': None,
            'left': None,
            'right': None,
            'is_leaf': True,
            'prediction': majority}

#### Function to create decision tree

In [None]:
def decision_tree_create(data,features,target,current_depth=0,max_depth = 10, min_node_size = 1,min_error_reduction = 0.0):
    remaining_features = features[:]
    
    target_values = data[target]
    print "--------------------------------------------------------------------"
    print "Subtree, depth = %s (%s data points)." % (current_depth, len(target_values))
    
    if intermediate_node_num_mistakes(target_values) ==0:
        print "Stopping condition 1 reached. All data points have the same target value."                
        return create_leaf(target_values)
    
    if remaining_features = []:
        print "Stopping condition 2 reached. No remaining features."                
        return create_leaf(target_values)  
    
    if current_depth >= max_depth:
        print "Early stopping condition 1 reached. Reached maximum depth."
        return create_leaf(target_values)
    
    if reached_minimum_node_size(data,min_node_size):
        print "Early stopping condition 2 reached. Reached minimum node size."
        return create_leaf(target_values)
    
    splitting_feature = best_splitting_feature(data,features,target)
    left_split = data[data[splitting_feature] == 0]
    right_split = data[data[splitting_feature] == 1]
    
    error_before_split = intermediate_node_num_mistakes(target_values) / float(len(data))
    
    # Calculate the error after splitting (number of misclassified examples 
    # in both groups divided by the total number of examples)
    left_mistakes = intermediate_node_num_mistakes(left_split)
    right_mistakes = intermediate_node_num_mistakes(right_split)
    error_after_split = (left_mistakes + right_mistakes) / float(len(data))
    
    if error_after_split <= min_error_reduction:
        print "Early stopping condition 3 reached. Minimum error reduction."
        return create_leaf(target_value)
    
    remaining_features.remove(splitting_feature)
    print "Split on feature %s. (%s, %s)" % (splitting_feature, len(left_split), len(right_split))
    
    