In [1]:
import pandas as pd
import numpy as np
# Raise pandas' display limits (rows, columns, line width) for the frames shown below.
pd.options.display.max_rows = 500
pd.options.display.max_columns = 500
pd.options.display.width = 1000

In [2]:
# Load the LendingClub data; the first CSV column is used as the row index.
# A forward slash is used as the path separator because it resolves on Windows,
# Linux and macOS alike, whereas a backslash is only a separator on Windows.
loans = pd.read_csv("Data/lending-club-data.csv", index_col = 0)
loans.head(2)

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,is_inc_v,issue_d,loan_status,pymnt_plan,url,desc,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,collections_12_mths_ex_med,mths_since_last_major_derog,policy_code,not_compliant,status,inactive_loans,bad_loans,emp_length_num,grade_num,sub_grade_num,delinq_2yrs_zero,pub_rec_zero,collections_12_mths_zero,short_emp,payment_inc_ratio,final_d,last_delinq_none,last_record_none,last_major_derog_none
0,1077501,1296599,5000,5000,4975,36 months,10.65,162.87,B,B2,,10+ years,RENT,24000.0,Verified,20111201T000000,Fully Paid,n,https://www.lendingclub.com/browse/loanDetail....,Borrower added on 12/22/11 > I need to upgra...,credit_card,Computer,860xx,AZ,27.65,0.0,19850101T000000,1.0,,,3.0,0.0,13648,83.7,9.0,f,0.0,0.0,5861.07,5831.78,5000.0,861.07,0.0,0.0,0.0,20150101T000000,171.62,,20150101T000000,0.0,,1,0,Fully Paid,1,0,11,5,0.4,1.0,1.0,1.0,0,8.1435,20141201T000000,1,1,1
1,1077430,1314167,2500,2500,2500,60 months,15.27,59.83,C,C4,Ryder,< 1 year,RENT,30000.0,Source Verified,20111201T000000,Charged Off,n,https://www.lendingclub.com/browse/loanDetail....,Borrower added on 12/22/11 > I plan to use t...,car,bike,309xx,GA,1.0,0.0,19990401T000000,5.0,,,3.0,0.0,1687,9.4,4.0,f,0.0,0.0,1008.71,1008.71,456.46,435.17,0.0,117.08,1.11,20130401T000000,119.66,,20130901T000000,0.0,,1,0,Charged Off,1,1,1,4,0.8,1.0,1.0,1.0,1,2.3932,20161201T000000,1,1,1


In [3]:
# Recode the target: bad_loans == 0 becomes +1 (safe), anything else becomes -1 (risky).
loans['safe_loans'] = loans['bad_loans'].map(lambda bad_flag: -1 if bad_flag != 0 else +1)

In [4]:
target = 'safe_loans'
features = [
    'grade',           # grade of the loan
    'term',            # the term of the loan
    'home_ownership',  # home_ownership status: own, mortgage or rent
    'emp_length',      # number of years of employment
]

# Keep only the chosen feature columns followed by the target column.
loans = loans[[*features, target]]
loans.head(2)

Unnamed: 0,grade,term,home_ownership,emp_length,safe_loans
0,B,36 months,RENT,10+ years,1
1,C,60 months,RENT,< 1 year,-1


In [5]:
# Balancing the data
safe_loans_raw = loans[loans[target] == 1]
risky_loans_raw = loans[loans[target] == -1]

# There are fewer risky loans than safe loans, so undersample the safe loans
# down to exactly the number of risky loans (seeded for reproducibility).
# `percentage` is the risky/safe size ratio; it is informational only and is
# not what drives the sampling below.
percentage = len(risky_loans_raw)/float(len(safe_loans_raw))
safe_loans = safe_loans_raw.sample(n = len(risky_loans_raw), random_state = 1)
risky_loans = risky_loans_raw

# DataFrame.append was removed in pandas 2.0; pd.concat stacks the two frames
# in the same order (risky first, then safe) and keeps the original index labels.
loans_data = pd.concat([risky_loans, safe_loans])

print("Percentage of safe loans                 :", len(safe_loans) / float(len(loans_data)))
print("Percentage of risky loans                :", len(risky_loans) / float(len(loans_data)))
print("Total number of loans in our new dataset :", len(loans_data))

# Drop the intermediate frames; only loans_data is used from here on.
del safe_loans_raw, risky_loans_raw, risky_loans, safe_loans

Percentage of safe loans                 : 0.5
Percentage of risky loans                : 0.5
Total number of loans in our new dataset : 46300


In [6]:
# One-Hot-Encode
# Ask for uint8 indicator columns explicitly: that was the default before
# pandas 2.0, while newer versions produce booleans by default. The splitting
# code below looks up the values 0 and 1, so keep the 0/1 encoding stable
# across pandas versions. Only the generated dummy columns are affected.
loans_data = pd.get_dummies(loans_data, dtype = np.uint8).reset_index(drop = True)
loans_data.head(2)

Unnamed: 0,safe_loans,grade_A,grade_B,grade_C,grade_D,grade_E,grade_F,grade_G,term_ 36 months,term_ 60 months,home_ownership_MORTGAGE,home_ownership_OTHER,home_ownership_OWN,home_ownership_RENT,emp_length_1 year,emp_length_10+ years,emp_length_2 years,emp_length_3 years,emp_length_4 years,emp_length_5 years,emp_length_6 years,emp_length_7 years,emp_length_8 years,emp_length_9 years,emp_length_< 1 year
0,-1,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1
1,-1,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0


## Stopping Criteria
1. No improvement in accuracy
2. Minimum node size (number of data points in a node)
3. Max Depth

In [7]:
def reached_minimum_node_size(data, min_node_size):
    ''' Return True if the number of data points is less than or equal to the minimum node size.

    data          : any sized collection of data points (only len(data) is used)
    min_node_size : node size at or below which a node should not be split further
    '''
    # Return a real boolean. The raw difference len(data) - min_node_size is an
    # integer, which compares equal to True only when it happens to be exactly 1.
    return len(data) <= min_node_size

def error_reduction(error_before_split, error_after_split):
    ''' Compute how much a split lowers the error: error before minus error after '''
    reduction = error_before_split - error_after_split
    return reduction

In [8]:
def intermediate_node_num_mistakes(df):
    ''' Count the mistakes a majority-class prediction makes at a node.

    df holds the node's target labels (+1 or -1). The result is the size of
    the smaller of the two classes, or 0 for an empty node.
    '''
    # An empty node has nothing to misclassify.
    if len(df) == 0:
        return 0

    num_safe = (df == +1).sum()
    num_risky = (df == -1).sum()

    # Predicting the majority class gets every minority-class point wrong;
    # on a tie the safe count is returned (it equals the risky count).
    return num_risky if num_safe > num_risky else num_safe

In [9]:
def best_splitting_feature(data, features, target):
    ''' Return the feature whose 0/1 split gives the lowest classification error.

    data     : DataFrame holding the binary (0/1) feature columns and the target column
    features : iterable of column names to consider as splitting features
    target   : name of the column holding the +1 / -1 labels

    For every candidate feature the rows are divided into feature == 0 (left)
    and feature == 1 (right); each side is scored with
    intermediate_node_num_mistakes and the summed mistakes are divided by the
    number of rows. The first feature reaching the lowest error wins.
    Returns None when `features` is empty.
    '''
    best_feature = None # Keep track of the best feature
    best_error = 10     # Keep track of the best error so far
    # Note: Since error is always <= 1, we should initialize it with something larger than 1.

    # Convert to float to make sure error gets computed correctly.
    num_data_points = float(len(data))

    # Only the labels are needed to count mistakes, so slice that column once
    # rather than copying whole sub-frames for every feature.
    labels = data[target]

    # Loop through each feature to consider splitting on that feature
    for feature in features:

        # The left split has all data points where the feature value is 0,
        # the right split has all data points where the feature value is 1.
        left_labels = labels[data[feature] == 0]
        right_labels = labels[data[feature] == 1]

        # Number of misclassified examples on each side of the split.
        left_mistakes = intermediate_node_num_mistakes(left_labels)
        right_mistakes = intermediate_node_num_mistakes(right_labels)

        # Error = (# of mistakes (left) + # of mistakes (right)) / (# of data points)
        error = (left_mistakes + right_mistakes) / num_data_points

        # Strict '<' keeps the earliest feature when several tie for the best error.
        if error < best_error:
            best_feature = feature
            best_error = error

    return best_feature # Return the best feature we found

In [10]:
def create_leaf(target_values):
    ''' Build a leaf node whose prediction is the majority class of target_values '''
    # Gather the +1 and the -1 labels that ended up in this node.
    positives = target_values[target_values == +1]
    negatives = target_values[target_values == -1]

    # Majority vote: +1 only when it strictly outnumbers -1, so a tie
    # (including an empty node) predicts -1.
    majority_class = 1 if len(positives) > len(negatives) else -1

    # A leaf has no splitting feature and no children.
    return {'splitting_feature' : None,
            'left' : None,
            'right' : None,
            'is_leaf': True,
            'prediction' : majority_class}

In [15]:
def decision_tree_create(data, features, target, current_depth = 0, 
                         max_depth = 10, min_node_size=1, 
                         min_error_reduction=0.0):
    ''' Recursively build a binary decision tree over 0/1 features.

    data                : DataFrame with the binary feature columns and the target column
    features            : list of feature column names still available for splitting
    target              : name of the column holding the +1 / -1 labels
    current_depth       : depth of the node being built (0 for the root)
    max_depth           : nodes at this depth or deeper become leaves
    min_node_size       : node-size threshold handed to reached_minimum_node_size
    min_error_reduction : a split must lower the classification error by more
                          than this amount, otherwise the node becomes a leaf

    Returns a nested dict with the keys 'is_leaf', 'prediction',
    'splitting_feature', 'left' (feature == 0) and 'right' (feature == 1).
    Progress is printed for every node visited.
    '''
    remaining_features = features[:] # Make a copy so the caller's list is not modified.
    
    target_values = data[target]
    print("--------------------------------------------------------------------")
    print("Subtree, depth = %s (%s data points)." % (current_depth, len(target_values)))
    
    # Stopping condition 1: All data points have the same target value
    # (an empty node also lands here, since it has zero mistakes).
    node_mistakes = intermediate_node_num_mistakes(target_values)
    if node_mistakes == 0:
        print("Stopping condition 1 reached. All data points have the same target value.")                
        return create_leaf(target_values)
    
    # Stopping condition 2: No more features to split on.
    if len(remaining_features) == 0:
        print("Stopping condition 2 reached. No remaining features.")                
        return create_leaf(target_values)    
    
    # Early stopping condition 1: Reached max depth limit.
    if current_depth >= max_depth:
        print("Early stopping condition 1 reached. Reached maximum depth.")
        return create_leaf(target_values)
    
    # Early stopping condition 2: Reached the minimum node size.
    if reached_minimum_node_size(data, min_node_size) == True:
        print("Early stopping condition 2 reached. Reached minimum node size.")
        return create_leaf(target_values)
    
    # Find the best splitting feature among those not yet used on this path.
    splitting_feature = best_splitting_feature(data, remaining_features, target)
    
    # Split on the best feature that we found. 
    left_split = data[data[splitting_feature] == 0]
    right_split = data[data[splitting_feature] == 1]
    
    # Early stopping condition 3: Minimum error reduction.
    # The node is non-empty here (an empty node stops at condition 1), so the
    # divisions below are safe.
    num_data_points = float(len(data))
    
    # Error before splitting: misclassified examples / total examples.
    error_before_split = node_mistakes / num_data_points
    
    # Error after splitting: misclassified examples in both groups / total
    # examples. The mistake counts are kept as raw counts and divided exactly
    # once; dividing them twice would shrink the post-split error and make
    # every split look better than it is.
    left_mistakes = intermediate_node_num_mistakes(left_split[target])
    right_mistakes = intermediate_node_num_mistakes(right_split[target])
    error_after_split = (left_mistakes + right_mistakes) / num_data_points
    
    # If the error reduction is LESS THAN OR EQUAL TO min_error_reduction, return a leaf.
    if error_reduction(error_before_split, error_after_split) <= min_error_reduction:
        print("Early stopping condition 3 reached. Minimum error reduction.")
        return create_leaf(target_values)
    
    # A feature is used at most once along any root-to-leaf path.
    remaining_features.remove(splitting_feature)
    print("Split on feature %s. (%s, %s)" % (
                      splitting_feature, len(left_split), len(right_split)))
    
    # Repeat (recurse) on left and right subtrees
    left_tree = decision_tree_create(left_split, remaining_features, target, 
                                     current_depth + 1, max_depth, min_node_size, min_error_reduction)        
    right_tree = decision_tree_create(right_split, remaining_features, target, 
                                      current_depth + 1, max_depth, min_node_size, min_error_reduction)
    
    return {'is_leaf'          : False, 
            'prediction'       : None,
            'splitting_feature': splitting_feature,
            'left'             : left_tree, 
            'right'            : right_tree}

In [17]:
# Every one-hot encoded column is a candidate feature, except the response variable.
features = list(loans_data.columns)
features.remove('safe_loans')

In [18]:
# Build a shallow tree (depth limit 2) on the balanced, one-hot encoded loans.
small_decision_tree = decision_tree_create(
    data=loans_data,
    features=features,
    target='safe_loans',
    max_depth=2,
    min_node_size=10,
    min_error_reduction=0.0,
)

--------------------------------------------------------------------
Subtree, depth = 0 (46300 data points).
Split on feature term_ 36 months. (11671, 34629)
--------------------------------------------------------------------
Subtree, depth = 1 (11671 data points).
Split on feature grade_A. (11515, 156)
--------------------------------------------------------------------
Subtree, depth = 2 (11515 data points).
Early stopping condition 1 reached. Reached maximum depth.
--------------------------------------------------------------------
Subtree, depth = 2 (156 data points).
Early stopping condition 1 reached. Reached maximum depth.
--------------------------------------------------------------------
Subtree, depth = 1 (34629 data points).
Split on feature grade_D. (28778, 5851)
--------------------------------------------------------------------
Subtree, depth = 2 (28778 data points).
Early stopping condition 1 reached. Reached maximum depth.
-------------------------------------------

In [None]:
def get_best_feature_split(feature_list, df, target='safe_loans'):
    ''' Return the feature whose 0/1 split classifies the most rows correctly.

    feature_list : iterable of binary (0/1) feature column names to evaluate
    df           : DataFrame containing those columns and the target column
    target       : name of the label column (defaults to 'safe_loans')

    For each feature, the rows are cross-tabulated by target and feature
    value; predicting the majority class inside each feature value gets
    max(count) rows right, and accuracy is the sum of those maxima over the
    values 0 and 1 divided by the number of rows. The best feature and its
    accuracy are printed, and the feature name is returned. When no feature
    scores above 0 (empty feature list or empty frame), the initial value []
    is returned.
    '''
    accuracy, best_feature = 0, []
    num_rows = df.shape[0]

    # An empty frame has nothing to score (and would divide by zero), so the
    # loop is skipped and the initial values are reported.
    if num_rows > 0:
        for featr in feature_list:
            counts = pd.crosstab(df[target], df[featr])
            # Add up the majority-class count for whichever of the values 0 / 1
            # actually occur; a feature with neither contributes 0 instead of
            # failing on a missing crosstab column.
            num_correct = sum(counts[value].max() for value in (0, 1)
                              if value in counts.columns)
            featr_accuracy = num_correct / num_rows
            # Strict '>' keeps the earliest feature when several tie.
            if featr_accuracy > accuracy:
                accuracy, best_feature = featr_accuracy, featr

    print(f"Best Feature : {best_feature}, Best Accuracy = {accuracy}")
    return best_feature

In [None]:
# Candidate features are every column after the first one (the first column holds the target).
all_features = loans_data.columns[1:].tolist()
best_feature = get_best_feature_split(all_features, loans_data)

In [None]:
# Partition the rows by the chosen feature: value 0 goes to df_tree1, value 1 to df_tree2.
df_tree1 = loans_data.loc[loans_data[best_feature] == 0]
df_tree2 = loans_data.loc[loans_data[best_feature] == 1]

In [None]:
# Find the best second-level split inside each branch of the first split.
# Each call prints its own result; the notebook additionally displays the
# value returned by the last call.
get_best_feature_split(all_features, df_tree1)
get_best_feature_split(all_features, df_tree2)