In [22]:
import pandas as pd
import numpy as np
# Widen pandas' display limits so large frames are shown without truncation.
pd.set_option(
    'display.max_rows', 500,
    'display.max_columns', 500,
    'display.width', 1000,
)

In [24]:
# Load the LendingClub data; the first CSV column becomes the row index.
# A forward-slash path works on every OS (the previous "Data\..." form only resolved on Windows).
loans = pd.read_csv("Data/lending-club-data.csv", index_col = 0)

# Recode the label: bad_loans == 0 -> +1 (safe), anything else -> -1 (risky).
loans['safe_loans'] = np.where(loans['bad_loans'] == 0, +1, -1)

features = ['grade',              # grade of the loan
            'term',               # the term of the loan
            'home_ownership',     # home_ownership status: own, mortgage or rent
            'emp_length']         # number of years of employment
target = 'safe_loans'

# Keep only the modelling columns plus the label.
loans = loans[features + [target]]

In [25]:
# Balancing the data
safe_loans_raw = loans[loans[target] == 1]
risky_loans_raw = loans[loans[target] == -1]

# There are fewer risky loans than safe loans, so the safe loans are undersampled
# down to exactly the number of risky loans (fixed seed for reproducibility).
# `percentage` is the risky/safe size ratio; it is informational only and does
# not drive the sampling below.
percentage = len(risky_loans_raw)/float(len(safe_loans_raw))
safe_loans = safe_loans_raw.sample(n = len(risky_loans_raw), random_state = 1)
risky_loans = risky_loans_raw

# DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows in the
# same order (risky first, then safe).
loans_data = pd.concat([risky_loans, safe_loans])

print("Percentage of safe loans                 :", len(safe_loans) / float(len(loans_data)))
print("Percentage of risky loans                :", len(risky_loans) / float(len(loans_data)))
print("Total number of loans in our new dataset :", len(loans_data))

# Drop the intermediates; only `loans_data` is used from here on.
del safe_loans_raw, risky_loans_raw, risky_loans, safe_loans

Percentage of safe loans                 : 0.5
Percentage of risky loans                : 0.5
Total number of loans in our new dataset : 46300


In [26]:
# One-hot encode the categorical columns, then renumber the rows from 0.
loans_data = loans_data.pipe(pd.get_dummies).reset_index(drop = True)
loans_data.head(n = 2)

Unnamed: 0,safe_loans,grade_A,grade_B,grade_C,grade_D,grade_E,grade_F,grade_G,term_ 36 months,term_ 60 months,home_ownership_MORTGAGE,home_ownership_OTHER,home_ownership_OWN,home_ownership_RENT,emp_length_1 year,emp_length_10+ years,emp_length_2 years,emp_length_3 years,emp_length_4 years,emp_length_5 years,emp_length_6 years,emp_length_7 years,emp_length_8 years,emp_length_9 years,emp_length_< 1 year
0,-1,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1
1,-1,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0


In [27]:
# Every column except the label is a candidate splitting feature.
features = list(loans_data.columns)
del features[features.index('safe_loans')]

In [71]:
def get_tree_weight(df):
    ''' Return individual tree weight

    df : DataFrame with a 'safe_loans' column (true labels) and a
         'predictions' column (the tree's predicted labels).

    Returns 1/2 * ln((1 - error) / error), where error is the unweighted
    fraction of rows the tree got wrong. The weight is positive when the tree
    is right more often than wrong and negative otherwise. A tree with zero
    mistakes yields +inf and a tree that is always wrong yields -inf (numpy
    emits a RuntimeWarning in both cases).
    '''
    # Fraction of rows where the prediction disagrees with the label.
    # (Previously the denominator referenced an undefined name, `output`.)
    error_rate = (df['safe_loans'] != df['predictions']).mean()
    weight = 1/2*np.log((1 - error_rate)/error_rate)
    return weight

def get_data_weight(df, tree_weight = None):
    ''' Return updated weights to data based on misclassification

    df          : one row (anything indexable by 'safe_loans', 'predictions'
                  and 'weights'), e.g. what DataFrame.apply(..., axis = 1) passes.
    tree_weight : weight of the tree that produced 'predictions'. When given,
                  the AdaBoost update is applied: the row weight is multiplied
                  by exp(-tree_weight) if the row was classified correctly and
                  by exp(+tree_weight) otherwise.
                  When omitted, the row's own 'weights' value is used as the
                  exponent, which is what this function did before the
                  parameter existed.
                  NOTE(review): that fallback does not look like a boosting
                  update - confirm and pass tree_weight from the caller.

    Returns the new (un-normalised) weight for the row.
    '''
    exponent = df['weights'] if tree_weight is None else tree_weight
    if df['safe_loans'] == df['predictions']:
        # Correctly classified: shrink the weight.
        return df['weights']*np.exp(-exponent)
    else:
        # Misclassified: grow the weight.
        return df['weights']*np.exp(exponent)

# Apply the per-row weight update and store the result in a 'new_weight' column.
# NOTE(review): `junk` is not defined in any cell visible in this notebook -
# presumably a scratch DataFrame with 'safe_loans', 'predictions' and 'weights'
# columns. Define it (or drop this line) so the notebook survives Restart & Run All.
junk['new_weight'] = junk.apply(get_data_weight, axis = 1)

In [7]:
# Helper Functions
def intermediate_node_mistakes(df):
    ''' Count the points a majority-class prediction gets wrong at a node.

    df : the label values (+1 / -1) of the points that reached the node.
    Returns 0 for an empty node, otherwise the size of the smaller class.
    '''
    if len(df) == 0:
        return 0

    n_safe = (df == +1).sum()
    n_risky = (df == -1).sum()

    # The node predicts its majority class, so the minority count is the error.
    return n_risky if n_safe > n_risky else n_safe

In [8]:
def best_feature_split(data, features, target = 'safe_loans'):
    ''' Return best feature at a decision stump

    data     : DataFrame holding the 0/1 feature columns and the label column.
    features : names of the candidate feature columns.
    target   : name of the label column (defaults to 'safe_loans', the column
               this function always used before the parameter existed).

    For every candidate the rows are split on feature == 0 / feature == 1, each
    side predicts its majority class, and the resulting classification error is
    computed. The feature with the lowest error is returned; on ties the
    earliest feature in `features` wins. An empty list is returned when there
    are no candidate features or `data` has no rows.
    '''
    best_error = float('inf')
    best_feature = []               # "nothing found" value, unchanged from before
    num_rows = len(data)

    # A split of zero rows has no defined error rate (would divide by zero).
    if num_rows == 0:
        return best_feature

    for feature in features:
        left_labels = data.loc[data[feature] == 0, target]
        right_labels = data.loc[data[feature] == 1, target]

        mistakes = (intermediate_node_mistakes(left_labels)
                    + intermediate_node_mistakes(right_labels))
        error = mistakes/num_rows

        # Strict '<' keeps the first feature among equally good ones.
        if error < best_error:
            best_error, best_feature = error, feature
    return best_feature

In [61]:
def get_final_predictions(weigths_all, predictions_all):
    ''' Return the weight-combined score of the individual tree predictions.

    weigths_all     : one weight per tree (parameter spelling kept as-is so
                      existing keyword callers do not break).
    predictions_all : per-tree predictions; its first axis must line up with
                      the weights for np.dot to apply.

    Returns np.dot(weigths_all, predictions_all). The result is the raw
    weighted sum; no sign/threshold is applied here.
    '''
    # The body previously referenced `weight_all` / `prediction_all`, which are
    # not the parameter names and raised NameError.
    return np.dot(weigths_all, predictions_all)

In [None]:
def get_tree_prediction():
    ''' Placeholder: the original cell held only an unfinished `def` line.

    Kept as a syntactically valid stub so the notebook can be parsed and run;
    calling it fails loudly until the real implementation is written.
    '''
    raise NotImplementedError("get_tree_prediction is not implemented yet")

In [25]:
def generate_boosted_tree(data, features, target, num_trees = 2, max_depth = 1):
    ''' Fit `num_trees` decision stumps and return the last one as a tree dict.

    data      : DataFrame with 0/1 feature columns and the label column.
    features  : names of the candidate feature columns (must be non-empty).
    target    : name of the label column.
    num_trees : number of stumps to fit (must be >= 1).
    max_depth : accepted for interface compatibility; currently unused because
                only depth-1 stumps are built.

    Returns a dict with keys 'is_leaf', 'prediction', 'splitting_feature',
    'left' and 'right', where 'left' / 'right' are leaf dicts for the rows with
    feature == 0 / feature == 1.

    Raises ValueError when num_trees < 1 or `features` is empty.

    TODO(review): this is still scaffolding, not a full boosting routine.
    Per-row data weights are not tracked or updated between rounds, tree
    weights are not computed, and the per-round predictions are collected but
    not returned - so every round currently fits the same stump.
    '''
    if num_trees < 1:
        raise ValueError("num_trees must be at least 1")
    if len(features) == 0:
        raise ValueError("features must not be empty")

    weights_all = []    # TODO: append each round's tree weight once it is computed
    # One column of per-row predictions for every stump.
    prediction_all = np.zeros(shape = (len(data), num_trees))

    for tree in range(num_trees):
        splitting_feature = best_feature_split(data, features)

        goes_left = data[splitting_feature] == 0
        goes_right = data[splitting_feature] == 1

        # Each side of the stump predicts its own majority class.
        left_tree = create_leaf(data.loc[goes_left, target])
        right_tree = create_leaf(data.loc[goes_right, target])

        # Per-row prediction of this stump, stored in this round's column.
        prediction = np.where(goes_left, left_tree['prediction'], right_tree['prediction'])
        prediction_all[:, tree] = prediction

    return {'is_leaf'          : False,
        'prediction'       : None,
        'splitting_feature': splitting_feature,
        'left'             : left_tree,
        'right'            : right_tree}

In [26]:
def create_leaf(target_values):
    ''' Build a leaf node that predicts the majority class of `target_values`.

    target_values : label values (+1 / -1) of the points in the leaf.
    The leaf predicts +1 only when safe loans strictly outnumber risky ones;
    ties and empty input fall to -1.
    '''
    n_safe = (target_values == +1).sum()
    n_risky = (target_values == -1).sum()
    majority = +1 if n_safe > n_risky else -1

    return {'is_leaf'          : True,
            'prediction'       : majority,
            'splitting_feature': None,
            'left'             : None,
            'right'            : None}

In [27]:
# Fit a depth-limited tree on the one-hot encoded loans.
# NOTE(review): `generate_tree` is not defined in any cell visible in this
# notebook; the only builder defined here is `generate_boosted_tree`, whose
# signature has no min_node_size / min_error_reduction parameters. The logged
# output below presumably comes from an earlier kernel session - define or
# import `generate_tree` so this cell survives Restart & Run All.
small_decision_tree = generate_tree(loans_data, features, 'safe_loans', max_depth = 4, 
                                        min_node_size = 10, min_error_reduction=0.0)

--------------------------------------------------------------------
Subtree, depth = 0 (46300 data points).
term_ 36 months
--------------------------------------------------------------------
Subtree, depth = 1 (11671 data points).
grade_A
--------------------------------------------------------------------
Subtree, depth = 2 (11515 data points).
grade_B
--------------------------------------------------------------------
Subtree, depth = 3 (10167 data points).
grade_C
--------------------------------------------------------------------
Subtree, depth = 4 (7369 data points).
Early stopping condition 1 reached. Reached maximum depth.
--------------------------------------------------------------------
Subtree, depth = 4 (2798 data points).
Early stopping condition 1 reached. Reached maximum depth.
--------------------------------------------------------------------
Subtree, depth = 3 (1348 data points).
emp_length_3 years
---------------------------------------------------------------

In [28]:
def classify(tree, x):
    ''' Return the prediction of the leaf that `x` ends up in.

    tree : nested dict with keys 'is_leaf', 'prediction', 'splitting_feature',
           'left' and 'right'.
    x    : mapping from feature name to value for a single data point.
    A feature value of 0 follows the 'left' branch; anything else goes 'right'.
    '''
    node = tree
    while not node['is_leaf']:
        went_left = x[node['splitting_feature']] == 0
        node = node['left'] if went_left else node['right']
    return node['prediction']

In [30]:
# Classify every row with the fitted tree. As before, column 0 is left out of
# the row that is handed to classify().
# The whole column is built and assigned in one step: the previous
# loans_data['prediction'][i] = ... form is chained assignment, which raises
# SettingWithCopyWarning and does not write to the frame under pandas Copy-on-Write.
loans_data['prediction'] = [
    classify(small_decision_tree, row)
    for _, row in loans_data.iloc[:, 1:].iterrows()
]

In [31]:
# How many rows were predicted as each class.
loans_data.loc[:, 'prediction'].value_counts()

-1    27788
 1    18512
Name: prediction, dtype: int64

In [32]:
# Accuracy on the training data: share of rows whose prediction matches the label.
(loans_data['prediction'] == loans_data['safe_loans']).mean()

0.609805615550756

## Notes:
1. The recursive structure of building the tree is critical.   
2. The above example examines only categorical features for simplicity. 
3. Remember the three stopping criteria. Always handle edge cases in code (e.g., when all target values belong to the same class from the start).
4. Early stopping affects how much the tree overfits. This can be checked by trying low, medium and high values for each criterion. Model complexity can be measured by the number of leaf nodes.