# Lending Club

### Model: Decision Tree Classifier

### Framework: sklearn

In [1]:
import pandas as pd
import numpy as np

# Read the loan records from a CSV file in the current working directory.
# NOTE(review): the saved output of this cell ends with what looks like the tail of a
# warning message; if it is a pandas DtypeWarning (mixed column types), consider
# passing dtype=... or low_memory=False here — confirm by re-running the cell.
loans = pd.read_csv("lending-club-data.csv")
# Show the column labels that are available in the raw table.
loans.columns

  interactivity=interactivity, compiler=compiler, result=result)


Index(['id', 'member_id', 'loan_amnt', 'funded_amnt', 'funded_amnt_inv',
       'term', 'int_rate', 'installment', 'grade', 'sub_grade', 'emp_title',
       'emp_length', 'home_ownership', 'annual_inc', 'is_inc_v', 'issue_d',
       'loan_status', 'pymnt_plan', 'url', 'desc', 'purpose', 'title',
       'zip_code', 'addr_state', 'dti', 'delinq_2yrs', 'earliest_cr_line',
       'inq_last_6mths', 'mths_since_last_delinq', 'mths_since_last_record',
       'open_acc', 'pub_rec', 'revol_bal', 'revol_util', 'total_acc',
       'initial_list_status', 'out_prncp', 'out_prncp_inv', 'total_pymnt',
       'total_pymnt_inv', 'total_rec_prncp', 'total_rec_int',
       'total_rec_late_fee', 'recoveries', 'collection_recovery_fee',
       'last_pymnt_d', 'last_pymnt_amnt', 'next_pymnt_d', 'last_credit_pull_d',
       'collections_12_mths_ex_med', 'mths_since_last_major_derog',
       'policy_code', 'not_compliant', 'status', 'inactive_loans', 'bad_loans',
       'emp_length_num', 'grade_num', 'sub_gra

In [3]:
# Print the (rows, columns) shape, then display the first rows as a quick sanity check.
print(loans.shape)
loans.head()

(122607, 68)


Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,...,sub_grade_num,delinq_2yrs_zero,pub_rec_zero,collections_12_mths_zero,short_emp,payment_inc_ratio,final_d,last_delinq_none,last_record_none,last_major_derog_none
0,1077501,1296599,5000,5000,4975,36 months,10.65,162.87,B,B2,...,0.4,1.0,1.0,1.0,0,8.1435,20141201T000000,1,1,1
1,1077430,1314167,2500,2500,2500,60 months,15.27,59.83,C,C4,...,0.8,1.0,1.0,1.0,1,2.3932,20161201T000000,1,1,1
2,1077175,1313524,2400,2400,2400,36 months,15.96,84.33,C,C5,...,1.0,1.0,1.0,1.0,0,8.25955,20141201T000000,1,1,1
3,1076863,1277178,10000,10000,10000,36 months,13.49,339.31,C,C1,...,0.2,1.0,1.0,1.0,0,8.27585,20141201T000000,0,1,1
4,1075269,1311441,5000,5000,5000,36 months,7.9,156.46,A,A4,...,0.8,1.0,1.0,1.0,0,5.21533,20141201T000000,1,1,1


### Preprocessing

In [4]:
# Build the prediction label from the raw 'bad_loans' column:
#   bad_loans == 0  ->  safe_loans = +1 (safe)
#   anything else   ->  safe_loans = -1 (risky)
loans['safe_loans'] = loans['bad_loans'].eq(0).map({True: 1, False: -1})

# The original column is redundant once the label exists, so remove it.
loans = loans.drop(columns='bad_loans')

In [16]:
# Class balance of the label column.
m = loans.shape[0]  # number of examples

# Fraction of rows labelled +1; the remainder are labelled risky.
safe_loans_perc = loans['safe_loans'].eq(1).sum() / m
risky_loans_perc = 1 - safe_loans_perc

# Both values are fractions in [0, 1]; scale by 100 only for display.
print("Percentage of safe loans = {}".format(safe_loans_perc * 100))
print("Percentage of risky loans = {}".format(risky_loans_perc * 100))

Percentage of safe loans = 81.11853319957262
Percentage of risky loans = 18.881466800427383


In [17]:
# Prediction target (y): +1 means safe, -1 means risky.
target = 'safe_loans'

# Input columns used by every model in this notebook (order matters for the
# column layout of the one-hot encoded frame built later).
features = [
    # loan grading
    'grade',                  # grade of the loan
    'sub_grade',              # sub-grade of the loan
    # employment and housing
    'short_emp',              # one year or less of employment
    'emp_length_num',         # number of years of employment
    'home_ownership',         # home ownership status: own, mortgage or rent
    # loan characteristics
    'dti',                    # debt-to-income ratio
    'purpose',                # the purpose of the loan
    'term',                   # the term of the loan
    # credit history
    'last_delinq_none',       # has the borrower had a delinquency
    'last_major_derog_none',  # has the borrower had a 90-day or worse rating
    'revol_util',             # percent of available credit being used
    'total_rec_late_fee',     # total late fees received to date
]

In [18]:
# Keep only the chosen input columns plus the label, in that order.
loans = loans.loc[:, features + [target]]
loans.head()

Unnamed: 0,grade,sub_grade,short_emp,emp_length_num,home_ownership,dti,purpose,term,last_delinq_none,last_major_derog_none,revol_util,total_rec_late_fee,safe_loans
0,B,B2,0,11,RENT,27.65,credit_card,36 months,1,1,83.7,0.0,1
1,C,C4,1,1,RENT,1.0,car,60 months,1,1,9.4,0.0,-1
2,C,C5,0,11,RENT,8.72,small_business,36 months,1,1,98.5,0.0,1
3,C,C1,0,11,RENT,20.0,other,36 months,0,1,21.0,16.97,1
4,A,A4,0,4,RENT,11.2,wedding,36 months,1,1,28.3,0.0,1


In [19]:
import json

# The train / validation split is fixed by two JSON files holding row positions
# (they are used with .iloc below, so they must be positional indices).
with open("module-5-assignment-1-train-idx.json") as json_file:
    train_idx = json.load(json_file)

with open("module-5-assignment-1-validation-idx.json") as json_file:
    val_idx = json.load(json_file)

# One-hot encode the string-valued columns. dtype=np.uint8 pins the indicator
# columns to 0/1 integers: that was the pandas default before 2.0, whereas newer
# versions default to bool, which would turn the `.values` matrices built from
# this frame into object arrays and no longer match the 0/1 output shown below.
loan_onehot = pd.get_dummies(loans, dtype=np.uint8)

# Select the rows of each split by position.
train_data = loan_onehot.iloc[train_idx]
validation_data = loan_onehot.iloc[val_idx]

print(train_data.shape)
print(validation_data.shape)

train_data.head()

(37224, 68)
(9284, 68)


Unnamed: 0,short_emp,emp_length_num,dti,last_delinq_none,last_major_derog_none,revol_util,total_rec_late_fee,safe_loans,grade_A,grade_B,...,purpose_house,purpose_major_purchase,purpose_medical,purpose_moving,purpose_other,purpose_small_business,purpose_vacation,purpose_wedding,term_ 36 months,term_ 60 months
1,1,1,1.0,1,1,9.4,0.0,-1,0,0,...,0,0,0,0,0,0,0,0,0,1
6,0,5,5.55,1,1,32.6,0.0,-1,0,0,...,0,0,0,0,0,1,0,0,0,1
7,1,1,18.08,1,1,36.5,0.0,-1,0,1,...,0,0,0,0,1,0,0,0,0,1
10,1,1,10.08,1,1,91.7,0.0,-1,0,0,...,0,0,0,0,0,0,0,0,1,0
12,0,4,7.06,1,1,55.5,0.0,-1,0,1,...,0,0,0,0,1,0,0,0,1,0


In [20]:
# Split each frame into a feature matrix (every column except the label)
# and a label vector, both as plain NumPy arrays.
X_train = train_data.drop(columns='safe_loans').to_numpy()
Y_train = train_data['safe_loans'].to_numpy()

X_val = validation_data.drop(columns='safe_loans').to_numpy()
Y_val = validation_data['safe_loans'].to_numpy()

### Building Model and Training

In [21]:
from sklearn.tree import DecisionTreeClassifier

## Model 1 - depth 6

In [22]:
# Depth-6 decision tree. random_state is fixed because scikit-learn permutes the
# candidate features at every split, so equally good splits can otherwise be
# chosen differently from run to run, changing the fitted tree and its score.
decision_tree_model = DecisionTreeClassifier(max_depth=6, random_state=0)
decision_tree_model.fit(X_train, Y_train)

# .score() reports mean accuracy on the given data (here: the validation split).
print("Score for model with depth 6: {}".format(decision_tree_model.score(X_val, Y_val)))

Score for model with depth 6: 0.6361482119775959


## Model 2 - depth 2

In [23]:
# Depth-2 decision tree, a deliberately simple model for comparison. random_state
# is fixed because scikit-learn permutes the candidate features at every split,
# so ties between equally good splits would otherwise vary between runs.
small_model = DecisionTreeClassifier(max_depth=2, random_state=0)
small_model.fit(X_train, Y_train)

# .score() reports mean accuracy on the given data (here: the validation split).
print("Score for model with depth 2: {}".format(small_model.score(X_val, Y_val)))

Score for model with depth 2: 0.6193451098664369


#### Create Sample Data

In [24]:
# Partition the validation rows by label (+1 safe, -1 risky).
validation_safe_loans = validation_data[validation_data[target] == 1]
validation_risky_loans = validation_data[validation_data[target] == -1]

# Take the first two rows of each class as a tiny hand-inspectable sample.
sample_validation_data_risky = validation_risky_loans.iloc[0:2]
sample_validation_data_safe = validation_safe_loans.iloc[0:2]

# Stack safe rows first, then risky rows, keeping the original index labels.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
sample_validation_data = pd.concat(
    [sample_validation_data_safe, sample_validation_data_risky]
)
sample_validation_data

Unnamed: 0,short_emp,emp_length_num,dti,last_delinq_none,last_major_derog_none,revol_util,total_rec_late_fee,safe_loans,grade_A,grade_B,...,purpose_house,purpose_major_purchase,purpose_medical,purpose_moving,purpose_other,purpose_small_business,purpose_vacation,purpose_wedding,term_ 36 months,term_ 60 months
19,0,11,11.18,1,1,82.4,0.0,1,0,1,...,0,0,0,0,0,0,0,0,1,0
79,0,10,16.85,1,1,96.4,0.0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
24,0,3,13.97,0,1,59.5,0.0,-1,0,0,...,0,0,0,0,1,0,0,0,0,1
41,0,11,16.33,1,1,62.1,0.0,-1,1,0,...,0,0,0,0,0,0,0,0,1,0


In [25]:
# Feature matrix and label vector for the four sample rows, as NumPy arrays.
X_sample_val = sample_validation_data.drop(columns='safe_loans').to_numpy()
Y_sample_val = sample_validation_data['safe_loans'].to_numpy()

In [28]:
# Predict labels for the sample rows with the depth-6 tree.
sample_pred = decision_tree_model.predict(X_sample_val)

# Accuracy = share of predictions that match the true labels.
sample_acc = np.mean(sample_pred == Y_sample_val)
str(sample_acc * 100) + "%"

'50.0%'

In [32]:
# Class probabilities predicted by the depth-6 tree for the sample rows.
# predict_proba returns one column per class, ordered as decision_tree_model.classes_;
# with labels -1 / +1, column 1 is presumably P(safe) — verify against classes_.
sample_probs = decision_tree_model.predict_proba(X_sample_val)
sample_probs[:,1]

array([0.65843457, 0.46369354, 0.35249042, 0.79210526])

##### Using Smaller model:

In [35]:
# Same sample rows scored by the depth-2 tree; one probability column per class.
sample_probs2 = small_model.predict_proba(X_sample_val)

### Exporting the tree

In [36]:
from sklearn import tree

# Write the depth-2 tree to a Graphviz .dot file in the working directory.
tree.export_graphviz(small_model,out_file='tree.dot')

# Class probabilities for the two risky sample rows (positions 2 and 3).
# The model was fitted on a bare NumPy array, so pass `.values` here as well;
# handing it a DataFrame is inconsistent with the other calls and makes newer
# scikit-learn versions warn that X has feature names the model never saw.
small_model.predict_proba(sample_validation_data.iloc[[2,3]].drop(target, axis =1).values)

array([[0.59255339, 0.40744661],
       [0.23120112, 0.76879888]])

### Accuracies

In [37]:
# Mean accuracy of both trees on each split. For a classifier, model.score(X, Y)
# equals np.sum(np.equal(model.predict(X), Y)) / len(Y) on that same split.
depth6_train_acc = decision_tree_model.score(X_train, Y_train)
depth2_train_acc = small_model.score(X_train, Y_train)
depth6_val_acc = decision_tree_model.score(X_val, Y_val)
depth2_val_acc = small_model.score(X_val, Y_val)

# Training split first ...
print("Training Accuracy for model with depth 6: {}".format(depth6_train_acc))
print("Trainin Accuracy for model with depth 2: {}".format(depth2_train_acc))

# ... then the validation split.
print("Validation Accuracy for model with depth 6: {}".format(depth6_val_acc))
print("Validation Accuracy for model with depth 2: {}".format(depth2_val_acc))

Training Accuracy for model with depth 6: 0.6405276165914464
Trainin Accuracy for model with depth 2: 0.6135020416935311
Validation Accuracy for model with depth 6: 0.6361482119775959
Validation Accuracy for model with depth 2: 0.6193451098664369


## Model 3 - depth 10

In [38]:
# Depth-10 decision tree, the most flexible of the three models. random_state is
# fixed because scikit-learn permutes the candidate features at every split, so
# ties between equally good splits would otherwise vary between runs.
big_model = DecisionTreeClassifier(max_depth=10, random_state=0)
big_model.fit(X_train, Y_train)

# Compare accuracy on the data the tree was fitted on with accuracy on held-out
# data; score(X, Y) equals np.sum(np.equal(big_model.predict(X), Y)) / len(Y).
print("Training Accuracy for model with depth 10: {}".format(big_model.score(X_train, Y_train)))
print("Validation Accuracy for model with depth 10: {}".format(big_model.score(X_val, Y_val)))

Training Accuracy for model with depth 10: 0.6638459058671825
Validation Accuracy for model with depth 10: 0.6261309780267126
