In [223]:
# Ignore warnings
# NOTE(review): filterwarnings('ignore') with no category or module filter
# suppresses every warning for the rest of the session, so deprecation
# notices raised by later cells will not be shown.
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
from sklearn.cross_validation import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn import tree
import json

In [224]:
# Read the loan records from the CSV file in the working directory.
loans = pd.read_csv(filepath_or_buffer='lending-club-data.csv')
# Cell value: True when the raw label column is present in the frame.
'bad_loans' in loans

True

In [225]:
# Re-encode the label: rows where bad_loans == 0 become +1, every other row
# becomes -1.
loans['safe_loans'] = (loans['bad_loans'] == 0).map({True: 1, False: -1})
# The original label column is no longer needed once it has been re-encoded.
loans = loans.drop(labels=['bad_loans'], axis=1)
# Cell value: False confirms the column was removed.
'bad_loans' in loans

False

In [226]:
# Count the rows in each class. int() keeps the counts as plain Python ints
# so the ratios below print as ordinary floats.
safe = int((loans['safe_loans'] == 1).sum())
risky = int((loans['safe_loans'] == -1).sum())
print(safe, risky)
# Share of each class among the rows labelled +1 or -1.
print(safe / (safe + risky))
print(risky / (safe + risky))

99457 23150
0.8111853319957262
0.18881466800427382


In [227]:
# Input columns used by the models.
features = [
    'grade',                  # grade of the loan
    'sub_grade',              # sub-grade of the loan
    'short_emp',              # one year or less of employment
    'emp_length_num',         # number of years of employment
    'home_ownership',         # home_ownership status: own, mortgage or rent
    'dti',                    # debt to income ratio
    'purpose',                # the purpose of the loan
    'term',                   # the term of the loan
    'last_delinq_none',       # has borrower had a delinquency
    'last_major_derog_none',  # has borrower had 90 day or worse rating
    'revol_util',             # percent of available credit being used
    'total_rec_late_fee',     # total late fees received to date
]

# Prediction target (y): +1 means safe, -1 is risky.
target = 'safe_loans'

# Keep only the target followed by the feature columns, in that order.
loans = loans[[target, *features]]
loans.shape

(122607, 13)

In [228]:
# Load the row-position lists for the train and validation splits.
# Each file is opened in a `with` block so its handle is closed as soon as
# the JSON has been parsed (the bare open(...).read() form never closed it).
with open('module-5-assignment-1-train-idx.json') as train_idx_file:
    train_idx = json.load(train_idx_file)
with open('module-5-assignment-1-validation-idx.json') as validation_idx_file:
    validation_idx = json.load(validation_idx_file)
print(type(validation_idx))

<class 'list'>


In [229]:
# Expand the frame with pandas' dummy/indicator encoding; get_dummies leaves
# the row count unchanged, which the print below confirms.
loans = pd.get_dummies(data=loans)
print(loans.shape[0])

122607


In [274]:
# Pick the train and validation rows by integer position, using the index
# lists held in train_idx / validation_idx.
train_data = loans.iloc[train_idx, :]
validation_data = loans.iloc[validation_idx, :]
print(*(subset.shape for subset in (train_data, validation_data)))

(37224, 68) (9284, 68)


In [275]:
# Build a four-row sample from the validation set: the first two rows labelled
# +1 followed by the first two rows labelled -1.
validation_safe_loans = validation_data[validation_data[target] == 1]
validation_risky_loans = validation_data[validation_data[target] == -1]
sample_validation_data_risky = validation_risky_loans.iloc[0:2]
sample_validation_data_safe = validation_safe_loans.iloc[0:2]
print(sample_validation_data_safe.shape, sample_validation_data_risky.shape)

# pd.concat replaces DataFrame.append, which was removed in pandas 2.0;
# the row order (safe rows first, then risky rows) is unchanged.
sample_validation_data = pd.concat(
    [sample_validation_data_safe, sample_validation_data_risky]
)

# Split the label off from the features. `axis` is passed by keyword because
# the positional form drop(label, 1) is no longer accepted by pandas 2.0.
sample_validation_target = sample_validation_data[target]
sample_validation_data = sample_validation_data.drop(target, axis=1)
sample_validation_data

(2, 68) (2, 68)


Unnamed: 0,short_emp,emp_length_num,dti,last_delinq_none,last_major_derog_none,revol_util,total_rec_late_fee,grade_A,grade_B,grade_C,...,purpose_house,purpose_major_purchase,purpose_medical,purpose_moving,purpose_other,purpose_small_business,purpose_vacation,purpose_wedding,term_ 36 months,term_ 60 months
19,0,11,11.18,1,1,82.4,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
79,0,10,16.85,1,1,96.4,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
24,0,3,13.97,0,1,59.5,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
41,0,11,16.33,1,1,62.1,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [276]:
# Separate labels from features and convert both splits to NumPy arrays.
# `.values` replaces `.as_matrix()`, which was removed in pandas 1.0, and
# `axis` is passed by keyword because the positional form was removed in
# pandas 2.0. The targets keep their original (n_rows, 1) column shape.
train_target = train_data['safe_loans'].values
train_target = train_target.reshape(len(train_target), 1)
validation_target = validation_data['safe_loans'].values
validation_target = validation_target.reshape(len(validation_target), 1)

# Remove the label column from the feature frames and confirm it is gone.
train_data = train_data.drop('safe_loans', axis=1)
print('safe_loans' in train_data.columns)
validation_data = validation_data.drop('safe_loans', axis=1)

# NOTE: this cell rebinds train_data / validation_data from DataFrames to
# arrays, so it cannot be re-run without first re-running the split cell.
train_data = train_data.values
validation_data = validation_data.values

False


In [277]:
# Deeper tree (max_depth=6) fitted on the training arrays.
decision_tree_model = DecisionTreeClassifier(
    criterion="gini",
    max_depth=6,
    min_samples_leaf=5,
    random_state=100,
)
# fit() returns the estimator itself, so model1 and decision_tree_model are
# the same fitted object.
model1 = decision_tree_model.fit(X=train_data, y=train_target)

# Shallow tree (max_depth=2) with otherwise identical settings.
small_model = DecisionTreeClassifier(
    criterion="gini",
    max_depth=2,
    min_samples_leaf=5,
    random_state=100,
)
model2 = small_model.fit(X=train_data, y=train_target)

In [278]:
# Predict the four sample rows with both trees and score the deeper tree
# against the true sample labels.
prediction_model1 = model1.predict(sample_validation_data)
prediction_model2 = model2.predict(sample_validation_data)
score_model1 = model1.score(sample_validation_data, sample_validation_target)

# One item per line: accuracy, true labels, then each model's predictions.
print(
    score_model1,
    sample_validation_target,
    prediction_model1,
    prediction_model2,
    sep='\n',
)

0.5
19    1
79    1
24   -1
41   -1
Name: safe_loans, dtype: int64
[ 1 -1 -1  1]
[ 1 -1 -1  1]


In [281]:
# Accuracy of each tree on the training arrays (model1 first, then model2).
score_train1, score_train2 = (
    fitted.score(train_data, train_target) for fitted in (model1, model2)
)
print(score_train1, score_train2)

# Accuracy of each tree on the validation arrays, in the same order.
score_validation1, score_validation2 = (
    fitted.score(validation_data, validation_target) for fitted in (model1, model2)
)
print(score_validation1, score_validation2)

0.640500752203 0.613502041694
0.63517880224 0.619345109866


In [280]:
# Per-class probabilities for the four sample rows from each tree
# (model1 first, then model2); each array is printed on its own.
probability_model1, probability_model2 = (
    fitted.predict_proba(sample_validation_data) for fitted in (model1, model2)
)
print(probability_model1, probability_model2, sep='\n')

[[ 0.34156543  0.65843457]
 [ 0.53630646  0.46369354]
 [ 0.64750958  0.35249042]
 [ 0.20789474  0.79210526]]
[[ 0.41896585  0.58103415]
 [ 0.59255339  0.40744661]
 [ 0.59255339  0.40744661]
 [ 0.23120112  0.76879888]]
