In [1]:
import pandas as pd
import numpy as np

In [2]:
loans = pd.read_csv('lending-club-data.csv',low_memory=False)

In [3]:
print loans.columns

Index([u'id', u'member_id', u'loan_amnt', u'funded_amnt', u'funded_amnt_inv',
       u'term', u'int_rate', u'installment', u'grade', u'sub_grade',
       u'emp_title', u'emp_length', u'home_ownership', u'annual_inc',
       u'is_inc_v', u'issue_d', u'loan_status', u'pymnt_plan', u'url', u'desc',
       u'purpose', u'title', u'zip_code', u'addr_state', u'dti',
       u'delinq_2yrs', u'earliest_cr_line', u'inq_last_6mths',
       u'mths_since_last_delinq', u'mths_since_last_record', u'open_acc',
       u'pub_rec', u'revol_bal', u'revol_util', u'total_acc',
       u'initial_list_status', u'out_prncp', u'out_prncp_inv', u'total_pymnt',
       u'total_pymnt_inv', u'total_rec_prncp', u'total_rec_int',
       u'total_rec_late_fee', u'recoveries', u'collection_recovery_fee',
       u'last_pymnt_d', u'last_pymnt_amnt', u'next_pymnt_d',
       u'last_credit_pull_d', u'collections_12_mths_ex_med',
       u'mths_since_last_major_derog', u'policy_code', u'not_compliant',
       u'status', u'inactiv

In [4]:
print loans.bad_loans.value_counts()

0    99457
1    23150
Name: bad_loans, dtype: int64


In [5]:
loans.head()

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,...,sub_grade_num,delinq_2yrs_zero,pub_rec_zero,collections_12_mths_zero,short_emp,payment_inc_ratio,final_d,last_delinq_none,last_record_none,last_major_derog_none
0,1077501,1296599,5000,5000,4975,36 months,10.65,162.87,B,B2,...,0.4,1,1,1,0,8.1435,20141201T000000,1,1,1
1,1077430,1314167,2500,2500,2500,60 months,15.27,59.83,C,C4,...,0.8,1,1,1,1,2.3932,20161201T000000,1,1,1
2,1077175,1313524,2400,2400,2400,36 months,15.96,84.33,C,C5,...,1.0,1,1,1,0,8.25955,20141201T000000,1,1,1
3,1076863,1277178,10000,10000,10000,36 months,13.49,339.31,C,C1,...,0.2,1,1,1,0,8.27585,20141201T000000,0,1,1
4,1075269,1311441,5000,5000,5000,36 months,7.9,156.46,A,A4,...,0.8,1,1,1,0,5.21533,20141201T000000,1,1,1


In [6]:
loans.shape

(122607, 68)

In [7]:
loans['safe_loans'] = loans['bad_loans'].apply(lambda x : +1 if x==0 else -1)

In [8]:
len(loans[loans['safe_loans'] == +1]), len(loans[loans['safe_loans'] == -1])

(99457, 23150)

In [9]:
print "Safe loan percentage = " + str(len(loans[loans['safe_loans'] == +1]) * 100 / len(loans)) + '%'
print "Risky loan percentage = " + str(len(loans[loans['safe_loans'] == -1]) * 100 / len(loans)) + '%'

Safe loan percentage = 81%
Risky loan percentage = 18%


In [10]:
features = ['grade',                     # grade of the loan
            'sub_grade',                 # sub-grade of the loan
            'short_emp',                 # one year or less of employment
            'emp_length_num',            # number of years of employment
            'home_ownership',            # home_ownership status: own, mortgage or rent
            'dti',                       # debt to income ratio
            'purpose',                   # the purpose of the loan
            'term',                      # the term of the loan
            'last_delinq_none',          # has borrower had a delinquincy
            'last_major_derog_none',     # has borrower had 90 day or worse rating
            'revol_util',                # percent of available credit being used
            'total_rec_late_fee',        # total late fees received to day
           ]

target = 'safe_loans'                    # prediction target (y) (+1 means safe, -1 is risky)

# Extract the feature columns and target column
loans = loans[features + [target]]

In [11]:
train_index = pd.read_json('module-5-assignment-1-train-idx.json')
train_index.columns = ['indexvalue']
train_idx = train_index.indexvalue.tolist()

validation_index = pd.read_json('module-5-assignment-1-validation-idx.json')
validation_index.columns = ['indexvalue']
validation_idx = validation_index.indexvalue.tolist()

train_data = loans.iloc[train_idx]
validation_data = loans.iloc[validation_idx]

In [12]:
train_data.shape, validation_data.shape

((37224, 13), (9284, 13))

In [13]:
safe_loans_raw = loans[loans[target] == +1]
risky_loans_raw = loans[loans[target] == -1]
print "Number of safe loans  : %s" % len(safe_loans_raw)
print "Number of risky loans : %s" % len(risky_loans_raw)

Number of safe loans  : 99457
Number of risky loans : 23150


In [14]:
print train_data['safe_loans'].value_counts()
print validation_data['safe_loans'].value_counts()

 1    18748
-1    18476
Name: safe_loans, dtype: int64
-1    4674
 1    4610
Name: safe_loans, dtype: int64


In [15]:
train_data.head()

Unnamed: 0,grade,sub_grade,short_emp,emp_length_num,home_ownership,dti,purpose,term,last_delinq_none,last_major_derog_none,revol_util,total_rec_late_fee,safe_loans
1,C,C4,1,1,RENT,1.0,car,60 months,1,1,9.4,0,-1
6,F,F2,0,5,OWN,5.55,small_business,60 months,1,1,32.6,0,-1
7,B,B5,1,1,RENT,18.08,other,60 months,1,1,36.5,0,-1
10,C,C1,1,1,RENT,10.08,debt_consolidation,36 months,1,1,91.7,0,-1
12,B,B2,0,4,RENT,7.06,other,36 months,1,1,55.5,0,-1


In [16]:
for col in train_data.columns:
    print str(col) + ' ' + str(train_data[col].dtype)

grade object
sub_grade object
short_emp int64
emp_length_num int64
home_ownership object
dti float64
purpose object
term object
last_delinq_none int64
last_major_derog_none int64
revol_util float64
total_rec_late_fee float64
safe_loans int64


In [17]:
#from sklearn.preprocessing import OneHotEncoder, LabelEncoder
categorical_type = ['grade','sub_grade','home_ownership','purpose','term']

In [19]:
from sklearn.feature_extraction import DictVectorizer

train_cat = train_data[categorical_type].T.to_dict().values()

In [21]:
vect = DictVectorizer(sparse=False)
train_vector = vect.fit_transform(train_cat)

In [24]:
train_vector.shape, train_data.shape

((37224, 60), (37224, 13))

In [27]:
# Do the same for validation data
validation_cat = validation_data[categorical_type].T.to_dict().values()
vect2 = DictVectorizer(sparse=False)
validation_vector = vect2.fit_transform(validation_cat)

In [28]:
validation_vector.shape, validation_data.shape

((9284, 60), (9284, 13))

In [33]:
#Prepare output numpy array
y_train = train_data['safe_loans'].values
y_validation = validation_data['safe_loans'].values

array([[  1.  ,   1.  ,   1.  , ...,   1.  ,   9.4 ,   0.  ],
       [  0.  ,   5.  ,   5.55, ...,   1.  ,  32.6 ,   0.  ],
       [  1.  ,   1.  ,  18.08, ...,   1.  ,  36.5 ,   0.  ],
       ..., 
       [  0.  ,   4.  ,   7.57, ...,   1.  ,  34.4 ,   0.  ],
       [  0.  ,  11.  ,  26.27, ...,   0.  ,  94.1 ,   0.  ],
       [  0.  ,  11.  ,  11.26, ...,   0.  ,  15.2 ,   0.  ]])

In [35]:
#Merge/replace vector data into existing dataframe
train_num = train_data.drop(categorical_type + ['safe_loans'], axis=1)
X_train = np.concatenate((train_num.values, train_vector), axis=1)

In [37]:
X_train

array([[  1.  ,   1.  ,   1.  , ...,   0.  ,   0.  ,   1.  ],
       [  0.  ,   5.  ,   5.55, ...,   0.  ,   0.  ,   1.  ],
       [  1.  ,   1.  ,  18.08, ...,   0.  ,   0.  ,   1.  ],
       ..., 
       [  0.  ,   4.  ,   7.57, ...,   0.  ,   1.  ,   0.  ],
       [  0.  ,  11.  ,  26.27, ...,   0.  ,   0.  ,   1.  ],
       [  0.  ,  11.  ,  11.26, ...,   0.  ,   1.  ,   0.  ]])

In [38]:
#Merge/replace vector data into existing dataframe - for validation data
validation_num = validation_data.drop(categorical_type + ['safe_loans'], axis=1)
X_validation = np.concatenate((validation_num.values, validation_vector), axis=1)

In [39]:
X_validation.shape, validation_num.shape, validation_vector.shape

((9284, 67), (9284, 7), (9284, 60))

In [41]:
y_validation.shape, y_train.shape

((9284,), (37224,))

In [42]:
from sklearn.tree import DecisionTreeClassifier

decision_tree_model = DecisionTreeClassifier(max_depth=6)
small_model = DecisionTreeClassifier(max_depth=2)

In [43]:
decision_tree_model.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=6,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')

In [44]:
small_model.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=2,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')

In [45]:
from sklearn import tree
tree.export_graphviz(small_model, out_file='tree.dot')    

In [47]:
train_data.head()

Unnamed: 0,grade,sub_grade,short_emp,emp_length_num,home_ownership,dti,purpose,term,last_delinq_none,last_major_derog_none,revol_util,total_rec_late_fee,safe_loans
1,C,C4,1,1,RENT,1.0,car,60 months,1,1,9.4,0,-1
6,F,F2,0,5,OWN,5.55,small_business,60 months,1,1,32.6,0,-1
7,B,B5,1,1,RENT,18.08,other,60 months,1,1,36.5,0,-1
10,C,C1,1,1,RENT,10.08,debt_consolidation,36 months,1,1,91.7,0,-1
12,B,B2,0,4,RENT,7.06,other,36 months,1,1,55.5,0,-1


In [49]:
X_train[0,:]

array([ 1. ,  1. ,  1. ,  1. ,  1. ,  9.4,  0. ,  0. ,  0. ,  1. ,  0. ,
        0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  1. ,  1. ,  0. ,  0. ,  0. ,
        0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,
        0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  1. ,
        0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,
        0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,
        1. ])

In [50]:
validation_safe_loans = validation_data[validation_data[target] == 1]
validation_risky_loans = validation_data[validation_data[target] == -1]

sample_validation_data_risky = validation_risky_loans[0:2]
sample_validation_data_safe = validation_safe_loans[0:2]

sample_validation_data = sample_validation_data_safe.append(sample_validation_data_risky)
sample_validation_data

Unnamed: 0,grade,sub_grade,short_emp,emp_length_num,home_ownership,dti,purpose,term,last_delinq_none,last_major_derog_none,revol_util,total_rec_late_fee,safe_loans
19,B,B3,0,11,OWN,11.18,credit_card,36 months,1,1,82.4,0,1
79,D,D1,0,10,RENT,16.85,debt_consolidation,36 months,1,1,96.4,0,1
24,D,D2,0,3,RENT,13.97,other,60 months,0,1,59.5,0,-1
41,A,A5,0,11,MORTGAGE,16.33,debt_consolidation,36 months,1,1,62.1,0,-1


In [64]:
np.where(y_validation == +1)

(array([4674, 4675, 4676, ..., 9281, 9282, 9283]),)

In [65]:
np.where(y_validation == -1)

(array([   0,    1,    2, ..., 4671, 4672, 4673]),)

In [70]:
y_validation[4674]

1

In [78]:
sample_X_validation = X_validation[[0,1,4674,4675],:]
sample_y_validation = y_validation[[0,1,4674,4675]]

In [79]:
sample_X_validation

array([[  0.  ,   3.  ,  13.97,   0.  ,   1.  ,  59.5 ,   0.  ,   0.  ,
          0.  ,   0.  ,   1.  ,   0.  ,   0.  ,   0.  ,   0.  ,   0.  ,
          0.  ,   1.  ,   0.  ,   1.  ,   0.  ,   0.  ,   0.  ,   0.  ,
          0.  ,   0.  ,   0.  ,   0.  ,   0.  ,   0.  ,   0.  ,   0.  ,
          0.  ,   0.  ,   0.  ,   0.  ,   0.  ,   0.  ,   0.  ,   0.  ,
          0.  ,   0.  ,   0.  ,   0.  ,   0.  ,   0.  ,   0.  ,   1.  ,
          0.  ,   0.  ,   0.  ,   0.  ,   0.  ,   0.  ,   0.  ,   0.  ,
          0.  ,   0.  ,   0.  ,   0.  ,   0.  ,   0.  ,   0.  ,   0.  ,
          0.  ,   1.  ,   0.  ],
       [  0.  ,  11.  ,  16.33,   1.  ,   1.  ,  62.1 ,   0.  ,   1.  ,
          0.  ,   0.  ,   0.  ,   0.  ,   0.  ,   0.  ,   0.  ,   0.  ,
          0.  ,   1.  ,   0.  ,   0.  ,   1.  ,   0.  ,   0.  ,   0.  ,
          0.  ,   0.  ,   0.  ,   0.  ,   0.  ,   0.  ,   0.  ,   0.  ,
          0.  ,   1.  ,   0.  ,   0.  ,   0.  ,   0.  ,   0.  ,   0.  ,
          0.  ,   0.  ,   0.  ,

In [82]:
small_model.predict(sample_X_validation), sample_y_validation

(array([-1, -1, -1, -1]), array([-1, -1,  1,  1]))

In [83]:
decision_tree_model.predict(sample_X_validation), sample_y_validation

(array([1, 1, 1, 1]), array([-1, -1,  1,  1]))

In [85]:
small_model.predict_proba(sample_X_validation)

array([[ 0.52596364,  0.47403636],
       [ 0.52596364,  0.47403636],
       [ 0.52596364,  0.47403636],
       [ 0.52596364,  0.47403636]])

In [87]:
decision_tree_model.predict_proba(sample_X_validation)

array([[ 0.44694377,  0.55305623],
       [ 0.45289179,  0.54710821],
       [ 0.48567839,  0.51432161],
       [ 0.48567839,  0.51432161]])

In [89]:
small_model.score(X_train, y_train)

0.56917580055877925

In [90]:
decision_tree_model.score(X_train, y_train)

0.60837094347732645

In [91]:
small_model.score(X_validation, y_validation)

0.56365790607496769

In [92]:
decision_tree_model.score(X_validation, y_validation)

0.55848772080999565

In [93]:
big_model = DecisionTreeClassifier(max_depth=10)

In [94]:
big_model.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=10,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')

In [95]:
big_model.score(X_train, y_train)

0.63738448312916396

In [96]:
big_model.score(X_validation, y_validation)

0.55385609651012491

In [98]:
from sklearn.metrics import classification_report

print classification_report(decision_tree_model.predict(X_train), y_train)

             precision    recall  f1-score   support

         -1       0.48      0.64      0.55     14000
          1       0.73      0.59      0.65     23224

avg / total       0.64      0.61      0.61     37224



In [99]:
print classification_report(decision_tree_model.predict(X_validation), y_validation)

             precision    recall  f1-score   support

         -1       0.44      0.58      0.50      3583
          1       0.67      0.54      0.60      5701

avg / total       0.59      0.56      0.56      9284



In [100]:
train_predict = decision_tree_model.predict(X_train)
validation_predict = decision_tree_model.predict(X_validation)

In [101]:
from sklearn.metrics import confusion_matrix

In [102]:
print confusion_matrix(y_train, train_predict)

[[ 8949  9527]
 [ 5051 13697]]


In [103]:
print confusion_matrix(y_validation, validation_predict)

[[2079 2595]
 [1504 3106]]


In [104]:
prediction_error = y_validation == validation_predict

In [107]:
prediction_error

array([False, False,  True, ...,  True, False,  True], dtype=bool)

In [118]:
# False Negative
len(validation_predict[validation_predict[np.where(prediction_error==False)] == -1])

  if __name__ == '__main__':


1504

In [119]:
# False Positive
len(validation_predict[validation_predict[np.where(prediction_error==False)] == 1])

  if __name__ == '__main__':


2595

In [120]:
pd.crosstab(y_validation, validation_predict, rownames=['True'], colnames=['Predicted'], margins=True)

Predicted,-1,1,All
True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
-1,2079,2595,4674
1,1504,3106,4610
All,3583,5701,9284
