In [1]:
import pandas as pd
import numpy as np

In [2]:
loans = pd.read_csv('lending-club-data.csv',low_memory=False)

In [3]:
print loans.columns

Index([u'id', u'member_id', u'loan_amnt', u'funded_amnt', u'funded_amnt_inv',
       u'term', u'int_rate', u'installment', u'grade', u'sub_grade',
       u'emp_title', u'emp_length', u'home_ownership', u'annual_inc',
       u'is_inc_v', u'issue_d', u'loan_status', u'pymnt_plan', u'url', u'desc',
       u'purpose', u'title', u'zip_code', u'addr_state', u'dti',
       u'delinq_2yrs', u'earliest_cr_line', u'inq_last_6mths',
       u'mths_since_last_delinq', u'mths_since_last_record', u'open_acc',
       u'pub_rec', u'revol_bal', u'revol_util', u'total_acc',
       u'initial_list_status', u'out_prncp', u'out_prncp_inv', u'total_pymnt',
       u'total_pymnt_inv', u'total_rec_prncp', u'total_rec_int',
       u'total_rec_late_fee', u'recoveries', u'collection_recovery_fee',
       u'last_pymnt_d', u'last_pymnt_amnt', u'next_pymnt_d',
       u'last_credit_pull_d', u'collections_12_mths_ex_med',
       u'mths_since_last_major_derog', u'policy_code', u'not_compliant',
       u'status', u'inactiv

In [4]:
print loans.bad_loans.value_counts()

0    99457
1    23150
dtype: int64


In [5]:
loans.head()

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,...,sub_grade_num,delinq_2yrs_zero,pub_rec_zero,collections_12_mths_zero,short_emp,payment_inc_ratio,final_d,last_delinq_none,last_record_none,last_major_derog_none
0,1077501,1296599,5000,5000,4975,36 months,10.65,162.87,B,B2,...,0.4,1,1,1,0,8.1435,20141201T000000,1,1,1
1,1077430,1314167,2500,2500,2500,60 months,15.27,59.83,C,C4,...,0.8,1,1,1,1,2.3932,20161201T000000,1,1,1
2,1077175,1313524,2400,2400,2400,36 months,15.96,84.33,C,C5,...,1.0,1,1,1,0,8.25955,20141201T000000,1,1,1
3,1076863,1277178,10000,10000,10000,36 months,13.49,339.31,C,C1,...,0.2,1,1,1,0,8.27585,20141201T000000,0,1,1
4,1075269,1311441,5000,5000,5000,36 months,7.9,156.46,A,A4,...,0.8,1,1,1,0,5.21533,20141201T000000,1,1,1


In [6]:
loans.shape

(122607, 68)

In [7]:
loans['safe_loans'] = loans['bad_loans'].apply(lambda x : +1 if x==0 else -1)

In [8]:
len(loans[loans['safe_loans'] == +1]), len(loans[loans['safe_loans'] == -1])

(99457, 23150)

In [9]:
print "Safe loan percentage = " + str(len(loans[loans['safe_loans'] == +1]) * 100 / len(loans)) + '%'
print "Risky loan percentage = " + str(len(loans[loans['safe_loans'] == -1]) * 100 / len(loans)) + '%'

Safe loan percentage = 81%
Risky loan percentage = 18%


In [10]:
features = ['grade',                     # grade of the loan
            'sub_grade',                 # sub-grade of the loan
            'short_emp',                 # one year or less of employment
            'emp_length_num',            # number of years of employment
            'home_ownership',            # home_ownership status: own, mortgage or rent
            'dti',                       # debt to income ratio
            'purpose',                   # the purpose of the loan
            'term',                      # the term of the loan
            'last_delinq_none',          # has borrower had a delinquincy
            'last_major_derog_none',     # has borrower had 90 day or worse rating
            'revol_util',                # percent of available credit being used
            'total_rec_late_fee',        # total late fees received to day
           ]

target = 'safe_loans'                    # prediction target (y) (+1 means safe, -1 is risky)

# Extract the feature columns and target column
loans = loans[features + [target]]

In [11]:
train_index = pd.read_json('module-5-assignment-1-train-idx.json')
train_index.columns = ['indexvalue']
train_idx = train_index.indexvalue.tolist()

validation_index = pd.read_json('module-5-assignment-1-validation-idx.json')
validation_index.columns = ['indexvalue']
validation_idx = validation_index.indexvalue.tolist()

train_data = loans.iloc[train_idx]
validation_data = loans.iloc[validation_idx]

In [12]:
train_data.shape, validation_data.shape

((37224, 13), (9284, 13))

In [13]:
safe_loans_raw = loans[loans[target] == +1]
risky_loans_raw = loans[loans[target] == -1]
print "Number of safe loans  : %s" % len(safe_loans_raw)
print "Number of risky loans : %s" % len(risky_loans_raw)

Number of safe loans  : 99457
Number of risky loans : 23150


In [14]:
print train_data['safe_loans'].value_counts()
print validation_data['safe_loans'].value_counts()

 1    18748
-1    18476
dtype: int64
-1    4674
 1    4610
dtype: int64


In [15]:
train_data.head()

Unnamed: 0,grade,sub_grade,short_emp,emp_length_num,home_ownership,dti,purpose,term,last_delinq_none,last_major_derog_none,revol_util,total_rec_late_fee,safe_loans
1,C,C4,1,1,RENT,1.0,car,60 months,1,1,9.4,0,-1
6,F,F2,0,5,OWN,5.55,small_business,60 months,1,1,32.6,0,-1
7,B,B5,1,1,RENT,18.08,other,60 months,1,1,36.5,0,-1
10,C,C1,1,1,RENT,10.08,debt_consolidation,36 months,1,1,91.7,0,-1
12,B,B2,0,4,RENT,7.06,other,36 months,1,1,55.5,0,-1


In [19]:
for col in train_data.columns:
    print str(col) + ' ' + str(train_data[col].dtype)

grade object
sub_grade object
short_emp int64
emp_length_num int64
home_ownership object
dti float64
purpose object
term object
last_delinq_none int64
last_major_derog_none int64
revol_util float64
total_rec_late_fee float64
safe_loans int64


In [25]:
#from sklearn.preprocessing import OneHotEncoder, LabelEncoder
categorical_type = ['grade','sub_grade','home_ownership','purpose','term']

for col in categorical_type:
    train_data = pd.concat([train_data, pd.get_dummies(train_data[col])], axis=1)

In [27]:
train_data.shape

(37224, 73)

In [28]:
train_data = train_data.drop(categorical_type, axis=1)

In [29]:
train_data.shape

(37224, 68)

In [30]:
train_data.head()

Unnamed: 0,short_emp,emp_length_num,dti,last_delinq_none,last_major_derog_none,revol_util,total_rec_late_fee,safe_loans,A,B,...,house,major_purchase,medical,moving,other,small_business,vacation,wedding,36 months,60 months
1,1,1,1.0,1,1,9.4,0,-1,0,0,...,0,0,0,0,0,0,0,0,0,1
6,0,5,5.55,1,1,32.6,0,-1,0,0,...,0,0,0,0,0,1,0,0,0,1
7,1,1,18.08,1,1,36.5,0,-1,0,1,...,0,0,0,0,1,0,0,0,0,1
10,1,1,10.08,1,1,91.7,0,-1,0,0,...,0,0,0,0,0,0,0,0,1,0
12,0,4,7.06,1,1,55.5,0,-1,0,1,...,0,0,0,0,1,0,0,0,1,0


In [31]:
from sklearn.tree import DecisionTreeClassifier

decision_tree_model = DecisionTreeClassifier(max_depth=6)
small_model = DecisionTreeClassifier(max_depth=2)

In [32]:
X_train = train_data.drop(target, axis=1)
y_train = train_data[target].values
decision_tree_model.fit(X_train, y_train)
small_model.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=2,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            random_state=None, splitter='best')

In [40]:
small_model.tree_

TypeError: 'sklearn.tree._tree.Tree' object has no attribute '__getitem__'