#### Read loan data

In [2]:
import sframe
loans = sframe.SFrame('data1/')

[INFO] sframe.cython.cy_server: SFrame v2.1 started. Logging C:\Users\user\AppData\Local\Temp\sframe_server_1502693599.log.0


In [3]:
loans.column_names()

['id',
 'member_id',
 'loan_amnt',
 'funded_amnt',
 'funded_amnt_inv',
 'term',
 'int_rate',
 'installment',
 'grade',
 'sub_grade',
 'emp_title',
 'emp_length',
 'home_ownership',
 'annual_inc',
 'is_inc_v',
 'issue_d',
 'loan_status',
 'pymnt_plan',
 'url',
 'desc',
 'purpose',
 'title',
 'zip_code',
 'addr_state',
 'dti',
 'delinq_2yrs',
 'earliest_cr_line',
 'inq_last_6mths',
 'mths_since_last_delinq',
 'mths_since_last_record',
 'open_acc',
 'pub_rec',
 'revol_bal',
 'revol_util',
 'total_acc',
 'initial_list_status',
 'out_prncp',
 'out_prncp_inv',
 'total_pymnt',
 'total_pymnt_inv',
 'total_rec_prncp',
 'total_rec_int',
 'total_rec_late_fee',
 'recoveries',
 'collection_recovery_fee',
 'last_pymnt_d',
 'last_pymnt_amnt',
 'next_pymnt_d',
 'last_credit_pull_d',
 'collections_12_mths_ex_med',
 'mths_since_last_major_derog',
 'policy_code',
 'not_compliant',
 'status',
 'inactive_loans',
 'bad_loans',
 'emp_length_num',
 'grade_num',
 'sub_grade_num',
 'delinq_2yrs_zero',
 'pub_rec

In [4]:
loans['safe_loans'] = loans['bad_loans'].apply(lambda x: +1 if x==0 else -1)
loans = loans.remove_column('bad_loans')

#### Extract feature columns and target columns

In [5]:
features = ['grade',                     # grade of the loan
            'sub_grade',                 # sub-grade of the loan
            'short_emp',                 # one year or less of employment
            'emp_length_num',            # number of years of employment
            'home_ownership',            # home_ownership status: own, mortgage or rent
            'dti',                       # debt to income ratio
            'purpose',                   # the purpose of the loan
            'term',                      # the term of the loan
            'last_delinq_none',          # has borrower had a delinquincy
            'last_major_derog_none',     # has borrower had 90 day or worse rating
            'revol_util',                # percent of available credit being used
            'total_rec_late_fee',        # total late fees received to day
           ]

target = 'safe_loans'                    # prediction target (y) (+1 means safe, -1 is risky)

# Extract the feature columns and target column
loans = loans[features + [target]]
loans

grade,sub_grade,short_emp,emp_length_num,home_ownership,dti,purpose,term,last_delinq_none
B,B2,0,11,RENT,27.65,credit_card,36 months,1
C,C4,1,1,RENT,1.0,car,60 months,1
C,C5,0,11,RENT,8.72,small_business,36 months,1
C,C1,0,11,RENT,20.0,other,36 months,0
A,A4,0,4,RENT,11.2,wedding,36 months,1
E,E1,0,10,RENT,5.35,car,36 months,1
F,F2,0,5,OWN,5.55,small_business,60 months,1
B,B5,1,1,RENT,18.08,other,60 months,1
C,C3,0,6,OWN,16.12,debt_consolidation,60 months,1
B,B5,0,11,OWN,10.78,debt_consolidation,36 months,1

last_major_derog_none,revol_util,total_rec_late_fee,safe_loans
1,83.7,0.0,1
1,9.4,0.0,-1
1,98.5,0.0,1
1,21.0,16.97,1
1,28.3,0.0,1
1,87.5,0.0,1
1,32.6,0.0,-1
1,36.5,0.0,-1
1,20.6,0.0,1
1,67.1,0.0,1


In [6]:
safe_loans_raw = loans[loans[target] == 1]
risky_loans_raw = loans[loans[target]==-1]
print "Number of safe loans  : %s" % len(safe_loans_raw)
print "Number of risky loans : %s" % len(risky_loans_raw)

Number of safe loans  : 99457
Number of risky loans : 23150


#### Undersample safe loans

In [7]:
percentage = len(risky_loans_raw) / float(len(safe_loans_raw))
risky_loans = risky_loans_raw
safe_loans = safe_loans_raw.sample(percentage,seed=1)
loans_data = risky_loans.append(safe_loans)

#### One hot-encoding

In [8]:
categorical_variables = []
for feat_name, feat_type in zip(loans_data.column_names(),loans_data.column_types()):
    if feat_type == str:
        categorical_variables.append(feat_name)
        
for feature in categorical_variables:
    loans_data_one_hot_encoded = loans_data[feature].apply(lambda x:{x:1})
    loans_data_unpacked = loans_data_one_hot_encoded.unpack(column_name_prefix = feature)
    
    for column in loans_data_unpacked.column_names():
        loans_data_unpacked[column] = loans_data_unpacked[column].fillna(0)
    loans_data.remove_column(feature)
    loans_data.add_columns(loans_data_unpacked)
loans_data

short_emp,emp_length_num,dti,last_delinq_none,last_major_derog_none,revol_util,total_rec_late_fee,safe_loans
1,1,1.0,1,1,9.4,0.0,-1
0,5,5.55,1,1,32.6,0.0,-1
1,1,18.08,1,1,36.5,0.0,-1
1,1,10.08,1,1,91.7,0.0,-1
0,4,7.06,1,1,55.5,0.0,-1
0,11,13.22,1,1,90.3,0.0,-1
0,2,2.4,1,1,29.7,0.0,-1
0,10,15.22,1,1,57.6,0.0,-1
0,3,13.97,0,1,59.5,0.0,-1
0,11,16.33,1,1,62.1,0.0,-1

grade.A,grade.B,grade.C,grade.D,grade.E,grade.F,grade.G,sub_grade.A1,sub_grade.A2,sub_grade.A3,sub_grade.A4
0,0,1,0,0,0,0,0,0,0,0
0,0,0,0,0,1,0,0,0,0,0
0,1,0,0,0,0,0,0,0,0,0
0,0,1,0,0,0,0,0,0,0,0
0,1,0,0,0,0,0,0,0,0,0
0,1,0,0,0,0,0,0,0,0,0
0,1,0,0,0,0,0,0,0,0,0
0,0,1,0,0,0,0,0,0,0,0
0,0,0,1,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0

sub_grade.A5,sub_grade.B1,sub_grade.B2,sub_grade.B3,sub_grade.B4,sub_grade.B5,sub_grade.C1,sub_grade.C2
0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0
0,0,0,0,0,1,0,0
0,0,0,0,0,0,1,0
0,0,1,0,0,0,0,0
0,0,0,0,1,0,0,0
0,0,0,1,0,0,0,0
0,0,0,0,0,0,0,1
0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0

sub_grade.C3,sub_grade.C4,sub_grade.C5,sub_grade.D1,sub_grade.D2,sub_grade.D3,sub_grade.D4,sub_grade.D5
0,1,0,0,0,0,0,0
0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0
0,0,0,0,1,0,0,0
0,0,0,0,0,0,0,0

sub_grade.E1,sub_grade.E2,sub_grade.E3,sub_grade.E4,sub_grade.E5,...
0,0,0,0,0,...
0,0,0,0,0,...
0,0,0,0,0,...
0,0,0,0,0,...
0,0,0,0,0,...
0,0,0,0,0,...
0,0,0,0,0,...
0,0,0,0,0,...
0,0,0,0,0,...
0,0,0,0,0,...


#### Split Data

In [9]:
train_data, validation_data = loans_data.random_split(.8,seed=1)

#### Builder a decision tree classifier

In [14]:
from sklearn.tree import DecisionTreeClassifier, export_graphviz
import numpy as np
decision_tree_model = DecisionTreeClassifier(max_depth=6)
small_model = DecisionTreeClassifier(max_depth=2)
big_model = DecisionTreeClassifier(max_depth = 10)

def convert_to_numpy(data,target):
    columns = data.column_names()
    columns.remove(target)
    matrix = data.select_columns(columns).to_numpy()
    target_raw = data.select_columns([target]).to_numpy()
    target = np.empty(0)
    for element_list in target_raw:
        for element in element_list:
            target = np.append(target,element)
    return (matrix, target)

train_matrix, train_target = convert_to_numpy(train_data,target)
train_matrix

array([[  1.  ,   1.  ,   1.  , ...,   0.  ,   0.  ,   1.  ],
       [  0.  ,   5.  ,   5.55, ...,   0.  ,   0.  ,   1.  ],
       [  1.  ,   1.  ,  18.08, ...,   0.  ,   0.  ,   1.  ],
       ..., 
       [  0.  ,   4.  ,   7.57, ...,   0.  ,   1.  ,   0.  ],
       [  0.  ,  11.  ,  26.27, ...,   0.  ,   0.  ,   1.  ],
       [  0.  ,  11.  ,  11.26, ...,   0.  ,   1.  ,   0.  ]])

In [11]:
train_target

array([-1., -1., -1., ...,  1.,  1.,  1.])

#### Visualize a tree

In [12]:
decision_tree_model.fit(train_matrix,train_target)
model_view = export_graphviz(decision_tree_model)
with open('tree.dot','r') as tree:
    from graphviz import Source
    s = Source(tree.read(), filename="test.gv", format="png",engine = "dot")
    s.view()



In [59]:
small_model.fit(train_matrix,train_target)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=2,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')

In [15]:
big_model.fit(train_matrix,train_target)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=10,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')

#### Make predictions

Make a sample of validation data

In [60]:
validation_safe_loans = validation_data[validation_data[target]==1]
validation_risky_loans = validation_data[validation_data[target]==-1]

sample_validation_data_risky = validation_risky_loans[0:2]
sample_validation_data_safe = validation_safe_loans[0:2]

sample_validation_data = sample_validation_data_safe.append(sample_validation_data_risky)
sample_validation_data

short_emp,emp_length_num,dti,last_delinq_none,last_major_derog_none,revol_util,total_rec_late_fee,safe_loans
0,11,11.18,1,1,82.4,0.0,1
0,10,16.85,1,1,96.4,0.0,1
0,3,13.97,0,1,59.5,0.0,-1
0,11,16.33,1,1,62.1,0.0,-1

grade.A,grade.B,grade.C,grade.D,grade.E,grade.F,grade.G,sub_grade.A1,sub_grade.A2,sub_grade.A3,sub_grade.A4
0,1,0,0,0,0,0,0,0,0,0
0,0,0,1,0,0,0,0,0,0,0
0,0,0,1,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0

sub_grade.A5,sub_grade.B1,sub_grade.B2,sub_grade.B3,sub_grade.B4,sub_grade.B5,sub_grade.C1,sub_grade.C2
0,0,0,1,0,0,0,0
0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0

sub_grade.C3,sub_grade.C4,sub_grade.C5,sub_grade.D1,sub_grade.D2,sub_grade.D3,sub_grade.D4,sub_grade.D5
0,0,0,0,0,0,0,0
0,0,0,1,0,0,0,0
0,0,0,0,1,0,0,0
0,0,0,0,0,0,0,0

sub_grade.E1,sub_grade.E2,sub_grade.E3,sub_grade.E4,sub_grade.E5,...
0,0,0,0,0,...
0,0,0,0,0,...
0,0,0,0,0,...
0,0,0,0,0,...


In [62]:
sample_validation, sample_target = convert_to_numpy(sample_validation_data,target)
decision_tree_model.predict(sample_validation)

array([ 1., -1., -1.,  1.])

In [63]:
sample_target

array([ 1.,  1., -1., -1.])

In [64]:
decision_tree_model.predict_proba(sample_validation)

array([[ 0.34156543,  0.65843457],
       [ 0.53630646,  0.46369354],
       [ 0.64750958,  0.35249042],
       [ 0.20789474,  0.79210526]])

#### Training set Accruacy

In [16]:
from sklearn.metrics import accuracy_score
print accuracy_score(train_target,big_model.predict(train_matrix))

0.66379217709


#### Validation set Accruacy

In [18]:
validation_matrix, validation_target = convert_to_numpy(validation_data,target)
print accuracy_score(validation_target,big_model.predict(validation_matrix))

0.62623869022


In [19]:
from sklearn.metrics import classification_report
print classification_report(validation_target,big_model.predict(validation_matrix))

             precision    recall  f1-score   support

       -1.0       0.62      0.65      0.64      4674
        1.0       0.63      0.60      0.62      4610

avg / total       0.63      0.63      0.63      9284

