In [154]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import tree

In [155]:
data = pd.read_csv("lending-club-data.csv", low_memory = False)
data['safe_loans']=data["bad_loans"].apply(lambda x : +1 if x==0 else -1)
features = ['grade',                     # grade of the loan
            'sub_grade',                 # sub-grade of the loan
            'short_emp',                 # one year or less of employment
            'emp_length_num',            # number of years of employment
            'home_ownership',            # home_ownership status: own, mortgage or rent
            'dti',                       # debt to income ratio
            'purpose',                   # the purpose of the loan
            'term',                   # the term of the loan
            'last_delinq_none',          # has borrower had a delinquincy
            'last_major_derog_none',     # has borrower had 90 day or worse rating
            'revol_util',                # percent of available credit being used
            'total_rec_late_fee',        # total late fees received to day
           ]

target = 'safe_loans' 
loans= data[features+[target]]

In [156]:
safe_loans_raw = loans[loans[target] == +1]
risky_loans_raw = loans[loans[target] == -1]
# Since there are fewer risky loans than safe loans, find the ratio of the sizes
# and use that percentage to undersample the safe loans.
percentage = len(risky_loans_raw)/float(len(safe_loans_raw))
risky_loans = risky_loans_raw
percentage
safe_loans = safe_loans_raw.sample(frac = percentage)

# Append the risky_loans with the downsampled version of safe_loans
loans_data = risky_loans.append(safe_loans)
print "Percentage of safe loans                 :" ,len(safe_loans) / float(len(loans_data))
print "Percentage of risky loans                :", len(risky_loans) / float(len(loans_data))
print "Total number of loans in our new dataset :", len(loans_data)

Percentage of safe loans                 : 0.5
Percentage of risky loans                : 0.5
Total number of loans in our new dataset : 46300


In [157]:
from sklearn.preprocessing import LabelEncoder as le

In [158]:
col_to_encode=['grade','sub_grade','home_ownership','purpose','term']
for col in col_to_encode:
    loans_data[col] = loans_data[col].factorize()[0]
    loans_data[col] = le().fit_transform(loans_data[col])

loans_data.head()    

Unnamed: 0,grade,sub_grade,short_emp,emp_length_num,home_ownership,dti,purpose,term,last_delinq_none,last_major_derog_none,revol_util,total_rec_late_fee,safe_loans
1,0,0,1,1,0,1.0,0,0,1,1,9.4,0.0,-1
6,1,1,0,5,1,5.55,1,0,1,1,32.6,0.0,-1
7,2,2,1,1,0,18.08,2,0,1,1,36.5,0.0,-1
10,0,3,1,1,0,10.08,3,1,1,1,91.7,0.0,-1
12,2,4,0,4,0,7.06,2,1,1,1,55.5,0.0,-1


In [159]:
train_data, validation_data = train_test_split(loans_data, test_size=0.3, random_state=42)

In [160]:
from sklearn.tree import DecisionTreeClassifier

In [161]:
decision_tree_model = DecisionTreeClassifier(max_depth=6)
small_model= DecisionTreeClassifier(max_depth=2)

In [162]:
validation_safe_loans = validation_data[validation_data[target] == 1]
validation_risky_loans = validation_data[validation_data[target] == -1]

sample_validation_data_risky = validation_risky_loans[0:2]
sample_validation_data_safe = validation_safe_loans[0:2]

sample_validation_data = sample_validation_data_safe.append(sample_validation_data_risky)
sample_validation_data

Unnamed: 0,grade,sub_grade,short_emp,emp_length_num,home_ownership,dti,purpose,term,last_delinq_none,last_major_derog_none,revol_util,total_rec_late_fee,safe_loans
99688,3,13,1,1,2,16.33,6,1,0,1,85.1,0.0,1
26816,0,3,0,7,2,7.84,3,1,1,1,59.0,0.0,1
35751,3,14,0,11,1,2.87,4,0,0,1,91.9,0.0,-1
69905,5,22,0,11,0,4.01,3,0,1,1,96.4,0.0,-1


In [163]:
sample_validation_data[features]

Unnamed: 0,grade,sub_grade,short_emp,emp_length_num,home_ownership,dti,purpose,term,last_delinq_none,last_major_derog_none,revol_util,total_rec_late_fee
99688,3,13,1,1,2,16.33,6,1,0,1,85.1,0.0
26816,0,3,0,7,2,7.84,3,1,1,1,59.0,0.0
35751,3,14,0,11,1,2.87,4,0,0,1,91.9,0.0
69905,5,22,0,11,0,4.01,3,0,1,1,96.4,0.0


In [164]:
decision_tree_model.fit(train_data[features],train_data[target])

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=6,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')

In [202]:
predictions = decision_tree_model.predict(validation_data[features])

In [174]:
from sklearn.metrics import accuracy_score
print accuracy_score(tt,validation_data[target])

0.630597552196


In [175]:
small_model.fit(train_data[features],train_data[target])
small = small_model.predict(validation_data[features])
print accuracy_score(small,validation_data[target])

0.595896328294


In [176]:
import pydotplus 
dot_data = tree.export_graphviz(decision_tree_model, out_file=None) 
graph = pydotplus.graph_from_dot_data(dot_data) 
graph.write_pdf("new.pdf") 

True

In [203]:
false_positives = 0
false_negatives = 0
for item in xrange(len(validation_data)):
    if predictions[item] != validation_data['safe_loans'][item]:
        if predictions[item] == 1:
            false_positives += 1
        else:
            false_negatives += 1
            
print false_positives
print false_negatives

KeyError: 0

In [201]:
false_negative =0
false_positive = 0

for i in xrange(1,len(tt)):
    if tt[i] != validation_data[target][i]:
        if tt[i] == 1:
            false_positive += 1
        else:
            false_negative += 1

print false_positive
print false_negative
        

KeyError: 2