### Import the LendingClub data

In [1]:
import pandas as pd
import numpy as np

In [2]:
loans = pd.read_csv('lending-club-data.csv', low_memory=False)

In [3]:
print list(loans.columns.get_values())

['id', 'member_id', 'loan_amnt', 'funded_amnt', 'funded_amnt_inv', 'term', 'int_rate', 'installment', 'grade', 'sub_grade', 'emp_title', 'emp_length', 'home_ownership', 'annual_inc', 'is_inc_v', 'issue_d', 'loan_status', 'pymnt_plan', 'url', 'desc', 'purpose', 'title', 'zip_code', 'addr_state', 'dti', 'delinq_2yrs', 'earliest_cr_line', 'inq_last_6mths', 'mths_since_last_delinq', 'mths_since_last_record', 'open_acc', 'pub_rec', 'revol_bal', 'revol_util', 'total_acc', 'initial_list_status', 'out_prncp', 'out_prncp_inv', 'total_pymnt', 'total_pymnt_inv', 'total_rec_prncp', 'total_rec_int', 'total_rec_late_fee', 'recoveries', 'collection_recovery_fee', 'last_pymnt_d', 'last_pymnt_amnt', 'next_pymnt_d', 'last_credit_pull_d', 'collections_12_mths_ex_med', 'mths_since_last_major_derog', 'policy_code', 'not_compliant', 'status', 'inactive_loans', 'bad_loans', 'emp_length_num', 'grade_num', 'sub_grade_num', 'delinq_2yrs_zero', 'pub_rec_zero', 'collections_12_mths_zero', 'short_emp', 'payment_in

In [4]:
# safe_loans =  1 => safe
# safe_loans = -1 => risky
loans['safe_loans'] = loans['bad_loans'].apply(lambda x : +1 if x==0 else -1)
loans.drop('bad_loans', axis=1, inplace=True)

In [5]:
# Distribution of the safe loans
loans['safe_loans'].value_counts()*100/len(loans)

 1    81.118533
-1    18.881467
Name: safe_loans, dtype: float64

### Features for the classification

In [6]:
features = ['grade',                     # grade of the loan
            'sub_grade',                 # sub-grade of the loan
            'short_emp',                 # one year or less of employment
            'emp_length_num',            # number of years of employment
            'home_ownership',            # home_ownership status: own, mortgage or rent
            'dti',                       # debt to income ratio
            'purpose',                   # the purpose of the loan
            'term',                      # the term of the loan
            'last_delinq_none',          # has borrower had a delinquincy
            'last_major_derog_none',     # has borrower had 90 day or worse rating
            'revol_util',                # percent of available credit being used
            'total_rec_late_fee',        # total late fees received to day
           ]

target = 'safe_loans'                    # prediction target (y) (+1 means safe, -1 is risky)

# Extract the feature columns and target column
loans = loans[features + [target]]

In [7]:
### Do the one hot encoding
categorical_variables = loans.select_dtypes(include=['object']).columns.get_values()
loans_with_dummies = pd.get_dummies(loans, columns=categorical_variables)

In [8]:
train_idx = list(pd.read_json('module-5-assignment-1-train-idx.json').values.flatten())
validation_idx = list(pd.read_json('module-5-assignment-1-validation-idx.json').values.flatten())
train_data = loans_with_dummies.iloc[train_idx]
validation_data = loans_with_dummies.iloc[validation_idx]

### Build a decision tree classifier

In [9]:
from sklearn.tree import DecisionTreeClassifier
decision_tree_model = DecisionTreeClassifier(max_depth=6)
small_model = DecisionTreeClassifier(max_depth=2)

In [10]:
train_data_matrix = train_data.drop([target], axis=1)
train_data_matrix = train_data_matrix.as_matrix()
train_labels = train_data[target].as_matrix()
validation_data_matrix = validation_data.drop([target], axis=1)
validation_data_matrix = validation_data_matrix.as_matrix()
validation_labels = validation_data[target].as_matrix()

In [11]:
decision_tree_model.fit(train_data_matrix, train_labels)
small_model.fit(train_data_matrix, train_labels)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=2,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')

In [12]:
# Explore more for graph visualization
from sklearn import tree
tree.export_graphviz(small_model, out_file='tree.dot') 

**The small graph trained is:**
<img src = "tree.png">

### Making predictions

In [13]:
validation_safe_loans = validation_data[validation_data[target] == 1]
validation_risky_loans = validation_data[validation_data[target] == -1]

sample_validation_data_risky = validation_risky_loans[0:2]
sample_validation_data_safe = validation_safe_loans[0:2]

sample_validation_data = sample_validation_data_safe.append(sample_validation_data_risky)
sample_validation_data

Unnamed: 0,short_emp,emp_length_num,dti,last_delinq_none,last_major_derog_none,revol_util,total_rec_late_fee,safe_loans,grade_A,grade_B,...,purpose_house,purpose_major_purchase,purpose_medical,purpose_moving,purpose_other,purpose_small_business,purpose_vacation,purpose_wedding,term_ 36 months,term_ 60 months
19,0,11,11.18,1,1,82.4,0.0,1,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
79,0,10,16.85,1,1,96.4,0.0,1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
24,0,3,13.97,0,1,59.5,0.0,-1,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
41,0,11,16.33,1,1,62.1,0.0,-1,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [16]:
sample_validation_data_matrix = sample_validation_data.drop([target], axis=1).as_matrix()
sample_validation_prediction = decision_tree_model.predict(sample_validation_data_matrix)

**Quiz Question:** What percentage of the predictions on *sample_validation_data* did *decision_tree_model* get correct?

In [20]:
sample_validation_label = sample_validation_data[target]
print sum(sample_validation_prediction == sample_validation_label)*100/len(sample_validation_data)
print decision_tree_model.score(sample_validation_data_matrix, sample_validation_label.as_matrix())

50
0.5


### Explore probability predictions
**Quiz Question:** Which loan has the highest probability of being classified as a safe loan?

In [31]:
sample_validation_prob_pred = decision_tree_model.predict_proba(sample_validation_data_matrix)
pos_prob_pred = sample_validation_prob_pred[:,1]
neg_prob_pred = sample_validation_prob_pred[:,0]
print 'The loan with highest probability of being classified as safe loan is:', np.argmax(pos_prob_pred)

The loan with highest probability of being classified as safe loan is: 3


### Evaluating accuracy of the decision tree model

In [32]:
train_acc = decision_tree_model.score(train_data_matrix, train_labels)
train_acc_small = small_model.score(train_data_matrix, train_labels)
print 'Accuracy of decision_tree_model on training data: ', train_acc
print 'Accuracy of small_model on training data: ', train_acc_small

###################################
Accuracy of decision_tree_model:  0.640527616591
Accuracy of small_model:  0.613502041694


In [33]:
validation_acc = decision_tree_model.score(validation_data_matrix, validation_labels)
validation_acc_small = small_model.score(validation_data_matrix, validation_labels)
print 'Accuracy of decision_tree_model on validation data: ', validation_acc
print 'Accuracy of small_model on validation data: ', validation_acc_small

###################################
Accuracy of decision_tree_model on validation data:  0.636363636364
Accuracy of small_model on validation data:  0.619345109866


### Evaluating accuracy of a complex decision tree model

In [34]:
big_model = DecisionTreeClassifier(max_depth=10)
big_model.fit(train_data_matrix, train_labels)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=10,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')

In [35]:
train_acc_big = big_model.score(train_data_matrix, train_labels)
validation_acc_big = big_model.score(validation_data_matrix, validation_labels)
print 'Accuracy of big_model on training data is: ', train_acc_big
print 'Accuracy of big_model on validation data is: ', validation_acc_big

Accuracy of big_model on training data is:  0.663738448313
Accuracy of big_model on validation data is:  0.626777251185


### Quantifying the cost of mistakes

In [36]:
predictions = decision_tree_model.predict(validation_data_matrix)

In [39]:
safe_loans_idx = (validation_labels == 1)
risky_loans_idx = (validation_labels == -1)
safe_pred_idx = (predictions == 1)
risky_pred_idx = (predictions == -1)
fn = np.sum(safe_loans_idx == risky_pred_idx)
fp = np.sum(risky_loans_idx == safe_pred_idx)
print 'Total false negative predictions are: ', fn
print 'Total false positive predictions are: ', fp

Total false negative predictions are:  3376
Total false positive predictions are:  3376


In [40]:
cost = 10000*fn + 20000*fp
print 'Total cost: ', cost

Total cost:  101280000
