In [None]:
from IPython.display import HTML

# Markup injected into the notebook page: a small script that hides/shows
# every code-input area, plus a button that triggers it. The script is also
# run once when the document is ready, so the code starts out hidden.
TOGGLE_CODE_MARKUP = '''<script>
code_show=true;
function code_toggle() {
 if (code_show){
 $('div.input').hide();
 } else {
 $('div.input').show();
 }
 code_show = !code_show
}
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()"><input type="submit" value="Click here to toggle on/off the raw code."></form>'''

# Last expression of the cell, so the HTML object is rendered as its output.
HTML(TOGGLE_CODE_MARKUP)

- [0 Introduction](#0-Introduction)
- [1 Data](#1-Data)
  - [1.1 Lending Club Data](#1.1-Lending-Club-Data)
    - [1.1.1 Loan Status](#1.1.1-Loan-Status)
    - [1.1.2 Missing Data](#1.1.2-Missing-Data)
    - [1.1.3 Transform String Variables](#1.1.3-Transform-String-Variables)
    - [1.1.4 Variables with Constant Values](#1.1.4-Variables-with-Constant-Values)
    - [1.1.5 Meaningless Non-predictors](#1.1.5-Meaningless-Non-predictors)
    - [1.1.6 "Post-hoc" Variables](#1.1.6-"Post-hoc"-Variables)
    - [1.1.7 Third-party Credit Score](#1.1.7-Third-party-Credit-Score)
    - [1.1.8 Drop Highly-correlated Variables](#1.1.8-Drop-Highly-correlated-variables)
    - [1.1.9 Transform "annual_inc" to Log Measure](#1.1.9-Transform-"annual_inc"-to-Log-Measure)
    - [1.1.10 Trivial Changes](#1.1.10-Trivial-changes)
  - [1.2 Census Data](#1.2-Census-Data)
  - [1.3 Preprocessing Categorical Variables](#1.3-Preprocessing-Categorical-Variables)

- [2 Analysis](#2-Analysis)
  - [2.1 Visualization and Plots](#2.1-Visualization-and-plots)
    - [2.1.1 Annual Income and Loan Amount](#2.1.1-Annual-Income-and-Loan-Amount)
    - [2.1.2 Debt to Income Ratio and Loan Amount](#2.1.2-Debt-to-Income-Ratio-and-Loan-Amount)
    - [2.1.3 Home Ownership](#2.1.3-Home-Ownership)
  - [2.2 Feature Selection](#2.2-Feature-Selection)
    - [2.2.1 Lasso Regularization](#2.2.1-Lasso-Regularization)
    - [2.2.2 Random Forest Feature Importance](#2.2.2-Random-Forest-Feature-Importance)
    - [2.2.3 Step-wise Backward Feature Selection](#2.2.3-Step-wise-Backward-Feature-Selection)
    - [2.2.4 Features Selection Conclusion](#2.2.4-Features-Selection-Conclusion)
- [3 Classification Models](#3.-Classification)
  - [3.1 Performance Evaluation Metrics](#3.1-Performance-Evaluation-Metrics)
  - [3.2 Classification Models](#3.2-Classification-Models)
    - [3.2.1 Logistic Regression Model](#3.2.1-Logistic-Regression-Model)
    - [3.2.2 LDA and QDA](#3.2.2-LDA-and-QDA)
    - [3.2.3 Random Forest](#3.2.3-Random-Forest)
    - [3.2.4 Gradient Boost](#3.2.4-Gradient-Boost)
  - [3.3 Comparison of Different Models](#3.3-Comparison-of-Different-Models)
  - [3.4 Performance Improvements](#3.4-Performance-Improvements)
    - [3.4.1 Re-weight Classes](#3.4.1-Re-weight-Classes)
    - [3.4.2 Adjust Threshold](#3.4.2-Adjust-Threshold)
  - [3.5 Replicate Lending Club Credit Grades](#3.5-Replicate-Lending-Club-Credit-Grades)
- [4 Future Work](#4-Future-Work)


In [None]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression as Lin_Reg
from sklearn.linear_model import Ridge as Ridge_Reg
from sklearn.linear_model import Lasso as Lasso_Reg
from sklearn.linear_model import LogisticRegression as Logit
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.tree import DecisionTreeClassifier as DecisionTree
from sklearn.ensemble import RandomForestClassifier as RandomForest
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.preprocessing import LabelEncoder as LabelEncoder
import sklearn.preprocessing as Preprocessing
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import StandardScaler as Standardize
from sklearn.cross_validation import KFold
from sklearn.cross_validation import cross_val_score
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import train_test_split


import itertools as it
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.cm as cmx
import matplotlib.colors as colors
import scipy as sp
from scipy import stats
from itertools import combinations



from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score

from collections import defaultdict


%matplotlib inline
import warnings
warnings.filterwarnings("ignore")

# 0 Introduction

## Project Overview

This project explores the features of Lending Club loans and builds a variety of classification models to predict loan outcomes from the available information about borrowers and loans. To improve the prediction results, I also tuned the hyper-parameters of the models to pursue a higher prediction precision at the cost of a lower "pass rate", simulating a stricter preselection system. The results of this project could be used to help loan investors create a customized pool of loans based on their risk tolerance and desire for risk diversification. Another application of this project is that the "pass rate" could be treated as a credit grade similar to the Lending Club grade ("A" to "G"), and the loans predicted as "non-default" under a given "pass rate" are analogous to the loans with the corresponding grades.

## Background and Motivation

Lending Club is the world's largest peer-to-peer online lending platform today. It connects people who need money for various purposes with people who seek investment opportunities in loan markets, and increases the mobility and efficiency of the financial market. Lending Club offers investors access to a large pool of loans with different levels of default risk. Lending Club assigns a credit grade and subgrade to each loan based on the features of the loan and the borrower. The interest rate of the loan is based on the credit grade and subgrade. Once decided, the interest rate is fixed throughout the term of the loan. Investors can select which loans to invest in based on their risk and return preferences. Borrowers can prepay the loans at any time to eliminate future interest payments, and there is no prepayment penalty or fee.

This project predicts the loan outcomes based solely on raw information available to investors. Third-party credit scores, the Lending Club credit grade, and the interest rate are not used as predictors in my classification models (although from the perspective of investors, these are powerful predictors of loan outcome), since they are artificially created by human beings and already incorporate useful information.

# 1 Data 

## 1.1 Lending Club Data

Lending Club data is available at https://www.lendingclub.com/info/download-data.action. In this project I used the data of loans issued in the first three quarters in 2016. I compressed and uploaded this data to the project github page under 'data/LoanStats_securev1_2016Q1.csv.tar.gz', 'LoanStats_securev1_2016Q2.csv.tar.gz', and 'data/LoanStats_2016Q3.csv.tar.gz'. For each loan, 115 features are provided and the description of the features is under 'data/LCDataDictionary.xlsx'.

In [None]:
#### k-fold cross validation: grid search over random-forest size and depth
# For every (number of trees, max depth) pair a forest is fitted on 5 shuffled
# folds of the training data; mean accuracy, recall and AUC are computed, and
# the best pair by accuracy and the best pair by AUC are remembered.

X_train_sel = X_train
X_test_sel = X_test

# KFold needs the number of observations, i.e. rows (shape[0]). Using the
# number of columns here would restrict every fold to the first few rows.
n_obs = X_train.shape[0]

# Parameters for tuning (both grids are evenly spaced)
n_trees = np.arange(200, 800, 100)
depths = np.arange(4, 10, 2)

# To keep track of the best model
best_score = 0
best_recall = 0
best_auc = 0

# Run grid search for model with 5-fold cross validation
for trees in n_trees:
    for depth in depths:

        # A fresh shuffled 5-fold split for every parameter combination
        kf = KFold(n_obs, 5, shuffle = True)
        scores = []
        recalls = []
        auc = []
        for train_indices, validation_indices in kf:
            # Generate training data
            # NOTE(review): y_train is indexed with positional fold indices;
            # this assumes y_train is a NumPy array (or a Series with a
            # default 0..n-1 index) - confirm against the cell that builds y.
            x_train_cv = X_train_sel.iloc[train_indices, :]
            y_train_cv = y_train[train_indices]

            # Generate validation data
            x_validate = X_train_sel.iloc[validation_indices, :]
            y_validate = y_train[validation_indices]

            # Fit random forest on training data
            rf = RandomForest(n_estimators=trees, max_depth=depth)
            rf.fit(x_train_cv, y_train_cv)

            # Score on validation data: accuracy, recall and AUC
            scores += [rf.score(x_validate, y_validate)]
            y_hat_validate = rf.predict(x_validate)
            recalls += [recall(y_validate, y_hat_validate)]

            # Predicted probability of class 1, used for the AUC
            y_pred_logit = rf.predict_proba(x_validate)[:, 1]
            auc += [roc_auc_score(y_validate, y_pred_logit)]

        # Average the fold results for this parameter combination
        average_score = np.mean(scores)
        recall_rate = np.mean(recalls)
        avg_auc = np.mean(auc)

        # Update our record of the best parameters seen so far
        if average_score > best_score:
            best_score = average_score
            best_trees = trees
            best_depth = depth
        if avg_auc > best_auc:
            best_auc = avg_auc
            best_auc_trees = trees
            best_auc_depth = depth
# print 'number of trees, depth Chosen by Accuracy:', best_trees, ',', best_depth, ',', best_score
# print 'number of trees, depth Chosen by AUC:', best_auc_trees, ',', best_auc_depth, ',', best_auc

In [None]:
# Fit the model on the entire training set.
# NOTE(review): the hyper-parameters below are hard-coded rather than read
# from the grid-search results (best_trees / best_depth) - confirm intended.
rf = RandomForest(n_estimators=300, max_depth=8, min_samples_split = 10)
rf.fit(X_train_sel,y_train)

# Rank ALL features by importance first, then keep the 25 most important.
# (Slicing the first 25 columns before sorting would only rank whichever
# columns happen to come first in the frame.)
n_top_features = 25
ranked_features = sorted(zip(rf.feature_importances_, X_train_sel.columns))
importance_list, name_list = zip(*ranked_features[-n_top_features:])

# Ascending order + barh puts the most important feature at the top
plt.figure(figsize=(5, 6))
plt.barh(range(len(name_list)),importance_list,align='center', color = 'red', alpha = 0.5)
plt.yticks(range(len(name_list)),name_list)
plt.xlabel('Relative Importance in the Random Forest')
plt.ylabel('Features')
plt.title('Relative importance of Each Feature')
plt.show()

### 2.2.3 Step-wise Backward Feature Selection

In [None]:
# Candidate predictors fed into the step-wise backward selection below.
# The order is kept as-is because it fixes the column order of the subsets.
bcwd_list = [
    'acc_open_past_24mths',
    'addr_state',
    'all_util',
    'avg_cur_bal',
    'bc_open_to_buy',
    'bc_util',
    'delinq_amnt',
    'dti',
    'loan_amnt',
    'max_bal_bc',
    'mo_sin_old_il_acct',
    'mo_sin_rcnt_tl',
    'mths_since_rcnt_il',
    'mths_since_recent_inq',
    'pct_tl_nvr_dlq',
    'percent_bc_gt_75',
    'revol_bal',
    'tot_coll_amt',
    'tot_cur_bal',
    'tot_hi_cred_lim',
    'total_acc',
    'total_bal_ex_mort',
    'total_bal_il',
    'total_bc_limit',
    'total_cu_tl',
    'total_il_high_credit_limit',
    'total_rev_hi_lim',
    'Average',
    'acc_now_delinq',
    'chargeoff_within_12_mths',
    'initial_list_status',
    'collections_12_mths_ex_med',
    'application_type',
    'inq_fi',
    'mo_sin_rcnt_rev_tl_op',
    'emp_length',
    'inq_last_12m',
    'inq_last_6mths',
    'annual_inc',
    'delinq_2yrs',
    'home_ownership',
]

# Restrict the train and test frames to those candidate columns
X_sel_train, X_sel_test = X_train[bcwd_list], X_test[bcwd_list]

In [None]:
###  Step-wise Backward Selection
# At every step, each remaining predictor is removed in turn, the model is
# refitted, and the predictor whose removal leaves the highest training-set
# AUC is dropped for good.
#
# NOTE(review): `logit` is not created in this cell; it is whatever estimator
# an earlier cell left bound to that name - confirm this is the intended model.
d = X_sel_train.shape[1] # total no. of predictors

# Keep track of current set of chosen predictors (column positions).
# This must be a real list: it is copied with [:] and mutated with .remove().
current_predictors = list(range(d))

# Keep track of the best subset of predictors (never filled in by this cell)
best_subset = []

# Iterate over all possible subset sizes, d - 1 predictors down to 1 predictor
for size in range(d - 1, 0, -1): # stop before 0 to avoid choosing an empty set of predictors
    max_auc = -1e10 # initial value smaller than any AUC
    worst_predictor = -1 # throwaway initial value for the predictor to remove

    # Iterate over current set of predictors (for potential elimination)
    for i in current_predictors:
        # Create copy of current predictors, and remove predictor 'i'
        temp = current_predictors[:]
        temp.remove(i)

        # Use only a subset of predictors in the training data
        x_subset = X_sel_train.values[:, temp]

        # Fit and evaluate AUC on the same (training) data
        logit.fit(x_subset, y_train)

        y_pred_logit = logit.predict_proba(x_subset)[:, 1]
        auc = roc_auc_score(y_train, y_pred_logit)

        # Removing 'i' hurts least when the remaining model has the highest AUC
        if(auc > max_auc):
            max_auc = auc
            worst_predictor = i
    # Remove worst predictor from current set of predictors
    current_predictors.remove(worst_predictor)
#     print [features[i] for i in current_predictors], max_auc

### 2.2.4 Features Selection Conclusion

Based on the three feature selection methods and an analysis of each feature, the 25 predictors I chose to continue with are:
'loan_amnt','term', 'emp_length', 'home_ownership', 'annual_inc', 'verification_status', 'purpose',
  'title', 'addr_state','delinq_2yrs', 'open_acc', 'revol_util','open_acc_6m', 'open_il_12m', 'total_cu_tl',  'mo_sin_old_il_acct', 'mort_acc',   'mths_since_recent_bc', 'num_actv_rev_tl', 'pct_tl_nvr_dlq', 'percent_bc_gt_75', 'avg_cur_bal','bc_open_to_buy','total_rev_hi_lim', 'revol_bal'.

In [None]:
# Final set of 25 predictors used by the classification models
feature = [
    'loan_amnt', 'term', 'emp_length', 'home_ownership', 'annual_inc',
    'verification_status', 'purpose', 'title', 'addr_state', 'delinq_2yrs',
    'open_acc', 'revol_util', 'open_acc_6m', 'open_il_12m', 'total_cu_tl',
    'mo_sin_old_rev_tl_op', 'mort_acc', 'mths_since_recent_bc', 'num_actv_rev_tl', 'pct_tl_nvr_dlq',
    'percent_bc_gt_75', 'avg_cur_bal', 'bc_open_to_buy', 'total_rev_hi_lim', 'revol_bal',
]

# 3. Classification

## 3.1 Performance Evaluation Metrics

I defined a function called "predict_table", which prints a comprehensive table of the classification results, including the True/False Positive/Negative counts as well as the recall and precision on the default (class 1) and non-default (class 0) groups. It also reports the accuracy of the classification results and the "pass rate", defined as the proportion of loans predicted as non-default among all loans. The AUC is computed separately for each model. All of the evaluation metrics are calculated on the test data set.

In [None]:
def predict_table(y, y_hat):
    """Print a confusion-matrix style report for binary 0/1 labels.

    Parameters
    ----------
    y : array-like of 0/1
        True labels (1 is treated as the positive class).
    y_hat : array-like of 0/1
        Predicted labels, same length as ``y``.

    Prints the four confusion-matrix counts, recall and precision for both
    classes, the accuracy, and the "pass rate" (share of observations
    predicted as class 0). A ratio whose denominator is zero is reported as
    ``nan`` instead of raising ``ZeroDivisionError``. Returns ``None``.

    Raises
    ------
    ValueError
        If ``y`` and ``y_hat`` do not have the same number of elements.
    """
    # Work on plain positional arrays so that lists and pandas objects with
    # an arbitrary index are all handled the same way.
    y = np.asarray(y).ravel()
    y_hat = np.asarray(y_hat).ravel()
    if y.shape != y_hat.shape:
        raise ValueError("y and y_hat must have the same length")

    def _ratio(numerator, denominator):
        # Undefined ratios (empty class / nothing predicted) become nan
        return numerator / denominator if denominator else float('nan')

    total = len(y)
    tp = float(np.sum((y == 1) & (y_hat == 1)))
    tn = float(np.sum((y == 0) & (y_hat == 0)))
    fn = float(np.sum((y == 1) & (y_hat == 0)))
    fp = float(np.sum((y == 0) & (y_hat == 1)))

    # Any pair that is not a 0/1 combination is reported once per observation
    for _ in range(int(total - (tp + tn + fn + fp))):
        print("Something is wrong with labels!")

    print("Total number of test data: " + str(total))
    print("True  Positive:            " + str(tp))
    print("True  Negative:            " + str(tn))
    print("False Positive:            " + str(fp))
    print("False Negative:            " + str(fn))
    print("Recall    on class 1:      " + str(_ratio(tp, tp + fn)))
    print("Precision on class 1:      " + str(_ratio(tp, tp + fp)))
    print("Recall    on class 0:      " + str(_ratio(tn, tn + fp)))
    print("Precision on class 0:      " + str(_ratio(tn, tn + fn)))
    print("Accuracy:                  " + str(_ratio(tp + tn, total)))
    print("Pass rate:            " + str(_ratio(tn + fn, total)))

With the predictors I selected in the last step, I tried to classify the loans to "default" and "non-default" with different classification models: Logistic Regression, LDA and QDA, Random Forest, and Gradient Boost.

### 1.1.2 Missing Data

I dropped variables with more than 10% missing values in the data set. For the remaining missing data, I used the scikit-learn imputation function to fill in the missing values with the mean of the variable over the whole data set.

In [None]:
# Drop every variable that is missing in more than 10% of the rows
missing_counts = x.isnull().sum()
too_many_missing = list(missing_counts[missing_counts > 0.1*x.shape[0]].index)
print("The number of variables deleted due to too many missing values is %s" % len(too_many_missing))
x = x.drop(too_many_missing, axis = 1)

### 1.1.3 Transform String Variables

The variable "issue_d" is a string containing the issue year and month of the loans. I created two variables of 'issue_y' and 'issue_m' and deleted the variable "issue_d". Same for string variables 'earliest_cr_line' and 'revol_util'.

In [None]:
## split the 'issue_d' string into a year part and a month part
issue_dates = x['issue_d']

# everything from the 5th character onwards is kept as the year
x['issue_y'] = issue_dates.str.slice(start=4)

# the first three characters are kept as the month
x['issue_m'] = issue_dates.str.slice(stop=3)

# the raw string column is no longer needed
x = x.drop('issue_d', axis=1)

In [None]:
def _percent_to_fraction(value):
    """Turn a value such as '45.3%' into 0.453 (goes through str(), then float())."""
    return float(str(value).strip('%'))/100

# Difference between the issue year and the year of the earliest credit line
issue_year = x['issue_y'].apply(float)
earliest_credit_year = x['earliest_cr_line'].str[4:].apply(float)
x['cr_y'] = issue_year - earliest_credit_year

# Percentage strings become fractions
x['revol_util'] = x['revol_util'].apply(_percent_to_fraction)

x = x.drop('earliest_cr_line', axis=1)

### 1.1.4 Variables with Constant Values

1. 
There might be variables with constant values in the data set, which have no use in predicting the loan outcomes. Therefore I checked the number of unique values for each variable in the data set and deleted the constant variables.
2. 
There might be variables that have a valid meaning for default loans only and no meaning for non-default loans (for example, "collection_recovery_fee"), or vice versa. These variables are strongly associated with the loan outcomes, but they should not be used as predictors because they are essentially a record of the loan outcome itself. Therefore I also deleted variables with constant values for all default loans or with constant values for all non-default loans.


In [None]:
## count the distinct values of each variable over the entire dataset
unique_values = []
for column in x.columns:
    n_distinct = len(x[column].unique())

    # a single distinct value means the variable carries no information
    if n_distinct == 1:
        print("%s has a constant value for all observations." % (column,))

    ## keep the count, in column order
    unique_values.append(n_distinct)

#print "The numbers of unique values of corresponding variables are", unique_values

In [None]:
# Remove the three listed columns (this section deals with constant-valued variables)
constant_columns = ['policy_code', 'pymnt_plan', 'issue_y']
x = x.drop(constant_columns, axis=1)

In [None]:
## count the distinct values of each variable separately for y == 0 and y == 1
unique_values_by_status = pd.DataFrame({})
for column in x.columns:
    values = x[column]
    n_distinct_when_0 = len(values[y == 0].unique())
    n_distinct_when_1 = len(values[y == 1].unique())

    ## the pair of counts for this variable: (y == 0 group, y == 1 group)
    unique = pd.Series((n_distinct_when_0, n_distinct_when_1))

    # constant within the y == 0 group
    if n_distinct_when_0 == 1:
        print("%s only valid for default loans" % (column,))

    # constant within the y == 1 group
    if n_distinct_when_1 == 1:
        print("%s only valid for non-default loans" % (column,))

    ## add the pair as a new column of unique_values_by_status
    unique_values_by_status = pd.concat([unique_values_by_status, unique], axis=1)

In [None]:
# Remove the four listed columns (this section deals with variables that are
# constant within one loan-status group)
outcome_only_columns = ['collection_recovery_fee', 'out_prncp', 'out_prncp_inv', 'recoveries']
x = x.drop(outcome_only_columns, axis=1)

### 1.1.5 Meaningless Non-predictors

Delete variables that are not meaningful and thus do not contribute to the prediction of loan outcome.

In [None]:
## Remove the columns listed below as non-predictors; a new frame is returned
non_predictor_columns = ['member_id', 'id', 'url', 'emp_title']
x = x.drop(non_predictor_columns, axis=1)

###  1.1.6 "Post-hoc" Variables

"Post-hoc" variables are variables that are not available until after the issuance of the loans, such as the principal received to date. Since we are predicting the loan outcomes in order to decide whether or not to lend the money, we need to make the decision with the information available before the issuance of the loans. Therefore I went through the documentation and identified all the "post-hoc" variables. Including the "post-hoc" variables as predictors gave me near-perfect prediction accuracy (around 98%), but in practice it makes no sense to include post-hoc information in a prediction model, so I excluded all the post-hoc variables.

In [None]:
# Remove the columns listed below (the "post-hoc" variables of this section)
post_hoc_columns = [
    'total_pymnt',
    'total_pymnt_inv',
    'last_pymnt_amnt',
    'last_pymnt_d',
    'total_rec_int',
    'total_rec_late_fee',
    'last_credit_pull_d',
    'total_rec_prncp',
]
x = x.drop(post_hoc_columns, axis=1)

### 1.1.7 Third-party Credit Score

The Lending Club data set also includes the FICO scores of the borrowers and the Lending Club credit grade of the loans. Since the credit score and grade are artificially generated by human beings trying to predict the default probability of the loans, and the goal of this project is to predict the outcomes of loans from raw information, it is appropriate to exclude the credit grade as a predictor. Similarly, the interest rate of the loans should be excluded because it is also a reflection of the default probability of the loans.

In [None]:
# Remove the credit grade, sub-grade and interest-rate columns
grade_columns = ['grade', 'sub_grade', 'int_rate']
x = x.drop(grade_columns, axis=1)

### 1.1.8 Drop Highly-correlated variables

Among the remaining 80 features, some are highly correlated. I computed the correlation between each pair of features and identified the groups of highly correlated features. Then I deleted the redundant features, keeping only one feature to represent each group.

In [None]:
# Pairwise correlation matrix of the columns of x, kept for inspection
corr_m = x.corr()

# Columns removed as highly correlated (the list is hard-coded, it is not
# derived from corr_m in this cell)
high_corr = [
    'funded_amnt',
    'funded_amnt_inv',
    'installment',
]
x = x.drop(high_corr, axis=1)

### 1.1.9 Transform "annual_inc" to Log Measure

### 2.1.1 Annual Income and Loan Amount

First I explored the relationship between loan amount and annual income, and plotted the relationship curves separately for the two loan outcomes (default and non-default). For each range of loan amount and for each loan outcome, I calculated the average annual income of the borrowers. I found that the two curves are far apart when the loan amount is relatively low, but converge as the loan amount exceeds 20000. Note that annual income has been transformed to a log measure because its distribution is right-skewed.

This result gives a very important hint: for small loans with amount less than 20000, borrower's annual income has a very important influence on loan outcome. However, for large loans with amount greater than 20000, borrower's annual income does not make a difference in loan outcome.

Generally, the red line lies below the blue line, indicating that, whatever the loan amount, the average annual income of non-default loan borrowers is higher than that of default loan borrowers. This is consistent with our intuition.  

In [None]:
# Mean annual income per loan-amount bin (25 bins), one curve per outcome.
# Rows without an annual income are excluded from BOTH groups, so the two
# curves are computed in the same way and no bin mean is contaminated by NaN.
has_income = x['annual_inc'].notnull().values
is_non_default = (y == 0) & has_income
is_default = (y == 1) & has_income

bin_means_0, bin_edges_0, binnumber_0 = stats.binned_statistic(x['loan_amnt'][is_non_default], x['annual_inc'][is_non_default], statistic='mean', bins=25)
bin_width_0 = (bin_edges_0[1] - bin_edges_0[0])
# right edge minus half a bin width = centre of each bin
bin_centers_0 = bin_edges_0[1:] - bin_width_0/2

bin_means_1, bin_edges_1, binnumber_1 = stats.binned_statistic(x['loan_amnt'][is_default], x['annual_inc'][is_default], statistic='mean', bins=25)
bin_width_1 = (bin_edges_1[1] - bin_edges_1[0])
bin_centers_1 = bin_edges_1[1:] - bin_width_1/2

# Markers and connecting line: blue for y == 0, red for y == 1
plt.scatter(bin_centers_0, bin_means_0, c= 'blue', label = 'non-default')
plt.plot(bin_centers_0, bin_means_0, c= 'blue')
plt.scatter(bin_centers_1, bin_means_1, c='red', label = 'default')
plt.plot(bin_centers_1, bin_means_1, c='red')

plt.title("average annual_income for each range of loan amount")
plt.xlabel('loan amount')
plt.ylabel('annual income')
plt.legend(loc = 2)
plt.xlim(0, 40000)
plt.show()

### 2.1.2 Debt to Income Ratio and Loan Amount

Then I plotted the relationship curves of debt to income ratio and loan amount by loan outcomes in the same way as 2.1.1. Debt to income ratio is also different for default and non-default loans for each range of loan amount. Generally the borrowers of loans that default have higher debt to income ratio than the borrowers whose loans did not default.

The two curves are clearly separated and parallel to each other when the loan amount is small. However, as the loan amount grows, the two curves converge and even cross over. This indicates that the debt to income ratio might work well in separating default and non-default loans for small loans, but for large loans its influence on the loan outcome becomes trivial.

In [None]:
# Mean debt-to-income ratio per loan-amount bin (25 bins), one curve per outcome
loan_amounts = x['loan_amnt']
dti_values = x['dti']
non_default_rows = (y == 0)
default_rows = (y == 1)

# y == 0 group
bin_means_0, bin_edges_0, binnumber_0 = stats.binned_statistic(
    loan_amounts[non_default_rows], dti_values[non_default_rows],
    statistic='mean', bins=25)
bin_width_0 = bin_edges_0[1] - bin_edges_0[0]
bin_centers_0 = bin_edges_0[1:] - bin_width_0/2   # right edge minus half a bin

# y == 1 group
bin_means_1, bin_edges_1, binnumber_1 = stats.binned_statistic(
    loan_amounts[default_rows], dti_values[default_rows],
    statistic='mean', bins=25)
bin_width_1 = bin_edges_1[1] - bin_edges_1[0]
bin_centers_1 = bin_edges_1[1:] - bin_width_1/2

# Markers plus a connecting line for each group, non-default first
curves = (
    (bin_centers_0, bin_means_0, 'blue', 'non-default'),
    (bin_centers_1, bin_means_1, 'red', 'default'),
)
for centers, means, colour, outcome_label in curves:
    plt.scatter(centers, means, c=colour, label=outcome_label)
    plt.plot(centers, means, c=colour)

plt.title("debt to income ratio for each range of loan amount")
plt.xlabel('loan amount')
plt.ylabel('debt to income')
plt.legend(loc='upper left')
plt.xlim(0, 40000)
plt.show()

### 2.1.3 Home Ownership

The type of home ownership might also be related with loan outcomes, since different home ownership might be an indication of the borrower's financial situation and influence the cash flow capability of the borrowers.

From the results we can find that the "RENT" type of ownership has the highest proportion of default loans, while "MORTGAGE" type of ownership has the lowest. This makes sense because people who choose to apply for mortgage and got approved are those with good credit history and stable income cash flows compared to people who rent to live. 

In [None]:
## one observation with home_ownership == any, drop it
# y = y[x['home_ownership'].values != 'ANY']
# x = x[x['home_ownership']!= 'ANY']

# Number of loans per home-ownership type, separately for each outcome
default_counts = x[y == 1].home_ownership.value_counts()
non_default_counts = x[y == 0].home_ownership.value_counts()

# value_counts() orders each result by its own frequencies, so the two
# series can list the ownership types in different orders. Put both on one
# shared, sorted set of types (0 where a type does not occur) so that bars
# drawn at the same position always refer to the same type.
ownership_types = sorted(set(default_counts.index) | set(non_default_counts.index))
default_counts = default_counts.reindex(ownership_types).fillna(0)
non_default_counts = non_default_counts.reindex(ownership_types).fillna(0)

print("The proportion of default by different types of home ownership is:")
print(default_counts/(default_counts + non_default_counts))

# pandas takes the target axes through the `ax` keyword
ax1 = plt.subplot()
default_counts.plot(kind='bar', color = 'red', position=0,width=0.25,alpha = 0.4, ax = ax1, label = 'default')
non_default_counts.plot(kind='bar', color = 'blue',position=1,width=0.25, alpha = 0.4, ax = ax1, label = 'non-default')


plt.legend()
plt.title('Bar plot of Home Ownership')
plt.show()

In [None]:
# Remove the 'loan_status' column from the predictors
x = x.drop(['loan_status'], axis = 1)
# Independent snapshot of x at this stage; .copy() (unlike the slice x[:])
# guarantees that later changes to x cannot show up in x_copy.
x_copy = x.copy()

## 2.2 Feature Selection

Before building any classification model, I analyzed the features in the data set and tried to select the most predictive ones. I used three methods to select features: Lasso, Random Forest Importance Rank, and Step-wise Backward Feature Selection. The features selected by the different methods are analyzed in combination to figure out the "best" set of predictors.

Before conducting feature selection, I split the whole data set into training and testing data sets to avoid fitting to the test data set.

In [None]:
# 70/30 train/test split. The random state is fixed so that the split, and
# every result computed from it, is the same each time the notebook is re-run.
X_train, X_test, y_train, y_test = train_test_split(x_encod, y, test_size=0.3, random_state=42)

In [None]:
def recall(y, y_hat):
    """Recall on class 1: share of the true 1s that were predicted as 1.

    ``y`` (true labels) and ``y_hat`` (predicted labels) are 0/1 array-likes
    of equal length; lists, NumPy arrays and pandas objects are accepted and
    compared position by position. Returns ``nan`` when ``y`` contains no 1.
    """
    y = np.asarray(y)
    y_hat = np.asarray(y_hat)
    predictions_for_positives = y_hat[y == 1]
    if predictions_for_positives.size == 0:
        # recall is undefined without any positive observation
        return np.nan
    score = np.mean(predictions_for_positives)
    return score

def precision(y, y_hat):
    """Precision on class 1: share of the predicted 1s that are truly 1.

    ``y`` (true labels) and ``y_hat`` (predicted labels) are 0/1 array-likes
    of equal length; lists, NumPy arrays and pandas objects are accepted and
    compared position by position. Returns ``nan`` when nothing was
    predicted as 1.
    """
    y = np.asarray(y)
    y_hat = np.asarray(y_hat)
    truth_for_predicted_positives = y[y_hat == 1]
    if truth_for_predicted_positives.size == 0:
        # precision is undefined when no observation is predicted positive
        return np.nan
    score = np.mean(truth_for_predicted_positives)
    return score

def F_score(y, y_hat):
    """F score: harmonic mean of precision(y, y_hat) and recall(y, y_hat).

    Returns 0.0 when precision and recall are both 0 (no true positives),
    where the plain formula would divide zero by zero. If precision or
    recall is itself ``nan``, the result is ``nan``.
    """
    prec = precision(y,y_hat)
    rec = recall(y, y_hat)
    if prec + rec == 0:
        # both components are 0: the harmonic mean is taken to be 0
        return 0.0
    score = 2*(prec*rec)/(prec+rec)
    return score

### 2.2.1 Lasso Regularization

First I used logistic regression with Lasso regularization and different values of the penalty hyper-parameter C. A smaller C means stricter regularization and fewer features with non-zero parameters. There is a trade-off between bias and variance: models with fewer features tend to be robust to changes in the data but have more classification error; models with more features are more flexible and good at fitting the training data, but tend to be volatile and sensitive to changes in the data set. There is a "sweet spot" in between, at which a model gives good performance with an appropriate number of predictors.

I drew the ROC curves of all the logistic models with different C-parameters, and based on the analysis of Logistic regression results and ROC curves, I found the "sweet point" with C = 0.001 and 27 non-zero parameter predictors.

In [None]:
# Penalty values to try, from weakest to strongest regularization
Cs = [1, 0.1, 0.01, 0.001, 0.0001, 0.00001]

# ROC curve coordinates, one slot per penalty value
fpr = [0]*len(Cs)
tpr = [0]*len(Cs)

for i, C in enumerate(Cs):
    print("penalty C is %s" % C)

    # L1-penalised logistic regression fitted on the training data
    logit = Logit(penalty='l1', C=C)
    logit.fit(X_train, y_train)
    coefs = logit.coef_

    #### predicted labels and predicted probability of class 1 (training data)
    y_hat = logit.predict(X_train)
    y_pred_logit = logit.predict_proba(X_train)[:, 1]

    #### ROC curve for this penalty value
    fpr[i], tpr[i], _ = roc_curve(y_train, y_pred_logit)

    #### names of the predictors whose coefficient is not zero
    features = [X_train.columns.values[k]
                for k in range(len(coefs[0]))
                if coefs[0][k] != 0]
    count = len(features)
    print("Number of non-zero parameters is %s" % count)
#     print features

    #### in-sample performance summary
    print("accuracy rate is %s" % logit.score(X_train, y_train))
    print("recall rate is %s" % recall(y_train, y_hat))
    print("precision rate is %s" % precision(y_train, y_hat))
    print("F score is %s" % F_score(y_train, y_hat))
    print("AUC score is %s" % roc_auc_score(y_train, y_pred_logit))
    print("")

In [None]:
# One ROC curve per penalty value, labelled with that value
for penalty, fp_rates, tp_rates in zip(Cs, fpr, tpr):
    plt.plot(fp_rates, tp_rates, label='C =' + str(penalty))

# Unit square with axis labels
plt.xlim(0, 1)
plt.ylim(0, 1)
plt.xlabel('false positive rate')
plt.ylabel('true positive rate')

# Dashed diagonal from (0, 0) to (1, 1) as a reference line
plt.plot([0, 1], [0, 1], 'k--')
plt.legend(loc='lower right')
plt.show()

### 2.2.2 Random Forest Feature Importance

In [None]:
# Replace annual income by log(1 + income); log1p computes this directly and
# stays accurate for values close to zero.
# NOTE: this overwrites the column, so running the cell twice applies the
# transform twice.
x['annual_inc'] = np.log1p(x['annual_inc'])
# plt.hist(x['annual_inc'])

### 1.1.10 Trivial changes

In [None]:
# Restrict the train and test frames to the selected predictors
X_sel_train = X_train[feature]
X_sel_test = X_test[feature]

# The model cells below read X_train_sel / X_test_sel, so bind those names to
# the same selected-feature frames instead of relying on whatever an earlier
# cell left in them.
X_train_sel = X_sel_train
X_test_sel = X_sel_test

## 3.2 Classification Models

### 3.2.1 Logistic Regression Model 

In [None]:
# Logistic regression with default settings, fitted on the training data
clf_slc = Logit()
clf_slc.fit(X_train_sel, y_train)

# Test-set predictions: probability of class 1, and hard labels
y_predict_prob_slc = clf_slc.predict_proba(X_test_sel)[:, 1]
y_predict_slc = clf_slc.predict(X_test_sel)

# ROC curve coordinates on the test set
fpr_slc, tpr_slc, _ = roc_curve(y_test, y_predict_prob_slc)

# Performance report on the test set
print("**************************************************")
print(" Sklearn logistic regression Model performance report")
print("**************************************************")
predict_table(y_test, y_predict_slc)
print("auc score =  %s" % roc_auc_score(y_test, y_predict_prob_slc))

### 3.2.2 LDA and QDA

In [None]:
# --- Linear discriminant analysis: fit on train, predict on test ---
clf_lda = LDA()
clf_lda.fit(X_train_sel, y_train)
y_predict_lda = clf_lda.predict(X_test_sel)
lda_test_proba = clf_lda.predict_proba(X_test_sel)
y_predict_prob_lda = lda_test_proba[:, 1]   # probability of class 1

fpr_lda, tpr_lda, _ = roc_curve(y_test, y_predict_prob_lda)

# --- Quadratic discriminant analysis: same steps ---
clf_qda = QDA()
clf_qda.fit(X_train_sel, y_train)
y_predict_qda = clf_qda.predict(X_test_sel)
qda_test_proba = clf_qda.predict_proba(X_test_sel)
y_predict_prob_qda = qda_test_proba[:, 1]   # probability of class 1

fpr_qda, tpr_qda, _ = roc_curve(y_test, y_predict_prob_qda)


# --- Test-set reports, LDA first ---
print("**************************************************")
print(" LDA Model performance report")
print("**************************************************")
predict_table(y_test, y_predict_lda)
print("auc score =  %s" % roc_auc_score(y_test, y_predict_prob_lda))

print("**************************************************")
print(" QDA performance report")
print("**************************************************")
predict_table(y_test, y_predict_qda)
print("auc score =  %s" % roc_auc_score(y_test, y_predict_prob_qda))

### 3.2.3 Random Forest

In [None]:
X_train_sel = X_train[feature]
X_test_sel = X_test[feature]

n_obs = X_train.shape[1]
#Parameters for tuning
n_trees = np.arange(200, 800, 100)  # Trees and depth are explored on an exponentially growing space,
depths = np.arange(4, 10, 2)   # since it is assumed that trees and depth will add accuracy in a decaying fashion.

# To keep track of the best model
best_score = 0
best_recall = 0
best_auc = 0

# Run grid search for model with 5-fold cross validation
print '5-fold cross validation:'

for trees in n_trees:
    for depth in depths:
        
        # Cross validation for every experiment
        kf = KFold(n_obs, 5, shuffle = True)
        scores = []
        recalls = []
        auc = []
        for train_indices, validation_indices in kf:
            # Generate training data
            x_train_cv = X_train_sel.iloc[train_indices, :]
            y_train_cv = y_train[train_indices]
            
#             print x_train_cv.shape, y_train_cv.shape
            # Generate validation data
            x_validate = X_train_sel.iloc[validation_indices, :]
            y_validate = y_train[validation_indices]
            
            # Fit random forest on training data
            rf = RandomForest(n_estimators=trees, max_depth=depth)
            rf.fit(x_train_cv, y_train_cv)
            # Score on validation data
            scores += [rf.score(x_validate, y_validate)]
            y_hat_validate = rf.predict(x_validate)
            recalls += [recall(y_validate, y_hat_validate)]
            
            y_pred_logit = rf.predict_proba(x_validate)[:, 1]
            auc += [roc_auc_score(y_validate, y_pred_logit)]
        # Record and report accuracy
        average_score = np.mean(scores)
        recall_rate = np.mean(recalls)
        avg_auc = np.mean(auc)
        print "Trees:", trees, "Depth:", depth, "Score:", average_score, "AUC", avg_auc
        
        # Update our record of the best parameters see so far
        if average_score > best_score:
            best_score = average_score
            best_trees = trees
            best_depth = depth
        if avg_auc > best_auc:
            best_auc = avg_auc
            best_auc_trees = trees
            best_auc_depth = depth
print "Best number of trees and best tree depth according to accuracy is", best_trees, best_depth
print "Best number of trees and best tree depth according to AUC is", best_auc_trees, best_auc_depth

In [None]:
rf = RandomForest(n_estimators = 300, max_depth = 8, random_state = 8)
rf.fit(X_train_sel, y_train)

y_predict_rf = rf.predict(X_test_sel)
y_predict_prob_rf = rf.predict_proba(X_test_sel)[:,1]

fpr_rf, tpr_rf, _ = roc_curve(y_test, y_predict_prob_rf)


print "**************************************************"
print " Random Forest Model performance report"
print "**************************************************"
predict_table(y_test, y_predict_rf)
print "auc score = ", roc_auc_score(y_test, y_predict_prob_rf)

### 3.2.4 Gradient Boost

In [None]:
# Hyper-parameter grid explored by 5-fold cross-validated grid search.
# NOTE(review): learning rates of 5.0 and 10.0 are unusually large for
# gradient boosting -- confirm this range is intended.
tuned_parameters = [{'n_estimators': [50, 100, 200], 'learning_rate': [1.0, 5.0, 10.0], 'max_depth': [1, 3, 5]}]

# Metrics to optimise; each is passed to GridSearchCV as "<metric>_macro".
scores = ['recall']
for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    # print("") emits a blank line under both the print statement and the
    # print function; a bare print() would show "()" when print is a statement.
    print("")

    clf = GridSearchCV(GradientBoostingClassifier(), tuned_parameters, cv=5,
                       scoring='%s_macro' % score)
    clf.fit(X_train_sel, y_train)

    print("Best parameters set found on development set:")
    print("")
    print(clf.best_params_)

    print("")

In [None]:
clf_boost = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0)
clf_boost.fit(X_train_sel, y_train)
y_predict_boost = clf_boost.predict(X_test_sel)
y_predict_prob_boost = clf_boost.predict_proba(X_test_sel)[:,1]


fpr_boost, tpr_boost, _ = roc_curve(y_test, y_predict_prob_boost)

print "**************************************************"
print "Boosting Model performance report"
print "**************************************************"
predict_table(y_test, y_predict_boost)
print "auc score = ", roc_auc_score(y_test, y_predict_prob_boost)

## 3.3 Comparison of Different Models

In [None]:
# Read the three quarterly loan exports in order (Q1, Q2, Q3 of 2016).
# skiprows=1 drops the first line of each file, so the second line is used
# as the header row.
quarterly_frames = [
    pd.read_csv(csv_path, skiprows=1)
    for csv_path in (
        "./data/LoanStats_securev1_2016Q1.csv",
        "./data/LoanStats_securev1_2016Q2.csv",
        "./data/LoanStats_2016Q3.csv",
    )
]
data1, data2, data3 = quarterly_frames

## stack the quarters row-wise into one dataset with a fresh 0..n-1 index
data_all = pd.concat(quarterly_frames, axis=0, ignore_index=True)

### 1.1.1 Loan Status

A total of 330867 loans were issued by Lending Club in the first three quarters of 2016. Each loan has one of several statuses, such as "Current", "Fully Paid", "Charged Off", etc. I treated the "Current" status as unknown and focused on the loans with a known status: either "Fully Paid" (meaning non-default) or "Default" (including "Charged Off", "Late", "In Grace Period", "Does not meet the credit policy", and "Default"). I assigned the value 1 to default loans and the value 0 to non-default loans.

In [None]:
print "The distribution of loan status:"
print
print data_all['loan_status'].value_counts()

In [None]:
## remove "current status"
data = data_all[data_all['loan_status']!= 'Current']

## remove observations with unknown (NaN) loan status
data = data[data['loan_status'] == data['loan_status']]

# y == 1 if default, y == 0 if fully paid
y = np.ones(data.shape[0])
y[data['loan_status'].values =='Fully Paid'] = 0

total_default = len(y[y==1])
total_non_default = len(y[y==0])
print "The number of newly-defined defaulted loans is", total_default, ", and the number of non-default loans is", total_non_default

In [None]:
## x gets an independent copy of data.
## data[:] is only a slice of the same frame; .copy() gives x its own data, so
## later column assignments on x neither touch `data` nor trigger pandas'
## SettingWithCopyWarning.
x = data.copy()