In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn import ensemble
from sklearn.model_selection import cross_val_score
from sklearn.neural_network import MLPClassifier

In [2]:
# Load the 2015 Lending Club loan data (the same file used in the Random Forests lesson).
# header=1 takes the column names from the second line of the file,
# skipinitialspace drops blanks that follow each delimiter, and
# low_memory=False parses the file in one pass instead of in chunks.
y2015 = pd.read_csv('LoanStats3d.csv', header=1, skipinitialspace=True, low_memory=False)

In [3]:
# The last two rows of the file are summary lines that don't actually contain
# loan data, so slice them off; then keep only the columns without any nulls
# (there is still plenty of data to work with).
y2015 = y2015.iloc[:-2].dropna(axis=1)

In [4]:
# Convert the interest rate to numeric: it is stored as text, so strip any '%'
# characters from the ends and parse the remainder (unparseable values become NaN).
rate_text = y2015['int_rate'].str.strip('%')
y2015['int_rate'] = pd.to_numeric(rate_text, errors='coerce')

In [5]:
# Show which columns remain in y2015 at this point.
y2015.columns

Index(['loan_amnt', 'funded_amnt', 'funded_amnt_inv', 'term', 'int_rate',
       'installment', 'grade', 'sub_grade', 'home_ownership', 'annual_inc',
       'verification_status', 'issue_d', 'loan_status', 'pymnt_plan',
       'purpose', 'zip_code', 'addr_state', 'delinq_2yrs', 'earliest_cr_line',
       'inq_last_6mths', 'open_acc', 'pub_rec', 'revol_bal', 'total_acc',
       'initial_list_status', 'out_prncp', 'out_prncp_inv', 'total_pymnt',
       'total_pymnt_inv', 'total_rec_prncp', 'total_rec_int',
       'total_rec_late_fee', 'recoveries', 'collection_recovery_fee',
       'last_pymnt_amnt', 'collections_12_mths_ex_med', 'policy_code',
       'application_type', 'acc_now_delinq', 'tot_coll_amt', 'tot_cur_bal',
       'total_rev_hi_lim', 'acc_open_past_24mths', 'avg_cur_bal',
       'chargeoff_within_12_mths', 'delinq_amnt', 'mo_sin_old_rev_tl_op',
       'mo_sin_rcnt_rev_tl_op', 'mo_sin_rcnt_tl', 'mort_acc',
       'num_accts_ever_120_pd', 'num_actv_bc_tl', 'num_actv_rev_tl',
  

In [6]:
# Drop other columns with many unique values; one-hot encoding them later
# would create one dummy column per distinct value.
# `columns=` replaces the positional axis argument (`drop(labels, 1)`), which
# pandas deprecated in 1.3 and rejects with a TypeError from 2.0 onwards.
# Reassigning instead of `inplace=True` keeps the cell's effect explicit.
y2015 = y2015.drop(columns=['zip_code', 'earliest_cr_line'])

In [7]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
y2015.describe()

Unnamed: 0,loan_amnt,funded_amnt,funded_amnt_inv,int_rate,installment,annual_inc,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,...,num_tl_30dpd,num_tl_90g_dpd_24m,num_tl_op_past_12m,pct_tl_nvr_dlq,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit
count,421095.0,421095.0,421095.0,421095.0,421095.0,421095.0,421095.0,421095.0,421095.0,421095.0,...,421095.0,421095.0,421095.0,421095.0,421095.0,421095.0,421095.0,421095.0,421095.0,421095.0
mean,15240.285862,15240.285862,15234.156426,12.599315,441.842344,76965.61,0.347397,0.57269,11.948643,0.233776,...,0.004042,0.094318,2.170947,93.924313,0.133188,0.064556,173407.0,52221.99,21958.208958,43478.65
std,8571.325901,8571.325901,8567.935757,4.318782,244.847058,73949.96,0.927695,0.867776,5.633119,0.649778,...,0.067965,0.522758,1.853858,8.875399,0.385975,0.444231,175672.2,49074.94,21840.371112,43942.71
min,1000.0,1000.0,900.0,5.32,14.01,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,2500.0,0.0,0.0,0.0
25%,8500.0,8500.0,8500.0,9.17,263.93,46000.0,0.0,0.0,8.0,0.0,...,0.0,0.0,1.0,90.9,0.0,0.0,50909.0,22598.5,7800.0,15481.5
50%,14000.0,14000.0,14000.0,12.29,385.41,65000.0,0.0,0.0,11.0,0.0,...,0.0,0.0,2.0,97.4,0.0,0.0,111405.0,39596.0,15200.0,32970.0
75%,20000.0,20000.0,20000.0,15.59,578.79,91690.5,0.0,1.0,15.0,0.0,...,0.0,0.0,3.0,100.0,0.0,0.0,249629.0,65651.0,28600.0,58299.0
max,35000.0,35000.0,35000.0,28.99,1445.46,9500000.0,39.0,6.0,90.0,86.0,...,4.0,39.0,30.0,100.0,11.0,85.0,9999999.0,2921551.0,834300.0,2101913.0


In [10]:
# Blind approach: use every remaining column as a feature, with no feature
# selection, and predict loan_status.
# random_state pins the forest's random sampling so reruns give the same scores.
rfc = ensemble.RandomForestClassifier(random_state=42)

# Target is the loan status; everything else is a feature.
# `columns=` replaces the positional axis argument (`drop(labels, 1)`), which
# pandas deprecated in 1.3 and rejects with a TypeError from 2.0 onwards.
Y = y2015['loan_status']
X = y2015.drop(columns=['loan_status'])

# One-hot encode the remaining text columns so the models get numeric input.
X = pd.get_dummies(X)


In [11]:
# 5-fold cross-validated score of the random forest on the full feature set.
cross_val_score(rfc, X, Y, cv=5)




array([0.98169124, 0.9897769 , 0.98886237, 0.98955093, 0.98958618])

In [12]:
# Establish and fit the model, with a single hidden layer of 1000 perceptrons.
# MLPClassifier is already imported in the notebook's first cell, so it is not
# re-imported here. random_state fixes the weight initialisation and batch
# shuffling so the scores below are reproducible across reruns.
mlp = MLPClassifier(hidden_layer_sizes=(1000,), random_state=42)
mlp.fit(X, Y)

# Score on the training data itself, followed by the share of each class so the
# score can be compared against always predicting the most common class.
print(mlp.score(X, Y))
print(Y.value_counts()/len(Y))

# 5-fold cross-validated score; each fold refits a fresh copy of the model.
cross_val_score(mlp, X, Y, cv=5)

0.9922226575950795
Fully Paid            0.711816
Charged Off           0.180014
Current               0.102825
Late (31-120 days)    0.003227
In Grace Period       0.001453
Late (16-30 days)     0.000663
Default               0.000002
Name: loan_status, dtype: float64




array([0.98942082, 0.99003811, 0.99268574, 0.99091643, 0.99223416])

In [13]:
# Try with multiple hidden layers (500, 50 and 100 perceptrons respectively).
# random_state fixes the weight initialisation and batch shuffling so the
# scores below are reproducible across reruns.
mlp = MLPClassifier(hidden_layer_sizes=(500, 50, 100), random_state=42)
mlp.fit(X, Y)

# Score on the training data itself, followed by the share of each class so the
# score can be compared against always predicting the most common class.
print(mlp.score(X, Y))
print(Y.value_counts()/len(Y))

# 5-fold cross-validated score; each fold refits a fresh copy of the model.
cross_val_score(mlp, X, Y, cv=5)

0.71181562355288
Fully Paid            0.711816
Charged Off           0.180014
Current               0.102825
Late (31-120 days)    0.003227
In Grace Period       0.001453
Late (16-30 days)     0.000663
Default               0.000002
Name: loan_status, dtype: float64




array([0.71179739, 0.71180584, 0.71181087, 0.71181933, 0.71185656])

In [None]:
# Once again, the multi-layer network with fewer perceptrons per layer produces a much lower score.
# Its score (~0.7118) equals the share of 'Fully Paid' loans, so it appears to be predicting
# only the majority class.
# The scores for the random forest and for the single-layer network with many perceptrons are
# very similar, and are high enough to suspect overfitting in both cases.
# For this data set the random forest takes less time to run, so I would pursue adjustments
# to that model to reduce the risk of overfitting.