In [1]:
import pandas as pd
import numpy as np
from scipy import stats

In [2]:
file = 'C:\\Users\\Abe\\Data Science Bootcamp\\Unit 3\\Random Forest\\LoanStats3d.csv'
col = list(range(2,111))
tp = pd.read_csv(file, skipinitialspace=True, header=1, usecols=col, iterator=True, chunksize=1000, dtype={19:str, 55:str})
df = pd.concat(tp, ignore_index=True)

In [3]:
# Use random sample of one tenth of the data so I don't get a memory error
df_ss = df.sample(frac=.1)

In [4]:
categorical = df_ss.select_dtypes(include=['object'])
for i in categorical:
    column = categorical[i]
    print(i)
    print(column.nunique())

term
2
int_rate
109
grade
7
sub_grade
35
emp_title
18476
emp_length
12
home_ownership
3
verification_status
3
issue_d
12
loan_status
7
pymnt_plan
2
desc
6
purpose
12
title
15
zip_code
855
addr_state
49
earliest_cr_line
590
revol_util
1060
initial_list_status
2
last_pymnt_d
28
next_pymnt_d
3
last_credit_pull_d
28
application_type
2
verification_status_joint
1


In [5]:
# Convert ID and Interest Rate to numeric.
# df_ss['id'] = pd.to_numeric(df_ss['id'], errors='coerce')
df_ss['int_rate'] = pd.to_numeric(
                    df_ss['int_rate'].str.strip('%'), errors='coerce')

# Drop other columns with many unique variables
df_ss.drop(['url', 'emp_title', 'zip_code', 'earliest_cr_line',
            'revol_util', 'sub_grade', 'addr_state', 'desc'],
            1, inplace=True)

In [6]:
from sklearn import ensemble
from sklearn.model_selection import cross_val_score

rfc = ensemble.RandomForestClassifier()
X = df_ss.ix[:, df_ss.columns != 'loan_status']
Y = df_ss['loan_status']
X = pd.get_dummies(X)
X = X.dropna(axis=1)

forest = cross_val_score(rfc, X, Y, cv=10)
print (forest)
print (forest.mean())



[ 0.9774561   0.98148588  0.98101116  0.97934473  0.98005224  0.98052719
  0.98004751  0.98241863  0.98170587  0.98003802]
0.980408731975


# End of Guidance
## Goals:
Reduce to minimum # of features while keeping accuracy of a 10 fold random forest > 90. First by PCA, second by using only the highest correlated features.

In [7]:
Y.groupby(Y).count()

loan_status
Charged Off            3914
Current               25989
Default                   7
Fully Paid            10856
In Grace Period         431
Late (16-30 days)       150
Late (31-120 days)      763
Name: loan_status, dtype: int64

In [8]:
# Drop the loan status of 'Default'
df_ss.drop(df_ss[df_ss['loan_status'] == 'Default'].index,inplace=True)

df_ss['loan_status'].groupby(df_ss['loan_status']).count()

loan_status
Charged Off            3914
Current               25989
Fully Paid            10856
In Grace Period         431
Late (16-30 days)       150
Late (31-120 days)      763
Name: loan_status, dtype: int64

In [9]:
# Redo random forest without the error

rfc = ensemble.RandomForestClassifier()
X = df_ss.ix[:, df_ss.columns != 'loan_status']
Y = df_ss['loan_status']
X = pd.get_dummies(X)
X = X.dropna(axis=1)

scre = cross_val_score(rfc, X, Y, cv=10)
print(scre)
print(scre.mean())

[ 0.97958699  0.9817189   0.97910731  0.98052719  0.98052257  0.98028504
  0.98028035  0.98265621  0.98146828  0.9793251 ]
0.980547793692


In [10]:
# Rank feature importances

rfc.fit(X,Y)
importances = rfc.feature_importances_
imp_df = pd.DataFrame()
imp_df['features'] = X.columns
imp_df['importance'] = importances

imp_df.sort(columns='importance',ascending=False,inplace=True)
imp_df.head(10)



Unnamed: 0,features,importance
14,out_prncp_inv,0.350992
157,next_pymnt_d_May-2017,0.078258
13,out_prncp,0.076037
17,total_rec_prncp,0.066781
22,last_pymnt_amnt,0.062199
15,total_pymnt,0.040454
16,total_pymnt_inv,0.032291
160,last_credit_pull_d_Apr-2017,0.031574
129,last_pymnt_d_Apr-2017,0.030514
21,collection_recovery_fee,0.030475


In [11]:
# Use PCA to reduce the number of features
from sklearn.decomposition import PCA 

sklearn_pca = PCA(n_components=12)
X_sklearn = sklearn_pca.fit_transform(X,Y)

print(sklearn_pca.explained_variance_ratio_)

scre = cross_val_score(rfc, X_sklearn, Y, cv=10)
print('\n')
print(scre)
print('\n')
print(scre.mean())

[  8.28065266e-01   6.08054809e-02   4.34785742e-02   3.22280110e-02
   2.22837082e-02   4.59955858e-03   3.43337154e-03   1.81772788e-03
   1.27152827e-03   8.84621355e-04   6.37702059e-04   2.26023765e-04]


[ 0.96083551  0.96082621  0.95916429  0.96057943  0.95795724  0.96057007
  0.96246139  0.96127346  0.96269898  0.96055133]


0.960691792625


In [12]:
# # Turn Y variables into dummies to run a correlation matrix

# # Get dummies for Y and then combine with X and its dummies
# y_dum = pd.get_dummies(Y)
# y_df = pd.concat([y_dum, X], axis=1)

# # Create correlation matrix and combine the absolute values of the y dummy correlation for an idea of total correlation
# corrmat = y_df.corr()
# y_ = np.sum(np.absolute(corrmat.ix[:, corrmat.columns[0:5]]),axis=1)
# y_ = pd.DataFrame(y_, columns=['correlation'])
# y_cols = y_dum.columns.tolist()
# y_ = y_[~y_.index.isin(y_cols)]
# y_.sort_values('correlation',ascending=False,inplace=True)
# y_

In [13]:
# # Select how many of the top features we want to use
# x_ = y_.index[0:6].tolist()

# X = df_ss.ix[:, df_ss.columns != 'loan_status']
# X = pd.get_dummies(X)
# Y = df_ss['loan_status']
# X = X.ix[:, X.columns.isin(x_)]

# forest = cross_val_score(rfc, X, Y, cv=10)
# print (forest)
# print (forest.mean())
# x_

In [23]:
# Perform Anova for every column against Y

test_output = []
p_output = []
for col in df_ss.columns[~df_ss.columns.isin(categorical)]:
    value = stats.f_oneway(*[value for name, value in df_ss.groupby('loan_status')[col]])
    test_output.append(value[0])
    p_output.append(value[1])
    
build = {'name':df_ss.columns[~df_ss.columns.isin(categorical)],'test_statistic':test_output,'p_value':p_output}

anova = pd.DataFrame(build)
anova.sort_values(by='test_statistic', inplace=True,ascending=False)
anova

  f = msb / msw


Unnamed: 0,name,p_value,test_statistic
23,last_pymnt_amnt,0.000000e+00,9897.326126
15,out_prncp_inv,0.000000e+00,6142.398180
14,out_prncp,0.000000e+00,6141.580405
18,total_rec_prncp,0.000000e+00,4431.717504
17,total_pymnt_inv,0.000000e+00,2385.946837
16,total_pymnt,0.000000e+00,2385.759174
21,recoveries,0.000000e+00,2171.486375
22,collection_recovery_fee,0.000000e+00,2166.670435
19,total_rec_int,0.000000e+00,530.276496
47,acc_open_past_24mths,8.303079e-215,203.200335
