### Author: Byron Pineda
##### Supervised Learning Predicting Credit Risk

In [1]:
import numpy as np
import pandas as pd
from pathlib import Path

In [2]:
# Read the two loan extracts in order: the 2019 file becomes the training
# frame and the 2020 Q1 file becomes the testing frame.
train_df, test_df = (
    pd.read_csv(Path(csv_file))
    for csv_file in ('Resources/2019loans.csv', 'Resources/2020Q1loans.csv')
)

In [3]:
# Report the (rows, columns) shape, then preview the first rows.
# The preview must be the final expression of the cell: Jupyter only renders
# the value of the last expression, so a bare `head()` placed before `print`
# is evaluated and silently discarded.
print(train_df.shape)
train_df.head()

(12180, 86)


In [4]:
# Show the distinct label values present in the training target column.
train_df['loan_status'].unique()

array(['low_risk', 'high_risk'], dtype=object)

In [5]:
# Report the (rows, columns) shape, then preview the first rows.
# The preview must be the final expression of the cell: Jupyter only renders
# the value of the last expression, so a bare `head()` placed before `print`
# is evaluated and silently discarded.
print(test_df.shape)
test_df.head()

(4702, 86)


In [6]:
# Show the distinct label values present in the testing target column.
test_df['loan_status'].unique()

array(['low_risk', 'high_risk'], dtype=object)

In [7]:
# Separate the training label from the training features and print both.
# (No categorical conversion happens in this cell; it only splits the frame.)
# NOTE(review): the printed features still include the identifier-like columns
# 'Unnamed: 0' and 'index' -- confirm they are meant to be model inputs.
ytrain = train_df['loan_status'].values
Xtrain = train_df.drop(columns='loan_status')

print(Xtrain)
print(ytrain)

       Unnamed: 0   index  loan_amnt  int_rate  installment home_ownership  \
0           57107   57107    13375.0    0.1797       483.34       MORTGAGE   
1          141451  141451    21000.0    0.1308       478.68       MORTGAGE   
2          321143  321143    20000.0    0.1240       448.95       MORTGAGE   
3           11778   11778     3000.0    0.1240       100.22           RENT   
4          169382  169382    30000.0    0.1612      1056.49       MORTGAGE   
...           ...     ...        ...       ...          ...            ...   
12175      354912  354912    19975.0    0.2565       801.09           RENT   
12176      354944  354944    15000.0    0.1774       540.34           RENT   
12177      354973  354973     3600.0    0.1862       131.28           RENT   
12178      355002  355002    15000.0    0.0881       475.68       MORTGAGE   
12179      355307  355307    15000.0    0.1774       540.34           RENT   

       annual_inc verification_status pymnt_plan    dti  ...  p

In [8]:
# One-hot encode the categorical training features, then report the encoded
# column labels followed by the feature shape before and after encoding.
Xtrain_dummies = pd.get_dummies(Xtrain)
print(Xtrain_dummies.columns)
print(Xtrain.shape, Xtrain_dummies.shape, sep='\n')

Index(['Unnamed: 0', 'index', 'loan_amnt', 'int_rate', 'installment',
       'annual_inc', 'dti', 'delinq_2yrs', 'inq_last_6mths', 'open_acc',
       'pub_rec', 'revol_bal', 'total_acc', 'out_prncp', 'out_prncp_inv',
       'total_pymnt', 'total_pymnt_inv', 'total_rec_prncp', 'total_rec_int',
       'total_rec_late_fee', 'recoveries', 'collection_recovery_fee',
       'last_pymnt_amnt', 'collections_12_mths_ex_med', 'policy_code',
       'acc_now_delinq', 'tot_coll_amt', 'tot_cur_bal', 'open_acc_6m',
       'open_act_il', 'open_il_12m', 'open_il_24m', 'mths_since_rcnt_il',
       'total_bal_il', 'il_util', 'open_rv_12m', 'open_rv_24m', 'max_bal_bc',
       'all_util', 'total_rev_hi_lim', 'inq_fi', 'total_cu_tl', 'inq_last_12m',
       'acc_open_past_24mths', 'avg_cur_bal', 'bc_open_to_buy', 'bc_util',
       'chargeoff_within_12_mths', 'delinq_amnt', 'mo_sin_old_il_acct',
       'mo_sin_old_rev_tl_op', 'mo_sin_rcnt_rev_tl_op', 'mo_sin_rcnt_tl',
       'mort_acc', 'mths_since_recent_bc'

In [9]:
# Separate the testing label from the testing features.
# (No categorical conversion happens in this cell; it only splits the frame.)
ytest = test_df['loan_status'].values
Xtest = test_df.drop(columns='loan_status')

In [10]:
# Print the testing feature frame (plain-text, truncated repr) for inspection.
print(Xtest)

      Unnamed: 0  index  loan_amnt  int_rate  installment home_ownership  \
0          67991  67991    40000.0    0.0819       814.70       MORTGAGE   
1          25429  25429     6000.0    0.1524       208.70           RENT   
2          38496  38496     3600.0    0.1695       128.27           RENT   
3          19667  19667    20000.0    0.1524       478.33           RENT   
4          37505  37505     3600.0    0.1240       120.27           RENT   
...          ...    ...        ...       ...          ...            ...   
4697       77282  77282    30000.0    0.1240       673.42           RENT   
4698       77291  77291    24000.0    0.0756       747.22           RENT   
4699       77292  77292    10000.0    0.2305       387.36           RENT   
4700       77297  77297     8000.0    0.1862       205.86           RENT   
4701       77304  77304    30000.0    0.2055      1123.34           RENT   

      annual_inc verification_status pymnt_plan    dti  ...  pct_tl_nvr_dlq  \
0       

In [11]:
# Print the testing label array for inspection.
print(ytest)

['low_risk' 'low_risk' 'low_risk' ... 'high_risk' 'high_risk' 'high_risk']


In [12]:
# One-hot encode the categorical testing features, print the encoded column
# labels, and render the encoded frame as the cell's displayed value.
Xtest_dummies = pd.get_dummies(data=Xtest)
print(Xtest_dummies.columns)
Xtest_dummies

Index(['Unnamed: 0', 'index', 'loan_amnt', 'int_rate', 'installment',
       'annual_inc', 'dti', 'delinq_2yrs', 'inq_last_6mths', 'open_acc',
       'pub_rec', 'revol_bal', 'total_acc', 'out_prncp', 'out_prncp_inv',
       'total_pymnt', 'total_pymnt_inv', 'total_rec_prncp', 'total_rec_int',
       'total_rec_late_fee', 'recoveries', 'collection_recovery_fee',
       'last_pymnt_amnt', 'collections_12_mths_ex_med', 'policy_code',
       'acc_now_delinq', 'tot_coll_amt', 'tot_cur_bal', 'open_acc_6m',
       'open_act_il', 'open_il_12m', 'open_il_24m', 'mths_since_rcnt_il',
       'total_bal_il', 'il_util', 'open_rv_12m', 'open_rv_24m', 'max_bal_bc',
       'all_util', 'total_rev_hi_lim', 'inq_fi', 'total_cu_tl', 'inq_last_12m',
       'acc_open_past_24mths', 'avg_cur_bal', 'bc_open_to_buy', 'bc_util',
       'chargeoff_within_12_mths', 'delinq_amnt', 'mo_sin_old_il_acct',
       'mo_sin_old_rev_tl_op', 'mo_sin_rcnt_rev_tl_op', 'mo_sin_rcnt_tl',
       'mort_acc', 'mths_since_recent_bc'

Unnamed: 0.1,Unnamed: 0,index,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,...,verification_status_Source Verified,verification_status_Verified,pymnt_plan_n,initial_list_status_f,initial_list_status_w,application_type_Individual,application_type_Joint App,hardship_flag_N,hardship_flag_Y,debt_settlement_flag_N
0,67991,67991,40000.0,0.0819,814.70,140000.0,19.75,0.0,1.0,18.0,...,0,0,1,0,1,1,0,1,0,1
1,25429,25429,6000.0,0.1524,208.70,55000.0,11.52,2.0,0.0,8.0,...,0,0,1,0,1,1,0,1,0,1
2,38496,38496,3600.0,0.1695,128.27,42000.0,6.74,0.0,0.0,6.0,...,0,0,1,0,1,1,0,1,0,1
3,19667,19667,20000.0,0.1524,478.33,100000.0,12.13,0.0,2.0,7.0,...,0,0,1,0,1,1,0,1,0,1
4,37505,37505,3600.0,0.1240,120.27,50000.0,16.08,0.0,3.0,6.0,...,0,0,1,0,1,1,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4697,77282,77282,30000.0,0.1240,673.42,140480.0,15.74,0.0,0.0,20.0,...,1,0,1,1,0,1,0,1,0,1
4698,77291,77291,24000.0,0.0756,747.22,50000.0,26.81,0.0,0.0,9.0,...,0,0,1,0,1,1,0,1,0,1
4699,77292,77292,10000.0,0.2305,387.36,33000.0,38.51,0.0,2.0,7.0,...,0,1,1,1,0,1,0,1,0,1
4700,77297,77297,8000.0,0.1862,205.86,38000.0,16.36,0.0,1.0,8.0,...,1,0,1,0,1,1,0,1,0,1


In [13]:
# Testing feature shape before and after one-hot encoding, one per line.
print(Xtest.shape, Xtest_dummies.shape, sep='\n')

(4702, 85)
(4702, 93)


In [14]:
# Align the testing dummies with the training dummies instead of hard-coding
# the one column known to be missing ('debt_settlement_flag_Y').
# Reindexing against the training columns:
#   * adds every dummy column that only occurs in the training data, filled
#     with 0 (the category never appears in the testing data),
#   * puts the columns in exactly the training order, so both matrices line up
#     feature-for-feature when passed to the scaler and the models,
#   * is safe to re-run, because the result depends only on the training columns.
Xtest_dummies = Xtest_dummies.reindex(columns=Xtrain_dummies.columns, fill_value=0)
print(Xtest_dummies)

      Unnamed: 0  index  loan_amnt  int_rate  installment  annual_inc    dti  \
0          67991  67991    40000.0    0.0819       814.70    140000.0  19.75   
1          25429  25429     6000.0    0.1524       208.70     55000.0  11.52   
2          38496  38496     3600.0    0.1695       128.27     42000.0   6.74   
3          19667  19667    20000.0    0.1524       478.33    100000.0  12.13   
4          37505  37505     3600.0    0.1240       120.27     50000.0  16.08   
...          ...    ...        ...       ...          ...         ...    ...   
4697       77282  77282    30000.0    0.1240       673.42    140480.0  15.74   
4698       77291  77291    24000.0    0.0756       747.22     50000.0  26.81   
4699       77292  77292    10000.0    0.2305       387.36     33000.0  38.51   
4700       77297  77297     8000.0    0.1862       205.86     38000.0  16.36   
4701       77304  77304    30000.0    0.2055      1123.34    180000.0  12.06   

      delinq_2yrs  inq_last_6mths  open

In [15]:
# Confirm that the encoded testing and training frames now have the same
# number of columns (testing shape first, then training shape).
print(Xtest_dummies.shape, Xtrain_dummies.shape, sep='\n')

(4702, 94)
(12180, 94)


##### Prediction - Which model will perform better on the two loan datasets with unscaled data: a logistic regression or a random forest classifier?

We know from the instructions that the two loan datasets have been undersampled to give an even number of high-risk and low-risk loans. My initial guess was that the random forest classifier would be the better model, simply because random forests generally produce better results and can work with missing data by creating estimates.

After fitting and scoring both models on the unscaled data, the logistic regression had a training score of 0.648440065681445 and a testing score of 0.5253083794130158, compared to the random forest classifier, which had a training score of 1.0 and a testing score of 0.6180348787749894. In this case the random forest classifier's testing score was better than the logistic regression's testing score: 0.6180 versus 0.5253 on the unscaled data.

In [16]:
# Import and instantiate a logistic regression classifier with default settings
# for the unscaled data. Fitting and scoring happen in the following cells;
# the bare name on the last line displays the estimator's repr.
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()
classifier

LogisticRegression()

In [17]:
# Fit the logistic regression on the unscaled, one-hot-encoded training features.
# NOTE(review): the recorded output for this cell reports that the solver hit
# its iteration limit, i.e. this fit stopped before converging. The scores
# below therefore describe a non-converged model; scikit-learn's own message
# suggests raising max_iter or scaling the data.
classifier.fit(Xtrain_dummies, ytrain)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression()

In [18]:
# Accuracy of the unscaled logistic regression on the training data and on
# the testing data, one per line.
print(
    f'Training Score: {classifier.score(Xtrain_dummies, ytrain)}',
    f'Testing Score: {classifier.score(Xtest_dummies, ytest)}',
    sep='\n',
)

Training Score: 0.648440065681445
Testing Score: 0.5253083794130158


In [19]:
# Fit a 500-tree random forest (fixed seed for repeatability) on the unscaled
# training features, then report accuracy on the training and testing data.
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=500, random_state=1)
clf.fit(Xtrain_dummies, ytrain)
print(
    f'Training Score: {clf.score(Xtrain_dummies, ytrain)}',
    f'Testing Score: {clf.score(Xtest_dummies, ytest)}',
    sep='\n',
)

Training Score: 1.0
Testing Score: 0.6180348787749894


##### Prediction - Which model will perform better on the two loan datasets with scaled data: a logistic regression or a random forest classifier?

After seeing the previous results, my guess was again that the random forest classifier would be the better model on the scaled data, simply because random forests generally produce better results than logistic regression.

After fitting and scoring both models on the scaled data, the logistic regression had a training score of 0.713136288998358 and a testing score of 0.7207571246278179, compared to the random forest classifier, which had a training score of 1.0 and a testing score of 0.6193109315185028. The training and testing scores for the random forest classifier were nearly identical to those from the previous run on unscaled data. In this case, the random forest classifier's testing score was not as good as the logistic regression's testing score: 0.6193 versus 0.7207 on the scaled data. The logistic regression's training and testing scores were very close to each other, both slightly above 0.71.

In retrospect, and after reviewing the data again, it makes sense that the logistic regression would be the better model on the scaled data. Some of the features differ from one another by very large magnitudes. This shows the importance of scaling the data when fitting a logistic regression model, although scaling did not meaningfully change the results for the random forest classifier.

In [20]:
# Standardize the features. The scaler learns its statistics from the
# training features only; the same fitted scaler is then applied to the
# testing features so no testing information leaks into the transform.
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
scaler = StandardScaler()
X_train_scale = scaler.fit_transform(Xtrain_dummies)
X_test_scale = scaler.transform(Xtest_dummies)

In [21]:
# Fit a fresh default logistic regression on the standardized features and
# report accuracy on the training and testing data.
# NOTE(review): the recorded output for this cell still shows the solver's
# iteration-limit message, so the fit stopped before converging even on scaled
# data. Consider a larger max_iter, then refresh the quoted scores.
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression().fit(X_train_scale, ytrain)
print(
    f'Training Score: {classifier.score(X_train_scale, ytrain)}',
    f'Testing Score: {classifier.score(X_test_scale, ytest)}',
    sep='\n',
)

Training Score: 0.713136288998358
Testing Score: 0.7207571246278179


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [22]:
# Fit the same 500-tree, fixed-seed random forest on the standardized
# features, then report accuracy on the training and testing data.
clf = RandomForestClassifier(n_estimators=500, random_state=1)
clf.fit(X_train_scale, ytrain)
print(
    f'Training Score: {clf.score(X_train_scale, ytrain)}',
    f'Testing Score: {clf.score(X_test_scale, ytest)}',
    sep='\n',
)

Training Score: 1.0
Testing Score: 0.6193109315185028
