In [28]:
# importing dependencies
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, classification_report
import warnings
warnings.filterwarnings('ignore')

In [2]:
# creating training and testing dataframes
train_df = pd.read_csv(Path('Resources/Generator/2019loans.csv'))
test_df = pd.read_csv(Path('Resources/Generator/2020Q1loans.csv'))

In [3]:
# viewing train_df
train_df.head()

Unnamed: 0,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,pymnt_plan,dti,delinq_2yrs,inq_last_6mths,...,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag,target
0,7000.0,0.1894,256.38,MORTGAGE,75000.0,Not Verified,n,28.62,0.0,2.0,...,87.5,0.0,0.0,352260.0,62666.0,35000.0,10000.0,N,N,low_risk
1,40000.0,0.1614,975.71,MORTGAGE,102000.0,Source Verified,n,11.72,2.0,0.0,...,0.0,0.0,0.0,294664.0,109911.0,9000.0,71044.0,N,N,low_risk
2,11000.0,0.2055,294.81,RENT,45000.0,Verified,n,37.25,1.0,3.0,...,7.7,0.0,0.0,92228.0,36007.0,33000.0,46328.0,N,N,low_risk
3,4000.0,0.1612,140.87,MORTGAGE,38000.0,Not Verified,n,42.89,1.0,0.0,...,100.0,0.0,0.0,284273.0,52236.0,13500.0,52017.0,N,N,low_risk
4,14000.0,0.1797,505.93,MORTGAGE,43000.0,Source Verified,n,22.16,1.0,0.0,...,25.0,0.0,0.0,120280.0,88147.0,33300.0,78680.0,N,N,low_risk


In [4]:
# viewing test_df
test_df.head()

Unnamed: 0,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,pymnt_plan,dti,delinq_2yrs,inq_last_6mths,...,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag,target
0,40000.0,0.1033,856.4,RENT,128700.0,Source Verified,n,12.47,0.0,1.0,...,57.1,0.0,0.0,63915.0,49510.0,49400.0,14515.0,Y,N,low_risk
1,24450.0,0.143,572.72,MORTGAGE,44574.0,Not Verified,n,15.05,0.0,1.0,...,0.0,0.0,0.0,136425.0,19439.0,15500.0,18925.0,N,N,low_risk
2,13500.0,0.143,316.23,OWN,60000.0,Not Verified,n,28.72,0.0,0.0,...,0.0,0.0,0.0,82124.0,65000.0,5400.0,61724.0,Y,N,low_risk
3,10625.0,0.1774,268.31,RENT,60000.0,Verified,n,15.7,0.0,4.0,...,20.0,0.0,0.0,54855.0,50335.0,23200.0,26255.0,N,N,low_risk
4,6375.0,0.1862,232.46,RENT,60000.0,Source Verified,n,35.5,0.0,0.0,...,75.0,0.0,0.0,90445.0,56541.0,15300.0,72345.0,N,N,low_risk


## Data Preprocessing

In [15]:
# removing target from the data
# retaining only predictor variables in X_train_dummies dataframe
# applying one-hot encoding with get_dummies
X_train = pd.get_dummies(train_df.drop('target', axis = 1))
X_train.head()

Unnamed: 0,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,revol_bal,...,verification_status_Verified,pymnt_plan_n,initial_list_status_f,initial_list_status_w,application_type_Individual,application_type_Joint App,hardship_flag_N,hardship_flag_Y,debt_settlement_flag_N,debt_settlement_flag_Y
0,7000.0,0.1894,256.38,75000.0,28.62,0.0,2.0,20.0,0.0,40414.0,...,0,1,1,0,1,0,1,0,1,0
1,40000.0,0.1614,975.71,102000.0,11.72,2.0,0.0,10.0,0.0,43531.0,...,0,1,0,1,1,0,1,0,1,0
2,11000.0,0.2055,294.81,45000.0,37.25,1.0,3.0,23.0,0.0,8242.0,...,1,1,0,1,1,0,1,0,1,0
3,4000.0,0.1612,140.87,38000.0,42.89,1.0,0.0,7.0,0.0,12767.0,...,0,1,0,1,0,1,1,0,1,0
4,14000.0,0.1797,505.93,43000.0,22.16,1.0,0.0,22.0,0.0,11182.0,...,0,1,0,1,1,0,1,0,1,0


In [6]:
# Saving outcome variable for training data
y_train = train_df['target']
y_train.head()

0    low_risk
1    low_risk
2    low_risk
3    low_risk
4    low_risk
Name: target, dtype: object

In [16]:
# removing target from the data
# retaining only predictor variables in X_test_dummies dataframe
# applying one-hot encoding with get_dummies
X_test = pd.get_dummies(test_df.drop('target', axis = 1))
X_test.head()

Unnamed: 0,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,revol_bal,...,verification_status_Source Verified,verification_status_Verified,pymnt_plan_n,initial_list_status_f,initial_list_status_w,application_type_Individual,application_type_Joint App,hardship_flag_N,hardship_flag_Y,debt_settlement_flag_N
0,40000.0,0.1033,856.4,128700.0,12.47,0.0,1.0,8.0,0.0,38113.0,...,1,0,1,0,1,1,0,0,1,1
1,24450.0,0.143,572.72,44574.0,15.05,0.0,1.0,6.0,0.0,1665.0,...,0,0,1,0,1,1,0,1,0,1
2,13500.0,0.143,316.23,60000.0,28.72,0.0,0.0,8.0,0.0,13857.0,...,0,0,1,0,1,1,0,0,1,1
3,10625.0,0.1774,268.31,60000.0,15.7,0.0,4.0,17.0,0.0,6216.0,...,0,1,1,0,1,1,0,1,0,1
4,6375.0,0.1862,232.46,60000.0,35.5,0.0,0.0,13.0,0.0,12681.0,...,1,0,1,0,1,1,0,1,0,1


In [17]:
# saving outcome for testing data
y_test = test_df['target']
y_test.head()

0    low_risk
1    low_risk
2    low_risk
3    low_risk
4    low_risk
Name: target, dtype: object

In [20]:
# adding missing dummy variables to testing set
for column in X_train.columns:
    if column not in X_test.columns:
        X_test[column] = 0

## Training and Testing Logistic Regression Model and Random Forest Classifier

### Model Prediction: The model has numerous predictors included (92); therefore, the model is subject to noise. While a logistic regression is good for classifying an outcome (e.g. low-risk versus high-risk), to have a parsimonious model, I would belive the Random Forests Classifier would be more optimal because it will simplify the model by selecting only the relevant subfeatures.

In [21]:
# Train the Logistic Regression model on the unscaled data and print the model score
classifier = LogisticRegression()

# fitting the model
classifier.fit(X_train, y_train)

# scoring the model
classifier.score(X_test, y_test)

0.5082943428328371

In [22]:
# Train a Random Forest Classifier model and print the model score
rf_classifier = RandomForestClassifier()

# fitting the model
rf_classifier.fit(X_train, y_train)

#scoring the model
rf_classifier.score(X_test, y_test)

0.620374308804764

### Observation on the Outcome: The Random Forest Classifier is the better fitting model, which is consistent with my predictions. It would make sense that a more parsimonious model would better predict the outcome than an all-inclusive model.

## Scaling the data

### Prediction on the impact of scaling: One reason we scale the data is because machine learning algorithms can be sensitive to large data values. Standardizing these ranges can reduce the sensitivity to these large values. Based on the data, I would predict that scaling would improve model fit since we have some numerical data with rather large values (e.g. income, revolving_balance).

In [23]:
# scaling the X_train data using the standard scaler
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_train_scaled

array([[-1.00869871,  0.91961977, -0.85042027, ..., -0.16843038,
         0.02026518, -0.02026518],
       [ 2.22409871,  0.38717869,  1.60261212, ..., -0.16843038,
         0.02026518, -0.02026518],
       [-0.61684447,  1.2257734 , -0.71936772, ..., -0.16843038,
         0.02026518, -0.02026518],
       ...,
       [-1.3417748 ,  0.85876936, -1.27703164, ..., -0.16843038,
         0.02026518, -0.02026518],
       [-0.22499024, -1.00667601, -0.10257157, ..., -0.16843038,
         0.02026518, -0.02026518],
       [-0.22499024,  0.69143074,  0.11792955, ..., -0.16843038,
         0.02026518, -0.02026518]])

In [24]:
# scaling the X_test data using the standard scaler
scaler = StandardScaler().fit(X_test)
X_test_scaled = scaler.transform(X_test)
X_test_scaled

array([[ 2.26900798, -0.76980798,  1.19064216, ...,  7.24098185,
         0.        ,  0.        ],
       [ 0.73766371, -0.08638596,  0.23127262, ..., -0.13810282,
         0.        ,  0.        ],
       [-0.3406784 , -0.08638596, -0.6361438 , ...,  7.24098185,
         0.        ,  0.        ],
       ...,
       [-0.68535395,  1.41989683, -0.39559122, ..., -0.13810282,
         0.        ,  0.        ],
       [-0.88231142,  0.65728737, -1.00940105, ..., -0.13810282,
         0.        ,  0.        ],
       [ 1.28422067,  0.98953032,  2.09339915, ..., -0.13810282,
         0.        ,  0.        ]])

In [25]:
# Train the Logistic Regression model on the scaled data and print the model score
scaled_classifier = LogisticRegression()

# fitting the model
scaled_classifier.fit(X_train_scaled, y_train)

# scoring the model
scaled_classifier.score(X_test_scaled, y_test)

0.6599319438536793

In [26]:
# Train a Random Forest Classifier model on the scaled data and print the model score
scaled_rfc = RandomForestClassifier()

# fitting the model
scaled_rfc.fit(X_train_scaled, y_train)

#scoring the model
scaled_rfc.score(X_test_scaled, y_test)

0.5650786899191833

### Observation on the outcome: The fit for the Logistic Regression Model improved, as predicted; however, the fit for the Random Forest Classifier actually decreased. For the Random Forest Classifier, scaling does not help with attributes with predicting. For the logistic regression, the fact that the scaling improves the model fit indicates those attributes play a bigger role in predicting the outcome.

## Looking at some diagnostics for the unscaled data

In [31]:
# looking at a confusion matrix
y_true = y_test
y_pred = classifier.predict(X_test)

cm = confusion_matrix(y_true, y_pred)
cm

array([[ 713, 1638],
       [ 674, 1677]], dtype=int64)

In [32]:
# looking at a classification report
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

   high_risk       0.51      0.30      0.38      2351
    low_risk       0.51      0.71      0.59      2351

    accuracy                           0.51      4702
   macro avg       0.51      0.51      0.49      4702
weighted avg       0.51      0.51      0.49      4702



#### Considering the diagnostic information in more detail, it appears that while the data identifies a lot of true positives (1677 in the confusion matrix), a lot of false negatives are also identified (1638 in the confusion matrix). The classification report further reveals somewhat low precision of identifying high-risk and low-risk lenders (about 51%). Furthermore, the probability of accurately recalling high-risk clients for lending is rather low (about 30%). The models should be continuously revised to identify a more optimal set of predictors.

## Looking at some diagnostics for the scaled data

In [33]:
# looking at a confusion matrix
y_true = y_test
y_pred = classifier.predict(X_test_scaled)

cm = confusion_matrix(y_true, y_pred)
cm

array([[ 905, 1446],
       [1098, 1253]], dtype=int64)

In [34]:
# looking at a classification report
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

   high_risk       0.45      0.38      0.42      2351
    low_risk       0.46      0.53      0.50      2351

    accuracy                           0.46      4702
   macro avg       0.46      0.46      0.46      4702
weighted avg       0.46      0.46      0.46      4702



#### Looking at some of the diagnostic information for the scaled data, we do not apepar to have much better precision. While the data identifies a lot of true positives (1253 in the confusion matrix), quite a few false positives are also identified (1098 the confusion matrix). The classification report further reveals even lower precision of identifying high-risk and low-risk lenders (about 45%). Furthermore, the probability of accurately recalling high-risk clients for lending is rather low (about 38%). There could be some issues with the model that would require some additional refining before implementation.