In [1]:
# Import the modules
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, classification_report

---

## Split the Data into Training and Testing Sets

In [2]:
# Load the lending dataset from the Resources folder into a Pandas DataFrame
lending_csv_path = Path('Resources/lending_data.csv')
df_loans = pd.read_csv(lending_csv_path)

# Preview the first rows of the DataFrame
df_loans.head()

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
0,10700.0,7.672,52800,0.431818,5,1,22800,0
1,8400.0,6.692,43600,0.311927,3,0,13600,0
2,9000.0,6.963,46100,0.349241,3,0,16100,0
3,10700.0,7.664,52700,0.43074,5,1,22700,0
4,10800.0,7.698,53000,0.433962,5,1,23000,0


In [3]:
# Split the dataset into the target labels (y) and the feature matrix (X)

# Display names for the two loan_status classes, passed to classification_report later
target_names = ['healthy', 'high risk']

# Labels: the loan_status column
y = df_loans['loan_status']

# Features: every column except loan_status
X = df_loans.drop(columns=['loan_status'])

In [4]:
# Review the first rows of the y (labels) Series to confirm the split
y.head()

0    0
1    0
2    0
3    0
4    0
Name: loan_status, dtype: int64

In [5]:
# Review the first rows of the X (features) DataFrame; loan_status should no longer be present
X.head()

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt
0,10700.0,7.672,52800,0.431818,5,1,22800
1,8400.0,6.692,43600,0.311927,3,0,13600
2,9000.0,6.963,46100,0.349241,3,0,16100
3,10700.0,7.664,52700,0.43074,5,1,22700
4,10800.0,7.698,53000,0.433962,5,1,23000


In [6]:
# Check the balance of our target values.
# The classes are heavily imbalanced (see the counts in the output), which is
# relevant to the balanced accuracy metric and the oversampling step used later.
y.value_counts()

0    75036
1     2500
Name: loan_status, dtype: int64

In [7]:
# Import the train_test_split function
from sklearn.model_selection import train_test_split

# Split the data using train_test_split
# Assign a random_state of 1 to the function so the split is reproducible
# NOTE(review): the target classes are heavily imbalanced; consider passing
# stratify=y so both splits keep the same class ratio. Left unchanged here
# because it would alter the recorded outputs in the rest of the notebook.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

---

## Create a Logistic Regression Model with the Original Data

In [8]:
# Import the LogisticRegression class from scikit-learn
from sklearn.linear_model import LogisticRegression

# Instantiate the Logistic Regression model
# Assign a random_state parameter of 1 to the model for reproducibility
classifier = LogisticRegression(random_state=1)

# Fit the model using the original (non-resampled) training data
classifier.fit(X_train, y_train)

LogisticRegression(random_state=1)

In [9]:
# Predict loan_status labels for the held-out testing features
predictions = classifier.predict(X_test)

In [10]:
# Compute the balanced accuracy (the average of the per-class recall) and print it
acc_score = balanced_accuracy_score(y_true=y_test, y_pred=predictions)
print('Balanced Accuracy Score: %.3f' % acc_score)

Balanced Accuracy Score: 0.952


In [11]:
# Build a labelled confusion matrix for the original model:
# rows are the actual classes, columns are the predicted classes
IND = ['Actual Healthy', 'Actual High Risk']
COL = ['Predicted Healthy', 'Predicted High Risk']

cm = confusion_matrix(y_true=y_test, y_pred=predictions)
cm_df = pd.DataFrame(data=cm, index=IND, columns=COL)

cm_df

Unnamed: 0,Predicted Healthy,Predicted High Risk
Actual Healthy,18663,102
Actual High Risk,56,563


In [12]:
# Build the per-class precision/recall/f1 report for the original model, then print it
original_report = classification_report(
    y_test, predictions, target_names=target_names
)
print(original_report)

              precision    recall  f1-score   support

     healthy       1.00      0.99      1.00     18765
   high risk       0.85      0.91      0.88       619

    accuracy                           0.99     19384
   macro avg       0.92      0.95      0.94     19384
weighted avg       0.99      0.99      0.99     19384



>**Analysis:** This model does extremely well at predicting healthy loans, with 100% precision and 99% recall (overall accuracy is also 99%), but somewhat less well with high-risk loans.  In those cases, it seems somewhat prone to false positives, with 102 in this sample and a precision score of only 85%.  The recall score for high-risk loans is also lower than for healthy loans, at 91%: the model produced 56 false negatives.

---

## Predict a Logistic Regression Model with Resampled Training Data

In [13]:
# Import the RandomOverSampler class from imbalanced-learn
from imblearn.over_sampling import RandomOverSampler

# Instantiate the random oversampler
# Assign a random_state parameter of 1 so the resampling is reproducible
ros = RandomOverSampler(sampling_strategy='minority', random_state=1)

# Resample only the training data; the testing data is left untouched
X_res, y_res = ros.fit_resample(X_train, y_train)

In [14]:
# Count the distinct values of the resampled labels to confirm both classes now have equal counts
y_res.value_counts()

0    56271
1    56271
Name: loan_status, dtype: int64

In [15]:
# Train a second Logistic Regression model (random_state of 1) on the
# resampled training data; fit() returns the fitted estimator itself
ros_model = LogisticRegression(random_state=1).fit(X_res, y_res)

# Predict on the original testing data
ros_predictions = ros_model.predict(X_test)

In [16]:
# Compute the balanced accuracy of the resampled model, then print it
ros_acc_score = balanced_accuracy_score(y_test, ros_predictions)
print('Balanced Accuracy Score: %.3f' % ros_acc_score)

Balanced Accuracy Score: 0.994


In [17]:
# Generate a labelled confusion matrix for the resampled model,
# reusing the row/column labels (IND, COL) from the first model
ros_cm = confusion_matrix(y_test, ros_predictions)
ros_cm_df = pd.DataFrame(ros_cm, index=IND, columns=COL)
ros_cm_df

Unnamed: 0,Predicted Healthy,Predicted High Risk
Actual Healthy,18649,116
Actual High Risk,4,615


In [18]:
# Build the per-class precision/recall/f1 report for the resampled model, then print it
ros_report = classification_report(
    y_test, ros_predictions, target_names=target_names
)
print(ros_report)

              precision    recall  f1-score   support

     healthy       1.00      0.99      1.00     18765
   high risk       0.84      0.99      0.91       619

    accuracy                           0.99     19384
   macro avg       0.92      0.99      0.95     19384
weighted avg       0.99      0.99      0.99     19384



>**Analysis:** Our second model performs equally well with healthy loans as the first model, and now has somewhat improved metrics with high-risk loans as well.  Most notably, the recall has increased from 91% to 99%, bringing the overall balanced accuracy score from 95% to 99%.  This did cause the precision to decrease a small amount, from 85% to 84%, since our model found more false positives in the test sample than our first model did (116 versus 102).  This is likely a side effect of oversampling the minority class, which makes the model more willing to predict high risk and is somewhat expected, but it found significantly fewer false negatives (4 versus 56).