In [1]:
# Import the modules
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.metrics import confusion_matrix, classification_report

---

## Split the Data into Training and Testing Sets

### Step 1: Read the `lending_data.csv` data from the `Resources` folder into a Pandas DataFrame.

In [2]:
# Read the CSV file from the Resources folder into a Pandas DataFrame
lending_df = pd.read_csv(
    Path("Resources/lending_data.csv")
)

# Review the DataFrame
display(lending_df.head())

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
0,10700.0,7.672,52800,0.431818,5,1,22800,0
1,8400.0,6.692,43600,0.311927,3,0,13600,0
2,9000.0,6.963,46100,0.349241,3,0,16100,0
3,10700.0,7.664,52700,0.43074,5,1,22700,0
4,10800.0,7.698,53000,0.433962,5,1,23000,0


### Step 2: Create the labels set (`y`)  from the “loan_status” column, and then create the features (`X`) DataFrame from the remaining columns.

In [3]:
# Separate the data into labels and features

# Separate the y variable, the labels
y = lending_df['loan_status']

# Separate the X variable, the features
X = lending_df.drop(columns=['loan_status'])

In [4]:
# Review the y variable Series
print(y[:5])

0    0
1    0
2    0
3    0
4    0
Name: loan_status, dtype: int64


In [5]:
# Review the X variable DataFrame
print(X[:5])

   loan_size  interest_rate  borrower_income  debt_to_income  num_of_accounts  \
0    10700.0          7.672            52800        0.431818                5   
1     8400.0          6.692            43600        0.311927                3   
2     9000.0          6.963            46100        0.349241                3   
3    10700.0          7.664            52700        0.430740                5   
4    10800.0          7.698            53000        0.433962                5   

   derogatory_marks  total_debt  
0                 1       22800  
1                 0       13600  
2                 0       16100  
3                 1       22700  
4                 1       23000  


### Step 3: Split the data into training and testing datasets by using `train_test_split`.

In [6]:
# Import the train_test_learn module
from sklearn.model_selection import train_test_split

# Split the data using train_test_split
# Assign a random_state of 1 to the function - allows it to split same way each time you run code
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)


---

## Create a Logistic Regression Model with the Original Data

###  Step 1: Fit a logistic regression model by using the training data (`X_train` and `y_train`).

In [7]:
# Import the LogisticRegression module from SKLearn
from sklearn.linear_model import LogisticRegression

# Instantiate the Logistic Regression model
# Assign a random_state parameter of 1 to the model
logistic_regression_model = LogisticRegression(random_state=1, max_iter=200) #added max iter due to error about max being reached)

# Fit the model using training data
lr_model = logistic_regression_model.fit(X_train, y_train)

### Step 2: Save the predictions on the testing data labels by using the testing feature data (`X_test`) and the fitted model.

In [8]:
# Make a prediction using the testing data
predictions = lr_model.predict(X_train)

### Step 3: Evaluate the model’s performance by doing the following:

* Generate a confusion matrix.

* Print the classification report.

In [9]:
# Generate a confusion matrix for the model

# Create and save the confusion matrix for the training data
training_matrix = confusion_matrix(y_train, predictions)

# Print the confusion matrix for the training data
print(training_matrix)

#REMINDER: 
#TN|FP
# FN|TP

[[55988   283]
 [  137  1744]]


In [10]:
target_names = ['healthy loan', 'unhealthy loan']

In [11]:
# Print the classification report for the model
training_report = classification_report(y_train, predictions, target_names=target_names)

# Print the training classification report
print(training_report)

                precision    recall  f1-score   support

  healthy loan       1.00      0.99      1.00     56271
unhealthy loan       0.86      0.93      0.89      1881

      accuracy                           0.99     58152
     macro avg       0.93      0.96      0.94     58152
  weighted avg       0.99      0.99      0.99     58152



### Step 4: Answer the following question.

**Question:** How well does the logistic regression model predict both the `0` (healthy loan) and `1` (high-risk loan) labels?

**Answer:** The logistic regression model seems to predict the 0 (healthy loan) well with an f1 score of 1 (100%) but could use improvement on the 1 (high-risk loan) predictions due to a f1 score of 0.89 (89%). It should be of note that a score of 89% is still very good and perhaps the model can be used with supervision for case-by-case basis, as there is are still risks for false positives and negatives -- though the scale of this relative to the true positives and negatives is not huge -- per the confusion matrix.

---

In [12]:
### Exploring Other Models with the Original Data (see if others are better fit)

In [13]:
lending_df.dtypes

loan_size           float64
interest_rate       float64
borrower_income       int64
debt_to_income      float64
num_of_accounts       int64
derogatory_marks      int64
total_debt            int64
loan_status           int64
dtype: object

In [14]:
lending_df.columns

Index(['loan_size', 'interest_rate', 'borrower_income', 'debt_to_income',
       'num_of_accounts', 'derogatory_marks', 'total_debt', 'loan_status'],
      dtype='object')

In [15]:
## SVM

In [16]:
# Support vector machine rbf classifier
from sklearn.svm import SVC
svm_model = SVC(kernel='rbf')
svm_model.fit(X_train, y_train)

In [17]:
# SVM Model Accuracy
print('Test Acc: %.3f' % svm_model.score(X_test, y_test))

Test Acc: 0.994


In [18]:
#svm predictions (training data)
svm_predictions = svm_model.predict(X_train)

In [19]:
#confusion matrix svm
svm_training_matrix = confusion_matrix(y_train, svm_predictions)

# Print the confusion matrix for the training data
print(svm_training_matrix)

#REMINDER: 
#TN|FP
# FN|TP

[[55969   302]
 [   14  1867]]


In [20]:
#SVM classification report
print(classification_report(y_train, svm_predictions,
                            target_names=target_names))

                precision    recall  f1-score   support

  healthy loan       1.00      0.99      1.00     56271
unhealthy loan       0.86      0.99      0.92      1881

      accuracy                           0.99     58152
     macro avg       0.93      0.99      0.96     58152
  weighted avg       1.00      0.99      0.99     58152



In [21]:
#SVM Result:
# improvement in f1 score for unhealthy (same f2 score for healthy). more false positives than LR but improved false negatives.

In [22]:
## Decision Trees

In [23]:
# Create the decision tree classifier instance
from sklearn import tree
dr_model = tree.DecisionTreeClassifier()
# Fit the model
dr_fit_model = dr_model.fit(X_train, y_train)

In [24]:
# Making predictions using the training data
dr_predictions = dr_fit_model.predict(X_train)

In [25]:
#confusion matrix decision tree
dr_training_matrix = confusion_matrix(y_train, dr_predictions)

# Print the confusion matrix for the training data
print(dr_training_matrix)

#REMINDER: 
#TN|FP
# FN|TP

[[56206    65]
 [   78  1803]]


In [26]:
# Calculating the accuracy score DR
# dr_acc_score = accuracy_score(y_test, dr_predictions)
print('Test Acc: %.3f' % dr_fit_model.score(X_test, y_test))

Test Acc: 0.990


In [27]:
#DR classification report
print(classification_report(y_train, dr_predictions,
                            target_names=target_names))

                precision    recall  f1-score   support

  healthy loan       1.00      1.00      1.00     56271
unhealthy loan       0.97      0.96      0.96      1881

      accuracy                           1.00     58152
     macro avg       0.98      0.98      0.98     58152
  weighted avg       1.00      1.00      1.00     58152



In [28]:
## DR Result:
# increased false negatives, decreased false positves; improved f1 for unhealthy loans and same f1 for healthy

In [29]:
## Random Forest

In [30]:
# Create the random forest classifier instance
from sklearn.ensemble import RandomForestClassifier
rf_model = RandomForestClassifier(n_estimators=500, random_state=78)
# Fit the model and use .ravel()on the "y_train" data. 
rf_fit_model = rf_model.fit(X_train, y_train.ravel())

  rf_fit_model = rf_model.fit(X_train, y_train.ravel())


In [31]:
# Making predictions using the training data
rf_predictions = rf_fit_model.predict(X_train)

In [32]:
#confusion matrix decision tree
rf_training_matrix = confusion_matrix(y_train, rf_predictions)

# Print the confusion matrix for the training data
print(rf_training_matrix)

#REMINDER: 
#TN|FP
# FN|TP

[[56154   117]
 [   26  1855]]


In [33]:
# Calculating the accuracy score DR
print('Test Acc: %.3f' % rf_fit_model.score(X_test, y_test))

Test Acc: 0.992


In [34]:
#RF classification report
print(classification_report(y_train, rf_predictions,
                            target_names=target_names))

                precision    recall  f1-score   support

  healthy loan       1.00      1.00      1.00     56271
unhealthy loan       0.94      0.99      0.96      1881

      accuracy                           1.00     58152
     macro avg       0.97      0.99      0.98     58152
  weighted avg       1.00      1.00      1.00     58152



In [35]:
#RF Results:
# better confusion matrix than SVM but not as good as Decision Trees. Same f1 for healthy; same f1 as DR (#1 so far) for unhealthy

In [36]:
## KNN

In [37]:
from sklearn.neighbors import KNeighborsClassifier

# Instantiate the KNeighborsClassifier model
knn = KNeighborsClassifier(n_neighbors=3)
# Train the model using the training data
knn.fit(X_train, y_train)

In [38]:
# Create predictions using the testing data
knn_predictions = knn.predict(X_train)

In [39]:
#confusion matrix knn
knn_training_matrix = confusion_matrix(y_train, knn_predictions)

# Print the confusion matrix for the training data
print(knn_training_matrix)

#REMINDER: 
#TN|FP
# FN|TP

[[56060   211]
 [   57  1824]]


In [40]:
# Calculating the accuracy score DR
print('Test Acc: %.3f' % knn.score(X_test, y_test))

Test Acc: 0.992


In [41]:
#KNN classification report
print(classification_report(y_train, knn_predictions,
                            target_names=target_names))

                precision    recall  f1-score   support

  healthy loan       1.00      1.00      1.00     56271
unhealthy loan       0.90      0.97      0.93      1881

      accuracy                           1.00     58152
     macro avg       0.95      0.98      0.96     58152
  weighted avg       1.00      1.00      1.00     58152



In [42]:
#KNN Result:
#f1 for unhealthy not as high as Decision Trees nor SVM, but better than logistic regression. confusion matrix has better falses than
# logistic but not as good as decision trees

In [43]:
# MY CHOICE: Decision Trees (labelled dr in this file)