In [4]:
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix,classification_report
from sklearn.feature_selection import SelectFromModel
import seaborn as sns
from tabulate import tabulate

In [None]:
# Load the lending dataset from the Resources folder (path is relative to the
# notebook's working directory) and print its schema summary.
# NOTE(review): the frame is named `leading`; presumably `lending` was meant,
# but later cells reference this name, so it is kept unchanged.
leading = pd.read_csv(Path('Resources/lending_data.csv'))
leading.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 77536 entries, 0 to 77535
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   loan_size         77536 non-null  float64
 1   interest_rate     77536 non-null  float64
 2   borrower_income   77536 non-null  int64  
 3   debt_to_income    77536 non-null  float64
 4   num_of_accounts   77536 non-null  int64  
 5   derogatory_marks  77536 non-null  int64  
 6   total_debt        77536 non-null  int64  
 7   loan_status       77536 non-null  int64  
dtypes: float64(3), int64(5)
memory usage: 4.7 MB


In [None]:
# Count the missing values in every column; the displayed result shows zero
# for all of them, so no imputation or row dropping is needed.
leading.isna().sum()

loan_size           0
interest_rate       0
borrower_income     0
debt_to_income      0
num_of_accounts     0
derogatory_marks    0
total_debt          0
loan_status         0
dtype: int64

## Performance Predictions

I predict that the Random Forest model will perform better than the Logistic Regression model because of the type of data being analyzed. Most of the data does not appear to be correlated along a linear axis; rather, it serves more as a spreadsheet of recorded information. The inputs in the data hold no correlation with one another.

In [None]:
# Separate the predictors from the target:
#   X -> every column except `loan_status`
#   y -> the `loan_status` column itself
X = leading.drop(columns='loan_status')
y = leading['loan_status']


In [None]:
# Hold out a test set; random_state is fixed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# Standardize the features. The scaler learns its statistics from the training
# rows only and then applies that same transformation to the test rows, so no
# information from the test set leaks into the scaling.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

### 1. Logistic Regression Model

In [None]:
# Fit a logistic regression on the scaled training data and predict the
# held-out test rows.
log_reg_model = LogisticRegression()
log_reg_model.fit(X_train_scaled, y_train)
log_reg_predict = log_reg_model.predict(X_test_scaled)

# Report, one item per line: model name, train/test accuracy, the confusion
# matrix, and the per-class precision/recall/F1 table.
print(
    f'Model: {type(log_reg_model).__name__}',
    f"Training Data Score: {log_reg_model.score(X_train_scaled, y_train)}",
    f"Testing Data Score: {log_reg_model.score(X_test_scaled, y_test)}",
    f"Confusion Matrix:{confusion_matrix(y_test,log_reg_predict)} ",
    classification_report(y_test, log_reg_predict),
    sep="\n",
)

Model: LogisticRegression
Training Data Score: 0.9942908240473243
Testing Data Score: 0.9936545604622369
Confusion Matrix:[[18652   113]
 [   10   609]] 
              precision    recall  f1-score   support

           0       1.00      0.99      1.00     18765
           1       0.84      0.98      0.91       619

    accuracy                           0.99     19384
   macro avg       0.92      0.99      0.95     19384
weighted avg       0.99      0.99      0.99     19384



### 2. Random Forest Classifier Model

In [None]:
# Fit a random forest on the scaled training data (seeded so the result is
# reproducible) and predict the held-out test rows.
rf_model = RandomForestClassifier(random_state=1)
rf_model.fit(X_train_scaled, y_train)
rf_predict = rf_model.predict(X_test_scaled)

# Report, one item per line: model name, train/test accuracy, the confusion
# matrix, and the per-class precision/recall/F1 table.
print(
    f'Model: {type(rf_model).__name__}',
    f"Training Data Score: {rf_model.score(X_train_scaled, y_train)}",
    f"Testing Data Score: {rf_model.score(X_test_scaled, y_test)}",
    f"Confusion Matrix:{confusion_matrix(y_test,rf_predict)} ",
    classification_report(y_test, rf_predict),
    sep="\n",
)

Model: RandomForestClassifier
Training Data Score: 0.9975409272252029
Testing Data Score: 0.9914878250103177
Confusion Matrix:[[18666    99]
 [   66   553]] 
              precision    recall  f1-score   support

           0       1.00      0.99      1.00     18765
           1       0.85      0.89      0.87       619

    accuracy                           0.99     19384
   macro avg       0.92      0.94      0.93     19384
weighted avg       0.99      0.99      0.99     19384



## Conclusion

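My prediction was not borne out by these results: the Logistic Regression model performed slightly better than the Random Forest model on the test data (accuracy of 0.9937 vs. 0.9915), and it also had higher recall on class 1 (0.98 vs. 0.89). Both models scored roughly 0.99, so either model would suffice. The Random Forest's training score (0.9975) is higher than its testing score (0.9915), which suggests some overfitting. I still believe that if the models were hyperparameter-tuned, the Random Forest could reduce that overfitting and gain more accuracy than Logistic Regression would. Both models are good, though.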