In [1]:
# Import the modules

import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Read the CSV file from the Resources folder into a Pandas DataFrame
lending_data = pd.read_csv(Path("./lending_data.csv"))

# Review the DataFrame
lending_data

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
0,10700.0,7.672,52800,0.431818,5,1,22800,0
1,8400.0,6.692,43600,0.311927,3,0,13600,0
2,9000.0,6.963,46100,0.349241,3,0,16100,0
3,10700.0,7.664,52700,0.430740,5,1,22700,0
4,10800.0,7.698,53000,0.433962,5,1,23000,0
...,...,...,...,...,...,...,...,...
77531,19100.0,11.261,86600,0.653580,12,2,56600,1
77532,17700.0,10.662,80900,0.629172,11,2,50900,1
77533,17600.0,10.595,80300,0.626401,11,2,50300,1
77534,16300.0,10.068,75300,0.601594,10,2,45300,1


In [3]:
# Separate the data into labels and features

# Separate the y variable, the labels
y = lending_data['loan_status']

# Separate the X variable, the features
X = lending_data.drop(columns='loan_status')

In [4]:
# Review the y variable Series
display(y.head())

0    0
1    0
2    0
3    0
4    0
Name: loan_status, dtype: int64

In [5]:
# Review the X variable DataFrame
display(X.head())

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt
0,10700.0,7.672,52800,0.431818,5,1,22800
1,8400.0,6.692,43600,0.311927,3,0,13600
2,9000.0,6.963,46100,0.349241,3,0,16100
3,10700.0,7.664,52700,0.43074,5,1,22700
4,10800.0,7.698,53000,0.433962,5,1,23000


In [6]:
# Check the balance of our target values
y.value_counts()

0    75036
1     2500
Name: loan_status, dtype: int64

In [7]:
# Import the train_test_learn module
from sklearn.model_selection import train_test_split

# Split the data using train_test_split
# Assign a random_state of 1 to the function
X_training, X_testing, y_training, y_testing = train_test_split(X,y, random_state=1)

In [8]:
# Import the LogisticRegression module from SKLearn
from sklearn.linear_model import LogisticRegression

# Instantiate the Logistic Regression model
# Assign a random_state parameter of 1 to the model
logistic_regression_model = LogisticRegression(random_state=1)

# Fit the model using training data
lr_model = logistic_regression_model.fit(X_testing, y_testing)

In [9]:
# Make a prediction using the testing data
training_predictions = lr_model.predict(X_training)

#Generate testing predictions
testing_predicitons = logistic_regression_model.predict(X_testing)

In [10]:
# Print the balanced_accuracy score of the model
y_pred = logistic_regression_model.predict(X_testing)
balanced_accuracy_score(y_testing, y_pred)

0.9480091532589761

In [16]:
# Generate a confusion matrix for the model
test_matrix = confusion_matrix(y_testing, y_pred)

print(test_matrix)

[[18663   102]
 [   61   558]]


In [18]:
# Print the classification report for the model
training_report = classification_report_imbalanced(y_testing, y_pred)

print(training_report)

                   pre       rec       spe        f1       geo       iba       sup

          0       1.00      0.99      0.90      1.00      0.95      0.90     18765
          1       0.85      0.90      0.99      0.87      0.95      0.89       619

avg / total       0.99      0.99      0.90      0.99      0.95      0.90     19384



In [19]:
#The logistic regression model predicts 


#The model is 100% correct in predicting healthy loans and 85% in high risk loans, but this
#leads to high speculation because the majority of the data it's training on is healthy loan 
#data only. I would want to run an oversampling test to see if their is a difference. 

In [20]:
# Import the RandomOverSampler module form imbalanced-learn
from imblearn.over_sampling import RandomOverSampler

In [21]:
# Instantiate the random oversampler model
# # Assign a random_state parameter of 1 to the model
random_oversampler = RandomOverSampler(random_state=1)

# Fit the original training data to the random_oversampler model

X_resampled, y_resampled = random_oversampler.fit_resample(X_training, y_training)

In [24]:
# Count the distinct values of the resampled labels data

y_resampled.value_counts()

0    56271
1    56271
Name: loan_status, dtype: int64

In [25]:
# Instantiate the Logistic Regression model
# Assign a random_state parameter of 1 to the model

model = LogisticRegression(random_state=1)

In [43]:
# Fit the model using the resampled training data

lr_resampled_model = model.fit(X_resampled, y_resampled)

# Make a prediction using the testing data
y_resampled_pred = lr_resampled_model.predict(X_testing)

In [44]:
# Print the balanced_accuracy score of the model 
y_pred_oversampling = lr_resampled_model.predict(X_resampled)
balanced_accuracy_score(y_resampled, y_pred_oversampling)

0.9947308560359689

In [46]:
# Generate a confusion matrix for the model
confusion_matrix(y_resampled, y_pred_oversampling)

print(test_matrix_oversampling)

[[55964   307]
 [  286 55985]]


In [47]:
training_report_imbalanced = classification_report_imbalanced(y_resampled, y_pred_oversampling)

print(training_report_imbalanced)

                   pre       rec       spe        f1       geo       iba       sup

          0       0.99      0.99      0.99      0.99      0.99      0.99     56271
          1       0.99      0.99      0.99      0.99      0.99      0.99     56271

avg / total       0.99      0.99      0.99      0.99      0.99      0.99    112542



In [None]:
# The module Review

#Just as I thougth, once the module was re-tested using oversampled method
#it scored much higher on the default features than the origianl report generated
#I would reccomend this report to the analyst for further review, This can be
#beneficial to this case but may not be an efficent module for everything related to loans. 
