In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

#Path to the lending dataset CSV, relative to the directory the notebook runs from
file_path = 'Credit_Risk/lending_data.csv'

#Load the CSV into a DataFrame; loan_status is the label column used in the cells below
df = pd.read_csv(filepath_or_buffer=file_path)

#A bare final expression makes Jupyter render the DataFrame as the cell output
df

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
0,10700.0,7.672,52800,0.431818,5,1,22800,0
1,8400.0,6.692,43600,0.311927,3,0,13600,0
2,9000.0,6.963,46100,0.349241,3,0,16100,0
3,10700.0,7.664,52700,0.430740,5,1,22700,0
4,10800.0,7.698,53000,0.433962,5,1,23000,0
...,...,...,...,...,...,...,...,...
77531,19100.0,11.261,86600,0.653580,12,2,56600,1
77532,17700.0,10.662,80900,0.629172,11,2,50900,1
77533,17600.0,10.595,80300,0.626401,11,2,50300,1
77534,16300.0,10.068,75300,0.601594,10,2,45300,1


In [3]:
#The label we want to predict lives in the loan_status column
#(per the question further down: 0 = healthy loan, 1 = high-risk loan)
target_column = 'loan_status'

#y is the target: the single column of known loan outcomes
y = df[target_column]

#x is the feature matrix: a new DataFrame holding every column except the target,
#df itself is left unmodified because drop returns a new DataFrame
x = df.drop(columns=[target_column])

In [4]:
#Split the features x and the target y into training and testing subsets.
#test_size=0.2 holds out 20% of the rows for testing and trains on the other 80%.
#A fixed random_state makes the shuffle reproducible, so every run produces the same split.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=43,
)

In [5]:
#Build an untrained LogisticRegression classifier, keeping every hyperparameter at its default
logreg = LogisticRegression()

#Fit the classifier on the training features (x_train) and their known labels (y_train)
logreg.fit(X=x_train, y=y_train)

In [6]:
#Use the fitted model to predict a label for every row of the held-out test features.
#y_pred holds those predicted labels, to be compared against the true labels in y_test.
y_pred = logreg.predict(X=x_test)

In [7]:
#Tabulate the true labels (y_test) against the predicted labels (y_pred).
#Rows are the actual class and columns are the predicted class, so the layout is:
#  [[true negatives,  false positives],
#   [false negatives, true positives]]
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)

cm

array([[14931,    89],
       [   39,   449]])

In [8]:
#Evaluate the model's classification ability per class (precision, recall, f1-score, support)
cr = classification_report(y_test, y_pred)

#classification_report returns one multi-line string; print it so the newlines render as
#an aligned table instead of the escaped '\n' sequences a bare expression would display
print(cr)

'              precision    recall  f1-score   support\n\n           0       1.00      0.99      1.00     15020\n           1       0.83      0.92      0.88       488\n\n    accuracy                           0.99     15508\n   macro avg       0.92      0.96      0.94     15508\nweighted avg       0.99      0.99      0.99     15508\n'

## Answer the following question: How well does the logistic regression model predict both the 0 (healthy loan) and 1 (high-risk loan) labels?

In [1]:
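#Healthy loans (0): precision rounds to 1.00, so nearly every loan predicted healthy really was healthy,
#and recall is 0.99, so 99% of the truly healthy loans were identified.
#High-risk loans (1): of all loans predicted to be high risk, unfortunately only 83% were actually high risk (precision 0.83),
#although the model did catch 92% of the truly high-risk loans (recall 0.92).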

## An overview of the analysis: Explain the purpose of this analysis.

In [None]:
#The purpose of this analysis is to understand how well the model is making predictions.
#Are loans getting labeled as healthy or high risk appropriately?

## The results: Using a bulleted list, describe the accuracy score, the precision score, and recall score of the machine learning model.

In [None]:
#Accuracy Score
#Accuracy tells us what share of all predictions were correct overall.
#This model achieved 99% accuracy: (14931 + 449) of the 15508 test loans were labeled correctly.

#Precision Score
#Precision tells us, out of everything the model predicted as a given class, how many actually belonged to it:
#true positives / (true positives + false positives).
#Loans predicted healthy were truly healthy essentially 100% of the time (precision rounds to 1.00),
#while loans predicted high risk were truly high risk only 83% of the time.

#Recall Score
#Recall tells us, out of all the loans that actually belong to a class, how many the model correctly identified:
#true positives / (true positives + false negatives).
#For healthy loans (0) recall is 99%. For high-risk loans (1) recall is only 92%,
#meaning 8% of the truly high-risk loans were missed and labeled healthy instead.

## A summary: Summarize the results from the machine learning model. Include your justification for recommending the model for use by the company. If you don’t recommend the model, justify your reasoning.

In [None]:
#Although the model's overall accuracy is high (99%), I have concerns about its precision for high-risk loans (83%).
#To me this is an unfortunate outcome: roughly 17% of the loans flagged as high risk are actually healthy,
#so people who genuinely qualify for a loan would be automatically turned away
#if a human did not review the model's decisions. I would much rather see the opposite numbers,
#with the healthy class being over-predicted and the high-risk predictions being accurate.
#For these reasons I would not currently recommend this model for use by the company,
#unless there is human oversight.