## Setup

In [22]:
# imports and dependencies
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [2]:
# read in data as df
lending_df = pd.read_csv('Resources/lending_data.csv')
lending_df.head()

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
0,10700.0,7.672,52800,0.431818,5,1,22800,0
1,8400.0,6.692,43600,0.311927,3,0,13600,0
2,9000.0,6.963,46100,0.349241,3,0,16100,0
3,10700.0,7.664,52700,0.43074,5,1,22700,0
4,10800.0,7.698,53000,0.433962,5,1,23000,0


## Data Check

In [3]:
# check data types
lending_df.dtypes

loan_size           float64
interest_rate       float64
borrower_income       int64
debt_to_income      float64
num_of_accounts       int64
derogatory_marks      int64
total_debt            int64
loan_status           int64
dtype: object

In [8]:
# check nulls
lending_df.isna().sum()

loan_size           0
interest_rate       0
borrower_income     0
debt_to_income      0
num_of_accounts     0
derogatory_marks    0
total_debt          0
loan_status         0
dtype: int64

In [4]:
# check number of unique values
lending_df.nunique()

loan_size            182
interest_rate       4692
borrower_income      662
debt_to_income       662
num_of_accounts       17
derogatory_marks       4
total_debt           662
loan_status            2
dtype: int64

## Split Data: Training & Testing Sets
---

### Get y variable and x variables

In [16]:
# separate y and x cols
y_var = lending_df.iloc[:,-1]
x_vars = lending_df.iloc[:, :-1]

### Split into 75% training and 25% testing sets

In [None]:
# split into testing and training sets
x_train, x_test, y_train, y_test = train_test_split(x_vars, y_var)

## Create Logistic Regression Model
---

### Fit model with training data

In [18]:
# declare logistic regression model
logistic_regression_model = LogisticRegression(random_state=1)

# fit and save model
lr_model = logistic_regression_model.fit(x_train, y_train)

In [20]:
# get predicted valeus for testing and training
training_predictions = lr_model.predict(x_train)
testing_predictions = logistic_regression_model.predict(x_test)

## Evaluate model
---

In [25]:
# define function for metrics: accuracy score, confusion matrix, and classification report
def get_scoring(y_set, predictions):
    # confusion matrix
    cm = confusion_matrix(y_set, predictions)
    cm_df = pd.DataFrame(
        cm, index=['Actual 0', 'Actual 1'], columns=['Predicted 0', 'Predicted 1']
    )

    # display all scores
    print(f'Accuracy Score: {accuracy_score(y_set, predictions)}\n')
    print('Confusion matrix')
    display(cm_df)
    print('\nClassification Report')
    print(classification_report(y_set, predictions))

### Train data

In [27]:
get_scoring(y_train, training_predictions)

Accuracy Score: 0.9934138120786903

Confusion matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,55998,291
Actual 1,92,1771



Classification Report
              precision    recall  f1-score   support

           0       1.00      0.99      1.00     56289
           1       0.86      0.95      0.90      1863

    accuracy                           0.99     58152
   macro avg       0.93      0.97      0.95     58152
weighted avg       0.99      0.99      0.99     58152



### Test data

In [26]:
get_scoring(y_test, testing_predictions)

Accuracy Score: 0.9925711927362774

Confusion matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,18637,110
Actual 1,34,603



Classification Report
              precision    recall  f1-score   support

           0       1.00      0.99      1.00     18747
           1       0.85      0.95      0.89       637

    accuracy                           0.99     19384
   macro avg       0.92      0.97      0.94     19384
weighted avg       0.99      0.99      0.99     19384

