In [3]:
# Import the modules
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.metrics import accuracy_score, balanced_accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
from sklearn.linear_model import LogisticRegression
from imblearn.metrics import classification_report_imbalanced
from sklearn.metrics import classification_report

---

## Split the Data into Training and Testing Sets

### Step 1: Read the `lending_data.csv` data from the `Resources` folder into a Pandas DataFrame.

In [4]:
# Load the lending data from the Resources folder into a Pandas DataFrame
df = pd.read_csv(Path("Resources") / "lending_data.csv")

# Display the DataFrame for a first look at the data
df

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
0,10700.0,7.672,52800,0.431818,5,1,22800,0
1,8400.0,6.692,43600,0.311927,3,0,13600,0
2,9000.0,6.963,46100,0.349241,3,0,16100,0
3,10700.0,7.664,52700,0.430740,5,1,22700,0
4,10800.0,7.698,53000,0.433962,5,1,23000,0
...,...,...,...,...,...,...,...,...
77531,19100.0,11.261,86600,0.653580,12,2,56600,1
77532,17700.0,10.662,80900,0.629172,11,2,50900,1
77533,17600.0,10.595,80300,0.626401,11,2,50300,1
77534,16300.0,10.068,75300,0.601594,10,2,45300,1


In [5]:
# Collect the rows that exactly repeat an earlier row (every column equal)
duplicates = df.loc[df.duplicated()]
duplicates

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
64,8500.0,6.728,43900,0.316629,3,0,13900,0
72,8600.0,6.773,44300,0.322799,3,0,14300,0
87,8400.0,6.700,43600,0.311927,3,0,13600,0
88,10800.0,7.698,53000,0.433962,5,1,23000,0
113,8600.0,6.778,44400,0.324324,3,0,14400,0
...,...,...,...,...,...,...,...,...
77529,19300.0,11.347,87400,0.656751,12,2,57400,1
77530,19700.0,11.508,88900,0.662542,13,2,58900,1
77532,17700.0,10.662,80900,0.629172,11,2,50900,1
77533,17600.0,10.595,80300,0.626401,11,2,50300,1


In [6]:

# Spot-check the rows sharing one borrower_income value (43900) that shows up among the duplicates
df.loc[df["borrower_income"] == 43900]

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
55,8500.0,6.728,43900,0.316629,3,0,13900,0
64,8500.0,6.728,43900,0.316629,3,0,13900,0
378,8500.0,6.724,43900,0.316629,3,0,13900,0
464,8500.0,6.728,43900,0.316629,3,0,13900,0
488,8500.0,6.732,43900,0.316629,3,0,13900,0
...,...,...,...,...,...,...,...,...
74462,8500.0,6.722,43900,0.316629,3,0,13900,0
74500,8500.0,6.726,43900,0.316629,3,0,13900,0
74531,8500.0,6.726,43900,0.316629,3,0,13900,0
74609,8500.0,6.726,43900,0.316629,3,0,13900,0


In [7]:
# Keep only the first occurrence of each fully identical row.
# NOTE(review): this shrinks the data from the roughly 77.5k rows displayed at load
# time to the 5,229 rows reported by df.shape. No loan identifier column is visible,
# so identical rows may be distinct loans rather than entry errors — confirm that
# dropping them is intended before relying on the class balance that results.
df = df.loc[~df.duplicated()]

In [8]:
# Number of rows and columns remaining after the duplicate rows were dropped
df.shape

(5229, 8)

### Step 2: Create the labels set (`y`)  from the “loan_status” column, and then create the features (`X`) DataFrame from the remaining columns.

In [76]:
# Split the DataFrame into the target (y) and the features (X)

# Target: the loan_status column
y = df.loc[:, "loan_status"]

# Features: every column except loan_status
X = df.drop("loan_status", axis=1)


In [77]:
# Review the y variable Series via its summary statistics
y.describe()

count    5229.000000
mean        0.301396
std         0.458908
min         0.000000
25%         0.000000
50%         0.000000
75%         1.000000
max         1.000000
Name: loan_status, dtype: float64

In [78]:
# Review the X variable DataFrame via per-column summary statistics
X.describe()

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt
count,5229.0,5229.0,5229.0,5229.0,5229.0,5229.0,5229.0
mean,12844.214955,8.583721,61376.286097,0.461037,6.560719,1.078409,31376.286097
std,4779.22887,2.03113,19116.38397,0.166747,4.306406,0.974999,19116.38397
min,5000.0,5.25,30000.0,0.0,0.0,0.0,0.0
25%,8800.0,6.857,45100.0,0.334812,3.0,0.0,15100.0
50%,11500.0,8.016,56000.0,0.464286,5.0,1.0,26000.0
75%,17800.0,10.674,81100.0,0.630086,11.0,2.0,51100.0
max,23800.0,13.235,105200.0,0.714829,16.0,3.0,75200.0


### Step 3: Check the balance of the labels variable (`y`) by using the `value_counts` function.

In [80]:
# Check the balance of the target values by counting the rows per label
# (the counts printed below are imbalanced: 3653 vs 1576)
print(y.value_counts())

0    3653
1    1576
Name: loan_status, dtype: int64


### Step 4: Split the data into training and testing datasets by using `train_test_split`.

In [81]:
# Split features and labels into training and testing sets.
# random_state=1 makes the split reproducible; stratify=y is passed so the
# split is stratified on the labels.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=1, stratify=y
)

# Training features shape is (3921, 7), i.e. 75% of the rows
X_train.shape

(3921, 7)

In [84]:
# Standardize the features, since the columns are on very different scales.

# Steps one and two: instantiate the scaler and fit it on the training features only
unboosted_scaler = StandardScaler().fit(X_train)

# Step three: apply the training-set scaling to both the training and testing features
X_train_scaled = unboosted_scaler.transform(X_train)
X_test_scaled = unboosted_scaler.transform(X_test)

---

## Create a Logistic Regression Model with the Original Data

###  Step 1: Fit a logistic regression model by using the training data (`X_train` and `y_train`).

In [85]:
# Build the logistic regression model (lbfgs solver, at most 200 iterations)
# with random_state=1
classifier = LogisticRegression(solver="lbfgs", max_iter=200, random_state=1)
classifier



LogisticRegression(max_iter=200, random_state=1)

In [86]:
# Fit the model using the scaled training features and the training labels
classifier.fit(X_train_scaled, y_train)

LogisticRegression(max_iter=200, random_state=1)

### Step 2: Save the predictions on the testing data labels by using the testing feature data (`X_test`) and the fitted model.

In [87]:
# Predict labels for the scaled testing features
predictions = classifier.predict(X_test_scaled)

# Put the predictions next to the actual labels, renumbering the rows from 0
results = pd.DataFrame(
    {"Prediction": predictions, "Actual": y_test}
).reset_index(drop=True)
results.head(10)

Unnamed: 0,Prediction,Actual
0,0,0
1,0,0
2,1,1
3,0,0
4,0,0
5,1,1
6,1,1
7,0,0
8,0,0
9,0,0


### Step 3: Evaluate the model’s performance by doing the following:

* Calculate the accuracy score of the model.

* Generate a confusion matrix.

* Print the classification report.

In [88]:
# Print the accuracy score of the model (plain accuracy, not balanced accuracy)
accuracy_score(y_test, predictions)
# 91.8% accuracy: the ratio of correctly predicted observations to total observations.
# Generally, higher accuracy represents a higher level of model performance.

0.9181957186544343

In [89]:
# Print the balanced accuracy score of the model
balanced_accuracy_score(y_test, predictions)
# Balanced accuracy is about 93%.
# Balanced accuracy is the average of the recall obtained on each class
# (for binary labels: the mean of sensitivity and specificity).
# Its use case is imbalanced data, which appears to be the case in this instance.


0.93208021859623

In [91]:
# Generate a confusion matrix for the model
# (confusion_matrix is already imported in the notebook's import cell, so it is
# not re-imported here)
confusion_matrix(y_test, predictions)

# scikit-learn confusion matrix layout (rows = actual labels, columns = predicted labels):
# https://towardsdatascience.com/understanding-the-confusion-matrix-from-scikit-learn-c51d88929c79

# tn / fp (tn: healthy loans correctly predicted / fp: healthy loans incorrectly predicted as high-risk)
# fn / tp (fn: high-risk loans incorrectly predicted as healthy / tp: high-risk loans correctly predicted)



array([[820,  94],
       [ 13, 381]])

In [52]:
# Print the classification report for the model
# NOTE(review): the report displayed below carries an older execution count (In [52])
# and its accuracy (0.90) does not match accuracy_score above (0.918) — the output
# looks stale; re-run this cell to confirm the numbers.

#target_names = ["Healthy Loan", "High-risk Loan"]
print(classification_report(y_test, predictions))

# precision = tp / (tp + fp)
# recall = tp / (tp + fn)
# f1-score: harmonic mean of precision and recall (best at 1 and worst at 0)
# support: number of occurrences of each class in y_true


              precision    recall  f1-score   support

           0       0.95      0.90      0.92       914
           1       0.79      0.89      0.84       394

    accuracy                           0.90      1308
   macro avg       0.87      0.89      0.88      1308
weighted avg       0.90      0.90      0.90      1308



### Step 4: Answer the following question.

**Question:** How well does the logistic regression model predict both the `0` (healthy loan) and `1` (high-risk loan) labels?

**Answer:** 
For healthy loans (`0`), 95% of the loans the model predicted as healthy were actually healthy (precision), the model caught 90% of the actual healthy loans (recall), and the F1-score (the harmonic mean of precision and recall) was 0.92. According to the support metric, the test set included 914 occurrences of healthy loans. 

For high-risk loans (`1`), 79% of the loans the model predicted as high-risk were actually high-risk (precision), the model caught 89% of the actual high-risk loans (recall), and the F1-score was 0.84. According to the support metric, the test set included 394 occurrences of high-risk loans (significantly fewer than healthy loans).

Imbalanced support metrics indicate need for rebalancing.


---

## Predict a Logistic Regression Model with Resampled Training Data

### Step 1: Use the `RandomOverSampler` module from the imbalanced-learn library to resample the data. Be sure to confirm that the labels have an equal number of data points. 

In [94]:
# Balance the training classes by random oversampling


# Instantiate the random oversampler model and assign a random_state parameter of 1 to the model
ros = RandomOverSampler(random_state=1)


# Resample the scaled training features and the training labels
# (only the training split is resampled; the test split is not passed in)
X_res, y_res = ros.fit_resample(X_train_scaled, y_train)

In [95]:
# Count the label values in the full, un-resampled dataset (y) as a baseline;
# note this is NOT the resampled data — the resampled training labels (y_res)
# are counted in the next cell
y.value_counts()

0    3653
1    1576
Name: loan_status, dtype: int64

In [96]:
# Count the distinct values of the resampled training labels
y_res.value_counts()

0    2739
1    2739
Name: loan_status, dtype: int64

### Step 2: Use the `LogisticRegression` classifier and the resampled data to fit the model and make predictions.

In [97]:
# Instantiate a fresh Logistic Regression model and assign a random_state parameter of 1.
# Note: this rebinds `classifier` (and `predictions` / `results` below), replacing the
# model and predictions obtained from the non-resampled training data.
classifier = LogisticRegression(solver='lbfgs',
                                max_iter=200,
                                random_state=1)

# Fit the model using the resampled training data
classifier.fit(X_res, y_res)

# Make a prediction using the scaled testing data
predictions = classifier.predict(X_test_scaled)

# Put the predictions next to the actual labels, renumbering the rows from 0
results = pd.DataFrame({"Prediction": predictions, "Actual": y_test}).reset_index(drop=True)
results.head(10)

Unnamed: 0,Prediction,Actual
0,0,0
1,0,0
2,1,1
3,0,0
4,0,0
5,1,1
6,1,1
7,0,0
8,0,0
9,0,0


### Step 3: Evaluate the model’s performance by doing the following:

* Calculate the accuracy score of the model.

* Generate a confusion matrix.

* Print the classification report.

In [98]:
# Print the accuracy score of the model fit on the resampled training data
accuracy_score(y_test, predictions)
# About 92% of the test predictions are correct

0.9212538226299695

In [99]:
# Print the balanced accuracy score of the model fit on the resampled training data
balanced_accuracy_score(y_test, predictions)
# About 93.8% (average of the recall obtained on each class)

0.9378783503093449

In [100]:
# Generate a confusion matrix for the model fit on the resampled training data
# (rows = actual labels, columns = predicted labels: [[tn, fp], [fn, tp]])
confusion_matrix(y_test, predictions)

array([[819,  95],
       [  8, 386]])

In [101]:
# Print the imbalanced-learn classification report for the model fit on the resampled data
# (output columns: pre, rec, spe, f1, geo, iba, sup)
print(classification_report_imbalanced(y_test, predictions))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.99      0.90      0.98      0.94      0.94      0.87       914
          1       0.80      0.98      0.90      0.88      0.94      0.89       394

avg / total       0.93      0.92      0.95      0.92      0.94      0.87      1308



In [102]:
# Print the standard scikit-learn classification report for the same predictions
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.99      0.90      0.94       914
           1       0.80      0.98      0.88       394

    accuracy                           0.92      1308
   macro avg       0.90      0.94      0.91      1308
weighted avg       0.93      0.92      0.92      1308



### Step 4: Answer the following question

**Question:** How well does the logistic regression model, fit with oversampled data, predict both the `0` (healthy loan) and `1` (high-risk loan) labels?

**Answer:**
For healthy loans (`0`), 99% of the loans the model predicted as healthy were actually healthy (precision), the model caught 90% of the actual healthy loans (recall), and the F1-score (the harmonic mean of precision and recall) was 0.94. According to the support metric, the test set included 914 occurrences of healthy loans.

For high-risk loans (`1`), 80% of the loans the model predicted as high-risk were actually high-risk (precision), the model caught 98% of the actual high-risk loans (recall), and the F1-score was 0.88. According to the support metric, the test set included 394 occurrences of high-risk loans (significantly fewer than healthy loans).

Data resampling notably improved the model's performance in correctly identifying high-risk loans, which is key to our analysis. Accuracy and balanced accuracy also improved.