In [1]:
# Import the modules
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

import warnings
warnings.filterwarnings('ignore')

---

## Split the Data into Training and Testing Sets

### Step 1: Read the `lending_data.csv` data from the `Resources` folder into a Pandas DataFrame.

In [4]:
# Load the lending data CSV from the Resources folder into a DataFrame.
# The location is wrapped in a Path object; pandas accepts path-like objects.
lending_data_path = Path("../Logistic_Regression_ML_Model/Resources/lending_data.csv")
lending_data = pd.read_csv(lending_data_path)

# Preview the first five rows
lending_data.head()

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
0,10700.0,7.672,52800,0.431818,5,1,22800,0
1,8400.0,6.692,43600,0.311927,3,0,13600,0
2,9000.0,6.963,46100,0.349241,3,0,16100,0
3,10700.0,7.664,52700,0.43074,5,1,22700,0
4,10800.0,7.698,53000,0.433962,5,1,23000,0


In [None]:
# Dimensions of the loaded DataFrame as (rows, columns)
lending_data.shape

In [None]:
# Number of rows per distinct value of the loan_status column
lending_data["loan_status"].value_counts()

In [None]:
# Count the missing values in each column (isna is the same check as isnull)
lending_data.isna().sum()

### Step 2: Create the labels set (`y`)  from the “loan_status” column, and then create the features (`X`) DataFrame from the remaining columns.

In [None]:
# List the column names to identify the label column and the feature columns
lending_data.columns

In [None]:
# Separate the data into labels and features

# Labels (y): the loan_status column
# (0 = healthy loan, 1 = loan with a high risk of defaulting)
y = lending_data["loan_status"]

# Features (X): every column except the label.
# The label is dropped by name instead of sliced off by position, so it can
# never leak into X if the column order of the CSV changes.
X = lending_data.drop(columns="loan_status")

In [None]:
# Preview the first five entries of the labels Series
y.head()

In [None]:
# Preview the first five rows of the features DataFrame
X.head(5)

### Step 3: Check the balance of the labels variable (`y`) by using the `value_counts` function.

In [None]:
# Check the balance of our target values.
# In the "loan_status" column:
#   0 -> the loan is healthy
#   1 -> the loan has a high risk of defaulting
# Very different counts for the two values indicate an imbalanced dataset.
y.value_counts()

### Step 4: Split the data into training and testing datasets by using `train_test_split`.

In [None]:
# Import the train_test_split function from scikit-learn
from sklearn.model_selection import train_test_split

# Split features and labels into training and testing sets.
# A fixed random_state of 1 keeps the split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1,
)

---

## Create a Logistic Regression Model with the Original Data

###  Step 1: Fit a logistic regression model by using the training data (`X_train` and `y_train`).

In [None]:
# Import the LogisticRegression classifier from scikit-learn
from sklearn.linear_model import LogisticRegression

# Create the (still unfitted) logistic regression model with random_state=1
logistic_regression_model = LogisticRegression(random_state=1)

# Display the estimator and its configuration
logistic_regression_model

In [None]:
# Train the model on the original (non-resampled) training data.
# fit() returns the estimator it was called on, so lr_model refers to the
# same object as logistic_regression_model.
lr_model = logistic_regression_model.fit(X=X_train, y=y_train)

### Step 2: Save the predictions on the testing data labels by using the testing feature data (`X_test`) and the fitted model.

In [None]:
# Predict the loan status for every row of the testing features
testing_predictions = lr_model.predict(X_test)

# Show the first ten predicted labels next to the actual labels
prediction_preview = pd.DataFrame(
    {"Prediction": testing_predictions, "Actual": y_test}
)
prediction_preview.head(10)

### Step 3: Evaluate the model’s performance by doing the following:

* Calculate the accuracy score of the model.

* Generate a confusion matrix.

* Print the classification report.

In [None]:
# Compute the balanced accuracy of the original-data model on the test set
bal_accuracy_score = balanced_accuracy_score(
    y_true=y_test,
    y_pred=testing_predictions,
)

# Report the score rounded to four decimal places
print(f"Balanced Accuracy Score of the model is : {bal_accuracy_score:.4f}")

In [None]:
# Confusion matrix for the original-data model.
# Rows are the actual labels and columns are the predicted labels.
confusion_matrix(y_true=y_test, y_pred=testing_predictions)

In [None]:
# Imbalanced classification report for the original-data model.
# Display names follow the label order: 0 = healthy loan, 1 = high risk of defaulting.
target_names = ["healthy_loan", "defaulting_loan"]

original_report = classification_report_imbalanced(
    y_test,
    testing_predictions,
    target_names=target_names,
)
print(original_report)

### Step 4: Answer the following question.

**Question:** How well does the logistic regression model predict both the `0` (healthy loan) and `1` (high-risk loan) labels?

**Answer:** The model performs very well for healthy loans: its precision, recall, and F1 score are all very high, which reflects how effectively it classifies and predicts healthy loans. However, the model is weaker for risky loans, whose precision and recall are noticeably lower than those for healthy loans. Because the data is imbalanced, this weakness matters: wrongly predicting a risky loan as healthy could cause a significant loss to the bank or lending company if the client actually defaults in the future.


---

## Predict a Logistic Regression Model with Resampled Training Data

### Step 1: Use the `RandomOverSampler` module from the imbalanced-learn library to resample the data. Be sure to confirm that the labels have an equal number of data points. 

In [None]:
# Import the RandomOverSampler class from imbalanced-learn
from imblearn.over_sampling import RandomOverSampler

# Create the random over-sampler with random_state=1.
# NOTE: despite its name, `rus` holds a RandomOverSampler (not an under-sampler).
rus = RandomOverSampler(random_state=1)

# Resample only the training data; the testing data is left untouched
resampled_training_data = rus.fit_resample(X_train, y_train)
X_resampled, y_resampled = resampled_training_data

In [None]:
# Count the distinct values of the resampled labels data.
# Equal counts for both labels confirm that the resampling balanced the training set.
y_resampled.value_counts()

### Step 2: Use the `LogisticRegression` classifier and the resampled data to fit the model and make predictions.

In [None]:
# Create a second logistic regression model with random_state=1
logistic_regression_model_resampled = LogisticRegression(random_state=1)

# Train it on the resampled training data
lr_model_resampled = logistic_regression_model_resampled.fit(
    X=X_resampled,
    y=y_resampled,
)

# Predict on the same testing features used for the first model
testing_predictions_resampled = lr_model_resampled.predict(X_test)

# Show the first ten predicted labels next to the actual labels
resampled_prediction_preview = pd.DataFrame(
    {"Prediction": testing_predictions_resampled, "Actual": y_test}
)
resampled_prediction_preview.head(10)

### Step 3: Evaluate the model’s performance by doing the following:

* Calculate the accuracy score of the model.

* Generate a confusion matrix.

* Print the classification report.

In [None]:
# Compute the balanced accuracy of the resampled-data model on the test set
bal_accuracy_score_resampled = balanced_accuracy_score(
    y_true=y_test,
    y_pred=testing_predictions_resampled,
)

# Report the score rounded to four decimal places
print(f"Balanced Accuracy Score of the model is : {bal_accuracy_score_resampled:.4f}")

In [None]:
# Confusion matrix for the resampled-data model.
# Rows are the actual labels and columns are the predicted labels.
confusion_matrix(y_true=y_test, y_pred=testing_predictions_resampled)

In [None]:
# Imbalanced classification report for the resampled-data model.
# Display names are given in label order (0 first, then 1).
target_names = ["healthy_loan", "defaulting_loan"]

resampled_report = classification_report_imbalanced(
    y_test,
    testing_predictions_resampled,
    target_names=target_names,
)
print(resampled_report)


In [None]:
# Standard scikit-learn classification report for the resampled-data model
from sklearn.metrics import classification_report

resampled_sklearn_report = classification_report(
    y_test,
    testing_predictions_resampled,
    target_names=target_names,
)
print(resampled_sklearn_report)

## Print the classification reports for the two models

In [None]:
# Print the imbalanced classification reports one after the other:
# first the original-data model, then the resampled-data model.
for model_predictions in (testing_predictions, testing_predictions_resampled):
    print(
        classification_report_imbalanced(
            y_test,
            model_predictions,
            target_names=target_names,
        )
    )

### Step 4: Answer the following question

**Question:** How well does the logistic regression model, fit with oversampled data, predict both the `0` (healthy loan) and `1` (high-risk loan) labels?

**Answer:** The model fit with resampled data improved the results significantly for risky loans. The overall accuracy of the model increased to 99.37%. The results for healthy loans did not change; however, recall for risky loans increased impressively from 91% to 99%. This makes the model more attractive for predicting risky loans, because false negatives (borrowers predicted to be healthy who actually default) are reduced by maximizing recall. The low precision raises a flag about the effectiveness of the model, but a false positive (a borrower predicted to default who actually does not) would not lead the company to a direct loss from default. Therefore, in this case, emphasizing recall rather than precision or F1 is more significant.