In [25]:
# Import the modules
from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)


import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, classification_report
from imblearn.metrics import classification_report_imbalanced




---

## Split the Data into Training and Testing Sets

### Step 1: Read the `lending_data.csv` data from the `Resources` folder into a Pandas DataFrame.

In [26]:
# Read the CSV file from the Resources folder into a Pandas DataFrame
# YOUR CODE HERE!


lending_data_df = pd.read_csv(
    Path('Resources/lending_data.csv')
)

# Review the DataFrame
# YOUR CODE HERE!

display(lending_data_df.head())
display(lending_data_df.tail())

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
0,10700.0,7.672,52800,0.431818,5,1,22800,0
1,8400.0,6.692,43600,0.311927,3,0,13600,0
2,9000.0,6.963,46100,0.349241,3,0,16100,0
3,10700.0,7.664,52700,0.43074,5,1,22700,0
4,10800.0,7.698,53000,0.433962,5,1,23000,0


Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
77531,19100.0,11.261,86600,0.65358,12,2,56600,1
77532,17700.0,10.662,80900,0.629172,11,2,50900,1
77533,17600.0,10.595,80300,0.626401,11,2,50300,1
77534,16300.0,10.068,75300,0.601594,10,2,45300,1
77535,15600.0,9.742,72300,0.585062,9,2,42300,1


In [27]:
lending_data_df.duplicated()

0        False
1        False
2        False
3        False
4        False
         ...  
77531    False
77532     True
77533     True
77534    False
77535     True
Length: 77536, dtype: bool

In [4]:
lending_data_df[lending_data_df.duplicated()]

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
64,8500.0,6.728,43900,0.316629,3,0,13900,0
72,8600.0,6.773,44300,0.322799,3,0,14300,0
87,8400.0,6.700,43600,0.311927,3,0,13600,0
88,10800.0,7.698,53000,0.433962,5,1,23000,0
113,8600.0,6.778,44400,0.324324,3,0,14400,0
...,...,...,...,...,...,...,...,...
77529,19300.0,11.347,87400,0.656751,12,2,57400,1
77530,19700.0,11.508,88900,0.662542,13,2,58900,1
77532,17700.0,10.662,80900,0.629172,11,2,50900,1
77533,17600.0,10.595,80300,0.626401,11,2,50300,1


In [30]:
# looking at multiple instances of duplicates
lending_data_df.loc[(lending_data_df['total_debt']==13900)&(lending_data_df['interest_rate']==6.728)]

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
55,8500.0,6.728,43900,0.316629,3,0,13900,0
64,8500.0,6.728,43900,0.316629,3,0,13900,0
464,8500.0,6.728,43900,0.316629,3,0,13900,0
614,8500.0,6.728,43900,0.316629,3,0,13900,0
3294,8500.0,6.728,43900,0.316629,3,0,13900,0
6254,8500.0,6.728,43900,0.316629,3,0,13900,0
6759,8500.0,6.728,43900,0.316629,3,0,13900,0
6825,8500.0,6.728,43900,0.316629,3,0,13900,0
8055,8500.0,6.728,43900,0.316629,3,0,13900,0
8386,8500.0,6.728,43900,0.316629,3,0,13900,0


In [5]:
lending_data_df=lending_data_df.drop_duplicates()

In [6]:
lending_data_df.shape

(5229, 8)

### Step 2: Create the labels set (`y`)  from the “loan_status” column, and then create the features (`X`) DataFrame from the remaining columns.

In [7]:
# Separate the data into labels and features

# Separate the y variable, the labels
# YOUR CODE HERE!]
y = lending_data_df['loan_status']


# Separate the X variable, the features
# YOUR CODE HERE!
X = lending_data_df.drop(columns = 'loan_status')


In [8]:
# Review the y variable Series
# YOUR CODE HERE!

display(y.head())
display(y.tail())

0    0
1    0
2    0
3    0
4    0
Name: loan_status, dtype: int64

77524    1
77526    1
77528    1
77531    1
77534    1
Name: loan_status, dtype: int64

In [9]:
# Review the X variable DataFrame
# YOUR CODE HERE!

display(X.head())
display(X.tail())


Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt
0,10700.0,7.672,52800,0.431818,5,1,22800
1,8400.0,6.692,43600,0.311927,3,0,13600
2,9000.0,6.963,46100,0.349241,3,0,16100
3,10700.0,7.664,52700,0.43074,5,1,22700
4,10800.0,7.698,53000,0.433962,5,1,23000


Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt
77524,16900.0,10.302,77500,0.612903,10,2,47500
77526,18300.0,10.895,83100,0.638989,11,2,53100
77528,15100.0,9.557,70500,0.574468,9,2,40500
77531,19100.0,11.261,86600,0.65358,12,2,56600
77534,16300.0,10.068,75300,0.601594,10,2,45300


### Step 3: Check the balance of the labels variable (`y`) by using the `value_counts` function.

In [10]:
# Check the balance of our target values
# YOUR CODE HERE!

y.value_counts()

0    3653
1    1576
Name: loan_status, dtype: int64

### Step 4: Split the data into training and testing datasets by using `train_test_split`.

In [11]:
# Import the train_test_learn module
from sklearn.model_selection import train_test_split

# Split the data using train_test_split
# Assign a random_state of 1 to the function
# YOUR CODE HERE!
X_train, X_test, y_train, y_test = train_test_split(
    X, 
    y, 
    random_state = 1
)



---

## Create a Logistic Regression Model with the Original Data

###  Step 1: Fit a logistic regression model by using the training data (`X_train` and `y_train`).

In [12]:
# Import the LogisticRegression module from SKLearn
from sklearn.linear_model import LogisticRegression

# Instantiate the Logistic Regression model
# Assign a random_state parameter of 1 to the model
# YOUR CODE HERE!
LR_model = LogisticRegression(random_state = 1)


# Fit the model using training data
# YOUR CODE HERE!
LR_model.fit(X_train, y_train)


LogisticRegression(random_state=1)

### Step 2: Save the predictions on the testing data labels by using the testing feature data (`X_test`) and the fitted model.

In [13]:
# Make a prediction using the testing data
# YOUR CODE HERE!

LR_predictions = LR_model.predict(X_test)

### Step 3: Evaluate the model’s performance by doing the following:

* Calculate the accuracy score of the model.

* Generate a confusion matrix.

* Print the classification report.

In [14]:
# Print the balanced_accuracy score of the model
# YOUR CODE HERE!
balanced_accuracy_score(y_test, LR_predictions)


0.8937359901403674

In [15]:
# Generate a confusion matrix for the model
# YOUR CODE HERE!

cm_imbalanced = confusion_matrix(y_test, LR_predictions)
cm_imbalanced_df = pd.DataFrame(cm_imbalanced, 
                                index = ['Actual Healthy Loans (low-risk)', 
                                'Actual Non-Healthy Loans (high-risk)'], 
                                columns = ['Predicted Healthy Loans (low-risk)', 'Predicted Non-Healthy Loans (high-risk)']
                              )
cm_imbalanced_df

Unnamed: 0,Predicted Healthy Loans (low-risk),Predicted Non-Healthy Loans (high-risk)
Actual Healthy Loans (low-risk),832,77
Actual Non-Healthy Loans (high-risk),51,348


In [16]:
# Print the classification report for the model
# YOUR CODE HERE!

print(classification_report_imbalanced(y_test, LR_predictions))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.94      0.92      0.87      0.93      0.89      0.80       909
          1       0.82      0.87      0.92      0.84      0.89      0.79       399

avg / total       0.90      0.90      0.89      0.90      0.89      0.80      1308



### Step 4: Answer the following question.

**Question:** How well does the logistic regression model predict both the `0` (healthy loan) and `1` (high-risk loan) labels?

**Answer:** WRITE YOUR ANSWER HERE!

---

## Predict a Logistic Regression Model with Resampled Training Data

### Step 1: Use the `RandomOverSampler` module from the imbalanced-learn library to resample the data. Be sure to confirm that the labels have an equal number of data points. 

In [17]:
# Import the RandomOverSampler module form imbalanced-learn
from imblearn.over_sampling import RandomOverSampler

# Instantiate the random oversampler model
# # Assign a random_state parameter of 1 to the model
# YOUR CODE HERE!

ROS_model = RandomOverSampler(random_state = 1)


# Fit the original training data to the random_oversampler model
# YOUR CODE HERE!

X_oversampled, y_oversampled = ROS_model.fit_resample(X_train, y_train)

In [18]:
# Count the distinct values of the resampled labels data
# YOUR CODE HERE!

y_oversampled.value_counts()

0    2744
1    2744
Name: loan_status, dtype: int64

### Step 2: Use the `LogisticRegression` classifier and the resampled data to fit the model and make predictions.

In [19]:
# Instantiate the Logistic Regression model
# Assign a random_state parameter of 1 to the model
# YOUR CODE HERE!
LR_oversampled_model = LogisticRegression(random_state = 1)


# Fit the model using the resampled training data
# YOUR CODE HERE!
LR_oversampled_model.fit(X_oversampled, y_oversampled)

# Make a prediction using the testing data
# YOUR CODE HERE!

LR_oversampled_pred = LR_oversampled_model.predict(X_test)

### Step 3: Evaluate the model’s performance by doing the following:

* Calculate the accuracy score of the model.

* Generate a confusion matrix.

* Print the classification report.

In [20]:
# Print the balanced_accuracy score of the model 
# YOUR CODE HERE!

balanced_accuracy_score(y_test, LR_oversampled_pred)

0.9409580055749935

In [21]:
# Generate a confusion matrix for the model
# YOUR CODE HERE!

cm_oversampled = confusion_matrix(y_test, LR_oversampled_pred)
cm_oversampled_df = pd.DataFrame(cm_oversampled, 
                                index = ['Actual Healthy Loans (low-risk)', 
                                'Actual Non-Healthy Loans (high-risk)'], 
                                columns = ['Predicted Healthy Loans (low-risk)', 'Predicted Non-Healthy Loans (high-risk)']
                              )
cm_oversampled_df

Unnamed: 0,Predicted Healthy Loans (low-risk),Predicted Non-Healthy Loans (high-risk)
Actual Healthy Loans (low-risk),829,80
Actual Non-Healthy Loans (high-risk),12,387


In [22]:
# Print the classification report for the model
# YOUR CODE HERE!
print(classification_report(y_test, LR_oversampled_pred))

              precision    recall  f1-score   support

           0       0.99      0.91      0.95       909
           1       0.83      0.97      0.89       399

    accuracy                           0.93      1308
   macro avg       0.91      0.94      0.92      1308
weighted avg       0.94      0.93      0.93      1308



### Step 4: Answer the following question

**Question:** How well does the logistic regression model, fit with oversampled data, predict both the `0` (healthy loan) and `1` (high-risk loan) labels?

**Answer:** YOUR ANSWER HERE!