In [1]:
# Import the modules
from pathlib import Path

import numpy as np
import pandas as pd
from imblearn.over_sampling import RandomOverSampler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split

In [2]:
### Step 1: Read the `lending_data.csv` data into a Pandas DataFrame

In [3]:
# Load the lending data CSV into a Pandas DataFrame
my_data = Path('lending_data.csv')
lending_ = pd.read_csv(filepath_or_buffer=my_data)

# Preview the first and last five rows of the DataFrame
display(lending_.head(), lending_.tail())

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
0,10700.0,7.672,52800,0.431818,5,1,22800,0
1,8400.0,6.692,43600,0.311927,3,0,13600,0
2,9000.0,6.963,46100,0.349241,3,0,16100,0
3,10700.0,7.664,52700,0.43074,5,1,22700,0
4,10800.0,7.698,53000,0.433962,5,1,23000,0


Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
77531,19100.0,11.261,86600,0.65358,12,2,56600,1
77532,17700.0,10.662,80900,0.629172,11,2,50900,1
77533,17600.0,10.595,80300,0.626401,11,2,50300,1
77534,16300.0,10.068,75300,0.601594,10,2,45300,1
77535,15600.0,9.742,72300,0.585062,9,2,42300,1


In [4]:
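### Step 2: Separate the data into labels (`y`, the `loan_status` column) and features (`X`, the remaining columns)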

In [5]:
# Features (X): every column except the target
X = lending_.drop('loan_status', axis=1)

# Labels (y): the loan_status column
y = lending_['loan_status']

# Preview the first rows of the features
X.head()

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt
0,10700.0,7.672,52800,0.431818,5,1,22800
1,8400.0,6.692,43600,0.311927,3,0,13600
2,9000.0,6.963,46100,0.349241,3,0,16100
3,10700.0,7.664,52700,0.43074,5,1,22700
4,10800.0,7.698,53000,0.433962,5,1,23000


In [6]:
# Show the first five labels of the y Series
y.head()

0    0
1    0
2    0
3    0
4    0
Name: loan_status, dtype: int64

In [7]:
# Show the first five rows of the X features DataFrame
X.head(n=5)

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt
0,10700.0,7.672,52800,0.431818,5,1,22800
1,8400.0,6.692,43600,0.311927,3,0,13600
2,9000.0,6.963,46100,0.349241,3,0,16100
3,10700.0,7.664,52700,0.43074,5,1,22700
4,10800.0,7.698,53000,0.433962,5,1,23000


In [8]:
### Step 3: Check the balance of the target values

In [9]:
# Count how many rows fall into each loan_status class to gauge class balance
label_counts = y.value_counts()
label_counts

0    75036
1     2500
Name: loan_status, dtype: int64

In [10]:
### Step 4: Split the data into training and testing sets

In [11]:
# Split the data into training and testing sets using train_test_split
# (train_test_split is already imported in the first cell of the notebook)
# Stratify on y so both splits keep the same class proportions
# Assign a random_state of 1 to the function
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=1,
)

In [12]:
## Create a Logistic Regression Model with the Original Data

In [13]:
### Step 1: Fit a logistic regression model using the training data

In [15]:
# Instantiate the Logistic Regression model
# (LogisticRegression is already imported in the first cell of the notebook)
# Assign a random_state parameter of 1 to the model
classifier = LogisticRegression(solver='lbfgs',
                                max_iter=200,
                                random_state=1)

# Fit the model using training data
# fit() returns the fitted estimator itself, so _model refers to classifier
_model = classifier.fit(X_train, y_train)

In [16]:
### Step 2: Make predictions using the testing data

In [17]:
# Predict the loan status for the testing features
predictions = classifier.predict(X_test)

# Show each prediction next to the actual label (indexed like y_test)
pd.DataFrame(dict(Prediction=predictions, Actual=y_test))

Unnamed: 0,Prediction,Actual
36831,0,0
75818,0,1
36563,0,0
13237,0,0
43292,0,0
...,...,...
38069,0,0
36892,0,0
5035,0,0
40821,0,0


In [18]:
### Step 3: Evaluate the model’s performance by doing the following:

In [19]:
# Compute the balanced accuracy score of the model on the testing data
balanced_accuracy_score(y_true=y_test, y_pred=predictions)

0.9442676901753825

In [20]:
# Build the confusion matrix: rows are the actual classes, columns the predicted ones
cm = confusion_matrix(y_true=y_test, y_pred=predictions)
cm_df = pd.DataFrame(
    data=cm,
    index=["Actual 0", "Actual 1"],
    columns=["Predicted 0", "Predicted 1"],
)
display(cm_df)

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,18679,80
Actual 1,67,558


In [21]:
# Print the classification report with readable names for classes 0 and 1
target_names = ["healthy loan", "high-risk loan"]
print("Classification Report")
baseline_report = classification_report(y_test, predictions, target_names=target_names)
print(baseline_report)

Classification Report
                precision    recall  f1-score   support

  healthy loan       1.00      1.00      1.00     18759
high-risk loan       0.87      0.89      0.88       625

      accuracy                           0.99     19384
     macro avg       0.94      0.94      0.94     19384
  weighted avg       0.99      0.99      0.99     19384



In [22]:
### Step 4: Answer the following question.

In [23]:
## Predict a Logistic Regression Model with Resampled Training Data

In [24]:
### Step 1: Oversample the training data with `RandomOverSampler`

In [25]:
# Instantiate the random oversampler model
# (RandomOverSampler is imported with the other modules in the first cell)
# Assign a random_state parameter of 1 to the model
model1 = RandomOverSampler(random_state=1)

# Fit the original training data to the random_oversampler model
# Only the training split is resampled; X_test / y_test are not passed in
X_resampled, y_resampled = model1.fit_resample(X_train, y_train)

In [26]:
# Count the distinct values of the resampled labels data
# Wrapping in pd.Series keeps this working even if the sampler returned a
# NumPy array instead of a pandas Series
pd.Series(y_resampled).value_counts()

In [27]:
### Step 2: Fit a logistic regression model on the resampled training data and make predictions

In [28]:
# Instantiate the Logistic Regression model
# Assign a random_state parameter of 1 to the model
# Use the same solver and max_iter as the model fit on the original data so
# that the two models differ only in the training data they were fit on
classifier_ros = LogisticRegression(solver='lbfgs',
                                    max_iter=200,
                                    random_state=1)

# Fit the model using the resampled training data
classifier_ros.fit(X_resampled, y_resampled)

# Make a prediction using the (non-resampled) testing data
predictions_ros = classifier_ros.predict(X_test)
pd.DataFrame({"Prediction": predictions_ros, "Actual": y_test})


Unnamed: 0,Prediction,Actual
36831,0,0
75818,1,1
36563,0,0
13237,0,0
43292,0,0
...,...,...
38069,0,0
36892,0,0
5035,0,0
40821,0,0


In [29]:
## Step 3: Evaluate the model’s performance by doing the following:

In [30]:
# Compute the balanced accuracy score of the model fit on the resampled data
balanced_accuracy_score(y_true=y_test, y_pred=predictions_ros)

0.9959744975744975

In [31]:
# Build the confusion matrix: rows are the actual classes, columns the predicted ones
matrix = confusion_matrix(y_true=y_test, y_pred=predictions_ros)
matrix_df = pd.DataFrame(
    data=matrix,
    index=["Actual 0", "Actual 1"],
    columns=["Predicted 0", "Predicted 1"],
)
display(matrix_df)

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,18668,91
Actual 1,2,623


In [32]:
# Print the classification report for the model fit on the resampled data
target_names = ["healthy loan", "high-risk loan"]
print("ROS Classification Report")
ros_report = classification_report(y_test, predictions_ros, target_names=target_names)
print(ros_report)

ROS Classification Report
                precision    recall  f1-score   support

  healthy loan       1.00      1.00      1.00     18759
high-risk loan       0.87      1.00      0.93       625

      accuracy                           1.00     19384
     macro avg       0.94      1.00      0.96     19384
  weighted avg       1.00      1.00      1.00     19384



In [None]:
### Step 4: Answer the following question

In [None]:
**Question:** How well does the logistic regression model, fit with oversampled data, predict both the 0 (healthy loan) and 1 (high-risk loan) labels?

In [None]:
###
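Precision is the fraction of the predictions for a class that are correct.
"Healthy loan":
the precision is 1.00 (rounded), so almost every loan predicted as healthy is actually healthy: only 2 of the 18,670 loans predicted as healthy are high-risk.
"High-risk loan":
the precision is 0.87, so 87% of the loans predicted as high-risk are actually high-risk (623 of 714); the other 91 are healthy loans that were flagged incorrectly.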

Recall is the ability of a classifier to find all actual instances of a class.
The recall for both the "healthy loan" and "high-risk loan" classes rounds to 1.00: the model identifies 18,668 of the 18,759 healthy loans and 623 of the 625 high-risk loans.

The F1 score is the harmonic mean of precision and recall; the best score is 1.0 and the worst is 0.0.
For the "healthy loan" class the F1-score is 1.00, which indicates excellent performance. For the "high-risk loan" class it is 0.93, which reflects a good balance between the precision (0.87) and the recall (1.00) for this label.

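Based on these figures, the preferred model is the logistic regression fit with the oversampled data, because it is better at identifying high-risk loans: compared with the model fit on the original data, recall for the "high-risk loan" class rises from 0.89 to 1.00 and the balanced accuracy rises from about 0.944 to about 0.996, while the precision for that class stays at 0.87.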
###