In [1]:
import warnings
warnings.filterwarnings('ignore')

# Ensemble Learning
## Initial Imports

In [2]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

In [3]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced 

## Inputting the data

In [39]:
# Load the data
file_path = Path('Resources/Lending_data.csv')
df = pd.read_csv(file_path)

In [40]:
# Preview the data
df.head()

Unnamed: 0,loan_size,interest_rate,homeowner,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
0,10700.0,7.672,own,52800,0.431818,5,1,22800,low_risk
1,8400.0,6.692,own,43600,0.311927,3,0,13600,low_risk
2,9000.0,6.963,rent,46100,0.349241,3,0,16100,low_risk
3,10700.0,7.664,own,52700,0.43074,5,1,22700,low_risk
4,10800.0,7.698,mortgage,53000,0.433962,5,1,23000,low_risk


In [41]:
# Preview the data
df.tail()

Unnamed: 0,loan_size,interest_rate,homeowner,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
77531,19100.0,11.261,own,86600,0.65358,12,2,56600,high_risk
77532,17700.0,10.662,mortgage,80900,0.629172,11,2,50900,high_risk
77533,17600.0,10.595,rent,80300,0.626401,11,2,50300,high_risk
77534,16300.0,10.068,mortgage,75300,0.601594,10,2,45300,high_risk
77535,15600.0,9.742,mortgage,72300,0.585062,9,2,42300,high_risk


In [43]:
df.shape

(77536, 9)

## Pre-Processing the data for testing and training

In [45]:
df.dtypes

loan_size           float64
interest_rate       float64
homeowner            object
borrower_income       int64
debt_to_income      float64
num_of_accounts       int64
derogatory_marks      int64
total_debt            int64
loan_status          object
dtype: object

In [61]:
X = df.copy()
X.drop(columns=["homeowner","loan_status"], inplace=True)

In [62]:
X.head()

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt
0,10700.0,7.672,52800,0.431818,5,1,22800
1,8400.0,6.692,43600,0.311927,3,0,13600
2,9000.0,6.963,46100,0.349241,3,0,16100
3,10700.0,7.664,52700,0.43074,5,1,22700
4,10800.0,7.698,53000,0.433962,5,1,23000


In [68]:
# Create our features
X = X

# Create our target
y = df["loan_status"]

In [69]:
X.describe

<bound method NDFrame.describe of        loan_size  interest_rate  borrower_income  debt_to_income  \
0        10700.0          7.672            52800        0.431818   
1         8400.0          6.692            43600        0.311927   
2         9000.0          6.963            46100        0.349241   
3        10700.0          7.664            52700        0.430740   
4        10800.0          7.698            53000        0.433962   
...          ...            ...              ...             ...   
77531    19100.0         11.261            86600        0.653580   
77532    17700.0         10.662            80900        0.629172   
77533    17600.0         10.595            80300        0.626401   
77534    16300.0         10.068            75300        0.601594   
77535    15600.0          9.742            72300        0.585062   

       num_of_accounts  derogatory_marks  total_debt  
0                    5                 1       22800  
1                    3                 

## Splitting the data into train and test

In [96]:
from sklearn.model_selection import train_test_split

# Splitting into Train and Test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=100)

## Using the StandardScaler to Pre-Process the data

In [97]:
# Create the StandardScaler instance
scaler = StandardScaler()

In [98]:
# Fit the Standard Scaler with the training data
X_scaler = scaler.fit(X_train)

In [99]:
# Scale the training data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

## Ensemble Learners
In this section, you will compare two ensemble algorithms to determine which algorithm results in the best performance. 
You will train a Balanced Random Forest Classifier and an Easy Ensemble classifier. 

For each algorithm, be sure to complete the folliowing steps:
1. Train the model using the training data. 
2. Calculate the balanced accuracy score from sklearn.metrics.
3. Display the confusion matrix from sklearn.metrics.
4. Generate a classication report using the imbalanced_classification_report from imbalanced-learn.
5. For the Balanced Random Forest Classifier only, print the feature importance sorted in descending order (most important feature to least important) along with the feature score.

Note: Use a random state of 1 for each algorithm to ensure consistency between tests

### Balanced Random Forest Classifier

In [100]:
# Create the random forest classifier instance
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier(n_estimators=500, random_state=100)

In [101]:
# Fit the model
rf_model = rf_model.fit(X_test_scaled, y_test)

In [102]:
# Making predictions using the testing data
predictions = rf_model.predict(X_test_scaled)

In [103]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
)

# Calculating the accuracy score
acc_score = accuracy_score(y_test, predictions)


In [104]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,610,3
Actual 1,26,18745


Accuracy Score : 0.9985039207593892
Classification Report
              precision    recall  f1-score   support

   high_risk       0.96      1.00      0.98       613
    low_risk       1.00      1.00      1.00     18771

    accuracy                           1.00     19384
   macro avg       0.98      1.00      0.99     19384
weighted avg       1.00      1.00      1.00     19384



In [105]:
# Get the feature importance array
importances = rf_model.feature_importances_

In [106]:
# List the top 10 most important features
importances_sorted = sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)
importances_sorted[:10]

[(0.26104037112577594, 'interest_rate'),
 (0.17640482552984402, 'loan_size'),
 (0.17115411794573518, 'total_debt'),
 (0.1711153267308679, 'borrower_income'),
 (0.1522719777531095, 'debt_to_income'),
 (0.06619443036014477, 'num_of_accounts'),
 (0.0018189505545227187, 'derogatory_marks')]

### Easy Ensemble Classifier

In [107]:
# Fit the model
rf_model_train = rf_model.fit(X_train_scaled, y_train)

In [108]:
# Making predictions using the testing data
predictions = rf_model_train.predict(X_train_scaled)

In [109]:
# Calculating the confusion matrix
cm = confusion_matrix(y_train, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
)

# Calculating the accuracy score
acc_score = accuracy_score(y_train, predictions)

In [110]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_train, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,1854,33
Actual 1,124,56141


Accuracy Score : 0.9973001788416563
Classification Report
              precision    recall  f1-score   support

   high_risk       0.94      0.98      0.96      1887
    low_risk       1.00      1.00      1.00     56265

    accuracy                           1.00     58152
   macro avg       0.97      0.99      0.98     58152
weighted avg       1.00      1.00      1.00     58152



In [111]:
# Get the feature importance array
importances_train = rf_model_train.feature_importances_

In [112]:
# List the top 10 most important features
importances_sorted_train = sorted(zip(rf_model_train.feature_importances_, X.columns), reverse=True)
importances_sorted_train[:10]

[(0.30691949258838164, 'interest_rate'),
 (0.17213871982265913, 'borrower_income'),
 (0.15981397976640935, 'total_debt'),
 (0.1459293565625411, 'debt_to_income'),
 (0.14242945194491155, 'loan_size'),
 (0.07261642785997742, 'num_of_accounts'),
 (0.00015257145511973836, 'derogatory_marks')]

Final Questions

1. Which model had the best balanced accuracy score? 

Both models had the same accruacy scores.

2. Which model had the best recall score? 

The test model had the best recall by 1%.

3. What are the top three features? 

The top three features were interest rate, borrower income, and debt to income.