# Credit Risk Ensemble Techniques

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

In [3]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

# Read the CSV and Perform Basic Data Cleaning

In [4]:
# Load the data
file_path = Path('Resources/lending_data.csv')
df = pd.read_csv(file_path)

df.head()

Unnamed: 0,loan_size,interest_rate,homeowner,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
0,10700.0,7.672,own,52800,0.431818,5,1,22800,low_risk
1,8400.0,6.692,own,43600,0.311927,3,0,13600,low_risk
2,9000.0,6.963,rent,46100,0.349241,3,0,16100,low_risk
3,10700.0,7.664,own,52700,0.43074,5,1,22700,low_risk
4,10800.0,7.698,mortgage,53000,0.433962,5,1,23000,low_risk


In [5]:
from sklearn.preprocessing import LabelEncoder
#Convert all categorical data into binary data
le = LabelEncoder()

# homeowner column
le.fit(df["homeowner"])
df["homeowner"] = le.transform(df["homeowner"])

# loan status column
le.fit(df["loan_status"])
df["loan_status"] = le.transform(df["loan_status"])

df.head()

Unnamed: 0,loan_size,interest_rate,homeowner,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
0,10700.0,7.672,1,52800,0.431818,5,1,22800,1
1,8400.0,6.692,1,43600,0.311927,3,0,13600,1
2,9000.0,6.963,2,46100,0.349241,3,0,16100,1
3,10700.0,7.664,1,52700,0.43074,5,1,22700,1
4,10800.0,7.698,0,53000,0.433962,5,1,23000,1


# Split the Data into Training and Testing

In [6]:
# Create our features
X = pd.get_dummies(df.drop('loan_status', axis=1))

# Create our target
y = df["loan_status"]

In [7]:
X.describe()

Unnamed: 0,loan_size,interest_rate,homeowner,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt
count,77536.0,77536.0,77536.0,77536.0,77536.0,77536.0,77536.0,77536.0
mean,9805.562577,7.292333,0.606144,49221.949804,0.377318,3.82661,0.392308,19221.949804
std,2093.223153,0.889495,0.667811,8371.635077,0.081519,1.904426,0.582086,8371.635077
min,5000.0,5.25,0.0,30000.0,0.0,0.0,0.0,0.0
25%,8700.0,6.825,0.0,44800.0,0.330357,3.0,0.0,14800.0
50%,9500.0,7.172,1.0,48100.0,0.376299,4.0,0.0,18100.0
75%,10400.0,7.528,1.0,51400.0,0.416342,4.0,1.0,21400.0
max,23800.0,13.235,2.0,105200.0,0.714829,16.0,3.0,75200.0


In [8]:
# Check the balance of our target values
y.value_counts()

1    75036
0     2500
Name: loan_status, dtype: int64

In [9]:
# Split the X and y into X_train, X_test, y_train, y_test
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=1, 
                                                    stratify=y)
X_train.shape

(58152, 8)

# Ensemble Learners

In this section, you will compare two ensemble algorithms to determine which algorithm results in the best performance. You will train a Balanced Random Forest Classifier and an Easy Ensemble classifier . For each algorithm, be sure to complete the folliowing steps:

1. Train the model using the training data. 
2. Calculate the balanced accuracy score from sklearn.metrics.
3. Print the confusion matrix from sklearn.metrics.
4. Generate a classication report using the `imbalanced_classification_report` from imbalanced-learn.
5. For the Balanced Random Forest Classifier onely, print the feature importance sorted in descending order (most important feature to least important) along with the feature score

Note: Use a random state of 1 for each algorithm to ensure consistency between tests

### Balanced Random Forest Classifier

In [15]:
# Resample the training data with the BalancedRandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
model = BalancedRandomForestClassifier(n_estimators=500, random_state=1)
model = model.fit(X_train, y_train)
model

BalancedRandomForestClassifier(n_estimators=500, random_state=78)

In [16]:
# Calculated the balanced accuracy score
y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.9936552268244576

In [20]:
# Display the confusion matrix
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
)
print("Confusion Matrix")
display(cm_df)

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,622,3
Actual 1,148,18611


In [21]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.81      1.00      0.99      0.89      0.99      0.99       625
          1       1.00      0.99      1.00      1.00      0.99      0.99     18759

avg / total       0.99      0.99      1.00      0.99      0.99      0.99     19384



In [23]:
# List the features sorted in descending order by feature importance
importances = model.feature_importances_

# Sort the features by their importance
sorted(zip(model.feature_importances_, X.columns), reverse=True)

[(0.19339431692680864, 'interest_rate'),
 (0.183973477548623, 'borrower_income'),
 (0.1703341491434559, 'debt_to_income'),
 (0.15303326181575788, 'total_debt'),
 (0.14822482992810818, 'loan_size'),
 (0.11972467552532383, 'num_of_accounts'),
 (0.028627147615202794, 'derogatory_marks'),
 (0.002688141496719738, 'homeowner')]

### Easy Ensemble Classifier

In [24]:
# Train the EasyEnsembleClassifier
from imblearn.ensemble import EasyEnsembleClassifier
model = EasyEnsembleClassifier(n_estimators=100, random_state=2)
model.fit(X_train, y_train)

EasyEnsembleClassifier(n_estimators=100, random_state=2)

In [25]:
# Calculated the balanced accuracy score
y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.9944548430086891

In [26]:
# Display the confusion matrix
pd.DataFrame(
    confusion_matrix(y_test, y_pred),
    index=["Actual 0", "Actual 1"],
    columns=["Predicted 0", "Predicted 1"]
)

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,622,3
Actual 1,118,18641


In [27]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.84      1.00      0.99      0.91      0.99      0.99       625
          1       1.00      0.99      1.00      1.00      0.99      0.99     18759

avg / total       0.99      0.99      1.00      0.99      0.99      0.99     19384

