# Credit Risk Ensemble Techniques

In [36]:
import warnings
warnings.filterwarnings('ignore')

In [37]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

In [38]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

# Read the CSV and Perform Basic Data Cleaning

In [39]:
# Load the data
file_path = Path('Resources/lending_data.csv')
df = pd.read_csv(file_path)

df.head()

Unnamed: 0,loan_size,interest_rate,homeowner,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
0,10700.0,7.672,own,52800,0.431818,5,1,22800,low_risk
1,8400.0,6.692,own,43600,0.311927,3,0,13600,low_risk
2,9000.0,6.963,rent,46100,0.349241,3,0,16100,low_risk
3,10700.0,7.664,own,52700,0.43074,5,1,22700,low_risk
4,10800.0,7.698,mortgage,53000,0.433962,5,1,23000,low_risk


# Split the Data into Training and Testing

In [40]:
# Create our features
X = df["loan_status"]

# Create our target
y = df.drop(columns="loan_status")

In [41]:
X.describe()

count        77536
unique           2
top       low_risk
freq         75036
Name: loan_status, dtype: object

In [42]:
# Check the balance of our target values
df["loan_status"].value_counts()

low_risk     75036
high_risk     2500
Name: loan_status, dtype: int64

In [43]:
# Split the X and y into X_train, X_test, y_train, y_test
from sklearn.datasets import make_blobs

X, y = make_blobs(centers=2, random_state=1, cluster_std=3)

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=1, 
                                                    stratify=y)

X_train.shape

(75, 2)

# Ensemble Learners

In this section, you will compare two ensemble algorithms to determine which algorithm results in the best performance. You will train a Balanced Random Forest Classifier and an Easy Ensemble classifier . For each algorithm, be sure to complete the folliowing steps:

1. Train the model using the training data. 
2. Calculate the balanced accuracy score from sklearn.metrics.
3. Print the confusion matrix from sklearn.metrics.
4. Generate a classication report using the `imbalanced_classification_report` from imbalanced-learn.
5. For the Balanced Random Forest Classifier onely, print the feature importance sorted in descending order (most important feature to least important) along with the feature score

Note: Use a random state of 1 for each algorithm to ensure consistency between tests

### Balanced Random Forest Classifier

In [44]:
# Resample the training data with the BalancedRandomForestClassifier
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier(n_estimators=500, random_state=1)



In [45]:
# Fitting the model
rf_model = rf_model.fit(X_train , y_train)

In [46]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score
y_pred = rf_model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)


0.9583333333333333

In [47]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[11,  1],
       [ 0, 13]], dtype=int64)

In [48]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       1.00      0.92      1.00      0.96      0.96      0.91        12
          1       0.93      1.00      0.92      0.96      0.96      0.92        13

avg / total       0.96      0.96      0.96      0.96      0.96      0.92        25



In [49]:
# List the features sorted in descending order by feature importance
importances = rf_model.feature_importances_

In [57]:
importances_sorted = sorted(zip(rf_model.feature_importances_, df.columns), reverse=True)
importances_sorted

[(0.5039411487482319, 'interest_rate'), (0.496058851251768, 'loan_size')]

### Easy Ensemble Classifier

In [51]:
# Train the EasyEnsembleClassifier
from imblearn.ensemble import EasyEnsembleClassifier

eec_model = EasyEnsembleClassifier(n_estimators=500, random_state=1)

In [52]:
# Fitting the model
eec_model = eec_model.fit(X_train , y_train)

In [53]:
# Calculated the balanced accuracy score
y_pred = eec_model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

1.0

In [54]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[12,  0],
       [ 0, 13]], dtype=int64)

In [55]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       1.00      1.00      1.00      1.00      1.00      1.00        12
          1       1.00      1.00      1.00      1.00      1.00      1.00        13

avg / total       1.00      1.00      1.00      1.00      1.00      1.00        25

