In [1]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

In [2]:
# Load the data
csv_path = Path(r"C:\Users\Sejas\Desktop\FinTech\mia-mia-fin-pt-05-2021-u-c-master\Week11_Classification\lending_data.csv")
lending = pd.read_csv(csv_path)

lending.head()

Unnamed: 0,loan_size,interest_rate,homeowner,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
0,10700.0,7.672,own,52800,0.431818,5,1,22800,low_risk
1,8400.0,6.692,own,43600,0.311927,3,0,13600,low_risk
2,9000.0,6.963,rent,46100,0.349241,3,0,16100,low_risk
3,10700.0,7.664,own,52700,0.43074,5,1,22700,low_risk
4,10800.0,7.698,mortgage,53000,0.433962,5,1,23000,low_risk


In [3]:
# Fitting and encoding the columns with the LabelEncoder
le = LabelEncoder()

# Encoding homeowner column
le.fit(lending["homeowner"])
lending["homeowner"] = le.transform(lending["homeowner"])

lending.head()

Unnamed: 0,loan_size,interest_rate,homeowner,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
0,10700.0,7.672,1,52800,0.431818,5,1,22800,low_risk
1,8400.0,6.692,1,43600,0.311927,3,0,13600,low_risk
2,9000.0,6.963,2,46100,0.349241,3,0,16100,low_risk
3,10700.0,7.664,1,52700,0.43074,5,1,22700,low_risk
4,10800.0,7.698,0,53000,0.433962,5,1,23000,low_risk


In [4]:
# Create our features
X = lending.drop(columns="loan_status")

# Create our target
y = lending["loan_status"]
X

Unnamed: 0,loan_size,interest_rate,homeowner,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt
0,10700.0,7.672,1,52800,0.431818,5,1,22800
1,8400.0,6.692,1,43600,0.311927,3,0,13600
2,9000.0,6.963,2,46100,0.349241,3,0,16100
3,10700.0,7.664,1,52700,0.430740,5,1,22700
4,10800.0,7.698,0,53000,0.433962,5,1,23000
...,...,...,...,...,...,...,...,...
77531,19100.0,11.261,1,86600,0.653580,12,2,56600
77532,17700.0,10.662,0,80900,0.629172,11,2,50900
77533,17600.0,10.595,2,80300,0.626401,11,2,50300
77534,16300.0,10.068,0,75300,0.601594,10,2,45300


In [5]:
X.describe()

Unnamed: 0,loan_size,interest_rate,homeowner,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt
count,77536.0,77536.0,77536.0,77536.0,77536.0,77536.0,77536.0,77536.0
mean,9805.562577,7.292333,0.606144,49221.949804,0.377318,3.82661,0.392308,19221.949804
std,2093.223153,0.889495,0.667811,8371.635077,0.081519,1.904426,0.582086,8371.635077
min,5000.0,5.25,0.0,30000.0,0.0,0.0,0.0,0.0
25%,8700.0,6.825,0.0,44800.0,0.330357,3.0,0.0,14800.0
50%,9500.0,7.172,1.0,48100.0,0.376299,4.0,0.0,18100.0
75%,10400.0,7.528,1.0,51400.0,0.416342,4.0,1.0,21400.0
max,23800.0,13.235,2.0,105200.0,0.714829,16.0,3.0,75200.0


In [6]:
# Check the balance of our target values
y.value_counts()

low_risk     75036
high_risk     2500
Name: loan_status, dtype: int64

In [7]:
# Split the X and y into X_train, X_test, y_train, y_test
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test= train_test_split(X, 
                                                   y, 
                                                   random_state=1, 
                                                   stratify=y)
X_train.shape

(58152, 8)

In [8]:
scaler = StandardScaler()

# Fitting the scaler
X_scaler = scaler.fit(X_train)

# Scaling data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [9]:
# Resample the training data with the BalancedRandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier

brf = BalancedRandomForestClassifier(n_estimators=1000, random_state=1)
brf.fit(X_train_scaled, y_train)

BalancedRandomForestClassifier(n_estimators=1000, random_state=1)

In [10]:
# Calculated the balanced accuracy score
y_pred_brf = brf.predict(X_test_scaled)
bas_brf=balanced_accuracy_score(y_test, y_pred_brf)
print(bas_brf)

0.9937351884428807


In [11]:
# Display the confusion matrix
cm_brf = confusion_matrix(y_test, y_pred_brf)
cm_df_brf = pd.DataFrame(
    cm_brf, index=["Actual High Risk", "Actual Low Risk"], columns=["Predicted High Risk", "Predicted Low Risk"]
)
cm_df_brf

Unnamed: 0,Predicted High Risk,Predicted Low Risk
Actual High Risk,622,3
Actual Low Risk,145,18614


In [12]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred_brf))

                   pre       rec       spe        f1       geo       iba       sup

  high_risk       0.81      1.00      0.99      0.89      0.99      0.99       625
   low_risk       1.00      0.99      1.00      1.00      0.99      0.99     18759

avg / total       0.99      0.99      1.00      0.99      0.99      0.99     19384



In [13]:
# List the features sorted in descending order by feature importance
importances = brf.feature_importances_
sorted(zip(brf.feature_importances_, X.columns), reverse=True)

[(0.18703526235640708, 'total_debt'),
 (0.17254127613882989, 'borrower_income'),
 (0.16565598628726366, 'debt_to_income'),
 (0.1624914600616813, 'interest_rate'),
 (0.15613786418656678, 'loan_size'),
 (0.12193867705419299, 'num_of_accounts'),
 (0.03158406206866748, 'derogatory_marks'),
 (0.0026154118463908595, 'homeowner')]

In [14]:
# Train the EasyEnsembleClassifier
from imblearn.ensemble import EasyEnsembleClassifier
eec = EasyEnsembleClassifier(n_estimators=1000, random_state=1)
eec.fit(X_train_scaled, y_train)

EasyEnsembleClassifier(n_estimators=1000, random_state=1)

In [20]:
# Calculated the balanced accuracy score
y_pred_eec = eec.predict(X_test_scaled)
bas_eec=balanced_accuracy_score(y_test, y_pred_brf)
print(bas_eec)

0.9937351884428807


In [21]:
# Display the confusion matrix
cm_eec = confusion_matrix(y_test, y_pred_eec)
cm_df_eec = pd.DataFrame(
    cm_eec, index=["Actual High Risk", "Actual Low Risk"], columns=["Predicted High Risk", "Predicted Low Risk"]
)
cm_df_eec

Unnamed: 0,Predicted High Risk,Predicted Low Risk
Actual High Risk,622,3
Actual Low Risk,118,18641


In [22]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred_eec))

                   pre       rec       spe        f1       geo       iba       sup

  high_risk       0.84      1.00      0.99      0.91      0.99      0.99       625
   low_risk       1.00      0.99      1.00      1.00      0.99      0.99     18759

avg / total       0.99      0.99      1.00      0.99      0.99      0.99     19384



In [23]:
print(f"In my analysis, both models have the same balanced accuracy score:")
print(f"Balanced Random Forest Classifier = {bas_brf}")
print(f"Easy Ensemble Classifier = {bas_eec}")

In my analysis, both models have the same balanced accuracy score:
Balanced Random Forest Classifier = 0.9937351884428807
Easy Ensemble Classifier = 0.9937351884428807
