# Ensemble Learning

In [36]:
# The following is a script to:
# 1. Read in loan data from a csv file.
# 2. Encode columns in the loan data using LabelEncoder, Pandas get_dummies function, and splitting of "Month-Year" string to two columns with integer month and numeric year.
# 3. Model credit risks for loan data using machine learning ensemble learning model Balanced RandomForest Classifier and resampling model Random Under Sampler (cannot use Easy 
#    Ensemble Classifier due to it being deprecated - see comments in code below).

In [37]:
# Initial imports
import warnings
warnings.filterwarnings('ignore')
from pathlib import Path

import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter
from matplotlib import pyplot as plt

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

from imblearn.under_sampling import RandomUnderSampler
from imblearn.metrics import classification_report_imbalanced
from imblearn.ensemble import BalancedRandomForestClassifier
from imblearn.ensemble import EasyEnsembleClassifier

## Read the CSV and Perform Basic Data Cleaning

In [38]:
# Load the data
file_path = Path('LoanStats_2019Q1.csv')
df = pd.read_csv(file_path)

# Preview the data
df.head()

Unnamed: 0,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,dti,...,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag
0,10500.0,0.1719,375.35,RENT,66000.0,Source Verified,Mar-2019,low_risk,n,27.24,...,85.7,100.0,0.0,0.0,65687.0,38199.0,2000.0,61987.0,N,N
1,25000.0,0.2,929.09,MORTGAGE,105000.0,Verified,Mar-2019,low_risk,n,20.23,...,91.2,50.0,1.0,0.0,271427.0,60641.0,41200.0,49197.0,N,N
2,20000.0,0.2,529.88,MORTGAGE,56000.0,Verified,Mar-2019,low_risk,n,24.26,...,66.7,50.0,0.0,0.0,60644.0,45684.0,7500.0,43144.0,N,N
3,10000.0,0.164,353.55,RENT,92000.0,Verified,Mar-2019,low_risk,n,31.44,...,100.0,50.0,1.0,0.0,99506.0,68784.0,19700.0,76506.0,N,N
4,22000.0,0.1474,520.39,MORTGAGE,52000.0,Not Verified,Mar-2019,low_risk,n,18.76,...,100.0,0.0,0.0,0.0,219750.0,25919.0,27600.0,20000.0,N,N


# Encoding

Need to encode categorical data into integers for training, testing, and predictions

In [39]:
# First will need to encode two columns with strings for the month and year: issue_d and next_pymnt_d
# Create dictionary with #s for months to replace in the existing df
name_to_num = {
    "Jan" : 1,
    'Feb': 2,
    'Mar': 3,
    'Apr': 4,
    'May': 5,
    'Jun': 6,
    'Jul': 7,
    'Aug': 8,
    'Sep': 9,
    'Oct': 10,
    'Nov': 11,
    'Dec': 12}

In [40]:
#First break the issue_d column into two and separate the month from the year - save each of these in a separate dataframe
issue_df= df["issue_d"].str.split("-",expand=True)
issue_df.rename(columns={0: "issue_month", 1: "issue_year"},inplace=True)

next_pymnt_df= df["next_pymnt_d"].str.split("-",expand=True)
next_pymnt_df.rename(columns={0: "next_pymnt_month", 1: "next_pymnt_year"},inplace=True)

next_pymnt_df.head()

Unnamed: 0,next_pymnt_month,next_pymnt_year
0,May,2019
1,May,2019
2,May,2019
3,May,2019
4,May,2019


In [41]:
# Encode the month name in both of the dataframes using the dictionary of months encoded to integers
issue_df["issue_month"] = issue_df["issue_month"].apply(lambda x: name_to_num[x])
next_pymnt_df["next_pymnt_month"] = next_pymnt_df["next_pymnt_month"].apply(lambda x: name_to_num[x])
next_pymnt_df.head()

Unnamed: 0,next_pymnt_month,next_pymnt_year
0,5,2019
1,5,2019
2,5,2019
3,5,2019
4,5,2019


In [42]:
# Concatenate the issue month and issue year dataframes with the original and drop the original columns with the string issue date in a single column
df = pd.concat([df,issue_df,next_pymnt_df],axis="columns", join='inner',)
df.rename(columns={0: "issue_month", 1: "issue_year"},inplace=True)
df.drop(columns=["issue_d","next_pymnt_d"],inplace=True)
df.head()

Unnamed: 0,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,loan_status,pymnt_plan,dti,delinq_2yrs,...,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag,issue_month,issue_year,next_pymnt_month,next_pymnt_year
0,10500.0,0.1719,375.35,RENT,66000.0,Source Verified,low_risk,n,27.24,0.0,...,65687.0,38199.0,2000.0,61987.0,N,N,3,2019,5,2019
1,25000.0,0.2,929.09,MORTGAGE,105000.0,Verified,low_risk,n,20.23,0.0,...,271427.0,60641.0,41200.0,49197.0,N,N,3,2019,5,2019
2,20000.0,0.2,529.88,MORTGAGE,56000.0,Verified,low_risk,n,24.26,0.0,...,60644.0,45684.0,7500.0,43144.0,N,N,3,2019,5,2019
3,10000.0,0.164,353.55,RENT,92000.0,Verified,low_risk,n,31.44,0.0,...,99506.0,68784.0,19700.0,76506.0,N,N,3,2019,5,2019
4,22000.0,0.1474,520.39,MORTGAGE,52000.0,Not Verified,low_risk,n,18.76,0.0,...,219750.0,25919.0,27600.0,20000.0,N,N,3,2019,5,2019


In [43]:
# Next, use LabelEncoder to convert specific columns to binary integers 0 & 1
# Create the LabelEncoder instance
le = LabelEncoder()

In [44]:
# Fit and encode the necessary df columns (i.e., those that binary - only 2 options) with the LabelEncoder
# Encode the verification_status column

le.fit(df["verification_status"])
df["verification_status"] = le.transform(df["verification_status"])

# Encode the loan_status column
le.fit(df["loan_status"])
df["loan_status"] = le.transform(df["loan_status"])

# Encode the pymnt_plan column
le.fit(df["pymnt_plan"])
df["pymnt_plan"] = le.transform(df["pymnt_plan"])

# Encode the initial_list_status column
le.fit(df["initial_list_status"])
df["initial_list_status"] = le.transform(df["initial_list_status"])

# Encode the application_type column
le.fit(df["application_type"])
df["application_type"] = le.transform(df["application_type"])

# Encode the hardship_flag column
le.fit(df["hardship_flag"])
df["hardship_flag"] = le.transform(df["hardship_flag"])

# Encode the debt_settlement_flag column
le.fit(df["debt_settlement_flag"])
df["debt_settlement_flag"] = le.transform(df["debt_settlement_flag"])

df.head()

Unnamed: 0,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,loan_status,pymnt_plan,dti,delinq_2yrs,...,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag,issue_month,issue_year,next_pymnt_month,next_pymnt_year
0,10500.0,0.1719,375.35,RENT,66000.0,1,1,0,27.24,0.0,...,65687.0,38199.0,2000.0,61987.0,0,0,3,2019,5,2019
1,25000.0,0.2,929.09,MORTGAGE,105000.0,2,1,0,20.23,0.0,...,271427.0,60641.0,41200.0,49197.0,0,0,3,2019,5,2019
2,20000.0,0.2,529.88,MORTGAGE,56000.0,2,1,0,24.26,0.0,...,60644.0,45684.0,7500.0,43144.0,0,0,3,2019,5,2019
3,10000.0,0.164,353.55,RENT,92000.0,2,1,0,31.44,0.0,...,99506.0,68784.0,19700.0,76506.0,0,0,3,2019,5,2019
4,22000.0,0.1474,520.39,MORTGAGE,52000.0,0,1,0,18.76,0.0,...,219750.0,25919.0,27600.0,20000.0,0,0,3,2019,5,2019


In [45]:
# Encode the home_ownership column using the get_dummies function since four potential strings populate this column's rows
df = pd.get_dummies(df, columns=["home_ownership"])
df.head()

Unnamed: 0,loan_amnt,int_rate,installment,annual_inc,verification_status,loan_status,pymnt_plan,dti,delinq_2yrs,inq_last_6mths,...,hardship_flag,debt_settlement_flag,issue_month,issue_year,next_pymnt_month,next_pymnt_year,home_ownership_ANY,home_ownership_MORTGAGE,home_ownership_OWN,home_ownership_RENT
0,10500.0,0.1719,375.35,66000.0,1,1,0,27.24,0.0,0.0,...,0,0,3,2019,5,2019,0,0,0,1
1,25000.0,0.2,929.09,105000.0,2,1,0,20.23,0.0,0.0,...,0,0,3,2019,5,2019,0,1,0,0
2,20000.0,0.2,529.88,56000.0,2,1,0,24.26,0.0,0.0,...,0,0,3,2019,5,2019,0,1,0,0
3,10000.0,0.164,353.55,92000.0,2,1,0,31.44,0.0,1.0,...,0,0,3,2019,5,2019,0,0,0,1
4,22000.0,0.1474,520.39,52000.0,0,1,0,18.76,0.0,1.0,...,0,0,3,2019,5,2019,0,1,0,0


## Split the Data into Training and Testing

In [46]:
# Create our features and convert the two columns with years from objects to numeric for the model input later
X = df.drop(columns="loan_status")
X["issue_year"] = pd.to_numeric(df["issue_year"])
X["next_pymnt_year"] = pd.to_numeric(df["next_pymnt_year"])

# Create our target
y = df["loan_status"].to_frame()

In [47]:
X.describe()

Unnamed: 0,loan_amnt,int_rate,installment,annual_inc,verification_status,pymnt_plan,dti,delinq_2yrs,inq_last_6mths,open_acc,...,hardship_flag,debt_settlement_flag,issue_month,issue_year,next_pymnt_month,next_pymnt_year,home_ownership_ANY,home_ownership_MORTGAGE,home_ownership_OWN,home_ownership_RENT
count,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,...,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0
mean,16677.594562,0.127718,480.652863,88213.71,0.669994,0.0,21.778153,0.217766,0.497697,12.58734,...,0.0,0.0,1.726172,2019.0,4.616839,2019.0,0.009285,0.526309,0.106747,0.357659
std,10277.34859,0.04813,288.062432,115580.0,0.719105,0.0,20.199244,0.718367,0.758122,6.022869,...,0.0,0.0,0.743862,0.0,0.486161,0.0,0.095914,0.499311,0.308793,0.479314
min,1000.0,0.06,30.89,40.0,0.0,0.0,0.0,0.0,0.0,2.0,...,0.0,0.0,1.0,2019.0,4.0,2019.0,0.0,0.0,0.0,0.0
25%,9000.0,0.0881,265.73,50000.0,0.0,0.0,13.89,0.0,0.0,8.0,...,0.0,0.0,1.0,2019.0,4.0,2019.0,0.0,0.0,0.0,0.0
50%,15000.0,0.118,404.56,73000.0,1.0,0.0,19.76,0.0,0.0,11.0,...,0.0,0.0,2.0,2019.0,5.0,2019.0,0.0,1.0,0.0,0.0
75%,24000.0,0.1557,648.1,104000.0,1.0,0.0,26.66,0.0,1.0,16.0,...,0.0,0.0,2.0,2019.0,5.0,2019.0,0.0,1.0,0.0,1.0
max,40000.0,0.3084,1676.23,8797500.0,2.0,0.0,999.0,18.0,5.0,72.0,...,0.0,0.0,3.0,2019.0,5.0,2019.0,1.0,1.0,1.0,1.0


In [48]:
# Check the balance of our target values - integer of 1 = low risk and 0 = high risk 
y["loan_status"].value_counts()

1    68470
0      347
Name: loan_status, dtype: int64

In [49]:
# Split the X and y into X_train, X_test, y_train, y_test
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

## Data Pre-Processing

Scale the training and testing data using the `StandardScaler` from `sklearn`. Remember that when scaling the data, only scale the features data (`X_train` and `X_testing`).

In [50]:
# Create the StandardScaler instance
scaler = StandardScaler()

In [51]:
# Fit the Standard Scaler with the training data
# When fitting scaling functions, only train on the training dataset
X_scaler = scaler.fit(X_train)

In [52]:
# Scale the training and testing data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

## Ensemble Learners

In this section, two ensemble algorithms are compared to determine which algorithm results in the best performance. Train a Balanced Random Forest Classifier and an Easy Ensemble classifier. For each algorithm, the folliowing steps are completed:

1. Train the model using the training data. 
2. Calculate the balanced accuracy score from sklearn.metrics.
3. Display the confusion matrix from sklearn.metrics.
4. Generate a classication report using the `imbalanced_classification_report` from imbalanced-learn.
5. For the Balanced Random Forest Classifier only, print the feature importance sorted in descending order (most important feature to least important) along with the feature score

Note: Use a random state of 1 for each algorithm to ensure consistency between tests

### Balanced Random Forest Classifier

In [53]:
# Resample the training data with the BalancedRandomForestClassifier
brf = BalancedRandomForestClassifier(n_estimators=100, random_state=1)
brf.fit(X_train_scaled, y_train)

BalancedRandomForestClassifier(random_state=1)

In [54]:
# Calculated the balanced accuracy score
brf_y_pred = brf.predict(X_test_scaled)
balanced_accuracy_score(y_test, brf_y_pred)

0.7561059192916486

In [55]:
# Display the confusion matrix
confusion_matrix(y_test, brf_y_pred)

array([[   66,    35],
       [ 2416, 14688]], dtype=int64)

In [56]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, brf_y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.03      0.65      0.86      0.05      0.75      0.55       101
          1       1.00      0.86      0.65      0.92      0.75      0.57     17104

avg / total       0.99      0.86      0.65      0.92      0.75      0.57     17205



In [78]:
# Calculate the importances into an array 
importances = brf.feature_importances_

# Sort the features in descending order (using argsoft and [::-1]) by importance
sorted_indices = np.argsort(importances)[::-1]

# Convert to a dataframe and preview to get the top 10 importances
importances_series = pd.DataFrame(importances, index=X_train.columns[sorted_indices])
importances_series.rename(columns={0: "Feature Score"},inplace=True)
importances_series.head(10)

Unnamed: 0,Feature Score
total_rec_prncp,0.01032
total_pymnt,0.030077
total_pymnt_inv,0.016133
last_pymnt_amnt,0.014127
total_rec_int,0.004494
issue_month,0.0
int_rate,0.018101
mths_since_recent_inq,0.002821
out_prncp_inv,0.004448
dti,0.009991


### Easy Ensemble Classifier

In [58]:
# The attribute "n_features_in_" in Easy Ensemble was deprecated recently (https://github.com/scikit-learn-contrib/imbalanced-learn/issues/872); therefore, Easy Ensember Classifier cannot be run.
# Since Random Undersampling is similar to Easy Ensemble, that will be used here.

# Resample the training data with the RandomUndersampler
rus = RandomUnderSampler(random_state=1)
rus_X_resampled, rus_y_resampled = rus.fit_resample(X_train_scaled, y_train)

# View the count of target classes with Counter
Counter(rus_y_resampled)

Counter({'loan_status': 1})

In [59]:
# Train the Logistic Regression model using the resampled data
rus_model = LogisticRegression(solver='lbfgs', random_state=1)
rus_model.fit(rus_X_resampled, rus_y_resampled)

LogisticRegression(random_state=1)

In [60]:
# Calculate the balanced accuracy score
rus_y_pred = rus_model.predict(X_test_scaled)
balanced_accuracy_score(y_test, rus_y_pred)

0.8221309473089498

In [61]:
# Display the confusion matrix
confusion_matrix(y_test, rus_y_pred)

array([[   85,    16],
       [ 3375, 13729]], dtype=int64)

In [62]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, rus_y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.02      0.84      0.80      0.05      0.82      0.68       101
          1       1.00      0.80      0.84      0.89      0.82      0.67     17104

avg / total       0.99      0.80      0.84      0.89      0.82      0.67     17205



### Final Questions

Since the Easy Ensemble attribute "n_features_in_" was deprecated and thus Easy Ensember Classifier cannot be run, the Balanced Random Forest Classifier is compared to the Random Undersampler Model.

1. Which model had the best balanced accuracy score?

    The random undersampling model had the better balance accuracy score (0.8221309473089498 compared to 0.7561059192916486).

2. Which model had the best recall score?

    The balanced random forest classifier had the better recall score (0.86 compared to 0.80).

3. Which model had the best geometric mean score?

    The random undersampling model had the better geometric score (0.82 compared to 0.75).

4. What are the top three features?
    
    The top three features are total received principal (total_rec_prncp), total payment (total_pymnt), and total payment invoiced (total_pymnt_inv).
