# Ensemble Learning

## Initial Imports

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [3]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced
from sklearn.ensemble import RandomForestClassifier
from imblearn.ensemble import EasyEnsembleClassifier
from sklearn.preprocessing import LabelEncoder

## Read the CSV and Inspect the Data

In [4]:
# Load the data
file_path = Path('Resources/LoanStats_2019Q1.csv')
df = pd.read_csv(file_path)

# Preview the data
df.head()

Unnamed: 0,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,dti,...,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag
0,10500.0,0.1719,375.35,RENT,66000.0,Source Verified,Mar-2019,low_risk,n,27.24,...,85.7,100.0,0.0,0.0,65687.0,38199.0,2000.0,61987.0,N,N
1,25000.0,0.2,929.09,MORTGAGE,105000.0,Verified,Mar-2019,low_risk,n,20.23,...,91.2,50.0,1.0,0.0,271427.0,60641.0,41200.0,49197.0,N,N
2,20000.0,0.2,529.88,MORTGAGE,56000.0,Verified,Mar-2019,low_risk,n,24.26,...,66.7,50.0,0.0,0.0,60644.0,45684.0,7500.0,43144.0,N,N
3,10000.0,0.164,353.55,RENT,92000.0,Verified,Mar-2019,low_risk,n,31.44,...,100.0,50.0,1.0,0.0,99506.0,68784.0,19700.0,76506.0,N,N
4,22000.0,0.1474,520.39,MORTGAGE,52000.0,Not Verified,Mar-2019,low_risk,n,18.76,...,100.0,0.0,0.0,0.0,219750.0,25919.0,27600.0,20000.0,N,N


In [5]:
# Check dataframe for any null or non numeric feature values pre clean
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 68817 entries, 0 to 68816
Data columns (total 86 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   loan_amnt                   68817 non-null  float64
 1   int_rate                    68817 non-null  float64
 2   installment                 68817 non-null  float64
 3   home_ownership              68817 non-null  object 
 4   annual_inc                  68817 non-null  float64
 5   verification_status         68817 non-null  object 
 6   issue_d                     68817 non-null  object 
 7   loan_status                 68817 non-null  object 
 8   pymnt_plan                  68817 non-null  object 
 9   dti                         68817 non-null  float64
 10  delinq_2yrs                 68817 non-null  float64
 11  inq_last_6mths              68817 non-null  float64
 12  open_acc                    68817 non-null  float64
 13  pub_rec                     688

In [6]:
# Identify any non numeric feature value columns that need to be treated
# Examine this feature data to determin the appropriate mehtods for cleaning
df.select_dtypes(include='object')

Unnamed: 0,home_ownership,verification_status,issue_d,loan_status,pymnt_plan,initial_list_status,next_pymnt_d,application_type,hardship_flag,debt_settlement_flag
0,RENT,Source Verified,Mar-2019,low_risk,n,w,May-2019,Individual,N,N
1,MORTGAGE,Verified,Mar-2019,low_risk,n,w,May-2019,Individual,N,N
2,MORTGAGE,Verified,Mar-2019,low_risk,n,w,May-2019,Individual,N,N
3,RENT,Verified,Mar-2019,low_risk,n,w,May-2019,Individual,N,N
4,MORTGAGE,Not Verified,Mar-2019,low_risk,n,w,May-2019,Individual,N,N
...,...,...,...,...,...,...,...,...,...,...
68812,RENT,Source Verified,Jan-2019,low_risk,n,w,May-2019,Individual,N,N
68813,RENT,Not Verified,Jan-2019,low_risk,n,w,May-2019,Individual,N,N
68814,MORTGAGE,Source Verified,Jan-2019,low_risk,n,w,May-2019,Individual,N,N
68815,MORTGAGE,Verified,Jan-2019,low_risk,n,f,May-2019,Individual,N,N


In [7]:
# Return columns from dataframe for any null or non numeric feature values pre clean
df.select_dtypes(include='object').info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 68817 entries, 0 to 68816
Data columns (total 10 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   home_ownership        68817 non-null  object
 1   verification_status   68817 non-null  object
 2   issue_d               68817 non-null  object
 3   loan_status           68817 non-null  object
 4   pymnt_plan            68817 non-null  object
 5   initial_list_status   68817 non-null  object
 6   next_pymnt_d          68817 non-null  object
 7   application_type      68817 non-null  object
 8   hardship_flag         68817 non-null  object
 9   debt_settlement_flag  68817 non-null  object
dtypes: object(10)
memory usage: 5.3+ MB


### Data Inspection Assessment

It can be seen when examining this data that there are 9 feature data columns (other than our targe column **loan_status**) that contain objects that need to be treated. 

At this stage it is noted other than incompatible Dtypes for modelling there are no null rows, values or other basic data integrity issues

## Perform Initial Data Cleaning

The feature data will be cleaned and treated using date splitting and conversion along with integer encoding. Before progressing to data set splitting and scaling

In [8]:
# Split and treat the no numeric data in the issue_d and next_pymnt_d columns
# Split year from Month short name into two new columns for issue_d column
df[['issue_month','issue_year']] = df['issue_d'].str.split('-', expand=True)
# Split year from Month short name into two new columns for next_pymnt_d column
df[['next_pymnt_month','next_pymnt_year']] = df['next_pymnt_d'].str.split('-', expand=True)

# Create Months dictionary using month shortname as seen in the data
months_num = {
    "Jan": 1,
    "Feb": 2,
    "Mar": 3,
    "Apr": 4,
    "May": 5,
    "Jun": 6,
    "Jul": 7,
    "Aug": 8,
    "Sep": 9,
    "Oct": 10,
    "Nov": 11,
    "Dec": 12,
}

# Months' names encoded using the dictionary values
df["issue_month_num"] = df["issue_month"].apply(lambda x: months_num[x])
df["next_pymnt_month_num"] = df["next_pymnt_month"].apply(lambda x: months_num[x])

# Convert issue_year and next_pymnt_year column to int data type
df = df.astype({"issue_year": int,"next_pymnt_year": int})

# Drop the date columns as this data has been treated and numerically represented in new columns
df.drop(["issue_d", "issue_month","next_pymnt_d", "next_pymnt_month"], axis=1, inplace=True)

# Create list of all remaining columns that need to be treated by integer encoder
non_integer_columns =  ["home_ownership", "verification_status", "pymnt_plan", "initial_list_status", "application_type", "hardship_flag", "debt_settlement_flag"]  

#Create loop to integer encode the remaining data 
for column in non_integer_columns:
    # Creating an instance of label encoder for integer encoding the columns data
    label_encoder = LabelEncoder()
    # Fitting the label encoder
    label_encoder.fit(df[column])
    # Encode the data as an integer into new label encoded column
    encoded_column_name = column + "_le"
    df[encoded_column_name] = label_encoder.transform(df[column]) 
    # Drop the source column
    df.drop([column], axis=1, inplace=True)

# Inspect the cleaned data    
df.head()

Unnamed: 0,loan_amnt,int_rate,installment,annual_inc,loan_status,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,...,next_pymnt_year,issue_month_num,next_pymnt_month_num,home_ownership_le,verification_status_le,pymnt_plan_le,initial_list_status_le,application_type_le,hardship_flag_le,debt_settlement_flag_le
0,10500.0,0.1719,375.35,66000.0,low_risk,27.24,0.0,0.0,8.0,0.0,...,2019,3,5,3,1,0,1,0,0,0
1,25000.0,0.2,929.09,105000.0,low_risk,20.23,0.0,0.0,17.0,1.0,...,2019,3,5,1,2,0,1,0,0,0
2,20000.0,0.2,529.88,56000.0,low_risk,24.26,0.0,0.0,8.0,0.0,...,2019,3,5,1,2,0,1,0,0,0
3,10000.0,0.164,353.55,92000.0,low_risk,31.44,0.0,1.0,10.0,1.0,...,2019,3,5,3,2,0,1,0,0,0
4,22000.0,0.1474,520.39,52000.0,low_risk,18.76,0.0,1.0,14.0,0.0,...,2019,3,5,1,0,0,1,0,0,0


In [9]:
# Return columns from dataframe for any null or non numeric feature values pre clean
df.select_dtypes(include='object').info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 68817 entries, 0 to 68816
Data columns (total 1 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   loan_status  68817 non-null  object
dtypes: object(1)
memory usage: 537.8+ KB


In [10]:
# Check dataframe for any null or non numeric feature values post clean
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 68817 entries, 0 to 68816
Data columns (total 88 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   loan_amnt                   68817 non-null  float64
 1   int_rate                    68817 non-null  float64
 2   installment                 68817 non-null  float64
 3   annual_inc                  68817 non-null  float64
 4   loan_status                 68817 non-null  object 
 5   dti                         68817 non-null  float64
 6   delinq_2yrs                 68817 non-null  float64
 7   inq_last_6mths              68817 non-null  float64
 8   open_acc                    68817 non-null  float64
 9   pub_rec                     68817 non-null  float64
 10  revol_bal                   68817 non-null  float64
 11  total_acc                   68817 non-null  float64
 12  out_prncp                   68817 non-null  float64
 13  out_prncp_inv               688

### Initial Data Cleaning Assessment

The feature data is ready to split into testing and training sets, scaled and then passed to modelling

## Split the Data into Training and Testing

In [11]:
# Create our features
X = df.copy()

# Dropping homeowner and loan_statis columns
X.drop(["loan_status"], axis=1, inplace=True)

# Create our target
y = df["loan_status"]

In [12]:
X.describe()

Unnamed: 0,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,revol_bal,...,next_pymnt_year,issue_month_num,next_pymnt_month_num,home_ownership_le,verification_status_le,pymnt_plan_le,initial_list_status_le,application_type_le,hardship_flag_le,debt_settlement_flag_le
count,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,...,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0
mean,16677.594562,0.127718,480.652863,88213.71,21.778153,0.217766,0.497697,12.58734,0.12603,17604.142828,...,2019.0,1.726172,4.616839,1.812779,0.669994,0.0,0.876121,0.13966,0.0,0.0
std,10277.34859,0.04813,288.062432,115580.0,20.199244,0.718367,0.758122,6.022869,0.336797,21835.8804,...,0.0,0.743862,0.486161,0.941313,0.719105,0.0,0.329446,0.346637,0.0,0.0
min,1000.0,0.06,30.89,40.0,0.0,0.0,0.0,2.0,0.0,0.0,...,2019.0,1.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,9000.0,0.0881,265.73,50000.0,13.89,0.0,0.0,8.0,0.0,6293.0,...,2019.0,1.0,4.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
50%,15000.0,0.118,404.56,73000.0,19.76,0.0,0.0,11.0,0.0,12068.0,...,2019.0,2.0,5.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0
75%,24000.0,0.1557,648.1,104000.0,26.66,0.0,1.0,16.0,0.0,21735.0,...,2019.0,2.0,5.0,3.0,1.0,0.0,1.0,0.0,0.0,0.0
max,40000.0,0.3084,1676.23,8797500.0,999.0,18.0,5.0,72.0,4.0,587191.0,...,2019.0,3.0,5.0,3.0,2.0,0.0,1.0,1.0,0.0,0.0


In [13]:
# Check the balance of our target values
y.value_counts()

low_risk     68470
high_risk      347
Name: loan_status, dtype: int64

In [14]:
# Split the X and y into X_train, X_test, y_train, y_test
X_train, X_test, y_train, y_test= train_test_split(X, 
                                                   y, 
                                                   random_state=1, 
                                                   stratify=y)
# Examine X and y testing and training set shapes
#X_train.shape
#X_test.shape
#y_train.shape
y_test.shape

(17205,)

## Data Pre-Processing

Scale the training and testing data using the `StandardScaler` from `sklearn`. Remember that when scaling the data, you only scale the features data (`X_train` and `X_testing`).

In [15]:
# Create the StandardScaler instance
scaler = StandardScaler()

In [16]:
# Fit the Standard Scaler with the training data
# When fitting scaling functions, only train on the training dataset
X_scaler = scaler.fit(X_train)

In [17]:
# Scale the training and testing data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

## Ensemble Learners

In this section, you will compare two ensemble algorithms to determine which algorithm results in the best performance. You will train a Balanced Random Forest Classifier and an Easy Ensemble classifier . For each algorithm, be sure to complete the folliowing steps:

1. Train the model using the training data. 
2. Calculate the balanced accuracy score from sklearn.metrics.
3. Display the confusion matrix from sklearn.metrics.
4. Generate a classication report using the `imbalanced_classification_report` from imbalanced-learn.
5. For the Balanced Random Forest Classifier only, print the feature importance sorted in descending order (most important feature to least important) along with the feature score

Note: Use a random state of 1 for each algorithm to ensure consistency between tests

### Balanced Random Forest Classifier

In [18]:
# Resample the training data with the BalancedRandomForestClassifier
brf_model = RandomForestClassifier(n_estimators=100, random_state=1)
brf_model.fit(X_train_scaled, y_train)
y_pred = brf_model.predict(X_test_scaled)
brf_model

RandomForestClassifier(random_state=1)

In [19]:
# Calculated the balanced accuracy score
balanced_accuracy_score(y_test, y_pred)

0.6663745764692137

In [20]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[   29,    58],
       [   10, 17108]], dtype=int64)

In [21]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred, digits=16))

                        pre       rec       spe        f1       geo       iba       sup

       high_risk  0.7435897435897436 0.3333333333333333 0.9994158196050941 0.4603174603174603 0.5771816061994971 0.3109488274036350        87
        low_risk  0.9966212280088547 0.9994158196050941 0.3333333333333333 0.9980165674950415 0.5771816061994971 0.3553283856664278     17118

     avg / total  0.9953417314006325 0.9960476605637896 0.3367014923746378 0.9952976007804556 0.5771816061994972 0.3551039729045060     17205



In [22]:
# List the features sorted in descending order by feature importance
importances = brf_model.feature_importances_
sorted(zip(brf_model.feature_importances_, X.columns), reverse=True)

[(0.09103494104659202, 'total_rec_prncp'),
 (0.07100785388726943, 'total_rec_int'),
 (0.06648847235067998, 'total_pymnt'),
 (0.05895125860033097, 'last_pymnt_amnt'),
 (0.054815748726515026, 'total_pymnt_inv'),
 (0.0219652225585143, 'out_prncp_inv'),
 (0.01924753732244239, 'dti'),
 (0.019034097371012238, 'installment'),
 (0.018465077520802183, 'out_prncp'),
 (0.01714900701212202, 'mo_sin_old_rev_tl_op'),
 (0.016186386811046737, 'loan_amnt'),
 (0.01572972566926636, 'bc_open_to_buy'),
 (0.0155364837490489, 'mo_sin_old_il_acct'),
 (0.015196420504958882, 'max_bal_bc'),
 (0.015135665999943737, 'revol_bal'),
 (0.014587558276528419, 'total_rec_late_fee'),
 (0.014290545902401981, 'tot_cur_bal'),
 (0.014160603228495303, 'tot_hi_cred_lim'),
 (0.01390340710073431, 'total_bal_ex_mort'),
 (0.013767942958685562, 'issue_month_num'),
 (0.013574456472400636, 'bc_util'),
 (0.013287291026414176, 'annual_inc'),
 (0.01325607659054017, 'avg_cur_bal'),
 (0.01317326110408159, 'total_bc_limit'),
 (0.01294727425

### Easy Ensemble Classifier

In [23]:
# Train the Classifier
ee_model = EasyEnsembleClassifier(n_estimators=100, random_state=1)
ee_model.fit(X_train_scaled, y_train)
y_pred = ee_model.predict(X_test_scaled)
ee_model

EasyEnsembleClassifier(n_estimators=100, random_state=1)

In [24]:
# Calculated the balanced accuracy score
balanced_accuracy_score(y_test, y_pred)

0.9324304724609305

In [25]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[   80,     7],
       [  936, 16182]], dtype=int64)

In [26]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred, digits=16))

                        pre       rec       spe        f1       geo       iba       sup

       high_risk  0.0787401574803150 0.9195402298850575 0.9453207150368034 0.1450589301903898 0.9323413686091855 0.8670194320653232        87
        low_risk  0.9995676076348138 0.9453207150368034 0.9195402298850575 0.9716876332302520 0.9323413686091855 0.8715014231747749     17118

     avg / total  0.9949112874858199 0.9451903516419645 0.9196705932798962 0.9675076450195883 0.9323413686091855 0.8714787592267061     17205



### Final Questions

1. Which model had the best balanced accuracy score?

    - 0.6663745764692137 - Balanced Random Forest Classifier
    - **0.9324304724609305 - Easy Ensemble Classifier**
    
    **ANSWER:** Examining the figures above it can bee seen that the **Easy Ensemble Classifier** model had the best balanced accuracy score.

2. Which model had the best recall score?

    - **0.9960476605637896 - Balanced Random Forest Classifier**
    - 0.9451903516419645 - Easy Ensemble Classifier
    
    **ANSWER:** Examining the figures above it can bee seen that the **Balanced Random Forest Classifier** model had the best recall score.

3. Which model had the best geometric mean score?

    - 0.5771816061994972 - Balanced Random Forest Classifier
    - **0.9323413686091855 - Easy Ensemble Classifier**
    
    **ANSWER:** Examining the figures above it can bee seen that the **Easy Ensemble Classifier** model had the best geometric mean score.

4. What are the top three features?

     **ANSWER:** The following lists the top three features
         - 0.09103494104659202, 'total_rec_prncp'
         - 0.07100785388726943, 'total_rec_int'
         - 0.06648847235067998, 'total_pymnt'