# Ensemble Learning

## Initial Imports

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

In [3]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

## Read the CSV and Perform Basic Data Cleaning

In [4]:
# Load the data
file_path = Path('Resources/LoanStats_2019Q1.csv')
df = pd.read_csv(file_path)

# Preview the data
df.head()

Unnamed: 0,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,dti,...,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag
0,10500.0,0.1719,375.35,RENT,66000.0,Source Verified,Mar-2019,low_risk,n,27.24,...,85.7,100.0,0.0,0.0,65687.0,38199.0,2000.0,61987.0,N,N
1,25000.0,0.2,929.09,MORTGAGE,105000.0,Verified,Mar-2019,low_risk,n,20.23,...,91.2,50.0,1.0,0.0,271427.0,60641.0,41200.0,49197.0,N,N
2,20000.0,0.2,529.88,MORTGAGE,56000.0,Verified,Mar-2019,low_risk,n,24.26,...,66.7,50.0,0.0,0.0,60644.0,45684.0,7500.0,43144.0,N,N
3,10000.0,0.164,353.55,RENT,92000.0,Verified,Mar-2019,low_risk,n,31.44,...,100.0,50.0,1.0,0.0,99506.0,68784.0,19700.0,76506.0,N,N
4,22000.0,0.1474,520.39,MORTGAGE,52000.0,Not Verified,Mar-2019,low_risk,n,18.76,...,100.0,0.0,0.0,0.0,219750.0,25919.0,27600.0,20000.0,N,N


## Split the Data into Training and Testing

In [10]:
# Create our features
# I am trying to predict loan_status so x becomes everything minus loan_status 
X = df.drop(columns =['loan_status'])
# getting dummie variables (numerical data) for columns with strings 
X  =pd.get_dummies(X)
# Create our target
y = df.loan_status

In [11]:
X.describe()

Unnamed: 0,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,revol_bal,...,issue_d_Mar-2019,pymnt_plan_n,initial_list_status_f,initial_list_status_w,next_pymnt_d_Apr-2019,next_pymnt_d_May-2019,application_type_Individual,application_type_Joint App,hardship_flag_N,debt_settlement_flag_N
count,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,...,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0
mean,16677.594562,0.127718,480.652863,88213.71,21.778153,0.217766,0.497697,12.58734,0.12603,17604.142828,...,0.177238,1.0,0.123879,0.876121,0.383161,0.616839,0.86034,0.13966,1.0,1.0
std,10277.34859,0.04813,288.062432,115580.0,20.199244,0.718367,0.758122,6.022869,0.336797,21835.8804,...,0.381873,0.0,0.329446,0.329446,0.486161,0.486161,0.346637,0.346637,0.0,0.0
min,1000.0,0.06,30.89,40.0,0.0,0.0,0.0,2.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
25%,9000.0,0.0881,265.73,50000.0,13.89,0.0,0.0,8.0,0.0,6293.0,...,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0
50%,15000.0,0.118,404.56,73000.0,19.76,0.0,0.0,11.0,0.0,12068.0,...,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0
75%,24000.0,0.1557,648.1,104000.0,26.66,0.0,1.0,16.0,0.0,21735.0,...,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0
max,40000.0,0.3084,1676.23,8797500.0,999.0,18.0,5.0,72.0,4.0,587191.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [13]:
# Check the balance of our target values 
# balance for y variable 
y.value_counts()

low_risk     68470
high_risk      347
Name: loan_status, dtype: int64

In [17]:
# comparing against my trained data to see the difference 
X.shape

(68817, 95)

In [16]:
# library needed to split X and y into test and train 
from sklearn.model_selection import train_test_split
# Split the X and y into X_train, X_test, y_train, y_test
X_train,X_test, y_train, y_test = train_test_split(X,y)
# looking at my train data 
X_train.shape

(51612, 95)

## Data Pre-Processing

Scale the training and testing data using the `StandardScaler` from `sklearn`. Remember that when scaling the data, you only scale the features data (`X_train` and `X_testing`).

In [20]:
# importing the StandardScaler library 
from sklearn.preprocessing import StandardScaler
# Create the StandardScaler instance
scaler = StandardScaler()
scaler

StandardScaler()

In [25]:
# Fit the Standard Scaler with the training data
# When fitting scaling functions, only train on the training dataset
X_train_scaled = scaler.fit(X_train)
# no need to fit the y_train because it is already in a small scale 

In [31]:
# Scale the training and testing data
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

## Ensemble Learners

In this section, you will compare two ensemble algorithms to determine which algorithm results in the best performance. You will train a Balanced Random Forest Classifier and an Easy Ensemble classifier . For each algorithm, be sure to complete the folliowing steps:

1. Train the model using the training data. 
2. Calculate the balanced accuracy score from sklearn.metrics.
3. Display the confusion matrix from sklearn.metrics.
4. Generate a classication report using the `imbalanced_classification_report` from imbalanced-learn.
5. For the Balanced Random Forest Classifier only, print the feature importance sorted in descending order (most important feature to least important) along with the feature score

Note: Use a random state of 1 for each algorithm to ensure consistency between tests

### Balanced Random Forest Classifier

In [37]:
from sklearn.ensemble import RandomForestClassifier
# n_estimators = the number of trees in the forest 
rf_model = RandomForestClassifier(n_estimators = 100,random_state = 1)
# Resample the training data with the BalancedRandomForestClassifier
rf_model.fit(X_train_scaled, y_train)

RandomForestClassifier(random_state=1)

In [42]:
# to calculate the balanced accuracy score I need to have a prediction variable 
ypredictions = rf_model.predict(X_test_scaled)
# Calculated the balanced accuracy score
balanced_accuracy_score(y_test,ypredictions)

0.6750327556609668

In [48]:
# Display the confusion matrix
confusion_matrix(y_test,ypredictions)

array([[   27,    50],
       [   10, 17118]])

In [45]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test,ypredictions))

                   pre       rec       spe        f1       geo       iba       sup

  high_risk       0.73      0.35      1.00      0.47      0.59      0.33        77
   low_risk       1.00      1.00      0.35      1.00      0.59      0.37     17128

avg / total       1.00      1.00      0.35      1.00      0.59      0.37     17205



In [51]:
# List the features sorted in descending order by feature importance
# Calculating feature importance 
importance_list = rf_model.feature_importances_
importance_list

array([0.0163004 , 0.0113883 , 0.01862736, 0.01330766, 0.01732946,
       0.00332639, 0.00541426, 0.00795046, 0.00201651, 0.01405321,
       0.00987628, 0.02191997, 0.02094996, 0.07724181, 0.06479989,
       0.07716515, 0.06476262, 0.01079304, 0.        , 0.        ,
       0.06249164, 0.00127254, 0.        , 0.        , 0.00517379,
       0.0134973 , 0.00508323, 0.0059197 , 0.00462676, 0.00647127,
       0.0113169 , 0.0130272 , 0.0106463 , 0.00535571, 0.00817614,
       0.01496502, 0.01101236, 0.01468297, 0.00620063, 0.00730763,
       0.00883907, 0.00726405, 0.01369599, 0.01545744, 0.01284235,
       0.00015712, 0.        , 0.01418668, 0.01672154, 0.00890024,
       0.00989586, 0.00573166, 0.00982203, 0.00971167, 0.00327626,
       0.00742667, 0.00882476, 0.00586919, 0.00751881, 0.00896986,
       0.00860267, 0.00987502, 0.00854773, 0.00880193, 0.        ,
       0.        , 0.00157465, 0.00529176, 0.00700786, 0.00646381,
       0.00126074, 0.        , 0.01314656, 0.01423185, 0.01405

In [61]:
# sorting the feature importances in descending order
#zip gives allows us to enter the labels for the feature importance sort list
sorted(zip(rf_model.feature_importances_,X.columns[:10]),reverse = True)

[(0.01862735688222908, 'installment'),
 (0.01732946436490145, 'dti'),
 (0.01630040292246222, 'loan_amnt'),
 (0.014053207422605594, 'revol_bal'),
 (0.013307655242154541, 'annual_inc'),
 (0.011388300192680015, 'int_rate'),
 (0.007950460722673629, 'open_acc'),
 (0.005414259447478468, 'inq_last_6mths'),
 (0.003326390263855416, 'delinq_2yrs'),
 (0.0020165067808196853, 'pub_rec')]

### Easy Ensemble Classifier

In [74]:
# importing appropriate library 
from sklearn.ensemble import GradientBoostingClassifier
# Train the Classifier
learning_rates = 0.10
classifier = GradientBoostingClassifier(
    n_estimators = 100,
    learning_rate =learning_rates,
    random_state =1,
    max_leaf_nodes = 8,
    max_depth = 5
)
# fitting the model 
classifier.fit(X_train_scaled,y_train.ravel())

GradientBoostingClassifier(max_depth=5, max_leaf_nodes=8, random_state=1)

In [83]:
# getting prediction variable 
predictions = classifier.predict(X_test_scaled)
# Calculated the balanced accuracy score
bca = balanced_accuracy_score(y_test, predictions)
print(f"The balanced accuracy score is: {bca}")

The balanced accuracy score is: 0.713526723160072


In [84]:
# Display the confusion matrix
confusion_matrix(y_test,predictions)

array([[   33,    44],
       [   26, 17102]])

In [86]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, predictions))

                   pre       rec       spe        f1       geo       iba       sup

  high_risk       0.56      0.43      1.00      0.49      0.65      0.40        77
   low_risk       1.00      1.00      0.43      1.00      0.65      0.45     17128

avg / total       1.00      1.00      0.43      1.00      0.65      0.45     17205



### Final Q & A's 

1. Which model had the best balanced accuracy score?

   The Gradient Boosting Classifer has the best balanced accurary score of the two models.

2. Which model had the best recall score?

    Both models have the same average recall score. 

3. Which model had the best geometric mean score?

    The Gradient Boosting Classifer model has the best geometric mean. 

4. What are the top three features?

    The top three features are installment, dti and loan amount. 