# Ensembles:

In [1]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter
from sklearn.metrics import balanced_accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from imblearn.metrics import classification_report_imbalanced
from imblearn.ensemble import BalancedRandomForestClassifier, EasyEnsembleClassifier

                the kernel may be left running.  Please let us know
                about your system (bitness, Python, etc.) at
                ipython-dev@scipy.org


In [None]:
# Load csv to DataFrame:
file_path = Path('Resources/LoanStats_2019Q1.csv')
stats_df = pd.read_csv(file_path)
stats_df.head()

In [None]:
# Create LabelEncoder instance:
le = LabelEncoder()

# Encode home_ownership column:
le.fit(stats_df['home_ownership'])
stats_df['home_ownership'] = le.transform(stats_df['home_ownership'])

# Encode verification_status column:
le.fit(stats_df['verification_status'])
stats_df['verification_status'] = le.transform(stats_df['verification_status'])

# Encode issue_d column:
le.fit(stats_df['issue_d'])
stats_df['issue_d'] = le.transform(stats_df['issue_d'])

# Encode pymnt_plan column:
le.fit(stats_df["pymnt_plan"])
stats_df['pymnt_plan'] = le.transform(stats_df['pymnt_plan'])

# Encode loan_status column:
le.fit(stats_df['loan_status'])
stats_df['loan_status'] = le.transform(stats_df['loan_status'])

# Encode next_pymnt_d column:
le.fit(stats_df['next_pymnt_d'])
stats_df['next_pymnt_d'] = le.transform(stats_df['next_pymnt_d'])

# Encode application_type column:
le.fit(stats_df['application_type'])
stats_df['application_type'] = le.transform(stats_df['application_type'])


# Encode initial_list_status column:
le.fit(stats_df['initial_list_status'])
stats_df['initial_list_status'] = le.transform(stats_df['initial_list_status'])

# Encode hardship_flag column:
le.fit(stats_df['hardship_flag'])
stats_df['hardship_flag'] = le.transform(stats_df['hardship_flag'])

# Encode debt_settlement_flag column:
le.fit(stats_df['debt_settlement_flag'])
stats_df['debt_settlement_flag'] = le.transform(stats_df['debt_settlement_flag'])

stats_df.head()

## Split the Data into Training and Testing

In [4]:
# Define target:
y = stats_df['loan_status']

# Check the balance of target values:
y.value_counts()

1    68470
0      347
Name: loan_status, dtype: int64

In [5]:
# Define features:
x = stats_df.copy()
x.drop('loan_status',axis=1,inplace=True)
x.describe()

Unnamed: 0,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,issue_d,pymnt_plan,dti,delinq_2yrs,...,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag
count,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,...,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0
mean,16677.594562,0.127718,480.652863,1.812779,88213.71,0.669994,0.805542,0.0,21.778153,0.217766,...,95.057627,30.626217,0.125972,0.0,210033.2,61338.43,29734.128558,55722.4,0.0,0.0
std,10277.34859,0.04813,288.062432,0.941313,115580.0,0.719105,0.714932,0.0,20.199244,0.718367,...,8.326426,33.631463,0.336732,0.0,192808.8,57387.98,26795.394232,50958.45,0.0,0.0
min,1000.0,0.06,30.89,0.0,40.0,0.0,0.0,0.0,0.0,0.0,...,20.0,0.0,0.0,0.0,3600.0,235.0,100.0,127.0,0.0,0.0
25%,9000.0,0.0881,265.73,1.0,50000.0,0.0,0.0,0.0,13.89,0.0,...,93.0,0.0,0.0,0.0,66977.0,26503.0,11600.0,22880.0,0.0,0.0
50%,15000.0,0.118,404.56,1.0,73000.0,1.0,1.0,0.0,19.76,0.0,...,100.0,20.0,0.0,0.0,146710.0,45357.0,22100.0,42000.0,0.0,0.0
75%,24000.0,0.1557,648.1,3.0,104000.0,1.0,1.0,0.0,26.66,0.0,...,100.0,50.0,0.0,0.0,303640.0,76570.0,39300.0,72499.0,0.0,0.0
max,40000.0,0.3084,1676.23,3.0,8797500.0,2.0,2.0,0.0,999.0,18.0,...,100.0,100.0,4.0,0.0,3292782.0,1295455.0,509400.0,1426964.0,0.0,0.0


In [6]:
# Split x and y into x_train, x_test, y_train, y_test:
x_train, x_test, y_train, y_test = train_test_split(x,y,random_state=78)

## Data Pre-Processing

Scale the training and testing data using `StandardScaler`.

In [7]:
# Create a SandardScaler instance:
scaler = StandardScaler()

# Fit the Standard Scaler:
x_scaler = scaler.fit(x_train)

# Scale training and testing data:
x_train_scaled = x_scaler.transform(x_train)
x_test_scaled = x_scaler.transform(x_test)

## Ensemble Learners

Compare the BalancedRandomForestClassifier and EasyEnsembleClassifier algorithms and determine which results in the best performance then complete the folliowing for each algorithm:

1. Train the model using the training data. 
2. Calculate the Balanced Accuracy Score from sklearn.metrics.
3. Display the Confusion Matrix from sklearn.metrics.
4. Generate an Imbalanced Classification Report from imbalanced-learn.
5. For the Balanced Random Forest Classifier, output the feature importance in descending order (most important to least important) along with the feature score.

### Balanced Random Forest Classifier

In [8]:
# Resample the training data with the BalancedRandomForestClassifier
brf_model = BalancedRandomForestClassifier(n_estimators=100,random_state=1)
brf_model = brf_model.fit(x_train_scaled,y_train)

In [9]:
# Calculate the balanced accuracy score:
brf_predictions = brf_model.predict(x_test_scaled)
brf_ba_score = balanced_accuracy_score(y_test,brf_predictions)
display(brf_ba_score)

0.7518737344353009

In [19]:
# Display the confusion matrix:
brf_cm = confusion_matrix(y_test,brf_predictions)
brf_cm_df = pd.DataFrame(brf_cm,index=['Actual 0','Actual 1'],columns=['Predicted 0','Predicted 1'])
brf_cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,58,34
Actual 1,2168,14945


In [11]:
# Print the imbalanced classification report:
print(classification_report_imbalanced(y_test,brf_predictions))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.03      0.63      0.87      0.05      0.74      0.54        92
          1       1.00      0.87      0.63      0.93      0.74      0.56     17113

avg / total       0.99      0.87      0.63      0.93      0.74      0.56     17205



In [None]:
# List features in descending order by importance:
importance_sorted = sorted(zip(brf_model.feature_importances_,x.columns),reverse=True)
display(importance_sorted)

### Easy Ensemble Classifier

In [13]:
# Train the EasyEnsembleClassifier:
eec_model = EasyEnsembleClassifier(n_estimators=100,random_state=1)
eec_model.fit(x_train_scaled,y_train)
y_pred_eec = eec_model.predict(x_test)

In [14]:
# Calculate balanced accuracy score:
eec_ba_score = balanced_accuracy_score(y_test,y_pred_eec)
display(eec_ba_score)

0.48017112594290123

In [15]:
# Display confusion matrix:
eec_cm = confusion_matrix(y_test,y_pred_eec)
eec_cm_df = pd.DataFrame(eec_cm,index=['Actual 0','Actual 1'],columns=['Predicted 0','Predicted 1'])
eec_cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,31,61
Actual 1,6445,10668


In [16]:
# Print imbalanced classification report:
print(classification_report_imbalanced(y_test,y_pred_eec))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.00      0.34      0.62      0.01      0.46      0.20        92
          1       0.99      0.62      0.34      0.77      0.46      0.22     17113

avg / total       0.99      0.62      0.34      0.76      0.46      0.22     17205



### Final Questions

1. Which model had the best balanced accuracy score?

    The Balanced Random Forest Classifier had the best Balanced Accuracy Score by about 27%

2. Which model had the best recall score?

    The Balanced Random Forest Classifier had the best Recall Score by 15%

3. Which model had the best geometric mean score?

    YOUR ANSWER HERE.

4. What are the top three features?

    YOUR ANSWER HERE.