# Credit Risk Resampling

In [1]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, recall_score
from imblearn.metrics import classification_report_imbalanced, geometric_mean_score
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import ClusterCentroids
from imblearn.combine import SMOTEENN

                the kernel may be left running.  Please let us know
                about your system (bitness, Python, etc.) at
                ipython-dev@scipy.org


In [2]:
# Load csv into DataFrame:
file_path = Path('Resources/lending_data.csv')
lending_df = pd.read_csv(file_path)
lending_df.head()

Unnamed: 0,loan_size,interest_rate,homeowner,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
0,10700.0,7.672,own,52800,0.431818,5,1,22800,low_risk
1,8400.0,6.692,own,43600,0.311927,3,0,13600,low_risk
2,9000.0,6.963,rent,46100,0.349241,3,0,16100,low_risk
3,10700.0,7.664,own,52700,0.43074,5,1,22700,low_risk
4,10800.0,7.698,mortgage,53000,0.433962,5,1,23000,low_risk


In [3]:
# Create LabelEncoder instance:
le = LabelEncoder()

# Encode the "loan_status" target column with the LabelEncoder:
le.fit(lending_df['loan_status'])
lending_df['loan_status'] = le.transform(lending_df['loan_status'])

# Encode the "homeowner" column with the LabelEncoder:
le.fit(lending_df["homeowner"])
lending_df["homeowner"] = le.transform(lending_df["homeowner"])

### Split data into Training and Testing

In [4]:
# Create features:
x = lending_df.copy()
x.drop('loan_status',axis=1,inplace=True)
x.describe()

Unnamed: 0,loan_size,interest_rate,homeowner,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt
count,77536.0,77536.0,77536.0,77536.0,77536.0,77536.0,77536.0,77536.0
mean,9805.562577,7.292333,0.606144,49221.949804,0.377318,3.82661,0.392308,19221.949804
std,2093.223153,0.889495,0.667811,8371.635077,0.081519,1.904426,0.582086,8371.635077
min,5000.0,5.25,0.0,30000.0,0.0,0.0,0.0,0.0
25%,8700.0,6.825,0.0,44800.0,0.330357,3.0,0.0,14800.0
50%,9500.0,7.172,1.0,48100.0,0.376299,4.0,0.0,18100.0
75%,10400.0,7.528,1.0,51400.0,0.416342,4.0,1.0,21400.0
max,23800.0,13.235,2.0,105200.0,0.714829,16.0,3.0,75200.0


In [5]:
# Create target:
y = lending_df['loan_status']

In [6]:
# Check the balance of target values:
y.value_counts()

1    75036
0     2500
Name: loan_status, dtype: int64

In [7]:
# Create X_train, X_test, y_train, y_test:
x_train, x_test, y_train, y_test = train_test_split(x,y,random_state=1)

## Data Pre-Processing

In [8]:
# Create a StandardScaler instance:
scaler = StandardScaler()

In [9]:
# Fit the Standard Scaler with training data:
x_scaler = scaler.fit(x_train)

In [10]:
# Scale the training and testing data:
x_train_scaled = x_scaler.transform(x_train)
x_test_scaled = x_scaler.transform(x_test)

# Simple Logistic Regression

In [11]:
# Create the Logistic Regression model:
lr_model = LogisticRegression(solver='lbfgs',random_state=1)

# Fit the model:
lr_model.fit(x_train,y_train)

LogisticRegression(random_state=1)

In [12]:
# Make predictions using the testing data then calculate Balanced Accuracy Score:
y_pred = lr_model.predict(x_test)
balanced_accuracy_score(y_test,y_pred)

0.9520479254722232

In [13]:
# Display the Confusion Matrix:
lr_cm = confusion_matrix(y_test,y_pred)
lr_cm_df = pd.DataFrame(lr_cm,index=['Actual 0','Actual 1'],columns=['Predicted 0','Predicted 1'])
lr_cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,563,56
Actual 1,102,18663


In [14]:
# Print Imbalanced Classification Report:
print(classification_report_imbalanced(y_test,y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.85      0.91      0.99      0.88      0.95      0.90       619
          1       1.00      0.99      0.91      1.00      0.95      0.91     18765

avg / total       0.99      0.99      0.91      0.99      0.95      0.91     19384



# Oversampling

1. View the count of the target classes. 
3. Use the resampled data to train a Logistic Regression model.
3. Calculate the Balanced Accuracy Score.
4. Print the Confusion Matrix.
5. Generate an Imbalanced Classication Report.

### Naive Random Oversampling

In [15]:
# Resample the training data with RandomOversampler:
ro_model = RandomOverSampler(random_state=1)
x_resampled, y_resampled = ro_model.fit_resample(x_train,y_train)

# View Count of target classes:
Counter(y_resampled)

Counter({1: 56271, 0: 56271})

In [16]:
# Train the Logistic Regression model using the resampled data:
ro_lr_model = LogisticRegression(solver='lbfgs',random_state=1)
ro_lr_model.fit(x_resampled,y_resampled)

LogisticRegression(random_state=1)

In [17]:
# Make predictions using the testing data then calculate the Balanced Accuracy Score:
ro_y_pred = ro_lr_model.predict(x_test)
balanced_accuracy_score(y_test,y_pred)

0.9520479254722232

In [18]:
# Display Confusion Matrix:
ro_cm = confusion_matrix(y_test,ro_y_pred)
ro_cm_df = pd.DataFrame(ro_cm,index=['Actual 0','Actual 1'],columns=['Predicted 0','Predicted 1'])
ro_cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,615,4
Actual 1,116,18649


In [19]:
# Print Imbalanced Classification Report:
print(classification_report_imbalanced(y_test,ro_y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.84      0.99      0.99      0.91      0.99      0.99       619
          1       1.00      0.99      0.99      1.00      0.99      0.99     18765

avg / total       0.99      0.99      0.99      0.99      0.99      0.99     19384



### SMOTE Oversampling

In [20]:
# Resample the training data with SMOTE:
x_resampled, y_resampled = SMOTE(random_state=1,sampling_strategy=1.0).fit_resample(x_train,y_train)

# View Count of target classes:
Counter(y_resampled)

Counter({1: 56271, 0: 56271})

In [21]:
# Train the Logistic Regression model using the resampled data:
smote_lr_model = LogisticRegression(solver='lbfgs',random_state=1)
smote_lr_model.fit(x_resampled,y_resampled)

LogisticRegression(random_state=1)

In [22]:
# Calculate the Balanced Accuracy Score:
smote_y_pred = smote_lr_model.predict(x_test)
balanced_accuracy_score(y_test,smote_y_pred)

0.9936781215845847

In [23]:
# Display Confusion Matrix:
confusion_matrix(y_test,smote_y_pred)

array([[  615,     4],
       [  116, 18649]], dtype=int64)

In [24]:
# Print Imbalanced Classification Report:
print(classification_report_imbalanced(y_test,smote_y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.84      0.99      0.99      0.91      0.99      0.99       619
          1       1.00      0.99      0.99      1.00      0.99      0.99     18765

avg / total       0.99      0.99      0.99      0.99      0.99      0.99     19384



# Undersampling

1. View the count of the target classes. 
3. Use the resampled data to train a Logistic Regression model.
3. Calculate the Balanced Accuracy score.
4. Display the Confusion Matrix.
5. Generate an Imbalanced Classication Report.

In [25]:
# Resample data using ClusterCentroids resampler:
cc_model = ClusterCentroids(random_state=1)
x_resampled, y_resampled = cc_model.fit_resample(x_train,y_train)

# View Count of target classes:
Counter(y_resampled)

Counter({0: 1881, 1: 1881})

In [26]:
# Train the Logistic Regression model using the resampled data:
cc_lr_model = LogisticRegression(solver='lbfgs',random_state=1)
cc_lr_model.fit(x_resampled,y_resampled)

LogisticRegression(random_state=1)

In [27]:
# Calculate Balanced Accuracy Score:
cc_y_pred = cc_lr_model.predict(x_test)
balanced_accuracy_score(y_test,cc_y_pred)

0.9865149130022852

In [28]:
# Display Confusion Matrix:
confusion_matrix(y_test,cc_y_pred)

array([[  606,    13],
       [  112, 18653]], dtype=int64)

In [29]:
# Print Imbalanced Classification Report:
print(classification_report_imbalanced(y_test,cc_y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.84      0.98      0.99      0.91      0.99      0.97       619
          1       1.00      0.99      0.98      1.00      0.99      0.97     18765

avg / total       0.99      0.99      0.98      0.99      0.99      0.97     19384



# Combination (Over and Under) Sampling

1. View Count of the target classes. 
3. Use the resampled data to train a Logistic Regression model.
3. Calculate the Balanced Accuracy Score.
4. Display the Confusion Matrix.
5. Generate an Imbalanced Classication Report.

In [30]:
# Resample training data with SMOTEENN:
smtn_model = SMOTEENN(random_state=0)
x_resampled, y_resampled = smtn_model.fit_resample(x_train,y_train)

# View Count of target classes:
Counter(y_resampled)

Counter({0: 55569, 1: 55925})

In [31]:
# Train Logistic Regression model using resampled data:
smtn_lr_model = LogisticRegression(solver='lbfgs',random_state=1).fit(x_resampled,y_resampled)

In [32]:
# Calculate Balanced Accuracy Score:
smtn_y_pred = smtn_lr_model.predict(x_test)
balanced_accuracy_score(y_test,smtn_y_pred)

0.9935182494822666

In [33]:
# Display Confusion Matrix:
confusion_matrix(y_test,smtn_y_pred)

array([[  615,     4],
       [  122, 18643]], dtype=int64)

In [34]:
# Print Imbalanced Classification Report:
print(classification_report_imbalanced(y_test,smtn_y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.83      0.99      0.99      0.91      0.99      0.99       619
          1       1.00      0.99      0.99      1.00      0.99      0.99     18765

avg / total       0.99      0.99      0.99      0.99      0.99      0.99     19384



# Final Questions and Analysis:

1. Which model had the best balanced accuracy score?
   * The SMOTEENN Logistic Regression Model had the best Balanced Accuracy Score of 0.9935182494822666.

2. Which model had the best recall score?
   * The ClusterCentroids SAMPLING model had the best recall score with a recall of recall_score 0.9940314415134559.
   * The Ssimple Logistic REGRESSION model had the bst overall recall score of 0.994564348521183.
The recall scores of each model are output below:

In [35]:
print(f'LogisticRegression Recall Score = {recall_score(y_test,y_pred)}\nRandomOverSampler Recall Score = {recall_score(y_test,ro_y_pred)}\nSMOTE Recall Score = {recall_score(y_test,smote_y_pred)}\nClusterCentroids Recall Score = {recall_score(y_test,cc_y_pred)}\nSMOTEENN Recall Score = {recall_score(y_test,smtn_y_pred)}')

LogisticRegression Recall Score = 0.994564348521183
RandomOverSampler Recall Score = 0.993818278710365
SMOTE Recall Score = 0.993818278710365
ClusterCentroids Recall Score = 0.9940314415134559
SMOTEENN Recall Score = 0.9934985345057288


3. Which model had the best geometric mean score?
    * The RandomOverSampler and SMOTE sampling models had the best geometric mean scores tied at 0.9940314415134559.

The geometric mean scores of each model are output below:

In [36]:
print(f'LogisticRegression Geometric Mean Score = {geometric_mean_score(y_test,y_pred)}\nRandomOverSampler Geometric Mean Score = {geometric_mean_score(y_test,ro_y_pred)}\nSMOTE Geometric Mean Score = {geometric_mean_score(y_test,smote_y_pred)}\nClusterCentroids Geometric Mean Score = {geometric_mean_score(y_test,cc_y_pred)}\nSMOTEENN Geometric Mean Score = {geometric_mean_score(y_test,smtn_y_pred)}')

LogisticRegression Geometric Mean Score = 0.9510981054376492
RandomOverSampler Geometric Mean Score = 0.9936781117000861
SMOTE Geometric Mean Score = 0.9936781117000861
ClusterCentroids Geometric Mean Score = 0.9864862773374231
SMOTEENN Geometric Mean Score = 0.9935182492866586
