In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.utils import resample
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.model_selection import GridSearchCV, KFold

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 10)

In [2]:
RANDOM_STATE=42
DATA_PATH = "../data/credit-risk-data_fct_customer.csv"
DROP_COLS = [
    "id",
    "home_ownership",
    "verification_status",
    "fico_range_low",
    "fico_range_high",
    "is_not_verified",
    "issue_d"
]
VERSION = "0.1.0"
MODEL_PATH = f"./model/xgboost_{VERSION}.pkl"

### Load the Data

In [3]:
data = pd.read_csv(DATA_PATH)
data.head()

Unnamed: 0,id,loan_status,loan_amnt,term,int_rate,installment,sub_grade,emp_length,home_ownership,is_mortgage,is_rent,is_own,is_any,is_other,annual_inc,verification_status,is_verified,is_not_verified,is_source_verified,issue_d,purpose,addr_state,dti,fico_range_low,fico_range_high,open_acc,pub_rec,revol_bal,revol_util,mort_acc,pub_rec_bankruptcies,age,pay_status
0,7289,False,5000,36,11.99,166.05,11,0,1,True,False,False,False,False,62500,1,True,True,True,2016-01-04,1,85,14.42,670,674,10,0,8495,48.0,0,0,29,-2
1,14756,False,15000,60,10.99,326.07,9,4,1,True,False,False,False,False,49000,1,True,True,True,2015-01-08,1,97,15.07,689,689,8,0,7596,71.0,2,0,45,2
2,10163,False,12000,36,11.47,395.55,10,1,1,True,False,False,False,False,62500,1,True,True,True,2016-01-03,1,74,25.89,660,664,9,1,16214,79.5,1,1,34,0
3,14007,True,2500,36,15.61,87.42,16,2,1,True,False,False,False,False,62500,1,True,True,True,2015-01-07,3,53,21.9,670,674,15,0,11161,75.9,1,0,52,0
4,469,True,17750,60,18.92,459.67,19,10,1,True,False,False,False,False,99000,1,True,True,True,2014-01-03,1,102,13.3,715,719,24,0,3940,19.1,2,0,40,0


### Feature Selection
Some of the features we have in this dataset are essentially repeating themselves. We will rid ourselves of redundancies, for starters. First of all, let's take the middle of the fico ranges and turn it into one single feature, "fico_score".

In [4]:
data["fico_score"] = round((data["fico_range_low"] + data["fico_range_high"]) / 2)
data = data.drop(DROP_COLS, axis=1)

### Explore the Data

In [5]:
data.describe()

Unnamed: 0,loan_amnt,term,int_rate,installment,sub_grade,emp_length,annual_inc,purpose,addr_state,dti,open_acc,pub_rec,revol_bal,revol_util,mort_acc,pub_rec_bankruptcies,age,pay_status,fico_score
count,58852.0,58852.0,58852.0,58852.0,58852.0,58852.0,58852.0,58852.0,58852.0,58852.0,58852.0,58852.0,58852.0,58852.0,58852.0,58852.0,58852.0,58852.0,58852.0
mean,14281.248726,14.010739,13.184463,434.544837,11.630854,5.645993,150311.9,2.256032,75.032437,17.993072,11.623122,0.216781,16196.97,51.938696,1.621678,0.134473,35.285309,0.054883,697.784119
std,8617.247099,17.8951,4.752709,258.650524,6.454525,3.837227,504534.4,2.357391,15.140317,8.339486,5.471044,0.589514,21133.72,24.458518,2.010912,0.381288,9.361171,1.136492,31.754917
min,1000.0,3.0,5.31,30.65,1.0,0.0,7000.0,1.0,52.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,20.0,-2.0,660.0
25%,7800.0,3.0,9.67,247.29,7.0,2.0,46000.0,1.0,61.0,11.7675,8.0,0.0,6009.0,33.7,0.0,0.0,28.0,-1.0,672.0
50%,12000.0,3.0,12.73,373.22,11.0,6.0,65000.0,1.0,74.0,17.52,11.0,0.0,11110.5,52.4,1.0,0.0,34.0,0.0,692.0
75%,20000.0,36.0,15.99,572.6025,15.0,10.0,90000.0,2.0,87.0,23.87,14.0,0.0,19836.25,70.8,3.0,0.0,41.0,1.0,712.0
max,40000.0,60.0,30.99,1607.8,35.0,10.0,6998721.0,14.0,102.0,49.94,67.0,21.0,1044210.0,162.0,24.0,8.0,78.0,9.0,850.0


In [6]:
data.shape

(58852, 27)

In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58852 entries, 0 to 58851
Data columns (total 27 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   loan_status           58852 non-null  bool   
 1   loan_amnt             58852 non-null  int64  
 2   term                  58852 non-null  int64  
 3   int_rate              58852 non-null  float64
 4   installment           58852 non-null  float64
 5   sub_grade             58852 non-null  int64  
 6   emp_length            58852 non-null  int64  
 7   is_mortgage           58852 non-null  bool   
 8   is_rent               58852 non-null  bool   
 9   is_own                58852 non-null  bool   
 10  is_any                58852 non-null  bool   
 11  is_other              58852 non-null  bool   
 12  annual_inc            58852 non-null  int64  
 13  is_verified           58852 non-null  bool   
 14  is_source_verified    58852 non-null  bool   
 15  purpose            

In [8]:
# full Pearson Correlation table
data.corr().style.background_gradient(cmap='viridis')

Unnamed: 0,loan_status,loan_amnt,term,int_rate,installment,sub_grade,emp_length,is_mortgage,is_rent,is_own,is_any,is_other,annual_inc,is_verified,is_source_verified,purpose,addr_state,dti,open_acc,pub_rec,revol_bal,revol_util,mort_acc,pub_rec_bankruptcies,age,pay_status,fico_score
loan_status,1.0,0.064174,0.032269,0.25234,0.051043,0.26016,-0.028091,-0.068599,0.06848,0.002169,-0.002749,0.008699,-0.027231,0.090064,0.025871,-0.006325,-0.003826,0.113903,0.022544,0.028538,-0.018874,0.062255,-0.065576,0.027181,0.174805,0.116648,-0.131404
loan_amnt,0.064174,1.0,0.070513,0.140879,0.954052,0.146905,0.116076,0.174821,-0.164987,-0.020928,-0.006116,-0.002498,0.183512,0.207024,0.036946,-0.120672,0.013783,0.025835,0.185292,-0.064542,0.341838,0.110981,0.226861,-0.096092,0.013058,0.008459,0.101943
term,0.032269,0.070513,1.0,0.077108,0.024905,0.081259,0.021197,0.024614,-0.0222,-0.004561,0.000146,-0.001536,0.232129,0.02024,0.007466,-0.006103,0.001657,0.011809,0.01029,-0.006258,0.014482,0.011488,0.023237,-0.007964,0.002482,0.019406,0.013929
int_rate,0.25234,0.140879,0.077108,1.0,0.15183,0.976408,0.000714,-0.0654,0.066309,0.000816,-0.000839,-0.001511,-0.042026,0.221783,0.004313,-0.001358,0.00171,0.182008,-0.009441,0.057738,-0.029982,0.243384,-0.074903,0.056719,0.048107,0.028209,-0.405108
installment,0.051043,0.954052,0.024905,0.15183,1.0,0.148957,0.101459,0.143054,-0.1369,-0.014131,-0.005355,-0.001821,0.181365,0.203491,0.027421,-0.115348,0.007554,0.032082,0.174762,-0.051335,0.326691,0.12979,0.196065,-0.088842,0.01011,0.00484,0.053698
sub_grade,0.26016,0.146905,0.081259,0.976408,0.148957,1.0,-0.001314,-0.069346,0.068559,0.003663,-0.00206,-0.001336,-0.042297,0.23323,0.020665,-0.000249,0.004485,0.192831,-0.006091,0.066715,-0.026573,0.250823,-0.077843,0.061156,0.050547,0.030867,-0.422757
emp_length,-0.028091,0.116076,0.021197,0.000714,0.101459,-0.001314,1.0,0.173813,-0.171712,-0.008635,-0.006379,-0.003274,0.036311,-0.015244,0.006723,-0.02401,0.004553,0.026,0.057474,0.006876,0.093751,0.048894,0.166104,-0.00026,-0.001208,-0.003934,0.019563
is_mortgage,-0.068599,0.174821,0.024614,-0.0654,0.143054,-0.069346,0.173813,1.0,-0.803379,-0.342393,-0.012187,-0.013473,0.072667,-0.0073,-0.033667,0.017703,0.0594,-0.006868,0.119765,-0.006696,0.156317,0.028771,0.468049,-0.008448,-0.011048,-0.015588,0.102435
is_rent,0.06848,-0.164987,-0.0222,0.066309,-0.1369,0.068559,-0.171712,-0.803379,1.0,-0.283287,-0.010083,-0.011147,-0.069773,0.007983,0.031934,-0.042894,-0.078844,-0.008163,-0.123247,0.004818,-0.160856,0.00519,-0.443304,0.012468,0.008549,0.016647,-0.118753
is_own,0.002169,-0.020928,-0.004561,0.000816,-0.014131,0.003663,-0.008635,-0.342393,-0.283287,1.0,-0.004297,-0.004751,-0.00677,-0.000483,0.004052,0.038843,0.028685,0.024336,0.002048,0.00331,0.002344,-0.054229,-0.054085,-0.006117,0.00397,-0.001204,0.022303


In [9]:
# I want to remove 1 of the pairs which are correlated around 70% or above
# Derogatory public records and public record bankruptcies are highly correlated at around 68%
data = data.drop("pub_rec_bankruptcies", axis=1)

### Balancing Classes

In [10]:
# Split data on majority and minority.. minority is loan_status == Trye
# minority = data[data.loan_status==True]
# majority = data[data.loan_status==False]

# print('Minority size:', minority.shape)
# print('Majority size:', majority.shape)

# # choosing upsample as even now we do not have too much data
# minority_upsample = resample(minority, replace=True, n_samples=majority.shape[0], random_state=RANDOM_STATE)
# print('Minority upsampled size:', minority_upsample.shape)

# # merge majority with upsampled minority
# data = pd.concat([minority_upsample, majority], axis=0)

### Prepare the Data for Training

In [11]:
# Create a holdout set
# holdout_data = data.sample(frac=0.1, replace=False, random_state=RANDOM_STATE)
# data = data.drop(holdout_data.index)

In [12]:
# Split data into X and y
X = data[data.columns.drop("loan_status")].values
y = data["loan_status"].values
# X_holdout = holdout_data[holdout_data.columns.drop("loan_status")].values
# y_holdout = holdout_data["loan_status"].values

In [13]:
# to split the dataset for training and testing 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE)
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(47081, 25)
(47081,)
(11771, 25)
(11771,)


### Fit Model to Training Data

In [14]:
# Fit model no training data
model = XGBClassifier()
model.fit(X_train, y_train)

### Performance

In [15]:
# Random Forest
model = RandomForestClassifier(n_jobs=-1,random_state=123)
model.fit(X_train, y_train)
y_train_hat = model.predict(X_train)
y_test_hat = model.predict(X_test)

print(model)
print('Train performance')
print('-------------------------------------------------------')
print(classification_report(y_train, y_train_hat))

print('Test performance')
print('-------------------------------------------------------')
print(classification_report(y_test, y_test_hat))

print('Roc_auc score')
print('-------------------------------------------------------')
print(roc_auc_score(y_test, y_test_hat))
print('')

print('Accuracy score')
print('-------------------------------------------------------')
print(accuracy_score(y_test, y_test_hat))
print('')

print('Confusion matrix')
print('-------------------------------------------------------')
print(confusion_matrix(y_test, y_test_hat))

RandomForestClassifier(n_jobs=-1, random_state=123)
Train performance
-------------------------------------------------------
              precision    recall  f1-score   support

       False       1.00      1.00      1.00     37675
        True       1.00      1.00      1.00      9406

    accuracy                           1.00     47081
   macro avg       1.00      1.00      1.00     47081
weighted avg       1.00      1.00      1.00     47081

Test performance
-------------------------------------------------------
              precision    recall  f1-score   support

       False       0.83      0.98      0.89      9405
        True       0.65      0.18      0.28      2366

    accuracy                           0.82     11771
   macro avg       0.74      0.58      0.59     11771
weighted avg       0.79      0.82      0.77     11771

Roc_auc score
-------------------------------------------------------
0.5771106760985304

Accuracy score
------------------------------------------

In [16]:
model = XGBClassifier(random_state=123)
model.fit(X_train, y_train)
y_train_hat = model.predict(X_train)
y_test_hat = model.predict(X_test)

print(model)
print('Train performance')
print('-------------------------------------------------------')
print(classification_report(y_train, y_train_hat))

print('Test performance')
print('-------------------------------------------------------')
print(classification_report(y_test, y_test_hat))

print('Roc_auc score')
print('-------------------------------------------------------')
print(roc_auc_score(y_test, y_test_hat))
print('')

print('Accuracy score')
print('-------------------------------------------------------')
print(accuracy_score(y_test, y_test_hat))
print('')

print('Confusion matrix')
print('-------------------------------------------------------')
print(confusion_matrix(y_test, y_test_hat))

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=None, num_parallel_tree=None,
              predictor=None, random_state=123, ...)
Train performance
-------------------------------------------------------
              precision    recall  f1-score   support

       False       0.87      0.98      0.92     37675
        True       0.85      0.41      0.55      9406

    accurac

### Model Optimization

In [17]:
# Random forest
# n_jobs=-1 # to allow run it on all cores
# params = {
#     'n_estimators': [100, 200, 500],
#     'criterion': ['gini', 'entropy'],
#     'min_samples_split': [1,2,4,5],
#     'min_samples_leaf': [1,2,4,5],
#     'max_leaf_nodes': [4,10,20,50,None]
# }

# gs1 = GridSearchCV(RandomForestClassifier(n_jobs=-1), params, n_jobs=-1, cv=KFold(n_splits=3), scoring='roc_auc')
# gs1.fit(X_train, y_train)

# print('Best score:', gs1.best_score_)
# print('Best score:', gs1.best_params_)

In [18]:
# XGBoost
# n_jobs=-1 to allow run it on all cores
# params = {
#     'n_estimators': [100, 200, 500],
#     'learning_rate': [0.01,0.05,0.1],
#     'booster': ['gbtree', 'gblinear'],
#     'gamma': [0, 0.5, 1],
#     'reg_alpha': [0, 0.5, 1],
#     'reg_lambda': [0.5, 1, 5],
#     'base_score': [0.2, 0.5, 1]
# }

# gs2 = GridSearchCV(XGBClassifier(n_jobs=-1), params, n_jobs=-1, cv=KFold(n_splits=3), scoring='roc_auc')
# gs2.fit(X_train, y_train)

# print('Best score:', gs2.best_score_)
# print('Best score:', gs2.best_params_)

### Saving Model

In [19]:
import pickle

# save
pickle.dump(model, open(MODEL_PATH, "wb"))

# load
xgb_model_loaded = pickle.load(open(MODEL_PATH, "rb"))

# test
xgb_model_loaded.predict(X_test)[0] == model.predict(X_test)[0]

True