Data preparation from google drive

In [None]:
import pandas as pd
import gdown

meta_url = 'https://drive.google.com/uc?export=download&confirm=pbef&id='
link = "1Mx4L0JvFRqboRuB6H7Fp9lO4l-DnJXDU"
url = meta_url + link
output = 'data.csv'
gdown.download(url, output, quiet=True)
data = pd.read_csv(output)

  exec(code_obj, self.user_global_ns, self.user_ns)


Make target from loan status to 1 and 0 -> still loaning and not loaning again

In [None]:
data['loan_status'].value_counts()

Current                                                224226
Fully Paid                                             184739
Charged Off                                             42475
Late (31-120 days)                                       6900
In Grace Period                                          3146
Does not meet the credit policy. Status:Fully Paid       1988
Late (16-30 days)                                        1218
Default                                                   832
Does not meet the credit policy. Status:Charged Off       761
Name: loan_status, dtype: int64

In [None]:
target = data['loan_status'].str.contains('Charged Off').astype(int)
data['loan_status'] = target

Drop the columns that has many missing value

In [None]:
data.nunique().sort_values(ascending=False)[:10]

Unnamed: 0         466285
url                466285
member_id          466285
id                 466285
total_pymnt        351609
total_pymnt_inv    347659
total_rec_int      270249
tot_cur_bal        220690
emp_title          205475
last_pymnt_amnt    198194
dtype: int64

In [None]:
tmp = (data.isnull().sum().sort_values(ascending=False)>20000)
data.drop(tmp[tmp].index, axis=1, inplace=True)

Drop columns that has many unique value for categorical

In [None]:
data.select_dtypes('object').nunique().sort_values(ascending=False)[:10]

url                   466285
title                  63099
zip_code                 888
earliest_cr_line         664
last_credit_pull_d       103
last_pymnt_d              98
issue_d                   91
addr_state                50
sub_grade                 35
purpose                   14
dtype: int64

In [None]:
tmp = (data.select_dtypes('object').nunique()>1000)
data.drop(tmp[tmp].index, axis=1, inplace=True)

Drop the first three columns because irrevelant

In [None]:
data.drop(data.iloc[:,0:3].columns, axis=1, inplace=True)

Imputer on the numerical missing value with iterative to learn pattern

In [None]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

imputer = IterativeImputer()
x = imputer.fit_transform(data.select_dtypes(exclude='object'))

Impute the categorical with most frequent value

In [None]:
import pandas as pd
import numpy as np

from sklearn.base import TransformerMixin

class DataFrameImputer(TransformerMixin):

    def __init__(self):
        pass
        
    def fit(self, X, y=None):

        self.fill = pd.Series([X[c].value_counts().index[0]
            if X[c].dtype == np.dtype('O') else X[c].mean() for c in X],
            index=X.columns)

        return self

    def transform(self, X, y=None):
        return X.fillna(self.fill)

z = DataFrameImputer().fit_transform(data.select_dtypes(include='object'))

In [2]:
pip install catboost --quiet

Make the dataframe from numerical and categorical value

In [None]:
num = pd.DataFrame(x ,columns = data.select_dtypes(exclude='object').columns)
cat = pd.DataFrame(z ,columns = data.select_dtypes(include='object').columns)

In [None]:
num.drop("loan_status", axis=1, inplace=True)
data.drop("loan_status", axis=1, inplace=True)

Standardize the numerical columns

In [None]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
num = scaler.fit_transform(num)
num = pd.DataFrame(num ,columns = data.select_dtypes(exclude='object').columns)

In [None]:
data = pd.concat([num, cat], axis=1)

Data preview

In [None]:
data

Unnamed: 0,loan_amnt,funded_amnt,funded_amnt_inv,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,...,issue_d,pymnt_plan,purpose,zip_code,addr_state,earliest_cr_line,initial_list_status,last_pymnt_d,last_credit_pull_d,application_type
0,-1.124392,-1.122963,-1.114455,-0.729587,-1.105575,-0.896551,1.328632,-0.357034,0.178864,-1.641510,...,Dec-11,n,credit_card,860xx,AZ,Jan-85,f,Jan-15,Jan-16,INDIVIDUAL
1,-1.426088,-1.425101,-1.412732,0.330634,-1.528763,-0.787387,-2.065791,-0.357034,3.843321,-1.641510,...,Dec-11,n,car,309xx,GA,Apr-99,f,Apr-13,Sep-13,INDIVIDUAL
2,-1.438156,-1.437186,-1.424784,0.488979,-1.428140,-1.110294,-1.082491,-0.357034,1.094978,-1.842014,...,Dec-11,n,small_business,606xx,IL,Nov-01,f,Jun-14,Jan-16,INDIVIDUAL
3,-0.521001,-0.518687,-0.508860,-0.077850,-0.380931,-0.438063,0.354248,-0.357034,0.178864,-0.237978,...,Dec-11,n,other,917xx,CA,Feb-96,f,Jan-15,Jan-15,INDIVIDUAL
4,-1.365749,-1.364673,-1.352474,-0.261438,-1.496071,0.122311,0.091865,-0.357034,-0.737250,0.764544,...,Dec-11,n,other,972xx,OR,Jan-96,f,Jan-16,Jan-16,INDIVIDUAL
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
466280,0.492696,0.496497,0.503478,0.147046,0.002377,0.668130,0.335143,-0.357034,1.094978,1.366058,...,Jan-14,n,debt_consolidation,773xx,TX,Apr-03,w,Jan-16,Jan-16,INDIVIDUAL
466281,0.927137,0.931576,0.937336,1.409214,0.617856,0.085923,0.156824,-0.357034,3.843321,1.366058,...,Jan-14,n,debt_consolidation,377xx,TN,Jun-97,f,Dec-14,Jan-16,INDIVIDUAL
466282,0.770256,0.774464,0.780665,0.725348,0.337921,-0.496284,1.073891,-0.357034,1.094978,1.366058,...,Jan-14,n,debt_consolidation,458xx,OH,Dec-01,f,Jan-16,Dec-15,INDIVIDUAL
466283,-1.486427,-1.485528,-1.472991,-1.360671,-1.517427,0.176893,-1.506635,3.405477,0.178864,1.967571,...,Jan-14,n,credit_card,913xx,CA,Feb-03,w,Dec-14,Apr-15,INDIVIDUAL


Split the data

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.15, random_state=42)

In [None]:
col = data.select_dtypes('object').columns.tolist()

Train the model with catboost

In [None]:
from catboost import CatBoostClassifier

clf = CatBoostClassifier()

clf.fit(
    X_train, y_train,
    cat_features=col,
    eval_set=(X_test, y_test),
    verbose=True,
    plot=True
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Learning rate set to 0.138878
0:	learn: 0.3235463	test: 0.3232963	best: 0.3232963 (0)	total: 1.48s	remaining: 24m 34s
1:	learn: 0.1656773	test: 0.1658270	best: 0.1658270 (1)	total: 2.89s	remaining: 24m 1s
2:	learn: 0.0952123	test: 0.0955879	best: 0.0955879 (2)	total: 3.96s	remaining: 21m 57s
3:	learn: 0.0694025	test: 0.0698427	best: 0.0698427 (3)	total: 5.29s	remaining: 21m 57s
4:	learn: 0.0570932	test: 0.0575061	best: 0.0575061 (4)	total: 6.5s	remaining: 21m 33s
5:	learn: 0.0410318	test: 0.0413488	best: 0.0413488 (5)	total: 7.65s	remaining: 21m 7s
6:	learn: 0.0334078	test: 0.0335438	best: 0.0335438 (6)	total: 8.72s	remaining: 20m 37s
7:	learn: 0.0272023	test: 0.0276128	best: 0.0276128 (7)	total: 10.1s	remaining: 20m 46s
8:	learn: 0.0240589	test: 0.0244183	best: 0.0244183 (8)	total: 10.8s	remaining: 19m 47s
9:	learn: 0.0221905	test: 0.0225640	best: 0.0225640 (9)	total: 11.9s	remaining: 19m 42s
10:	learn: 0.0202251	test: 0.0205792	best: 0.0205792 (10)	total: 13s	remaining: 19m 24s
11:	l

<catboost.core.CatBoostClassifier at 0x7f7cda706650>

In [None]:
X_test.columns

Index(['loan_amnt', 'funded_amnt', 'funded_amnt_inv', 'int_rate',
       'installment', 'annual_inc', 'dti', 'delinq_2yrs', 'inq_last_6mths',
       'open_acc', 'pub_rec', 'revol_bal', 'revol_util', 'total_acc',
       'out_prncp', 'out_prncp_inv', 'total_pymnt', 'total_pymnt_inv',
       'total_rec_prncp', 'total_rec_int', 'total_rec_late_fee', 'recoveries',
       'collection_recovery_fee', 'last_pymnt_amnt',
       'collections_12_mths_ex_med', 'policy_code', 'acc_now_delinq', 'term',
       'grade', 'sub_grade', 'home_ownership', 'verification_status',
       'issue_d', 'pymnt_plan', 'purpose', 'zip_code', 'addr_state',
       'earliest_cr_line', 'initial_list_status', 'last_pymnt_d',
       'last_credit_pull_d', 'application_type'],
      dtype='object')

Classification report with catboost
* From the validation set, we can see the model can learn perfectly with the pattern on the data with score = 100%

In [None]:
from sklearn.metrics import classification_report

pred = clf.predict(X_test)
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     63434
           1       1.00      0.99      1.00      6509

    accuracy                           1.00     69943
   macro avg       1.00      1.00      1.00     69943
weighted avg       1.00      1.00      1.00     69943



Feature importances of the model

In [None]:
df = pd.DataFrame({"columns":X_test.columns,"importances":clf.feature_importances_})
df.sort_values(by="importances", ascending=False)

Unnamed: 0,columns,importances
15,out_prncp_inv,20.610162
14,out_prncp,19.042558
18,total_rec_prncp,15.937292
23,last_pymnt_amnt,8.311103
21,recoveries,7.053479
1,funded_amnt,6.661425
4,installment,3.879683
39,last_pymnt_d,3.443304
0,loan_amnt,2.568016
19,total_rec_int,2.413033
