In [341]:
import csv
import pandas as pd
import io
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import precision_score 
from sklearn.metrics import recall_score

In [342]:
# Read the training split from the CSV file (path relative to the working directory).
train_df = pd.read_csv(filepath_or_buffer='loan_train.csv')
# train_df.head()

In [343]:
# Summary of the freshly loaded training frame: index range, column count,
# dtype breakdown and memory usage.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24999 entries, 0 to 24998
Columns: 111 entries, id to total_il_high_credit_limit
dtypes: float64(74), int64(13), object(24)
memory usage: 21.2+ MB


In [344]:
# Drop (in place) every row whose loan_status is 'Current'; the target encoding
# that follows only covers 'Fully Paid' and 'Charged Off'.
indexes = train_df.loc[train_df['loan_status'] == 'Current'].index
train_df.drop(index=indexes, inplace=True)
# train_df.head()

In [345]:
# Encode the target numerically: 'Fully Paid' -> 1, 'Charged Off' -> -1.
# The result is cast to int64 explicitly instead of relying on replace() to
# silently downcast the object column (that downcast is deprecated in pandas 2.2).
# Any label other than the two mapped ones makes astype raise, so an unexpected
# status surfaces here rather than at model-fitting time.
train_df['loan_status'] = (
    train_df['loan_status']
    .replace({'Fully Paid': 1, 'Charged Off': -1})
    .astype('int64')
)
# train_df.head()

In [346]:
# Check the prevalence of NaN: number of missing values per column.
train_df.isna().sum()

id                                0
member_id                         0
loan_amnt                         0
funded_amnt                       0
funded_amnt_inv                   0
                              ...  
tax_liens                        17
tot_hi_cred_lim               24301
total_bal_ex_mort             24301
total_bc_limit                24301
total_il_high_credit_limit    24301
Length: 111, dtype: int64

In [347]:
# Keep only the columns that hold at least 12500 non-missing values; all sparser
# columns are removed in place.
# NOTE(review): 12500 is hard-coded (about half of the 24,999 rows originally
# loaded); the frame has 24,301 rows at this point - confirm the cutoff is intended.
train_df.dropna(axis='columns', thresh=12500, inplace=True)
# train_df.head()

In [348]:
# Per-column non-null counts and dtypes after the 'Current' rows and the sparse
# columns have been removed.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 24301 entries, 0 to 24998
Data columns (total 54 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   id                          24301 non-null  int64  
 1   member_id                   24301 non-null  int64  
 2   loan_amnt                   24301 non-null  int64  
 3   funded_amnt                 24301 non-null  int64  
 4   funded_amnt_inv             24301 non-null  float64
 5   term                        24301 non-null  object 
 6   int_rate                    24301 non-null  object 
 7   installment                 24301 non-null  float64
 8   grade                       24301 non-null  object 
 9   sub_grade                   24301 non-null  object 
 10  emp_title                   24295 non-null  object 
 11  emp_length                  24261 non-null  object 
 12  home_ownership              24301 non-null  object 
 13  annual_inc                  243

In [349]:
# Replace each categorical text column with its integer category code.
# NOTE(review): the codes are derived only from the values present in this frame;
# confirm they agree with the codes generated separately for the test frame.
for column in ('application_type', 'home_ownership', 'term', 'grade', 'sub_grade'):
    train_df[column] = train_df[column].astype('category').cat.codes

# int_rate is stored as text containing a '%' sign: remove it and parse as float.
train_df['int_rate'] = (
    train_df['int_rate'].str.replace('%', '', regex=False).astype('float64')
)
# train_df.head()

In [350]:
# Remove the literal 'Source ' from verification_status (so a label such as
# 'Source Verified', if present, merges with 'Verified'), then replace the text
# with integer category codes.
train_df['verification_status'] = (
    train_df['verification_status']
    .str.replace('Source ', '', regex=False)
    .astype('category')
    .cat.codes
)

In [351]:
# Strip the unit words, '+', '<' and spaces from emp_length so only the number
# remains as text. Order matters: 'years' has to go before 'year'.
# NOTE(review): a value of the form '< 1 year' ends up as '1', the same as '1 year'.
for token in ('years', 'year', '+', '<', ' '):
    # regex=False: each token is a literal ('+' is a regex metacharacter).
    train_df['emp_length'] = train_df['emp_length'].str.replace(token, '', regex=False)
# set(train_df['emp_length'])

In [352]:
# Impute missing emp_length with the most frequent value, then store it as int64.
mode = train_df['emp_length'].mode()
# Assign the filled Series back to the frame. The previous
# `train_df['emp_length'].fillna(..., inplace=True)` is chained assignment: under
# pandas Copy-on-Write it only modifies a temporary, the NaNs stay in train_df and
# the integer cast below fails.
train_df['emp_length'] = train_df['emp_length'].fillna(mode[0])
train_df['emp_length'] = train_df['emp_length'].astype('int64')
# set(train_df.emp_length)

In [353]:
# revol_util is text containing a '%' sign: remove it and parse the rest as float
# (missing entries stay NaN and are imputed in the next cell).
train_df['revol_util'] = (
    train_df['revol_util']
    .str.replace('%', '', regex=False)
    .astype('float64')
)
# set(train_df['revol_util'])

In [354]:
# Impute missing revol_util with the column mean.
mean = train_df['revol_util'].mean()
# Assign the result back instead of calling fillna(inplace=True) on the selected
# column: that chained in-place call does not update train_df under pandas
# Copy-on-Write.
train_df['revol_util'] = train_df['revol_util'].fillna(mean)
# set(train_df['emp_length'])

In [355]:
# Replace the purpose text with integer category codes.
# NOTE(review): 'purpose' is listed among the columns dropped a few cells further
# down, so this encoding currently has no effect on the model input.
train_df['purpose'] = train_df['purpose'].astype('category').cat.codes
# set(train_df['purpose'])

In [356]:
# Replace the addr_state text with integer category codes.
train_df['addr_state'] = train_df['addr_state'].astype('category').cat.codes
# set(train_df['purpose'])

In [357]:
# Column dtypes and non-null counts after the categorical encoding and imputation steps.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 24301 entries, 0 to 24998
Data columns (total 54 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   id                          24301 non-null  int64  
 1   member_id                   24301 non-null  int64  
 2   loan_amnt                   24301 non-null  int64  
 3   funded_amnt                 24301 non-null  int64  
 4   funded_amnt_inv             24301 non-null  float64
 5   term                        24301 non-null  int8   
 6   int_rate                    24301 non-null  float64
 7   installment                 24301 non-null  float64
 8   grade                       24301 non-null  int8   
 9   sub_grade                   24301 non-null  int8   
 10  emp_title                   24295 non-null  object 
 11  emp_length                  24301 non-null  int64  
 12  home_ownership              24301 non-null  int8   
 13  annual_inc                  243

In [358]:
# Remove (in place) the identifier, free-text, date and other columns that are not
# used as model features.
train_df.drop(
    columns=[
        'emp_title', 'issue_d', 'id', 'purpose', 'member_id', 'url', 'pymnt_plan',
        'desc', 'title', 'zip_code', 'earliest_cr_line', 'initial_list_status',
        'last_pymnt_d', 'last_credit_pull_d',
    ],
    inplace=True,
)

In [359]:
# Remaining columns, their non-null counts and dtypes after the unused columns were dropped.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 24301 entries, 0 to 24998
Data columns (total 40 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   loan_amnt                   24301 non-null  int64  
 1   funded_amnt                 24301 non-null  int64  
 2   funded_amnt_inv             24301 non-null  float64
 3   term                        24301 non-null  int8   
 4   int_rate                    24301 non-null  float64
 5   installment                 24301 non-null  float64
 6   grade                       24301 non-null  int8   
 7   sub_grade                   24301 non-null  int8   
 8   emp_length                  24301 non-null  int64  
 9   home_ownership              24301 non-null  int8   
 10  annual_inc                  24301 non-null  float64
 11  verification_status         24301 non-null  int8   
 12  loan_status                 24301 non-null  int64  
 13  addr_state                  243

In [360]:
# Check the prevalence of NaN again: missing values per remaining column.
train_df.isna().sum()

loan_amnt                       0
funded_amnt                     0
funded_amnt_inv                 0
term                            0
int_rate                        0
installment                     0
grade                           0
sub_grade                       0
emp_length                      0
home_ownership                  0
annual_inc                      0
verification_status             0
loan_status                     0
addr_state                      0
dti                             0
delinq_2yrs                     0
inq_last_6mths                  0
open_acc                        0
pub_rec                         0
revol_bal                       0
revol_util                      0
total_acc                       0
out_prncp                       0
out_prncp_inv                   0
total_pymnt                     0
total_pymnt_inv                 0
total_rec_prncp                 0
total_rec_int                   0
total_rec_late_fee              0
recoveries    

In [361]:
# Fill the remaining missing values in these four columns with the column mean.
# The filled Series is assigned back to the frame: calling fillna(inplace=True) on
# `train_df[column]` is chained assignment and leaves train_df unchanged under
# pandas Copy-on-Write.
for column in ('collections_12_mths_ex_med', 'chargeoff_within_12_mths',
               'pub_rec_bankruptcies', 'tax_liens'):
    mean = train_df[column].mean()
    train_df[column] = train_df[column].fillna(mean)

# some info
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 24301 entries, 0 to 24998
Data columns (total 40 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   loan_amnt                   24301 non-null  int64  
 1   funded_amnt                 24301 non-null  int64  
 2   funded_amnt_inv             24301 non-null  float64
 3   term                        24301 non-null  int8   
 4   int_rate                    24301 non-null  float64
 5   installment                 24301 non-null  float64
 6   grade                       24301 non-null  int8   
 7   sub_grade                   24301 non-null  int8   
 8   emp_length                  24301 non-null  int64  
 9   home_ownership              24301 non-null  int8   
 10  annual_inc                  24301 non-null  float64
 11  verification_status         24301 non-null  int8   
 12  loan_status                 24301 non-null  int64  
 13  addr_state                  243

In [362]:
# Read the test split from the CSV file (path relative to the working directory).
test_df = pd.read_csv(filepath_or_buffer='loan_test.csv')

In [363]:
# Summary of the freshly loaded test frame: index range, column count,
# dtype breakdown and memory usage.
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14718 entries, 0 to 14717
Columns: 111 entries, id to total_il_high_credit_limit
dtypes: float64(74), int64(13), object(24)
memory usage: 12.5+ MB


In [364]:
# Apply to the test frame the same preprocessing steps used for the training frame.
# NOTE(review): the category codes, the sparse-column threshold and the imputation
# values below are all derived from test_df itself, not from the training frame -
# confirm the resulting encodings and column set match train_df.

# removing data with value 'Current' in 'loan_status' column
indexes = test_df[test_df['loan_status'] == 'Current'].index
test_df.drop(indexes, inplace=True)

# Encode the target: 'Fully Paid' -> 1, 'Charged Off' -> -1. The explicit int64
# cast replaces the reliance on replace() silently downcasting an object column
# (deprecated in pandas 2.2); any other label makes astype raise here.
test_df['loan_status'] = (
    test_df['loan_status']
    .replace({'Fully Paid': 1, 'Charged Off': -1})
    .astype('int64')
)

# keep only columns with at least 7500 non-NaN entries
test_df.dropna(axis=1, thresh=7500, inplace=True)

# categorical text columns -> integer category codes
for column in ('application_type', 'home_ownership', 'term', 'grade', 'sub_grade'):
    test_df[column] = test_df[column].astype('category').cat.codes

# int_rate is text containing '%': strip it and parse as float
test_df['int_rate'] = test_df['int_rate'].str.replace('%', '', regex=False).astype('float64')

test_df['verification_status'] = test_df['verification_status'].str.replace('Source ', '', regex=False)
test_df['verification_status'] = test_df['verification_status'].astype('category').cat.codes

# Strip unit words, '+', '<' and spaces from emp_length ('years' before 'year').
# regex=False: each token is a literal ('+' is a regex metacharacter).
for token in ('years', 'year', '+', '<', ' '):
    test_df['emp_length'] = test_df['emp_length'].str.replace(token, '', regex=False)

# Impute emp_length with its most frequent value, then cast to int64.
# Every fillna result in this cell is assigned back to the frame: the former
# `test_df[col].fillna(..., inplace=True)` is chained assignment and does not
# update test_df under pandas Copy-on-Write.
mode = test_df['emp_length'].mode()
test_df['emp_length'] = test_df['emp_length'].fillna(mode[0])
test_df['emp_length'] = test_df['emp_length'].astype('int64')

test_df['revol_util'] = test_df['revol_util'].str.replace('%', '', regex=False).astype('float64')

mean = test_df['revol_util'].mean()
test_df['revol_util'] = test_df['revol_util'].fillna(mean)

# 'purpose' is dropped again below; it is encoded here only to mirror the training steps.
test_df['purpose'] = test_df['purpose'].astype('category').cat.codes
test_df['addr_state'] = test_df['addr_state'].astype('category').cat.codes

# mean-impute the remaining columns that still contain NaN
for column in ('collections_12_mths_ex_med', 'chargeoff_within_12_mths',
               'pub_rec_bankruptcies', 'tax_liens'):
    mean = test_df[column].mean()
    test_df[column] = test_df[column].fillna(mean)

# dropping data which are not useful
test_df.drop(['emp_title','issue_d','id','purpose','member_id','url','pymnt_plan','desc','title','zip_code','earliest_cr_line','initial_list_status','last_pymnt_d','last_credit_pull_d'],axis=1,inplace=True)

# some info
test_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14276 entries, 0 to 14717
Data columns (total 40 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   loan_amnt                   14276 non-null  int64  
 1   funded_amnt                 14276 non-null  int64  
 2   funded_amnt_inv             14276 non-null  float64
 3   term                        14276 non-null  int8   
 4   int_rate                    14276 non-null  float64
 5   installment                 14276 non-null  float64
 6   grade                       14276 non-null  int8   
 7   sub_grade                   14276 non-null  int8   
 8   emp_length                  14276 non-null  int64  
 9   home_ownership              14276 non-null  int8   
 10  annual_inc                  14276 non-null  float64
 11  verification_status         14276 non-null  int8   
 12  loan_status                 14276 non-null  int64  
 13  addr_state                  142

In [365]:
# Build the model inputs: features are every column except the target 'loan_status'.
# The test matrix is selected by the *training* column names, so both matrices are
# guaranteed to share the same columns in the same order. The two frames had their
# sparse columns pruned independently; with a purely positional conversion a
# difference would silently misalign features, whereas here a column missing from
# test_df raises KeyError.
# NOTE(review): columns such as total_pymnt and recoveries (visible in the NaN
# summary above) look like values only known after the loan outcome - possible
# target leakage; confirm they should be used as features.
feature_columns = train_df.drop(columns=['loan_status']).columns
X_train = np.array(train_df[feature_columns])
Y_train = np.array(train_df.loan_status)
X_test = np.array(test_df[feature_columns])
Y_test = np.array(test_df.loan_status)

### Exploring various combinations of hyperparameters

In [366]:
# Baseline: gradient boosting with the library's default hyperparameters.
grb = GradientBoostingClassifier().fit(X_train, Y_train)
prediction = grb.predict(X_test)

# Report accuracy, then precision and recall for the positive class (label 1).
print('Accuracy:', grb.score(X_test, Y_test))
print('Precision:', precision_score(y_true=Y_test, y_pred=prediction))
print('Recall:', recall_score(y_true=Y_test, y_pred=prediction))

Accuracy: 0.9948164752031381
Precision: 0.9940949725252194
Recall: 0.9998350243339107


In [367]:
# Gradient boosting with 300 boosting stages.
grb = GradientBoostingClassifier(n_estimators=300).fit(X_train, Y_train)
prediction = grb.predict(X_test)

# Report accuracy, then precision and recall for the positive class (label 1).
print('Accuracy:', grb.score(X_test, Y_test))
print('Precision:', precision_score(y_true=Y_test, y_pred=prediction))
print('Recall:', recall_score(y_true=Y_test, y_pred=prediction))

Accuracy: 0.9965676660128888
Precision: 0.996137409598948
Recall: 0.9998350243339107


In [368]:
# Gradient boosting with 500 boosting stages.
grb = GradientBoostingClassifier(n_estimators=500).fit(X_train, Y_train)
prediction = grb.predict(X_test)

# Report accuracy, then precision and recall for the positive class (label 1).
print('Accuracy:', grb.score(X_test, Y_test))
print('Precision:', precision_score(y_true=Y_test, y_pred=prediction))
print('Recall:', recall_score(y_true=Y_test, y_pred=prediction))

Accuracy: 0.997128047072009
Precision: 0.9967927631578948
Recall: 0.9998350243339107


In [369]:
# Gradient boosting with 700 boosting stages.
grb = GradientBoostingClassifier(n_estimators=700).fit(X_train, Y_train)
prediction = grb.predict(X_test)

# Report accuracy, then precision and recall for the positive class (label 1).
print('Accuracy:', grb.score(X_test, Y_test))
print('Precision:', precision_score(y_true=Y_test, y_pred=prediction))
print('Recall:', recall_score(y_true=Y_test, y_pred=prediction))

Accuracy: 0.997338189969179
Precision: 0.9969569865942923
Recall: 0.9999175121669553


In [372]:
# Gradient boosting with 700 stages and a restricted number of features per split.
# max_features='auto' was deprecated in scikit-learn 1.1 and removed in 1.3; for
# this classifier it was documented as sqrt(n_features), so 'sqrt' is the
# supported spelling of the same setting.
# Feature sub-sampling makes the fit stochastic, hence the fixed random_state.
# NOTE(review): the recorded output of this cell is identical to the run without
# max_features - it may be stale; re-run to refresh.
grb = GradientBoostingClassifier(n_estimators=700, max_features='sqrt', random_state=0)
grb.fit(X_train, Y_train)
print('Accuracy:', grb.score(X_test, Y_test))
prediction = grb.predict(X_test)
# precision/recall are computed for the positive class (label 1)
print('Precision:', precision_score(Y_test, prediction))
print('Recall:', recall_score(Y_test, prediction))

Accuracy: 0.997338189969179
Precision: 0.9969569865942923
Recall: 0.9999175121669553


In [373]:
# Gradient boosting with 700 stages and at least 2 samples per leaf.
grb = GradientBoostingClassifier(n_estimators=700, min_samples_leaf=2).fit(X_train, Y_train)
prediction = grb.predict(X_test)

# Report accuracy, then precision and recall for the positive class (label 1).
print('Accuracy:', grb.score(X_test, Y_test))
print('Precision:', precision_score(y_true=Y_test, y_pred=prediction))
print('Recall:', recall_score(y_true=Y_test, y_pred=prediction))

Accuracy: 0.997408237601569
Precision: 0.9969572368421052
Recall: 1.0


### Best combination of hyperparameters

In [375]:
# Selected configuration: 700 stages, at least 2 samples per leaf, tree depth 4.
grb = GradientBoostingClassifier(
    n_estimators=700,
    min_samples_leaf=2,
    max_depth=4,
).fit(X_train, Y_train)
prediction = grb.predict(X_test)

# Report accuracy, then precision and recall for the positive class (label 1).
print('Accuracy:', grb.score(X_test, Y_test))
print('Precision:', precision_score(y_true=Y_test, y_pred=prediction))
print('Recall:', recall_score(y_true=Y_test, y_pred=prediction))

Accuracy: 0.9974782852339591
Precision: 0.997039230199852
Recall: 1.0


### Simple decision tree

In [370]:
# Comparison model: a single decision tree with default hyperparameters.
dt = DecisionTreeClassifier().fit(X_train, Y_train)
prediction = dt.predict(X_test)

# Report accuracy, then precision and recall for the positive class (label 1).
print('Accuracy:', dt.score(X_test, Y_test))
print('Precision:', precision_score(y_true=Y_test, y_pred=prediction))
print('Recall:', recall_score(y_true=Y_test, y_pred=prediction))

Accuracy: 0.9912440459512468
Precision: 0.9952121512299819
Recall: 0.99447331518601
