In [1]:
import os
import pandas as pd
import numpy as np
from sklearn import preprocessing,metrics 
from IPython.core.display import HTML
pd.set_option("display.max_columns",75)
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn import linear_model,svm
from sklearn.metrics import average_precision_score
from sklearn.metrics import precision_recall_curve

<p>
<span style="color:blue">
> Importing data of Lending club for the years 2012-14
</span>
</p>

In [2]:
df2012_13 = pd.read_csv('Loan_2012_2013.csv',skiprows=1)
df2014 = pd.read_csv('Loan_2014.csv',skiprows=1)

In [3]:
df2012_13.shape

(235631, 144)

In [4]:
list((df2012_13.columns)) == list((df2014.columns))

True

## Data Cleaning

<p>
<span style="color:blue">
> Merged datasets of 2012-14 <br>
> Removed all empty columns ( these are the columns with personal data of the borrowers. These are not disclosed by the company. so we dropped them)<br>
> Target variable(Borrower is a Loan defaulter) - encoded to 0 or 1<br>
</span>
</p>

In [5]:
dataset = pd.concat([df2012_13, df2014]) #merging 2012 to 2014 datasets

In [6]:
dataset.head()

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,url,desc,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,out_prncp,...,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,revol_bal_joint,sec_app_earliest_cr_line,sec_app_inq_last_6mths,sec_app_mort_acc,sec_app_open_acc,sec_app_revol_util,sec_app_open_act_il,sec_app_num_rev_accts,sec_app_chargeoff_within_12_mths,sec_app_collections_12_mths_ex_med,sec_app_mths_since_last_major_derog,hardship_flag,hardship_type,hardship_reason,hardship_status,deferral_term,hardship_amount,hardship_start_date,hardship_end_date,payment_plan_start_date,hardship_length,hardship_dpd,hardship_loan_status,orig_projected_additional_accrued_interest,hardship_payoff_balance_amount,hardship_last_payment_amount,debt_settlement_flag,debt_settlement_flag_date,settlement_status,settlement_date,settlement_amount,settlement_percentage,settlement_term
0,,,10400.0,10400.0,10400.0,36 months,6.99%,321.08,A,A3,Truck Driver Delivery Personel,8 years,MORTGAGE,58000.0,Not Verified,Dec-2014,Charged Off,n,,,credit_card,Credit card refinancing,937xx,CA,14.92,0.0,Sep-1989,2.0,42.0,,17.0,0.0,6133.0,31.6%,36.0,w,0.0,...,179407.0,15030.0,13000.0,11325.0,,,,,,,,,,,,N,,,,,,,,,,,,,,,N,,,,,,
1,,,15000.0,15000.0,15000.0,60 months,12.39%,336.64,C,C1,MANAGEMENT,10+ years,RENT,78000.0,Source Verified,Dec-2014,Fully Paid,n,,,debt_consolidation,Debt consolidation,235xx,VA,12.03,0.0,Aug-1994,0.0,,,6.0,0.0,138008.0,29%,17.0,w,0.0,...,196500.0,149140.0,10000.0,12000.0,,,,,,,,,,,,N,,,,,,,,,,,,,,,N,,,,,,
2,,,9600.0,9600.0,9600.0,36 months,13.66%,326.53,C,C3,Admin Specialist,10+ years,RENT,69000.0,Source Verified,Dec-2014,Fully Paid,n,,,debt_consolidation,Debt consolidation,077xx,NJ,25.81,0.0,Nov-1992,0.0,,,12.0,0.0,16388.0,59.4%,44.0,f,0.0,...,52490.0,38566.0,21100.0,24890.0,,,,,,,,,,,,N,,,,,,,,,,,,,,,N,,,,,,
3,,,7650.0,7650.0,7650.0,36 months,13.66%,260.2,C,C3,Technical Specialist,< 1 year,RENT,50000.0,Source Verified,Dec-2014,Charged Off,n,,,debt_consolidation,Debt consolidation,850xx,AZ,34.81,0.0,Aug-2002,1.0,,,11.0,0.0,16822.0,91.9%,20.0,f,0.0,...,82331.0,64426.0,4900.0,64031.0,,,,,,,,,,,,N,,,,,,,,,,,,,,,N,,,,,,
4,,,12800.0,12800.0,12800.0,60 months,17.14%,319.08,D,D4,Senior Sales Professional,10+ years,MORTGAGE,125000.0,Verified,Dec-2014,Current,n,,,car,Car financing,953xx,CA,8.31,1.0,Oct-2000,0.0,17.0,,8.0,0.0,5753.0,100.9%,13.0,w,1838.85,...,368700.0,18007.0,4400.0,18000.0,,,,,,,,,,,,N,,,,,,,,,,,,,,,N,,,,,,


In [7]:
dataset['loan_status'].value_counts()

Fully Paid            345615
Charged Off            71140
Current                 6653
Late (31-120 days)       207
In Grace Period          118
Late (16-30 days)         61
Default                   16
Name: loan_status, dtype: int64

In [8]:
dataset.shape

(423814, 144)

In [9]:
dataset=dataset[(dataset['loan_status']=="Fully Paid") | (dataset['loan_status']=="Charged Off")]


In [10]:
dataset['loan_status'].value_counts()

Fully Paid     345615
Charged Off     71140
Name: loan_status, dtype: int64

In [11]:
di = {"Fully Paid":0, "Charged Off":1}   #converting target variable to boolean
dataset= dataset.replace({"loan_status": di})

In [12]:
dataset['loan_status'].value_counts()

0    345615
1     71140
Name: loan_status, dtype: int64

In [13]:
dataset.isna().sum()

id                                            416755
member_id                                     416755
loan_amnt                                          0
funded_amnt                                        0
funded_amnt_inv                                    0
term                                               0
int_rate                                           0
installment                                        0
grade                                              0
sub_grade                                          0
emp_title                                      24716
emp_length                                     19652
home_ownership                                     0
annual_inc                                         0
verification_status                                0
issue_d                                            0
loan_status                                        0
pymnt_plan                                         0
url                                           

In [14]:
dataset.dropna(axis=1,thresh=300000,inplace=True)

In [15]:
dataset.shape

(416755, 86)

In [16]:
corr_matrix=dataset.corr()

In [17]:
corr_matrix

Unnamed: 0,loan_amnt,funded_amnt,funded_amnt_inv,installment,annual_inc,loan_status,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,revol_bal,total_acc,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_amnt,collections_12_mths_ex_med,policy_code,acc_now_delinq,tot_coll_amt,tot_cur_bal,total_rev_hi_lim,acc_open_past_24mths,avg_cur_bal,bc_open_to_buy,bc_util,chargeoff_within_12_mths,delinq_amnt,mo_sin_old_il_acct,mo_sin_old_rev_tl_op,mo_sin_rcnt_rev_tl_op,mo_sin_rcnt_tl,mort_acc,mths_since_recent_bc,mths_since_recent_inq,num_accts_ever_120_pd,num_actv_bc_tl,num_actv_rev_tl,num_bc_sats,num_bc_tl,num_il_tl,num_op_rev_tl,num_rev_accts,num_rev_tl_bal_gt_0,num_sats,num_tl_120dpd_2m,num_tl_30dpd,num_tl_90g_dpd_24m,num_tl_op_past_12m,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit
loan_amnt,1.000000,0.999913,0.999852,0.952182,0.381943,0.052991,0.033968,0.001372,-0.004905,0.193440,-0.092707,0.337535,0.226176,,,0.889979,0.889962,0.838134,0.685254,0.080152,0.187962,0.171632,0.441395,-0.012076,,0.003654,-0.002848,0.332364,0.288691,-0.002533,0.244779,0.199706,0.047840,0.004151,-0.000010,0.144937,0.184214,0.060494,0.025968,0.239216,0.044183,0.015894,-0.057577,0.185164,0.145309,0.199065,0.185408,0.090402,0.171402,0.176441,0.148455,0.195526,-0.001331,0.005561,-0.018423,-0.027657,0.100806,0.010608,-0.128290,0.002940,0.351354,0.286914,0.388238,0.193399
funded_amnt,0.999913,1.000000,0.999943,0.952283,0.381942,0.052952,0.033982,0.001392,-0.004927,0.193468,-0.092676,0.337546,0.226184,,,0.890065,0.890052,0.838233,0.685279,0.080179,0.187968,0.171657,0.441411,-0.012065,,0.003663,-0.002848,0.332361,0.288689,-0.002545,0.244780,0.199722,0.047833,0.004162,-0.000007,0.144937,0.184212,0.060495,0.025970,0.239231,0.044179,0.015906,-0.057582,0.185164,0.145308,0.199057,0.185405,0.090397,0.171398,0.176436,0.148454,0.195522,-0.001331,0.005561,-0.018429,-0.027667,0.100806,0.010617,-0.128263,0.002955,0.351351,0.286906,0.388259,0.193385
funded_amnt_inv,0.999852,0.999943,1.000000,0.952234,0.381933,0.052939,0.034064,0.001472,-0.004792,0.193545,-0.092602,0.337534,0.226239,,,0.890101,0.890120,0.838242,0.685364,0.080207,0.187988,0.171670,0.441470,-0.012048,,0.003667,-0.002847,0.332354,0.288666,-0.002452,0.244753,0.199689,0.047843,0.004169,-0.000019,0.144922,0.184191,0.060443,0.025900,0.239227,0.044146,0.015702,-0.057565,0.185177,0.145340,0.199058,0.185366,0.090418,0.171417,0.176424,0.148485,0.195553,-0.001337,0.005557,-0.018428,-0.027578,0.100769,0.010632,-0.128179,0.002968,0.351340,0.286944,0.388212,0.193418
installment,0.952182,0.952283,0.952234,1.000000,0.379395,0.030118,0.025709,0.011167,0.018028,0.184589,-0.082506,0.326350,0.205801,,,0.837570,0.837572,0.815742,0.581954,0.076751,0.158754,0.142793,0.386764,-0.010304,,0.006396,-0.002342,0.300740,0.269927,0.007086,0.217165,0.171076,0.069583,0.006234,0.000728,0.123117,0.164925,0.044579,0.016146,0.206963,0.031568,-0.003311,-0.048809,0.191413,0.155274,0.198531,0.181812,0.072339,0.172141,0.170796,0.158039,0.186164,0.000061,0.008428,-0.012587,-0.008116,0.082228,0.029528,-0.120875,0.009336,0.318409,0.270512,0.359124,0.177734
annual_inc,0.381943,0.381942,0.381933,0.379395,1.000000,-0.051960,-0.203654,0.061116,0.065061,0.155374,-0.017544,0.338489,0.221303,,,0.347647,0.347667,0.358072,0.201349,0.036918,0.033628,0.032794,0.195521,-0.001236,,0.017783,0.001919,0.485187,0.291197,0.046078,0.385435,0.178051,-0.011879,0.017611,0.007759,0.141776,0.159899,0.046680,-0.022297,0.267995,0.044909,-0.046547,0.027607,0.097540,0.060835,0.118170,0.138794,0.127542,0.079953,0.120579,0.062239,0.153207,0.004887,0.017335,0.017514,0.050400,-0.013506,-0.033083,-0.066067,0.042234,0.499627,0.381963,0.307188,0.307032
loan_status,0.052991,0.052952,0.052939,0.030118,-0.051960,1.000000,0.090308,0.010757,0.055167,0.025370,0.011341,-0.019158,-0.006489,,,-0.251856,-0.251812,-0.393353,0.036109,0.122142,0.536536,0.470196,-0.294508,0.007165,,0.002921,-0.000407,-0.058699,-0.040284,0.092020,-0.066200,-0.071665,0.062702,-0.001795,-0.001573,-0.021692,-0.044664,-0.053270,-0.057625,-0.051285,-0.049683,-0.048446,0.009271,0.030348,0.060100,0.009147,-0.008956,0.005233,0.031799,0.001090,0.060654,0.026352,-0.001153,0.003694,0.004990,0.080035,-0.000598,0.066557,0.010990,0.001414,-0.064904,-0.007278,-0.065666,-0.005408
dti,0.033968,0.033982,0.034064,0.025709,-0.203654,0.090308,1.000000,-0.010872,-0.000088,0.289928,-0.058609,0.134221,0.216872,,,0.025053,0.025096,-0.011884,0.094976,0.010320,0.054848,0.054569,-0.031032,-0.003534,,0.006868,-0.003523,-0.018986,0.058682,0.163880,-0.132721,-0.076394,0.185511,-0.010187,-0.003576,0.036686,0.031891,-0.023236,-0.097963,-0.055452,-0.000031,0.002444,-0.055497,0.150345,0.241854,0.089413,0.057902,0.240322,0.171538,0.117584,0.243586,0.286940,-0.001056,0.008418,-0.025323,0.100417,0.097236,0.171739,-0.052445,-0.028339,-0.006209,0.292092,0.034506,0.328801
delinq_2yrs,0.001372,0.001392,0.001472,0.011167,0.061116,0.010757,-0.010872,1.000000,0.023772,0.056100,-0.014645,-0.032326,0.131892,,,0.005524,0.005582,-0.006229,0.028729,0.038144,0.016482,0.017184,-0.010134,0.038585,,0.126790,0.000092,0.072185,-0.034683,-0.058930,0.057940,-0.041210,-0.015567,0.130595,0.029982,0.084205,0.093275,0.038854,0.025315,0.094804,0.068861,-0.022180,0.220862,-0.055584,-0.016002,-0.042450,0.038408,0.085500,0.008065,0.086077,-0.016776,0.052041,0.049498,0.107047,0.634255,-0.037264,-0.449825,-0.015448,-0.041855,0.014242,0.076263,0.034595,-0.069109,0.067978
inq_last_6mths,-0.004905,-0.004927,-0.004792,0.018028,0.065061,0.055167,-0.000088,0.023772,1.000000,0.106981,0.044569,-0.015485,0.139204,,,-0.013361,-0.013224,-0.035530,0.036445,0.013102,0.037176,0.027194,0.032212,0.000462,,-0.005796,0.001862,0.061639,0.009488,0.218286,0.018536,0.018912,-0.084975,0.008566,-0.000617,0.014140,-0.003268,-0.154334,-0.216258,0.076391,-0.099049,-0.642155,0.052029,0.021624,0.058693,0.066982,0.096533,0.076737,0.089328,0.115751,0.050624,0.105570,0.002560,-0.008160,0.024284,0.266522,-0.036850,-0.083513,0.051738,0.007461,0.061820,0.058201,-0.013056,0.074716
open_acc,0.193440,0.193468,0.193545,0.184589,0.155374,0.025370,0.289928,0.056100,0.106981,1.000000,-0.040047,0.217368,0.677877,,,0.164331,0.164398,0.156668,0.118751,0.009567,0.045460,0.046721,0.083920,0.009438,,0.017241,-0.000465,0.243677,0.289437,0.452228,-0.100369,0.261654,-0.100066,0.006645,0.003355,0.122669,0.133806,-0.219116,-0.215185,0.114224,-0.184991,-0.085488,0.009520,0.513916,0.647490,0.586947,0.460016,0.399288,0.814436,0.608315,0.649876,0.998833,0.004029,0.015283,0.010330,0.318402,0.110729,-0.101378,-0.054962,-0.003849,0.274129,0.404195,0.319066,0.367244


In [18]:
week_pos_ser=corr_matrix[(corr_matrix['loan_status'] > 0) & (corr_matrix['loan_status'] < 0.1)]['loan_status']

In [19]:
week_neg_ser=corr_matrix[(corr_matrix['loan_status'] < 0) & (corr_matrix['loan_status'] >- 0.1)]['loan_status']

In [20]:
week_list=list(week_neg_ser.index) + list(week_pos_ser.index)

In [21]:
len(week_list)  # these no of week relationship with target which needs to drop.

53

In [22]:
dataset.drop(week_list,axis=1,inplace=True)

In [23]:
dataset.shape

(416755, 33)

In [24]:
dataset.head()

Unnamed: 0,term,int_rate,grade,sub_grade,emp_title,emp_length,home_ownership,verification_status,issue_d,loan_status,pymnt_plan,purpose,title,zip_code,addr_state,earliest_cr_line,revol_util,initial_list_status,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,last_credit_pull_d,policy_code,application_type,hardship_flag,debt_settlement_flag
0,36 months,6.99%,A,A3,Truck Driver Delivery Personel,8 years,MORTGAGE,Not Verified,Dec-2014,1,n,credit_card,Credit card refinancing,937xx,CA,Sep-1989,31.6%,w,0.0,0.0,6611.69,6611.69,5217.75,0.0,521.27,93.8286,Aug-2016,321.08,Feb-2017,1.0,Individual,N,N
1,60 months,12.39%,C,C1,MANAGEMENT,10+ years,RENT,Source Verified,Dec-2014,0,n,debt_consolidation,Debt consolidation,235xx,VA,Aug-1994,29%,w,0.0,0.0,17392.37,17392.37,15000.0,0.0,0.0,0.0,Jun-2016,12017.81,Jul-2019,1.0,Individual,N,N
2,36 months,13.66%,C,C3,Admin Specialist,10+ years,RENT,Source Verified,Dec-2014,0,n,debt_consolidation,Debt consolidation,077xx,NJ,Nov-1992,59.4%,f,0.0,0.0,9973.43,9973.43,9600.0,0.0,0.0,0.0,Apr-2015,9338.58,Jul-2019,1.0,Individual,N,N
3,36 months,13.66%,C,C3,Technical Specialist,< 1 year,RENT,Source Verified,Dec-2014,1,n,debt_consolidation,Debt consolidation,850xx,AZ,Aug-2002,91.9%,f,0.0,0.0,2281.98,2281.98,704.38,0.0,1237.99,222.8382,Aug-2015,17.7,Oct-2016,1.0,Individual,N,N
5,60 months,15.59%,D,D1,Programming Analysis Supervisor,6 years,RENT,Source Verified,Dec-2014,0,n,credit_card,Credit card refinancing,658xx,MO,Aug-2003,76.2%,w,0.0,0.0,25512.2,25512.2,21425.0,0.0,0.0,0.0,May-2016,17813.19,Apr-2018,1.0,Individual,N,N


In [25]:
Final_data = dataset
Final_data["int_rate"] = Final_data["int_rate"].apply(lambda x:float(x[:-1]) ) #reomving % sign, conv to float  - int_rate column
Final_data= Final_data.reset_index(drop=True)
print("Current shape of dataset :",Final_data.shape)

Current shape of dataset : (416755, 33)


## Data Transformation


<p>
<span style="color:blue">
> Grade - Borrower's grade given basing on his/her past history - encoded to numerical values. <br>
> home_ownership - this is feature in the dataset which had to be encoded to numerical values. <br>
> Emp_Length - this feature was not formatted properly. It has some values which was in the format like "10+years","5years"...etc. we changed them to numerical values in the below cell.
</span>
</p>

In [26]:
#Data encoding
Final_data['grade'] = Final_data['grade'].map({'A':7,'B':6,'C':5,'D':4,'E':3,'F':2,'G':1})
Final_data["home_ownership"] = Final_data["home_ownership"].map({"MORTGAGE":6,"RENT":5,"OWN":4,"OTHER":3,"NONE":2,"ANY":1})
Final_data["emp_length"] = Final_data["emp_length"].replace({'years':'','year':'',' ':'','<':'','\+':'','n/a':'0'}, regex = True)


In [27]:
Final_data["emp_length"].isna().sum()

19652

In [28]:
Final_data["emp_length"].fillna(0,inplace=True)

In [29]:
Final_data["emp_length"] = Final_data["emp_length"].apply(lambda x: x if (int(x)==x) else 0)
print("Current shape of dataset :",Final_data.shape)
Final_data.head()

Current shape of dataset : (416755, 33)


Unnamed: 0,term,int_rate,grade,sub_grade,emp_title,emp_length,home_ownership,verification_status,issue_d,loan_status,pymnt_plan,purpose,title,zip_code,addr_state,earliest_cr_line,revol_util,initial_list_status,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,last_credit_pull_d,policy_code,application_type,hardship_flag,debt_settlement_flag
0,36 months,6.99,7,A3,Truck Driver Delivery Personel,0,6,Not Verified,Dec-2014,1,n,credit_card,Credit card refinancing,937xx,CA,Sep-1989,31.6%,w,0.0,0.0,6611.69,6611.69,5217.75,0.0,521.27,93.8286,Aug-2016,321.08,Feb-2017,1.0,Individual,N,N
1,60 months,12.39,5,C1,MANAGEMENT,0,5,Source Verified,Dec-2014,0,n,debt_consolidation,Debt consolidation,235xx,VA,Aug-1994,29%,w,0.0,0.0,17392.37,17392.37,15000.0,0.0,0.0,0.0,Jun-2016,12017.81,Jul-2019,1.0,Individual,N,N
2,36 months,13.66,5,C3,Admin Specialist,0,5,Source Verified,Dec-2014,0,n,debt_consolidation,Debt consolidation,077xx,NJ,Nov-1992,59.4%,f,0.0,0.0,9973.43,9973.43,9600.0,0.0,0.0,0.0,Apr-2015,9338.58,Jul-2019,1.0,Individual,N,N
3,36 months,13.66,5,C3,Technical Specialist,0,5,Source Verified,Dec-2014,1,n,debt_consolidation,Debt consolidation,850xx,AZ,Aug-2002,91.9%,f,0.0,0.0,2281.98,2281.98,704.38,0.0,1237.99,222.8382,Aug-2015,17.7,Oct-2016,1.0,Individual,N,N
4,60 months,15.59,4,D1,Programming Analysis Supervisor,0,5,Source Verified,Dec-2014,0,n,credit_card,Credit card refinancing,658xx,MO,Aug-2003,76.2%,w,0.0,0.0,25512.2,25512.2,21425.0,0.0,0.0,0.0,May-2016,17813.19,Apr-2018,1.0,Individual,N,N


## Filling Missing values and Feature scaling 


<p>
<span style="color:blue">
> We have some important features which have some missing values. We filled those missing those values with the mean of the column. <br>
> We scaled the features all the features here using standard scaler. <br>
> We sampled our dataset here after infering from the learning curve plotted.
</span>
</p>

In [30]:
#Final_data.fillna(Final_data.mean(),inplace = True)
#HTML(Final_data.tail().to_html())
print("Current shape of dataset :",Final_data.shape)

Current shape of dataset : (416755, 33)


In [31]:
list_num_imp=list(Final_data.describe().columns) # to see the numeric column

In [32]:
list_num_imp

['int_rate',
 'grade',
 'emp_length',
 'home_ownership',
 'loan_status',
 'out_prncp',
 'out_prncp_inv',
 'total_pymnt',
 'total_pymnt_inv',
 'total_rec_prncp',
 'total_rec_late_fee',
 'recoveries',
 'collection_recovery_fee',
 'last_pymnt_amnt',
 'policy_code']

In [33]:
# removing last column and putting target in last
dataset= Final_data[['int_rate',
 'grade',
 'emp_length',
 'home_ownership',
 
 'out_prncp',
 'out_prncp_inv',
 'total_pymnt',
 'total_pymnt_inv',
 'total_rec_prncp',
 'total_rec_late_fee',
 'recoveries',
 'collection_recovery_fee',
 'last_pymnt_amnt',
 'loan_status']]

In [34]:
dataset.head()

Unnamed: 0,int_rate,grade,emp_length,home_ownership,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_amnt,loan_status
0,6.99,7,0,6,0.0,0.0,6611.69,6611.69,5217.75,0.0,521.27,93.8286,321.08,1
1,12.39,5,0,5,0.0,0.0,17392.37,17392.37,15000.0,0.0,0.0,0.0,12017.81,0
2,13.66,5,0,5,0.0,0.0,9973.43,9973.43,9600.0,0.0,0.0,0.0,9338.58,0
3,13.66,5,0,5,0.0,0.0,2281.98,2281.98,704.38,0.0,1237.99,222.8382,17.7,1
4,15.59,4,0,5,0.0,0.0,25512.2,25512.2,21425.0,0.0,0.0,0.0,17813.19,0


In [35]:
Final_data = dataset
scl = preprocessing.StandardScaler() #instance of preprocessing
fields = Final_data.columns.values[:-1]
data_clean = pd.DataFrame(scl.fit_transform(Final_data[fields]), columns = fields)
data_clean['loan_status'] = Final_data['loan_status']
data_clean['loan_status'].value_counts()

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


0    345615
1     71140
Name: loan_status, dtype: int64

In [36]:
loanstatus_0 = data_clean[data_clean["loan_status"]==0]
loanstatus_1 = data_clean[data_clean["loan_status"]==1]
subset_of_loanstatus_0 = loanstatus_0.sample(n=5500)
subset_of_loanstatus_1 = loanstatus_1.sample(n=5500)
data_clean = pd.concat([subset_of_loanstatus_1, subset_of_loanstatus_0])
data_clean = data_clean.sample(frac=1).reset_index(drop=True)
print("Current shape of dataset :",data_clean.shape)
data_clean.head()


Current shape of dataset : (11000, 14)


Unnamed: 0,int_rate,grade,emp_length,home_ownership,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_amnt,loan_status
0,-0.679863,0.628763,0.0,0.888361,0.0,0.0,-0.200409,-0.199682,-0.099499,-0.135265,-0.243422,-0.213324,-0.662372,0
1,-0.524886,0.628763,0.0,-0.642478,0.0,0.0,-1.258515,-1.258301,-1.272405,-0.135265,-0.243422,-0.213324,-0.694342,1
2,-1.447911,1.394567,0.0,-0.642478,0.0,0.0,-1.164814,-1.164554,-1.105407,-0.135265,-0.243422,-0.213324,-0.371708,0
3,1.371304,-1.668647,0.0,0.888361,0.0,0.0,0.807045,0.80826,0.96558,-0.135265,-0.243422,-0.213324,1.843419,0
4,2.399166,-2.43445,0.0,-0.642478,0.0,0.0,2.275832,2.277758,2.125332,-0.135265,-0.243422,-0.213324,1.584268,0


In [37]:
X=data_clean.iloc[:,:-1].values

In [38]:
y=data_clean.iloc[:,-1].values

In [39]:
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state =0)


In [40]:
from sklearn.linear_model import LogisticRegression
logmodel = LogisticRegression()
logmodel.fit(X_train,y_train)
predictions = logmodel.predict(X_test)

In [41]:
y_pred=predictions

In [48]:
# Making the Confusion Matrix

from sklearn.metrics import confusion_matrix,accuracy_score,precision_recall_fscore_support as score
cm = confusion_matrix(y_test, y_pred)

accuracy_score_log=accuracy_score(y_test,y_pred)
print(cm)
print(accuracy_score_log)
precision, recall, fscore, support = score(y_test, y_pred)

print('precision: {}'.format(precision))
print('recall: {}'.format(recall))
print('fscore: {}'.format(fscore))
print('support: {}'.format(support))

[[1321   35]
 [  44 1350]]
0.9712727272727273
precision: [0.96776557 0.97472924]
recall: [0.97418879 0.96843615]
fscore: [0.97096656 0.97157251]
support: [1356 1394]


In [44]:
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(X_train,y_train)
pred = knn.predict(X_test)
y_pred=predictions
cm = confusion_matrix(y_test, y_pred)

accuracy_score_knn=accuracy_score(y_test,y_pred)
print(cm)
print(accuracy_score_knn)

[[1288   68]
 [  87 1307]]
0.9436363636363636


In [45]:
from sklearn.tree import DecisionTreeClassifier
dtree = DecisionTreeClassifier()
dtree.fit(X_train,y_train)
predictions = dtree.predict(X_test)

In [46]:
y_pred=predictions
cm = confusion_matrix(y_test, y_pred)

accuracy_score_dt=accuracy_score(y_test,y_pred)
print(cm)
print(accuracy_score_dt)

[[1321   35]
 [  44 1350]]
0.9712727272727273


In [None]:
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy: %f' % accuracy)
# precision tp / (tp + fp)
precision = precision_score(y_test, y_pred)
print('Precision: %f' % precision)
# recall: tp / (tp + fn)
recall = recall_score(y_test, y_pred)
print('Recall: %f' % recall)
# f1: 2 tp / (2 tp + fp + fn)
f1 = f1_score(y_test, y_pred)
print('F1 score: %f' % f1)