In [None]:
import warnings
# Suppress every warning category for the rest of the session.
# NOTE(review): this also silences pandas / NumPy deprecation and
# SettingWithCopy warnings that later cells may raise — consider narrowing it.
warnings.filterwarnings("ignore")

Read in the Data:

In [None]:
import pandas as pd
import numpy as np
import csv
# CSV file to load; the path is resolved relative to the current working directory.
filename = "LoanStats_securev1_2019Q1.csv"
# header=1: column names come from the second line of the file; the line above it is skipped.
df = pd.read_csv(filename,header=1)
# Display (rows, columns) of the raw frame.
df.shape

(115677, 150)

We can see there are 150 features. The next steps perform feature selection, which identifies the most important features to focus on. Our target variable is the loan status. We only want loans that are either fully paid or charged off, so we remove every row with any other status.

In [None]:
# Keep only loans with a final outcome (fully paid or charged off).
# .copy() makes the result an independent frame: later cells drop and overwrite
# its columns in place, which should not happen on a slice of the raw data.
df = df[df['loan_status'].isin(['Fully Paid', 'Charged Off'])].copy()

We will drop the features with the following criteria:

*   More than 50% of the data is missing
*   The feature would not have been available at the time of the loan application ("issue_d").

We will also address the following:

* Convert strings to numerical values
* Drop superfluous attributes
* Remove highly correlated predictors
* Remove zero (or nearly zero) variance predictors


The `null_values` function below tabulates the features that have missing data, showing the number of missing values and the percentage of total values for each.

In [None]:
def null_values(df):
        """Summarise the missing data in ``df``, column by column.

        Prints the total number of columns and how many of them contain
        missing values, then returns a DataFrame indexed by column name with
        two columns: 'Missing Values' (count) and '% of Total Values'
        (percentage, rounded to one decimal). Only columns whose percentage is
        non-zero are included, ordered from most to least missing.
        """
        missing_count = df.isnull().sum()
        missing_pct = 100 * missing_count / len(df)

        summary = pd.concat([missing_count, missing_pct], axis=1)
        summary.columns = ['Missing Values', '% of Total Values']

        # Keep only the columns that actually have gaps, worst first.
        has_missing = summary['% of Total Values'] != 0
        summary = summary.loc[has_missing]
        summary = summary.sort_values(by='% of Total Values', ascending=False)
        summary = summary.round(1)

        report = ("Dataframe has " + str(df.shape[1]) + " columns.\n"
                  "There are " + str(summary.shape[0]) +
                  " columns that have missing values.")
        print (report)
        return summary

In [None]:
# Display the missing-value summary for the filtered loans.
null_values(df)

Dataframe has 150 columns.
There are 61 columns that have missing values.


Unnamed: 0,Missing Values,% of Total Values
member_id,37761,100.0
next_pymnt_d,37761,100.0
desc,37761,100.0
settlement_percentage,37239,98.6
settlement_amount,37239,98.6
...,...,...
last_pymnt_d,224,0.6
dti,98,0.3
revol_util,47,0.1
all_util,9,0.0


Next, we will drop all variables for which 50% or more of the values are missing.

In [None]:
# Fraction of missing entries in each column (between 0.0 and 1.0).
missing_frac = df.isna().mean()
# Columns where at least half of the values are missing, in alphabetical order.
drop_list = sorted(missing_frac.index[missing_frac >= 0.50])
print(drop_list)

['annual_inc_joint', 'debt_settlement_flag_date', 'deferral_term', 'desc', 'dti_joint', 'hardship_amount', 'hardship_dpd', 'hardship_end_date', 'hardship_last_payment_amount', 'hardship_length', 'hardship_loan_status', 'hardship_payoff_balance_amount', 'hardship_reason', 'hardship_start_date', 'hardship_status', 'hardship_type', 'member_id', 'mths_since_last_delinq', 'mths_since_last_major_derog', 'mths_since_last_record', 'mths_since_recent_bc_dlq', 'mths_since_recent_revol_delinq', 'next_pymnt_d', 'orig_projected_additional_accrued_interest', 'payment_plan_start_date', 'revol_bal_joint', 'sec_app_chargeoff_within_12_mths', 'sec_app_collections_12_mths_ex_med', 'sec_app_earliest_cr_line', 'sec_app_fico_range_high', 'sec_app_fico_range_low', 'sec_app_inq_last_6mths', 'sec_app_mort_acc', 'sec_app_mths_since_last_major_derog', 'sec_app_num_rev_accts', 'sec_app_open_acc', 'sec_app_open_act_il', 'sec_app_revol_util', 'settlement_amount', 'settlement_date', 'settlement_percentage', 'settlem

The function below will remove the variables in the drop list above from the dataset. 

In [None]:
def drop_cols(cols):
    """Remove column(s) from the notebook-level ``df`` in place.

    cols -- a single column label or a list of labels, passed straight to
            ``DataFrame.drop``.

    Returns None; ``df`` itself is modified.
    """
    df.drop(columns=cols, inplace=True)

In [None]:
# Remove the mostly-missing columns collected in drop_list (df is modified in place).
drop_cols(drop_list)

In [None]:
# Check the remaining (rows, columns) after the drop.
df.shape

(37761, 106)

Next, I will remove the features that would not have been available at the time of the loan application.

In [None]:
# Print the remaining column names alphabetically so they can be reviewed by hand.
print(sorted(df.columns))

['acc_now_delinq', 'acc_open_past_24mths', 'addr_state', 'all_util', 'annual_inc', 'application_type', 'avg_cur_bal', 'bc_open_to_buy', 'bc_util', 'chargeoff_within_12_mths', 'collection_recovery_fee', 'collections_12_mths_ex_med', 'debt_settlement_flag', 'delinq_2yrs', 'delinq_amnt', 'dti', 'earliest_cr_line', 'emp_length', 'emp_title', 'fico_range_high', 'fico_range_low', 'funded_amnt', 'funded_amnt_inv', 'grade', 'hardship_flag', 'home_ownership', 'id', 'il_util', 'initial_list_status', 'inq_fi', 'inq_last_12m', 'inq_last_6mths', 'installment', 'int_rate', 'issue_d', 'last_credit_pull_d', 'last_fico_range_high', 'last_fico_range_low', 'last_pymnt_amnt', 'last_pymnt_d', 'loan_amnt', 'loan_status', 'max_bal_bc', 'mo_sin_old_il_acct', 'mo_sin_old_rev_tl_op', 'mo_sin_rcnt_rev_tl_op', 'mo_sin_rcnt_tl', 'mort_acc', 'mths_since_rcnt_il', 'mths_since_recent_bc', 'mths_since_recent_inq', 'num_accts_ever_120_pd', 'num_actv_bc_tl', 'num_actv_rev_tl', 'num_bc_sats', 'num_bc_tl', 'num_il_tl', 'n

In [None]:
# Hand-picked columns to remove in this step (57 labels).
# NOTE(review): the surrounding text describes these as unavailable before the loan
# is issued, but some entries (e.g. 'delinq_2yrs', 'inq_last_6mths') look like
# application-time credit attributes — confirm they are meant to be dropped.
drop_list2 = ['acc_now_delinq', 'acc_open_past_24mths', 'avg_cur_bal', 'bc_open_to_buy', 'bc_util', 'chargeoff_within_12_mths', 'collection_recovery_fee', 'collections_12_mths_ex_med', 'debt_settlement_flag', 'delinq_2yrs', 'delinq_amnt', 'funded_amnt', 'funded_amnt_inv', 'hardship_flag', 'inq_last_6mths', 'last_credit_pull_d', 'last_fico_range_high', 'last_fico_range_low', 'last_pymnt_amnt', 'last_pymnt_d', 'mo_sin_rcnt_rev_tl_op', 'mo_sin_rcnt_tl', 'mths_since_recent_bc', 'mths_since_recent_inq', 'num_accts_ever_120_pd', 'num_actv_bc_tl', 'num_actv_rev_tl', 'num_bc_sats', 'num_bc_tl', 'num_il_tl', 'num_op_rev_tl', 'num_rev_accts', 'num_rev_tl_bal_gt_0', 'num_sats', 'num_tl_120dpd_2m', 'num_tl_30dpd', 'num_tl_90g_dpd_24m', 'num_tl_op_past_12m',  'out_prncp', 'out_prncp_inv', 'pct_tl_nvr_dlq', 'percent_bc_gt_75', 'pymnt_plan', 'recoveries', 'tax_liens', 'tot_coll_amt', 'tot_cur_bal', 'tot_hi_cred_lim', 'total_bal_ex_mort', 'total_bc_limit', 'total_il_high_credit_limit', 'total_pymnt', 'total_pymnt_inv', 'total_rec_int', 'total_rec_late_fee', 'total_rec_prncp', 'total_rev_hi_lim']
drop_cols(drop_list2)
# Number of columns left after the drop.
len(df.columns)

49

In [None]:
# Preview the first rows of the reduced frame.
df.head()

Unnamed: 0,id,loan_amnt,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,url,purpose,title,zip_code,addr_state,dti,earliest_cr_line,fico_range_low,fico_range_high,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,policy_code,application_type,open_acc_6m,open_act_il,open_il_12m,open_il_24m,mths_since_rcnt_il,total_bal_il,il_util,open_rv_12m,open_rv_24m,max_bal_bc,all_util,inq_fi,total_cu_tl,inq_last_12m,mo_sin_old_il_acct,mo_sin_old_rev_tl_op,mort_acc,pub_rec_bankruptcies
1,149089105,6500.0,36 months,8.81%,206.13,A,A5,Service Technician,10+ years,OWN,85000.0,Source Verified,Mar-2019,Fully Paid,https://lendingclub.com/browse/loanDetail.acti...,debt_consolidation,Debt consolidation,851xx,AZ,21.62,Feb-2008,685.0,689.0,5.0,0.0,1870.0,33.4%,10.0,f,1.0,Individual,0.0,3.0,0.0,3.0,23.0,94033.0,72.0,0.0,0.0,1757.0,67.0,1.0,3.0,5.0,131.0,36.0,1.0,0.0
5,149477700,15000.0,36 months,17.97%,542.07,D,D1,Administrative Assistant,5 years,MORTGAGE,58240.0,Verified,Mar-2019,Fully Paid,https://lendingclub.com/browse/loanDetail.acti...,debt_consolidation,Debt consolidation,907xx,CA,38.53,Aug-2005,660.0,664.0,9.0,0.0,67035.0,66.4%,18.0,w,1.0,Individual,0.0,2.0,1.0,3.0,7.0,34645.0,88.0,1.0,3.0,13753.0,78.0,4.0,2.0,2.0,163.0,157.0,1.0,0.0
11,149459186,1000.0,36 months,7.56%,31.14,A,A3,,,RENT,26000.0,Not Verified,Mar-2019,Fully Paid,https://lendingclub.com/browse/loanDetail.acti...,other,Other,281xx,NC,14.73,Nov-2005,765.0,769.0,12.0,0.0,697.0,2%,23.0,f,1.0,Individual,1.0,3.0,0.0,2.0,20.0,6549.0,73.0,1.0,2.0,186.0,17.0,1.0,0.0,1.0,107.0,160.0,0.0,0.0
14,149521312,3025.0,36 months,17.19%,108.14,C,C5,Warehouse employee,3 years,RENT,30000.0,Source Verified,Mar-2019,Fully Paid,https://lendingclub.com/browse/loanDetail.acti...,debt_consolidation,Debt consolidation,300xx,GA,11.88,Jul-2005,665.0,669.0,9.0,0.0,2885.0,43.7%,22.0,w,1.0,Individual,0.0,3.0,0.0,1.0,16.0,16136.0,82.0,3.0,4.0,1530.0,72.0,1.0,1.0,1.0,164.0,149.0,3.0,0.0
15,149521158,2000.0,36 months,16.40%,70.71,C,C4,Teacher,3 years,RENT,53000.0,Source Verified,Mar-2019,Fully Paid,https://lendingclub.com/browse/loanDetail.acti...,car,Car financing,765xx,TX,20.86,Sep-2009,680.0,684.0,22.0,0.0,15299.0,34.8%,44.0,f,1.0,Individual,1.0,15.0,1.0,1.0,8.0,109166.0,113.0,1.0,5.0,5017.0,89.0,1.0,0.0,2.0,114.0,43.0,0.0,0.0


Now that we are left with 49 features, I will screen each one and decide whether it should be dropped. After pre-processing each feature, I will run some overall statistical tests on all the features (their correlations, etc.). The tasks involved include:

* Missing data and outliers
* Cleaning & Formatting:
* Transformation of features
* Choose the most relevant features in the data

In [None]:
# Drop identifiers, free-text / high-cardinality fields and the remaining
# columns that are not carried forward. Each label is listed exactly once.
drop_cols([
    'id', 'grade', 'emp_title', 'title', 'zip_code', 'policy_code', 'url',
    'open_acc_6m', 'open_act_il', 'open_il_12m', 'open_il_24m',
    'mths_since_rcnt_il', 'total_bal_il', 'il_util',
    'open_rv_12m', 'open_rv_24m',
    'max_bal_bc', 'all_util', 'inq_fi', 'total_cu_tl', 'inq_last_12m',
])

In [None]:
# Fix the term feature: keep only the leading number of months ("36 months" -> 36).
# The sample below is not the last expression of the cell, so it is not displayed.
df['term'].sample(5)
df['term'] = df['term'].map(lambda text: np.int8(text.split()[0]))

In [None]:
# Fix the emp_length feature (convert to a number of years):
#   "10+ years" -> 10.0, "< 1 year" -> 0.0, "3 years" -> 3.0, missing -> NaN.
# The cleaned column is assigned back to df. Calling replace(..., inplace=True)
# on df['emp_length'] is a chained in-place update, which newer pandas releases
# apply to a temporary object, leaving df untouched and making the float()
# conversion fail on "10+" / "<".
df['emp_length'] = (
    df['emp_length']
    .replace({'10+ years': '10 years', '< 1 year': '0 years'})
    # str(nan) is "nan", which float() turns back into NaN.
    .map(lambda x: float(str(x).split()[0]))
)
df['emp_length'].sample(5)

66116     6.0
77747     6.0
19028     0.0
47149     5.0
42952    10.0
Name: emp_length, dtype: float64

In [None]:
# Fix the int_rate feature: strip the '%' sign and convert to a number.
# The stripped column is assigned back to df. With replace(..., inplace=True) on
# df['int_rate'], newer pandas releases modify a temporary object, the '%' stays
# in place and errors='coerce' then silently turns every rate into NaN.
df['int_rate'] = pd.to_numeric(
    df['int_rate'].replace(to_replace=r'%', value=r'', regex=True),
    errors='coerce',
)

In [None]:
# Annual income varies over a large range, so store log10(income + 1) instead of the raw value.
df['annual_inc'] = np.log10(df['annual_inc'] + 1)

In [None]:
# Convert earliest_cr_line into the length of the credit history in years,
# measured at the loan's issue date.
# Measuring against issue_d rather than the wall-clock date of the run keeps the
# feature identical from one execution to the next; unparseable dates become NaN.
from datetime import datetime  # unused here; kept in case later cells rely on the name

df.earliest_cr_line = (
    pd.to_datetime(df.issue_d) - pd.to_datetime(df.earliest_cr_line)
).dt.days / 365
df.earliest_cr_line.shape

(37761,)

In [None]:
# New feature fico_score: the midpoint of the reported FICO range.
# The two range bounds are dropped once the midpoint is stored.
df['fico_score'] = df['fico_range_low'].add(df['fico_range_high']).div(2.)
drop_cols(['fico_range_high','fico_range_low'])

In [None]:
# Coarsen pub_rec: every value above 2 is recorded as 3; other values (and NaN) are left as they are.
df.pub_rec = df.pub_rec.mask(df.pub_rec > 2.0, 3)

In [None]:
# Fix revol_bal: store log10(balance + 1) instead of the raw balance.
df['revol_bal'] = np.log10(df['revol_bal'] + 1)

In [None]:
# Fix revol_util: remove the '%' sign (the values are still strings after this cell).
df['revol_util'] = df['revol_util'].replace(r'%', r'', regex=True)

In [None]:
# Convert revol_util to numbers; anything that cannot be parsed becomes NaN.
df['revol_util'] = pd.to_numeric(df['revol_util'], errors='coerce')

In [None]:
# Preview the frame after the per-feature clean-up above.
df.head()

Unnamed: 0,loan_amnt,term,int_rate,installment,sub_grade,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,purpose,addr_state,dti,earliest_cr_line,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,application_type,mo_sin_old_il_acct,mo_sin_old_rev_tl_op,mort_acc,pub_rec_bankruptcies,fico_score
1,6500.0,36,8.81,206.13,A5,10.0,OWN,4.929424,Source Verified,Mar-2019,Fully Paid,debt_consolidation,AZ,21.62,12.843836,5.0,0.0,3.272074,33.4,10.0,f,Individual,131.0,36.0,1.0,0.0,687.0
5,15000.0,36,17.97,542.07,D1,5.0,MORTGAGE,4.765229,Verified,Mar-2019,Fully Paid,debt_consolidation,CA,38.53,15.347945,9.0,0.0,4.826308,66.4,18.0,w,Individual,163.0,157.0,1.0,0.0,662.0
11,1000.0,36,7.56,31.14,A3,,RENT,4.41499,Not Verified,Mar-2019,Fully Paid,other,NC,14.73,15.09589,12.0,0.0,2.843855,2.0,23.0,f,Individual,107.0,160.0,0.0,0.0,767.0
14,3025.0,36,17.19,108.14,C5,3.0,RENT,4.477136,Source Verified,Mar-2019,Fully Paid,debt_consolidation,GA,11.88,15.432877,9.0,0.0,3.460296,43.7,22.0,w,Individual,164.0,149.0,3.0,0.0,667.0
15,2000.0,36,16.4,70.71,C4,3.0,RENT,4.724284,Source Verified,Mar-2019,Fully Paid,car,TX,20.86,11.260274,22.0,0.0,4.184691,34.8,44.0,f,Individual,114.0,43.0,0.0,0.0,682.0


In [None]:
# Change target variable to be a binary variable. Fully Paid = 0, Charged Off = 1
# A vectorised comparison cast to float replaces np.float, an alias that was
# removed in NumPy 1.24; the stored values are still 0.0 / 1.0.
df['Charged_Off'] = (df['loan_status'] == 'Charged Off').astype(float)
drop_cols('loan_status')
df.shape

(37761, 27)

In [None]:
# Preview the frame with the new Charged_Off column.
df.head()

Unnamed: 0,loan_amnt,term,int_rate,installment,sub_grade,emp_length,home_ownership,annual_inc,verification_status,issue_d,purpose,addr_state,dti,earliest_cr_line,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,application_type,mo_sin_old_il_acct,mo_sin_old_rev_tl_op,mort_acc,pub_rec_bankruptcies,fico_score,Charged_Off
1,6500.0,36,8.81,206.13,A5,10.0,OWN,4.929424,Source Verified,Mar-2019,debt_consolidation,AZ,21.62,12.843836,5.0,0.0,3.272074,33.4,10.0,f,Individual,131.0,36.0,1.0,0.0,687.0,0.0
5,15000.0,36,17.97,542.07,D1,5.0,MORTGAGE,4.765229,Verified,Mar-2019,debt_consolidation,CA,38.53,15.347945,9.0,0.0,4.826308,66.4,18.0,w,Individual,163.0,157.0,1.0,0.0,662.0,0.0
11,1000.0,36,7.56,31.14,A3,,RENT,4.41499,Not Verified,Mar-2019,other,NC,14.73,15.09589,12.0,0.0,2.843855,2.0,23.0,f,Individual,107.0,160.0,0.0,0.0,767.0,0.0
14,3025.0,36,17.19,108.14,C5,3.0,RENT,4.477136,Source Verified,Mar-2019,debt_consolidation,GA,11.88,15.432877,9.0,0.0,3.460296,43.7,22.0,w,Individual,164.0,149.0,3.0,0.0,667.0,0.0
15,2000.0,36,16.4,70.71,C4,3.0,RENT,4.724284,Source Verified,Mar-2019,car,TX,20.86,11.260274,22.0,0.0,4.184691,34.8,44.0,f,Individual,114.0,43.0,0.0,0.0,682.0,0.0


In [None]:
# Drop four more predictors. The stated reason is that they are not linearly
# correlated with the target variable.
# NOTE(review): the correlation analysis backing this is not shown in this part
# of the notebook — confirm before relying on it.
drop_cols(['installment', 'mo_sin_old_rev_tl_op','total_acc','pub_rec_bankruptcies'])
# Check the remaining (rows, columns).
df.shape

(37761, 23)

In [None]:
# Preview the frame after the last round of column drops.
df.head()

Unnamed: 0,loan_amnt,term,int_rate,sub_grade,emp_length,home_ownership,annual_inc,verification_status,issue_d,purpose,addr_state,dti,earliest_cr_line,open_acc,pub_rec,revol_bal,revol_util,initial_list_status,application_type,mo_sin_old_il_acct,mort_acc,fico_score,Charged_Off
1,6500.0,36,8.81,A5,10.0,OWN,4.929424,Source Verified,Mar-2019,debt_consolidation,AZ,21.62,12.843836,5.0,0.0,3.272074,33.4,f,Individual,131.0,1.0,687.0,0.0
5,15000.0,36,17.97,D1,5.0,MORTGAGE,4.765229,Verified,Mar-2019,debt_consolidation,CA,38.53,15.347945,9.0,0.0,4.826308,66.4,w,Individual,163.0,1.0,662.0,0.0
11,1000.0,36,7.56,A3,,RENT,4.41499,Not Verified,Mar-2019,other,NC,14.73,15.09589,12.0,0.0,2.843855,2.0,f,Individual,107.0,0.0,767.0,0.0
14,3025.0,36,17.19,C5,3.0,RENT,4.477136,Source Verified,Mar-2019,debt_consolidation,GA,11.88,15.432877,9.0,0.0,3.460296,43.7,w,Individual,164.0,3.0,667.0,0.0
15,2000.0,36,16.4,C4,3.0,RENT,4.724284,Source Verified,Mar-2019,car,TX,20.86,11.260274,22.0,0.0,4.184691,34.8,f,Individual,114.0,0.0,682.0,0.0


In [None]:
# Categorical columns to one-hot encode.
dummy_list =['sub_grade','home_ownership','verification_status','purpose','addr_state','initial_list_status','application_type']
# Null check on the categorical columns; it is not the last expression of the
# cell, so its result is not displayed.
df[dummy_list].isnull().any()
# Replace each categorical column with indicator columns; drop_first=True omits
# the first level of every category.
df = pd.get_dummies(df, columns=dummy_list, drop_first=True)

I will use the earlier funded loans to predict the later funded loans

In [None]:
# Look at one raw issue_d value before converting the column.
df['issue_d'].sample()

2871    Mar-2019
Name: issue_d, dtype: object

In [None]:
# Parse the issue_d strings (e.g. 'Mar-2019') into timestamps so loans can be ordered by date.
df['issue_d'] = pd.to_datetime(df['issue_d'])
# Spot-check one converted value.
df['issue_d'].sample()

68189   2019-02-01
Name: issue_d, dtype: datetime64[ns]

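I will split the data chronologically at the 80th percentile of the issue date: loans issued before the cut-off form the training set and the remaining loans form the test set. Because issue dates are recorded by month, the resulting split is only approximately 80/20.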

In [None]:
# Chronological split: loans issued before the 80th-percentile issue date are
# used for training, all later loans are held out for testing.
# The cut-off is computed once, and each part is copied so that later in-place
# edits do not operate on slices of df.
split_date = df['issue_d'].quantile(0.8)
df_train = df.loc[df['issue_d'] < split_date].copy()
df_test = df.loc[df['issue_d'] >= split_date].copy()

In [None]:
# issue_d was only needed to order the split, so remove it from both parts.
# Reassigning the result avoids an in-place drop on frames that were sliced from df.
df_train = df_train.drop(columns='issue_d')
df_test = df_test.drop(columns='issue_d')

I will now separate the predictor variables from the target variable

In [None]:
# Predictors: every column except the target.
X_train = df_train.drop(columns='Charged_Off')
X_test = df_test.drop(columns='Charged_Off')

# Target: the Charged_Off column of each part.
y_train = df_train['Charged_Off']
y_test = df_test['Charged_Off']

In [None]:
# Preview the training predictors.
X_train.head()

Unnamed: 0,loan_amnt,term,int_rate,emp_length,annual_inc,dti,earliest_cr_line,open_acc,pub_rec,revol_bal,revol_util,mo_sin_old_il_acct,mort_acc,fico_score,sub_grade_A2,sub_grade_A3,sub_grade_A4,sub_grade_A5,sub_grade_B1,sub_grade_B2,sub_grade_B3,sub_grade_B4,sub_grade_B5,sub_grade_C1,sub_grade_C2,sub_grade_C3,sub_grade_C4,sub_grade_C5,sub_grade_D1,sub_grade_D2,sub_grade_D3,sub_grade_D4,sub_grade_D5,sub_grade_E1,sub_grade_E2,sub_grade_E3,sub_grade_E4,sub_grade_E5,sub_grade_F1,sub_grade_F2,...,addr_state_ID,addr_state_IL,addr_state_IN,addr_state_KS,addr_state_KY,addr_state_LA,addr_state_MA,addr_state_MD,addr_state_ME,addr_state_MI,addr_state_MN,addr_state_MO,addr_state_MS,addr_state_MT,addr_state_NC,addr_state_ND,addr_state_NE,addr_state_NH,addr_state_NJ,addr_state_NM,addr_state_NV,addr_state_NY,addr_state_OH,addr_state_OK,addr_state_OR,addr_state_PA,addr_state_RI,addr_state_SC,addr_state_SD,addr_state_TN,addr_state_TX,addr_state_UT,addr_state_VA,addr_state_VT,addr_state_WA,addr_state_WI,addr_state_WV,addr_state_WY,initial_list_status_w,application_type_Joint App
22658,35000.0,36,17.19,3.0,5.190335,3.22,25.523288,16.0,0.0,4.160619,17.8,42.0,0.0,672.0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0
22659,5000.0,36,16.4,4.0,4.778158,11.52,16.682192,10.0,0.0,3.591065,9.0,102.0,0.0,667.0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
22662,4500.0,36,13.9,10.0,5.146131,8.33,20.265753,10.0,0.0,3.466868,11.4,175.0,1.0,712.0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
23196,14000.0,36,15.02,2.0,4.792399,22.38,14.432877,7.0,0.0,3.512418,15.1,92.0,0.0,772.0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0
23410,8000.0,36,18.94,10.0,4.653222,7.89,12.260274,5.0,0.0,3.892929,90.9,,0.0,687.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0


Impute the missing values

In [None]:
from sklearn.impute import SimpleImputer
# Replace NaN entries with the median of the corresponding column.
imputer = SimpleImputer(missing_values=np.nan, strategy='median')

# Learn the medians from the training features only
imputer.fit(X_train)

# Transform both training and testing data.
# The original row index is passed back in so that X_train / X_test keep the
# same labels as y_train / y_test instead of being renumbered from 0.
X_train = pd.DataFrame(imputer.transform(X_train), columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(imputer.transform(X_test), columns=X_test.columns, index=X_test.index)

In [None]:
from sklearn.preprocessing import StandardScaler
# Create a scaler that standardises each feature
scaler = StandardScaler()

# Learn the scaling parameters from the training features only
scaler.fit(X_train)

# Transform both training and testing data.
# The incoming row index is preserved so the scaled frames keep the same labels
# they had before scaling.
X_train = pd.DataFrame(scaler.transform(X_train), columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns, index=X_test.index)

In [None]:
from sklearn.linear_model import SGDClassifier
# GridSearchCV is used below but was never imported, which raises NameError on a fresh kernel.
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

# Shared settings; the random-forest cell further down reuses both names.
kfold = 3
random_state = 42
# Logistic regression fitted with stochastic gradient descent.
# NOTE(review): scikit-learn 1.1 renamed this loss to 'log_loss' and 1.3 removed
# 'log' — update the string when running on a newer release.
pipeline_sgdlr = Pipeline([('model', SGDClassifier(loss='log', max_iter=1000, tol=1e-3, random_state=random_state, warm_start=True))])

# Candidate regularisation strengths and penalty types (3 x 2 = 6 combinations).
param_grid_sgdlr  = {
    'model__alpha': [10**-5, 10**-1, 10**2],
    'model__penalty': ['l1', 'l2']}

# Pick the combination with the best cross-validated ROC AUC on the training set.
grid_sgdlr = GridSearchCV(estimator=pipeline_sgdlr, param_grid=param_grid_sgdlr, scoring='roc_auc', n_jobs=-1, pre_dispatch='2*n_jobs', cv=kfold, verbose=1, return_train_score=False)
grid_sgdlr.fit(X_train, y_train)

sgdlr_estimator = grid_sgdlr.best_estimator_
print('Best score: ', grid_sgdlr.best_score_)
print('Best parameters set: \n', grid_sgdlr.best_params_)

Fitting 3 folds for each of 6 candidates, totalling 18 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 out of  18 | elapsed:    9.7s finished


Best score:  0.6891919428126817
Best parameters set: 
 {'model__alpha': 0.1, 'model__penalty': 'l2'}


In [None]:
from sklearn.ensemble import RandomForestClassifier
# Imported here as well so this cell does not depend on another cell for the name.
from sklearn.model_selection import GridSearchCV
rf = RandomForestClassifier(n_jobs=-1, random_state=random_state, max_features= 'sqrt' ,n_estimators=50)
# Single-candidate grid: the search only cross-validates this one configuration.
param_grid_rf = {
    'n_estimators': [50], # The number of randomized trees to build
    'class_weight': [{0:1, 1:1}],
    # Other settings that could be added to the search:
    #   class_weight: {0:1, 1:2}, {0:1, 1:3}, {0:1, 1:4}
    #   max_features: range(2,7)
    #   min_samples_leaf: range(2,6)
}
# Score with cross-validated ROC AUC, the same metric as the SGD search.
grid_rf = GridSearchCV(estimator=rf, param_grid=param_grid_rf, scoring='roc_auc',n_jobs=-1,pre_dispatch='2*n_jobs', cv=kfold, verbose=1, return_train_score=False)
grid_rf.fit(X_train, y_train)
rf_estimator = grid_rf.best_estimator_
print('Best score: ', grid_rf.best_score_)
print('Best parameters set: \n', grid_rf.best_params_)

Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    5.5s finished


Best score:  0.6761613536539723
Best parameters set: 
 {'class_weight': {0: 1, 1: 1}, 'n_estimators': 50}


In [None]:
# Column names of the training features.
names = list(X_train)
# One row per feature, ordered from least to most important according to the
# best random forest found by the grid search.
feature_importances = (
    pd.DataFrame({'importance': grid_rf.best_estimator_.feature_importances_},
                 index=X_train.columns)
    .sort_values(by='importance')
)
print("Features sorted by their score: Top 10")
# The ordering is ascending, so the last ten rows are the ten most important features.
feature_importances.tail(10)

Features sorted by their score: Top 10


Unnamed: 0,importance
fico_score,0.051514
open_acc,0.052473
loan_amnt,0.061908
int_rate,0.063648
mo_sin_old_il_acct,0.063762
annual_inc,0.064356
revol_util,0.064444
revol_bal,0.065543
earliest_cr_line,0.065823
dti,0.070515
