In [1]:
import os
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

pd.set_option('display.max_columns', None)

import warnings
warnings.filterwarnings("ignore")

In [2]:
def std_col_names(df):
    """
    - Convert feature names to lower case
    - Rename reporting date column 
    """
    df.columns = df.columns.str.lower().str.strip().str.replace(' ', '_')
    df.rename(columns = {'employment_duration': 'property', 'debit_to_income': 'debt_to_income', 'home_ownership': 'emp_duration'}, inplace= True)
    df['loan_title'] = df['loan_title'].str.lower().str.strip().str.replace(' ', '_')
    return df

In [3]:
# Load DF
df = std_col_names(pd.read_csv('./data/train.csv'))

In [4]:
cols = df.columns.to_list()
cols

['id',
 'loan_amount',
 'funded_amount',
 'funded_amount_investor',
 'term',
 'batch_enrolled',
 'interest_rate',
 'grade',
 'sub_grade',
 'property',
 'emp_duration',
 'verification_status',
 'payment_plan',
 'loan_title',
 'debt_to_income',
 'delinquency_-_two_years',
 'inquires_-_six_months',
 'open_account',
 'public_record',
 'revolving_balance',
 'revolving_utilities',
 'total_accounts',
 'initial_list_status',
 'total_received_interest',
 'total_received_late_fee',
 'recoveries',
 'collection_recovery_fee',
 'collection_12_months_medical',
 'application_type',
 'last_week_pay',
 'accounts_delinquent',
 'total_collection_amount',
 'total_current_balance',
 'total_revolving_credit_limit',
 'loan_status']

In [5]:
model_cols = [
'loan_amount',
'funded_amount',
'funded_amount_investor',
'term',
'interest_rate',
'grade',
'property',
'verification_status',
'debt_to_income',
'delinquency_-_two_years',
'inquires_-_six_months',
'open_account',
'public_record',
'revolving_balance',
'revolving_utilities',
'total_accounts',
'initial_list_status',
'total_received_interest',
'total_received_late_fee',
'recoveries',
'collection_recovery_fee',
'collection_12_months_medical',
'application_type',
'last_week_pay',
'total_collection_amount',
'total_current_balance',
'total_revolving_credit_limit'
]

unused_cols = [
'id',
'batch_enrolled',
'sub_grade',
'emp_duration',
'payment_plan',
'accounts_delinquent',
'loan_title'
]

In [6]:
assert list(set(model_cols) & set(unused_cols)) == []
len(model_cols) + len(unused_cols)

34

## Train on all features

In [11]:
num_cols = [
'loan_amount',
'funded_amount',
'funded_amount_investor',
'term',
'interest_rate',
'debt_to_income',
'inquires_-_six_months',
'open_account',
'public_record',
'revolving_balance',
'revolving_utilities',
'total_accounts',
'initial_list_status',
'total_received_interest',
'total_received_late_fee',
'recoveries',
'collection_recovery_fee',
'last_week_pay',
'total_collection_amount',
'total_current_balance',
'total_revolving_credit_limit'
]

cat_cols =[
'grade',
'property',
'verification_status',
'delinquency_-_two_years',
'collection_12_months_medical',
'application_type',
]

In [12]:
len(cat_cols) + len(num_cols)

27

In [13]:
df.head()

Unnamed: 0,id,loan_amount,funded_amount,funded_amount_investor,term,batch_enrolled,interest_rate,grade,sub_grade,property,emp_duration,verification_status,payment_plan,loan_title,debt_to_income,delinquency_-_two_years,inquires_-_six_months,open_account,public_record,revolving_balance,revolving_utilities,total_accounts,initial_list_status,total_received_interest,total_received_late_fee,recoveries,collection_recovery_fee,collection_12_months_medical,application_type,last_week_pay,accounts_delinquent,total_collection_amount,total_current_balance,total_revolving_credit_limit,loan_status
0,65087372,10000,32236,12329.36286,59,BAT2522922,11.135007,B,C4,MORTGAGE,176346.6267,Not Verified,n,debt_consolidation,16.284758,1,0,13,0,24246,74.932551,7,w,2929.646315,0.102055,2.498291,0.793724,0,INDIVIDUAL,49,0,31,311301,6619,0
1,1450153,3609,11940,12191.99692,59,BAT1586599,12.237563,C,D3,RENT,39833.921,Source Verified,n,debt_consolidation,15.412409,0,0,12,0,812,78.297186,13,f,772.769385,0.036181,2.377215,0.974821,0,INDIVIDUAL,109,0,53,182610,20885,0
2,1969101,28276,9311,21603.22455,59,BAT2136391,12.545884,F,D4,MORTGAGE,91506.69105,Source Verified,n,debt_consolidation,28.137619,0,0,14,0,1843,2.07304,20,w,863.324396,18.77866,4.316277,1.020075,0,INDIVIDUAL,66,0,34,89801,26155,0
3,6651430,11170,6954,17877.15585,59,BAT2428731,16.731201,C,C3,MORTGAGE,108286.5759,Source Verified,n,debt_consolidation,18.04373,1,0,7,0,13819,67.467951,12,w,288.173196,0.044131,0.10702,0.749971,0,INDIVIDUAL,39,0,40,9189,60214,0
4,14354669,16890,13226,13539.92667,59,BAT5341619,15.0083,C,D4,MORTGAGE,44234.82545,Source Verified,n,credit_card_refinancing,17.209886,1,3,13,1,1544,85.250761,22,w,129.239553,19.306646,1294.818751,0.368953,0,INDIVIDUAL,18,0,430,126029,22579,0


In [55]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import log_loss

In [18]:
x_train, x_val, y_train, y_val = train_test_split(df[num_cols+cat_cols], df['loan_status'], stratify= df['loan_status'], random_state=42, test_size=0.15)

In [19]:
len(x_train), len(x_val)

(57343, 10120)

In [25]:
x_train = x_train.reset_index(drop= True)
x_val = x_val.reset_index(drop= True)
y_train = y_train.reset_index(drop= True)
y_val = y_val.reset_index(drop= True)

In [50]:
def transform_dv(df, dv=None, train= False):
    """
    DictVectorize a dataframe
    """
    df_dict = df.to_dict(orient= 'records')
    
    if train:
        dv = DictVectorizer(sparse= False)
        train_dv = dv.fit_transform(df_dict)
        return df_dict, dv, train_dv
    else:
        try:
            val_dv = dv.transform(df_dict)
            return df_dict, val_dv
        except TypeError:
            print('DictVectorizer was not passed for non-train set')

In [44]:
train_dict, dv, train_dv = transform_dv(x_train, train= True)

In [52]:
len(dv.get_feature_names())

39

In [63]:
val_dict, val_dv = transform_dv(x_val, dv= dv)

In [93]:
rf= RandomForestClassifier(max_depth= 10, n_estimators= 150, random_state= 42, class_weight= 'balanced_subsample', criterion= 'gini')

In [94]:
rf.fit(train_dv, y_train)

RandomForestClassifier(class_weight='balanced_subsample', max_depth=10,
                       n_estimators=150, random_state=42)

In [95]:
y_train_pred = rf.predict(train_dv)
ll_train = log_loss(y_train, y_train_pred)
print(ll_train)

3.471223010059307


In [96]:
y_val_pred = rf.predict(val_dv)
ll_val = log_loss(y_val, y_val_pred)
print(ll_val)

5.9556227310227055


### Test

In [97]:
test_df = df = std_col_names(pd.read_csv('./data/test.csv'))

In [98]:
test_df.head()

Unnamed: 0,id,loan_amount,funded_amount,funded_amount_investor,term,batch_enrolled,interest_rate,grade,sub_grade,property,emp_duration,verification_status,payment_plan,loan_title,debt_to_income,delinquency_-_two_years,inquires_-_six_months,open_account,public_record,revolving_balance,revolving_utilities,total_accounts,initial_list_status,total_received_interest,total_received_late_fee,recoveries,collection_recovery_fee,collection_12_months_medical,application_type,last_week_pay,accounts_delinquent,total_collection_amount,total_current_balance,total_revolving_credit_limit,loan_status
0,56492997,17120,10365,16025.08269,59,BAT2575549,12.163926,A,D1,RENT,76468.8219,Source Verified,n,home_improvement,16.749219,1,0,12,1,3576,67.278287,5,f,4469.449851,0.088031,8.425776,0.731797,0,INDIVIDUAL,135,0,24,475442,4364,
1,22540813,7133,11650,12615.7956,59,BAT2833642,6.564296,B,E3,MORTGAGE,38079.01344,Source Verified,n,credit_card_refinancing,18.157975,0,0,11,0,1932,71.313157,21,w,993.90753,0.041237,6.157008,0.992918,0,INDIVIDUAL,56,0,1,72412,2573,
2,9862181,25291,25825,11621.28083,59,BAT1761981,14.7299,A,C3,MORTGAGE,51275.93268,Source Verified,n,debt_consolidation,15.190011,0,0,22,0,598,50.883065,23,f,729.113379,0.021745,5.705077,0.28158,0,INDIVIDUAL,3,0,26,284825,19676,
3,10097822,30781,9664,15375.82351,59,BAT5341619,10.523767,A,A2,RENT,68867.98965,Verified,n,debt_consolidation,21.29255,0,0,11,0,5222,82.449083,28,w,715.867091,0.092398,2.469688,0.959162,0,INDIVIDUAL,21,0,32,40842,7226,
4,47771809,8878,9419,7176.647582,58,BAT4694572,9.997013,C,B3,OWN,91556.85423,Verified,n,debt_consolidation,4.812117,0,0,11,0,553,49.075855,9,w,248.572854,0.010354,2.127835,0.402315,0,INDIVIDUAL,104,0,33,90825,26145,


In [99]:
test_dict, test_dv = transform_dv(test_df[num_cols+cat_cols], dv= dv)

In [101]:
test_pred = rf.predict(test_dv)

In [116]:
def get_sub(arr, outfile= 'sub.csv'):
    """
    Convert predicted output to a dataframe and extract to local
    """
    out_df = pd.DataFrame(arr, columns=['Loan Status'])
    out_df.to_csv(outfile, index= False)
    return out_df

In [117]:
sub1 = get_sub(test_pred, 'sub1.csv')