## Objectives for this notebook:  
Using kaggle Lending Club data (see module1)...  
1) continue modeling  for predictive probabilities;  
2) create partial dependence plot(s); and  
3) work with Shapley values

In [1]:
# Imports will ultimately encompass all 3 objectives above
# Below are imports initially made for objective 1

import category_encoders as ce
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import make_pipeline

In [None]:
# For objective 2, unique library install/imports below
# Note: had to 'pip install pdpbox' from Anaconda terminal on local machine

!pip install pdpbox

In [None]:
# Partial dependence plot library imports

from pdpbox.pdp import pdp_isolate, pdp_plot

In [None]:
# Objective 3 library

!pip install shap

In [None]:
import shap

In [43]:
# Copy csv(s) from module1 to module3 [done locally] and load data plus get kaggle submission sample

X_train = pd.read_csv('train_features.csv')
X_test = pd.read_csv('test_features.csv')
y_train = pd.read_csv('train_labels.csv')['charged_off']
sample_submission = pd.read_csv('sample_submission.csv')

pd.set_option('display.max_columns', 500)

In [44]:
X_train.shape, X_test.shape, y_train.shape

((37745, 103), (9437, 103), (37745,))

In [4]:
pd.set_option('display.max_columns', 500)
X_train.head()

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,url,desc,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,collections_12_mths_ex_med,mths_since_last_major_derog,application_type,annual_inc_joint,dti_joint,acc_now_delinq,tot_coll_amt,tot_cur_bal,open_acc_6m,open_act_il,open_il_12m,open_il_24m,mths_since_rcnt_il,total_bal_il,il_util,open_rv_12m,open_rv_24m,max_bal_bc,all_util,total_rev_hi_lim,inq_fi,total_cu_tl,inq_last_12m,acc_open_past_24mths,avg_cur_bal,bc_open_to_buy,bc_util,chargeoff_within_12_mths,delinq_amnt,mo_sin_old_il_acct,mo_sin_old_rev_tl_op,mo_sin_rcnt_rev_tl_op,mo_sin_rcnt_tl,mort_acc,mths_since_recent_bc,mths_since_recent_bc_dlq,mths_since_recent_inq,mths_since_recent_revol_delinq,num_accts_ever_120_pd,num_actv_bc_tl,num_actv_rev_tl,num_bc_sats,num_bc_tl,num_il_tl,num_op_rev_tl,num_rev_accts,num_rev_tl_bal_gt_0,num_sats,num_tl_120dpd_2m,num_tl_30dpd,num_tl_90g_dpd_24m,num_tl_op_past_12m,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,revol_bal_joint,sec_app_earliest_cr_line,sec_app_inq_last_6mths,sec_app_mort_acc,sec_app_open_acc,sec_app_revol_util,sec_app_open_act_il,sec_app_num_rev_accts,sec_app_chargeoff_within_12_mths,sec_app_collections_12_mths_ex_med,sec_app_mths_since_last_major_derog,disbursement_method
0,43373,,12000,12000,36 months,16.02%,422.01,C,C5,Driver,4 years,MORTGAGE,81000.0,,,debt_consolidation,Debt consolidation,280xx,NC,12.76,0,Dec-2005,1,48.0,,7,0,4912,23.5%,15,w,0,48.0,Individual,,,0,0,174783,1,2,0,4,13.0,33668,82.0,2,5,2962,63.0,20900,1,1,2,9,24969.0,588.0,89.3,0,0,145.0,23,1,1,1,8.0,,1.0,,2,2,2,2,2,9,4,5,2,7,0.0,0,0,2,92.9,100.0,0,0,206618,38580,5500,40863,,,,,,,,,,,,Cash
1,24414,,6300,6300,36 months,14.07%,215.54,C,C3,GMP Lead,4 years,RENT,39000.0,,,debt_consolidation,Debt consolidation,920xx,CA,21.42,0,Mar-2012,1,,,10,0,11876,59.4%,12,w,0,,Individual,,,0,0,31046,0,1,0,2,14.0,19170,83.0,3,8,5153,72.0,20000,1,0,1,10,3105.0,6877.0,59.3,0,0,17.0,72,8,8,0,8.0,,6.0,,0,6,7,7,8,2,9,10,7,10,0.0,0,0,3,100.0,42.9,0,0,43140,31046,16900,23140,,,,,,,,,,,,Cash
2,46723,,4500,4500,36 months,7.21%,139.38,A,A3,Accounts,3 years,RENT,78000.0,,,debt_consolidation,Debt consolidation,906xx,CA,2.17,0,May-2000,2,,,13,0,1715,5.2%,19,w,0,,Individual,,,0,0,35329,0,8,1,2,10.0,33614,106.0,1,1,1196,54.0,33300,4,0,3,3,2718.0,21585.0,7.4,0,0,166.0,218,10,10,0,31.0,,1.0,,0,3,3,3,4,13,5,6,3,13,0.0,0,0,2,100.0,0.0,0,0,65092,35329,23300,31792,,,,,,,,,,,,Cash
3,24878,,12000,12000,36 months,9.44%,384.06,B,B1,Client Processing Associate,10+ years,MORTGAGE,62000.0,,,debt_consolidation,Debt consolidation,441xx,OH,13.76,0,Feb-2003,0,,,22,0,9404,16.3%,33,w,0,,Individual,,,0,0,75160,4,1,0,2,13.0,17123,67.0,6,9,6898,32.0,57600,3,1,2,12,3579.0,33602.0,17.0,0,0,162.0,179,1,1,1,1.0,,5.0,,0,1,3,9,13,7,20,25,3,22,0.0,0,0,7,100.0,0.0,0,0,133065,26527,40500,25465,,,,,,,,,,,,Cash
4,39038,,12000,12000,36 months,22.35%,460.47,D,D5,Construction Operator,10+ years,MORTGAGE,70000.0,,,debt_consolidation,Debt consolidation,751xx,TX,14.04,4,Nov-2001,2,13.0,,18,0,17895,24.2%,45,w,0,13.0,Individual,,,0,7218,113702,9,2,2,4,6.0,16711,68.0,9,10,5373,35.0,73900,3,6,3,14,6317.0,32863.0,24.1,0,0,159.0,201,0,0,1,0.0,62.0,3.0,62.0,6,9,11,11,14,22,15,20,11,18,0.0,0,2,11,82.2,9.1,0,0,192842,34606,43300,24499,,,,,,,,,,,,Cash


In [5]:
# Hat tip to RH/LSDS for methods used in this cell
# 'wrangle' function does significant feature engineering on data


def wrangle(X):
    X = X.copy()
    
    # Drop some columns
    X = X.drop(columns='id')  # id is random
    X = X.drop(columns=['member_id', 'url', 'desc'])  # All null
    X = X.drop(columns='title')  # Duplicative of purpose
    X = X.drop(columns='grade')  # Duplicative of sub_grade
    
    # Transform sub_grade from "A1" - "G5" to 1.1 - 7.5
    def wrangle_sub_grade(x):
        first_digit = ord(x[0]) - 64
        second_digit = int(x[1])
        return first_digit + second_digit/10
    
    X['sub_grade'] = X['sub_grade'].apply(wrangle_sub_grade)

    # Convert percentages from strings to floats
    X['int_rate'] = X['int_rate'].str.strip('%').astype(float)
    X['revol_util'] = X['revol_util'].str.strip('%').astype(float)
        
    # Transform earliest_cr_line to an integer: how many days it's been open
    X['earliest_cr_line'] = pd.to_datetime(X['earliest_cr_line'], infer_datetime_format=True)
    X['earliest_cr_line'] = pd.Timestamp.today() - X['earliest_cr_line']
    X['earliest_cr_line'] = X['earliest_cr_line'].dt.days
    
    # Create features for three employee titles: teacher, manager, owner
    X['emp_title'] = X['emp_title'].str.lower()
    X['emp_title_teacher'] = X['emp_title'].str.contains('teacher', na=False)
    X['emp_title_manager'] = X['emp_title'].str.contains('manager', na=False)
    X['emp_title_owner']   = X['emp_title'].str.contains('owner', na=False)
    
    # Drop categoricals with high cardinality
    X = X.drop(columns=['emp_title', 'zip_code'])
    
    # Transform features with many nulls to binary flags
    many_nulls = ['sec_app_mths_since_last_major_derog',
                  'sec_app_revol_util',
                  'sec_app_earliest_cr_line',
                  'sec_app_mort_acc',
                  'dti_joint',
                  'sec_app_collections_12_mths_ex_med',
                  'sec_app_chargeoff_within_12_mths',
                  'sec_app_num_rev_accts',
                  'sec_app_open_act_il',
                  'sec_app_open_acc',
                  'revol_bal_joint',
                  'annual_inc_joint',
                  'sec_app_inq_last_6mths',
                  'mths_since_last_record',
                  'mths_since_recent_bc_dlq',
                  'mths_since_last_major_derog',
                  'mths_since_recent_revol_delinq',
                  'mths_since_last_delinq',
                  'il_util',
                  'emp_length',
                  'mths_since_recent_inq',
                  'mo_sin_old_il_acct',
                  'mths_since_rcnt_il',
                  'num_tl_120dpd_2m',
                  'bc_util',
                  'percent_bc_gt_75',
                  'bc_open_to_buy',
                  'mths_since_recent_bc']

    for col in many_nulls:
        X[col] = X[col].isnull()
    
    # For features with few nulls, do mean imputation
    for col in X:
        if X[col].isnull().sum() > 0:
            X[col] = X[col].fillna(X[col].mean())
    
    # Return the wrangled dataframe
    return X


# Wrangle train and test in the same way
X_train = wrangle(X_train)
X_test  = wrangle(X_test)

In [45]:
# Check X_train for nulls

X_train.isna().sum().sum()

706188

In [None]:
# Hat tip to RH/LSDS for methods used in this cell
# Train GBC model

encoder = ce.HashingEncoder()
X_train = encoder.fit_transform(X_train)
X_test = encoder.transform(X_test)
gb = GradientBoostingClassifier()
gb.fit(X_train, y_train)

In [None]:
# Obtain roc auc score using gb model on data
# 2019-03-20 1408hrs PST: this is a TODO

y_pred_proba = gb.predict_proba(X_test)[:,1]
print('Validation ROC AUC:', roc_auc_score(y_test, y_pred_proba))

In [None]:
# Submit predictive probabilities to kaggle

submission_7 = sample_submission.copy()
submission_7['charged_off'] = gb.predict_proba(X_test)[:, 1]
submission_7.to_csv('submission-004.csv', index=False)

In [None]:
# 2019-03-21, this notebook copied to local machine xgboost dir

In [7]:
# More work on GBC; methods courtesy of RH/LSDS
%time
from sklearn.model_selection import train_test_split

# Disbursement method causing a ValueError, so will use get_dummies to directly encode to binary
# X_train['disbursement_method'] = pd.get_dummies(X_train['disbursement_method'])
# X_test['disbursement_method'] = pd.get_dummies(X_test['disbursement_method'])                                                

X_train = pd.get_dummies(X_train)
X_test = pd.get_dummies(X_test)

X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.2, stratify=y_train, random_state=42)

encoder = ce.OrdinalEncoder()
X_train = encoder.fit_transform(X_train)
X_val = encoder.transform(X_val)
gb = GradientBoostingClassifier()
gb.fit(X_train, y_train)
y_pred_proba = gb.predict_proba(X_val)[:,1]
print('Validation ROC AUC:', roc_auc_score(y_val, y_pred_proba))

Wall time: 0 ns
Validation ROC AUC: 0.7443474543167272


In [None]:
# Solve ValueError: could not convert string to float: 'Cash'

X_test.apply(lambda row: row.astype(str).str.contains('Cash').any(), axis=1).sum()
X_test.iloc[8847]

In [None]:
X_train.disbursement_method.value_counts()

In [16]:
# Solve ValueError: Number of features of the model must match the input.
# Model n_features is 165 and input n_features is 166

# for col in X_test:
#     if col not in X_train:
#         print(col, 'not in X_train')
# X_train.shape, X_test.shape
X_test = X_test.drop('purpose_wedding', axis=1)  # corrected ValueError

print('purpose_wedding' in X_train.columns)  # False; False
print('purpose_wedding' in X_test.columns)  # True; False, after correction

False
False


In [17]:
# Submit predictive probabilities to kaggle

submission_8 = sample_submission.copy()
submission_8['charged_off'] = gb.predict_proba(X_test)[:, 1]
submission_8.to_csv('submission-008.csv', index=False)

In [19]:
# First attempt in this notebook at a XGBoost model

from sklearn.model_selection import cross_val_score
from xgboost import XGBClassifier

In [46]:
# Go back to top of notebook and reload X_train, X_test, y_train
# Then try new encoding methodology

encoder = ce.OrdinalEncoder(handle_unknown='ignore')  # , cols=[col for col in X_train])
X_transformed = encoder.fit_transform(X_train)
X_transformed = X_transformed.select_dtypes(include='number')
X_test = encoder.transform(X_test)
X_test = X_test.select_dtypes(include='number')

In [47]:
model = XGBClassifier(booster='dart')
cross_val_score(model, X_transformed, y_train, scoring='accuracy', cv=5, n_jobs=-1)

array([0.85496689, 0.8544178 , 0.85388793, 0.85627235, 0.82405935])

In [48]:
model.fit(X_transformed, y_train)

XGBClassifier(base_score=0.5, booster='dart', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0, learning_rate=0.1,
       max_delta_step=0, max_depth=3, min_child_weight=1, missing=None,
       n_estimators=100, n_jobs=1, nthread=None,
       objective='binary:logistic', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=None,
       subsample=1, verbosity=1)

In [49]:
# Submit predictive probabilities to kaggle

submission_9 = sample_submission.copy()
submission_9['charged_off'] = model.predict_proba(X_test)[:, 1]
submission_9.to_csv('submission-009.csv', index=False)