<br>
<br>
**<font size=5><center>Predicting Default Rates for Lending Club</center></font>**

### Authors:
Devon Luongo <br>
Ankit Agarwal <br>
Bryn Clarke <br>
Ben Yuen

# IV. Baseline Models

*Libraries*

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cross_validation import train_test_split
from sklearn.decomposition import PCA

In [3]:
df_X = pd.read_pickle("./data/df_X_imputed.pkl")
df_X = df_X.drop(["verification_status_None", "term_None", "application_type_None", "initial_list_status_None", "purpose_None", "emp_length_None", "home_ownership_None", "addr_state_None", "funded_amnt_inv", "num_actv_rev_tl", "tot_cur_bal"], axis=1)
df_y = pd.read_pickle("./data/df_y.pkl")

In [4]:
sclr = StandardScaler()
X = sclr.fit_transform(df_X)
y = df_y.values

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=20161201)

In [6]:
pca = PCA(n_components=100)
pca.fit(X)

for n in range(5, 105, 5):
    print "%d-components: %.2f%% variance explained" % (n, sum(pca.explained_variance_ratio_[1:n])*100)

5-components: 10.91% variance explained
10-components: 18.61% variance explained
15-components: 24.24% variance explained
20-components: 28.70% variance explained
25-components: 32.46% variance explained
30-components: 35.90% variance explained
35-components: 39.22% variance explained
40-components: 42.48% variance explained
45-components: 45.69% variance explained
50-components: 48.87% variance explained
55-components: 52.02% variance explained
60-components: 55.16% variance explained
65-components: 58.29% variance explained
70-components: 61.41% variance explained
75-components: 64.51% variance explained
80-components: 67.61% variance explained
85-components: 70.69% variance explained
90-components: 73.77% variance explained
95-components: 76.83% variance explained
100-components: 79.84% variance explained


In [7]:
n = 40
pca = PCA(n_components=n)
X_pc = pca.fit_transform(X)
print "%d-components: %.2f%% variance explained" % (n, sum(pca.explained_variance_ratio_)*100)
X_pc_train, X_pc_test, y_pc_train, y_pc_test = train_test_split(X_pc, y, test_size=0.3, random_state=20161201)

40-components: 48.01% variance explained


In [8]:
# Baseline models
# All positive (label every applicant as default)
class Pos_model(object):
    def fit(self, X, y):
        return
    
    def predict(self, x):
        return np.array([1] * len(x))
    
    def score(self, x, y):
        y_pred = self.predict(x)
        y_err = y - y_pred
        return len(y_err[y_err == 0]) * 1. / len(y_err)

pos_model = Pos_model()
pos_model.fit(X_train, y_train)
pos_model.predict(X_test)

array([1, 1, 1, ..., 1, 1, 1])

In [9]:
# All negative (label every applicant as not default)
class Neg_model(object):
    def fit(self, X, y):
        return
    
    def predict(self, x):
        return np.array([0] * len(x))
    
    def score(self, x, y):
        y_pred = self.predict(x)
        y_err = y - y_pred
        return len(y_err[y_err == 0]) * 1. / len(y_err)

neg_model = Neg_model()
neg_model.fit(X_train, y_train)
neg_model.predict(X_test)

array([0, 0, 0, ..., 0, 0, 0])

In [10]:
# Random (randomly predict flu or not flu)
class Random_model(object):
    def fit(self, X, y):
        return
    
    def predict(self, x):
        return np.random.randint(0, 2, len(x))
    
    def score(self, x, y):
        y_pred = self.predict(x)
        y_err = y - y_pred
        return len(y_err[y_err == 0]) * 1. / len(y_err)

random_model = Random_model()
random_model.fit(X_train, y_train)
random_model.predict(X_test)

array([0, 0, 0, ..., 1, 1, 1])

In [63]:
def custom_cost(y, y_pred):
    cost_fixed_application = 10.0
    cost_fixed_servicing = 100.0
    cost_default = 15000.0
    cost_interest = -1000.0
    
    # TRUE POSITIVE: Predict default, applicant would default
    n_true_positive = sum((y==1) & (y_pred==1))
    # We only pay fixed application processing costs as we decline the loan
    cost_true_positive = n_true_positive*(cost_fixed_application)
    
    # TRUE NEGATIVE: Predict pay on time, applicant will pay on time
    n_true_negative = sum((y==0) & (y_pred==0))
    # We pay fixed application processing costs, costs of servicing the loan, and gain profit on interest payments
    cost_true_negative = n_true_negative*(cost_fixed_application + cost_fixed_servicing + cost_interest)
    
    # FALSE POSITIVE: Predict default, applicant would pay on time
    n_false_positive = sum((y==0) & (y_pred==1))
    # We only pay fixed application processing costs as we decline the loan
    cost_false_positive = n_false_positive*(cost_fixed_application)
    
    # FALSE NEGATIVE: Predict pay on time, applicant will default
    n_false_negative = sum((y==1) & (y_pred==0))
    # We pay fixed application processing costs, costs of servicing the loan, and the principal as the loan defaults
    cost_false_negative = n_false_negative*(cost_fixed_application + cost_fixed_servicing + cost_default)
    
    return (cost_true_positive + cost_true_negative + cost_false_positive + cost_false_negative)/len(y)


In [64]:
def test_model(model, label, scoring_funcs, use_pc=False, fit=True):
    if use_pc:
        if fit:
            model.fit(X_pc_train, y_train)
        y_pred = model.predict(X_pc_test)
    else:
        if fit:
            model.fit(X_train, y_train)
        y_pred = model.predict(X_test)        
        
    scores = {"model": label}
    for scoring_func in scoring_funcs:
        scores[scoring_func.__name__] = scoring_func(y_test, y_pred)
    
    return pd.DataFrame([scores]).set_index("model")

def compare_models(models, labels, scoring_funcs, use_pc=False, fit=True):
    return pd.concat([test_model(model, label, scoring_funcs, use_pc) for model, label in zip(models, labels)], fit)

In [13]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import average_precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import log_loss
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_auc_score

compare_models([pos_model, neg_model, random_model], ["Positive Model", "Negative Model", "Random Model"], [custom_cost, accuracy_score, average_precision_score, f1_score, log_loss, precision_score, recall_score, roc_auc_score])

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Unnamed: 0_level_0,accuracy_score,average_precision_score,custom_cost,f1_score,log_loss,precision_score,recall_score,roc_auc_score
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Positive Model,0.196514,0.598257,10.0,0.328477,27.752072,0.196514,1.0,0.5
Negative Model,0.803486,0.598257,2254.220934,0.0,6.787346,0.0,0.0,0.5
Random Model,0.500716,0.398035,1128.742755,0.282825,17.24498,0.197028,0.500976,0.500814


# V. Prediction Models

In [18]:
from sklearn.grid_search import GridSearchCV
import xgboost as xgb

CPU times: user 4min 12s, sys: 5.73 s, total: 4min 18s
Wall time: 7.97 s


XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=36,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)

In [37]:
xg = xgb.XGBClassifier(nthread=36)
xg_scores = compare_models([xg],
                           ["XGBoost - no tuning"],
                           [custom_cost, accuracy_score, average_precision_score, f1_score,
                            log_loss, precision_score, recall_score, roc_auc_score],
                           use_pc=False)

xg_scores

Unnamed: 0_level_0,accuracy_score,average_precision_score,custom_cost,f1_score,log_loss,precision_score,recall_score,roc_auc_score
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
XGBoost - no tuning,0.832757,0.552967,1325.671667,0.431753,5.776411,0.649643,0.323314,0.640334


In [49]:
n_estimators = [100, 150] #[100, 200, 300, 400, 500]
learning_rate = [0.1, 0.5, 0.05] #[0.0001, 0.001, 0.01, 0.1]


param_grid1 = {'learning_rate':learning_rate, 'n_estimators':n_estimators}
param_grid1

{'learning_rate': [0.1, 0.5, 0.01, 0.05], 'n_estimators': [100, 200, 300]}

In [52]:
grid_search = GridSearchCV(xg, param_grid=param_grid1, scoring="accuracy", verbose=1, n_jobs=3 )
grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 12 candidates, totalling 36 fits


[Parallel(n_jobs=3)]: Done  36 out of  36 | elapsed:  4.5min finished


GridSearchCV(cv=None, error_score='raise',
       estimator=XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=12,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1),
       fit_params={}, iid=True, n_jobs=3,
       param_grid={'n_estimators': [100, 200, 300], 'learning_rate': [0.1, 0.5, 0.01, 0.05]},
       pre_dispatch='2*n_jobs', refit=True, scoring='accuracy', verbose=1)

In [66]:
xg_n100 = xgb.XGBClassifier(nthread=-1, n_estimators=100, learning_rate=0.5)
xg_n150 = xgb.XGBClassifier(nthread=-1, n_estimators=150, learning_rate=0.5)
xg_n200 = xgb.XGBClassifier(nthread=-1, n_estimators=200, learning_rate=0.5)
xg_n250 = xgb.XGBClassifier(nthread=-1, n_estimators=250, learning_rate=0.5)
xg_n300 = xgb.XGBClassifier(nthread=-1, n_estimators=300, learning_rate=0.5)

xgb_est = compare_models([xg_n100,xg_n150,xg_n200,xg_n250,xg_n300],
               ['xg_n100','xg_n150','xg_n200','xg_n250','xg_n300'],
               [custom_cost, accuracy_score, average_precision_score, f1_score, log_loss,
                precision_score, recall_score, roc_auc_score])

In [None]:
x = xgb_est.values
x_new = np.zeros((xgb_est.shape[0],8))
for i in xrange(x.shape[0]):
    x_new[i,:]=x[i,i*8:i*8+8]

xgb_est=pd.DataFrame(x_new)
xgb_est.columns=['custom_cost', 'accuracy_score', 'average_precision_score', 'f1_score', 'log_loss',
                'precision_score', 'recall_score', 'roc_auc_score']

In [73]:
xgb_est

Unnamed: 0,custom_cost,accuracy_score,average_precision_score,f1_score,log_loss,precision_score,recall_score,roc_auc_score
0,0.836677,0.575035,1120.496079,0.487687,5.640996,0.635717,0.395576,0.670068
1,0.836771,0.576398,1103.70866,0.491584,5.637759,0.633632,0.401561,0.672387
2,0.836388,0.575764,1099.212411,0.492075,5.651007,0.630972,0.403296,0.672804
3,0.835996,0.57494,1097.386635,0.49196,5.66455,0.628695,0.404077,0.672855
4,0.835331,0.573538,1094.717013,0.491672,5.687514,0.62495,0.405248,0.672884


In [68]:
xg_n200_l01 = xgb.XGBClassifier(nthread=-1, n_estimators=200, learning_rate=0.1)
xg_n200_l02 = xgb.XGBClassifier(nthread=-1, n_estimators=200, learning_rate=0.2)
xg_n200_l03 = xgb.XGBClassifier(nthread=-1, n_estimators=200, learning_rate=0.3)
xg_n200_l04 = xgb.XGBClassifier(nthread=-1, n_estimators=200, learning_rate=0.4)
xg_n200_l05 = xgb.XGBClassifier(nthread=-1, n_estimators=200, learning_rate=0.5)

xgb_lr = compare_models([xg_n200_l01,xg_n200_l02,xg_n200_l03,xg_n200_l04,xg_n200_l05],
               ['xg_n200_l01','xg_n200_l02','xg_n200_l03','xg_n200_l04','xg_n200_l05'],
               [custom_cost, accuracy_score, average_precision_score, f1_score, log_loss,
                precision_score, recall_score, roc_auc_score])

In [77]:
x_new = np.zeros((xgb_lr.shape[0],8))
x = xgb_lr.values
for i in xrange(x.shape[0]):
    x_new[i,:]=x[i,i*8:i*8+8]

xgb_lr=pd.DataFrame(x_new)
xgb_lr.columns=['custom_cost', 'accuracy_score', 'average_precision_score', 'f1_score', 'log_loss',
                'precision_score', 'recall_score', 'roc_auc_score']

In [78]:
xgb_lr

Unnamed: 0,custom_cost,accuracy_score,average_precision_score,f1_score,log_loss,precision_score,recall_score,roc_auc_score
0,0.83597,0.568055,1206.826628,0.466452,5.665427,0.64643,0.364867,0.658029
1,0.837104,0.574375,1148.919195,0.481736,5.626274,0.642692,0.385253,0.666434
2,0.837589,0.577706,1115.196897,0.490098,5.609495,0.639768,0.397181,0.671242
3,0.837078,0.577483,1098.954143,0.493051,5.62716,0.634514,0.403166,0.673185
4,0.836388,0.575764,1099.212411,0.492075,5.651007,0.630972,0.403296,0.672804


In [85]:
xg_reg03 = xgb.XGBClassifier(nthread=-1, n_estimators=200, learning_rate=0.4, reg_lambda=0.3 )
xg_reg05 = xgb.XGBClassifier(nthread=-1, n_estimators=200, learning_rate=0.4, reg_lambda=0.5)
xg_reg07 = xgb.XGBClassifier(nthread=-1, n_estimators=200, learning_rate=0.4, reg_lambda=0.7)
xg_reg09 = xgb.XGBClassifier(nthread=-1, n_estimators=200, learning_rate=0.4, reg_lambda=0.9)

xgb_est = compare_models([xg_reg03,xg_reg05,xg_reg07,xg_reg09],
               ['xg_n100','xg_n150','xg_n200','xg_n250'],
               [custom_cost, accuracy_score, average_precision_score, f1_score, log_loss,
                precision_score, recall_score, roc_auc_score])

In [86]:
x_new = np.zeros((xgb_est.shape[0],8))
x = xgb_est.values
for i in xrange(x.shape[0]):
    x_new[i,:]=x[i,i*8:i*8+8]

xgb_est=pd.DataFrame(x_new)
xgb_est.columns=['custom_cost', 'accuracy_score', 'average_precision_score', 'f1_score', 'log_loss',
                'precision_score', 'recall_score', 'roc_auc_score']

In [87]:
xgb_est

Unnamed: 0,custom_cost,accuracy_score,average_precision_score,f1_score,log_loss,precision_score,recall_score,roc_auc_score
0,0.836609,0.576145,1101.554722,0.491848,5.643353,0.632465,0.402386,0.672598
1,0.836925,0.577161,1098.123935,0.493031,5.63246,0.63359,0.403513,0.67322
2,0.836601,0.575923,1104.467269,0.491188,5.643647,0.632857,0.401345,0.672199
3,0.836609,0.576061,1102.765087,0.491579,5.643352,0.632646,0.401952,0.672434


In [99]:
xg_reg03 = xgb.XGBClassifier(nthread=-1, n_estimators=200, learning_rate=0.4, reg_lambda=0.5, colsample_bytree=0.3)
xg_reg05 = xgb.XGBClassifier(nthread=-1, n_estimators=200, learning_rate=0.4, reg_lambda=0.5, colsample_bytree=0.5)
xg_reg07 = xgb.XGBClassifier(nthread=-1, n_estimators=200, learning_rate=0.4, reg_lambda=0.5, colsample_bytree=0.7)
xg_reg09 = xgb.XGBClassifier(nthread=-1, n_estimators=200, learning_rate=0.4, reg_lambda=0.5, colsample_bytree=0.9)

xgb_est = compare_models([xg_reg03,xg_reg05,xg_reg07,xg_reg09],
               ['xg_n100','xg_n150','xg_n200','xg_n250','xg_n300'],
               [custom_cost, accuracy_score, average_precision_score, f1_score, log_loss,
                precision_score, recall_score, roc_auc_score])

In [100]:
x_new = np.zeros((xgb_est.shape[0],8))
x = xgb_est.values
for i in xrange(x.shape[0]):
    x_new[i,:]=x[i,i*8:i*8+8]

xgb_est=pd.DataFrame(x_new)
xgb_est.columns=['custom_cost', 'accuracy_score', 'average_precision_score', 'f1_score', 'log_loss',
                'precision_score', 'recall_score', 'roc_auc_score']

In [101]:
xgb_est

Unnamed: 0,custom_cost,accuracy_score,average_precision_score,f1_score,log_loss,precision_score,recall_score,roc_auc_score
0,0.836788,0.575355,1119.912206,0.487967,5.637169,0.636218,0.395749,0.670203
1,0.836882,0.576434,1107.240027,0.490942,5.633931,0.63475,0.40026,0.671965
2,0.836643,0.575471,1112.659393,0.48941,5.642174,0.634323,0.398395,0.671112
3,0.836925,0.576633,1105.870269,0.491306,5.632459,0.634765,0.400737,0.672172


In [105]:
float(sum(y_train))/float(len(y_train))

0.19294304167397278

In [261]:
def custom_loss(y_hat, y):
    a = 0.18
    b = 0.82
    p = 1.0 / (1.0 + np.exp(-1.0*y_hat))
    grad = -a*p*y+a*y+b*p*y-b*p
    hess = (-a*y+b*y-b)*p*(1-p)
    
    return grad, hess

def evalerror(preds, dtrain):
    y_hat = preds
    y_true = dtrain.get_label()
    return 'error', custom_cost(y_true, y_hat)

In [273]:
xg = xgb.XGBClassifier(nthread=-1,  n_estimators=200, learning_rate=0.4,
                       reg_lambda=0.5, objective=custom_loss)
xg.objective

<function __main__.custom_loss>

In [274]:
#eval_set = [(X_train, y_train), (X_test, y_test)]
_ = xg.fit(X_train, y_train)#,
       #eval_metric=evalerror, early_stopping_rounds=5, eval_set = eval_set)

In [275]:
compare_models([xg],
               ['xg'],
               [custom_cost, accuracy_score, average_precision_score, f1_score, log_loss,
                precision_score, recall_score, roc_auc_score], fit=False)

Unnamed: 0_level_0,accuracy_score,average_precision_score,custom_cost,f1_score,log_loss,precision_score,recall_score,roc_auc_score
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
xg,0.836925,0.577161,1098.123935,0.493031,5.63246,0.63359,0.403513,0.67322
