<br>
<br>
**<font size=5><center>Predicting Default Rates for Lending Club</center></font>**

### Authors:
Devon Luongo <br>
Ankit Agarwal <br>
Bryn Clarke <br>
Ben Yuen

# IV. Baseline Models

*Libraries*

In [1]:
import pandas as pd
import numpy as np

In [2]:
df_X = pd.read_pickle("./data/df_X.pkl")
df_y = pd.read_pickle("./data/df_y.pkl")

In [3]:
X_train = np.load("./data/X_train.npy")
X_test = np.load("./data/X_test.npy")
y_train = np.load("./data/y_train.npy")
y_test = np.load("./data/y_test.npy")

In [4]:
# Baseline models
# All positive (label every applicant as default)
class Pos_model(object):
    def fit(self, X, y):
        return
    
    def predict(self, x):
        return np.array([1] * len(x))
    
    def score(self, x, y):
        y_pred = self.predict(x)
        y_err = y - y_pred
        return len(y_err[y_err == 0]) * 1. / len(y_err)

pos_model = Pos_model()
pos_model.fit(X_train, y_train)
pos_model.predict(X_test)

array([1, 1, 1, ..., 1, 1, 1])

In [5]:
# All negative (label every applicant as not default)
class Neg_model(object):
    def fit(self, X, y):
        return
    
    def predict(self, x):
        return np.array([0] * len(x))
    
    def score(self, x, y):
        y_pred = self.predict(x)
        y_err = y - y_pred
        return len(y_err[y_err == 0]) * 1. / len(y_err)

neg_model = Neg_model()
neg_model.fit(X_train, y_train)
neg_model.predict(X_test)

array([0, 0, 0, ..., 0, 0, 0])

In [6]:
# Random (randomly predict flu or not flu)
class Random_model(object):
    def fit(self, X, y):
        return
    
    def predict(self, x):
        return np.random.randint(0, 2, len(x))
    
    def score(self, x, y):
        y_pred = self.predict(x)
        y_err = y - y_pred
        return len(y_err[y_err == 0]) * 1. / len(y_err)

random_model = Random_model()
random_model.fit(X_train, y_train)
random_model.predict(X_test)

array([1, 1, 0, ..., 0, 0, 0])

In [7]:
def custom_cost(y, y_pred):
    cost_fixed_application = 10.0
    cost_fixed_servicing = 100.0
    cost_default = 15000.0
    cost_interest = -1000.0
    
    # TRUE POSITIVE: Predict default, applicant would default
    n_true_positive = sum((y==1) & (y_pred==1))
    # We only pay fixed application processing costs as we decline the loan
    cost_true_positive = n_true_positive*(cost_fixed_application)
    
    # TRUE NEGATIVE: Predict pay on time, applicant will pay on time
    n_true_negative = sum((y==0) & (y_pred==0))
    # We pay fixed application processing costs, costs of servicing the loan, and gain profit on interest payments
    cost_true_negative = n_true_negative*(cost_fixed_application + cost_fixed_servicing + cost_interest)
    
    # FALSE POSITIVE: Predict default, applicant would pay on time
    n_false_positive = sum((y==0) & (y_pred==1))
    # We only pay fixed application processing costs as we decline the loan
    cost_false_positive = n_false_positive*(cost_fixed_application)
    
    # FALSE NEGATIVE: Predict pay on time, applicant will default
    n_false_negative = sum((y==1) & (y_pred==0))
    # We pay fixed application processing costs, costs of servicing the loan, and the principal as the loan defaults
    cost_false_negative = n_false_negative*(cost_fixed_application + cost_fixed_servicing + cost_default)
    
    return cost_true_positive + cost_true_negative + cost_false_positive + cost_false_negative

In [8]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import average_precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import log_loss
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_auc_score

def compare_models(models, labels, scoring_funcs):
    all_scores = []
    
    for scoring_func in scoring_funcs:    
        scores = []
    
        for model in models:
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            scores.append(scoring_func(y_test, y_pred))
    
        res = pd.DataFrame({"Model": labels, scoring_func.__name__: scores})
        res.set_index("Model", inplace=True)
        all_scores.append(res)
    
    return pd.concat(all_scores, axis=1)
    
compare_models([pos_model, neg_model, random_model],
               ["Positive Model (All Default)", "Negative Model (None Default)", "Random Model (Half Default)"],
               [custom_cost, accuracy_score, average_precision_score, f1_score, log_loss, precision_score, recall_score, roc_auc_score])

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Unnamed: 0_level_0,custom_cost,accuracy_score,average_precision_score,f1_score,log_loss,precision_score,recall_score,roc_auc_score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Positive Model (All Default),1173200.0,0.196514,0.598257,0.328477,27.752072,0.196514,1.0,0.5
Negative Model (None Default),264465200.0,0.803486,0.598257,0.0,6.787346,0.0,0.0,0.5
Random Model (Half Default),132893100.0,0.499957,0.397321,0.283777,17.276186,0.196172,0.497202,0.500884


# V. Prediction Models

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.tree import DecisionTreeClassifier as DecisionTree
from sklearn.ensemble import RandomForestClassifier as RandomForest
from sklearn.svm import SVC

log1 = LogisticRegression(penalty='l2', C=1.0, class_weight=None)
log2 = LogisticRegression(penalty='l2', C=1.0, class_weight='balanced')
lda1 = LDA(priors=None)
lda2 = LDA(priors=[0.5, 0.5])
lda3 = LDA(priors=[0.2, 0.8])
qda1 = QDA(priors=None, reg_param=0.0)
qda2 = QDA(priors=[0.2, 0.8], reg_param=0.0)
qda3 = QDA(priors=[0.2, 0.8], reg_param=0.9)
knn1 = KNN(n_neighbors=1, weights='uniform', p=2)
knn2 = KNN(n_neighbors=5, weights='uniform', p=2)
knn3 = KNN(n_neighbors=5, weights='distance', p=2)
knn4 = KNN(n_neighbors=5, weights='uniform', p=1)
tree1 = DecisionTree(criterion='gini', max_depth=3, class_weight=None)
tree2 = DecisionTree(criterion='entropy', max_depth=3, class_weight=None)
tree3 = DecisionTree(criterion='gini', max_depth=10, class_weight=None)
tree4 = DecisionTree(criterion='gini', max_depth=3, class_weight='balanced')
rf1 = RandomForest(n_estimators=10, max_depth=3)
rf2 = RandomForest(n_estimators=10, max_depth=3, class_weight='balanced')
rf3 = RandomForest(n_estimators=10, max_depth=1, class_weight='balanced')
svc1 = SVC(C=1.0, kernel='linear', class_weight=None)
svc2 = SVC(C=1.0, kernel='linear', class_weight='balanced')
svc3 = SVC(C=0.5, kernel='linear', class_weight='balanced')
svc4 = SVC(C=2.0, kernel='linear', class_weight='balanced')
svc5 = SVC(C=1.0, kernel='rbf', class_weight='balanced')

compare_models([log1, log2,
                lda1, lda2, lda3,
                qda1, qda2, qda3,
                knn1, knn2, knn3, knn4,
                tree1, tree2, tree3, tree4,
                rf1, rf2, rf3,
                svc1, svc2, svc3, svc4, svc5],
               ["Logistic Regression (unweighted)",
                "Logistic Regression (balanced)",
                "LDA (no priors)",
                "LDA (equal weight priors)",
                "LDA (penalizing priors)",
                "QDA (no priors/no reg)",
                "QDA (penal. priors/no reg)",
                "QDA (penal. priors/with reg)",
                "1-NN (uniform L2 weights)",
                "5-NN (uniform L2 weights)",
                "5-NN (distance L2 weights)",
                "5-NN (uniform L1 weights)",
                "DTree (gini/3-depth)",
                "DTree (entropy/3-depth)",
                "DTree (gini/10-depth)",
                "DTree (gini/10-depth/balanced)",
                "RForest (10 est/3-depth)",
                "RForest (10 est/3-depth/balanced)",
                "RForest (10 est/1-depth/balanced)",
                "SVC (linear/C=1.0)",
                "SVC (linear/C=1.0/balanced)",
                "SVC (linear/C=0.5/balanced)",
                "SVC (linear/C=2.0/balanced)",
                "SVC (rbf/C=1.0/balanced)"],
               [custom_cost, accuracy_score, average_precision_score, f1_score, log_loss, precision_score, recall_score, roc_auc_score])

In [None]:
from sklearn.ensemble import RandomForestClassifier as RandomForest
from sklearn.grid_search import GridSearchCV
import time

msk = np.random.rand(len(X_train)) < 0.2
X_train2 = X_train[msk]
y_train2 = y_train[msk]


def scoring(estimator, X, y):
    y_pred = estimator.predict(X)
    return custom_cost(y, y_pred)

# build a classifier
clf = RandomForest(n_estimators=200)

param_grid = {"max_depth": [3, 5, None],
              "max_features": [1, 3, 10, 100],
              "min_samples_split": [10, 30, 100],
              "min_samples_leaf": [10, 30, 100],
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}

# run grid search
grid_search = GridSearchCV(clf, param_grid=param_grid, scoring=scoring, verbose=5, n_jobs=4)
start = time.time()
grid_search.fit(X_train2, y_train2)

print("GridSearchCV took %.2f seconds for %d candidate parameter settings."
      % (time.time() - start, len(grid_search.cv_results_['params'])))

Fitting 3 folds for each of 432 candidates, totalling 1296 fits
[CV] bootstrap=True, min_samples_leaf=10, max_features=1, criterion=gini, min_samples_split=10, max_depth=3 
[CV] bootstrap=True, min_samples_leaf=10, max_features=1, criterion=gini, min_samples_split=10, max_depth=3 
[CV] bootstrap=True, min_samples_leaf=10, max_features=1, criterion=gini, min_samples_split=10, max_depth=3 
[CV] bootstrap=True, min_samples_leaf=10, max_features=1, criterion=gini, min_samples_split=30, max_depth=3 
[CV]  bootstrap=True, min_samples_leaf=10, max_features=1, criterion=gini, min_samples_split=10, max_depth=3, score=39291030.000000 -   2.2s
[CV] bootstrap=True, min_samples_leaf=10, max_features=1, criterion=gini, min_samples_split=30, max_depth=3 
[CV]  bootstrap=True, min_samples_leaf=10, max_features=1, criterion=gini, min_samples_split=10, max_depth=3, score=39291030.000000 -   2.3s
[CV] bootstrap=True, min_samples_leaf=10, max_features=1, criterion=gini, min_samples_split=30, max_depth=3 


[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:    7.6s


[CV]  bootstrap=True, min_samples_leaf=10, max_features=1, criterion=gini, min_samples_split=100, max_depth=3, score=39291030.000000 -   2.9s
[CV] bootstrap=True, min_samples_leaf=30, max_features=1, criterion=gini, min_samples_split=100, max_depth=3 
[CV]  bootstrap=True, min_samples_leaf=30, max_features=1, criterion=gini, min_samples_split=30, max_depth=3, score=39306140.000000 -   2.5s
[CV] bootstrap=True, min_samples_leaf=30, max_features=1, criterion=gini, min_samples_split=100, max_depth=3 
[CV]  bootstrap=True, min_samples_leaf=30, max_features=1, criterion=gini, min_samples_split=100, max_depth=3, score=39306140.000000 -   2.1s
[CV]  bootstrap=True, min_samples_leaf=30, max_features=1, criterion=gini, min_samples_split=30, max_depth=3, score=39291030.000000 -   2.4s
[CV] bootstrap=True, min_samples_leaf=30, max_features=1, criterion=gini, min_samples_split=100, max_depth=3 
[CV] bootstrap=True, min_samples_leaf=100, max_features=1, criterion=gini, min_samples_split=10, max_dep

[Parallel(n_jobs=4)]: Done  64 tasks      | elapsed:   57.6s


[CV]  bootstrap=True, min_samples_leaf=30, max_features=10, criterion=gini, min_samples_split=10, max_depth=3, score=39291030.000000 -   8.2s
[CV] bootstrap=True, min_samples_leaf=30, max_features=10, criterion=gini, min_samples_split=30, max_depth=3 
[CV]  bootstrap=True, min_samples_leaf=30, max_features=10, criterion=gini, min_samples_split=10, max_depth=3, score=39291030.000000 -   7.8s
[CV] bootstrap=True, min_samples_leaf=30, max_features=10, criterion=gini, min_samples_split=100, max_depth=3 
[CV]  bootstrap=True, min_samples_leaf=30, max_features=10, criterion=gini, min_samples_split=30, max_depth=3, score=39306140.000000 -   8.5s
[CV] bootstrap=True, min_samples_leaf=30, max_features=10, criterion=gini, min_samples_split=100, max_depth=3 
[CV]  bootstrap=True, min_samples_leaf=30, max_features=10, criterion=gini, min_samples_split=30, max_depth=3, score=39291030.000000 -   8.3s
[CV] bootstrap=True, min_samples_leaf=30, max_features=10, criterion=gini, min_samples_split=100, ma

[Parallel(n_jobs=4)]: Done 280 tasks      | elapsed: 19.8min


[CV]  bootstrap=True, min_samples_leaf=30, max_features=10, criterion=gini, min_samples_split=10, max_depth=None, score=39276830.000000 -  15.4s
[CV] bootstrap=True, min_samples_leaf=30, max_features=10, criterion=gini, min_samples_split=100, max_depth=None 
[CV]  bootstrap=True, min_samples_leaf=30, max_features=10, criterion=gini, min_samples_split=30, max_depth=None, score=39291040.000000 -  15.5s
[CV] bootstrap=True, min_samples_leaf=30, max_features=10, criterion=gini, min_samples_split=100, max_depth=None 
[CV]  bootstrap=True, min_samples_leaf=30, max_features=10, criterion=gini, min_samples_split=30, max_depth=None, score=39291030.000000 -  15.0s
[CV] bootstrap=True, min_samples_leaf=30, max_features=10, criterion=gini, min_samples_split=100, max_depth=None 
[CV]  bootstrap=True, min_samples_leaf=30, max_features=10, criterion=gini, min_samples_split=30, max_depth=None, score=39248430.000000 -  15.1s
[CV] bootstrap=True, min_samples_leaf=100, max_features=10, criterion=gini, mi

[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed: 40.8min


[CV]  bootstrap=True, min_samples_leaf=30, max_features=1, criterion=entropy, min_samples_split=30, max_depth=5, score=39306140.000000 -   2.7s
[CV] bootstrap=True, min_samples_leaf=30, max_features=1, criterion=entropy, min_samples_split=30, max_depth=5 
[CV]  bootstrap=True, min_samples_leaf=30, max_features=1, criterion=entropy, min_samples_split=30, max_depth=5, score=39291030.000000 -   2.5s
[CV] bootstrap=True, min_samples_leaf=30, max_features=1, criterion=entropy, min_samples_split=100, max_depth=5 
[CV]  bootstrap=True, min_samples_leaf=30, max_features=1, criterion=entropy, min_samples_split=30, max_depth=5, score=39291030.000000 -   2.5s
[CV] bootstrap=True, min_samples_leaf=30, max_features=1, criterion=entropy, min_samples_split=100, max_depth=5 
[CV]  bootstrap=True, min_samples_leaf=30, max_features=1, criterion=entropy, min_samples_split=100, max_depth=5, score=39306140.000000 -   3.6s
[CV] bootstrap=True, min_samples_leaf=30, max_features=1, criterion=entropy, min_samp

[Parallel(n_jobs=4)]: Done 640 tasks      | elapsed: 68.6min


[CV]  bootstrap=True, min_samples_leaf=30, max_features=100, criterion=entropy, min_samples_split=100, max_depth=None, score=37525930.000000 - 2.2min
[CV] bootstrap=True, min_samples_leaf=100, max_features=100, criterion=entropy, min_samples_split=30, max_depth=None 
[CV]  bootstrap=True, min_samples_leaf=100, max_features=100, criterion=entropy, min_samples_split=10, max_depth=None, score=37839030.000000 - 1.7min
[CV] bootstrap=True, min_samples_leaf=100, max_features=100, criterion=entropy, min_samples_split=100, max_depth=None 
[CV]  bootstrap=True, min_samples_leaf=100, max_features=100, criterion=entropy, min_samples_split=30, max_depth=None, score=38089040.000000 - 1.6min
[CV] bootstrap=True, min_samples_leaf=100, max_features=100, criterion=entropy, min_samples_split=100, max_depth=None 
[CV]  bootstrap=True, min_samples_leaf=100, max_features=100, criterion=entropy, min_samples_split=30, max_depth=None, score=38426430.000000 - 1.7min
[CV] bootstrap=True, min_samples_leaf=100, m

[Parallel(n_jobs=4)]: Done 874 tasks      | elapsed: 92.0min


[CV]  bootstrap=False, min_samples_leaf=30, max_features=1, criterion=gini, min_samples_split=30, max_depth=None, score=39291030.000000 -   3.0s
[CV] bootstrap=False, min_samples_leaf=30, max_features=1, criterion=gini, min_samples_split=100, max_depth=None 
[CV]  bootstrap=False, min_samples_leaf=30, max_features=1, criterion=gini, min_samples_split=30, max_depth=None, score=39291030.000000 -   3.2s
[CV] bootstrap=False, min_samples_leaf=30, max_features=1, criterion=gini, min_samples_split=100, max_depth=None 
[CV]  bootstrap=False, min_samples_leaf=30, max_features=1, criterion=gini, min_samples_split=100, max_depth=None, score=39306140.000000 -   2.9s
[CV] bootstrap=False, min_samples_leaf=30, max_features=1, criterion=gini, min_samples_split=100, max_depth=None 
[CV]  bootstrap=False, min_samples_leaf=30, max_features=1, criterion=gini, min_samples_split=100, max_depth=None, score=39291030.000000 -   3.0s
[CV] bootstrap=False, min_samples_leaf=100, max_features=1, criterion=gini, 

[Parallel(n_jobs=4)]: Done 1144 tasks      | elapsed: 122.9min


[CV]  bootstrap=False, min_samples_leaf=30, max_features=10, criterion=entropy, min_samples_split=10, max_depth=5, score=39291030.000000 -  12.2s
[CV] bootstrap=False, min_samples_leaf=30, max_features=10, criterion=entropy, min_samples_split=100, max_depth=5 
[CV]  bootstrap=False, min_samples_leaf=30, max_features=10, criterion=entropy, min_samples_split=30, max_depth=5, score=39306140.000000 -  11.7s
[CV] bootstrap=False, min_samples_leaf=30, max_features=10, criterion=entropy, min_samples_split=100, max_depth=5 
[CV]  bootstrap=False, min_samples_leaf=30, max_features=10, criterion=entropy, min_samples_split=30, max_depth=5, score=39291030.000000 -  11.9s
[CV] bootstrap=False, min_samples_leaf=30, max_features=10, criterion=entropy, min_samples_split=100, max_depth=5 
[CV]  bootstrap=False, min_samples_leaf=30, max_features=10, criterion=entropy, min_samples_split=30, max_depth=5, score=39291030.000000 -  12.2s
[CV] bootstrap=False, min_samples_leaf=100, max_features=10, criterion=

[Parallel(n_jobs=4)]: Done 1296 out of 1296 | elapsed: 161.5min finished


AttributeError: 'GridSearchCV' object has no attribute 'cv_results_'

In [None]:
print X_train2.shape