<br>
<br>
**<font size=5><center>Predicting Default Rates for Lending Club</center></font>**

### Authors:
Devon Luongo <br>
Ankit Agarwal <br>
Bryn Clarke <br>
Ben Yuen

# IV. Baseline Models

*Libraries*

In [None]:
import pandas as pd
import numpy as np

import matplotlib as mpl
import matplotlib.pyplot as plt
from mpl_toolkits.basemap import Basemap
from matplotlib.patches import Polygon
import matplotlib.cm as cmx
import matplotlib.colors as colors
import scipy
%matplotlib inline


#new imports for milestone 4
import StringIO
from IPython.display import Image
import pydotplus
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.metrics import accuracy_score, f1_score, make_scorer, roc_auc_score

from sklearn.linear_model import LogisticRegression
from sklearn.metrics  import confusion_matrix
import itertools
from sklearn.cross_validation import train_test_split
import pickle

In [None]:
# Sanity-check output; written as a call so it parses on Python 2 and Python 3.
print("hi")

In [None]:
# Load the label frame and the imputed feature frame from ./data.
# (The un-imputed features live in ./data/df_X.pkl; they are not loaded here.)
# NOTE: unpickling can execute arbitrary code -- only load trusted files.
df_y = pd.read_pickle("./data/df_y.pkl")
with open('./data/df_X_imputed.pkl', 'rb') as imputed_file:
    df_X = pickle.load(imputed_file)

In [None]:
# Restore the train/test arrays previously saved under ./data.
X_train, X_test, y_train, y_test = map(
    np.load,
    (
        "./data/X_train.npy",
        "./data/X_test.npy",
        "./data/y_train.npy",
        "./data/y_test.npy",
    ),
)

In [None]:
# Small sample for quick experiments.  Features and labels must come from the
# SAME rows; the labels were previously taken from rows 1000-1199 while the
# features came from rows 0-999, leaving the pair misaligned (and of
# different length).
X_sm = df_X.values[:1000, :]
y_sm = df_y.values[:1000]

# Call form of print works on both Python 2 and Python 3.
print(df_y.values.shape)

In [None]:
# Hold out 20% of the rows for testing.  This replaces any X_train/X_test/
# y_train/y_test bound earlier in the notebook.
# A fixed random_state makes the split -- and every score computed from it --
# reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    df_X.values, df_y.values, test_size=0.2, random_state=0)

In [None]:
# Baseline models
# All positive (label every applicant as default)
class Pos_model(object):
    """Baseline classifier that labels every applicant as a default (1)."""

    def fit(self, X, y):
        """Do nothing: this baseline has no parameters to learn.

        Returns ``self`` so that calls can be chained.
        """
        return self

    def predict(self, x):
        """Return an integer array of ones with one entry per row of ``x``."""
        return np.ones(len(x), dtype=int)

    def score(self, x, y):
        """Return the accuracy of the all-ones prediction against ``y``.

        ``y`` is flattened before the comparison.  Without that, a label
        column of shape (n, 1) broadcasts against the (n,) predictions into
        an n-by-n matrix and the resulting "accuracy" can exceed 1.

        Raises:
            ValueError: if ``y`` does not hold one label per row of ``x``.
        """
        y_true = np.asarray(y).ravel()
        y_pred = self.predict(x)
        if len(y_true) != len(y_pred):
            raise ValueError("x and y must contain the same number of samples")
        return float(np.mean(y_true == y_pred))

# Build the all-positive baseline and run it once on the current split.
pos_model = Pos_model()
pos_model.fit(X_train, y_train)
# Last expression in the cell, so the notebook echoes the predicted labels.
pos_model.predict(X_test)

In [None]:
# All negative (label every applicant as not default)
class Neg_model(object):
    """Baseline classifier that labels every applicant as not default (0)."""

    def fit(self, X, y):
        """Do nothing: this baseline has no parameters to learn.

        Returns ``self`` so that calls can be chained.
        """
        return self

    def predict(self, x):
        """Return an integer array of zeros with one entry per row of ``x``."""
        return np.zeros(len(x), dtype=int)

    def score(self, x, y):
        """Return the accuracy of the all-zeros prediction against ``y``.

        ``y`` is flattened before the comparison.  Without that, a label
        column of shape (n, 1) broadcasts against the (n,) predictions into
        an n-by-n matrix and the resulting "accuracy" can exceed 1.

        Raises:
            ValueError: if ``y`` does not hold one label per row of ``x``.
        """
        y_true = np.asarray(y).ravel()
        y_pred = self.predict(x)
        if len(y_true) != len(y_pred):
            raise ValueError("x and y must contain the same number of samples")
        return float(np.mean(y_true == y_pred))

# Build the all-negative baseline and run it once on the current split.
neg_model = Neg_model()
neg_model.fit(X_train, y_train)
# Last expression in the cell, so the notebook echoes the predicted labels.
neg_model.predict(X_test)

In [None]:
# Random (randomly predict default or not default)
class Random_model(object):
    """Baseline classifier that guesses 0 or 1 uniformly at random."""

    def fit(self, X, y):
        """Do nothing: this baseline has no parameters to learn.

        Returns ``self`` so that calls can be chained.
        """
        return self

    def predict(self, x):
        """Return one random 0/1 label per row of ``x``.

        Labels are drawn from NumPy's global random state, so call
        ``np.random.seed`` beforehand for repeatable output.
        """
        return np.random.randint(0, 2, len(x))

    def score(self, x, y):
        """Return the accuracy of a fresh random prediction against ``y``.

        ``y`` is flattened before the comparison.  Without that, a label
        column of shape (n, 1) broadcasts against the (n,) predictions into
        an n-by-n matrix and the resulting "accuracy" can exceed 1.

        Raises:
            ValueError: if ``y`` does not hold one label per row of ``x``.
        """
        y_true = np.asarray(y).ravel()
        y_pred = self.predict(x)
        if len(y_true) != len(y_pred):
            raise ValueError("x and y must contain the same number of samples")
        return float(np.mean(y_true == y_pred))

# Build the coin-flip baseline and run it once on the current split.
random_model = Random_model()
random_model.fit(X_train, y_train)
# Last expression in the cell, so the notebook echoes the predicted labels.
random_model.predict(X_test)

In [None]:
def custom_cost(y, y_pred,
                cost_fixed_application=10.0,
                cost_fixed_servicing=100.0,
                cost_default=15000.0,
                cost_interest=-1000.0):
    """Return the total cost of acting on ``y_pred`` when the truth is ``y``.

    Label 1 means "default" (the loan is declined when predicted) and label 0
    means "pays on time" (the loan is issued when predicted).  Entries with
    any other label contribute nothing to the total.

    Args:
        y: true labels; any array-like, flattened before use.
        y_pred: predicted labels; any array-like, flattened before use.
        cost_fixed_application: cost of processing one application, paid for
            every applicant.
        cost_fixed_servicing: cost of servicing one issued loan.
        cost_default: loss on an issued loan that defaults.
        cost_interest: "cost" of an issued loan that is repaid; negative
            because interest is a gain.

    Returns:
        The summed cost as a float (lower is better).

    Raises:
        ValueError: if ``y`` and ``y_pred`` differ in length.
    """
    # Flattening accepts plain lists and (n, 1) columns alike and rules out
    # accidental n-by-n broadcasting between a column and a 1-D vector.
    y_true = np.asarray(y).ravel()
    y_hat = np.asarray(y_pred).ravel()
    if y_true.shape != y_hat.shape:
        raise ValueError("y and y_pred must contain the same number of labels")

    would_default = (y_true == 1)
    would_repay = (y_true == 0)
    declined = (y_hat == 1)
    approved = (y_hat == 0)

    # TRUE POSITIVE: predicted default, applicant would default.
    # FALSE POSITIVE: predicted default, applicant would have paid on time.
    # Either way the loan is declined and only processing is paid.
    n_true_positive = np.count_nonzero(would_default & declined)
    n_false_positive = np.count_nonzero(would_repay & declined)
    cost_declined = (n_true_positive + n_false_positive) * cost_fixed_application

    # TRUE NEGATIVE: predicted on-time payment, applicant pays on time.
    # Processing and servicing are paid, and interest is earned.
    n_true_negative = np.count_nonzero(would_repay & approved)
    cost_true_negative = n_true_negative * (
        cost_fixed_application + cost_fixed_servicing + cost_interest)

    # FALSE NEGATIVE: predicted on-time payment, applicant defaults.
    # Processing and servicing are paid, and the default loss is incurred.
    n_false_negative = np.count_nonzero(would_default & approved)
    cost_false_negative = n_false_negative * (
        cost_fixed_application + cost_fixed_servicing + cost_default)

    return float(cost_declined + cost_true_negative + cost_false_negative)

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import average_precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import log_loss
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_auc_score

def compare_models(models, labels, scoring_funcs, train=None, test=None):
    """Fit each model once and tabulate its score under every metric.

    Args:
        models: estimators exposing ``fit(X, y)`` and ``predict(X)``.
        labels: one display name per model, in the same order as ``models``.
        scoring_funcs: callables invoked as ``func(y_true, y_pred)``; each
            one's ``__name__`` becomes a column header.
        train: optional ``(X, y)`` pair to fit on.  Defaults to the
            notebook-level ``X_train`` / ``y_train``.
        test: optional ``(X, y)`` pair to evaluate on.  Defaults to the
            notebook-level ``X_test`` / ``y_test``.

    Returns:
        A DataFrame indexed by "Model" with one column per scoring function.

    Raises:
        ValueError: if ``models`` and ``labels`` differ in length.
    """
    if len(models) != len(labels):
        raise ValueError("models and labels must have the same length")

    # Fall back to the notebook's global split when no data is passed in.
    X_fit, y_fit = (X_train, y_train) if train is None else train
    X_eval, y_eval = (X_test, y_test) if test is None else test

    # Fit and predict once per model, then reuse those predictions for every
    # metric.  Refitting per metric multiplies the training time by the number
    # of metrics and scores stochastic models on different predictions in
    # each column.
    # NOTE: every metric receives hard class labels from predict(), including
    # metrics that are normally fed probabilities or decision scores.
    rows = []
    for model in models:
        model.fit(X_fit, y_fit)
        y_pred = model.predict(X_eval)
        rows.append([scoring_func(y_eval, y_pred) for scoring_func in scoring_funcs])

    metric_names = [scoring_func.__name__ for scoring_func in scoring_funcs]
    return pd.DataFrame(rows,
                        index=pd.Index(list(labels), name="Model"),
                        columns=metric_names)
    
# Score the three naive baselines under every metric.
baseline_models = [pos_model, neg_model, random_model]
baseline_labels = [
    "Positive Model (All Default)",
    "Negative Model (None Default)",
    "Random Model (Half Default)",
]
baseline_metrics = [
    custom_cost,
    accuracy_score,
    average_precision_score,
    f1_score,
    log_loss,
    precision_score,
    recall_score,
    roc_auc_score,
]
compare_models(baseline_models, baseline_labels, baseline_metrics)

# V. Prediction Models

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.tree import DecisionTreeClassifier as DecisionTree
from sklearn.ensemble import RandomForestClassifier as RandomForest
from sklearn.svm import SVC

# Candidate classifiers.  The variable names are kept so that other cells can
# still refer to individual models.
log1 = LogisticRegression(penalty='l2', C=1.0, class_weight=None)
log2 = LogisticRegression(penalty='l2', C=1.0, class_weight='balanced')
lda1 = LDA(priors=None)
lda2 = LDA(priors=[0.5, 0.5])
lda3 = LDA(priors=[0.2, 0.8])
qda1 = QDA(priors=None, reg_param=0.0)
qda2 = QDA(priors=[0.2, 0.8], reg_param=0.0)
qda3 = QDA(priors=[0.2, 0.8], reg_param=0.9)
knn1 = KNN(n_neighbors=1, weights='uniform', p=2)
knn2 = KNN(n_neighbors=5, weights='uniform', p=2)
knn3 = KNN(n_neighbors=5, weights='distance', p=2)
knn4 = KNN(n_neighbors=5, weights='uniform', p=1)
tree1 = DecisionTree(criterion='gini', max_depth=3, class_weight=None)
tree2 = DecisionTree(criterion='entropy', max_depth=3, class_weight=None)
tree3 = DecisionTree(criterion='gini', max_depth=10, class_weight=None)
tree4 = DecisionTree(criterion='gini', max_depth=3, class_weight='balanced')
rf1 = RandomForest(n_estimators=10, max_depth=3)
rf2 = RandomForest(n_estimators=10, max_depth=3, class_weight='balanced')
rf3 = RandomForest(n_estimators=10, max_depth=1, class_weight='balanced')
svc1 = SVC(C=1.0, kernel='linear', class_weight=None)
svc2 = SVC(C=1.0, kernel='linear', class_weight='balanced')
svc3 = SVC(C=0.5, kernel='linear', class_weight='balanced')
svc4 = SVC(C=2.0, kernel='linear', class_weight='balanced')
svc5 = SVC(C=1.0, kernel='rbf', class_weight='balanced')

# Each label sits next to its model so the two cannot drift apart the way two
# parallel lists can.
labelled_models = [
    ("Logistic Regression (unweighted)", log1),
    ("Logistic Regression (balanced)", log2),
    ("LDA (no priors)", lda1),
    ("LDA (equal weight priors)", lda2),
    ("LDA (penalizing priors)", lda3),
    ("QDA (no priors/no reg)", qda1),
    ("QDA (penal. priors/no reg)", qda2),
    ("QDA (penal. priors/with reg)", qda3),
    ("1-NN (uniform L2 weights)", knn1),
    ("5-NN (uniform L2 weights)", knn2),
    ("5-NN (distance L2 weights)", knn3),
    ("5-NN (uniform L1 weights)", knn4),
    ("DTree (gini/3-depth)", tree1),
    ("DTree (entropy/3-depth)", tree2),
    ("DTree (gini/10-depth)", tree3),
    # tree4 is built with max_depth=3; its label used to claim depth 10.
    ("DTree (gini/3-depth/balanced)", tree4),
    ("RForest (10 est/3-depth)", rf1),
    ("RForest (10 est/3-depth/balanced)", rf2),
    ("RForest (10 est/1-depth/balanced)", rf3),
    ("SVC (linear/C=1.0)", svc1),
    ("SVC (linear/C=1.0/balanced)", svc2),
    ("SVC (linear/C=0.5/balanced)", svc3),
    ("SVC (linear/C=2.0/balanced)", svc4),
    ("SVC (rbf/C=1.0/balanced)", svc5),
]
model_labels, model_list = zip(*labelled_models)

compare_models(list(model_list),
               list(model_labels),
               [custom_cost, accuracy_score, average_precision_score, f1_score, log_loss, precision_score, recall_score, roc_auc_score])

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from itertools import cycle
from sklearn import svm, datasets
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import label_binarize
from sklearn.multiclass import OneVsRestClassifier
from scipy import interp

# Fit a one-vs-rest logistic regression and collect its decision scores.
# NOTE(review): x_train_sm / y_train_sm / x_test_sm are not defined in the
# visible part of this notebook -- confirm where they come from.

# One L2-penalised logistic regression is wrapped per class.
base_estimator = LogisticRegression(penalty='l2',
                                    fit_intercept=True,
                                    solver='lbfgs',
                                    multi_class='multinomial')
classifier = OneVsRestClassifier(base_estimator)
classifier.fit(x_train_sm, y_train_sm)

# Decision scores on the held-out rows.
y_score = classifier.decision_function(x_test_sm)

In [None]:

import timeit

# Time the ROC computation below.
start = timeit.default_timer()

# Compute ROC curve and ROC area for each class.
# The class count is read from the score matrix instead of being hard-coded
# to 10, so the loop also works when the problem has fewer or more classes.
fpr = {}
tpr = {}
roc_auc = {}
n_classes = y_score.shape[1]
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_test_sm[:, i], y_score[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# Compute micro-average ROC curve and ROC area: every (sample, class) pair is
# pooled into a single binary problem.
fpr["micro"], tpr["micro"], _ = roc_curve(y_test_sm.ravel(), y_score.ravel())
roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])


stop = timeit.default_timer()

# Elapsed seconds; the call form of print works on Python 2 and Python 3.
print(stop - start)

In [None]:
# Draw the ROC curve stored under key 2 together with the chance diagonal.
plt.figure()
lw = 2

# Curve for class index 2, with its AUC shown in the legend entry.
roc_label = 'ROC curve (area = %0.2f)' % roc_auc[2]
plt.plot(fpr[2], tpr[2], color='darkorange', lw=lw, label=roc_label)

# Dashed diagonal for reference.
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')

# Axis limits; a little headroom is left above TPR = 1.
plt.xlim(0.0, 1.0)
plt.ylim(0.0, 1.05)

plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic example')
plt.legend(loc="lower right")
plt.show()

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from itertools import cycle
from sklearn import svm, datasets
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import label_binarize
from sklearn.multiclass import OneVsRestClassifier
from scipy import interp

# Compute ROC curves for a one-vs-rest linear SVC.
# NOTE(review): x_train_sm / y_train_sm / x_test_sm are not defined in the
# visible part of this notebook -- confirm where they come from.

# Learn to predict each class against the other.
# NOTE(review): probability=True looks unnecessary here because only
# decision_function is used below -- confirm before removing.
classifier = OneVsRestClassifier(svm.SVC(kernel='linear', probability=True))
y_score = classifier.fit(x_train_sm, y_train_sm).decision_function(x_test_sm)

# Compute ROC curve and ROC area for each class.
# The class count is read from the score matrix instead of being hard-coded
# to 10, so the loop also works when the problem has fewer or more classes.
fpr = {}
tpr = {}
roc_auc = {}
n_classes = y_score.shape[1]
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_test_sm[:, i], y_score[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# Compute micro-average ROC curve and ROC area: every (sample, class) pair is
# pooled into a single binary problem.
fpr["micro"], tpr["micro"], _ = roc_curve(y_test_sm.ravel(), y_score.ravel())
roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

In [None]:
# Draw the ROC curve stored under key 2 together with the chance diagonal.
plt.figure()
lw = 2

# Curve for class index 2, with its AUC shown in the legend entry.
curve_label = 'ROC curve (area = %0.2f)' % roc_auc[2]
plt.plot(fpr[2], tpr[2], color='darkorange', lw=lw, label=curve_label)

# Dashed diagonal for reference.
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')

# Axis limits; a little headroom is left above TPR = 1.
plt.xlim(0.0, 1.0)
plt.ylim(0.0, 1.05)

plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic example')
plt.legend(loc="lower right")
plt.show()