# Lending Club Loan Data Modeling

In this section we will attempt to determine the best model to predict whether or not a borrower will default in the Lending Club Loan data.

We will start with a **_Logistic Regression_** model, using **_Forward Selection_** to determine the features. We will then try a **_Support Vector Machine_**, **_Decision Tree_** and **_Random Forest_**.

Afterwards, we'll wrap it all up with a summary of what we have learned.

First, let's import a butt load of packages per usual!

In [3]:
import os
import pandas as pd
import numpy as np
import re
from sklearn.preprocessing import PolynomialFeatures, OneHotEncoder, normalize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

# Cleaning

In [4]:
def features():
    """Return the column names flagged for inclusion in the data dictionary.

    Reads ``data/LCDataDictionary.xlsx``, keeps the rows whose ``Include``
    value equals 1, strips surrounding whitespace from each ``LoanStatNew``
    entry and removes the first ``'id'`` entry from the result.

    Raises:
        ValueError: if ``'id'`` is not among the included names.
    """
    data_dictionary = pd.read_excel(r'data/LCDataDictionary.xlsx')
    included_rows = data_dictionary.query('Include == 1')
    raw_names = included_rows.LoanStatNew.values.tolist()
    cols = [name.strip() for name in raw_names]
    # 'id' is dropped from the selection; list.remove raises if it is absent.
    cols.remove('id')
    return cols


def loan_status_filter(input_df):
    """Return a copy of ``input_df`` restricted to four loan statuses.

    Rows are kept when ``loan_status`` is one of 'Fully Paid', 'Charged Off',
    'Late (31-120 days)' or 'Default'. The caller's frame is left untouched.
    """
    kept_statuses = ['Fully Paid', 'Charged Off', 'Late (31-120 days)', 'Default']
    frame = input_df.copy()
    keep_row = frame['loan_status'].isin(kept_statuses)
    return frame.loc[keep_row, :]

def make_dummy(input_df, column):
    """Replace ``column`` with its one-hot indicator columns.

    The indicators are produced by ``pd.get_dummies`` on a single-column
    frame, so they are named ``<column>_<value>``. They are joined onto a copy
    of ``input_df`` by index and the original ``column`` is then dropped.
    """
    frame = input_df.copy()
    indicators = pd.get_dummies(frame[[column]])
    widened = frame.join(indicators)
    return widened.drop(columns=column)

def mcnulty_preprocessing():
    """Load ``data/loan.csv`` and return the cleaned modelling frame.

    Steps, in order:
      1. Read the CSV and index it by ``id``.
      2. Keep only the loan statuses accepted by ``loan_status_filter``.
      3. Keep only rows whose ``application_type`` is ``"INDIVIDUAL"``.
      4. Select the columns returned by ``features()``.
      5. Parse ``issue_d`` as datetimes, derive the binary ``default`` target
         (0 for 'Fully Paid', 1 for every other remaining status), strip
         whitespace from ``term`` and fill missing ``emp_length`` with
         'Not provided'.
      6. One-hot encode ``term``, ``home_ownership``, ``emp_length``,
         ``grade`` and ``purpose`` via ``make_dummy``.

    Progress messages are printed before and after the work.

    Returns:
        pandas.DataFrame: the preprocessed frame.
    """
    print('Initiating MAXIMUM data munging power')
    df = (pd.read_csv('data/loan.csv', low_memory=False)
          .set_index('id')
          .pipe(loan_status_filter)
          .query('application_type == "INDIVIDUAL"')
          .loc[:, features()]
          # pd.to_datetime instead of astype('datetime64'): the unit-less
          # dtype cast is rejected by pandas 2.x, to_datetime works everywhere.
          .assign(issue_d=lambda x: pd.to_datetime(x.issue_d),
                  default=lambda x: np.where(x.loan_status == 'Fully Paid', 0, 1),
                  term=lambda x: x.term.str.strip(),
                  emp_length=lambda x: x.emp_length.fillna('Not provided'))
          .pipe(make_dummy, 'term')
          .pipe(make_dummy, 'home_ownership')
          .pipe(make_dummy, 'emp_length')
          .pipe(make_dummy, 'grade')
          .pipe(make_dummy, 'purpose'))
    print('Luther Preprocessing Successful Woo Woo!\n')
    return df

In [5]:
# Path of a cleaned-data export. It is not used by the code shown in this
# section; NOTE(review): presumably a later cell reads or writes it - confirm.
fname = os.path.join('data', 'loan_cleaned_2019-08-19T20.20.43.csv')
# Build the modelling frame from the raw CSV.
df = mcnulty_preprocessing()

Initiating MAXIMUM data munging power
Luther Preprocessing Successful Woo Woo!



# Single Feature Logistic Regressions

In [6]:
# Candidate predictors, one inner list per logical feature. A numeric feature
# is a single-column list; a categorical feature lists all of its one-hot
# indicator columns so they are always added to a model together.
independents = [
    ['dti'],
    ['int_rate'],
    ['annual_inc'],
    ['loan_amnt'],
    ['revol_bal'],
    ['term_36 months', 'term_60 months'],
    ['delinq_2yrs'],
    ['home_ownership_ANY', 'home_ownership_MORTGAGE', 'home_ownership_NONE',
     'home_ownership_OTHER', 'home_ownership_OWN', 'home_ownership_RENT'],
    ['grade_A', 'grade_B', 'grade_C', 'grade_D', 'grade_E', 'grade_F', 'grade_G'],
    ['purpose_car', 'purpose_credit_card', 'purpose_debt_consolidation', 'purpose_educational',
     'purpose_home_improvement', 'purpose_house', 'purpose_major_purchase', 'purpose_medical',
     'purpose_moving', 'purpose_other', 'purpose_renewable_energy', 'purpose_small_business',
     'purpose_vacation', 'purpose_wedding'],
    ['emp_length_1 year', 'emp_length_10+ years', 'emp_length_2 years', 'emp_length_3 years',
     'emp_length_4 years', 'emp_length_5 years', 'emp_length_6 years', 'emp_length_7 years',
     'emp_length_8 years', 'emp_length_9 years', 'emp_length_< 1 year', 'emp_length_Not provided']
]
# Name of the binary target column (0 = fully paid, 1 = any other kept status).
dependent = 'default'

In [7]:
# First candidate feature only, with a plain (unstratified) 80/20 split.
# NOTE(review): this rebinds the module-level name `features`, which is also
# the name of the column-selection helper used by the preprocessing step;
# calling that step again after this cell will fail until the helper is
# redefined.
features = independents[0]
y = df[dependent]
X = df[features]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2)

In [8]:
# Fit a single logistic regression on the training split. The bare `fit`
# expression is left as the last statement so the notebook echoes the
# estimator's parameters.
clf = LogisticRegression(solver='lbfgs')
clf.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

In [9]:
def results_to_df(results):
    """Turn a list of metric records into a frame sorted by test accuracy.

    Columns are forced into a fixed order (keys outside that order are
    dropped, absent ones come back as NaN) and rows are sorted by
    ``test_accuracy``, best first.
    """
    ordered_columns = [
        'model_type', 'features', 'degree',
        'train_accuracy', 'test_accuracy', 'precision', 'recall', 'f1_score',
        'true_negatives', 'false_positives', 'false_negatives', 'true_positives',
    ]
    frame = pd.DataFrame(results)
    frame = frame.reindex(columns=ordered_columns)
    return frame.sort_values(by='test_accuracy', ascending=False)

def scores_formatted(input_df):
    """Return a display copy of a results frame with metrics rendered as text.

    Ratio metrics become percentages with two decimals; confusion-matrix
    counts become whole numbers with thousands separators. ``input_df`` is
    not modified.
    """
    column_templates = (
        ('{:0.2%}', ('train_accuracy', 'test_accuracy', 'precision', 'recall', 'f1_score')),
        ('{:,.0f}', ('true_negatives', 'false_positives', 'false_negatives', 'true_positives')),
    )
    formatted = input_df.copy()
    for template, column_names in column_templates:
        for name in column_names:
            formatted[name] = formatted[name].map(template.format)
    return formatted
    
def log_clf_model(results, model, model_type, X, y, features, degree=1):
    """Fit ``model`` on a stratified 80/20 split and record its metrics.

    Args:
        results: list that receives one metrics dict per call (mutated in place).
        model: unfitted estimator or pipeline exposing ``fit``, ``score`` and
            ``predict``.
        model_type: label stored under ``'model_type'``.
        X: feature matrix.
        y: target aligned with ``X``; also used to stratify the split.
        features: description of the columns in ``X``, stored verbatim.
        degree: polynomial-degree label, stored verbatim. It does not
            transform ``X`` here; any expansion must already be part of
            ``model``.

    Returns:
        None. The record is appended to ``results``.

    Note:
        Unpacking the confusion matrix into four values requires a binary
        problem with both classes present in the test fold.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=11, stratify=y)
    model.fit(X_train, y_train)
    # Evaluate the estimator that was just fitted. The earlier version read a
    # module-level `clf` here, which only worked when the caller happened to
    # pass an object bound to that exact global name.
    train_accuracy = model.score(X_train, y_train)
    test_accuracy = model.score(X_test, y_test)
    y_hat = model.predict(X_test)
    precision = metrics.precision_score(y_test, y_hat)
    recall = metrics.recall_score(y_test, y_hat)
    f1 = metrics.f1_score(y_test, y_hat)
    tn, fp, fn, tp = metrics.confusion_matrix(y_test, y_hat).ravel()
    record = {'model_type': model_type,
              'features': features,
              'degree': degree,
              'train_accuracy': train_accuracy,
              'test_accuracy': test_accuracy,
              'precision': precision,
              'recall': recall,
              'f1_score': f1,
              'true_negatives': tn,
              'false_positives': fp,
              'false_negatives': fn,
              'true_positives': tp}
    results.append(record)

In [139]:
import warnings
warnings.filterwarnings('ignore')

# One model per candidate feature: indicator groups get a single linear fit,
# single numeric columns are tried at polynomial degrees 1 through 3.
results = []
y = df.loc[:, dependent]
for variable in independents:
    X = df.loc[:, variable]
    if len(variable) > 1:
        clf = LogisticRegression(solver='lbfgs')
        log_clf_model(results, clf, 'Logistic Regression', X, y, variable)
        continue
    for degree in range(1, 4):
        if degree == 1:
            clf = LogisticRegression(solver='lbfgs')
        else:
            clf = Pipeline([('poly', PolynomialFeatures(degree)),
                            ('clf', LogisticRegression(solver='lbfgs'))])
        log_clf_model(results, clf, 'Logistic Regression', X, y, variable, degree)

# Intercept-only reference model: a constant column carries no information.
X = np.ones((len(df), 1))
clf = LogisticRegression(solver='lbfgs')
log_clf_model(results, clf, 'Logistic Regression', X, y, 'bias')
scores_formatted(results_to_df(results))

Unnamed: 0,model_type,features,degree,train_accuracy,test_accuracy,precision,recall,f1_score,true_negatives,false_positives,false_negatives,true_positives
4,Logistic Regression,[int_rate],2,78.16%,78.16%,55.56%,0.04%,0.09%,41541,4,11606,5
0,Logistic Regression,[dti],1,78.16%,78.16%,0.00%,0.00%,0.00%,41545,0,11611,0
1,Logistic Regression,[dti],2,78.16%,78.16%,0.00%,0.00%,0.00%,41545,0,11611,0
22,Logistic Regression,"[emp_length_1 year, emp_length_10+ years, emp_length_2 years, emp_length_3 years, emp_length_4 years,...",1,78.16%,78.16%,0.00%,0.00%,0.00%,41545,0,11611,0
21,Logistic Regression,"[purpose_car, purpose_credit_card, purpose_debt_consolidation, purpose_educational, purpose_home_impr...",1,78.16%,78.16%,0.00%,0.00%,0.00%,41545,0,11611,0
20,Logistic Regression,"[grade_A, grade_B, grade_C, grade_D, grade_E, grade_F, grade_G]",1,78.16%,78.16%,0.00%,0.00%,0.00%,41545,0,11611,0
19,Logistic Regression,"[home_ownership_ANY, home_ownership_MORTGAGE, home_ownership_NONE, home_ownership_OTHER, home_ownersh...",1,78.16%,78.16%,0.00%,0.00%,0.00%,41545,0,11611,0
18,Logistic Regression,[delinq_2yrs],3,78.16%,78.16%,0.00%,0.00%,0.00%,41545,0,11611,0
17,Logistic Regression,[delinq_2yrs],2,78.16%,78.16%,0.00%,0.00%,0.00%,41545,0,11611,0
15,Logistic Regression,"[term_36 months, term_60 months]",1,78.16%,78.16%,0.00%,0.00%,0.00%,41545,0,11611,0


We can see that all but two of our models guess non-default 100 percent of the time. This is common with imbalanced classes. We are basically dealing with a high-bias problem here. We need to add features to **_reduce bias_** and **_add variance_**.

We'll start slow by adding only one additional feature.

## Logistic Regression with Two Features

In [22]:
import itertools

def unpack_list(lst_2d):
    """Flatten one level of nesting: an iterable of iterables becomes a list."""
    return [item for inner in lst_2d for item in inner]

In [19]:
# Fit one plain logistic regression for every unordered pair of candidate
# features, flattening each pair into a single list of column names.
results = []
y = df.loc[:, dependent]
for features_2d in itertools.combinations(independents, 2):
    features = unpack_list(features_2d)
    X = df.loc[:, features]
    clf = LogisticRegression(solver='lbfgs')
    log_clf_model(results, clf, 'Logistic Regression', X, y, features)

# Intercept-only reference model for comparison.
X = np.ones((len(df), 1))
clf = LogisticRegression(solver='lbfgs')
log_clf_model(results, clf, 'Logistic Regression', X, y, 'bias')
scores_formatted(results_to_df(results))

Unnamed: 0,model_type,features,degree,train_accuracy,test_accuracy,precision,recall,f1_score,true_negatives,false_positives,false_negatives,true_positives
7,Logistic Regression,"[dti, grade_A, grade_B, grade_C, grade_D, grade_E, grade_F, grade_G]",1,78.28%,78.29%,55.28%,3.29%,6.21%,41236,309,11229,382
0,Logistic Regression,"[dti, int_rate]",1,78.20%,78.26%,51.92%,6.62%,11.75%,40833,712,10842,769
4,Logistic Regression,"[dti, term_36 months, term_60 months]",1,78.18%,78.17%,51.63%,0.82%,1.61%,41456,89,11516,95
53,Logistic Regression,"[grade_A, grade_B, grade_C, grade_D, grade_E, grade_F, grade_G, emp_length_1 year, emp_length_10+ yea...",1,78.14%,78.16%,50.58%,1.13%,2.21%,41417,128,11480,131
15,Logistic Regression,"[int_rate, home_ownership_ANY, home_ownership_MORTGAGE, home_ownership_NONE, home_ownership_OTHER, ho...",1,78.11%,78.16%,50.07%,5.79%,10.38%,40875,670,10939,672
8,Logistic Regression,"[dti, purpose_car, purpose_credit_card, purpose_debt_consolidation, purpose_educational, purpose_home...",1,78.15%,78.16%,53.85%,0.06%,0.12%,41539,6,11604,7
29,Logistic Regression,"[loan_amnt, delinq_2yrs]",1,78.16%,78.16%,0.00%,0.00%,0.00%,41545,0,11611,0
30,Logistic Regression,"[loan_amnt, home_ownership_ANY, home_ownership_MORTGAGE, home_ownership_NONE, home_ownership_OTHER, h...",1,78.16%,78.16%,0.00%,0.00%,0.00%,41545,0,11611,0
31,Logistic Regression,"[loan_amnt, grade_A, grade_B, grade_C, grade_D, grade_E, grade_F, grade_G]",1,78.16%,78.16%,0.00%,0.00%,0.00%,41545,0,11611,0
32,Logistic Regression,"[loan_amnt, purpose_car, purpose_credit_card, purpose_debt_consolidation, purpose_educational, purpos...",1,78.16%,78.16%,0.00%,0.00%,0.00%,41545,0,11611,0


Now we're getting somewhere! A few of our models actually guessed some positives (although not many)!

# Logistic Regression with Two Features and Polynomials