In [7]:
import pandas as pd
import numpy as np
import datetime as dt
import pickle

import patsy
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import auc
from sklearn.metrics import roc_curve

from sklearn.cross_validation import cross_val_score

import matplotlib.pyplot as plt
import seaborn
%matplotlib inline

In [8]:
 # Lift the column-width limit so long text cells (e.g. the data-dictionary descriptions) are not truncated.
 # NOTE(review): newer pandas releases expect None rather than -1 for "no limit" -- confirm against the installed version.
 pd.set_option('display.max_colwidth', -1)

In [9]:
# List the working directory to confirm the pickled inputs loaded in the following cells are present.
!ls -l

total 903920
-rw-rw-r-- 1 brendanfitzpatrick brendanfitzpatrick     80682 Jul 30 00:55 03_mcnulty_brendan_modeling-Copy1.ipynb
-rw-rw-r-- 1 brendanfitzpatrick brendanfitzpatrick     78939 Jul 29 23:52 03_mcnulty_brendan_modeling.ipynb
-rwxr-xr-x 1 brendanfitzpatrick brendanfitzpatrick    172221 Jul 28 21:30 challenge_set_1_brendan.ipynb
-rw-r--r-- 1 brendanfitzpatrick brendanfitzpatrick    178337 Jul 28 15:12 challenge_set_7_brendan.ipynb
-rw-rw-r-- 1 brendanfitzpatrick brendanfitzpatrick     28379 Jul 28 20:40 challenge_set_8_brendan.ipynb
-rw-r--r-- 1 brendanfitzpatrick brendanfitzpatrick 477673733 Jul 27 20:07 df_inv.pkl
-rw-r--r-- 1 brendanfitzpatrick brendanfitzpatrick    195373 Jul 27 20:00 df_joint.pkl
-rw-r--r-- 1 brendanfitzpatrick brendanfitzpatrick   2746559 Jul 27 19:59 df_sub_policy.pkl
-rw-r--r-- 1 brendanfitzpatrick brendanfitzpatrick   1737231 Jul 29 15:52 Intro_to_Regression_Solutions.ipynb
-rw-r--r-- 1 brendanfitzpatrick brendanfitzpatrick     20995 Jul 27 2

In [10]:
# Restore the pickled df_inv object from the working directory.
# NOTE(review): pickle.load can execute arbitrary code -- only load pickles produced by a trusted source.
with open("df_inv.pkl", 'rb') as picklefile: 
    df_inv = pickle.load(picklefile)

In [11]:
# Restore the pickled df_joint object from the working directory (trusted, locally produced file assumed).
with open("df_joint.pkl", 'rb') as picklefile: 
    df_joint = pickle.load(picklefile)

In [12]:
# Restore the pickled df_sub_policy object from the working directory (trusted, locally produced file assumed).
with open("df_sub_policy.pkl",'rb') as picklefile:
    df_sub_policy = pickle.load(picklefile)

In [13]:
# Load the data dictionary spreadsheet (field name -> description).
df_key = pd.read_excel('LCDataDictionary.xlsx')
# Display-only: the re-indexed frame is not assigned back, so df_key itself keeps its default index.
df_key.set_index('LoanStatNew')

Unnamed: 0_level_0,Description
LoanStatNew,Unnamed: 1_level_1
addr_state,The state provided by the borrower in the loan application
annual_inc,The self-reported annual income provided by the borrower during registration.
annual_inc_joint,The combined self-reported annual income provided by the co-borrowers during registration
application_type,Indicates whether the loan is an individual application or a joint application with two co-borrowers
collection_recovery_fee,post charge off collection fee
collections_12_mths_ex_med,Number of collections in 12 months excluding medical collections
delinq_2yrs,The number of 30+ days past-due incidences of delinquency in the borrower's credit file for the past 2 years
desc,Loan description provided by the borrower
dti,"A ratio calculated using the borrower’s total monthly debt payments on the total debt obligations, excluding mortgage and the requested LC loan, divided by the borrower’s self-reported monthly income."
dti_joint,"A ratio calculated using the co-borrowers' total monthly payments on the total debt obligations, excluding mortgages and the requested LC loan, divided by the co-borrowers' combined self-reported monthly income"


In [14]:
# Show the data dictionary as loaded (default integer index; the earlier set_index result was not stored).
df_key

Unnamed: 0,LoanStatNew,Description
0,addr_state,The state provided by the borrower in the loan application
1,annual_inc,The self-reported annual income provided by the borrower during registration.
2,annual_inc_joint,The combined self-reported annual income provided by the co-borrowers during registration
3,application_type,Indicates whether the loan is an individual application or a joint application with two co-borrowers
4,collection_recovery_fee,post charge off collection fee
5,collections_12_mths_ex_med,Number of collections in 12 months excluding medical collections
6,delinq_2yrs,The number of 30+ days past-due incidences of delinquency in the borrower's credit file for the past 2 years
7,desc,Loan description provided by the borrower
8,dti,"A ratio calculated using the borrower’s total monthly debt payments on the total debt obligations, excluding mortgage and the requested LC loan, divided by the borrower’s self-reported monthly income."
9,dti_joint,"A ratio calculated using the co-borrowers' total monthly payments on the total debt obligations, excluding mortgages and the requested LC loan, divided by the co-borrowers' combined self-reported monthly income"


# Additional Feature Engineering

In [15]:
# Keep only the rows whose loan_status is one of the three listed values.
df_inv = df_inv[df_inv['loan_status'].isin(['Fully Paid', 'Charged Off', 'Default'])]

In [16]:
# Collapse the label to two classes: 'Fully Paid' is kept as-is and every
# other status is relabelled 'Default'.
df_inv['loan_status'] = df_inv['loan_status'].where(
    df_inv['loan_status'] == 'Fully Paid', 'Default')

In [17]:
# Sanity check: after the relabelling only 'Fully Paid' and 'Default' should remain.
df_inv.loan_status.unique()

array(['Fully Paid', 'Default'], dtype=object)

In [18]:
# Discard rows with a missing revol_util value.
df_inv = df_inv.dropna(subset=['revol_util'])

In [19]:
# lc_allocation = requested loan amount minus the amount funded by investors.
df_inv['lc_allocation'] = df_inv['loan_amnt'].sub(df_inv['funded_amnt_inv'])

In [20]:
# Strip surrounding whitespace and the literal 'months' from term, then parse
# what is left as a number.
df_inv['term'] = pd.to_numeric(
    df_inv['term']
    .str.strip()
    .str.replace('months', '')
)

In [21]:
def yes_no_binary(x):
    """Encode a 'y'/'n' flag as 1/0.

    Any value equal to neither 'y' nor 'n' (other case, NaN, None, ...)
    yields None.
    """
    for flag, code in (('y', 1), ('n', 0)):
        if x == flag:
            return code
    return None

In [22]:
# Recode pymnt_plan from 'y'/'n' to 1/0, element by element.
df_inv['pymnt_plan'] = df_inv.pymnt_plan.map(yes_no_binary)

In [23]:
def whole_fractional_binary(x):
    """Encode a 'w'/'f' flag as 1/0.

    Any value equal to neither 'w' nor 'f' yields None.
    """
    for flag, code in (('w', 1), ('f', 0)):
        if x == flag:
            return code
    return None

In [24]:
# Recode initial_list_status from 'w'/'f' to 1/0, element by element.
df_inv['initial_list_status'] = df_inv.initial_list_status.map(whole_fractional_binary)

In [25]:
def event_last_year(x):
    """Return 1 when x is strictly below 12 (one year in months), else 0.

    NaN compares False against the threshold and therefore maps to 0.
    """
    return 1 if x < 12 * 1 else 0

In [26]:
def event_last_two_years(x):
    """Return 1 when x is strictly below 24 (two years in months), else 0.

    NaN compares False against the threshold and therefore maps to 0.
    """
    return 1 if x < 12 * 2 else 0

In [27]:
def event_last_five_years(x):
    """Return 1 when x is strictly below 60 (five years in months), else 0.

    NaN compares False against the threshold and therefore maps to 0.
    """
    return 1 if x < 12 * 5 else 0

In [28]:
# 1/0 flags: was the last major derogatory mark fewer than 12, 24 or 60 months
# ago? Missing values compare False and therefore become 0.
df_inv['major_derog_within_1'] = df_inv['mths_since_last_major_derog'].lt(12 * 1).astype('int64')
df_inv['major_derog_within_2'] = df_inv['mths_since_last_major_derog'].lt(12 * 2).astype('int64')
df_inv['major_derog_within_5'] = df_inv['mths_since_last_major_derog'].lt(12 * 5).astype('int64')

In [29]:
# 1/0 flags: was the last public record fewer than 12, 24 or 60 months ago?
# Missing values compare False and therefore become 0.
df_inv['record_within_1'] = df_inv['mths_since_last_record'].lt(12 * 1).astype('int64')
df_inv['record_within_2'] = df_inv['mths_since_last_record'].lt(12 * 2).astype('int64')
df_inv['record_within_5'] = df_inv['mths_since_last_record'].lt(12 * 5).astype('int64')

In [30]:
# Inspect the raw emp_length categories; the emp_length_*_map helpers match on these exact strings.
df_inv['emp_length'].unique()

array(['10+ years', '< 1 year', '3 years', '9 years', '4 years', '5 years',
       '1 year', '6 years', '2 years', '7 years', '8 years', 'n/a'], dtype=object)

In [31]:
def emp_length_10_map(x):
    """Return 1 only for the exact category '10+ years'; every other value gives 0."""
    return 1 if x == '10+ years' else 0

In [32]:
def emp_length_5_map(x):
    """Return 1 for the categories '5 years' through '10+ years', else 0."""
    five_plus = ('10+ years', '9 years', '8 years',
                 '7 years', '6 years', '5 years')
    return 1 if x in five_plus else 0

In [33]:
def emp_length_1_map(x):
    """Return 0 for '< 1 year' or 'n/a'; every other value gives 1.

    Anything that is not one of those two exact strings (including NaN
    or None) falls through to 1.
    """
    return 0 if x in ('< 1 year', 'n/a') else 1

In [34]:
# Three cumulative 1/0 employment-length flags derived from the raw category.
df_inv['emp_length_greater_1_yr'] = df_inv['emp_length'].map(emp_length_1_map)
df_inv['emp_length_greater_5_yrs'] = df_inv['emp_length'].map(emp_length_5_map)
df_inv['emp_length_greater_10_yrs'] = df_inv['emp_length'].map(emp_length_10_map)

In [35]:
# Parse the two 'Mon-YYYY' style text columns (format '%b-%Y') into datetimes.
df_inv['earliest_cr_line'] = pd.to_datetime(df_inv['earliest_cr_line'], format='%b-%Y')
df_inv['issue_d'] = pd.to_datetime(df_inv['issue_d'], format='%b-%Y')

In [36]:
# Time between the earliest credit line and loan issue, expressed in days
# (dividing the timedelta by one day yields a plain float).
df_inv['earliest_cr_line_delta'] = (
    (df_inv['issue_d'] - df_inv['earliest_cr_line']) / np.timedelta64(1, 'D')
)

In [37]:
# Dummy-code `grade` with patsy and give the resulting columns readable names.
# NOTE(review): the hand-written names assume patsy emits exactly seven
# columns in the order intercept, B, C, D, E, F, G -- confirm against
# grades_df.columns before trusting the labels.
grades_df = patsy.dmatrix('grade', data=df_inv, return_type='dataframe')
grades_columns = [
    'grade_intercept',
    'B_grade', 'C_grade', 'D_grade',
    'E_grade', 'F_grade', 'G_grade',
]
grades_df.columns = grades_columns

In [38]:
# Attach the grade dummies to the main frame, aligned on the row index.
df_inv = df_inv.join(grades_df, how='left')

In [39]:
# Dummy-code `home_ownership` with patsy and give the columns readable names.
# NOTE(review): the hand-written names assume patsy emits exactly six columns
# in this order -- confirm against home_ownership_df.columns before trusting
# the labels.
home_ownership_df = patsy.dmatrix('home_ownership', data=df_inv, return_type='dataframe')
home_ownership_columns = [
    'home_ownership_intercept',
    'home_ownership_mortgage',
    'home_ownership_none',
    'home_ownership_other',
    'home_ownership_own',
    'home_ownership_rent',
]
home_ownership_df.columns = home_ownership_columns

In [40]:
# Attach the home-ownership dummies to the main frame, aligned on the row index.
df_inv = df_inv.join(home_ownership_df, how='left')

In [41]:
# Dummy-code `verification_status` with patsy and give the columns readable names.
# NOTE(review): the hand-written names assume patsy emits exactly three
# columns in this order -- confirm against verification_df.columns before
# trusting the labels.
verification_df = patsy.dmatrix('verification_status', data=df_inv, return_type='dataframe')
verification_columns = [
    'verification_intercept',
    'source_verified',
    'verified',
]
verification_df.columns = verification_columns

In [42]:
# Attach the verification-status dummies to the main frame, aligned on the row index.
df_inv = df_inv.join(verification_df, how='left')

In [43]:
"""emp_length_df = patsy.dmatrix('emp_length',data=df_inv,return_type='dataframe')
emp_length_columns = ['emp_length_intercept', 
                      'emp_length_greater_10_yrs',
                      'emp_length_2_yrs',
                      'emp_length_3_yrs',
                      'emp_length_4_yrs',
                      'emp_length_5_yrs',
                      'emp_length_6_yrs',
                      'emp_length_7_yrs',
                      'emp_length_8_yrs',
                      'emp_length_9_yrs', 
                      'emp_length_less_1_yr',
                      'emp_length_n/a]']
emp_length_df.columns = emp_length_columns"""

"emp_length_df = patsy.dmatrix('emp_length',data=df_inv,return_type='dataframe')\nemp_length_columns = ['emp_length_intercept', \n                      'emp_length_greater_10_yrs',\n                      'emp_length_2_yrs',\n                      'emp_length_3_yrs',\n                      'emp_length_4_yrs',\n                      'emp_length_5_yrs',\n                      'emp_length_6_yrs',\n                      'emp_length_7_yrs',\n                      'emp_length_8_yrs',\n                      'emp_length_9_yrs', \n                      'emp_length_less_1_yr',\n                      'emp_length_n/a]']\nemp_length_df.columns = emp_length_columns"

In [44]:
"""df_inv = df_inv.join(emp_length_df)"""

'df_inv = df_inv.join(emp_length_df)'

BF NOTE: Try outstanding principal LC allocation if in need of additional features

# End Additional Feature Engineering

In [45]:
# Feature matrix for the classifiers.
# 'G_grade' is included so that all six non-baseline grade dummies created
# earlier are present; leaving it out made grade-G loans indistinguishable
# from the baseline grade.
# NOTE(review): the three *_intercept columns are constant terms emitted by
# patsy, and LogisticRegression fits its own intercept by default -- consider
# dropping them. 'out_prncp' and 'pymnt_plan' may be post-origination fields
# that leak the outcome -- confirm before trusting any scores.
X = df_inv[['loan_amnt',
            'lc_allocation',
            'term',
            'int_rate',
            'installment',
            'grade_intercept',
            'B_grade',
            'C_grade',
            'D_grade',
            'E_grade',
            'F_grade',
            'G_grade',
            'emp_length_greater_1_yr',
            'emp_length_greater_5_yrs',
            'emp_length_greater_10_yrs',
            'home_ownership_intercept',
            'home_ownership_mortgage',
            'home_ownership_none',
            'home_ownership_other',
            'home_ownership_own',
            'home_ownership_rent',
            'annual_inc',
            'verification_intercept',
            'source_verified',
            'verified',
            'pymnt_plan',
            'dti',
            'delinq_2yrs',
            'earliest_cr_line_delta',
            'inq_last_6mths',
            'record_within_1',
            'record_within_2',
            'record_within_5',
            'open_acc',
            'pub_rec',
            'revol_bal',
            'revol_util',
            'total_acc',
            'initial_list_status',
            'out_prncp',
            'major_derog_within_1',
            'major_derog_within_2',
            'major_derog_within_5']]
# Target labels: the two-class loan_status ('Fully Paid' / 'Default') as a 1-D array.
y = df_inv.loan_status.values.ravel()
# 70/30 train/test split. The fixed random_state makes the split -- and every
# metric computed from it -- reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [46]:
# Unfitted logistic-regression classifier with library-default hyperparameters.
logr = LogisticRegression()

In [47]:
# Candidate classifiers, listed in the same order as model_names so that
# position i in one list always labels position i in the other.
# KNeighborsClassifier was missing although 'KNN' was the first name, which
# shifted every label by one (e.g. 'KNN' would have described the logistic model).
models = [KNeighborsClassifier(),
          LogisticRegression(),
          GaussianNB(),
          SVC(probability=True),  # probability=True enables predict_proba on the SVM
          DecisionTreeClassifier(),
          RandomForestClassifier()]
model_names = ['KNN', 'Logistic', 'Naive Bayes', 'SVM', 'Decision Tree', 'Random Forest']
# Guard against the two lists drifting apart again.
assert len(models) == len(model_names), "models and model_names must stay aligned"

In [3]:
# NOTE(review): re-creates the same unfitted LogisticRegression as the earlier `logr = LogisticRegression()` cell;
# the out-of-sequence execution count suggests this cell was run in a different session -- likely safe to remove.
logr = LogisticRegression()

In [48]:
# Fit on the training split only; X_test / y_test are not touched here.
logr.fit(X_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [None]:
# Collect the per-model metrics into one table, one column per metric.
# NOTE(review): accuracy, precision_*, recall_* and f1_* are not defined anywhere in the visible
# notebook -- presumably equal-length lists with one entry per model; confirm they are built
# before this cell runs, in the same order as model_names.
accuracy_df = pd.DataFrame({'accuracy':accuracy, 
                            'precision_default':precision_default, 
                            'precision_fully_paid':precision_fully_paid,
                            'recall_default':recall_default, 
                            'recall_fully_paid':recall_fully_paid, 
                            'f1_default':f1_default, 
                            'f1_fully_paid':f1_fully_paid})
# Display-only: the result labelled by model name is shown but not assigned back to accuracy_df.
accuracy_df.set_index([model_names])