In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
accept = pd.read_csv("LoanStats3a.csv")
reject = pd.read_csv("RejectStatsA.csv")

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
accept['dti'] = np.array(accept['dti'], dtype=float)

The variables that I include in the model are 

a) "Amount Requested", which is the amount of money that was requested (if the loan was rejected) and requested and granted (if the loan was accepted)
b) "Debt-to-Income Ratio"
c) "State" where the application was made
d) "Employment Length
e) "Loan given" is a 0 if in the rejected loans data set and 1 if in the accepted loans data set.

I choose these variables because they are shared across both the rejected and the accepted loans data sets. 

In [4]:
reject_new = reject[['Amount Requested', 'Debt-To-Income Ratio', 'State','Employment Length']]
reject_new.columns = ['Amount Requested', 'DTI', 'State','Employment Length']
accept_new = accept[['loan_amnt','dti','addr_state','emp_length']]
accept_new.columns = ['Amount Requested', 'DTI', 'State','Employment Length']
accept_new['loan_given'] = 1


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


To clean the data, I 

a) Convert "dti" (debt-to-income ratio) in the accepted loans data frame into a float rather than an object 

b) Strip the percentage sign from the debt-to-income ratio 

c) Concatenate both the accepted and rejected loans data set into one big data frame

d) Drop NA values from the data

e) Encode the employment length variable, which is a string from less than 1 year, to 10+ years into an ordinal variable

f) One-hot-encode the 50 states to create 50 dummy variables\


In [5]:
def strip_percentage(data):
    string = data['DTI']
    percentage = [pos for pos, char in enumerate(string) if char == "%"]
    if len(percentage) == 1:
        return float(string[:percentage[0]])
    else:
        return float(string)
reject_new = reject_new.assign(DTI=reject_new.apply(strip_percentage, axis=1))
reject_new['loan_given'] = 0


In [6]:
data = reject_new.append(accept_new)
data.columns = ['Amount Requested', 'DTI', 'State','Employment Length','loan_given']

In [7]:
data = data.dropna(axis=0)
data = data[data['Employment Length'] != "n/a"]

In [8]:
data.shape

(495939, 5)

In [9]:
def extract_emp_length(data):
    string = data['Employment Length']
    digit = string[0]
    if digit == "<":
        return 0
    elif digit == "1":
        if string[1] == "0":
            return 10
        else:
            return 1
    else:
        return int(digit)

In [10]:
data = data.assign(employment=data.apply(extract_emp_length, axis=1))


In [11]:
state_data = pd.concat([data[['Amount Requested', 'DTI', 'employment']], pd.get_dummies(data['State'])],axis=1)

To start, I split the data set (with all the states encoded as dummy variables) into a training set and a test set using stratified splitting to ensure that the training set and test set have classes that reflect their class probabilities. This would help avoid overfitting because I will run all of my training on the training set, and use cross-validation to help predict my performance on the test-set. I will evaluate the model I believe will best perform on the test set only once, which does not allow knowledge of the test set to leak into the model.

In [12]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
clf = RandomForestClassifier()
state_data_train, state_data_test, loan_train, loan_test = train_test_split(state_data, data["loan_given"], train_size = 0.8,random_state=42)
# state_data_train, state_data_test, loan_train, loan_test = train_test_split(state_data, data["loan_given"], train_size = 0.8, random_state=42)




I will train several models and use cross-validation on the state_data training set in order to determine the best classifier that will then be inspected to determine its decision boundary. 

In [13]:
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics

I first train random forest models, one with the class weights balanced, and another without the balancing. Balancing the class weights does not seem to improve the accuracy greatly.

In [14]:

kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
for train_index, test_index in kf.split(state_data_train, loan_train):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = state_data_train.iloc[train_index], state_data_train.iloc[test_index]
#     print X_train
    print len(train_index)
    print len(test_index)
#     print state_data.isnull().values.any()
#     print X_train[pd.isnull(X_train).any(axis=1)]
#     print X_train.isnull().values.any()
#     print X_train[pd.isnull(X_train)]
    y_train, y_test = loan_train.iloc[train_index], loan_train.iloc[test_index]
    clf = RandomForestClassifier(class_weight="balanced")
    clf.fit(X_train, y_train)
    print metrics.confusion_matrix(y_test, clf.predict(X_test))
    print metrics.accuracy_score(y_test, clf.predict(X_test))
    for i in range(len(clf.feature_importances_)):
        print state_data_train.columns[i], clf.feature_importances_[i]
    

('TRAIN:', array([     0,      1,      3, ..., 396747, 396749, 396750]), 'TEST:', array([     2,      7,     18, ..., 396737, 396742, 396748]))
317400
79351
[[70343  2370]
 [ 3820  2818]]
0.921992161409
Amount Requested 0.128807250625
DTI 0.331192462594
employment 0.508145751366
AK 0.000328949739674
AL 0.000713777059094
AR 0.000763414786175
AZ 0.000783070665848
CA 0.00194483065891
CO 0.000800689194832
CT 0.000635349774246
DC 0.000486696701736
DE 0.000304379924417
FL 0.000902591859165
GA 0.00088986866364
HI 0.000412212788242
IA 0.00010256397347
ID 0.000126111039003
IL 0.00116730626865
IN 0.000179438950854
KS 0.000497543397365
KY 0.000654845481808
LA 0.000650622965992
MA 0.0008778701137
MD 0.000954482404421
ME 7.19504067824e-05
MI 0.000942771169028
MN 0.00074179356093
MO 0.000731244420457
MS 7.7950854942e-05
MT 0.000294703050535
NC 0.0011025602369
ND 1.87044708918e-07
NE 0.000180599359593
NH 0.000480173875455
NJ 0.0010559814548
NM 0.000486154167181
NV 0.00074349827988
NY 0.00129844975393

In [15]:
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
for train_index, test_index in kf.split(state_data_train, loan_train):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = state_data_train.iloc[train_index], state_data_train.iloc[test_index]
#     print X_train
    print len(train_index)
    print len(test_index)
#     print state_data.isnull().values.any()
#     print X_train[pd.isnull(X_train).any(axis=1)]
#     print X_train.isnull().values.any()
#     print X_train[pd.isnull(X_train)]
    y_train, y_test = loan_train.iloc[train_index], loan_train.iloc[test_index]
    clf = RandomForestClassifier()
    clf.fit(X_train, y_train)
    print metrics.confusion_matrix(y_test, clf.predict(X_test))
    print metrics.accuracy_score(y_test, clf.predict(X_test))
    
    for i in range(len(clf.feature_importances_)):
        print state_data_train.columns[i], clf.feature_importances_[i]

('TRAIN:', array([     0,      1,      3, ..., 396747, 396749, 396750]), 'TEST:', array([     2,      7,     18, ..., 396737, 396742, 396748]))
317400
79351
[[70393  2320]
 [ 3714  2924]]
0.923958110169
Amount Requested 0.198317580044
DTI 0.408919717728
employment 0.349429370297
AK 0.000462154495986
AL 0.00100820855062
AR 0.000704125596481
AZ 0.00141697941982
CA 0.00157133430338
CO 0.00112433693148
CT 0.00116208507019
DC 0.00049485081339
DE 0.000505573991532
FL 0.00143487094715
GA 0.00159467712118
HI 0.000497649903939
IA 0.000177728634879
ID 8.49752497265e-05
IL 0.00149357745234
IN 0.000219945232067
KS 0.0010014509248
KY 0.000701797527829
LA 0.00112582134649
MA 0.000931713783738
MD 0.0011075591346
ME 4.43209248619e-05
MI 0.000823409612983
MN 0.00101492332037
MO 0.00118397310013
MS 0.000170838295757
MT 0.000474700135868
NC 0.000857232876393
ND 1.95780788567e-08
NE 7.61978027195e-05
NH 0.000662968838652
NJ 0.00112432663463
NM 0.000694564431066
NV 0.00108727577278
NY 0.00159900977048
OH 0

Next, I train 2 Logistic Regression models, one with the class weights balanced and one without balancing the class weights. If I balance the class weights, the logistic regression model has a much higher false positive rate (deciding to accept the loan when the true state was reject) but a much lower false negative rate (rejecting the loan when the true state was accept).

In [16]:
from sklearn.linear_model import LogisticRegression
for train_index, test_index in kf.split(state_data_train, loan_train):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = state_data_train.iloc[train_index], state_data_train.iloc[test_index]
#     print X_train
    print len(train_index)
    print len(test_index)
#     print state_data.isnull().values.any()
#     print X_train[pd.isnull(X_train).any(axis=1)]
#     print X_train.isnull().values.any()
#     print X_train[pd.isnull(X_train)]
    y_train, y_test = loan_train.iloc[train_index], loan_train.iloc[test_index]
    clf = LogisticRegression(class_weight='balanced')
    clf.fit(X_train, y_train)
    print metrics.confusion_matrix(y_test, clf.predict(X_test))
    print metrics.accuracy_score(y_test, clf.predict(X_test))
for i in range(len(state_data_train.columns)):
    print state_data_train.columns[i], clf.coef_[0][i]
    
    

('TRAIN:', array([     0,      1,      3, ..., 396747, 396749, 396750]), 'TEST:', array([     2,      7,     18, ..., 396737, 396742, 396748]))
317400
79351
[[65661  7052]
 [ 1666  4972]]
0.89013370972
('TRAIN:', array([     1,      2,      3, ..., 396747, 396748, 396749]), 'TEST:', array([     0,      4,      6, ..., 396735, 396743, 396750]))
317400
79351
[[65112  7601]
 [ 1652  4986]]
0.883391513655
('TRAIN:', array([     0,      2,      3, ..., 396746, 396748, 396750]), 'TEST:', array([     1,      8,      9, ..., 396736, 396747, 396749]))
317400
79351
[[65574  7139]
 [ 1646  4992]]
0.889289359932
('TRAIN:', array([     0,      1,      2, ..., 396748, 396749, 396750]), 'TEST:', array([     3,     15,     16, ..., 396738, 396739, 396745]))
317402
79349
[[65646  7066]
 [ 1716  4921]]
0.889324377119
('TRAIN:', array([     0,      1,      2, ..., 396748, 396749, 396750]), 'TEST:', array([     5,     14,     17, ..., 396741, 396744, 396746]))
317402
79349
[[65579  7133]
 [ 1650  4987]]
0

In [17]:
from sklearn.linear_model import LogisticRegression
for train_index, test_index in kf.split(state_data_train, loan_train):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = state_data_train.iloc[train_index], state_data_train.iloc[test_index]
#     print X_train
    print len(train_index)
    print len(test_index)
#     print state_data.isnull().values.any()
#     print X_train[pd.isnull(X_train).any(axis=1)]
#     print X_train.isnull().values.any()
#     print X_train[pd.isnull(X_train)]
    y_train, y_test = loan_train.iloc[train_index], loan_train.iloc[test_index]
    clf = LogisticRegression()
    clf.fit(X_train, y_train)
    print metrics.confusion_matrix(y_test, clf.predict(X_test))
    print metrics.accuracy_score(y_test, clf.predict(X_test))
for i in range(len(state_data_train.columns)):
    print state_data_train.columns[i], clf.coef_[0][i]
    
    

('TRAIN:', array([     0,      1,      3, ..., 396747, 396749, 396750]), 'TEST:', array([     2,      7,     18, ..., 396737, 396742, 396748]))
317400
79351
[[71304  1409]
 [ 5208  1430]]
0.916611006793
('TRAIN:', array([     1,      2,      3, ..., 396747, 396748, 396749]), 'TEST:', array([     0,      4,      6, ..., 396735, 396743, 396750]))
317400
79351
[[71136  1577]
 [ 5137  1501]]
0.915388589936
('TRAIN:', array([     0,      2,      3, ..., 396746, 396748, 396750]), 'TEST:', array([     1,      8,      9, ..., 396736, 396747, 396749]))
317400
79351
[[71224  1489]
 [ 5110  1528]]
0.916837847034
('TRAIN:', array([     0,      1,      2, ..., 396748, 396749, 396750]), 'TEST:', array([     3,     15,     16, ..., 396738, 396739, 396745]))
317402
79349
[[71158  1554]
 [ 5107  1530]]
0.91605439262
('TRAIN:', array([     0,      1,      2, ..., 396748, 396749, 396750]), 'TEST:', array([     5,     14,     17, ..., 396741, 396744, 396746]))
317402
79349
[[70977  1735]
 [ 4939  1698]]
0

I next train a K-Nearest Neighbors Classifier on the state data.

In [18]:
from sklearn.neighbors import KNeighborsClassifier
# from sklearn import metrics
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
for train_index, test_index in kf.split(state_data_train, loan_train):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = state_data_train.iloc[train_index], state_data_train.iloc[test_index]
#     print X_train
    print len(train_index)
    print len(test_index)
#     print state_data.isnull().values.any()
#     print X_train[pd.isnull(X_train).any(axis=1)]
#     print X_train.isnull().values.any()
#     print X_train[pd.isnull(X_train)]
    y_train, y_test = loan_train.iloc[train_index], loan_train.iloc[test_index]
    clf = KNeighborsClassifier(10)
    clf.fit(X_train, y_train)
    print metrics.confusion_matrix(y_test, clf.predict(X_test))
    print metrics.accuracy_score(y_test, clf.predict(X_test))
print clf.get_params()
# for i in range(len(clf.feature_importances_)):
#     print state_data_train.columns[i], clf.feature_importances_[i]
    

('TRAIN:', array([     0,      1,      3, ..., 396747, 396749, 396750]), 'TEST:', array([     2,      7,     18, ..., 396737, 396742, 396748]))
317400
79351
[[71227  1486]
 [ 4007  2631]]
0.930775919648
('TRAIN:', array([     1,      2,      3, ..., 396747, 396748, 396749]), 'TEST:', array([     0,      4,      6, ..., 396735, 396743, 396750]))
317400
79351
[[71187  1526]
 [ 3984  2654]]
0.930561681642
('TRAIN:', array([     0,      2,      3, ..., 396746, 396748, 396750]), 'TEST:', array([     1,      8,      9, ..., 396736, 396747, 396749]))
317400
79351
[[71246  1467]
 [ 3982  2656]]
0.931330418016
('TRAIN:', array([     0,      1,      2, ..., 396748, 396749, 396750]), 'TEST:', array([     3,     15,     16, ..., 396738, 396739, 396745]))
317402
79349
[[71285  1427]
 [ 3985  2652]]
0.931794981663
('TRAIN:', array([     0,      1,      2, ..., 396748, 396749, 396750]), 'TEST:', array([     5,     14,     17, ..., 396741, 396744, 396746]))
317402
79349
[[71182  1530]
 [ 4035  2602]]


Because I thought the 50 dummy variables for the states are cumbersome and might not be informative (i.e. the dependence of the loan acceptance/rejection decision on the state might not be large). Therefore, I will eliminate state features and cross-validate predictive performance to see if there are any changes in prediction accuracy.

In [19]:
no_state_data = state_data[['Amount Requested', 'DTI','employment']]

In [20]:
no_state_data_train, no_state_data_test, loan_train, loan_test = train_test_split(no_state_data, data["loan_given"], train_size = 0.8, stratify=data["loan_given"],random_state=42)

In [21]:
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
for train_index, test_index in kf.split(no_state_data_train, loan_train):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = no_state_data_train.iloc[train_index], no_state_data_train.iloc[test_index]
#     print X_train
    print len(train_index)
    print len(test_index)
#     print state_data.isnull().values.any()
#     print X_train[pd.isnull(X_train).any(axis=1)]
#     print X_train.isnull().values.any()
#     print X_train[pd.isnull(X_train)]
    y_train, y_test = loan_train.iloc[train_index], loan_train.iloc[test_index]
    clf = RandomForestClassifier(class_weight="balanced")
    clf.fit(X_train, y_train)
    print metrics.confusion_matrix(y_test, clf.predict(X_test))
    print metrics.accuracy_score(y_test, clf.predict(X_test))
for i in range(len(clf.feature_importances_)):
    print state_data_train.columns[i], clf.feature_importances_[i]
    

('TRAIN:', array([     0,      1,      3, ..., 396748, 396749, 396750]), 'TEST:', array([     2,      8,     17, ..., 396732, 396734, 396742]))
317400
79351
[[69601  3122]
 [ 3468  3160]]
0.916951267155
('TRAIN:', array([     1,      2,      3, ..., 396746, 396747, 396749]), 'TEST:', array([     0,      4,      6, ..., 396743, 396748, 396750]))
317400
79351
[[69595  3128]
 [ 3495  3133]]
0.916535393379
('TRAIN:', array([     0,      2,      3, ..., 396746, 396748, 396750]), 'TEST:', array([     1,      7,      9, ..., 396737, 396747, 396749]))
317400
79351
[[69629  3094]
 [ 3469  3159]]
0.917291527517
('TRAIN:', array([     0,      1,      2, ..., 396748, 396749, 396750]), 'TEST:', array([     3,     15,     19, ..., 396739, 396740, 396745]))
317402
79349
[[69662  3060]
 [ 3509  3118]]
0.917213827521
('TRAIN:', array([     0,      1,      2, ..., 396748, 396749, 396750]), 'TEST:', array([     5,     14,     16, ..., 396741, 396744, 396746]))
317402
79349
[[69699  3023]
 [ 3450  3177]]


In [22]:
no_state_data_train[['Amount Requested']]

Unnamed: 0,Amount Requested
264662,9000
88670,18000
223365,2000
207764,1150
389459,15000
166133,15000
279050,2000
7765,8400
304413,29000
58399,35000


In [23]:
from sklearn.neighbors import KNeighborsClassifier
# from sklearn import metrics
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
for train_index, test_index in kf.split(no_state_data_train, loan_train):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = no_state_data_train.iloc[train_index], no_state_data_train.iloc[test_index]
#     print X_train
    print len(train_index)
    print len(test_index)
#     print state_data.isnull().values.any()
#     print X_train[pd.isnull(X_train).any(axis=1)]
#     print X_train.isnull().values.any()
#     print X_train[pd.isnull(X_train)]
    y_train, y_test = loan_train.iloc[train_index], loan_train.iloc[test_index]
    clf = KNeighborsClassifier(10)
    clf.fit(X_train, y_train)
    print metrics.confusion_matrix(y_test, clf.predict(X_test))
    print metrics.accuracy_score(y_test, clf.predict(X_test))
print clf.get_params()
# for i in range(len(clf.feature_importances_)):
#     print state_data_train.columns[i], clf.feature_importances_[i]
    

('TRAIN:', array([     0,      1,      3, ..., 396748, 396749, 396750]), 'TEST:', array([     2,      8,     17, ..., 396732, 396734, 396742]))
317400
79351
[[71158  1565]
 [ 3944  2684]]
0.930574283878
('TRAIN:', array([     1,      2,      3, ..., 396746, 396747, 396749]), 'TEST:', array([     0,      4,      6, ..., 396743, 396748, 396750]))
317400
79351
[[71191  1532]
 [ 3935  2693]]
0.931103577775
('TRAIN:', array([     0,      2,      3, ..., 396746, 396748, 396750]), 'TEST:', array([     1,      7,      9, ..., 396737, 396747, 396749]))
317400
79351
[[71167  1556]
 [ 3911  2717]]
0.931103577775
('TRAIN:', array([     0,      1,      2, ..., 396748, 396749, 396750]), 'TEST:', array([     3,     15,     19, ..., 396739, 396740, 396745]))
317402
79349
[[71208  1514]
 [ 3926  2701]]
0.931442110172
('TRAIN:', array([     0,      1,      2, ..., 396748, 396749, 396750]), 'TEST:', array([     5,     14,     16, ..., 396741, 396744, 396746]))
317402
79349
[[71195  1527]
 [ 3909  2718]]


In [24]:
test_thing = X_test.iloc[[0]]
test_thing['Amount Requested']=100
# test_thing = test_thing.reshape((len(test_thing),1))
print test_thing
print clf.predict(test_thing)
print y_test.iloc[0]

        Amount Requested    DTI  employment
166133               100  14.13           0
[1]
0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


We can see that the K_nearest Neighbor model, without using the state features, performs marginally better (above 93%) compared to the other models. While the model has an impressive looking score, we can see that by purely predicting rejection we can obtain an accuracy of 91.6%

Thus, I will select the latest model because it has highest accuracy. To predict the largest loan amount that will be successfully funded, I inspect the decision boundary by increasing the "Amount_Requested" variable until the model changes its decision from accept to reject. This assumes that in general, the model varies smoothly from accept to reject as we increase the "Amount Requested" variable. Because of the nonparametric nature of K-Nearest Neighbors, it is conceivable that the decision boundary between accept and reject is non-linear, which invalidates our prediction. However, I assume nonlinearity in the model, which seems to agree with the data.

In [25]:
def converge(data_copy, clf, interval):
    if interval[1] - interval[0] < 2:
        return interval[1]
    else:
        data_copy['Amount Requested'] = interval[1]
        large_value = clf.predict([data_copy])
        data_copy['Amount Requested'] = interval[0]
        small_value = clf.predict([data_copy])
#         assert large_value != small_value
        data_copy['Amount Requested'] = (interval[1] + interval[0]) / 2.0
        avg_value = clf.predict([data_copy])
        if avg_value == large_value:
            return converge(data_copy, clf, [interval[0], (interval[1] + interval[0]) / 2.0])
        else:
            return converge(data_copy, clf, [(interval[1] + interval[0]) / 2.0, interval[1]])
        

In [26]:
def loan_max(data_copy, clf):
#     print "new"
#     string = data_copy['Amount Requested']
#     print string
    interval = [0, 2000000]
    return converge(data_copy, clf, interval)
    

Because the model takes a while to predict the maximum loan amount, I will attempt prediction only for a subset of the test data. To evaluate the decision boundary found by the model, if the Amount Requested is greater than the predicted amount, I predict that the bank will reject the loan, and if the Amount Requested is Smaller than the predicted amount, I predict that the bank will accept the loan.

In [29]:
subset = no_state_data_test[:1000].copy()
subset = subset.assign(predicted = subset.apply(loan_max, axis=1, args=(clf,)))


In [31]:
new_data = pd.concat((subset, loan_test[:1000]), axis=1)

In [32]:
new_data

Unnamed: 0,Amount Requested,DTI,employment,predicted,loan_given
344363,6000,43.32,0,1956.939697,0
305878,20000,45.32,0,1956.939697,0
400573,7000,1.16,5,900.268555,0
260072,15000,41.00,0,1955.032349,0
133019,15000,22.00,0,7884.979248,0
256993,3000,3.18,0,900.268555,0
10786,35000,1.90,0,900.268555,0
144001,16000,19.11,5,15626.907349,0
285369,1000,41.28,0,1956.939697,0
416199,10000,30.50,0,900.268555,0


In [33]:
def evaluate(new_data):
    predicted = new_data['predicted']
    empirical = new_data['Amount Requested']
    if empirical >= predicted:
        return 0
    else:
        return 1

In [34]:
new_data = new_data.assign(predicted_class = new_data.apply(evaluate, axis = 1))

My prediction of the accuracy of the model for unseen test data is:

In [37]:
print metrics.accuracy_score(new_data['loan_given'], new_data['predicted_class'])

0.803


In [38]:
print metrics.accuracy_score(new_data['loan_given'], np.repeat(0,len(new_data)))

0.93


Thus, this model performs worse than the naive model of simply predicting the most common class (reject the loan) at all times. However, hopefully with more features and data this general scheme of inspecting the auxiliary classifier for a decision boundary would perform better.