In [1]:
import pandas as pd
data = pd.read_csv("LoansImputed.csv")
print(data.head())

   credit.policy             purpose  int.rate  installment  log.annual.inc  \
0              1  debt_consolidation    0.1496       194.02       10.714418   
1              1           all_other    0.1114       131.22       11.002100   
2              1         credit_card    0.1343       678.08       11.884489   
3              1           all_other    0.1059        32.55       10.433822   
4              1      small_business    0.1501       225.37       12.269047   

     dti  fico  days.with.cr.line  revol.bal  revol.util  inq.last.6mths  \
0   4.00   667        3180.041667       3839        76.8               0   
1  11.08   722        5116.000000      24220        68.6               0   
2  10.15   682        4209.958333      41674        74.1               0   
3  14.47   687        1110.000000       4485        36.9               1   
4   6.45   677        6240.000000      56411        75.3               0   

   delinq.2yrs  pub.rec  not.fully.paid  annualincome  
0           

<h4>Variables in the dataset</h4>

<h5>Dependent Variable</h5>

<b>not.fully.paid</b>: A binary variable. 1 means borrower defaulted and 0 means monthly payments are made on time

<h5>Independent Variables</h5>

<b>credit.policy</b>: 1 if borrower meets credit underwriting criteria and 0 otherwise<br />
<b>purpose</b>: The reason for the loan<br />
<b>int.rate</b>: Interest rate for the loan (14% is stored as 0.14)<br />
<b>installment</b>: Monthly payment to be made for the loan<br />
<b>log.annual.inc</b>: Natural log of self reported annual income of the borrower<br />
<b>dti</b>: Debt to Income ratio of the borrower<br />
<b>fico</b>: FICO credit score of the borrower<br />
<b>days.with.cr.line</b>: Number of days borrower has had credit line<br />
<b>revol.bal</b>: The borrower's rovolving balance (Principal loan amount still remaining)<bv />
<b>revol.util</b>: Amount of credit line utilized by borrower as percentage of total available credit<br />
<b>inq.last.6mths</b>: Borrowers credit inquiry in last 6 months<br />
<b>delinq.2yrs</b>: Number of times borrower was deliquent in last 2 years<br />
<b>pub.rec</b>: Number of derogatory pulic record borrower has (Bankruptcy, tax liens and judgements etc.)<br />
<b>annualincome</b>: Annual income of borrpwer<br />

In [2]:
print(data.describe())

       credit.policy     int.rate  installment  log.annual.inc          dti  \
count    5000.000000  5000.000000  5000.000000     5000.000000  5000.000000   
mean        0.896200     0.120816   308.325968       10.911819    12.308698   
std         0.305031     0.025336   197.307080        0.598897     6.754521   
min         0.000000     0.060000    15.690000        7.600902     0.000000   
25%         1.000000     0.100800   163.550000       10.545341     7.067500   
50%         1.000000     0.121800   260.640000       10.915088    12.300000   
75%         1.000000     0.137900   407.510000       11.277203    17.652500   
max         1.000000     0.216400   926.830000       14.528354    29.960000   

              fico  days.with.cr.line     revol.bal   revol.util  \
count  5000.000000        5000.000000  5.000000e+03  5000.000000   
mean    710.926000        4510.713433  1.587253e+04    46.395622   
std      37.026757        2418.553606  3.111632e+04    29.138604   
min     617.0000

In [3]:
print(data[data['not.fully.paid'] == 1].describe())

       credit.policy     int.rate  installment  log.annual.inc          dti  \
count    1533.000000  1533.000000  1533.000000     1533.000000  1533.000000   
mean        0.661448     0.132452   342.785114       10.885023    13.195838   
std         0.473372     0.025495   223.948527        0.666718     7.006769   
min         0.000000     0.070500    15.910000        7.600902     0.000000   
25%         0.000000     0.115400   168.640000       10.491274     7.830000   
50%         1.000000     0.131600   287.310000       10.878047    13.340000   
75%         1.000000     0.148200   491.300000       11.276633    18.830000   
max         1.000000     0.216400   926.830000       13.458836    29.960000   

              fico  days.with.cr.line     revol.bal   revol.util  \
count  1533.000000        1533.000000  1.533000e+03  1533.000000   
mean    697.828441        4393.541259  2.106629e+04    52.255075   
std      33.756808        2431.785491  4.990569e+04    29.057906   
min     617.0000

In [4]:
print(data[data['not.fully.paid'] == 0].describe())

       credit.policy     int.rate  installment  log.annual.inc          dti  \
count         3467.0  3467.000000  3467.000000     3467.000000  3467.000000   
mean             1.0     0.115671   293.089201       10.923667    11.916432   
std              0.0     0.023498   182.272593        0.566024     6.603058   
min              1.0     0.060000    15.690000        8.342840     0.000000   
25%              1.0     0.096300   159.920000       10.585573     6.775000   
50%              1.0     0.116600   249.680000       10.915088    11.860000   
75%              1.0     0.131600   394.360000       11.277203    17.120000   
max              1.0     0.208600   914.420000       14.528354    29.420000   

              fico  days.with.cr.line      revol.bal   revol.util  \
count  3467.000000        3467.000000    3467.000000  3467.000000   
mean    716.717335        4562.523339   13576.013268    43.804753   
std      36.935882        2411.216297   16685.502884    28.800678   
min     627.

In [5]:
print("Number of loans that have 50% credit utilization and defaulted: ")
print(len(data[(data['revol.util'] > 50.00) & (data['not.fully.paid'] == 1)]))
print("Number of loans that have 50% credit utilization and not defaulted: ")
print(len(data[(data['revol.util'] > 50.00) & (data['not.fully.paid'] == 0)]))

Number of loans that have 50% credit utilization and defaulted: 
837
Number of loans that have 50% credit utilization and not defaulted: 
1433


In [6]:
from sklearn import preprocessing
from scipy.stats import boxcox
from sklearn.preprocessing import OneHotEncoder

#Print unique values of purpose
print(pd.Series.unique(data['purpose']))
# Convert purpose to category
data['purpose'] = data['purpose'].astype('category')
#OneHotEcoding
data['purpose'] = pd.get_dummies(data['purpose'], prefix=['purpose'])

#extract dependent variable as label
Y = data['not.fully.paid']
#Drop dependent variable and categorical variable
X = data.drop('not.fully.paid', 1)

#scale dependent variable
X = preprocessing.scale(X)

#print first row of X
print(X[0])
print(X[0].mean(axis=0))
print(X[0].std(axis=0))

['debt_consolidation' 'all_other' 'credit_card' 'small_business'
 'home_improvement' 'educational' 'major_purchase']
[ 0.34032684 -0.57026784  1.13619619 -0.57938823 -0.32964068 -1.23021742
 -1.18644994 -0.55024824 -0.38676608  1.04354413 -0.70711203 -0.32478726
  3.62321686 -0.37391434]
-0.006822001516728127
1.2104049514994222




In [7]:
#Shuffle and split data into 70% in training and 30% in testing
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=42)
print("X_train size = {0}".format(len(X_train)))
print("Y_train size = {0}".format(len(Y_train)))
print("X_test size = {0}".format(len(X_test)))

X_train size = 3500
Y_train size = 3500
X_test size = 1500


In [8]:
#Using Artificial Neural Network
import numpy as np
import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
import time
from sklearn import metrics
from sklearn.metrics import confusion_matrix

#Fit ANN model to data
startTrain = time.time()
classifier = Sequential()
classifier.add(Dense(activation="relu", kernel_initializer="uniform", units= 6, input_dim=14))
classifier.add(Dropout(rate=0.1)) 
classifier.add(Dense(activation="relu", kernel_initializer="uniform", units=6)) 
classifier.add(Dense(activation="sigmoid", kernel_initializer="uniform", units=1))
classifier.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
classifier.fit(np.array(X_train), np.array(Y_train), batch_size=10, epochs=100)
endTrain = time.time()

print("ANN training time (secs): {0}".format(endTrain - startTrain))

#Predict using ANN
startTest = time.time()
Y_pred = classifier.predict_classes(np.array(X_test))
endTest = time.time()

print("ANN prediction time (secs): {0}".format(endTest - startTest))
print("ANN accuracy in testing set: {0}".format(metrics.accuracy_score(Y_test, Y_pred)))
print("ANN F1 score in testing set: {0}".format(metrics.f1_score(Y_test, Y_pred, average='micro')))

from sklearn.metrics import confusion_matrix
cm = confusion_matrix(Y_test, Y_pred)

print("ANN Confusion Matrix: {0}".format(cm))

Using TensorFlow backend.


Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Use tf.cast instead.
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 6

Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100
ANN training time (secs): 36.300463914871216
ANN prediction time (secs): 0.048564910888671875
ANN accuracy in testing set: 0.7813333333333333
ANN F1 score in testing set: 0.7813333333333333
ANN Confusion Matrix: [[992  28]
 [300 180]]
