In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from __future__ import division
%matplotlib inline

In [79]:
X = pd.DataFrame.from_csv("X_data.csv")
y = pd.DataFrame.from_csv("y_data.csv")
y = np.ravel(y)

In [81]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=0)

In [5]:
# Train neural network
from sklearn.neural_network import MLPClassifier

clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=1)
clf.fit(X_train.iloc[:,1:], y_train)

MLPClassifier(activation='relu', alpha=1e-05, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(5, 2), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=1, shuffle=True,
       solver='lbfgs', tol=0.0001, validation_fraction=0.1, verbose=False,
       warm_start=False)

In [6]:
y_test_predict = pd.DataFrame(clf.predict_proba(X_test), index= X_test.index)

In [8]:
grades = [grade for grade in X_test.columns if grade[:6] == "grade_"]

results = {}

for grade in grades:
    good = y_test_predict[y_test == False][X_test[grade] == True][0].mean()
    bad = y_test_predict[y_test == True][X_test[grade] == True][0].mean()
    results[grade[-1]] = {'Good': good, "Bad": bad}

print pd.DataFrame(results)

             A         B         C         D         E         F         G
Bad   0.917442  0.841667  0.794434  0.738691  0.715175  0.656285  0.659189
Good  0.935606  0.855116  0.808373  0.754008  0.723817  0.680087  0.641830




Since the probability of repayment for any given loan is high, out model will tend to predict that a loan will get repaid. However, the actual probability of repayment varies. From the above, the model shows potential as, even when broken out by grade, the model shows some predictive power. The only grade this did not hold for is grade G, which may be due to the small size.

Since the model will tend to predict repayment for every loan, the score function is not very useful. Another way to score how well we're doing is to use the model to decide what loans to buy to maximize the loss-adjusted return

## Testing number of hidden layers and neurons

In [61]:
num_layers = range(1,3)
num_neurons = range(1,6)

scores = {}

for num_layer in num_layers:
    for num_neuron in num_neurons:
        clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(num_neuron, num_layer), random_state=1)
        clf.fit(X_train, y_train)
        clf.predict()
        scores[(num_neuron, num_layer)] = score

scores

{(1, 1): 0.84441272430668846,
 (1, 2): 0.84434475258292552,
 (2, 1): 0.84441272430668846,
 (2, 2): 0.84434475258292552,
 (3, 1): 0.84441272430668846,
 (3, 2): 0.84441272430668846,
 (4, 1): 0.84441272430668846,
 (4, 2): 0.84339314845024471,
 (5, 1): 0.84441272430668846,
 (5, 2): 0.84359706362153342}

In [None]:
clf

In [78]:
# alphas = 1 / 10 ** np.arange(1,7)
# 0.1, 0.01, ... , 0.0000001
alphas = np.arange(1,10)
alpha_scores = {}

for alpha in alphas:
    clf = MLPClassifier(solver='lbfgs', alpha=alpha, hidden_layer_sizes=(1, 1), random_state=1)
    clf.fit(X_train, y_train)
    alpha_scores[alpha] = clf.score(X_test, y_test)

alpha_scores

{1: 0.84441272430668846,
 2: 0.84441272430668846,
 3: 0.84441272430668846,
 4: 0.84441272430668846,
 5: 0.84441272430668846,
 6: 0.84441272430668846,
 7: 0.84441272430668846,
 8: 0.84441272430668846,
 9: 0.84441272430668846}

In [85]:
print 1 - sum(y_test==True) / len(y_test)


# plt.scatter(y_test_predict[1], y_test)
# plt.show()
# y_test_predict

0.844412724307


In [77]:
help(plt.scatter)

Help on function scatter in module matplotlib.pyplot:

scatter(x, y, s=20, c=None, marker=u'o', cmap=None, norm=None, vmin=None, vmax=None, alpha=None, linewidths=None, verts=None, edgecolors=None, hold=None, data=None, **kwargs)
    Make a scatter plot of x vs y, where x and y are sequence like objects
    of the same lengths.
    
    Parameters
    ----------
    x, y : array_like, shape (n, )
        Input data
    
    s : scalar or array_like, shape (n, ), optional, default: 20
        size in points^2.
    
    c : color or sequence of color, optional, default : 'b'
        `c` can be a single color format string, or a sequence of color
        specifications of length `N`, or a sequence of `N` numbers to be
        mapped to colors using the `cmap` and `norm` specified via kwargs
        (see below). Note that `c` should not be a single numeric RGB or
        RGBA sequence because that is indistinguishable from an array of
        values to be colormapped.  `c` can be a 2-D arr