In [33]:
from __future__ import print_function, division
from builtins import range
import os
import numpy as np
import pandas as pd
from sklearn.utils import shuffle
import matplotlib.pyplot as plt


def forward(X, W1, b1, W2, b2):
    """Forward pass of a one-hidden-layer network: ReLU hidden layer, softmax output.

    Parameters
    ----------
    X : 2-D array, one input row per sample.
    W1, b1 : weights and bias of the hidden layer (``X.dot(W1) + b1``).
    W2, b2 : weights and bias of the output layer (``Z.dot(W2) + b2``).

    Returns
    -------
    Y : row-wise softmax of the output activations; every row sums to 1.
    Z : hidden-layer activations after the ReLU.
    """
    # sigmoid alternative (pair with the sigmoid lines in derivative_w1/b1):
    # Z = 1 / (1 + np.exp(-( X.dot(W1) + b1 )))

    # relu: the dot product creates a fresh array, so clamping in place is safe
    Z = X.dot(W1) + b1
    Z[Z < 0] = 0

    A = Z.dot(W2) + b2
    # Softmax is unchanged by subtracting a per-row constant; using the row
    # maximum keeps every exponent <= 0 so np.exp cannot overflow to inf
    # (which would otherwise turn the whole row into NaN).
    expA = np.exp(A - A.max(axis=1, keepdims=True))
    Y = expA / expA.sum(axis=1, keepdims=True)
    return Y, Z

def derivative_w2(Z, T, Y):
    """Return Z^T (Y - T), the gradient term used to update W2.

    Z holds the hidden activations, T the targets and Y the network outputs.
    """
    output_error = Y - T
    return np.dot(Z.T, output_error)

def derivative_b2(T, Y):
    """Return the column-wise sum of (Y - T), the gradient term used to update b2."""
    output_error = Y - T
    return np.sum(output_error, axis=0)

def derivative_w1(X, Z, T, Y, W2):
    """Return the gradient term used to update W1 (ReLU hidden layer).

    The output error (Y - T) is pushed back through W2 and zeroed wherever
    the hidden activation Z is not positive, then projected onto the inputs X.
    """
    # sigmoid variant: replace the mask with Z*(1 - Z)
    relu_mask = Z > 0
    hidden_delta = np.dot(Y - T, W2.T) * relu_mask
    return np.dot(X.T, hidden_delta)

def derivative_b1(Z, T, Y, W2):
    """Return the gradient term used to update b1 (ReLU hidden layer).

    Same back-propagated error as in derivative_w1, summed over the rows.
    """
    # sigmoid variant: replace the mask with Z*(1 - Z)
    relu_mask = Z > 0
    hidden_delta = np.dot(Y - T, W2.T) * relu_mask
    return np.sum(hidden_delta, axis=0)

In [34]:
def get_normalized_data():
    """Load 'Mnist/train.csv', shuffle it, split it and standardize the features.

    The first CSV column is taken as the label and the remaining columns as
    features. After an (unseeded) in-place row shuffle, the last 1000 rows
    become the test set and the rest the training set. Both sets are
    standardized with the per-column mean and standard deviation of the
    training set only; columns with zero standard deviation are divided by 1.

    Returns
    -------
    Xtrain, Xtest, Ytrain, Ytest : float32-derived arrays of features / labels.

    Raises
    ------
    FileNotFoundError
        If 'Mnist/train.csv' does not exist relative to the working directory.
    """
    print("Reading in and transforming data...")

    data_path = 'Mnist/train.csv'
    if not os.path.exists(data_path):
        print('Looking for Mnist/train.csv')
        print('You have not downloaded the data and/or not placed the files in the correct location.')
        print('Please get the data from: https://www.kaggle.com/c/digit-recognizer')
        print('Place train.csv in the folder Mnist adjacent to the class folder')
        # Raise instead of calling the interactive `exit()` helper, which is
        # meant for the REPL and is not a reliable way to stop program code.
        raise FileNotFoundError(data_path)

    df = pd.read_csv(data_path)
    data = df.values.astype(np.float32)
    np.random.shuffle(data)
    X = data[:, 1:]
    Y = data[:, 0]

    # hold out the last 1000 shuffled rows for testing
    Xtrain = X[:-1000]
    Ytrain = Y[:-1000]
    Xtest  = X[-1000:]
    Ytest  = Y[-1000:]

    # normalize the data using training-set statistics only
    mu = Xtrain.mean(axis=0)
    std = Xtrain.std(axis=0)
    np.place(std, std == 0, 1)  # constant columns: avoid division by zero
    Xtrain = (Xtrain - mu) / std
    Xtest = (Xtest - mu) / std

    return Xtrain, Xtest, Ytrain, Ytest

In [35]:
def predict(p_y):
    """Return, for each row of p_y, the column index holding the largest value."""
    labels = np.argmax(p_y, axis=1)
    return labels


def error_rate(p_y, t):
    """Return the fraction of rows whose predicted label differs from the target t."""
    mismatches = predict(p_y) != t
    return np.mean(mismatches)


def cost(p_y, t):
    """Return the total cross-entropy, -sum(t * log(p_y)).

    Entries whose target is 0 contribute exactly 0, following the usual
    0 * log(0) = 0 convention. Without this, a predicted probability that
    underflows to 0 on a non-target class would produce 0 * -inf = NaN and
    poison the whole sum. A zero probability on an entry with a non-zero
    target still yields an infinite cost.

    Parameters
    ----------
    p_y : array of predicted probabilities.
    t : array of targets, broadcastable against p_y (e.g. one-hot indicators).
    """
    p_y = np.asarray(p_y)
    t = np.asarray(t)
    # substitute 1 (log 1 == 0) wherever the target is zero
    safe_p = np.where(t != 0, p_y, 1.0)
    tot = t * np.log(safe_p)
    return -tot.sum()

def y2indicator(y, K=10):
    """Convert a 1-D sequence of class labels into a one-hot indicator matrix.

    Parameters
    ----------
    y : 1-D array-like of labels; values are cast to int32.
    K : number of classes, i.e. the number of columns of the result
        (defaults to 10, the value that was previously hard-coded).

    Returns
    -------
    ind : float array of shape (len(y), K) with a single 1 in each row,
        in the column given by that row's label.
    """
    y = np.asarray(y).astype(np.int32)
    N = len(y)
    ind = np.zeros((N, K))
    # set ind[i, y[i]] = 1 for every row at once instead of looping in Python
    ind[np.arange(N), y] = 1
    return ind

In [None]:
# Settings shared by the training cells below
max_iter = 20      # number of passes over the mini-batches
print_period = 10  # evaluate on the test set every `print_period` batches
# features are standardized with training-set statistics inside the loader
Xtrain, Xtest, Ytrain, Ytest = get_normalized_data()
lr = 4e-5          # step size for the constant-learning-rate loop
reg = 1e-2         # L2 penalty weight added to every gradient

Reading in and transforming data...


In [None]:
# One-hot encode the labels so they can be compared with the softmax output
Ytrain_ind = y2indicator(Ytrain)
Ytest_ind = y2indicator(Ytest)

N, D = Xtrain.shape
batch_sz = 500
# whole batches only: any trailing rows that do not fill a batch are not visited
# (the original referenced an undefined name `batchSize` here -> NameError)
n_batches = N // batch_sz

M = 300  # hidden-layer width
K = 10   # number of output classes (matches the width of the indicator matrices)

# Scale the initial weights by 1/sqrt(fan-in). np.sqrt(D) matches the RMSprop
# cell's initialization and equals the previously hard-coded 28 when D == 784.
W1 = np.random.randn(D, M) / np.sqrt(D)
b1 = np.zeros(M)
W2 = np.random.randn(M, K) / np.sqrt(M)
b2 = np.zeros(K)

In [None]:
# Mini-batch gradient descent with a constant learning rate.
# LL_batch records the test-set cost and CR_batch the test-set error rate,
# both sampled every `print_period` batches.
LL_batch = []
CR_batch = []
for i in range(max_iter):
    for j in range(n_batches):
        lo = j * batch_sz
        hi = lo + batch_sz
        Xbatch = Xtrain[lo:hi]
        Ybatch = Ytrain_ind[lo:hi]
        pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)

        # Evaluate every gradient BEFORE changing any parameter. The hidden-layer
        # gradients back-propagate through W2, so they must see the same W2 that
        # produced pYbatch and Z, not a W2 that has already been stepped.
        gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg*W2
        gb2 = derivative_b2(Ybatch, pYbatch) + reg*b2
        gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1
        gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1

        # updates
        W2 -= lr*gW2
        b2 -= lr*gb2
        W1 -= lr*gW1
        b1 -= lr*gb1

        if j % print_period == 0:
            # evaluate on the held-out test set
            pY, _ = forward(Xtest, W1, b1, W2, b2)
            ll = cost(pY, Ytest_ind)
            LL_batch.append(ll)
            print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, ll))

            err = error_rate(pY, Ytest)
            CR_batch.append(err)
            print("Error rate:", err)

pY, _ = forward(Xtest, W1, b1, W2, b2)
print("Final error rate:", error_rate(pY, Ytest))


Cost at iteration i=0, j=0: 2442.265174
Error rate: 0.898
Cost at iteration i=0, j=10: 1823.122959
Error rate: 0.55
Cost at iteration i=0, j=20: 1457.599911
Error rate: 0.371
Cost at iteration i=0, j=30: 1220.597502
Error rate: 0.284
Cost at iteration i=0, j=40: 1057.496295
Error rate: 0.247
Cost at iteration i=0, j=50: 939.242476
Error rate: 0.218
Cost at iteration i=0, j=60: 851.552137
Error rate: 0.2
Cost at iteration i=0, j=70: 783.180781
Error rate: 0.187
Cost at iteration i=0, j=80: 728.860318
Error rate: 0.182
Cost at iteration i=1, j=0: 718.721981
Error rate: 0.178
Cost at iteration i=1, j=10: 677.182219
Error rate: 0.168
Cost at iteration i=1, j=20: 642.376981
Error rate: 0.159
Cost at iteration i=1, j=30: 613.508027
Error rate: 0.157
Cost at iteration i=1, j=40: 587.414407
Error rate: 0.15
Cost at iteration i=1, j=50: 565.449532
Error rate: 0.148
Cost at iteration i=1, j=60: 546.763824
Error rate: 0.147
Cost at iteration i=1, j=70: 529.954742
Error rate: 0.138
Cost at iterati

Cost at iteration i=15, j=60: 248.311085
Error rate: 0.076
Cost at iteration i=15, j=70: 247.759288
Error rate: 0.078
Cost at iteration i=15, j=80: 247.057839
Error rate: 0.076
Cost at iteration i=16, j=0: 246.689679
Error rate: 0.077
Cost at iteration i=16, j=10: 245.844723
Error rate: 0.074
Cost at iteration i=16, j=20: 245.167490
Error rate: 0.074
Cost at iteration i=16, j=30: 245.239564
Error rate: 0.074
Cost at iteration i=16, j=40: 244.367384
Error rate: 0.075
Cost at iteration i=16, j=50: 243.416123
Error rate: 0.075
Cost at iteration i=16, j=60: 243.610372
Error rate: 0.074
Cost at iteration i=16, j=70: 243.101740
Error rate: 0.074
Cost at iteration i=16, j=80: 242.441448
Error rate: 0.073
Cost at iteration i=17, j=0: 242.094681
Error rate: 0.073
Cost at iteration i=17, j=10: 241.305817
Error rate: 0.073
Cost at iteration i=17, j=20: 240.682596
Error rate: 0.072
Cost at iteration i=17, j=30: 240.804672
Error rate: 0.072
Cost at iteration i=17, j=40: 239.987460
Error rate: 0.073

In [None]:
# RMSprop to speed up training.
# Parameters are re-initialized so this run starts from scratch rather than
# from the weights left behind by the constant-learning-rate loop.
W1 = np.random.randn(D, M) / np.sqrt(D)
b1 = np.zeros(M)
W2 = np.random.randn(M, K) / np.sqrt(M)
b2 = np.zeros(K)
LL_rms = []
CR_rms = []
lr0 = 0.001 # if you set this too high you'll get NaN!
# running averages of the squared gradients, one per parameter; they start as
# the scalar 1 and broadcast to the gradient's shape on the first update
cache_W2 = 1
cache_b2 = 1
cache_W1 = 1
cache_b1 = 1
decay_rate = 0.999
eps = 1e-10  # keeps the denominator away from zero
for i in range(max_iter):
    for j in range(n_batches):
        lo = j * batch_sz
        hi = lo + batch_sz
        Xbatch = Xtrain[lo:hi]
        Ybatch = Ytrain_ind[lo:hi]
        pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)

        # Evaluate every gradient BEFORE changing any parameter. The hidden-layer
        # gradients back-propagate through W2, so they must see the same W2 that
        # produced pYbatch and Z, not a W2 that has already been stepped.
        gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg*W2
        gb2 = derivative_b2(Ybatch, pYbatch) + reg*b2
        gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1
        gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1

        # updates: each step is divided by the root of its running squared gradient
        cache_W2 = decay_rate*cache_W2 + (1 - decay_rate)*gW2*gW2
        W2 -= lr0 * gW2 / (np.sqrt(cache_W2) + eps)

        cache_b2 = decay_rate*cache_b2 + (1 - decay_rate)*gb2*gb2
        b2 -= lr0 * gb2 / (np.sqrt(cache_b2) + eps)

        cache_W1 = decay_rate*cache_W1 + (1 - decay_rate)*gW1*gW1
        W1 -= lr0 * gW1 / (np.sqrt(cache_W1) + eps)

        cache_b1 = decay_rate*cache_b1 + (1 - decay_rate)*gb1*gb1
        b1 -= lr0 * gb1 / (np.sqrt(cache_b1) + eps)

        if j % print_period == 0:
            # evaluate on the held-out test set
            pY, _ = forward(Xtest, W1, b1, W2, b2)
            ll = cost(pY, Ytest_ind)
            LL_rms.append(ll)
            print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, ll))

            err = error_rate(pY, Ytest)
            CR_rms.append(err)
            print("Error rate:", err)

pY, _ = forward(Xtest, W1, b1, W2, b2)
print("Final error rate:", error_rate(pY, Ytest))

Cost at iteration i=0, j=0: 1406.028863
Error rate: 0.41
Cost at iteration i=0, j=10: 443.089164
Error rate: 0.132
Cost at iteration i=0, j=20: 365.526283
Error rate: 0.11
Cost at iteration i=0, j=30: 343.911460
Error rate: 0.101
Cost at iteration i=0, j=40: 315.161752
Error rate: 0.095
Cost at iteration i=0, j=50: 290.523600
Error rate: 0.086
Cost at iteration i=0, j=60: 270.800031
Error rate: 0.084
Cost at iteration i=0, j=70: 265.197148
Error rate: 0.077
Cost at iteration i=0, j=80: 257.420551
Error rate: 0.07
Cost at iteration i=1, j=0: 252.976401
Error rate: 0.07
Cost at iteration i=1, j=10: 237.497603
Error rate: 0.07
Cost at iteration i=1, j=20: 230.907663
Error rate: 0.069
Cost at iteration i=1, j=30: 225.839297
Error rate: 0.065
Cost at iteration i=1, j=40: 224.311280
Error rate: 0.067
Cost at iteration i=1, j=50: 217.280458
Error rate: 0.061
Cost at iteration i=1, j=60: 212.162417
Error rate: 0.06
Cost at iteration i=1, j=70: 209.871763
Error rate: 0.059
Cost at iteration i=1

In [None]:
# Overlay the recorded test-set cost curves of the two training runs
for curve, name in ((LL_batch, 'const'), (LL_rms, 'rms')):
    plt.plot(curve, label=name)
plt.legend()
plt.show()