In [1]:
import numpy as np
import matplotlib.pyplot as plt
import scipy.optimize

NUM_INPUT = 784  # Number of input neurons
NUM_HIDDEN = 40  # Number of hidden neurons
NUM_OUTPUT = 10  # Number of output neurons
NUM_CHECK = 5  # Number of examples on which to check the gradient

### Helper Functions

In [2]:
def reluprime(x):
    return 1 * (x > 0)

In [3]:
def relu(x):
    return np.maximum(x,0)

In [4]:
def hotten(x):
    return np.eye(NUM_OUTPUT)[x]

In [5]:
# Each row returned sum equals 1, representing probability of its the guess
# Returns a (N,10) matrix
def softmax(x):
    exp = np.exp(x)
    sum_exp = np.sum(exp,axis=0)
    return (exp / sum_exp[None,:]).T

In [61]:
## Function that performs transformation and gives the softmax of that
def get_yHat(X,w):
    W1, b1, W2, b2 = unpack(w)
    z1 = W1.dot(X.T) + b1[:,None] # (40,5)
    h = relu(z1)    #(40,5)
    z2 = W2.dot(h) + b2[:,None] # (10,5)
    y_Hat = softmax(z2) #(10.5)
    return y_Hat;

In [60]:
get_yHat(trainX[idxs,:],w)

array([[0.1076665 , 0.10665026, 0.08690104, 0.08913068, 0.10268867,
        0.10442921, 0.09664178, 0.10249014, 0.0920084 , 0.11139332],
       [0.11553261, 0.09758634, 0.08084534, 0.08251579, 0.11084937,
        0.10490977, 0.10799747, 0.08970749, 0.10157406, 0.10848175],
       [0.11504553, 0.10080747, 0.08302698, 0.0952252 , 0.11605032,
        0.10165582, 0.10692433, 0.08974994, 0.08898615, 0.10252826],
       [0.11992702, 0.09370014, 0.0865971 , 0.09176387, 0.11307983,
        0.09605632, 0.09832468, 0.09610266, 0.09781562, 0.10663276],
       [0.1154949 , 0.10139947, 0.08964884, 0.08843979, 0.10261602,
        0.10958189, 0.09905593, 0.10086694, 0.08865278, 0.10424345]])

### Given Functions

In [28]:
# Given a vector w containing all the weights and biased vectors, extract
# and return the individual weights and biases W1, b1, W2, b2.
# This is useful for performing a gradient check with check_grad.
def unpack (w):
    idx1 = NUM_INPUT*NUM_HIDDEN            # W1
    idx2 = idx1 + NUM_HIDDEN               # b1
    idx3 = idx2 + NUM_HIDDEN*NUM_OUTPUT    # W2
    W1 = w[    : idx1].reshape(NUM_HIDDEN, NUM_INPUT)
    b1 = w[idx1: idx2].reshape(NUM_HIDDEN)
    W2 = w[idx2: idx3].reshape(NUM_OUTPUT, NUM_HIDDEN)
    b2 = w[idx3:     ].reshape(NUM_OUTPUT)
    return W1, b1, W2, b2

In [29]:
# Given individual weights and biases W1, b1, W2, b2, concatenate them and
# return a vector w containing all of them.
# This is useful for performing a gradient check with check_grad.
def pack (W1, b1, W2, b2):
    return np.concatenate((W1.flatten(),b1.flatten(),W2.flatten(),b2.flatten()))

In [30]:
# Load the images and labels from a specified dataset (train or test).
def loadData (which):
    images = np.load("fashion_mnist_{}_images.npy".format(which))
    labels = np.load("fashion_mnist_{}_labels.npy".format(which))
    images = images/255.00
    labels = hotten(labels)
    return images, labels


In [31]:
# Given training images X, associated labels Y, and a vector of combined weights
# and bias terms w, compute and return the cross-entropy (CE) loss. You might
# want to extend this function to return multiple arguments (in which case you
# will also need to modify slightly the gradient check code below).
def fCE (X, Y, w):
    y_Hat = get_yHat(X,w)
    return -np.sum((Y) * np.log(y_Hat)) / len(Y)

In [32]:
fCE((trainX[idxs, :]),(trainY[idxs]), w)

2.435439167462153

In [75]:
# Given training images X, associated labels Y, and a vector of combined weights
# and bias terms w, compute and return the gradient of fCE. You might
# want to extend this function to return multiple arguments (in which case you
# will also need to modify slightly the gradient check code below).
def gradCE (X, Y, w):
    W1, b1, W2, b2 = unpack(w)
    y_hat = get_yHat(X,w);

    z1 = W1.dot(X.T) + b1[:,None] # (40,5)
    h = relu(z1) #(40,5)  

    grad_W2 = (y_hat-(Y)).T.dot(h.T) # (10,40)
    grad_b2 = np.mean(y_hat-(Y),axis=0) # (10,)

    g_T = ((y_hat-(Y)).dot(W2)) * reluprime(z1.T) # (N,40)

    grad_W1 = (g_T.T.dot(X))
    grad_b1 = np.mean(g_T.T, axis=1) # (40,1)
    gradPacked =  pack(grad_W1,grad_b1,grad_W2,grad_b2)
    return gradPacked

In [79]:
# scipy.optimize.check_grad(
# lambda w_:fCE(trainX[idxs,:],trainY[idxs],w_,),
# lambda w_:gradCE(trainX[idxs,:],trainY[idxs],w_),
# w)
# gradCE(trainX[idxs,:],trainY[idxs],w)
print_fprim()

True
False
True
False


In [35]:
# Given training and testing datasets and an initial set of weights/biases b,
# train the NN.
def train (trainX, trainY, testX, testY, w):
    pass

## Main

In [36]:
trainX, trainY = loadData("train")
testX, testY = loadData("test")


# Initialize weights randomly
W1 = 2*(np.random.random(size=(NUM_HIDDEN, NUM_INPUT))/NUM_INPUT**0.5) - 1./NUM_INPUT**0.5
b1 = 0.01 * np.ones(NUM_HIDDEN)
W2 = 2*(np.random.random(size=(NUM_OUTPUT, NUM_HIDDEN))/NUM_HIDDEN**0.5) - 1./NUM_HIDDEN**0.5
b2 = 0.01 * np.ones(NUM_OUTPUT)

In [37]:
# Concatenate all the weights and biases into one vector; this is necessary for check_grad
w = pack(W1, b1, W2, b2)
w.shape

(31810,)

In [38]:
# idxs = np.random.permutation(trainX.shape[0])[0:NUM_CHECK]
# fCE_ = fCE((trainX[idxs, :]),(trainY[idxs]), w)
# gradCE_ = gradCE((trainX[idxs, :]),(trainY[idxs]), w)

In [39]:
idxs = np.random.permutation(trainX.shape[0])[0:NUM_CHECK]

scipy.optimize.check_grad(
lambda w_:fCE(trainX[idxs,:],trainY[idxs],w_,),
lambda w_:gradCE(trainX[idxs,:],trainY[idxs],w_),
w)

9.651886507445322

In [69]:
idxs = np.random.permutation(trainX.shape[0])[0:NUM_CHECK]


approx_fprime = scipy.optimize.approx_fprime(w, lambda W_: fCE(trainX[idxs,:], trainY[idxs,:], W_), 1e-8)
aW1, ab1, aW2, ab2 = unpack(approx_fprime)
W1, b1, W2, b2 = unpack(gradCE(trainX[idxs,:], trainY[idxs,:],w))

In [70]:
print(W1[0,544])
print(aW1[0,544])

0.0
0.0


In [73]:
print(W2[0])
print(aW2[0])

[0.         0.14218534 0.02277712 0.         0.03154347 0.11805835
 0.01787337 0.         0.         0.01325445 0.02040825 0.28036803
 0.09263068 0.06686429 0.         0.07279565 0.16934526 0.00764875
 0.01265497 0.10799873 0.         0.11325885 0.09668441 0.12966254
 0.00203075 0.01910966 0.15900512 0.01881027 0.10420783 0.1447109
 0.         0.01156827 0.04706492 0.         0.         0.12949529
 0.         0.         0.16393208 0.        ]
[0.         0.02843703 0.00455542 0.         0.00630864 0.02361169
 0.00357465 0.         0.         0.00265086 0.00408167 0.05607355
 0.01852611 0.01337281 0.         0.01455911 0.03386904 0.00152975
 0.00253095 0.02159974 0.         0.02265175 0.01933689 0.0259325
 0.00040612 0.00382192 0.03180101 0.00376206 0.02084151 0.02894218
 0.         0.00231362 0.009413   0.         0.         0.02589902
 0.         0.         0.0327864  0.        ]


In [74]:
print(np.allclose(ab2,b2, 1e-3))
print(np.allclose(aW1,W1, 1e-3))
print(np.allclose(ab1,b1, 1e-3))
print(np.allclose(aW2,W2, 1e-3))

True
False
True
False


In [78]:
def print_fprim():
    approx_fprime = scipy.optimize.approx_fprime(w, lambda W_: fCE(trainX[idxs,:], trainY[idxs,:], W_), 1e-8)
    aW1, ab1, aW2, ab2 = unpack(approx_fprime)
    W1, b1, W2, b2 = unpack(gradCE(trainX[idxs,:], trainY[idxs,:],w))
    print(np.allclose(ab2,b2, 1e-3))
    print(np.allclose(aW1,W1, 1e-3))
    print(np.allclose(ab1,b1, 1e-3))
    print(np.allclose(aW2,W2, 1e-3))