In [1566]:
import numpy as np
import matplotlib.pyplot as plt
import scipy.optimize

NUM_INPUT = 784  # Number of input neurons
NUM_HIDDEN = 40  # Number of hidden neurons
NUM_OUTPUT = 10  # Number of output neurons
NUM_CHECK = 5  # Number of examples on which to check the gradient

### Helper Functions

In [1567]:
def reluprime(x):
    return 1 * (x > 0)

In [1568]:
def relu(x):
    return np.maximum(x,0)

In [1569]:
def hotten(y):
    new_Y = np.zeros((10,y.shape[0]))
    for i,val in enumerate(y):
        new_Y[val][i] = 1
    return new_Y.T

In [1570]:
def softmax(x):
#     x -= np.max(x,axis=0).reshape(1,-1)
    z = np.exp(x) / np.sum(np.exp(x),axis=0)
    return z.T

In [1571]:
def get_yHat(x,w):
    W1, b1, W2, b2 = unpack(w)
    z1 = np.dot(W1,x.T) + b1[:,None]
    h = relu(z1)
    z2 = np.dot(W2,h) + b2[:,None]
    return softmax(z2)

In [1572]:
trainX

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [1573]:
get_yHat(trainX[idxs,:], w)

array([[0.0853229 , 0.10178025, 0.12737272, 0.14128194, 0.11025431,
        0.0783632 , 0.086476  , 0.0820824 , 0.08800928, 0.099057  ],
       [0.09790313, 0.09906477, 0.10891927, 0.11078495, 0.10273045,
        0.09730091, 0.09515759, 0.0896983 , 0.09859567, 0.09984497]])

### Given Functions

In [1574]:
# Given a vector w containing all the weights and biased vectors, extract
# and return the individual weights and biases W1, b1, W2, b2.
# This is useful for performing a gradient check with check_grad.
def unpack (w):
    W1 = w[0:NUM_INPUT * NUM_HIDDEN]
    W1 = np.reshape(W1, (NUM_HIDDEN, NUM_INPUT))
    w = np.delete(w, np.arange(NUM_INPUT * NUM_HIDDEN))
    b1 = w[0:NUM_HIDDEN]
    w = np.delete(w, np.arange(NUM_HIDDEN))
    W2 = w[0:NUM_HIDDEN * NUM_OUTPUT]
    W2 = np.reshape(W2, (NUM_OUTPUT, NUM_HIDDEN))
    w = np.delete(w, np.arange(NUM_HIDDEN * NUM_OUTPUT))
    b2 = w[0:NUM_OUTPUT]
    return W1, b1, W2, b2

In [1575]:
# Given individual weights and biases W1, b1, W2, b2, concatenate them and
# return a vector w containing all of them.
# This is useful for performing a gradient check with check_grad.
def pack (W1, b1, W2, b2):
    W = W1.flatten()
    W = np.append(W, b1)
    W = np.append(W, W2.flatten())
    W = np.append(W, b2)
    return W

In [1576]:
# Load the images and labels from a specified dataset (train or test).
def loadData (which):
    images = np.load("fashion_mnist_{}_images.npy".format(which))
    labels = np.load("fashion_mnist_{}_labels.npy".format(which))
    labels = hotten(labels)
    return images, labels

In [1577]:
from sklearn.metrics import log_loss
# Given training images X, associated labels Y, and a vector of combined weights
# and bias terms w, compute and return the cross-entropy (CE) loss. You might
# want to extend this function to return multiple arguments (in which case you
# will also need to modify slightly the gradient check code below).
def fCE (X, Y, w):
    yhat = get_yHat(X, w) #(5,10)
    cost = np.sum((Y*np.log(yhat)))
    cost = cost * (-1/Y.shape[0])
    return cost

In [1578]:
fCE((trainX[idxs, :]),(trainY[idxs]), w)

2.2064293026141795

In [1579]:
# Given training images X, associated labels Y, and a vector of combined weights
# and bias terms w, compute and return the gradient of fCE. You might
# want to extend this function to return multiple arguments (in which case you
# will also need to modify slightly the gradient check code below).
def gradCE (X, Y, w):
    W1, b1, W2, b2 = unpack(w)
    yHat = get_yHat(X,w) #(10,5)
    z1 = W1.dot(X.T) + b1[:,None]
    h = reluprime(z1)
    
    gradW2fCE = (yHat - Y).T.dot(h.T) # (10,40)
    gradb2fCE = np.mean(yHat - Y,axis=0).reshape(10,1) #(10,1)
    
    g_Trans = np.multiply(((yHat-Y).dot(W2)),reluprime(z1.T)) #(5,40)
    gradW1fCE = g_Trans.T.dot(X) # (40,785)
    gradb1fCE = np.mean(g_Trans,axis=0).reshape(40,1) # (40,1)
    
    return pack(gradW1fCE,gradb1fCE,gradW2fCE,gradb2fCE)

In [1580]:
gradCE((trainX[idxs, :]),(trainY[idxs]), w)

array([0.        , 0.        , 0.        , ..., 0.08589035, 0.09330248,
       0.09945099])

In [1581]:
# Given training and testing datasets and an initial set of weights/biases b,
# train the NN.
def train (trainX, trainY, testX, testY, w):
    pass

## Main

In [1582]:
trainX, trainY = loadData("train")
testX, testY = loadData("test")

trainX = trainX/255.00

# Initialize weights randomly
W1 = 2*(np.random.random(size=(NUM_HIDDEN, NUM_INPUT))/NUM_INPUT**0.5) - 1./NUM_INPUT**0.5
b1 = 0.01 * np.ones(NUM_HIDDEN)
W2 = 2*(np.random.random(size=(NUM_OUTPUT, NUM_HIDDEN))/NUM_HIDDEN**0.5) - 1./NUM_HIDDEN**0.5
b2 = 0.01 * np.ones(NUM_OUTPUT)

In [1583]:
# Concatenate all the weights and biases into one vector; this is necessary for check_grad
w = pack(W1, b1, W2, b2)
w.shape

(31810,)

In [1584]:
idxs = np.random.permutation(trainX.shape[0])[0:NUM_CHECK]
fCE_ = fCE((trainX[idxs, :]),(trainY[idxs]), w)
gradCE_ = gradCE((trainX[idxs, :]),(trainY[idxs]), w)

In [1585]:
fCE(trainX[idxs,:],trainY[idxs],w)

2.3149076274637608

In [1586]:
gradCE(trainX[idxs,:],trainY[idxs],w)[:100].shape

(100,)

In [1589]:
NUM_CHECK = 2
idxs = np.random.permutation(trainX.shape[0])[0:NUM_CHECK]

scipy.optimize.check_grad(
lambda w_:fCE(trainX[idxs,:],trainY[idxs],w_,),
lambda w_:gradCE(trainX[idxs,:],trainY[idxs],w_),
w)

7.03391742811975