In [16]:
import numpy as np
from impl.solver import *
import sys

In [17]:
n_iter = 1000 # number of epochs
alpha = 1e-3
mb_size = 64 # minibatch size usually compatible to the Cache/RAM size
n_experiment = 1
reg = 1e-5
print_after = 100
p_dropout = 0.8
loss = 'cross_ent'
nonlin = 'relu'
solver = 'sgd'

In [18]:
# import hipsternet.input_data as input_data  # NOT used for MNIST
from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets('data/MNIST_data/', one_hot=False)
X_train, y_train = mnist.train.images, mnist.train.labels
X_val, y_val = mnist.validation.images, mnist.validation.labels
X_test, y_test = mnist.test.images, mnist.test.labels
y_test.shape, y_val.shape, y_train.shape

Extracting data/MNIST_data/train-images-idx3-ubyte.gz
Extracting data/MNIST_data/train-labels-idx1-ubyte.gz
Extracting data/MNIST_data/t10k-images-idx3-ubyte.gz
Extracting data/MNIST_data/t10k-labels-idx1-ubyte.gz


((10000,), (5000,), (55000,))

In [19]:
M, D, C = X_train.shape[0], X_train.shape[1], y_train.max() + 1
M, D, C

(55000, 784, 10)

In [20]:
import impl.utils as utils
X_train, X_val, X_test = utils.prepro(X_train, X_val, X_test)
X_train.shape, X_val.shape, X_test.shape

((55000, 784), (5000, 784), (10000, 784))

In [21]:
# # if net_type == 'cnn':
# img_shape = (1, 28, 28)
# X_train = X_train.reshape(-1, *img_shape)
# X_val = X_val.reshape(-1, *img_shape)
# X_test = X_test.reshape(-1, *img_shape)
# X_train.shape, X_val.shape, X_test.shape

In [22]:
solvers = dict(
    sgd=sgd,
    momentum=momentum,
    nesterov=nesterov,
    adagrad=adagrad,
    rmsprop=rmsprop,
    adam=adam
)
solvers

{'adagrad': <function impl.solver.adagrad>,
 'adam': <function impl.solver.adam>,
 'momentum': <function impl.solver.momentum>,
 'nesterov': <function impl.solver.nesterov>,
 'rmsprop': <function impl.solver.rmsprop>,
 'sgd': <function impl.solver.sgd>}

In [23]:
solver_fun = solvers[solver] # solver functions
accs = np.zeros(n_experiment)
solver_fun, accs
# print()
print('Experimenting on {}'.format(solver))
# print()

Experimenting on sgd


In [None]:
import numpy as np
import impl.loss as loss_fun
import impl.layer as l
import impl.regularization as reg
import impl.utils as util
import impl.NN as nn
import impl.pyramidnet as pyramidnet

class SPNN(nn.NN):

    def __init__(self, D, C, H, lam=1e-3, p_dropout=.8, loss='cross_ent', nonlin='relu'):
        super().__init__(D, C, H, lam, p_dropout, loss, nonlin)

    def forward(self, X, train=False):
        
        # 1st layer: Conv layer from input
        h1, h1_cache = l.fc_forward(X, self.model['W1'], self.model['b1'])
        h1, bn1_cache, self.bn_caches['bn1_mean'], self.bn_caches['bn1_var'] = l.bn_forward(h1, 
                                                        self.model['gamma1'], self.model['beta1'], 
                                                        (self.bn_caches['bn1_mean'], self.bn_caches['bn1_var']), 
                                                        train=train)
        h1, nl_cache1 = self.forward_nonlin(h1)
        u1 = None # referenced before assigned ERROR!
        if train: h1, u1 = l.dropout_forward(h1, self.p_dropout)

        # midst layer: Pyrmidnet depth
        h2, h2_cache = l.fc_forward(h1, self.model['W2'], self.model['b2'])
        h2, bn2_cache, self.bn_caches['bn2_mean'], self.bn_caches['bn2_var'] = l.bn_forward(h2, 
                                                        self.model['gamma2'], self.model['beta2'], 
                                                        (self.bn_caches['bn2_mean'], self.bn_caches['bn2_var']), 
                                                        train=train)
        h2, nl_cache2 = self.forward_nonlin(h2)
        u2 = None # referenced before assigned ERROR!
        if train: h2, u2 = l.dropout_forward(h2, self.p_dropout)

        # last layer: FC to the output layer
        h3, h3_cache = l.fc_forward(h2, self.model['W3'], self.model['b3'])
        cache = (X, h1_cache, h2_cache, h3_cache, nl_cache1, nl_cache2, u1, u2, bn1_cache, bn2_cache)
        return h3, cache

    def backward(self, y_pred, y_train, cache):
        X, h1_cache, h2_cache, score_cache, nl_cache1, nl_cache2, u1, u2, bn1_cache, bn2_cache = cache

        # Output layer
        grad_y = self.dloss_funs[self.loss](y_pred, y_train)

        # Third layer
        dh2, dW3, db3 = l.fc_backward(grad_y, score_cache)
        dW3 += reg.dl2_reg(self.model['W3'], self.lam)
        dh2 = self.backward_nonlin(dh2, nl_cache2)
        dh2 = l.dropout_backward(dh2, u2)
        dh2, dgamma2, dbeta2 = l.bn_backward(dh2, bn2_cache)

        # Second layer
        dh1, dW2, db2 = l.fc_backward(dh2, h2_cache)
        dW2 += reg.dl2_reg(self.model['W2'], self.lam)
        dh1 = self.backward_nonlin(dh1, nl_cache1)
        dh1 = l.dropout_backward(dh1, u1)
        dh1, dgamma1, dbeta1 = l.bn_backward(dh1, bn1_cache)

        # First layer
        _, dW1, db1 = l.fc_backward(dh1, h1_cache)
        dW1 += reg.dl2_reg(self.model['W1'], self.lam)

        grad = dict(
            W1=dW1, W2=dW2, W3=dW3, b1=db1, b2=db2, b3=db3, gamma1=dgamma1,
            gamma2=dgamma2, beta1=dbeta1, beta2=dbeta2
        )

        return grad

    def _init_model(self, D, C, H):
        self.model = dict(
            W1=np.random.randn(D, H) / np.sqrt(D / 2.),
            W2=np.random.randn(H, H) / np.sqrt(H / 2.),
            W3=np.random.randn(H, C) / np.sqrt(H / 2.),
            b1=np.zeros((1, H)),
            b2=np.zeros((1, H)),
            b3=np.zeros((1, C)),
            gamma1=np.ones((1, H)),
            gamma2=np.ones((1, H)),
            beta1=np.zeros((1, H)),
            beta2=np.zeros((1, H))
        )

        self.bn_caches = dict(
            bn1_mean=np.zeros((1, H)),
            bn2_mean=np.zeros((1, H)),
            bn1_var=np.zeros((1, H)),
            bn2_var=np.zeros((1, H))
        )

In [42]:
for k in range(n_experiment):
    print('Experiment-{}'.format(k + 1))

    net = FFNN(C=C, D=D, H=128) #, lam=, loss=, nonlin=, p_dropout=, self=

    net = solver_fun(nn=net, X_train=X_train, y_train=y_train, val_set=(X_val, y_val), 
                     mb_size=mb_size, alpha=alpha, n_iter=n_iter, print_after=print_after)

    y_pred = net.predict(X_test)
    accs[k] = np.mean(y_pred == y_test)

print()
print('Test Mean accuracy: {:.4f}, std: {:.4f}'.format(accs.mean(), accs.std()))

Experiment-1
Iter-100 loss: 2.7898 validation: 0.102600
Iter-200 loss: 2.9163 validation: 0.217600
Iter-300 loss: 2.3936 validation: 0.374800
Iter-400 loss: 2.0737 validation: 0.493200
Iter-500 loss: 1.9784 validation: 0.578200
Iter-600 loss: 1.9278 validation: 0.637400
Iter-700 loss: 1.7345 validation: 0.670400
Iter-800 loss: 1.6652 validation: 0.696800
Iter-900 loss: 1.6225 validation: 0.721600
Iter-1000 loss: 1.5532 validation: 0.741000

Test Mean accuracy: 0.7501, std: 0.0000
