In [1]:
import numpy as np
# import impl.neuralnet as nn
# import impl.CNN as cnn
# import impl.CNN2 as cnn2
from impl.solver import *
import sys

In [2]:
# --- Run settings (everything a reader might tune lives in this cell) ---

# Training schedule
n_iter = 1000       # solver iterations (minibatch steps; logged as "Iter-N"), not epochs
print_after = 100   # report loss / validation accuracy every this many iterations
mb_size = 64        # minibatch size usually compatible to the Cache/RAM size
n_experiment = 1    # number of independent training runs to average over

# Optimiser
solver = 'sgd'      # key into the `solvers` lookup table
alpha = 1e-3        # step size handed to the solver as `alpha`

# Model
loss = 'cross_ent'
nonlin = 'relu'
reg = 1e-5          # regularisation strength
p_dropout = 0.8     # dropout/keep_prob

In [3]:
# MNIST is read through TensorFlow's tutorial helper. With one_hot=False the
# labels come back as 1-D arrays of integer class ids.
# NOTE(review): tensorflow.examples.tutorials appears to ship with TF 1.x only —
# confirm the TensorFlow version this notebook is pinned to.
from tensorflow.examples.tutorials.mnist import input_data

mnist = input_data.read_data_sets('data/MNIST_data/', one_hot=False)
(X_train, y_train), (X_val, y_val), (X_test, y_test) = [
    (subset.images, subset.labels)
    for subset in (mnist.train, mnist.validation, mnist.test)
]
y_test.shape, y_val.shape, y_train.shape

Extracting data/MNIST_data/train-images-idx3-ubyte.gz
Extracting data/MNIST_data/train-labels-idx1-ubyte.gz
Extracting data/MNIST_data/t10k-images-idx3-ubyte.gz
Extracting data/MNIST_data/t10k-labels-idx1-ubyte.gz


((10000,), (5000,), (55000,))

In [4]:
# Dataset dimensions: M samples, D features per sample, C classes.
# C is taken as (largest label id + 1), which assumes the ids start at 0.
M, D = X_train.shape[:2]
C = y_train.max() + 1
M, D, C

(55000, 784, 10)

In [5]:
# Run all three splits through the project's preprocessing helper in one call.
# NOTE(review): what prepro does is not visible from this notebook (presumably
# a normalisation based on the training split) — confirm in impl.utils.
# Caution: the inputs are rebound to the outputs, so re-running this cell feeds
# already-preprocessed arrays back into prepro.
import impl.utils as utils
X_train, X_val, X_test = utils.prepro(X_train, X_val, X_test)
X_train.shape, X_val.shape, X_test.shape

((55000, 784), (5000, 784), (10000, 784))

In [6]:
# The convolutional branch needs image tensors, so every flat 784-value row is
# folded back into a 1 x 28 x 28 array (presumably channels, height, width).
img_shape = (1, 28, 28)
X_train, X_val, X_test = [
    flat.reshape((-1,) + img_shape) for flat in (X_train, X_val, X_test)
]
X_train.shape, X_val.shape, X_test.shape

((55000, 1, 28, 28), (5000, 1, 28, 28), (10000, 1, 28, 28))

In [7]:
# Lookup table from optimiser name to the optimiser function pulled in by
# `from impl.solver import *`.
solvers = {
    'sgd': sgd,
    'momentum': momentum,
    'nesterov': nesterov,
    'adagrad': adagrad,
    'rmsprop': rmsprop,
    'adam': adam,
}
solvers

{'adagrad': <function impl.solver.adagrad>,
 'adam': <function impl.solver.adam>,
 'momentum': <function impl.solver.momentum>,
 'nesterov': <function impl.solver.nesterov>,
 'rmsprop': <function impl.solver.rmsprop>,
 'sgd': <function impl.solver.sgd>}

In [8]:
# Resolve the configured optimiser by name and reserve one accuracy slot per
# experiment run.
solver_fun = solvers[solver]
accs = np.zeros(n_experiment)
print('Experimenting on {}'.format(solver))

Experimenting on sgd


In [13]:
import numpy as np
import impl.loss as loss_fun
import impl.layer as l
# import impl.regularization as reg
# import impl.utils as util
import impl.NN as nn

class PNN(nn.NN):
    """Two-branch ("pyramid") classifier whose branch logits are summed.

    Branch 1 is a single fully connected layer on the flattened input.
    Branch 2 is a 3x3 convolution (stride 1, padding 1) followed by ReLU and a
    fully connected layer. Both branches emit C logits, which are added
    together; the softmax is left to the loss function.

    NOTE(review): the base class ``nn.NN`` is not visible here. It is assumed to
    call ``_init_model(D, C, H)`` from its constructor and to provide
    ``self.loss`` and ``self.dloss_funs`` — confirm against impl.NN.
    """

    def __init__(self, D, C, H, lam=1e-3, p_dropout=.8, loss='cross_ent', nonlin='relu'):
        """Forward all settings unchanged to the base-class constructor.

        Args:
            D: number of input features per sample (flattened image size).
            C: number of output classes.
            H: number of 3x3 filters in the convolutional branch.
            lam, p_dropout, loss, nonlin: passed through to ``nn.NN``; this
                class itself only reads ``self.loss`` (in ``backward``).
        """
        super().__init__(D, C, H, lam, p_dropout, loss, nonlin)

    def forward(self, X, train=False):
        """Compute the summed logits of both branches.

        Args:
            X: input batch; the conv branch is initialised for one input
                channel, and the notebook feeds arrays of shape (N, 1, 28, 28).
            train: accepted for interface compatibility; not used, since this
                network applies neither dropout nor batch normalisation.

        Returns:
            A ``(logits, cache)`` pair. ``logits`` has one row per sample and C
            columns; ``cache`` holds what ``backward`` needs.
        """
        n_samples = X.shape[0]

        # Branch 1: flatten -> FC.
        x_flat = X.reshape([n_samples, -1])
        y1_logit, y1_cache = l.fc_forward(X=x_flat, W=self.model['W1'], b=self.model['b1'])

        # Branch 2: conv -> ReLU -> flatten -> FC.
        h_logit, conv_cache = l.conv_forward(X=X, W=self.model['W21'], b=self.model['b21'],
                                             stride=1, padding=1)
        h_act, relu_cache = l.relu_forward(h_logit)
        h_flat = h_act.reshape([n_samples, -1])
        y2_logit, y2_cache = l.fc_forward(X=h_flat, W=self.model['W22'], b=self.model['b22'])

        # Sum out-of-place so the first branch's output array is never mutated
        # after the layer call that produced it.
        logits = y1_logit + y2_logit

        cache = X, y1_cache, conv_cache, relu_cache, y2_cache
        return logits, cache

    def backward(self, y, y_train, cache):
        """Back-propagate the loss gradient through both branches.

        Args:
            y: logits returned by ``forward``.
            y_train: target labels for the same batch.
            cache: the cache returned by ``forward``.

        Returns:
            dict with gradients for ``W1``, ``b1``, ``W21``, ``b21``, ``W22``
            and ``b22`` (same keys as ``self.model``).

        NOTE(review): no regularisation term is added to the weight gradients
        here; whether ``lam`` affects training therefore depends on the base
        class / loss functions — confirm.
        """
        _, y1_cache, conv_cache, relu_cache, y2_cache = cache

        # Gradient of the loss w.r.t. the summed logits (softmax is folded into
        # the loss derivative).
        dlogits = self.dloss_funs[self.loss](y, y_train)

        # The branch outputs were added, so each branch receives dlogits as is.
        # Branch 1: FC. The gradient w.r.t. the input pixels is not needed.
        _, dW1, db1 = l.fc_backward(dout=dlogits, cache=y1_cache)

        # Branch 2: FC -> un-flatten -> ReLU -> conv.
        dh_flat, dW22, db22 = l.fc_backward(dout=dlogits, cache=y2_cache)
        # Relies on the ReLU cache exposing the pre-flatten activation shape via
        # ``.shape``, exactly as the original code did.
        dh_act = dh_flat.reshape(relu_cache.shape)
        dh_logit = l.relu_backward(dout=dh_act, cache=relu_cache)
        _, dW21, db21 = l.conv_backward(dout=dh_logit, cache=conv_cache)

        return dict(W1=dW1, b1=db1,                              # branch 1: FC
                    W21=dW21, b21=db21, W22=dW22, b22=db22)      # branch 2: conv + FC

    def _init_model(self, D, C, H):
        """Create ``self.model`` with He-style scaled random weights, zero biases.

        Every weight tensor is divided by ``sqrt(fan_in / 2)``. For the conv
        kernel the fan-in is ``in_channels * 3 * 3``; the earlier code divided
        by ``sqrt(H / 2)`` (the number of output filters), which was
        inconsistent with the two FC layers and mis-scaled the kernel whenever
        H differed much from 9.

        ``W22`` has ``H * D`` rows, which presumes the conv output keeps the
        input's spatial size (true for 3x3, stride 1, padding 1) and that D
        equals the number of pixels per image.
        """
        in_channels, kernel = 1, 3
        conv_fan_in = in_channels * kernel * kernel
        self.model = dict(
            # Branch 1: FC
            W1=np.random.randn(D, C) / np.sqrt(D / 2.),
            b1=np.zeros(shape=(1, C)),
            # Branch 2: conv
            W21=np.random.randn(H, in_channels, kernel, kernel) / np.sqrt(conv_fan_in / 2.),
            b21=np.zeros((H, 1)),
            # Branch 2: FC on the flattened conv activations
            W22=np.random.randn(H * D, C) / np.sqrt(H * D / 2.),
            b22=np.zeros((1, C)),
        )

In [14]:
# Train a fresh network per run and record its accuracy on the test split.
for k in range(n_experiment):
    print('Experiment-{}'.format(k + 1))

    # Forward the hyperparameters from the settings cell. They used to be left
    # out, so PNN silently fell back to its own defaults (notably lam=1e-3
    # instead of the configured reg=1e-5).
    # H is the number of 3x3 filters in the convolutional branch.
    net = PNN(D=D, C=C, H=10, lam=reg, p_dropout=p_dropout, loss=loss, nonlin=nonlin)

    # The solver returns the trained network.
    net = solver_fun(nn=net, X_train=X_train, y_train=y_train, val_set=(X_val, y_val),
                     mb_size=mb_size, alpha=alpha, n_iter=n_iter, print_after=print_after)

    y_pred = net.predict(X_test)
    accs[k] = np.mean(y_pred == y_test)

print()
print('Test Mean accuracy: {:.4f}, std: {:.4f}'.format(accs.mean(), accs.std()))

Experiment-1
Iter-100 loss: 1.6086 validation: 0.622400
Iter-200 loss: 1.1318 validation: 0.763800
Iter-300 loss: 0.8523 validation: 0.805800
Iter-400 loss: 0.8192 validation: 0.829200
Iter-500 loss: 0.6903 validation: 0.843800
Iter-600 loss: 0.6378 validation: 0.854200
Iter-700 loss: 0.5634 validation: 0.861800
Iter-800 loss: 0.5594 validation: 0.866200
Iter-900 loss: 0.5895 validation: 0.868400
Iter-1000 loss: 0.5472 validation: 0.873000

Test Mean accuracy: 0.8733, std: 0.0000
