In [1]:
import numpy as np
from impl.solver import *
import sys

In [2]:
n_iter = 1000 # number of epochs
alpha = 1e-3
mb_size = 64 # minibatch size usually compatible to the Cache/RAM size
n_experiment = 1
reg = 1e-5
print_after = 100
p_dropout = 0.8
loss = 'cross_ent'
nonlin = 'relu'
solver = 'sgd'

In [3]:
# import hipsternet.input_data as input_data  # NOT used for MNIST
from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets('data/MNIST_data/', one_hot=False)
X_train, y_train = mnist.train.images, mnist.train.labels
X_val, y_val = mnist.validation.images, mnist.validation.labels
X_test, y_test = mnist.test.images, mnist.test.labels
y_test.shape, y_val.shape, y_train.shape

Extracting data/MNIST_data/train-images-idx3-ubyte.gz
Extracting data/MNIST_data/train-labels-idx1-ubyte.gz
Extracting data/MNIST_data/t10k-images-idx3-ubyte.gz
Extracting data/MNIST_data/t10k-labels-idx1-ubyte.gz


((10000,), (5000,), (55000,))

In [4]:
M, D, C = X_train.shape[0], X_train.shape[1], y_train.max() + 1
M, D, C

(55000, 784, 10)

In [5]:
import impl.utils as utils
X_train, X_val, X_test = utils.prepro(X_train, X_val, X_test)
X_train.shape, X_val.shape, X_test.shape

((55000, 784), (5000, 784), (10000, 784))

In [6]:
# if net_type == 'cnn':
img_shape = (1, 28, 28)
X_train = X_train.reshape(-1, *img_shape)
X_val = X_val.reshape(-1, *img_shape)
X_test = X_test.reshape(-1, *img_shape)
X_train.shape, X_val.shape, X_test.shape

((55000, 1, 28, 28), (5000, 1, 28, 28), (10000, 1, 28, 28))

In [7]:
solvers = dict(
    sgd=sgd,
    momentum=momentum,
    nesterov=nesterov,
    adagrad=adagrad,
    rmsprop=rmsprop,
    adam=adam
)
solvers

{'adagrad': <function impl.solver.adagrad>,
 'adam': <function impl.solver.adam>,
 'momentum': <function impl.solver.momentum>,
 'nesterov': <function impl.solver.nesterov>,
 'rmsprop': <function impl.solver.rmsprop>,
 'sgd': <function impl.solver.sgd>}

In [26]:
solver_fun = solvers[solver] # solver functions
accs = np.zeros(n_experiment)
solver_fun, accs
# print()
print('Experimenting on {}'.format(solver))
# print()

Experimenting on sgd


In [32]:
import numpy as np
import impl.loss as loss_fun
import impl.layer as l
import impl.NN as nn

class SPNN(nn.NN):

    def __init__(self, D, C, H, lam=1e-3, p_dropout=.8, loss='cross_ent', nonlin='relu'):
        super().__init__(D, C, H, lam, p_dropout, loss, nonlin)
    
    def pyrmidnet_fwd(X, w1, b1, w21, b21, w22, b22):

        #         #         # 1st layer -  linear term --> wx=x
        #         #         h1_txhxwxd_logit, h1_txhxwxd_logit_cache = l.conv_forward(b=b1, #b_1xh
        #         #                                                     padding=1, #padding=true means image size stays the same: 'SAME'
        #         #                                                     stride=1, # stride one means include all and no jump
        #         #                                                     W=w1, # kernel size cx3x3xd for all layers
        #         #                                                     X=x_txcxhxw) # input image in SPNN for spatial PNN        # 2nd layer - adding to the linear layer output
        dx_txcxhxw__, dw1, db1 = l.conv_backward(dout=dh1_txhxwxd_logit, cache=h1_txhxwxd_logit_cache)


        #         #         # 1nd layer - adding to spnn output
        #         #         y_txhxwxd_logit = h1_txhxwxd_logit
        dh1_txhxwxd_logit = dy_txhxwxd_logit

        # 2nd layer -  non-linear term --> wf(wx)=f(x)
        # 2nd layer - 1st convolution
        h21_txhxwxd_logit, h21_txhxwxd_logit_cache = l.conv_forward(b=b21, #b_1xh
                                                    padding=1, #padding=true means image size stays the same: 'SAME'
                                                    stride=1, # stride one means include all and no jump
                                                    W=w21, # kernel size 3x3 for all layers
                                                    X=x_txcxhxw) # input image in SPNN for spatial PNN
        #         dx_txcxhxw__, dw21, db21 = l.conv_backward(dout=dh21_txhxwxd_logit, cache=h21_txhxwxd_logit_cache)
        
        # 2nd layer - non-linearity
        h21_txhxwxd_act, h21_txhxwxd_act_cache = l.relu_forward(h21_txhxwxd_logit)
        #         dh21_txhxwxd_logit = l.relu_backward(cache=h21_txhxwxd_act_cache, dout=dh21_txhxwxd_act)

        # 2nd layer - 2nd convolution
        h22_txhxwxd_logit, h22_txhxwxd_logit_cache = l.conv_forward(b=b22, #b_1xh
                                                    padding=1, #padding=true means image size stays the same: 'SAME'
                                                    stride=1, # stride one means include all and no jump
                                                    W=w22, # kernel size 3x3 for all layers
                                                    X=h21_txhxwxd_act) # input image in SPNN for spatial PNN
        #         dh21_txhxwxd_act, dw22, db22 = l.conv_backward(dout=dh22_txhxwxd_logit, cache=h22_txhxwxd_logit_cache) # X is visible/input layer, dX? No use??

        # 2nd layer - adding to spnn output
        y_txhxwxd_logit += h22_txhxwxd_logit
        #         dh22_txhxwxd_logit = dy_txhxwxd_logit
        
        # output layer - non-linearity
        y_txhxwxd_act, y_txhxwxd_act_cache = l.relu_forward(y_txhxwxd_logit)
        #         dy_txhxwxd_logit = l.relu_backward(cache=y_txhxwxd_act_cache, dout=dy_txhxwxd_act)

        # output cache
        cache = h1_txhxwxd_logit_cache, h21_txhxwxd_act_cache, h22_txhxwxd_logit_cache, y_txhxwxd_act_cache
        #         h1_txhxwxd_logit_cache, h21_txhxwxd_act_cache, h22_txhxwxd_logit_cache, y_txhxwxd_act_cache = cache
        
        return y_txhxwxd_act, cache

    def pyrmidnet_bwd(dy_txhxwxd_act, cache):

        #         # output cache
        #         cache = h1_txhxwxd_logit_cache, h21_txhxwxd_act_cache, h22_txhxwxd_logit_cache, y_txhxwxd_act_cache
        h1_txhxwxd_logit_cache, h21_txhxwxd_act_cache, h22_txhxwxd_logit_cache, y_txhxwxd_act_cache = cache

        #         # output layer - non-linearity
        #         y_txhxwxd_act, y_txhxwxd_act_cache = l.relu_forward(y_txhxwxd_logit)
        dy_txhxwxd_logit = l.relu_backward(cache=y_txhxwxd_act_cache, dout=dy_txhxwxd_act)

        #         # 2nd layer - adding to spnn output
        #         y_txhxwxd_logit += h22_txhxwxd_logit
        dh22_txhxwxd_logit = dy_txhxwxd_logit
        
        #         # 2nd layer - 2nd convolution
        #         h22_txhxwxd_logit, h22_txhxwxd_logit_cache = l.conv_forward(b=b22, #b_1xh
        #                                                     padding=1, #padding=true means image size stays the same: 'SAME'
        #                                                     stride=1, # stride one means include all and no jump
        #                                                     W=w22, # kernel size 3x3 for all layers
        #                                                     X=h21_txhxwxd_act) # input image in SPNN for spatial PNN
        dh21_txhxwxd_act, dw22, db22 = l.conv_backward(dout=dh22_txhxwxd_logit, cache=h22_txhxwxd_logit_cache) # X is visible/input layer, dX? No use??

        #         # 2nd layer - non-linearity
        #         h21_txhxwxd_act, h21_txhxwxd_act_cache = l.relu_forward(h21_txhxwxd_logit)
        dh21_txhxwxd_logit = l.relu_backward(cache=h21_txhxwxd_act_cache, dout=dh21_txhxwxd_act)

        #         # 2nd layer -  non-linear term --> wf(wx)=f(x)
        #         # 2nd layer - 1st convolution
        #         h21_txhxwxd_logit, h21_txhxwxd_logit_cache = l.conv_forward(b=b21, #b_1xh
        #                                                     padding=1, #padding=true means image size stays the same: 'SAME'
        #                                                     stride=1, # stride one means include all and no jump
        #                                                     W=w21, # kernel size 3x3 for all layers
        #                                                     X=x_txcxhxw) # input image in SPNN for spatial PNN
        dx_txcxhxw__, dw21, db21 = l.conv_backward(dout=dh21_txhxwxd_logit, cache=h21_txhxwxd_logit_cache)
        
        #         # 1nd layer - adding to spnn output
        #         y_txhxwxd_logit = h1_txhxwxd_logit
        dh1_txhxwxd_logit = dy_txhxwxd_logit

        #         # 1st layer -  linear term --> wx=x
        #         h1_txhxwxd_logit, h1_txhxwxd_logit_cache = l.conv_forward(b=b1, #b_1xh
        #                                                     padding=1, #padding=true means image size stays the same: 'SAME'
        #                                                     stride=1, # stride one means include all and no jump
        #                                                     W=w1, # kernel size cx3x3xd for all layers
        #                                                     X=x_txcxhxw) # input image in SPNN for spatial PNN        # 2nd layer - adding to the linear layer output
        dx_txcxhxw__, dw1, db1 = l.conv_backward(dout=dh1_txhxwxd_logit, cache=h1_txhxwxd_logit_cache)

        #         def pyrmidnet_fwd(X, w1, b1, w21, b21, w22, b22):
        return dX, dw1, db1, dw21, db21, dw22, db22
    
    def forward(self, X, train=False):

        # 1st layer -  convolution to change the size from x_txcxhxw to h_txhxwxd, c==channels of image, d=depth/num_units
        h1_txhxwxd_logit, h1_txhxwxd_logit_cache = l.conv_forward(b=self.model['b1'], #b_1xh
                                                    padding=1, #padding=true means image size stays the same: 'SAME'
                                                    stride=1, # stride one means include all and no jump
                                                    W=self.model['W1'], # kernel size cx3x3xd for all layers
                                                    X=X) # input image in SPNN for spatial PNN        # 2nd layer - adding to the linear layer output
        #         dX, dw1, db1 = l.conv_backward(dout=dh1_txhxwxd_logit, cache=h1_txhxwxd_logit_cache)

        # 1st layer - nonlinearity-relu
        h1_txhxwxd_act, h1_txhxwxd_act_cache = l.relu_forward(h1_txhxwxd_logit)
        #         dh1_txhxwxd_logit = l.relu_backward(cache=h1_txhxwxd_act_cache, dout=dh1_txhxwxd_act)
        
        # last layer : FC layer -  fully connected to the output layer (visible layer)
        # n=hxwxd flattened
        h1_txn_act = h1_txhxwxd_act.reshape([h1_txhxwxd_act_cache.shape[0], -1])
        #         dh1_txhxwxd_act = dh1_txn_act.reshape(h1_txhxwxd_act_cache.shape)

        y_tx10_logit, y_tx10_logit_cache = l.fc_forward(X=h1_txn_act, W=self.model['W2'], b=self.model['b2'])
        #         dh1_txn_act, dw2, db2 = l.fc_backward(dout=dy_tx10_logit, cache=y_tx10_logit_cache)

        # Output
        cache = X, h1_txhxwxd_logit_cache, h1_txhxwxd_act_cache, y_tx10_logit_cache
        #         X, h1_txhxwxd_logit_cache, h1_txhxwxd_act_cache, y_tx10_logit_cache = cache
        
        return y_tx10_logit, cache

    def backward(self, y, y_train, cache):

        #         # Output
        #         cache = X, h1_txhxwxd_logit_cache, h1_txhxwxd_act_cache, y_tx10_logit_cache
        X, h1_txhxwxd_logit_cache, h1_txhxwxd_act_cache, y_tx10_logit_cache = cache

        # Output layer backward
        dy_tx10_logit = self.dloss_funs[self.loss](y, y_train) # y==y_logits

        #         y_tx10_logit, y_tx10_logit_cache = l.fc_forward(X=h1_txn_act, W=self.model['W2'], b=self.model['b2'])
        dh1_txn_act, dw2, db2 = l.fc_backward(dout=dy_tx10_logit, cache=y_tx10_logit_cache)

        #         # last layer : FC layer -  fully connected to the output layer (visible layer)
        #         # n=hxwxd flattened
        #         h1_txn_act = h1_txhxwxd_act.reshape([h1_txhxwxd_act_cache.shape[0], -1])
        dh1_txhxwxd_act = dh1_txn_act.reshape(h1_txhxwxd_act_cache.shape)


        #         # 1st layer - nonlinearity-relu
        #         h1_txhxwxd_act, h1_txhxwxd_act_cache = l.relu_forward(h1_txhxwxd_logit)
        dh1_txhxwxd_logit = l.relu_backward(cache=h1_txhxwxd_act_cache, dout=dh1_txhxwxd_act)

        #         # 1st layer -  convolution to change the size from x_txcxhxw to h_txhxwxd, c==channels of image, d=depth/num_units
        #         h1_txhxwxd_logit, h1_txhxwxd_logit_cache = l.conv_forward(b=self.model['b1'], #b_1xh
        #                                                     padding=1, #padding=true means image size stays the same: 'SAME'
        #                                                     stride=1, # stride one means include all and no jump
        #                                                     W=self.model['W1'], # kernel size cx3x3xd for all layers
        #                                                     X=X) # input image in SPNN for spatial PNN        # 2nd layer - adding to the linear layer output
        dX, dw1, db1 = l.conv_backward(dout=dh1_txhxwxd_logit, cache=h1_txhxwxd_logit_cache)


        # grad for GD
        grad = dict(
            W1=dw1, 
            b1=db1, # 1st layer in SPNN: Conv layer from the input
            #             W1=np.random.randn(H, 1, 3, 3) / np.sqrt(H / 2.),
            #             b1=np.zeros((H, 1)), # 1st layer Conv the input
            W2=dw2, 
            b2=db2 # last layer in SPNN: FC layer to the output
            #             W2=np.random.randn(H * D, C) / np.sqrt(H * D / 2.), 
            #             b2=np.zeros((1, C)) # last layer FC to the output
            )
        
        return grad

    def _init_model(self, D, C, H):
        self.model = dict(
            #             W1=dw1, 
            #             b1=db1, # 1st layer in SPNN: Conv layer from the input
            W1=np.random.randn(H, 1, 3, 3) / np.sqrt(H / 2.),
            b1=np.zeros((H, 1)), # 1st layer Conv the input
            #             W2=dw2, 
            #             b2=db2 # last layer in SPNN: FC layer to the output
            W2=np.random.randn(H * D, C) / np.sqrt(H * D / 2.), 
            b2=np.zeros((1, C)) # last layer FC to the output
        )

In [33]:
for k in range(n_experiment):
    print('Experiment-{}'.format(k + 1))

#     net = PNN(C=C, D=10, H=128) # Mine
    net = SPNN(C=C, D=D, H=10) #, lam=, loss=, nonlin=, p_dropout=, self=

    net = solver_fun(nn=net, X_train=X_train, y_train=y_train, val_set=(X_val, y_val), 
                     mb_size=mb_size, alpha=alpha, n_iter=n_iter, print_after=print_after)

    y_pred = net.predict(X_test)
    accs[k] = np.mean(y_pred == y_test)

print()
print('Test Mean accuracy: {:.4f}, std: {:.4f}'.format(accs.mean(), accs.std()))

Experiment-1
Iter-100 loss: 1.7957 validation: 0.596600
Iter-200 loss: 1.3315 validation: 0.735000
Iter-300 loss: 1.0097 validation: 0.782800
Iter-400 loss: 0.8834 validation: 0.810600
Iter-500 loss: 0.7991 validation: 0.828000
Iter-600 loss: 0.8964 validation: 0.841200
Iter-700 loss: 0.6837 validation: 0.848800
Iter-800 loss: 0.6596 validation: 0.854600
Iter-900 loss: 0.7226 validation: 0.861400
Iter-1000 loss: 0.7086 validation: 0.867600

Test Mean accuracy: 0.8644, std: 0.0000
