In [8]:
# Data
import numpy as np
from tensorflow.examples.tutorials.mnist import input_data
import impl.utils as utils

# Dataset preparation and pre-processing
mnist = input_data.read_data_sets('data/MNIST_data/', one_hot=False)
X_train, y_train = mnist.train.images, mnist.train.labels
X_val, y_val = mnist.validation.images, mnist.validation.labels
X_test, y_test = mnist.test.images, mnist.test.labels
# y_test.shape, y_val.shape, y_train.shape
M, D, C = X_train.shape[0], X_train.shape[1], y_train.max() + 1
# M, D, C
X_train, X_val, X_test = utils.prepro(X_train, X_val, X_test)
# X_train.shape, X_val.shape, X_test.shape
# if net_type == 'cnn':
img_shape = (1, 28, 28)
X_train = X_train.reshape(-1, *img_shape)
X_val = X_val.reshape(-1, *img_shape)
X_test = X_test.reshape(-1, *img_shape)
X_train.shape, X_val.shape, X_test.shape

Extracting data/MNIST_data/train-images-idx3-ubyte.gz
Extracting data/MNIST_data/train-labels-idx1-ubyte.gz
Extracting data/MNIST_data/t10k-images-idx3-ubyte.gz
Extracting data/MNIST_data/t10k-labels-idx1-ubyte.gz


((55000, 1, 28, 28), (5000, 1, 28, 28), (10000, 1, 28, 28))

In [3]:
# Model
# import numpy as np
import impl.loss as loss_fun
import impl.layer as l
import impl.NN as nn

class DCNN(nn.NN):

    def __init__(self, D, C, H, lam=1e-3, p_dropout=.8, loss='cross_ent', nonlin='relu'):
        self.lam = lam
        self.p_dropout = p_dropout # prob, keep_prob, or prob_dropout
        self.loss = loss # minimum cross_entropy
        self.mode = 'classification'
        super().__init__(D, C, H, lam, p_dropout, loss, nonlin)
        
    def _init_model(self, D, C, H):
        self.model = dict(
            W1=np.random.randn(H, 1, 3, 3) / np.sqrt(H / 2.),
            b1=np.zeros((H, 1)),

            W2=np.random.randn(H, H, 3, 3) / np.sqrt(H / 2.),
            b2=np.zeros((H, 1)),
            gamma2=np.ones((1, H*D)),
            beta2=np.zeros((1, H*D)),

            W22=np.random.randn(H, H, 3, 3) / np.sqrt(H / 2.),
            b22=np.zeros((H, 1)),
            gamma22=np.ones((1, H*D)),
            beta22=np.zeros((1, H*D)),

            W3=np.random.randn(H*D, C) / np.sqrt(H*D / 2.),
            b3=np.zeros((1, C))
        )
        self.bn_caches = dict(            
            bn2_mean=np.zeros((1, H*D)),
            bn2_var=np.zeros((1, H*D)),

            bn22_mean=np.zeros((1, H*D)),
            bn22_var=np.zeros((1, H*D))
        )
        
    def forward(self, X, train=False):

        # 1st layer: Input to Conv
        h1, h1_cache = l.conv_forward(X=X, W=self.model['W1'], b=self.model['b1']) 
        h1, nl1_cache = l.relu_forward(h1)

        # midst layer: Convnet 1
        h2, h2_cache = l.conv_forward(X=h1, W=self.model['W2'], b=self.model['b2'])
        h2 = h2.reshape([nl1_cache.shape[0], -1])
        h2, bn2_cache, self.bn_caches['bn2_mean'], self.bn_caches['bn2_var'] = l.bn_forward(h2, 
                                                self.model['gamma2'], self.model['beta2'], 
                                                (self.bn_caches['bn2_mean'], self.bn_caches['bn2_var']), 
                                                train=train)
        h2, nl2_cache = l.relu_forward(h2)
        do2_cache = None # ERROR: referenced before assigned!
        if train: h2, do2_cache = l.dropout_forward(X=h2, p_dropout=p_dropout)
        
        # midst layer: Convnet 2
        h2 = h2.reshape(nl1_cache.shape)
        h2, h22_cache = l.conv_forward(X=h2, W=self.model['W22'], b=self.model['b22'])
        h2 = h2.reshape([nl1_cache.shape[0], -1])
        h2, bn22_cache, self.bn_caches['bn22_mean'], self.bn_caches['bn22_var'] = l.bn_forward(h2, 
                                                self.model['gamma22'], self.model['beta22'], 
                                                (self.bn_caches['bn22_mean'], self.bn_caches['bn22_var']), 
                                                train=train)
        h2, nl22_cache = l.relu_forward(h2)
        do22_cache = None # ERROR: referenced before assigned!
        if train: h2, do22_cache = l.dropout_forward(X=h2, p_dropout=p_dropout)

        # last layer : FC to Output
        h3, h3_cache = l.fc_forward(X=h2, W=self.model['W3'], b=self.model['b3'])

        cache = h1_cache, nl1_cache, h2_cache, bn2_cache, nl2_cache, do2_cache, h22_cache, bn22_cache, nl22_cache, do22_cache, h3_cache
        return h3, cache
    
    def loss_function(self, y, y_train):
        loss = self.loss_funs[self.loss](self.model, y, y_train, self.lam)
        dy = self.dloss_funs[self.loss](y, y_train)
        return loss, dy
    
    def backward(self, dy, cache):

        h1_cache, nl1_cache, h2_cache, bn2_cache, nl2_cache, do2_cache, h22_cache, bn22_cache, nl22_cache, do22_cache, h3_cache = cache

        # last layer
        dh2, dw3, db3 = l.fc_backward(dout=dy, cache=h3_cache)

        # midst layer 2
        dh2 = l.dropout_backward(dout=dh2, cache=do22_cache)
        dh2 = l.relu_backward(dout=dh2, cache=nl22_cache)
        dh2, dgamma22, dbeta22 = l.bn_backward(dout=dh2, cache=bn22_cache)
        dh2 = dh2.reshape(nl1_cache.shape)
        dh2, dw22, db22 = l.conv_backward(dout=dh2, cache=h22_cache)
        dh2 = dh2.reshape([nl1_cache.shape[0], -1])

        # midst layer 1
        dh2 = l.dropout_backward(dout=dh2, cache=do2_cache)
        dh2 = l.relu_backward(dout=dh2, cache=nl2_cache)
        dh2, dgamma2, dbeta2 = l.bn_backward(dout=dh2, cache=bn2_cache)
        dh2 = dh2.reshape(nl1_cache.shape)
        dh1, dw2, db2 = l.conv_backward(dout=dh2, cache=h2_cache)

        # 1st layer
        dh1 = l.relu_backward(dout=dh1, cache=nl1_cache)
        dX, dw1, db1 = l.conv_backward(dout=dh1, cache=h1_cache)

        # grad for GD
        grad = dict(
            W1=dw1, 
            b1=db1,

            W2=dw2, 
            b2=db2,
            gamma2=dgamma2,
            beta2=dbeta2,
            
            W22=dw22, 
            b22=db22,
            gamma22=dgamma22,
            beta22=dbeta22,

            W3=dw3, 
            b3=db3
            )
        
        return dX, grad
    
    def test(self, X):
        y_logit, cache = self.forward(X, train=False)
        y_prob = util.softmax(y_logit)
        if self.mode == 'classification':
            return np.argmax(y_prob, axis=1)
        else: # self.mode == 'regression'
            return np.round(y_logit)

In [9]:
# SGD
# import numpy as np
# import impl.utils as util
import impl.constant as c
import copy
from sklearn.utils import shuffle as skshuffle


def get_minibatch(X, y, minibatch_size, shuffle=True):
    minibatches = []

    if shuffle:
        X, y = skshuffle(X, y)

    for i in range(0, X.shape[0], minibatch_size):
        X_mini = X[i:i + minibatch_size]
        y_mini = y[i:i + minibatch_size]

        minibatches.append((X_mini, y_mini))

    return minibatches

def adam(nn, X_train, y_train, val_set=None, alpha=0.001, mb_size=256, n_iter=2000, print_after=100):
    M = {k: np.zeros_like(v) for k, v in nn.model.items()}
    R = {k: np.zeros_like(v) for k, v in nn.model.items()}
    beta1 = .9
    beta2 = .999

    minibatches = get_minibatch(X_train, y_train, mb_size)

    if val_set:
        X_val, y_val = val_set

    for iter in range(1, n_iter + 1):
        t = iter
        idx = np.random.randint(0, len(minibatches))
        X_mini, y_mini = minibatches[idx]

        #         grad, loss = nn.train_step(X_mini, y_mini)
        #         def train_step(self, X_train, y_train):
        #         """
        #         Single training step over minibatch: forward, loss, backprop
        #         """
        y, cache = nn.forward(X_mini, train=True)
        loss, dy = nn.loss_function(y, y_mini)
        dX, grad = nn.backward(dy, cache)
        #         return grad, loss


        if iter % print_after == 0:
            if val_set:
                val_acc = util.accuracy(y_val, nn.test(X_val))
                print('Iter-{} loss: {:.4f} validation: {:4f}'.format(iter, loss, val_acc))
            else:
                print('Iter-{} loss: {:.4f}'.format(iter, loss))

        for k in grad:
            M[k] = util.exp_running_avg(M[k], grad[k], beta1)
            R[k] = util.exp_running_avg(R[k], grad[k]**2, beta2)

            m_k_hat = M[k] / (1. - beta1**(t))
            r_k_hat = R[k] / (1. - beta2**(t))

            nn.model[k] -= alpha * m_k_hat / (np.sqrt(r_k_hat) + c.eps)

    return nn


In [6]:
# Hyper-parameters
n_iter = 100 # number of epochs
alpha = 1e-3 # learning_rate
mb_size = 64 # width, timestep for sequential data or minibatch size
num_layers = 1 # depth 
n_experiment = 1
reg = 1e-5 # regularization
print_after = 1 # print loss for train, valid, and test
p_dropout = 0.5

In [None]:
# Train, valid, and test
accs = np.zeros(n_experiment)
for k in range(n_experiment):
    print('Experiment-{}'.format(k + 1))

    net = DCNN(C=C, D=D, H=8)

    net = adam(nn=net, X_train=X_train, y_train=y_train, val_set=(X_val, y_val), mb_size=mb_size, alpha=alpha, 
               n_iter=n_iter, print_after=print_after)

    y_pred = net.predict(X_test)
    accs[k] = np.mean(y_pred == y_test)

print()
print('Test Mean accuracy: {:.4f}, std: {:.4f}'.format(accs.mean(), accs.std()))

Experiment-1
Iter-1 loss: 3.1581 validation: 0.112400
Iter-2 loss: 2.9901 validation: 0.097600
Iter-3 loss: 2.7134 validation: 0.097600
Iter-4 loss: 2.3359 validation: 0.110000
Iter-5 loss: 2.3329 validation: 0.110000
Iter-6 loss: 2.0718 validation: 0.110000
Iter-7 loss: 2.0969 validation: 0.110000
Iter-8 loss: 2.0108 validation: 0.110000
Iter-9 loss: 2.0562 validation: 0.110000
Iter-10 loss: 2.0735 validation: 0.110000
Iter-11 loss: 1.9014 validation: 0.110000
Iter-12 loss: 1.7811 validation: 0.110000
Iter-13 loss: 1.6288 validation: 0.110200
Iter-14 loss: 1.5248 validation: 0.110000
Iter-15 loss: 1.4136 validation: 0.110000
Iter-16 loss: 1.6207 validation: 0.110000
Iter-17 loss: 1.7156 validation: 0.110000
Iter-18 loss: 1.5306 validation: 0.110000
Iter-19 loss: 1.5732 validation: 0.110000
Iter-20 loss: 1.4069 validation: 0.110000
Iter-21 loss: 1.1058 validation: 0.110000
Iter-22 loss: 1.1227 validation: 0.110000
Iter-23 loss: 1.1836 validation: 0.110000
Iter-24 loss: 1.6123 validatio