In [6]:
# Data
import numpy as np
from tensorflow.examples.tutorials.mnist import input_data
import impl.utils as utils

# Load MNIST with integer class labels (one_hot=False) and unpack the
# train / validation / test splits into (images, labels) pairs.
mnist = input_data.read_data_sets('data/MNIST_data/', one_hot=False)
(X_train, y_train), (X_val, y_val), (X_test, y_test) = (
    (split.images, split.labels)
    for split in (mnist.train, mnist.validation, mnist.test)
)

# M: number of training samples, D: features per sample,
# C: number of classes (largest label + 1).
M = X_train.shape[0]
D = X_train.shape[1]
C = y_train.max() + 1

# Project-specific pre-processing applied jointly to all three splits.
X_train, X_val, X_test = utils.prepro(X_train, X_val, X_test)

# The convolutional model expects image tensors, so every flat sample is
# reshaped to (channels, height, width) = (1, 28, 28).
img_shape = (1, 28, 28)
X_train, X_val, X_test = (
    split.reshape((-1,) + img_shape)
    for split in (X_train, X_val, X_test)
)
X_train.shape, X_val.shape, X_test.shape

Extracting data/MNIST_data/train-images-idx3-ubyte.gz
Extracting data/MNIST_data/train-labels-idx1-ubyte.gz
Extracting data/MNIST_data/t10k-images-idx3-ubyte.gz
Extracting data/MNIST_data/t10k-labels-idx1-ubyte.gz


((55000, 1, 28, 28), (5000, 1, 28, 28), (10000, 1, 28, 28))

In [18]:
# Model
# import numpy as np
import impl.loss as loss_fun
import impl.layer as l
import impl.NN as nn

class DCNN(nn.NN):
    """Convolutional classifier: conv -> L x (conv, batch-norm, ReLU, dropout) -> FC.

    Parameter layout (`self.model`):
        model[0]        dict with W1, b1            (input convolution)
        model[1][i]     dict with W2, b2, gamma2, beta2 for hidden block i
        model[2]        dict with W3, b3            (fully connected output)

    `self.bn_caches[i]` holds the running batch-norm mean/variance of block i.
    """

    def __init__(self, D, C, H, L, p_dropout, lam=1e-3, loss='cross_ent', nonlin='relu'):
        """Store the hyper-parameters and delegate the rest to `nn.NN`.

        Args:
            D: number of values in one feature map (batch-norm and the FC layer
                are sized H*D, so the convolutions are presumably
                size-preserving -- confirm in impl.layer).
            C: number of output classes.
            H: number of convolution filters per layer.
            L: number of hidden conv/batch-norm/ReLU/dropout blocks (>= 1).
            p_dropout: dropout parameter forwarded to `l.dropout_forward`.
            lam, loss, nonlin: forwarded unchanged to `nn.NN.__init__`.

        Raises:
            ValueError: if `L` is smaller than 1.
        """
        if L < 1:
            raise ValueError('L must be at least 1, got {}'.format(L))
        self.p_dropout = p_dropout # prob, keep_prob, or prob_dropout
        self.loss = loss # key into self.loss_funs / self.dloss_funs
        self.mode = 'classification'
        # Set before super().__init__ so that _init_model can read it if the
        # base constructor is what triggers model initialisation.
        self.L = L # number of hidden blocks (depth)
        super().__init__(D, C, H, lam, p_dropout, loss, nonlin)

    def _init_model(self, D, C, H):
        """Create the parameters and batch-norm running statistics."""
        self.model = []

        # model[0]: input convolution, 1 input channel -> H filters of size 3x3.
        self.model.append(dict(
            W1=np.random.randn(H, 1, 3, 3) / np.sqrt(H / 2.),
            b1=np.zeros((H, 1))
        ))

        # model[1]: the self.L hidden blocks (H -> H convolution + batch-norm).
        self.model.append([
            dict(
                W2=np.random.randn(H, H, 3, 3) / np.sqrt(H / 2.),
                b2=np.zeros((H, 1)),
                gamma2=np.ones((1, H*D)),
                beta2=np.zeros((1, H*D))
            )
            for _ in range(self.L)
        ])

        # model[2]: fully connected layer from the flattened maps to the logits.
        self.model.append(dict(
            W3=np.random.randn(H*D, C) / np.sqrt(H*D / 2.),
            b3=np.zeros((1, C))
        ))

        # Running batch-norm statistics, one entry per hidden block.
        self.bn_caches = [
            dict(bn2_mean=np.zeros((1, H*D)), bn2_var=np.zeros((1, H*D)))
            for _ in range(self.L)
        ]

    def forward(self, X, train=False):
        """Run the network on a batch.

        Args:
            X: input batch handed to the first convolution.
            train: when True, dropout is applied and batch-norm is run in
                training mode.

        Returns:
            (logits, cache) where `cache` is what `backward` expects.
        """
        # 1st layer: input convolution followed by ReLU.
        h1, h1_cache = l.conv_forward(X=X, W=self.model[0]['W1'], b=self.model[0]['b1'])
        h2, nl1_cache = l.relu_forward(h1)

        # Hidden blocks. Batch-norm works on flattened activations, so the
        # tensor is reshaped to the conv-map shape before every convolution
        # (relies on nl1_cache exposing that shape via `.shape`).
        h2_caches = []
        for layer, params in enumerate(self.model[1]):
            running = self.bn_caches[layer]
            if layer != 0:
                h2 = h2.reshape(nl1_cache.shape)
            h2, h2_cache = l.conv_forward(X=h2, W=params['W2'], b=params['b2'])
            h2 = h2.reshape([nl1_cache.shape[0], -1])
            h2, bn2_cache, running['bn2_mean'], running['bn2_var'] = l.bn_forward(
                h2, params['gamma2'], params['beta2'],
                (running['bn2_mean'], running['bn2_var']),
                train=train)
            h2, nl2_cache = l.relu_forward(h2)
            # Dropout only runs in training; the cache stays None otherwise so
            # the per-block cache tuple always has four entries.
            do2_cache = None
            if train:
                h2, do2_cache = l.dropout_forward(X=h2, p_dropout=self.p_dropout)
            h2_caches.append((h2_cache, bn2_cache, nl2_cache, do2_cache))

        # Last layer: fully connected to the class logits.
        h3, h3_cache = l.fc_forward(X=h2, W=self.model[2]['W3'], b=self.model[2]['b3'])

        cache = (h1_cache, nl1_cache, h2_caches, h3_cache)
        return h3, cache

    def loss_function(self, y, y_train):
        """Return (loss, dloss/dy) for predictions `y` against targets `y_train`."""
        loss = self.loss_funs[self.loss](y, y_train)
        dy = self.dloss_funs[self.loss](y, y_train)
        return loss, dy

    def backward(self, dy, cache):
        """Back-propagate `dy` through the network.

        Args:
            dy: gradient of the loss with respect to the logits.
            cache: second return value of `forward`.

        Returns:
            (dX, grad) where `grad` mirrors the layout of `self.model`, i.e.
            grad[1][i] holds the gradients of the parameters in model[1][i].
        """
        h1_cache, nl1_cache, h2_caches, h3_cache = cache

        # Last layer.
        dh2, dw3, db3 = l.fc_backward(dout=dy, cache=h3_cache)

        # Hidden blocks, walked from the deepest to the shallowest. Gradients
        # are stored at the block's own index: appending while iterating in
        # reverse would pair every block with another block's gradient.
        n_blocks = len(h2_caches)
        g = [None] * n_blocks
        for layer in reversed(range(n_blocks)):
            h2_cache, bn2_cache, nl2_cache, do2_cache = h2_caches[layer]
            # No dropout cache exists when forward ran with train=False.
            if do2_cache is not None:
                dh2 = l.dropout_backward(dout=dh2, cache=do2_cache)
            dh2 = l.relu_backward(dout=dh2, cache=nl2_cache)
            dh2, dgamma2, dbeta2 = l.bn_backward(dout=dh2, cache=bn2_cache)
            dh2 = dh2.reshape(nl1_cache.shape)
            dh2, dw2, db2 = l.conv_backward(dout=dh2, cache=h2_cache)
            # The previous block ends in flattened activations; block 0 feeds
            # straight into the first ReLU and keeps the conv-map shape.
            if layer != 0:
                dh2 = dh2.reshape([nl1_cache.shape[0], -1])
            g[layer] = dict(
                W2=dw2,
                b2=db2,
                gamma2=dgamma2,
                beta2=dbeta2
            )

        # 1st layer.
        dh1 = l.relu_backward(dout=dh2, cache=nl1_cache)
        dX, dw1, db1 = l.conv_backward(dout=dh1, cache=h1_cache)

        # Gradients in the same nested layout as self.model.
        grad = [
            dict(W1=dw1, b1=db1),
            g,
            dict(W3=dw3, b3=db3),
        ]

        return dX, grad

    def test(self, X):
        """Predict for `X` in inference mode (no dropout).

        Returns the arg-max class index per row in classification mode,
        otherwise the rounded raw network output.
        """
        y_logit, _ = self.forward(X, train=False)
        if self.mode == 'classification':
            # For a standard row-wise softmax the arg-max of the logits equals
            # the arg-max of the probabilities, so no softmax is needed here;
            # this also removes the dependency on `util`, which is only
            # imported in a later cell.
            return np.argmax(y_logit, axis=1)
        else: # self.mode == 'regression'
            return np.round(y_logit)

In [19]:
# SGD
# import numpy as np
import impl.utils as util
import impl.constant as c
import copy
from sklearn.utils import shuffle as skshuffle

def get_minibatch(X, y, minibatch_size, shuffle=True):
    """Split `X` and `y` into consecutive minibatches.

    Args:
        X: samples, indexed along the first axis.
        y: targets aligned with `X`.
        minibatch_size: number of samples per batch; the final batch may be
            smaller when the sample count is not a multiple of it.
        shuffle: when True, `X` and `y` are jointly shuffled first.

    Returns:
        A list of `(X_mini, y_mini)` tuples covering all samples in order.
    """
    if shuffle:
        X, y = skshuffle(X, y)

    n_samples = X.shape[0]
    return [
        (X[start:start + minibatch_size], y[start:start + minibatch_size])
        for start in range(0, n_samples, minibatch_size)
    ]

def _zero_moments(params):
    """Return a dict of zero arrays with the same keys and shapes as `params`."""
    return {k: np.zeros_like(v) for k, v in params.items()}


def _adam_update(params, grads, first_moment, second_moment, t, alpha, beta1, beta2):
    """Apply one bias-corrected Adam step, in place, to every array in `params`."""
    for k in grads:
        first_moment[k] = util.exp_running_avg(first_moment[k], grads[k], beta1)
        second_moment[k] = util.exp_running_avg(second_moment[k], grads[k]**2, beta2)

        # Bias correction for the zero-initialised running averages.
        m_k_hat = first_moment[k] / (1. - beta1**t)
        r_k_hat = second_moment[k] / (1. - beta2**t)

        params[k] -= alpha * m_k_hat / (np.sqrt(r_k_hat) + c.eps)


def adam(nn, X_train, y_train, val_set=None, alpha=0.001, mb_size=256, n_iter=2000, print_after=100):
    """Train `nn` in place with Adam and return it.

    `nn.model` must be laid out as [dict, list of dicts, dict] and
    `nn.backward` must return gradients in the same layout.

    Args:
        nn: network exposing `model`, `forward`, `loss_function`, `backward`
            and `test`.
        X_train, y_train: training samples and targets.
        val_set: optional `(X_val, y_val)` tuple; when given, validation
            accuracy is reported together with the loss.
        alpha: learning rate.
        mb_size: minibatch size.
        n_iter: number of parameter updates (one random minibatch each), not
            the number of passes over the data.
        print_after: report progress every `print_after` iterations.

    Returns:
        The same `nn` object with updated parameters.
    """
    # First (M) and second (R) moment estimates, mirroring nn.model's layout.
    # The number of hidden blocks is read from the model rather than assumed.
    M = [_zero_moments(nn.model[0]),
         [_zero_moments(block) for block in nn.model[1]],
         _zero_moments(nn.model[2])]
    R = [_zero_moments(nn.model[0]),
         [_zero_moments(block) for block in nn.model[1]],
         _zero_moments(nn.model[2])]
    beta1 = .9
    beta2 = .999

    # Minibatches are built once (after one shuffle) and sampled with
    # replacement at every iteration.
    minibatches = get_minibatch(X_train, y_train, mb_size)

    if val_set:
        X_val, y_val = val_set

    for t in range(1, n_iter + 1):
        idx = np.random.randint(0, len(minibatches))
        X_mini, y_mini = minibatches[idx]

        # Single training step over the minibatch: forward, loss, backprop.
        y, cache = nn.forward(X_mini, train=True)
        loss, dy = nn.loss_function(y, y_mini)
        dX, grad = nn.backward(dy, cache)

        if t % print_after == 0:
            if val_set:
                val_acc = util.accuracy(y_val, nn.test(X_val))
                print('Iter-{} loss: {:.4f} validation: {:.4f}'.format(t, loss, val_acc))
            else:
                print('Iter-{} loss: {:.4f}'.format(t, loss))

        # Input layer, every hidden block, then the output layer.
        _adam_update(nn.model[0], grad[0], M[0], R[0], t, alpha, beta1, beta2)
        for layer in range(len(nn.model[1])):
            _adam_update(nn.model[1][layer], grad[1][layer],
                         M[1][layer], R[1][layer], t, alpha, beta1, beta2)
        _adam_update(nn.model[2], grad[2], M[2], R[2], t, alpha, beta1, beta2)

    return nn

In [20]:
# Hyper-parameters

# Model
num_layers = 2  # depth: passed to the model as L

# Optimisation
alpha = 0.001   # learning rate
mb_size = 64    # minibatch size
n_iter = 10     # number of minibatch updates (one random minibatch per iteration)

# Reporting
print_after = 1 # report loss and validation accuracy every `print_after` iterations

In [None]:
# Build the network, train it with Adam, then score it on the test split.
net = DCNN(D=D, C=C, H=8, L=num_layers, p_dropout=0.5)

net = adam(
    nn=net,
    X_train=X_train,
    y_train=y_train,
    val_set=(X_val, y_val),
    alpha=alpha,
    mb_size=mb_size,
    n_iter=n_iter,
    print_after=print_after,
)

# Fraction of test samples whose predicted label matches the true label.
y_pred = net.predict(X_test)
is_correct = y_pred == y_test
accs = np.mean(is_correct)

print()
report = 'Test Mean accuracy: {:.4f}, std: {:.4f}'.format(accs.mean(), accs.std())
print(report)

Iter-1 loss: 3.2841 validation: 0.086800
Iter-2 loss: 2.9359 validation: 0.105800
Iter-3 loss: 2.7432 validation: 0.113400
Iter-4 loss: 2.5935 validation: 0.112800
Iter-5 loss: 2.1706 validation: 0.112600
Iter-6 loss: 2.2129 validation: 0.112600
Iter-7 loss: 2.0974 validation: 0.112600
Iter-8 loss: 2.0749 validation: 0.107000
Iter-9 loss: 1.7712 validation: 0.106400
Iter-10 loss: 2.0549 validation: 0.110000
