In [18]:
import mxnet as mx
from mxnet import nd, autograd, gluon
import numpy as np
from __future__ import print_function

In [19]:
data_ctx = mx.cpu()
model_ctx = mx.gpu(0)

In [20]:
num_inputs = 28 * 28
num_labels = 10
batch_size = 64

In [23]:
def transform(data, label):
    return (data.astype(np.float32) / 255, label.astype(np.float32))

train_data = gluon.data.DataLoader(mx.gluon.data.vision.MNIST(train = True, transform=transform), batch_size = batch_size, shuffle = True)
test_data = gluon.data.DataLoader(mx.gluon.data.vision.MNIST(train = False, transform=transform), batch_size = batch_size, shuffle = False)

  label = np.fromstring(fin.read(), dtype=np.uint8).astype(np.int32)


  data = np.fromstring(fin.read(), dtype=np.uint8)


In [27]:
num_hidden_1 = 512
weight_scale = 0.01
W1 = nd.random_normal(shape=(num_inputs, num_hidden_1), ctx=model_ctx, scale=weight_scale)
b1 = nd.random_normal(shape=num_hidden_1, ctx=model_ctx, scale=weight_scale)

num_hidden_2 = 256
W2 = nd.random_normal(shape=(num_hidden_1, num_hidden_2), ctx=model_ctx, scale=weight_scale)
b2 = nd.random_normal(shape=num_hidden_2, ctx=model_ctx, scale=weight_scale)

W3 = nd.random_normal(shape=(num_hidden_2, num_labels), ctx=model_ctx, scale=weight_scale)
b3 = nd.random_normal(shape=num_labels, ctx=model_ctx, scale=weight_scale)

params = [W1, W2, W3, b1, b2, b3]

In [28]:
for param in params:
    param.attach_grad()

In [29]:
def relu(z):
    return nd.maximum(z, nd.zeros_like(z))

In [30]:
def net(X):
    with autograd.record():
        z1 = nd.dot(X,W1) + b1
        a1 = relu(z1)
        z2 = nd.dot(a1, W2) + b2
        a2 = relu(z2)
        return nd.dot(a2, W3) + b3


In [31]:
def SGD(params, lr):
    for param in params:
        param[:] = param - lr * param.grad

In [32]:
def softmax_cross_entropy(y_hat_linear, y):
    return - nd.nansum(y * nd.log_softmax(y_hat_linear), axis=0, exclude=True)

In [33]:
def evaluate_accuracy(data_iterator, net):
    numerator = 0.
    denominator = 0.
    for i, (data, label) in enumerate(data_iterator):
        data_in_context = data.as_in_context(model_ctx).reshape((-1, num_inputs))
        label_in_context = data.as_in_context(model_ctx)
        output = net(data_in_context)
        predictions = nd.argmax(output, axis=1)
        numerator += nd.sum(predictions==label_in_context)
        denominator += data.shape[0]
    return (numerator / denominator).asscalar()

In [34]:
epochs = 10
learning_rate = 0.01

for epoch in range(epochs):
    cumulative_loss = 0
    for i, (data, label) in enumerate(train_data):
        data_in_context = data.as_in_context(model_ctx).reshape((-1, num_inputs))
        label_in_context = label.as_in_context(model_ctx)
        label_one_hot = nd.one_hot(label_in_context, num_labels)
        with autograd.record():
            output = net(data_in_context)
            loss = softmax_cross_entropy(output, label_one_hot)
        loss.backward()
        SGD(params, learning_rate)
        cumulative_loss += nd.sum(loss).asscalar()
    test_acc = evaluate_accuracy(test_data, net)
    train_acc = evaluate_accuracy(train_data, net)
    print("Epoch %s, Loss: %s, Train acc: %s, Test acc: %s" %(epoch, cumulative_loss, train_acc, test_acc))

Epoch 0, Loss: 22870.68608736992, Train acc: 4003.1692, Test acc: 4029.7202


Epoch 1, Loss: 6371.416874974966, Train acc: 4151.567, Test acc: 4212.765


Epoch 2, Loss: 4213.716894470155, Train acc: 4051.6733, Test acc: 4049.2307


Epoch 3, Loss: 3173.8502270430326, Train acc: 4035.6172, Test acc: 4033.7441


Epoch 4, Loss: 2337.6467812918127, Train acc: 4027.812, Test acc: 4012.6226


Epoch 5, Loss: 1874.0931830760092, Train acc: 4006.0989, Test acc: 3935.8035


Epoch 6, Loss: 1528.5914667444304, Train acc: 4052.3386, Test acc: 4061.7146


Epoch 7, Loss: 1117.7837451065425, Train acc: 4034.6667, Test acc: 4009.0923


Epoch 8, Loss: 817.6338906649034, Train acc: 4056.0107, Test acc: 4097.613


Epoch 9, Loss: 697.2515440048883, Train acc: 4028.0862, Test acc: 4033.681
