In [1]:
from mxnet import nd,autograd
from IPython import display
from matplotlib import pyplot as plt
from mxnet.gluon import data as gdata
# Ask IPython to emit matplotlib figures in SVG format.
display.set_matplotlib_formats("svg")
%matplotlib inline

In [2]:
# Fashion-MNIST training and test splits, both rooted at ./data.
mnist_train = gdata.vision.FashionMNIST(root="./data",train=True)
mnist_test = gdata.vision.FashionMNIST(root="./data", train=False)

In [3]:
# Number of examples per mini-batch; also used to scale the SGD step in training.
batch_size = 256
# ToTensor is applied to the first element (the image) of every (image, label) sample.
transformer = gdata.vision.transforms.ToTensor()
# Training batches are reshuffled every epoch.
train_iter = gdata.DataLoader(mnist_train.transform_first(transformer), batch_size=batch_size, shuffle=True)
# Accuracy does not depend on sample order, so the test set is not shuffled:
# this avoids needless work and keeps the evaluation batches deterministic.
test_iter = gdata.DataLoader(mnist_test.transform_first(transformer), batch_size=batch_size, shuffle=False)

## Initialize model parameters
Since each image has 28×28 pixels, we flatten it into a 784-dimensional vector. With 10 classes, the weight matrix therefore has shape 784×10 and the bias has 10 entries.

In [4]:
num_inputs = 784  # 28 * 28 pixels per flattened image
num_outputs = 10  # one output per class
# Weights are drawn from a normal distribution with scale 0.01; biases start at zero.
W = nd.random.normal(scale=0.01, shape=(num_inputs, num_outputs))
b = nd.zeros(num_outputs)

- attach a gradient to the model parameters

In [5]:
# Attach gradient buffers so that backward() can populate W.grad and b.grad,
# which the training loop reads for its SGD update.
W.attach_grad()
b.attach_grad()

- softmax
$$
\mathrm{softmax}(\mathbf{X})_{ij}=\frac{\exp(\mathbf{X}_{ij})}{\sum_{k}\exp(\mathbf{X}_{ik})}
$$

In [6]:
def softmax(x):
    """Row-wise softmax of a 2-D array of scores.

    Each row is shifted by its own maximum before exponentiation. Softmax is
    invariant to adding a constant to every entry of a row, so the result is
    mathematically unchanged, but the largest exponent becomes exp(0) = 1.
    Without the shift, large scores make ``exp`` overflow to ``inf`` and the
    division then yields ``nan``.

    Args:
        x: 2-D array; axis 0 indexes examples, axis 1 indexes classes.

    Returns:
        Array of the same shape as ``x`` whose rows are non-negative and
        sum to 1 (up to floating-point rounding).
    """
    # Per-row maximum, kept as a column so it broadcasts against x.
    shifted = x - x.max(axis=1, keepdims=True)
    x_exp = nd.exp(shifted)
    # Normalising constant of each row, again kept as a column for broadcasting.
    partition = x_exp.sum(axis=1, keepdims=True)
    return x_exp / partition

In [7]:
# Random 2x5 input used to sanity-check softmax.
x = nd.random.normal(shape=(2,5))
x


[[ 1.7974477   0.19594945 -1.7376398   0.04734707  0.14580931]
 [ 0.32604915  0.4578783  -0.89425814  0.4938394  -0.90434265]]
<NDArray 2x5 @cpu(0)>

In [8]:
# Apply softmax along each row of x; every entry should be positive.
x_prob = softmax(x)
x_prob


[[0.6264712  0.126293   0.01826552 0.10885343 0.12011679]
 [0.25569436 0.2917251  0.07546549 0.3024068  0.07470828]]
<NDArray 2x5 @cpu(0)>

In [9]:
# Each row of the softmax output should sum to 1, up to floating-point rounding.
x_prob.sum(axis=1)


[0.99999994 1.        ]
<NDArray 2 @cpu(0)>

## define the model

In [10]:
def net(x):
    """Softmax-regression forward pass.

    Flattens every example in ``x`` to ``num_inputs`` features, applies the
    affine map defined by the global ``W`` and ``b``, and converts the
    resulting scores to probabilities with ``softmax``.

    Args:
        x: batch of inputs; reshaped to (-1, num_inputs).

    Returns:
        One row of class probabilities per example.
    """
    flat = x.reshape(-1, num_inputs)
    scores = nd.dot(flat, W) + b
    return softmax(scores)

## the loss function
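- cross entropy, where $\mathbf{y}$ is the one-hot label vector; only the true-class term is non-zero, so the loss reduces to $-\log \hat{y}_{c}$ for the true class $c$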
$$
l(\mathbf{y}, \hat{\mathbf{y}}) = - \sum_j y_j \log \hat{y}_j
$$

In [11]:
def cross_entropy(y_hat, y):
    """Per-example cross-entropy loss.

    Args:
        y_hat: predicted class probabilities, one row per example.
        y: class index of each example, used to pick one entry per row of ``y_hat``.

    Returns:
        Negative log of the picked probabilities, one value per example.
    """
    # Probability the model assigned to the labelled class of each example.
    true_class_prob = nd.pick(y_hat, y)
    return -nd.log(true_class_prob)

## classification accuracy

In [14]:
def evaluate_accuracy(data_iter, net):
    """Fraction of examples in ``data_iter`` that ``net`` classifies correctly.

    Args:
        data_iter: iterable of (features, labels) batches.
        net: callable mapping a feature batch to per-class scores, one row
            per example.

    Returns:
        Number of correct predictions divided by the number of examples seen.
    """
    correct_total = 0.
    seen = 0
    for features, labels in data_iter:
        # Cast so labels compare against the argmax output
        # (presumably float32 -- TODO confirm).
        labels = labels.astype("float32")
        predicted = net(features).argmax(axis=1)
        hits = (predicted == labels).sum()
        correct_total += hits.asscalar()
        seen += labels.size
    return correct_total / seen

- Because we initialized the model with random weights, its accuracy should be close to random guessing, i.e., about 0.1 for 10 classes.

In [15]:
# Test-set accuracy of the model before any training.
evaluate_accuracy(test_iter, net)

0.0856

## model training

In [27]:
def train(net, train_iter, test_iter, loss, num_epochs, batch_size, params, lr, trainer=None):
    """Train ``net`` with mini-batch SGD and print metrics after every epoch.

    Args:
        net: callable mapping a feature batch to predictions.
        train_iter: iterable of (features, labels) training batches.
        test_iter: iterable of (features, labels) batches, passed to
            ``evaluate_accuracy`` at the end of each epoch.
        loss: callable ``loss(y_hat, y)`` returning per-example losses.
        num_epochs: number of passes over ``train_iter``.
        batch_size: divisor applied to the summed-loss gradient, and the
            value passed to ``trainer.step``.
        params: parameters updated in place by the manual SGD step; each must
            expose ``.grad``. Only used when ``trainer`` is None.
        lr: learning rate of the manual SGD step. Only used when ``trainer``
            is None.
        trainer: optional object with a ``step(batch_size)`` method (e.g. a
            Gluon ``Trainer``). When given, it performs the parameter update
            instead of the manual SGD step.

    Returns:
        None. One line per epoch is printed with the mean training loss,
        training accuracy and test accuracy.
    """
    for epoch in range(num_epochs):
        train_l_sum, train_acc_sum, n = 0., 0., 0
        for x, y in train_iter:
            with autograd.record():
                y_hat = net(x)
                l = loss(y_hat, y).sum()
            l.backward()
            if trainer is None:
                # Hand-written SGD. The gradient is of the *summed* loss, so it
                # is divided by batch_size to approximate the mean gradient.
                for param in params:
                    param[:] = param - lr * param.grad / batch_size
            else:
                # Previously this argument was ignored; delegate to it.
                trainer.step(batch_size)
            y = y.astype("float32")
            train_l_sum += l.asscalar()
            train_acc_sum += (y_hat.argmax(axis=1) == y).sum().asscalar()
            n += y.size
        test_acc = evaluate_accuracy(test_iter, net)
        print("epoch %d, loss %.4f, train_acc %.3f, test_acc %.3f" % (epoch + 1, train_l_sum / n, train_acc_sum / n, test_acc))

In [28]:
# Train for 5 epochs with learning rate 0.1; no trainer is passed, so the
# hand-written SGD update on [W, b] is used.
num_epochs, lr = 5, 0.1
train(net, train_iter, test_iter, cross_entropy, num_epochs, batch_size, [W,b], lr)

epoch 1, loss 0.4783, train_acc 0.838, test_acc 0.837
epoch 2, loss 0.4694, train_acc 0.841, test_acc 0.847
epoch 3, loss 0.4620, train_acc 0.843, test_acc 0.845
epoch 4, loss 0.4555, train_acc 0.845, test_acc 0.850
epoch 5, loss 0.4509, train_acc 0.846, test_acc 0.850
