In [11]:
from mxnet.gluon import data as gdata
from mxnet import gluon, init, nd, autograd
from mxnet.gluon import nn

In [2]:
# Fashion-MNIST training and test splits, stored under ./data.
mnist_train = gdata.vision.FashionMNIST(root="./data", train=True)
mnist_test = gdata.vision.FashionMNIST(root="./data", train=False)
batch_size = 256
# transform_first applies ToTensor to the first element of each sample
# (the image) and leaves the label untouched.
transformer = gdata.vision.transforms.ToTensor()
# Training batches are reshuffled so every epoch sees a different ordering.
train_iter = gdata.DataLoader(mnist_train.transform_first(transformer), batch_size=batch_size, shuffle=True)
# Accuracy does not depend on sample order, so the test set is read
# sequentially instead of being shuffled on every evaluation pass.
test_iter = gdata.DataLoader(mnist_test.transform_first(transformer), batch_size=batch_size, shuffle=False)

## initialize model parameters
The output layer of softmax regression is a fully connected layer with 10 outputs, one per class. We initialize its weights randomly from a normal distribution with zero mean and standard deviation 0.01.

In [3]:
# The model is a single fully connected (Dense) layer with 10 outputs,
# held in a Sequential container.
net = nn.Sequential()
net.add(nn.Dense(10))
# Weights are drawn from a normal distribution with standard deviation 0.01.
net.initialize(init.Normal(sigma=0.01))

## log of softmax
$$
\begin{aligned}
\log{(\hat y_j)} &= \log\left( \frac{e^{z_j}}{\sum_{i=1}^{n} e^{z_i}}\right) \\
 &= z_j -\log{\left( \sum_{i=1}^{n} e^{z_i} \right)}
\end{aligned}
$$

In [4]:
# Softmax and cross-entropy combined into one loss object; train() applies it
# directly to the raw outputs of net.
loss = gluon.loss.SoftmaxCrossEntropyLoss()

## optimization algorithm

In [5]:
# Display the parameters registered on the network (dense weight and bias).
# NOTE(review): the weight's input dimension prints as 0 here, presumably
# because no data has been passed through net yet - confirm.
net.collect_params()

sequential0_ (
  Parameter dense0_weight (shape=(10, 0), dtype=float32)
  Parameter dense0_bias (shape=(10,), dtype=float32)
)

In [6]:
# Optimize every parameter of net with the "sgd" optimizer, learning rate 0.1.
trainer = gluon.Trainer(net.collect_params(), "sgd", {"learning_rate":0.1})

## training

In [7]:
def evaluate_accuracy(data_iter, net):
    """Return the fraction of samples in ``data_iter`` that ``net`` classifies correctly.

    Args:
        data_iter: iterable yielding ``(features, labels)`` batches.
        net: callable mapping a feature batch to per-class scores; the
            predicted class is the argmax along axis 1.

    Returns:
        Number of correct predictions divided by the number of labels seen.

    Raises:
        ZeroDivisionError: if ``data_iter`` yields no samples.
    """
    correct, total = 0., 0
    for features, labels in data_iter:
        # Cast the labels so they compare element-wise against the argmax result.
        labels = labels.astype("float32")
        predicted = net(features).argmax(axis=1)
        matches = predicted == labels
        correct += matches.sum().asscalar()
        total += labels.size
    return correct / total

In [9]:
def train(net, train_iter, test_iter, loss, num_epochs, batch_size, trainer):
    """Train ``net`` for ``num_epochs`` epochs and print one metrics line per epoch.

    Args:
        net: model called on each feature batch.
        train_iter: iterable of ``(features, labels)`` training batches.
        test_iter: iterable of test batches, passed to ``evaluate_accuracy``.
        loss: callable ``loss(outputs, labels)``; its result is summed per batch.
        num_epochs: number of passes over ``train_iter``.
        batch_size: value handed to ``trainer.step`` after every batch.
        trainer: optimizer wrapper whose ``step`` applies the parameter update.

    Prints, per epoch, the mean training loss per sample, the training
    accuracy and the test accuracy. Returns nothing.
    """
    for epoch_no in range(1, num_epochs + 1):
        loss_total = 0.
        correct_total = 0.
        seen = 0
        for features, labels in train_iter:
            # Forward pass and loss are recorded so backward() can compute gradients.
            with autograd.record():
                scores = net(features)
                batch_loss = loss(scores, labels).sum()
            batch_loss.backward()
            # The nominal batch_size is passed on every step, including a
            # possibly smaller final batch.
            trainer.step(batch_size)
            labels = labels.astype("float32")
            loss_total += batch_loss.asscalar()
            hits = scores.argmax(axis=1) == labels
            correct_total += hits.sum().asscalar()
            seen += labels.size
        # Test accuracy is measured once per epoch, after all updates.
        test_acc = evaluate_accuracy(test_iter, net)
        print("epoch %d, loss %.4f, train_acc %.3f, test_acc %.3f"
              % (epoch_no, loss_total / seen, correct_total / seen, test_acc))

In [12]:
# Run 5 epochs of training; train() prints loss and accuracies after each epoch.
num_epochs = 5
train(net, train_iter, test_iter, loss, num_epochs, batch_size, trainer)

epoch 1, loss 0.7880, train_acc 0.748, test_acc 0.805
epoch 2, loss 0.5748, train_acc 0.810, test_acc 0.823
epoch 3, loss 0.5292, train_acc 0.823, test_acc 0.832
epoch 4, loss 0.5053, train_acc 0.830, test_acc 0.833
epoch 5, loss 0.4896, train_acc 0.835, test_acc 0.839
