[View in Colaboratory](https://colab.research.google.com/github/brucecmd/learn_gluon/blob/master/dropout_raw.ipynb)

In [0]:
from mxnet import autograd, nd
from mxnet.gluon import loss as gloss
from mxnet.gluon import data as gdata
from mxnet import gluon

In [0]:
# define dropout function
def dropout(x, drop_prob):
    assert drop_prob>=0 and drop_prob<=1
    keep_prob = 1 - drop_prob
    if keep_prob ==0:
        return x.zeros_like()
    mask = nd.random_uniform(0,1,shape=(x.shape)) < keep_prob
    return x * mask / keep_prob

In [93]:
x = nd.arange(20).reshape((4,5))
print(x)
print(dropout(x,0.5))


[[ 0.  1.  2.  3.  4.]
 [ 5.  6.  7.  8.  9.]
 [10. 11. 12. 13. 14.]
 [15. 16. 17. 18. 19.]]
<NDArray 4x5 @cpu(0)>

[[ 0.  0.  0.  6.  0.]
 [10.  0.  0.  0. 18.]
 [20. 22. 24. 26. 28.]
 [ 0.  0. 34.  0. 38.]]
<NDArray 4x5 @cpu(0)>


In [0]:
# defien params
num_inputs = 784
num_outputs = 10
num_hidden1 = 256
num_hidden2 = 256

w1 = nd.random_normal(0,0.01,shape=(num_inputs,num_hidden1))
b1 = nd.zeros((num_hidden1,))
w2 = nd.random_normal(0,0.01,shape=(num_hidden1, num_hidden2))
b2 = nd.zeros((num_hidden2,))
w3 = nd.random_normal(0,0.01,shape=(num_hidden2,num_outputs))
b3 = nd.zeros((num_outputs,))

params = [w1,w2,w3,b1,b2,b3]
for p in params:
    p.attach_grad()

In [0]:
# load data
mnist_train = gdata.vision.FashionMNIST(train=True)
mnist_test = gdata.vision.FashionMNIST(train=False)

In [0]:
# prepare data iter
batch_size = 256
transformer = gdata.vision.transforms.ToTensor()
train_iter = gdata.DataLoader(mnist_train.transform_first(transformer), batch_size, shuffle=True)
test_iter = gdata.DataLoader(mnist_test.transform_first(transformer), batch_size, shuffle=True)

In [0]:
# define net
dropout_prob1 = 0.5
dropout_prob2 = 0.2

def net(x):
    x = x.reshape((-1, num_inputs))
    h1 = nd.relu(nd.dot(x,w1) + b1)
    if autograd.is_training():
        h1 = dropout(h1,dropout_prob1)
    h2 = nd.relu(nd.dot(h1,w2) + b2)
    if autograd.is_training():
        h2 = dropout(h2, dropout_prob2)
    return nd.dot(h2,w3) + b3
def net2(x):
    x = x.reshape((-1, num_inputs))
    h1 = nd.relu(nd.dot(x,w1) + b1)
    h2 = nd.relu(nd.dot(h1,w2) + b2)
    return nd.dot(h2,w3) + b3

In [0]:
def sgd(params, batch_size, lr):
    for p in params:
        p[:] -= p.grad * lr / batch_size

In [0]:
loss_func = gloss.SoftmaxCrossEntropyLoss()

In [0]:
def accuracy(y_hat, y):
    return (y_hat.argmax(axis=1)==y.astype('float32')).mean().asscalar()
  
def estimate_accuracy(data_iter, net):
    total_acc = 0
    for data, label in data_iter:
        y_hat = net(data)
        acc = accuracy(y_hat, label)
        total_acc += acc
    return total_acc / len(data_iter)

batch_size=256
lr=0.01
dropout_prob1=0.2
dropout_prob2=0.5

epoch[0], train acc[0.119310], test acc[0.116016]

epoch[1], train acc[0.146604], test acc[0.147168]

epoch[2], train acc[0.112417], test acc[0.115918]

epoch[3], train acc[0.180846], test acc[0.185840]

epoch[4], train acc[0.409081], test acc[0.409668]

epoch[5], train acc[0.455158], test acc[0.452734]

epoch[6], train acc[0.513725], test acc[0.504199]

epoch[7], train acc[0.589711], test acc[0.582129]

epoch[8], train acc[0.630059], test acc[0.627539]

epoch[9], train acc[0.661115], test acc[0.667676]

-==========================================

batch_size=256
lr=0.001

epoch[0], train acc[0.094753], test acc[0.096777]

epoch[1], train acc[0.108455], test acc[0.107813]

epoch[2], train acc[0.122025], test acc[0.120801]

epoch[3], train acc[0.136447], test acc[0.133594]

epoch[4], train acc[0.159403], test acc[0.157715]

epoch[5], train acc[0.186824], test acc[0.190625]

epoch[6], train acc[0.214484], test acc[0.212109]

epoch[7], train acc[0.240852], test acc[0.242773]

epoch[8], train acc[0.267010], test acc[0.269727]

epoch[9], train acc[0.290924], test acc[0.292480]

lr更小了，但是看loss，比lr大的时候更稳定了，loss没有出现一下变大，一下又变小的情况。后面加大epoch，观察一下loss最小能够到什么程度。
-==========================================

batch_size=256
lr=0.001
epochs=20

epoch[0], train acc[0.090841], test acc[0.085742]

epoch[1], train acc[0.098482], test acc[0.097754]

epoch[2], train acc[0.106992], test acc[0.105371]

epoch[3], train acc[0.117404], test acc[0.114355]

epoch[4], train acc[0.128801], test acc[0.122949]

epoch[5], train acc[0.142858], test acc[0.141309]

epoch[6], train acc[0.156483], test acc[0.153516]

epoch[7], train acc[0.168706], test acc[0.165527]

epoch[8], train acc[0.180563], test acc[0.179199]

epoch[9], train acc[0.191933], test acc[0.191309]

epoch[10], train acc[0.200449], test acc[0.202344]

epoch[11], train acc[0.206150], test acc[0.205469]

epoch[12], train acc[0.205884], test acc[0.208496]

epoch[13], train acc[0.197185], test acc[0.201953]

epoch[14], train acc[0.182968], test acc[0.181055]

epoch[15], train acc[0.166888], test acc[0.165820]

epoch[16], train acc[0.151313], test acc[0.147168]

epoch[17], train acc[0.139943], test acc[0.140137]

epoch[18], train acc[0.132430], test acc[0.132324]

epoch[19], train acc[0.126435], test acc[0.122949]

结果打脸了，这一版的参数下，结果也出现了波动。后面看一下lr=0.01的时候，loss最好情况能到达什么样的程度。

-==========================================

batch_size=256
lr=0.01
epochs=50

大概 train_loss 和 test_loss都达到了0.83左右。
调大lr试试。

-===========================================

batch_size=256
lr=0.1
epoch=20

epoch[0], train acc[0.608959], test acc[0.602930]

epoch[1], train acc[0.743146], test acc[0.742090]

epoch[2], train acc[0.787467], test acc[0.779785]

epoch[3], train acc[0.816162], test acc[0.810547]

epoch[4], train acc[0.823482], test acc[0.826172]

epoch[5], train acc[0.835699], test acc[0.830371]

epoch[6], train acc[0.842537], test acc[0.838867]

epoch[7], train acc[0.846947], test acc[0.843848]

epoch[8], train acc[0.856294], test acc[0.851855]

epoch[9], train acc[0.865187], test acc[0.859082]

epoch[10], train acc[0.867819], test acc[0.867676]

epoch[11], train acc[0.871022], test acc[0.864551]

epoch[12], train acc[0.873460], test acc[0.868457]

epoch[13], train acc[0.879000], test acc[0.870605]

epoch[14], train acc[0.880297], test acc[0.870801]

epoch[15], train acc[0.881189], test acc[0.871973]

epoch[16], train acc[0.886553], test acc[0.877637]

epoch[17], train acc[0.885145], test acc[0.875586]

epoch[18], train acc[0.885295], test acc[0.876660]

epoch[19], train acc[0.890647], test acc[0.874414]

看起来lr=0.1表现好多了，收敛的快了很多。后面加上dropout，看看会表现如何。

-=============================================

batch_size=256
lr=0.1
nodropout

epoch[0], train acc[0.628341], test acc[0.627441]

epoch[1], train acc[0.745191], test acc[0.737793]

epoch[2], train acc[0.801889], test acc[0.796875]

epoch[3], train acc[0.811647], test acc[0.811133]

epoch[4], train acc[0.831394], test acc[0.826270]

epoch[5], train acc[0.842919], test acc[0.837793]

epoch[6], train acc[0.848177], test acc[0.848047]

epoch[7], train acc[0.851446], test acc[0.850195]

epoch[8], train acc[0.859491], test acc[0.849805]

epoch[9], train acc[0.861807], test acc[0.860352]

epoch[10], train acc[0.867437], test acc[0.860645]

epoch[11], train acc[0.868484], test acc[0.861426]

epoch[12], train acc[0.875211], test acc[0.871875]

epoch[13], train acc[0.874014], test acc[0.868359]

epoch[14], train acc[0.881560], test acc[0.874219]

epoch[15], train acc[0.880918], test acc[0.872559]

epoch[16], train acc[0.884092], test acc[0.874219]

epoch[17], train acc[0.887129], test acc[0.878320]

epoch[18], train acc[0.887395], test acc[0.873145]

epoch[19], train acc[0.885062], test acc[0.878125]


-=========================================

batch_size=256
lr=0.1
dropout_prob1=0.5
dropout_prob2=0.2

epoch[0], train acc[0.606028], test acc[0.601172]

epoch[1], train acc[0.737932], test acc[0.735547]

epoch[2], train acc[0.779959], test acc[0.780566]

epoch[3], train acc[0.817797], test acc[0.816406]

epoch[4], train acc[0.833505], test acc[0.831445]

epoch[5], train acc[0.834713], test acc[0.834180]

epoch[6], train acc[0.850648], test acc[0.851953]

epoch[7], train acc[0.856056], test acc[0.855371]

epoch[8], train acc[0.856549], test acc[0.849805]

epoch[9], train acc[0.864583], test acc[0.862500]

epoch[10], train acc[0.868163], test acc[0.865723]

epoch[11], train acc[0.866872], test acc[0.863770]

epoch[12], train acc[0.875244], test acc[0.870996]

epoch[13], train acc[0.875017], test acc[0.871680]

epoch[14], train acc[0.878529], test acc[0.872949]

epoch[15], train acc[0.881682], test acc[0.875488]

epoch[16], train acc[0.875659], test acc[0.862207]

epoch[17], train acc[0.885888], test acc[0.875977]

epoch[18], train acc[0.887705], test acc[0.877148]

epoch[19], train acc[0.885289], test acc[0.874512]

看起来，加了dropout_prob也没什么改善，之后把模型变复杂一些，看看dropout的作用吧。


In [101]:
epochs = 20
lr = 0.1
batch_size = 256
for i in range(epochs):
    for data, label in train_iter:
        with autograd.record():
            y_hat = net(data)
            l = loss_func(y_hat, label)
        l.backward()
        sgd(params, batch_size, lr)
    train_acc = estimate_accuracy(train_iter, net)
    test_acc = estimate_accuracy(test_iter, net)
    print('epoch[%d], train acc[%f], test acc[%f]'%(i, train_acc, test_acc))

epoch[0], train acc[0.606028], test acc[0.601172]
epoch[1], train acc[0.737932], test acc[0.735547]
epoch[2], train acc[0.779959], test acc[0.780566]
epoch[3], train acc[0.817797], test acc[0.816406]
epoch[4], train acc[0.833505], test acc[0.831445]
epoch[5], train acc[0.834713], test acc[0.834180]
epoch[6], train acc[0.850648], test acc[0.851953]
epoch[7], train acc[0.856056], test acc[0.855371]
epoch[8], train acc[0.856549], test acc[0.849805]
epoch[9], train acc[0.864583], test acc[0.862500]
epoch[10], train acc[0.868163], test acc[0.865723]
epoch[11], train acc[0.866872], test acc[0.863770]
epoch[12], train acc[0.875244], test acc[0.870996]
epoch[13], train acc[0.875017], test acc[0.871680]
epoch[14], train acc[0.878529], test acc[0.872949]
epoch[15], train acc[0.881682], test acc[0.875488]
epoch[16], train acc[0.875659], test acc[0.862207]
epoch[17], train acc[0.885888], test acc[0.875977]
epoch[18], train acc[0.887705], test acc[0.877148]
epoch[19], train acc[0.885289], test acc[