## 3.13.2 从零开始实现

In [1]:
import d2lzh as d2l
from mxnet import autograd, gluon, init, nd
from mxnet.gluon import loss as gloss, nn

def dropout(X, drop_prob):
    """Apply (inverted) dropout to ``X``.

    Each element of ``X`` is kept when a fresh uniform sample falls below
    ``1 - drop_prob`` and zeroed otherwise. Kept elements are divided by the
    keep probability: without that rescaling the expected output would be
    ``(1 - drop_prob) * X`` instead of ``X``, and the correction would have
    to be applied at test time instead.

    Args:
        X: input array; must expose ``shape`` and ``zeros_like()``.
        drop_prob: drop probability, required to lie in ``[0, 1]``.

    Returns:
        An array shaped like ``X`` with dropped entries set to zero and the
        remaining entries rescaled by ``1 / (1 - drop_prob)``.

    Raises:
        AssertionError: if ``drop_prob`` is outside ``[0, 1]``.
    """
    assert 0 <= drop_prob <= 1
    retain = 1 - drop_prob
    if retain == 0:
        # Every element is dropped; returning early also avoids the
        # division by zero in the rescaling step below.
        return X.zeros_like()
    # 1 where the uniform sample is below the keep probability, 0 elsewhere.
    keep_mask = nd.random.uniform(0, 1, X.shape) < retain
    return keep_mask * X / retain

In [2]:
# 2x8 test input holding the values 0..15 (see the printed array below).
X = nd.arange(16).reshape((2, 8))
# drop_prob = 0 -> keep_prob = 1; the recorded output shows X returned unchanged.
dropout(X, 0)


[[ 0.  1.  2.  3.  4.  5.  6.  7.]
 [ 8.  9. 10. 11. 12. 13. 14. 15.]]
<NDArray 2x8 @cpu(0)>

In [3]:
# drop_prob = 0.5: surviving entries are divided by keep_prob = 0.5, i.e. doubled,
# and the rest are zeroed. Which entries survive is random on every call.
dropout(X, 0.5)


[[ 0.  2.  4.  6.  0.  0.  0. 14.]
 [ 0. 18.  0.  0. 24. 26. 28.  0.]]
<NDArray 2x8 @cpu(0)>

In [4]:
# drop_prob = 1 hits the keep_prob == 0 early return: the result is all zeros.
dropout(X, 1)


[[0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0.]]
<NDArray 2x8 @cpu(0)>

定义模型参数

In [5]:
# Layer widths of the from-scratch MLP: input size, output size and the two hidden layers.
# NOTE(review): 784 presumably corresponds to flattened 28x28 Fashion-MNIST images — confirm
# against the data loader.
num_inputs, num_outputs, num_hiddens1, num_hiddens2 = 784, 10, 256, 256

# Weights are sampled from a normal distribution with scale 0.01; biases start at zero.
W1 = nd.random.normal(scale=0.01, shape=(num_inputs, num_hiddens1))
b1 = nd.zeros(num_hiddens1)
W2 = nd.random.normal(scale=0.01, shape=(num_hiddens1, num_hiddens2))
b2 = nd.zeros(num_hiddens2)
W3 = nd.random.normal(scale=0.01, shape=(num_hiddens2, num_outputs))
b3 = nd.zeros(num_outputs)

# Attach a gradient buffer to every parameter so autograd can store its gradient.
params = [W1, b1, W2, b2, W3, b3]
for param in params:
    param.attach_grad()

定义模型

In [6]:
# Drop probabilities used after the first and second hidden layer respectively.
drop_prob1, drop_prob2 = 0.2, 0.5

def net(X):
    """Forward pass of the from-scratch MLP with two hidden layers.

    The input is reshaped to ``(-1, num_inputs)``. Each hidden layer is an
    affine map followed by ReLU; while autograd is in training mode the
    activations are additionally passed through ``dropout`` (``drop_prob1``
    after the first hidden layer, ``drop_prob2`` after the second). The
    output layer is a plain affine map.

    Args:
        X: input batch; any shape that can be reshaped to ``(-1, num_inputs)``.

    Returns:
        The output-layer values ``dot(H2, W3) + b3``.
    """
    hidden = X.reshape((-1, num_inputs))
    hidden_layers = ((W1, b1, drop_prob1), (W2, b2, drop_prob2))
    for weight, bias, drop_prob in hidden_layers:
        hidden = (nd.dot(hidden, weight) + bias).relu()
        # Dropout is only applied while training, never at evaluation time.
        if autograd.is_training():
            hidden = dropout(hidden, drop_prob)
    return nd.dot(hidden, W3) + b3

训练和测试模型

In [7]:
# Training hyperparameters shared by the experiments in this notebook.
num_epochs = 5
lr = 0.5
batch_size = 256

loss = gloss.SoftmaxCrossEntropyLoss()
train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size)

# Train the from-scratch model; the raw parameter list and learning rate are
# passed explicitly because no Gluon trainer is used here.
d2l.train_ch3(
    net, train_iter, test_iter, loss,
    num_epochs, batch_size, params, lr,
)

epoch 1, loss 1.1521, train acc 0.557, test acc 0.773
epoch 2, loss 0.5856, train acc 0.780, test acc 0.834
epoch 3, loss 0.4916, train acc 0.820, test acc 0.846
epoch 4, loss 0.4485, train acc 0.834, test acc 0.850
epoch 5, loss 0.4250, train acc 0.845, test acc 0.863


## 3.13.3 简洁实现

In [16]:
# Gluon version of the same architecture: two 256-unit ReLU layers, each
# followed by a dropout layer, then a 10-unit output layer.
net = nn.Sequential()
net.add(nn.Dense(256, activation="relu"))
net.add(nn.Dropout(drop_prob1))  # dropout after the first fully connected layer
net.add(nn.Dense(256, activation="relu"))
net.add(nn.Dropout(drop_prob2))  # dropout after the second fully connected layer
net.add(nn.Dense(10))
net.initialize(init.Normal(sigma=0.01))

In [9]:
# Plain SGD over every parameter of the Gluon model.
trainer = gluon.Trainer(
    params=net.collect_params(),
    optimizer='sgd',
    optimizer_params={'learning_rate': lr},
)
# The two None arguments fill the positional slots that the from-scratch run
# used for `params` and `lr`; the trainer is passed instead.
d2l.train_ch3(
    net, train_iter, test_iter, loss,
    num_epochs, batch_size, None, None, trainer,
)

epoch 1, loss 1.1725, train acc 0.553, test acc 0.771
epoch 2, loss 0.5408, train acc 0.798, test acc 0.837
epoch 3, loss 0.4602, train acc 0.828, test acc 0.839
epoch 4, loss 0.4178, train acc 0.845, test acc 0.862
epoch 5, loss 0.3910, train acc 0.856, test acc 0.857


In [11]:
# Exercise: what happens if the two dropout probabilities of this section are swapped?
#
# Build a fresh model with the probabilities actually swapped (drop_prob2 after
# the first hidden layer, drop_prob1 after the second). Reusing `net` here would
# neither swap anything nor start from freshly initialised weights, so the
# experiment would only be reproducible by hand-editing and re-running the
# model cell above.
net_swapped = nn.Sequential()
net_swapped.add(nn.Dense(256, activation="relu"),
                nn.Dropout(drop_prob2),  # larger drop probability first
                nn.Dense(256, activation="relu"),
                nn.Dropout(drop_prob1),
                nn.Dense(10))
net_swapped.initialize(init.Normal(sigma=0.01))

trainer = gluon.Trainer(net_swapped.collect_params(), 'sgd', {'learning_rate': lr})
d2l.train_ch3(net_swapped, train_iter, test_iter, loss, num_epochs, batch_size, None, None, trainer)

epoch 1, loss 1.1941, train acc 0.534, test acc 0.778
epoch 2, loss 0.6114, train acc 0.774, test acc 0.814
epoch 3, loss 0.4970, train acc 0.814, test acc 0.846
epoch 4, loss 0.4308, train acc 0.839, test acc 0.854
epoch 5, loss 0.3995, train acc 0.851, test acc 0.866


In [17]:
# Exercise: increase the number of epochs — result WITH dropout.
#
# `net` has already been trained by earlier cells. Re-draw its weights so this
# 200-epoch run starts from scratch on a top-to-bottom execution, which is what
# the recorded output (epoch 1 loss around 1.2) reflects.
net.initialize(init.Normal(sigma=0.01), force_reinit=True)

trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': lr})
d2l.train_ch3(net, train_iter, test_iter, loss, 200, batch_size, None, None, trainer)

epoch 1, loss 1.2075, train acc 0.531, test acc 0.742
epoch 2, loss 0.5902, train acc 0.780, test acc 0.830
epoch 3, loss 0.5004, train acc 0.816, test acc 0.844
epoch 4, loss 0.4535, train acc 0.834, test acc 0.862
epoch 5, loss 0.4261, train acc 0.844, test acc 0.864
epoch 6, loss 0.4018, train acc 0.853, test acc 0.868
epoch 7, loss 0.3948, train acc 0.857, test acc 0.875
epoch 8, loss 0.3752, train acc 0.863, test acc 0.870
epoch 9, loss 0.3677, train acc 0.867, test acc 0.873
epoch 10, loss 0.3562, train acc 0.870, test acc 0.871
epoch 11, loss 0.3489, train acc 0.872, test acc 0.877
epoch 12, loss 0.3386, train acc 0.877, test acc 0.878
epoch 13, loss 0.3322, train acc 0.878, test acc 0.881
epoch 14, loss 0.3237, train acc 0.882, test acc 0.884
epoch 15, loss 0.3203, train acc 0.881, test acc 0.884
epoch 16, loss 0.3144, train acc 0.885, test acc 0.888
epoch 17, loss 0.3095, train acc 0.886, test acc 0.884
epoch 18, loss 0.3054, train acc 0.888, test acc 0.884
epoch 19, loss 0.30

epoch 150, loss 0.1360, train acc 0.949, test acc 0.899
epoch 151, loss 0.1336, train acc 0.950, test acc 0.902
epoch 152, loss 0.1316, train acc 0.950, test acc 0.901
epoch 153, loss 0.1327, train acc 0.951, test acc 0.903
epoch 154, loss 0.1344, train acc 0.949, test acc 0.901
epoch 155, loss 0.1309, train acc 0.951, test acc 0.904
epoch 156, loss 0.1300, train acc 0.951, test acc 0.905
epoch 157, loss 0.1307, train acc 0.950, test acc 0.901
epoch 158, loss 0.1298, train acc 0.951, test acc 0.900
epoch 159, loss 0.1294, train acc 0.951, test acc 0.901
epoch 160, loss 0.1328, train acc 0.950, test acc 0.899
epoch 161, loss 0.1267, train acc 0.952, test acc 0.901
epoch 162, loss 0.1294, train acc 0.951, test acc 0.901
epoch 163, loss 0.1263, train acc 0.951, test acc 0.900
epoch 164, loss 0.1316, train acc 0.950, test acc 0.902
epoch 165, loss 0.1285, train acc 0.952, test acc 0.902
epoch 166, loss 0.1269, train acc 0.953, test acc 0.903
epoch 167, loss 0.1275, train acc 0.951, test ac

In [19]:
# Exercise: increase the number of epochs — result WITHOUT dropout.
#
# Build a dropout-free copy of the architecture with fresh weights. Training
# `net` here would still include its Dropout layers and would continue from
# already-trained weights, contradicting the label of this experiment.
net_no_dropout = nn.Sequential()
net_no_dropout.add(nn.Dense(256, activation="relu"),
                   nn.Dense(256, activation="relu"),
                   nn.Dense(10))
net_no_dropout.initialize(init.Normal(sigma=0.01))

trainer = gluon.Trainer(net_no_dropout.collect_params(), 'sgd', {'learning_rate': lr})
d2l.train_ch3(net_no_dropout, train_iter, test_iter, loss, 200, batch_size, None, None, trainer)

epoch 1, loss 1.2988, train acc 0.497, test acc 0.769
epoch 2, loss 0.5734, train acc 0.782, test acc 0.834
epoch 3, loss 0.4753, train acc 0.823, test acc 0.839
epoch 4, loss 0.4262, train acc 0.842, test acc 0.849
epoch 5, loss 0.3974, train acc 0.852, test acc 0.864
epoch 6, loss 0.3738, train acc 0.862, test acc 0.856
epoch 7, loss 0.3595, train acc 0.867, test acc 0.870
epoch 8, loss 0.3446, train acc 0.872, test acc 0.869
epoch 9, loss 0.3328, train acc 0.876, test acc 0.878
epoch 10, loss 0.3206, train acc 0.881, test acc 0.878
epoch 11, loss 0.3163, train acc 0.882, test acc 0.879
epoch 12, loss 0.2991, train acc 0.888, test acc 0.881
epoch 13, loss 0.2929, train acc 0.890, test acc 0.880
epoch 14, loss 0.2960, train acc 0.892, test acc 0.883
epoch 15, loss 0.2798, train acc 0.895, test acc 0.883
epoch 16, loss 0.2761, train acc 0.896, test acc 0.884
epoch 17, loss 0.2707, train acc 0.898, test acc 0.888
epoch 18, loss 0.2608, train acc 0.902, test acc 0.892
epoch 19, loss 0.25

epoch 150, loss 0.3542, train acc 0.870, test acc 0.865
epoch 151, loss 0.3540, train acc 0.869, test acc 0.858
epoch 152, loss 0.3518, train acc 0.871, test acc 0.862
epoch 153, loss 0.3496, train acc 0.871, test acc 0.863
epoch 154, loss 0.3510, train acc 0.871, test acc 0.865
epoch 155, loss 0.3450, train acc 0.873, test acc 0.860
epoch 156, loss 0.3417, train acc 0.875, test acc 0.860
epoch 157, loss 0.3422, train acc 0.874, test acc 0.858
epoch 158, loss 0.3400, train acc 0.875, test acc 0.865
epoch 159, loss 0.3337, train acc 0.877, test acc 0.861
epoch 160, loss 1.3992, train acc 0.527, test acc 0.237
epoch 161, loss 1.5172, train acc 0.353, test acc 0.507
epoch 162, loss 0.8821, train acc 0.659, test acc 0.733
epoch 163, loss 0.6586, train acc 0.749, test acc 0.774
epoch 164, loss 0.5949, train acc 0.773, test acc 0.786
epoch 165, loss 0.5611, train acc 0.784, test acc 0.787
epoch 166, loss 0.5353, train acc 0.795, test acc 0.810
epoch 167, loss 0.6352, train acc 0.751, test ac

In [14]:
# Exercise: if the model is made more complex (e.g. more hidden units), does
# dropout help more visibly against overfitting?
#
# This cell builds the deeper model WITHOUT dropout: four ReLU hidden layers
# (256, 256, 512, 512 units) and a 10-unit output layer. The dropout variant
# of the experiment places nn.Dropout(0.2) after each 256-unit layer and
# nn.Dropout(0.5) after each 512-unit layer.
net = nn.Sequential()
net.add(*(nn.Dense(units, activation="relu") for units in (256, 256, 512, 512)))
net.add(nn.Dense(10))
net.initialize(init.Normal(sigma=0.01))

In [9]:
# Deeper model — result WITH dropout.
#
# The `net` defined in the previous cell has all of its Dropout layers
# commented out, so training it would not measure dropout at all. Build the
# deep model with the dropout layers named in that cell instead: 0.2 after
# each 256-unit layer and 0.5 after each 512-unit layer.
net_deep_dropout = nn.Sequential()
for units, drop_prob in ((256, 0.2), (256, 0.2), (512, 0.5), (512, 0.5)):
    net_deep_dropout.add(nn.Dense(units, activation="relu"),
                         nn.Dropout(drop_prob))
net_deep_dropout.add(nn.Dense(10))
net_deep_dropout.initialize(init.Normal(sigma=0.01))

trainer = gluon.Trainer(net_deep_dropout.collect_params(), 'sgd', {'learning_rate': 0.1})
d2l.train_ch3(net_deep_dropout, train_iter, test_iter, loss, 100, batch_size, None, None, trainer)

epoch 1, loss 1.7772, train acc 0.300, test acc 0.641
epoch 2, loss 0.7640, train acc 0.700, test acc 0.759
epoch 3, loss 0.6006, train acc 0.779, test acc 0.823
epoch 4, loss 0.5058, train acc 0.814, test acc 0.836
epoch 5, loss 0.4594, train acc 0.830, test acc 0.854
epoch 6, loss 0.4330, train acc 0.843, test acc 0.855
epoch 7, loss 0.4096, train acc 0.851, test acc 0.864
epoch 8, loss 0.3976, train acc 0.855, test acc 0.866
epoch 9, loss 0.3810, train acc 0.861, test acc 0.876
epoch 10, loss 0.3692, train acc 0.864, test acc 0.874
epoch 11, loss 0.3640, train acc 0.867, test acc 0.875
epoch 12, loss 0.3481, train acc 0.873, test acc 0.879
epoch 13, loss 0.3389, train acc 0.875, test acc 0.871
epoch 14, loss 0.3320, train acc 0.879, test acc 0.878
epoch 15, loss 0.3253, train acc 0.881, test acc 0.884
epoch 16, loss 0.3204, train acc 0.883, test acc 0.885
epoch 17, loss 0.3121, train acc 0.884, test acc 0.885
epoch 18, loss 0.3084, train acc 0.888, test acc 0.884
epoch 19, loss 0.30

In [15]:
# Deeper model — result WITHOUT dropout.
#
# `net` is the dropout-free deep model, but on a top-to-bottom run it has
# already been trained by the previous cell. Re-draw its weights so this run
# starts from scratch, matching the recorded output (epoch 1 loss around 2.30).
net.initialize(init.Normal(sigma=0.01), force_reinit=True)

trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': 0.1})
d2l.train_ch3(net, train_iter, test_iter, loss, 200, batch_size, None, None, trainer)

epoch 1, loss 2.3027, train acc 0.101, test acc 0.100
epoch 2, loss 2.3025, train acc 0.102, test acc 0.100
epoch 3, loss 2.3023, train acc 0.107, test acc 0.100
epoch 4, loss 2.3011, train acc 0.129, test acc 0.100
epoch 5, loss 2.0594, train acc 0.222, test acc 0.450
epoch 6, loss 1.4460, train acc 0.415, test acc 0.542
epoch 7, loss 1.1110, train acc 0.547, test acc 0.627
epoch 8, loss 0.9566, train acc 0.622, test acc 0.729
epoch 9, loss 0.7949, train acc 0.703, test acc 0.760
epoch 10, loss 0.6628, train acc 0.751, test acc 0.773
epoch 11, loss 0.6284, train acc 0.769, test acc 0.800
epoch 12, loss 0.5422, train acc 0.801, test acc 0.816
epoch 13, loss 0.5098, train acc 0.816, test acc 0.838
epoch 14, loss 0.4785, train acc 0.828, test acc 0.841
epoch 15, loss 0.4540, train acc 0.838, test acc 0.844
epoch 16, loss 0.4263, train acc 0.847, test acc 0.855
epoch 17, loss 0.4188, train acc 0.849, test acc 0.857
epoch 18, loss 0.4074, train acc 0.855, test acc 0.857
epoch 19, loss 0.39

epoch 150, loss 0.0797, train acc 0.971, test acc 0.890
epoch 151, loss 0.0885, train acc 0.967, test acc 0.894
epoch 152, loss 0.0921, train acc 0.967, test acc 0.890
epoch 153, loss 0.0808, train acc 0.971, test acc 0.889
epoch 154, loss 0.0753, train acc 0.973, test acc 0.893
epoch 155, loss 0.1184, train acc 0.959, test acc 0.890
epoch 156, loss 0.0826, train acc 0.970, test acc 0.894
epoch 157, loss 0.0834, train acc 0.969, test acc 0.893
epoch 158, loss 0.0840, train acc 0.969, test acc 0.894
epoch 159, loss 0.0808, train acc 0.971, test acc 0.888
epoch 160, loss 0.0807, train acc 0.971, test acc 0.896
epoch 161, loss 0.0785, train acc 0.971, test acc 0.895
epoch 162, loss 0.0728, train acc 0.973, test acc 0.893
epoch 163, loss 0.0924, train acc 0.967, test acc 0.884
epoch 164, loss 0.0683, train acc 0.975, test acc 0.896
epoch 165, loss 0.0672, train acc 0.976, test acc 0.894
epoch 166, loss 0.0706, train acc 0.974, test acc 0.888
epoch 167, loss 0.0645, train acc 0.976, test ac