## 3.13.2 从零开始实现

In [1]:
import d2lzh as d2l
from mxnet import autograd, gluon, init, nd
from mxnet.gluon import loss as gloss, nn

def dropout(X, drop_prob):
    """Apply inverted dropout to ``X``.

    Each element of ``X`` is zeroed with probability ``drop_prob``; the
    surviving elements are divided by ``1 - drop_prob``.

    Args:
        X: Input NDArray.
        drop_prob: Probability of dropping an element; must lie in [0, 1].

    Returns:
        An NDArray with the same shape as ``X``.

    Raises:
        AssertionError: If ``drop_prob`` is outside [0, 1].
    """
    assert 0 <= drop_prob <= 1
    keep_prob = 1 - drop_prob
    # Every element is dropped: return zeros directly, which also avoids the
    # division by zero further down.
    if keep_prob == 0:
        return X.zeros_like()
    # Sample on the same device as X so the mask can be multiplied with X even
    # when X does not live on the default context. Samples below keep_prob
    # become 1 (kept), all others 0 (dropped).
    mask = nd.random.uniform(0, 1, X.shape, ctx=X.context) < keep_prob
    # Dropping changes the expected output from f(x) to
    # p * 0 + (1 - p) * f(x) = (1 - p) * f(x). Compensate either by dividing by
    # (1 - p) during training or by multiplying by (1 - p) at test time; this
    # implementation divides during training.
    return mask * X / keep_prob

In [2]:
# 2x8 matrix holding 0..15. With drop_prob=0 nothing is dropped and the
# scaling factor is 1, so the input comes back unchanged.
X = nd.arange(16).reshape(2, 8)
dropout(X, drop_prob=0)


[[ 0.  1.  2.  3.  4.  5.  6.  7.]
 [ 8.  9. 10. 11. 12. 13. 14. 15.]]
<NDArray 2x8 @cpu(0)>

In [3]:
# Each element is dropped with probability 0.5; survivors are divided by 0.5.
dropout(X, drop_prob=0.5)


[[ 0.  2.  4.  6.  0.  0.  0. 14.]
 [ 0. 18.  0.  0. 24. 26. 28.  0.]]
<NDArray 2x8 @cpu(0)>

In [4]:
# drop_prob=1 drops every element, so the result is all zeros.
dropout(X, drop_prob=1)


[[0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0.]]
<NDArray 2x8 @cpu(0)>

定义模型参数

In [5]:
# Layer sizes: flattened input features, output classes, and two hidden layers.
num_inputs = 784
num_outputs = 10
num_hiddens1 = 256
num_hiddens2 = 256

# Weights are drawn from a zero-mean normal with std 0.01; biases start at zero.
W1 = nd.random.normal(shape=(num_inputs, num_hiddens1), scale=0.01)
b1 = nd.zeros(num_hiddens1)
W2 = nd.random.normal(shape=(num_hiddens1, num_hiddens2), scale=0.01)
b2 = nd.zeros(num_hiddens2)
W3 = nd.random.normal(shape=(num_hiddens2, num_outputs), scale=0.01)
b3 = nd.zeros(num_outputs)

# Allocate gradient buffers for every trainable parameter.
params = [W1, b1, W2, b2, W3, b3]
for param in params:
    param.attach_grad()

定义模型

In [6]:
# Dropout probabilities applied after the first and second hidden layers.
drop_prob1 = 0.2
drop_prob2 = 0.5

def net(X):
    """Forward pass of the two-hidden-layer MLP with dropout.

    The input is flattened to ``(-1, num_inputs)``. Dropout is applied after
    each hidden layer only while ``autograd.is_training()`` is true.

    Args:
        X: Input batch NDArray.

    Returns:
        The output of the final linear layer (no softmax applied here).
    """
    training = autograd.is_training()  # dropout is used only when training
    flat = X.reshape((-1, num_inputs))
    hidden1 = nd.relu(nd.dot(flat, W1) + b1)
    if training:
        # Dropout layer after the first fully connected layer.
        hidden1 = dropout(hidden1, drop_prob1)
    hidden2 = nd.relu(nd.dot(hidden1, W2) + b2)
    if training:
        # Dropout layer after the second fully connected layer.
        hidden2 = dropout(hidden2, drop_prob2)
    return nd.dot(hidden2, W3) + b3

训练和测试模型

In [7]:
# Training hyperparameters shared by the experiments in this notebook.
num_epochs = 5
lr = 0.5
batch_size = 256

loss = gloss.SoftmaxCrossEntropyLoss()
train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size=batch_size)
# Train the from-scratch model with the hand-written parameter list and
# learning rate.
d2l.train_ch3(net, train_iter, test_iter, loss, num_epochs, batch_size,
              params=params, lr=lr)

epoch 1, loss 1.2234, train acc 0.522, test acc 0.770
epoch 2, loss 0.6022, train acc 0.775, test acc 0.830
epoch 3, loss 0.5074, train acc 0.814, test acc 0.835
epoch 4, loss 0.4610, train acc 0.834, test acc 0.853
epoch 5, loss 0.4332, train acc 0.845, test acc 0.854


## 3.13.3 简洁实现

In [29]:
# Gluon version of the same architecture: Dense -> Dropout -> Dense -> Dropout -> Dense.
net = nn.Sequential()
net.add(nn.Dense(256, activation="relu"))
net.add(nn.Dropout(drop_prob1))  # dropout after the first fully connected layer
net.add(nn.Dense(256, activation="relu"))
net.add(nn.Dropout(drop_prob2))  # dropout after the second fully connected layer
net.add(nn.Dense(10))
net.initialize(init.Normal(0.01))

In [9]:
# Plain SGD over all Gluon parameters; the trainer replaces the manual
# params/lr arguments of train_ch3.
trainer = gluon.Trainer(
    net.collect_params(),
    'sgd',
    optimizer_params={'learning_rate': lr},
)
d2l.train_ch3(net, train_iter, test_iter, loss, num_epochs, batch_size,
              trainer=trainer)

epoch 1, loss 1.1274, train acc 0.566, test acc 0.778
epoch 2, loss 0.5590, train acc 0.790, test acc 0.822
epoch 3, loss 0.4570, train acc 0.831, test acc 0.845
epoch 4, loss 0.5334, train acc 0.815, test acc 0.847
epoch 5, loss 0.4298, train acc 0.840, test acc 0.864


practice

In [21]:
# Exercise: what happens if the two dropout probabilities are swapped?
# The code was run 4 times for each setting, and every time the model with the
# swapped probabilities converged more slowly than the original one.
# Hypothesis: dropping more units close to the input keeps the model from
# learning features, which makes it harder to fit.
net = nn.Sequential()
net.add(nn.Dense(256, activation="relu"))
net.add(nn.Dropout(drop_prob2))  # dropout after the first fully connected layer
net.add(nn.Dense(256, activation="relu"))
net.add(nn.Dropout(drop_prob1))  # dropout after the second fully connected layer
net.add(nn.Dense(10))
net.initialize(init.Normal(0.01))
trainer = gluon.Trainer(
    net.collect_params(),
    'sgd',
    optimizer_params={'learning_rate': lr},
)
d2l.train_ch3(net, train_iter, test_iter, loss, num_epochs, batch_size,
              trainer=trainer)

epoch 1, loss 1.2032, train acc 0.531, test acc 0.741
epoch 2, loss 0.6153, train acc 0.768, test acc 0.825
epoch 3, loss 0.5253, train acc 0.806, test acc 0.844
epoch 4, loss 0.4847, train acc 0.821, test acc 0.853
epoch 5, loss 0.4538, train acc 0.833, test acc 0.853


In [24]:
# To test the hypothesis, change only the dropout probability of the first
# dropout layer and keep everything else fixed.
def _train_with_first_dropout(first_drop_prob, second_drop_prob=0.2):
    """Build, initialize and train a fresh two-hidden-layer MLP.

    Args:
        first_drop_prob: Dropout probability after the first hidden layer.
        second_drop_prob: Dropout probability after the second hidden layer.

    Training uses the notebook-level ``train_iter``, ``test_iter``, ``loss``,
    ``num_epochs``, ``batch_size`` and ``lr``.
    """
    net = nn.Sequential()
    net.add(nn.Dense(256, activation="relu"),
            nn.Dropout(first_drop_prob),
            nn.Dense(256, activation="relu"),
            nn.Dropout(second_drop_prob),
            nn.Dense(10))
    net.initialize(init.Normal(sigma=0.01))
    trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': lr})
    d2l.train_ch3(net, train_iter, test_iter, loss, num_epochs, batch_size, None, None, trainer)


def trainWithSmallDroput():
    """Train with dropout 0.2 after the first hidden layer (0.2 after the second)."""
    _train_with_first_dropout(0.2)


def trainWithLargeDroput():
    """Train with dropout 0.5 after the first hidden layer (0.2 after the second)."""
    _train_with_first_dropout(0.5)


# Repeat the comparison several times because single runs are noisy.
for _ in range(5):
    trainWithSmallDroput()
    print()
    trainWithLargeDroput()
    print("\n\n")

epoch 1, loss 1.1043, train acc 0.568, test acc 0.777
epoch 2, loss 0.5781, train acc 0.780, test acc 0.819
epoch 3, loss 0.4791, train acc 0.825, test acc 0.854
epoch 4, loss 0.4354, train acc 0.839, test acc 0.859
epoch 5, loss 0.4091, train acc 0.849, test acc 0.855

epoch 1, loss 1.1918, train acc 0.526, test acc 0.779
epoch 2, loss 0.6226, train acc 0.765, test acc 0.827
epoch 3, loss 0.5307, train acc 0.805, test acc 0.841
epoch 4, loss 0.4803, train acc 0.823, test acc 0.849
epoch 5, loss 0.4529, train acc 0.834, test acc 0.854



epoch 1, loss 1.1103, train acc 0.562, test acc 0.793
epoch 2, loss 0.5674, train acc 0.787, test acc 0.840
epoch 3, loss 0.4769, train acc 0.823, test acc 0.851
epoch 4, loss 0.6609, train acc 0.794, test acc 0.816
epoch 5, loss 0.5014, train acc 0.818, test acc 0.856

epoch 1, loss 1.1259, train acc 0.567, test acc 0.780
epoch 2, loss 0.5897, train acc 0.781, test acc 0.838
epoch 3, loss 0.5125, train acc 0.812, test acc 0.847
epoch 4, loss 0.4677, t

In [26]:
# Train for more epochs (10 instead of num_epochs), with dropout.
# The network is rebuilt and re-initialized here so the run starts from fresh
# weights. Reusing the `net` left over from an earlier cell would continue
# training an already-trained model and make the result depend on the order in
# which cells were executed.
net = nn.Sequential()
net.add(nn.Dense(256, activation="relu"),
        nn.Dropout(drop_prob1),  # dropout after the first fully connected layer
        nn.Dense(256, activation="relu"),
        nn.Dropout(drop_prob2),  # dropout after the second fully connected layer
        nn.Dense(10))
net.initialize(init.Normal(sigma=0.01))
trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': lr})
d2l.train_ch3(net, train_iter, test_iter, loss, 10, batch_size, None, None, trainer)

epoch 1, loss 1.3107, train acc 0.499, test acc 0.734
epoch 2, loss 0.6193, train acc 0.766, test acc 0.821
epoch 3, loss 0.5127, train acc 0.813, test acc 0.850
epoch 4, loss 0.4666, train acc 0.829, test acc 0.857
epoch 5, loss 0.4348, train acc 0.841, test acc 0.856
epoch 6, loss 0.4146, train acc 0.848, test acc 0.865
epoch 7, loss 0.4003, train acc 0.855, test acc 0.866
epoch 8, loss 0.3863, train acc 0.859, test acc 0.869
epoch 9, loss 0.3773, train acc 0.863, test acc 0.874
epoch 10, loss 0.3647, train acc 0.866, test acc 0.862


In [30]:
# Train for more epochs (10 instead of num_epochs), without dropout.
# Observation from the recorded run: the model fits faster.
# A dropout-free network is built and initialized here so that this cell really
# measures the no-dropout case and does not keep training whatever `net` an
# earlier cell left behind.
net = nn.Sequential()
net.add(nn.Dense(256, activation="relu"),
        nn.Dense(256, activation="relu"),
        nn.Dense(10))
net.initialize(init.Normal(sigma=0.01))
trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': lr})
d2l.train_ch3(net, train_iter, test_iter, loss, 10, batch_size, None, None, trainer)

epoch 1, loss 1.1386, train acc 0.552, test acc 0.772
epoch 2, loss 0.5442, train acc 0.796, test acc 0.837
epoch 3, loss 0.4511, train acc 0.832, test acc 0.852
epoch 4, loss 0.4092, train acc 0.847, test acc 0.860
epoch 5, loss 0.3835, train acc 0.856, test acc 0.867
epoch 6, loss 0.3634, train acc 0.864, test acc 0.867
epoch 7, loss 0.3519, train acc 0.869, test acc 0.872
epoch 8, loss 0.3309, train acc 0.876, test acc 0.874
epoch 9, loss 0.3233, train acc 0.879, test acc 0.876
epoch 10, loss 0.3079, train acc 0.885, test acc 0.882


In [8]:
# Exercise: if the model is made more complex (e.g. more hidden units/layers),
# does dropout counter overfitting more noticeably?
# This version stacks four 256-unit hidden layers WITHOUT dropout. The
# with-dropout variant inserts Dropout(0.2), Dropout(0.2), Dropout(0.5) and
# Dropout(0.5) after hidden layers 1 to 4 respectively.
net = nn.Sequential()
net.add(nn.Dense(256, activation="relu"))
net.add(nn.Dense(256, activation="relu"))
net.add(nn.Dense(256, activation="relu"))
net.add(nn.Dense(256, activation="relu"))
net.add(nn.Dense(10))
net.initialize(init.Normal(0.01))

In [34]:
# Deeper model, with dropout.
# The four-hidden-layer network is built here with its dropout layers enabled
# and freshly initialized, so this cell trains the configuration its title
# describes regardless of which `net` earlier cells left in scope.
net = nn.Sequential()
net.add(nn.Dense(256, activation="relu"),
        nn.Dropout(0.2),  # dropout after the first fully connected layer
        nn.Dense(256, activation="relu"),
        nn.Dropout(0.2),  # dropout after the second fully connected layer
        nn.Dense(256, activation="relu"),
        nn.Dropout(0.5),  # dropout after the third fully connected layer
        nn.Dense(256, activation="relu"),
        nn.Dropout(0.5),  # dropout after the fourth fully connected layer
        nn.Dense(10))
net.initialize(init.Normal(sigma=0.01))
trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': lr})
d2l.train_ch3(net, train_iter, test_iter, loss, 10, batch_size, None, None, trainer)

epoch 1, loss 2.3031, train acc 0.099, test acc 0.100
epoch 2, loss 2.3022, train acc 0.105, test acc 0.195
epoch 3, loss 1.6711, train acc 0.303, test acc 0.547
epoch 4, loss 1.2419, train acc 0.485, test acc 0.660
epoch 5, loss 0.8630, train acc 0.643, test acc 0.713
epoch 6, loss 0.6985, train acc 0.716, test acc 0.772
epoch 7, loss 0.6265, train acc 0.752, test acc 0.802
epoch 8, loss 0.5746, train acc 0.784, test acc 0.839
epoch 9, loss 0.5194, train acc 0.814, test acc 0.849
epoch 10, loss 0.4847, train acc 0.829, test acc 0.855


In [9]:
# Deeper model, without dropout.
# A fresh four-hidden-layer network with no dropout layers is built and
# initialized here; otherwise this cell would simply continue training the
# network that the previous cell already trained.
net = nn.Sequential()
net.add(nn.Dense(256, activation="relu"),
        nn.Dense(256, activation="relu"),
        nn.Dense(256, activation="relu"),
        nn.Dense(256, activation="relu"),
        nn.Dense(10))
net.initialize(init.Normal(sigma=0.01))
trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': lr})
d2l.train_ch3(net, train_iter, test_iter, loss, 10, batch_size, None, None, trainer)

epoch 1, loss 2.3030, train acc 0.099, test acc 0.100
epoch 2, loss 2.2200, train acc 0.129, test acc 0.209
epoch 3, loss 1.4752, train acc 0.375, test acc 0.616
epoch 4, loss 1.1878, train acc 0.543, test acc 0.658
epoch 5, loss 0.8256, train acc 0.676, test acc 0.762
epoch 6, loss 0.5957, train acc 0.760, test acc 0.789
epoch 7, loss 0.5302, train acc 0.796, test acc 0.827
epoch 8, loss 0.4812, train acc 0.821, test acc 0.840
epoch 9, loss 0.4501, train acc 0.833, test acc 0.845
epoch 10, loss 0.4195, train acc 0.845, test acc 0.852


In [12]:
# Exercise: using this section's model, compare dropout with weight decay.
# What happens when dropout and weight decay are used together?
net = nn.Sequential()
net.add(nn.Dense(256, activation="relu"))
net.add(nn.Dropout(drop_prob1))
net.add(nn.Dense(256, activation="relu"))
net.add(nn.Dropout(drop_prob2))
net.add(nn.Dense(10))
net.initialize(init.Normal(0.01))

In [9]:
# Dropout only (no weight decay), trained for 10 epochs.
trainer = gluon.Trainer(
    net.collect_params(),
    'sgd',
    optimizer_params={'learning_rate': lr},
)
d2l.train_ch3(net, train_iter, test_iter, loss, 10, batch_size,
              trainer=trainer)

epoch 1, loss 1.2314, train acc 0.527, test acc 0.744
epoch 2, loss 0.5997, train acc 0.778, test acc 0.831
epoch 3, loss 0.4976, train acc 0.817, test acc 0.852
epoch 4, loss 0.4550, train acc 0.835, test acc 0.853
epoch 5, loss 0.4262, train acc 0.846, test acc 0.857
epoch 6, loss 0.4080, train acc 0.852, test acc 0.851
epoch 7, loss 0.3893, train acc 0.859, test acc 0.869
epoch 8, loss 0.3762, train acc 0.865, test acc 0.876
epoch 9, loss 0.3627, train acc 0.869, test acc 0.880
epoch 10, loss 0.3544, train acc 0.870, test acc 0.876


In [13]:
# Dropout and weight decay together, trained for 10 epochs.
# The network is rebuilt and re-initialized here so that the comparison with
# the dropout-only run starts from fresh weights rather than from the model the
# previous cell already trained.
net = nn.Sequential()
net.add(nn.Dense(256, activation="relu"),
        nn.Dropout(drop_prob1),
        nn.Dense(256, activation="relu"),
        nn.Dropout(drop_prob2),
        nn.Dense(10))
net.initialize(init.Normal(sigma=0.01))
# 'wd' is the weight-decay coefficient passed to the SGD optimizer.
trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': lr, 'wd': 0.01})
d2l.train_ch3(net, train_iter, test_iter, loss, 10, batch_size, None, None, trainer)

epoch 1, loss 1.2772, train acc 0.501, test acc 0.718
epoch 2, loss 0.8281, train acc 0.695, test acc 0.788
epoch 3, loss 0.7236, train acc 0.736, test acc 0.778
epoch 4, loss 0.7605, train acc 0.724, test acc 0.792
epoch 5, loss 0.7151, train acc 0.742, test acc 0.790
epoch 6, loss 0.7093, train acc 0.743, test acc 0.786
epoch 7, loss 0.6742, train acc 0.758, test acc 0.771
epoch 8, loss 0.7685, train acc 0.725, test acc 0.797
epoch 9, loss 0.8089, train acc 0.711, test acc 0.512
epoch 10, loss 0.7766, train acc 0.711, test acc 0.767
