# 了解知道Dropout原理
Dropout首次出现是在AlexNet的论文里, 其灵感来自与Hiton观察到的一种现象, 银行的柜员每隔一段时间就会变更

Dropout通常在深层的神经网络中使用, 作用是作为一种正则方法来减轻网络的过拟合, 

Dropout的原理是, 按设置好的比率随机删掉指定网络层的神经元, 在一定程度上减少了模型的复杂度, 

每轮训练drop不同的神经元, 最后训练出的模型相当于是进行了模型融合


# Dropout的numpy实现

In [2]:
import numpy as np
import random
np.random.seed(20190414)

N_SAMPLES = 100
N_FEATURES = 20
N_HIDDEN = 5
KEEP_PROB = 0.5

# input 
x = np.random.rand(N_SAMPLES, N_FEATURES)  

# param
w = np.random.rand(N_FEATURES, N_HIDDEN) 
b = np.zeros((N_HIDDEN, ))

# dropout 
drop_mask = np.ones((N_HIDDEN,))
idx = np.arange(0, N_HIDDEN)
random.shuffle(idx)
drop_mask[idx[:int(KEEP_PROB*N_HIDDEN)]] = 0
print(drop_mask,'\n')
w *= drop_mask
print(w)
z = x @ w + b
a = 1 / (1 + np.exp(-z))

[1. 0. 1. 1. 0.] 

[[0.71645649 0.         0.05060891 0.61955856 0.        ]
 [0.88576087 0.         0.39937113 0.53747527 0.        ]
 [0.53110658 0.         0.2702829  0.76411014 0.        ]
 [0.98724908 0.         0.29208576 0.31459534 0.        ]
 [0.622194   0.         0.1566883  0.26401286 0.        ]
 [0.76566414 0.         0.4378574  0.27853535 0.        ]
 [0.72641444 0.         0.08246531 0.5653347  0.        ]
 [0.62880534 0.         0.73748472 0.12773497 0.        ]
 [0.20516862 0.         0.80561179 0.00392014 0.        ]
 [0.23672954 0.         0.49349176 0.19032891 0.        ]
 [0.59615967 0.         0.68799677 0.70462144 0.        ]
 [0.34684668 0.         0.45034929 0.3083422  0.        ]
 [0.82270837 0.         0.47790851 0.01770492 0.        ]
 [0.41069159 0.         0.92817836 0.40919922 0.        ]
 [0.96195132 0.         0.20167057 0.19196291 0.        ]
 [0.57686403 0.         0.70863631 0.60936838 0.        ]
 [0.04163928 0.         0.30326119 0.70660164 0.     

# PyTorch中实现dropout

In [None]:
# 给AlexNet 添加L1, L2正则
# 正则化应该是模型每一层的加权参数，而不是每一层的输出

import torch as t
import torch.nn as nn
t.manual_seed(20190414)

class AlexNet(nn.Module):

    def __init__(self, num_classes=1):
        super(AlexNet, self).__init__()
        self.features = nn.Sequential(
            nn.Conv2d(3, 64, kernel_size=11, stride=4, padding=2),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2),
            nn.Conv2d(64, 192, kernel_size=5, padding=2),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2),
            nn.Conv2d(192, 384, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(384, 256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(256, 256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2),
        )
        self.classifier = nn.Sequential(
            nn.Dropout(),
            nn.Linear(256 * 6 * 6, 4096),
            nn.ReLU(inplace=True),
            nn.Dropout(),
            nn.Linear(4096, 4096),
            nn.ReLU(inplace=True),
            nn.Linear(4096, num_classes),
        )

    def forward(self, x):
        x = self.features(x)
        x = x.view(x.size(0), 256 * 6 * 6)
        x = self.classifier(x)
        return x


batch_size = 32    
    
net = AlexNet()    
optimizer = t.optim.SGD(net.parameters(), lr=1e-4)


x = t.rand((batch_size, )+ (3,224,224), requires_grad=True)
y_ = t.ones(batch_size, requires_grad=True)
l1, l2 = t.tensor(0).float(), t.tensor(0).float()


optimizer.zero_grad()
y = net(x)
y[y<0]=0

criterion = nn.BCELoss()
loss = criterion(y, y_.detach())

for param in net.parameters():
    l1 += t.norm(param, 1)
    l2 += t.norm(param, 2)

loss = loss + l1 + l2
loss.backward()
optimizer.step()