# 丢弃法

* 丢弃法（dropout）方法主要用来应对深度学习中过拟合问题
* dropout 只在训练时使用

$$E(h^{'}_{i}) = \frac{E(\xi_{i})}{1-p}h_i$$

In [1]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
import torchvision
import torchvision.transforms as transforms
import matplotlib.pyplot as plt
import time
import numpy as np
import utils as d2l

## load dataset

In [2]:
def load_dataset():
    mnist_train = torchvision.datasets.FashionMNIST(root='../Datasets/FashionMNIST', train=True, download=True,
                                                    transform=transforms.ToTensor())
    mnist_test = torchvision.datasets.FashionMNIST(root='../Datasets/FashionMNIST', train=False, download=True,
                                                   transform=transforms.ToTensor())
    return mnist_train, mnist_test

In [3]:
mnist_train, mnist_test = load_dataset()
print(len(mnist_train), len(mnist_test))

60000 10000


## model

In [4]:
def dropout(x, drop_prob=0.0):
    
    x = x.float()
    
    assert 0 <= drop_prob and drop_prob<= 1
    keep_prob = 1 - drop_prob  # 1-p
    if keep_prob == 0:
        return torch.zeros_like(x)
    
    mask = (torch.rand(x.shape) < keep_prob).float()
  
    return mask * x / keep_prob
      

In [5]:
x = torch.arange(12).view(4,3)
print(dropout(x, 0.0))
print(dropout(x, 0.5))
print(dropout(x, 1.0))

tensor([[ 0.,  1.,  2.],
        [ 3.,  4.,  5.],
        [ 6.,  7.,  8.],
        [ 9., 10., 11.]])
tensor([[ 0.,  2.,  0.],
        [ 0.,  8., 10.],
        [ 0.,  0., 16.],
        [18.,  0., 22.]])
tensor([[0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.]])


### params

In [6]:
num_inputs, num_outputs, num_hiddens1, num_hiddens2 = 784, 10, 256, 256


w1 = torch.tensor(np.random.normal(0, 0.01, size=(num_inputs, num_hiddens1)), dtype=torch.float, requires_grad=True)
b1 = torch.zeros(num_hiddens1, requires_grad=True)
w2 = torch.tensor(np.random.normal(0, 0.01, size=(num_hiddens1, num_hiddens2)), dtype=torch.float, requires_grad=True)
b2 = torch.zeros(num_hiddens2, requires_grad=True)
w3 = torch.tensor(np.random.normal(0, 0.01, size=(num_hiddens2, num_outputs)), dtype=torch.float, requires_grad=True)
b3 = torch.zeros(num_outputs, requires_grad=True)

params = [w1, b1, w2, b2, w3, b3]

In [7]:
# linear regression
def linear_regression(x, w, b):
    
    return torch.mm(x, w) + b

In [8]:
def net(x, drop_prob=0.0, is_training=True):
    
    drop_prob = drop_prob if is_training else 0.0
    
    h1 = linear_regression(x.view(-1, num_inputs), w1, b1).relu()
    h1 = dropout(h1, drop_prob)
    h2 = linear_regression(h1, w2, b2).relu()
    h2 = dropout(h2, drop_prob)
    logits = linear_regression(h2, w3, b3)
    
    return logits

In [9]:
def evaluate_accuracy(data_iter, net):
    acc_sum, n = 0.0, 0
    for X, y in data_iter:
        if isinstance(net, torch.nn.Module):
            net.eval() # 评估模式, 这会关闭dropout
            acc_sum += (net(X).argmax(dim=1) == y).float().sum().item()
            net.train() # 改回训练模式
        else: # 自定义的模型
            if('is_training' in net.__code__.co_varnames): # 如果有is_training这个参数
                # 将is_training设置成False
                acc_sum += (net(X, is_training=False).argmax(dim=1) == y).float().sum().item() 
            else:
                acc_sum += (net(X).argmax(dim=1) == y).float().sum().item() 
        n += y.shape[0]
    return acc_sum / n

In [10]:
lr = 0.2
batch_size = 256
num_epochs = 10
drop_prob = 0.5
num_workers = 4


loss = nn.CrossEntropyLoss()
optimizer = optim.SGD(params, lr=lr)

train_generator = data.DataLoader(mnist_train, batch_size=batch_size, shuffle=True, num_workers=num_workers)
test_generator = data.DataLoader(mnist_test, batch_size=batch_size, shuffle=False, num_workers=num_workers)


def train():
    for epoch in range(num_epochs):
        train_acc_sum, train_num = 0., 0
        train_loss_sum = 0.0
        for x, y in train_generator:
            y_pred = net(x, drop_prob, is_training=True)
            
            # computer loss
            l = loss(y_pred, y).sum() 
            
            # clean zero grad
            if optimizer is not None:
                optimizer.zero_grad()
            
            # compute grad
            l.backward()
            
            # update parameter
            optimizer.step()
                
            train_loss_sum += l.item()
            train_acc_sum += (y_pred.argmax(dim=1)==y).float().sum().item() 
            train_num += x.shape[0]
        
        # show train log
        train_acc = train_acc_sum / train_num
        train_loss = train_loss_sum / train_num

        test_acc = evaluate_accuracy(test_generator, net)
        
        print('epoch {} => loss {:.4f}, train acc {:.4f}, test acc {:.4f}'.
              format(epoch + 1, train_loss, train_acc, test_acc)) 

In [11]:
train()

epoch 1 => loss 0.0056, train acc 0.4378, test acc 0.6718
epoch 2 => loss 0.0028, train acc 0.7372, test acc 0.7617
epoch 3 => loss 0.0023, train acc 0.7912, test acc 0.8011
epoch 4 => loss 0.0020, train acc 0.8139, test acc 0.7907
epoch 5 => loss 0.0019, train acc 0.8306, test acc 0.8089
epoch 6 => loss 0.0018, train acc 0.8357, test acc 0.8320
epoch 7 => loss 0.0017, train acc 0.8426, test acc 0.8357
epoch 8 => loss 0.0017, train acc 0.8469, test acc 0.8492
epoch 9 => loss 0.0016, train acc 0.8538, test acc 0.8528
epoch 10 => loss 0.0016, train acc 0.8562, test acc 0.8522
