In [1]:
import torch
from torch import nn
import torchvision
from torchvision import transforms
from torch.utils.data import TensorDataset, DataLoader

Typically, we disable dropout at test time. Given a trained model and a new example, we do not drop out any nodes and thus do not need to normalize. However, there are some exceptions: some researchers use dropout at test time as a heuristic for estimating the uncertainty of neural network predictions: if the predictions agree across many different dropout outputs, then we might say that the network is more confident.

In [2]:
def dropout_layer(X, dropout):
    """Apply inverted dropout to ``X``.

    Each element is zeroed independently with probability ``dropout``; the
    surviving elements are scaled by ``1 / (1 - dropout)`` so that the
    expected value of every element is unchanged.

    Args:
        X: Input tensor.
        dropout: Drop probability; must lie in ``[0, 1]``.

    Returns:
        A tensor with the same shape and device as ``X``.

    Raises:
        AssertionError: If ``dropout`` is outside ``[0, 1]``.
    """
    assert 0 <= dropout <= 1
    if dropout == 1:
        # Everything is dropped; returning early also avoids dividing by zero.
        return torch.zeros_like(X)
    # Sample the mask on X's own device: a mask created on the default (CPU)
    # device cannot be multiplied with a tensor that lives on an accelerator.
    keep = torch.rand(X.shape, device=X.device) > dropout
    return keep.to(X.dtype) * X / (1.0 - dropout)

In [3]:
# Sanity check on a small 2x8 tensor: keep everything, drop about half,
# then drop everything.
X = torch.arange(16, dtype=torch.float32).reshape((2, 8))
for p in (0, 0.5, 1):
    print(dropout_layer(X, p))

tensor([[ 0.,  1.,  2.,  3.,  4.,  5.,  6.,  7.],
        [ 8.,  9., 10., 11., 12., 13., 14., 15.]])
tensor([[ 0.,  0.,  0.,  6.,  8.,  0.,  0., 14.],
        [ 0., 18.,  0.,  0.,  0.,  0., 28.,  0.]])
tensor([[0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.]])


## **Data**
http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz  
http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-labels-idx1-ubyte.gz  
http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz  
http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-labels-idx1-ubyte.gz  
Download these four files manually and place them in `{root}\FashionMNIST\raw`.

In [4]:
# Resize every image to 32x32 (an upscale), then convert it to a tensor.
trans = transforms.Compose([
    transforms.Resize((32, 32)),
    transforms.ToTensor(),
])

# download=False: the raw files must already be present under the root folder.
dataset_kwargs = dict(root='./data', transform=trans, download=False)
data_train = torchvision.datasets.FashionMNIST(train=True, **dataset_kwargs)
data_val = torchvision.datasets.FashionMNIST(train=False, **dataset_kwargs)

In [5]:
# Every dataset item is an (image, label) pair.
image, label = data_train[0]
# The image shape is (channel, height, width).
print(image.shape, label, sep='\n')

torch.Size([1, 32, 32])
9


In [6]:
batch_size = 64
# Reshuffle the training set every epoch; keep the validation order fixed.
train_loader = DataLoader(dataset=data_train, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(dataset=data_val, batch_size=batch_size, shuffle=False)

## **From Scratch**

A common choice is to set a lower dropout probability closer to the input layer. We ensure that dropout is only active during training.

In [7]:
class DropoutMLPScratch(nn.Module):
    """MLP with two hidden layers that uses the hand-written ``dropout_layer``.

    Dropout is applied after each hidden ReLU, and only while the module is
    in training mode. ``forward`` returns softmax probabilities, not logits.
    """

    def __init__(self, num_inputs, num_hiddens_1, num_hiddens_2, num_outputs, dropout_1, dropout_2):
        super().__init__()
        # Drop probabilities for the first and second hidden layer.
        self.dropout_1, self.dropout_2 = dropout_1, dropout_2
        self.l1 = nn.Linear(num_inputs, num_hiddens_1)
        self.l2 = nn.Linear(num_hiddens_1, num_hiddens_2)
        self.l3 = nn.Linear(num_hiddens_2, num_outputs)
        self.relu = nn.ReLU()
        self.softmax = nn.Softmax(dim=1)

    def forward(self, X):
        """Flatten ``X`` per sample and return class probabilities of shape (B, num_outputs)."""
        hidden = X.reshape(X.shape[0], -1)
        stages = ((self.l1, self.dropout_1), (self.l2, self.dropout_2))
        for linear, drop_prob in stages:
            hidden = self.relu(linear(hidden))
            # self.training is True after model.train() and False after model.eval().
            if self.training:
                hidden = dropout_layer(hidden, drop_prob)
        return self.softmax(self.l3(hidden))

In [8]:
# Flattened 1x32x32 input, two hidden layers of 256 units, 10 output classes,
# and a drop probability of 0.5 after each hidden layer.
model = DropoutMLPScratch(
    num_inputs=1 * 32 * 32,
    num_hiddens_1=256,
    num_hiddens_2=256,
    num_outputs=10,
    dropout_1=0.5,
    dropout_2=0.5,
)

In [9]:
# Plain SGD over all model parameters.
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

In [10]:
def cross_entropy(y_hat, y):
    """Mean negative log-likelihood of the true classes.

    Args:
        y_hat: (B, q) floating-point tensor of class probabilities.
        y: (B,) integer tensor of true class indices.

    Returns:
        A scalar tensor: the mean over the batch of ``-log(y_hat[i, y[i]])``.
        The definition sums over the batch; the mean is used here so the loss
        is divided by the batch size.
    """
    rows = torch.arange(y_hat.shape[0], device=y_hat.device)
    picked = y_hat[rows, y]  # probability assigned to each sample's true class
    # A probability that underflowed to exactly 0 would give log(0) = -inf and
    # a non-finite gradient, so clamp to the smallest positive normal value.
    floor = torch.finfo(y_hat.dtype).tiny
    return -torch.log(picked.clamp_min(floor)).mean()

def accuracy(y_hat, y):
    """Return the fraction of samples whose top-scoring class matches the label.

    Args:
        y_hat: (B, q) tensor of per-class scores.
        y: (B,) tensor of true class indices.

    Returns:
        A 0-dim float32 tensor holding the batch accuracy.
    """
    predicted = y_hat.argmax(dim=1).to(y.dtype)  # (B,)
    hits = predicted.eq(y)                        # (B,) booleans
    return hits.to(torch.float32).mean()

In [11]:
%%time
max_epochs = 10

for i in range(max_epochs):
    model.train()  # dropout on
    train_loss = 0.0
    num_train_samples = 0

    for X, y in train_loader:
        optimizer.zero_grad()  # reset the values stored in parameter.grad
        y_hat = model(X)
        loss = cross_entropy(y_hat, y)
        loss.backward()   # gradients are written to parameter.grad
        optimizer.step()  # update the parameters from the values in parameter.grad
        # The loss is a per-batch mean; weight it by the batch size so that a
        # smaller final batch does not skew the epoch average.
        batch_n = y.shape[0]
        train_loss += loss.item() * batch_n
        num_train_samples += batch_n

    model.eval()  # dropout off
    val_loss = 0.0
    val_acc = 0.0
    num_val_samples = 0
    with torch.no_grad():
        for X, y in val_loader:
            y_hat = model(X)
            batch_n = y.shape[0]
            # .item() keeps the running totals as plain Python floats.
            val_loss += cross_entropy(y_hat, y).item() * batch_n
            val_acc += accuracy(y_hat, y).item() * batch_n
            num_val_samples += batch_n

    print(f'epoch={i:02d} | train_loss={train_loss/num_train_samples:.4f} | val_loss={val_loss/num_val_samples:.4f} | val_acc={val_acc/num_val_samples:.4f}')

epoch=00 | train_loss=0.7897 | val_loss=0.5252 | val_acc=0.8064
epoch=01 | train_loss=0.5323 | val_loss=0.4653 | val_acc=0.8287
epoch=02 | train_loss=0.4842 | val_loss=0.4434 | val_acc=0.8385
epoch=03 | train_loss=0.4528 | val_loss=0.4449 | val_acc=0.8358
epoch=04 | train_loss=0.4359 | val_loss=0.3994 | val_acc=0.8562
epoch=05 | train_loss=0.4209 | val_loss=0.4012 | val_acc=0.8578
epoch=06 | train_loss=0.4096 | val_loss=0.3923 | val_acc=0.8557
epoch=07 | train_loss=0.4013 | val_loss=0.3825 | val_acc=0.8603
epoch=08 | train_loss=0.3937 | val_loss=0.3808 | val_acc=0.8560
epoch=09 | train_loss=0.3814 | val_loss=0.3794 | val_acc=0.8588
CPU times: total: 13min 2s
Wall time: 2min 12s


## **Concise Implementation**

In [12]:
class DropoutMLP(nn.Module):
    """MLP with two hidden layers built from ``nn.Dropout`` modules.

    Each hidden layer is Linear -> ReLU -> Dropout. ``forward`` returns the
    raw output of the last linear layer (no softmax is applied).
    """

    def __init__(self, num_inputs, num_hiddens_1, num_hiddens_2, num_outputs, dropout_1, dropout_2):
        super().__init__()
        layers = [
            # First hidden layer.
            nn.Linear(num_inputs, num_hiddens_1), nn.ReLU(), nn.Dropout(dropout_1),
            # Second hidden layer.
            nn.Linear(num_hiddens_1, num_hiddens_2), nn.ReLU(), nn.Dropout(dropout_2),
            # Output layer.
            nn.Linear(num_hiddens_2, num_outputs),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, X):
        """Flatten ``X`` per sample and run it through the network."""
        flat = X.reshape(X.shape[0], -1)
        return self.net(flat)

In [13]:
# Flattened 1x32x32 input, two hidden layers of 256 units, 10 output classes,
# and a drop probability of 0.5 after each hidden layer.
model = DropoutMLP(
    num_inputs=1 * 32 * 32,
    num_hiddens_1=256,
    num_hiddens_2=256,
    num_outputs=10,
    dropout_1=0.5,
    dropout_2=0.5,
)

In [14]:
# Plain SGD over all model parameters.
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

In [15]:
def cross_entropy(y_hat, y):
    """Mean cross-entropy loss computed from raw model outputs.

    Args:
        y_hat: Raw network outputs (logits), not softmax probabilities.
        y: True class indices.

    Returns:
        A scalar tensor with the loss averaged over the batch.
    """
    return torch.nn.functional.cross_entropy(y_hat, y)

def accuracy(y_hat, y):
    """Return the fraction of samples whose top-scoring class matches the label.

    Args:
        y_hat: (B, q) tensor of per-class scores.
        y: (B,) tensor of true class indices.

    Returns:
        A 0-dim float32 tensor holding the batch accuracy.
    """
    predicted = y_hat.argmax(dim=1).to(y.dtype)  # (B,)
    hits = predicted.eq(y)                        # (B,) booleans
    return hits.to(torch.float32).mean()

In [16]:
%%time
max_epochs = 10

for i in range(max_epochs):
    model.train()  # dropout on
    train_loss = 0.0
    num_train_samples = 0

    for X, y in train_loader:
        optimizer.zero_grad()  # reset the values stored in parameter.grad
        y_hat = model(X)
        loss = cross_entropy(y_hat, y)
        loss.backward()   # gradients are written to parameter.grad
        optimizer.step()  # update the parameters from the values in parameter.grad
        # The loss is a per-batch mean; weight it by the batch size so that a
        # smaller final batch does not skew the epoch average.
        batch_n = y.shape[0]
        train_loss += loss.item() * batch_n
        num_train_samples += batch_n

    model.eval()  # dropout off
    val_loss = 0.0
    val_acc = 0.0
    num_val_samples = 0
    with torch.no_grad():
        for X, y in val_loader:
            y_hat = model(X)
            batch_n = y.shape[0]
            # .item() keeps the running totals as plain Python floats.
            val_loss += cross_entropy(y_hat, y).item() * batch_n
            val_acc += accuracy(y_hat, y).item() * batch_n
            num_val_samples += batch_n

    print(f'epoch={i:02d} | train_loss={train_loss/num_train_samples:.4f} | val_loss={val_loss/num_val_samples:.4f} | val_acc={val_acc/num_val_samples:.4f}')

epoch=00 | train_loss=0.7975 | val_loss=0.5321 | val_acc=0.8018
epoch=01 | train_loss=0.5361 | val_loss=0.4685 | val_acc=0.8260
epoch=02 | train_loss=0.4828 | val_loss=0.4246 | val_acc=0.8486
epoch=03 | train_loss=0.4561 | val_loss=0.4564 | val_acc=0.8344
epoch=04 | train_loss=0.4361 | val_loss=0.4073 | val_acc=0.8526
epoch=05 | train_loss=0.4202 | val_loss=0.4010 | val_acc=0.8518
epoch=06 | train_loss=0.4087 | val_loss=0.3861 | val_acc=0.8583
epoch=07 | train_loss=0.3976 | val_loss=0.3791 | val_acc=0.8627
epoch=08 | train_loss=0.3893 | val_loss=0.3986 | val_acc=0.8557
epoch=09 | train_loss=0.3809 | val_loss=0.3614 | val_acc=0.8658
CPU times: total: 14min 3s
Wall time: 2min 22s
