# Implementing Dropout for MNIST

In [1]:
import torch
import torchvision
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable

batch_size_train = 4
learning_rate = 0.001
momentum = 0.9
dropout_rate_hidden = 0.5
dropout_rate_input = 0.2

### Data

In [29]:
trainloader = torch.utils.data.DataLoader(
  torchvision.datasets.MNIST(root='../data', train=True, download=True,
                             transform=torchvision.transforms.Compose([
                               torchvision.transforms.ToTensor(),
                               torchvision.transforms.Normalize(
                                 (0.1307,), (0.3081,))
                             ])),
  batch_size=batch_size_train, shuffle=True)


use_gpu = torch.cuda.is_available()

### Bernoulli dropout

In [30]:
class BernoulliDropout(nn.Module):
    def __init__(self, p=0.5):
        super(BernoulliDropout, self).__init__()
        if p < 0 or p > 1:
            raise ValueError("dropout probability has to be between 0 and 1, "
                             "but got {}".format(p))
        self.p = p
        
    def forward(self, x):
        binomial = torch.distributions.binomial.Binomial(probs=self.p)
        return x * binomial.sample(x.size())

### Gaussian dropout

In [31]:
class GaussianDropout(nn.Module):
    def __init__(self, p=0.5):
        super(GaussianDropout, self).__init__()
        alpha = p/(1-p)
        self.alpha = torch.Tensor([alpha])
        
    def forward(self, x):
#         Sample noise   e ~ N(1, alpha)
        epsilon = Variable(torch.randn(x.size()) * self.alpha + 1)
        if use_gpu:
            epsilon = epsilon.cuda()
        return x * epsilon
        

### Feed forward network

In [32]:
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        
#         self.do1 = nn.Dropout(p=dropout_rate_hidden) 
#         self.do2 = nn.Dropout(p=dropout_rate_input)

        self.do1 = GaussianDropout(dropout_rate_hidden)
        self.do2 = GaussianDropout(dropout_rate_input)
        
        self.fc1 = nn.Linear(28*28, 1024)
        self.fc2 = nn.Linear(1024, 10)

    def forward(self, x):
        x = self.do2(x.view(-1,28*28))
        x = F.relu(self.do1(self.fc1(x)))
        x = F.relu(self.do1(self.fc2(x)))
        return x

### Training

In [33]:
net = Net()

if use_gpu:
    net = net.cuda()


criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=learning_rate, momentum=momentum)


for epoch in range(2):  # loop over the dataset multiple times
    running_loss = 0.0
    for i, data in enumerate(trainloader, 0):
        # get the inputs
        inputs, labels = data
        if use_gpu:
            inputs = inputs.cuda()
            labels = labels.cuda()
        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = net(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # print statistics
        running_loss += loss.item()
        if i % 100 == 99:    # print every 100 mini-batches
            print('[%d, %5d] loss: %.3f' %
                  (epoch + 1, i + 1, running_loss / 100))
            running_loss = 0.0
            niter = epoch * len(trainloader) + i
print('Finished Training')

[1,   100] loss: 2.158
[1,   200] loss: 1.789
[1,   300] loss: 1.705
[1,   400] loss: 1.610
[1,   500] loss: 1.494
[1,   600] loss: 1.579
[1,   700] loss: 1.446
[1,   800] loss: 1.580
[1,   900] loss: 1.394
[1,  1000] loss: 1.438
[1,  1100] loss: 1.426
[1,  1200] loss: 1.486
[1,  1300] loss: 1.468
[1,  1400] loss: 1.431
[1,  1500] loss: 1.402
[1,  1600] loss: 1.282
[1,  1700] loss: 1.297
[1,  1800] loss: 1.366
[1,  1900] loss: 1.501
[1,  2000] loss: 1.312
[1,  2100] loss: 1.253
[1,  2200] loss: 1.239
[1,  2300] loss: 1.412
[1,  2400] loss: 1.417
[1,  2500] loss: 1.268
[1,  2600] loss: 1.320
[1,  2700] loss: 1.275
[1,  2800] loss: 1.308
[1,  2900] loss: 1.168
[1,  3000] loss: 1.296
[1,  3100] loss: 1.145
[1,  3200] loss: 1.076
[1,  3300] loss: 1.199
[1,  3400] loss: 1.290
[1,  3500] loss: 1.294
[1,  3600] loss: 1.180
[1,  3700] loss: 1.293
[1,  3800] loss: 1.089
[1,  3900] loss: 1.240
[1,  4000] loss: 1.367
[1,  4100] loss: 1.223
[1,  4200] loss: 1.292
[1,  4300] loss: 1.138
[1,  4400] 

### Testing
Comparison of classification error for Bernoulli and Gaussian dropout.

In [34]:
testSet = torchvision.datasets.MNIST(
    root='../data',
    train=False,
    download=True,
    transform=torchvision.transforms.Compose([
        torchvision.transforms.ToTensor(),
        torchvision.transforms.Normalize((0.1307,), (0.3081,))])
)

testLoader = torch.utils.data.DataLoader(
    testSet,
    batch_size=batch_size_train,
    shuffle=False,
    num_workers=0
)

In [38]:
total = 0
correct = 0
with torch.no_grad():
    for data in testLoader:
        images, labels = data
        if use_gpu:
            images = inputs.cuda()
            labels = labels.cuda()
        outputs = net(images)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

print('Accuracy of the network on the 10000 test images: %d %%' % (
    100 * correct / total))

Accuracy of the network on the 10000 test images: 9 %
