# Implementing Dropout for MNIST

In [83]:
import torch
import torchvision
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable

# Optimisation settings: mini-batch size for the training DataLoader and the
# step size / momentum handed to the SGD optimiser.
batch_size_train = 4
learning_rate = 1e-3
momentum = 0.9

# Noise levels for the dropout layers of the network: one rate for the input
# pixels and one for the hidden layer.
dropout_rate_input = 0.2
dropout_rate_hidden = 0.5

### Data

In [89]:
# Preprocessing applied to every image: convert to a tensor, then normalise
# with mean 0.1307 and standard deviation 0.3081.
_mnist_transform = torchvision.transforms.Compose([
    torchvision.transforms.ToTensor(),
    torchvision.transforms.Normalize((0.1307,), (0.3081,)),
])

# MNIST training split, downloaded into ../data when not already present.
_mnist_train = torchvision.datasets.MNIST(
    root='../data', train=True, download=True, transform=_mnist_transform)

# Shuffled mini-batch iterator over the training split.
trainloader = torch.utils.data.DataLoader(
    _mnist_train, batch_size=batch_size_train, shuffle=True)


# Whether a CUDA device is available; later cells use this flag to decide
# if the model and batches are moved to the GPU.
use_gpu = torch.cuda.is_available()

### Bernoulli dropout

In [None]:
class BernoulliDropout(nn.Module):
    """Standard (inverted) Bernoulli dropout.

    In training mode every element of the input is zeroed independently
    with probability ``p``; the surviving elements are scaled by
    ``1 / (1 - p)`` so that the expected value of the output equals the
    input. In evaluation mode the input is returned unchanged.

    Args:
        p: probability of *dropping* an element, ``0 <= p <= 1``.

    Raises:
        ValueError: if ``p`` lies outside ``[0, 1]``.
    """

    def __init__(self, p=0.5):
        super(BernoulliDropout, self).__init__()
        if p < 0 or p > 1:
            raise ValueError("dropout probability has to be between 0 and 1, "
                             "but got {}".format(p))
        self.p = p

    def forward(self, x):
        """Apply dropout to ``x`` and return a tensor of the same shape."""
        # No noise at evaluation time, and nothing to do when p == 0.
        if not self.training or self.p == 0:
            return x
        # p == 1 drops everything; handled separately to avoid dividing by 0.
        if self.p == 1:
            return torch.zeros_like(x)

        keep_prob = 1 - self.p
        # The mask is 1 with probability keep_prob (NOT p). Building it from
        # full_like(x) puts it on the same device and dtype as the input.
        mask = torch.bernoulli(torch.full_like(x, keep_prob))
        # Inverted-dropout scaling keeps E[output] == x.
        return x * mask / keep_prob

### Gaussian dropout

In [91]:
class GaussianDropout(nn.Module):
    """Multiplicative Gaussian noise ("Gaussian dropout").

    In training mode every element of the input is multiplied by noise
    drawn from N(1, alpha), where ``alpha = p / (1 - p)`` is the noise
    *variance*. This is the mean and variance of the mask used by inverted
    Bernoulli dropout with drop probability ``p``. In evaluation mode the
    input is returned unchanged.

    Args:
        p: equivalent Bernoulli drop probability, ``0 <= p < 1``.

    Raises:
        ValueError: if ``p`` lies outside ``[0, 1)``.
    """

    def __init__(self, p=0.5):
        super(GaussianDropout, self).__init__()
        if p < 0 or p >= 1:
            raise ValueError("dropout probability has to be in [0, 1), "
                             "but got {}".format(p))
        alpha = p / (1 - p)
        # Kept as a tensor attribute so existing readers of .alpha still work.
        self.alpha = torch.Tensor([alpha])
        # alpha is a variance; randn must be scaled by the standard deviation.
        self.std = alpha ** 0.5

    def forward(self, x):
        """Multiply ``x`` by N(1, alpha) noise; identity in eval mode."""
        if not self.training or self.std == 0:
            return x
        # Sample noise e ~ N(1, alpha). randn_like creates it on the same
        # device and dtype as x, so no global GPU flag is needed.
        epsilon = torch.randn_like(x) * self.std + 1
        return x * epsilon
        

### Feed-forward network

In [86]:
class Net(nn.Module):
    """Single-hidden-layer MNIST classifier with dropout noise.

    Architecture: 784 inputs -> 1024 ReLU units -> 10 class logits.
    Dropout noise is applied to the input pixels and to the hidden
    activations. The output layer returns raw logits, which is what
    ``nn.CrossEntropyLoss`` expects.
    """

    def __init__(self):
        super(Net, self).__init__()

        # do1 is the hidden-layer noise and do2 the input-layer noise.
        # Replace GaussianDropout with another dropout module (e.g.
        # nn.Dropout) on these two lines to compare noise types.
        self.do1 = GaussianDropout(dropout_rate_hidden)
        self.do2 = GaussianDropout(dropout_rate_input)

        self.fc1 = nn.Linear(28*28, 1024)
        self.fc2 = nn.Linear(1024, 10)

    def forward(self, x):
        """Return class logits of shape (N, 10) for a batch of images."""
        # Flatten each image to a 784-vector and perturb the inputs.
        x = self.do2(x.view(-1, 28*28))
        # Noise is applied to the hidden activations, i.e. after the ReLU.
        x = self.do1(F.relu(self.fc1(x)))
        # No dropout and no ReLU on the output: the loss needs unclipped,
        # noise-free logits.
        return self.fc2(x)

### Training

In [None]:
# Instantiate the model; move it to the GPU when one is available.
net = Net()
net = net.cuda() if use_gpu else net

# Cross-entropy objective, minimised with SGD + momentum.
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=learning_rate, momentum=momentum)

for epoch in range(2):  # two passes over the training set
    running_loss = 0.0
    for i, data in enumerate(trainloader):
        # Unpack the mini-batch and put it on the same device as the model.
        inputs, labels = data
        if use_gpu:
            inputs, labels = inputs.cuda(), labels.cuda()

        # One optimisation step: clear old gradients, forward, backward,
        # parameter update.
        optimizer.zero_grad()
        outputs = net(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # Report the mean loss of the last 100 mini-batches, then reset.
        running_loss += loss.item()
        if (i + 1) % 100 == 0:
            print('[%d, %5d] loss: %.3f' %
                  (epoch + 1, i + 1, running_loss / 100))
            running_loss = 0.0
            # Global mini-batch index across epochs.
            niter = epoch * len(trainloader) + i
print('Finished Training')

### Testing
Comparison of classification error for Bernoulli and Gaussian dropout.