# Implementing Dropout for MNIST

In [22]:
import torch
import torchvision
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable

batch_size_train = 4
learning_rate = 0.001
momentum = 0.9
dropout_rate_hidden = 0.5
dropout_rate_input = 0.2

### Data

In [23]:
trainloader = torch.utils.data.DataLoader(
  torchvision.datasets.MNIST(root='../data', train=True, download=True,
                             transform=torchvision.transforms.Compose([
                               torchvision.transforms.ToTensor(),
                               torchvision.transforms.Normalize(
                                 (0.1307,), (0.3081,))
                             ])),
  batch_size=batch_size_train, shuffle=True)


use_gpu = torch.cuda.is_available()

# Weight Initialization

In [24]:
def init_weights(m):
    if type(m) == nn.Linear:
        # This may need to be changed to just normal_
        nn.init.xavier_normal_(m.weight)
        # Constant Bias
        m.bias.data.fill_(0.01)

### Bernoulli dropout

In [25]:
class BernoulliDropout(nn.Module):
    def __init__(self, p=0.5):
        super(BernoulliDropout, self).__init__()
        if p < 0 or p > 1:
            raise ValueError("dropout probability has to be between 0 and 1, "
                             "but got {}".format(p))
        self.p = p
        
    def forward(self, x):
        binomial = torch.distributions.binomial.Binomial(probs=self.p)
        return x * binomial.sample(x.size())

### Gaussian dropout

In [26]:
class GaussianDropout(nn.Module):
    def __init__(self, p=0.5):
        super(GaussianDropout, self).__init__()
        alpha = p/(1-p)
        self.alpha = torch.Tensor([alpha])
        
    def forward(self, x):
#         Sample noise   e ~ N(1, alpha)
        epsilon = Variable(torch.randn(x.size()) * self.alpha + 1)
        if use_gpu:
            epsilon = epsilon.cuda()
        return x * epsilon
        

### Feed forward network

In [27]:
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        
#         self.do1 = nn.Dropout(p=dropout_rate_hidden) 
#         self.do2 = nn.Dropout(p=dropout_rate_input)

        self.do1 = GaussianDropout(dropout_rate_hidden)
        self.do2 = GaussianDropout(dropout_rate_input)
        
        self.fc1 = nn.Linear(28*28, 1024)
        self.fc2 = nn.Linear(1024, 1024)
        self.fc3 = nn.Linear(1024, 10)

    def forward(self, x):
        x = self.do2(x.view(-1,28*28))
        x = F.relu(self.fc1(self.do1(x)))
        x = F.relu(self.fc2(self.do1(x)))
        x = self.fc3(x)
        return x

### Training

In [28]:
net = Net()
net.apply(init_weights)

if use_gpu:
    net = net.cuda()


criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=learning_rate, momentum=momentum)


for epoch in range(2):  # loop over the dataset multiple times
    running_loss = 0.0
    for i, data in enumerate(trainloader, 0):
        # get the inputs
        inputs, labels = data
        if use_gpu:
            inputs = inputs.cuda()
            labels = labels.cuda()
        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = net(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # print statistics
        running_loss += loss.item()
        if i % 100 == 99:    # print every 100 mini-batches
            print('[%d, %5d] loss: %.3f' %
                  (epoch + 1, i + 1, running_loss / 100))
            running_loss = 0.0
            niter = epoch * len(trainloader) + i
print('Finished Training')

[1,   100] loss: 2.045
[1,   200] loss: 1.277
[1,   300] loss: 1.135
[1,   400] loss: 0.935
[1,   500] loss: 0.767
[1,   600] loss: 0.882
[1,   700] loss: 0.894
[1,   800] loss: 0.921
[1,   900] loss: 0.895
[1,  1000] loss: 0.710
[1,  1100] loss: 0.577
[1,  1200] loss: 0.747
[1,  1300] loss: 0.698
[1,  1400] loss: 0.718
[1,  1500] loss: 0.645
[1,  1600] loss: 0.659
[1,  1700] loss: 0.622
[1,  1800] loss: 0.640
[1,  1900] loss: 0.732
[1,  2000] loss: 0.534
[1,  2100] loss: 0.543
[1,  2200] loss: 0.596
[1,  2300] loss: 0.498
[1,  2400] loss: 0.529
[1,  2500] loss: 0.608
[1,  2600] loss: 0.479
[1,  2700] loss: 0.619
[1,  2800] loss: 0.487
[1,  2900] loss: 0.492
[1,  3000] loss: 0.540
[1,  3100] loss: 0.530
[1,  3200] loss: 0.482
[1,  3300] loss: 0.622
[1,  3400] loss: 0.527
[1,  3500] loss: 0.534
[1,  3600] loss: 0.456
[1,  3700] loss: 0.458
[1,  3800] loss: 0.492
[1,  3900] loss: 0.510
[1,  4000] loss: 0.552
[1,  4100] loss: 0.490
[1,  4200] loss: 0.491
[1,  4300] loss: 0.458
[1,  4400] 

### Testing
Comparison of classification error for Bernoulli and Gaussian dropout.

In [29]:
testSet = torchvision.datasets.MNIST(
    root='../data',
    train=False,
    download=True,
    transform=torchvision.transforms.Compose([
        torchvision.transforms.ToTensor(),
        torchvision.transforms.Normalize((0.1307,), (0.3081,))])
)

testLoader = torch.utils.data.DataLoader(
    testSet,
    batch_size=batch_size_train,
    shuffle=False,
    num_workers=0
)

In [30]:
total = 0
correct = 0
with torch.no_grad():
    for data in testLoader:
        images, labels = data
        if use_gpu:
            images = images.cuda()
            labels = labels.cuda()
        outputs = net(images)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

print('Accuracy of the network on the 10000 test images: %d %%' % (
    100 * correct / total))

Accuracy of the network on the 10000 test images: 92 %
