<a href="https://colab.research.google.com/github/abursuc/practicals_hti_2019/blob/master/05_dropout_mnist_dldiy_hti_colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Dropout

Refer to the slides for more information on Dropout: https://abursuc.github.io/slides/2019_hti/dl_hti_5_deeper.html#33

In [0]:
# Enable the autoreload extension in mode 2 so that imported modules are
# reloaded before each cell runs and edits to local .py files are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [0]:
import numpy as np
import random

%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt

import torch
from torch.optim import lr_scheduler
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
import torchvision
from torchvision.datasets import MNIST
from torchvision import transforms


# 1. Setup and initializations
We'll analyse the role of Dropout on the MNIST dataset.


In [0]:
class ExperimentParams():
    """Container for the paths and hyper-parameters used throughout the notebook.

    Calling ``ExperimentParams()`` yields the reference configuration. Any
    attribute can be overridden by keyword, e.g.
    ``ExperimentParams(data_dir='./', num_epochs=5)``, which avoids editing the
    class when running on a machine where the default ``data_dir`` is not
    appropriate.

    Attributes:
        data_dir: root directory under which the dataset is stored/downloaded.
        num_classes: number of output classes of the classifier.
        device: 'cuda' when a GPU is visible to torch, otherwise 'cpu'.
        batch_size: mini-batch size used by the data loaders.
        num_epochs: number of training epochs.
        num_workers: number of worker processes used by the data loaders.
        lr: initial learning rate given to the optimizers.
        drop_prob1: dropout probability passed to the model as ``drop_prob1``.
        drop_prob2: dropout probability passed to the model as ``drop_prob2``.

    Raises:
        TypeError: if an override names an attribute that does not exist
            (this catches typos such as ``num_epoch=...``).
    """
    def __init__(self, **overrides):
        self.data_dir = '/home/docker_user/'
        self.num_classes = 10
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.batch_size = 256
        self.num_epochs = 20
        self.num_workers = 4
        self.lr = 1e-2

        self.drop_prob1 = 0.2
        self.drop_prob2 = 0.5

        # Apply user overrides last so they win over the defaults above.
        for name, value in overrides.items():
            if not hasattr(self, name):
                raise TypeError(
                    "ExperimentParams got an unexpected setting '{}'".format(name))
            setattr(self, name, value)

args = ExperimentParams()

## 1.1 Prepare dataset


In [4]:
# Constants handed to transforms.Normalize for the single image channel.
mean, std = 0.1307, 0.3081

# Both splits live under the same root and go through the same preprocessing:
# convert to a tensor, then normalise with (mean, std).
mnist_root = f'{args.data_dir}/data/MNIST'
mnist_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((mean,), (std,)),
])

train_dataset = MNIST(mnist_root, train=True, download=True, transform=mnist_transform)
test_dataset = MNIST(mnist_root, train=False, download=True, transform=mnist_transform)

0it [00:00, ?it/s]

Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz to /home/docker_user//data/MNIST/MNIST/raw/train-images-idx3-ubyte.gz


9920512it [00:02, 3642987.26it/s]                             


Extracting /home/docker_user//data/MNIST/MNIST/raw/train-images-idx3-ubyte.gz to /home/docker_user//data/MNIST/MNIST/raw


0it [00:00, ?it/s]

Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz to /home/docker_user//data/MNIST/MNIST/raw/train-labels-idx1-ubyte.gz


32768it [00:00, 57432.69it/s]                           
0it [00:00, ?it/s]

Extracting /home/docker_user//data/MNIST/MNIST/raw/train-labels-idx1-ubyte.gz to /home/docker_user//data/MNIST/MNIST/raw
Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz to /home/docker_user//data/MNIST/MNIST/raw/t10k-images-idx3-ubyte.gz


1654784it [00:01, 867470.19it/s]                             
0it [00:00, ?it/s]

Extracting /home/docker_user//data/MNIST/MNIST/raw/t10k-images-idx3-ubyte.gz to /home/docker_user//data/MNIST/MNIST/raw
Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz to /home/docker_user//data/MNIST/MNIST/raw/t10k-labels-idx1-ubyte.gz


8192it [00:00, 21670.04it/s]            

Extracting /home/docker_user//data/MNIST/MNIST/raw/t10k-labels-idx1-ubyte.gz to /home/docker_user//data/MNIST/MNIST/raw
Processing...
Done!





## 1.2 Common setup

In [0]:

# Human-readable class names: the ten digits as strings, indexed by label.
mnist_classes = [str(digit) for digit in range(10)]


def get_raw_images(dataloader,mean=0.1307, std=0.3081):
    """Collect every image served by `dataloader`, with the normalisation undone.

    Each batch is mapped back through ``x * std + mean`` and written into a
    pre-allocated array, in the order the loader yields the batches.

    Args:
        dataloader: iterable of ``(input, target)`` batches exposing a
            ``dataset`` attribute whose length is the total number of images.
        mean: value added back after rescaling.
        std: factor the normalised pixels are multiplied by.

    Returns:
        A numpy array of shape ``(len(dataloader.dataset), 1, 28, 28)``.
    """
    total = len(dataloader.dataset)
    raw_images = np.zeros((total, 1, 28, 28))

    start = 0
    for batch, _ in dataloader:
        stop = start + len(batch)
        denormalised = batch * std + mean
        raw_images[start:stop] = denormalised.data.cpu().numpy()
        start = stop

    return raw_images


def show(img, title=None):
    """Display an image tensor with matplotlib, without axes.

    Args:
        img: a torch.Tensor; its first axis is moved to the last position
            before plotting (axes reordered as (1, 2, 0)).
        title: optional figure title; omitted when None.
    """
    channels_last = img.numpy().transpose(1, 2, 0)
    plt.imshow(channels_last, interpolation='nearest')
    plt.axis('off')
    if title is not None:
        plt.title(title)
    # Brief pause so that the figure gets refreshed.
    plt.pause(0.001)

# 2. Playing with Dropout


## 2.1 Architecture

#### Exercise

Complete the missing blocks in the definition of the following `DropoutNet` architecture: (`Fully Connected 256 -> ReLU -> Dropout (0.2) -> Fully Connected 256 -> ReLU -> Dropout (0.5) -> Fully Connected 10`)

In [0]:
class DropoutNet(nn.Module):
    """Classifier skeleton for the dropout exercise (layers still to be filled in).

    Target architecture, as given in the exercise statement:
    Fully Connected 256 -> ReLU -> Dropout -> Fully Connected 256 -> ReLU
    -> Dropout -> Fully Connected 10.

    As written, `self.classifier` is an empty `nn.Sequential`, so the module
    holds no layers until the TODOs are completed.
    """
    def __init__(self, num_classes=10,drop_prob1=0.2, drop_prob2=0.5):
        """Build the classifier.

        Args:
            num_classes: presumably the size of the final fully connected
                layer (10 in the exercise statement) -- not used yet.
            drop_prob1: presumably the probability of the first Dropout layer
                (0.2 in the exercise statement) -- not used yet.
            drop_prob2: presumably the probability of the second Dropout layer
                (0.5 in the exercise statement) -- not used yet.
        """
        super(DropoutNet, self).__init__()
        self.classifier = nn.Sequential( 
                    #  TODO: add the layers listed in the exercise statement
                    )

    def forward(self, x):
        """Apply `self.classifier` to `x` and return the result."""
                # TODO: prepare `x` for the classifier (left as part of the exercise)
        return self.classifier(x)



## 2.2 Training

In [0]:
# Set up data loaders

kwargs = {'num_workers': args.num_workers, 'pin_memory': True}
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True, **kwargs)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=args.batch_size, shuffle=False, **kwargs)

# Model trained with dropout, with its own optimizer and LR schedule
# (learning rate multiplied by 0.1 every 8 scheduler steps).
model_dropout = DropoutNet(num_classes=args.num_classes, drop_prob1=args.drop_prob1, drop_prob2=args.drop_prob2)
optimizer_dropout = optim.Adam(model_dropout.parameters(), lr=args.lr)
scheduler_dropout = lr_scheduler.StepLR(optimizer_dropout, 8, gamma=0.1, last_epoch=-1)

# Baseline: same architecture with both dropout probabilities set to 0.
model_simple = DropoutNet(num_classes=args.num_classes, drop_prob1=0, drop_prob2=0)
optimizer_simple = optim.Adam(model_simple.parameters(), lr=args.lr)
# The baseline schedule must drive the baseline optimizer; attaching it to
# optimizer_dropout would leave the baseline learning rate constant and decay
# the dropout model's learning rate twice.
scheduler_simple = lr_scheduler.StepLR(optimizer_simple, 8, gamma=0.1, last_epoch=-1)

loss_fn = torch.nn.CrossEntropyLoss()

model_dropout.to(args.device)
model_simple.to(args.device)
loss_fn.to(args.device)

In [0]:
def train_classif_epoch(train_loader, model, loss_fn, optimizer, args, log_interval=100):
    """Train `model` for one pass over `train_loader`.

    Args:
        train_loader: iterable of ``(data, target)`` batches; must support
            ``len()`` and expose a ``dataset`` attribute (used for logging).
        model: the network to optimise; switched to training mode.
        loss_fn: callable ``loss_fn(outputs, target)`` returning a scalar loss.
        optimizer: optimizer holding the parameters of `model`.
        args: object with a ``device`` attribute that batches are moved to.
        log_interval: a progress line is printed every `log_interval` batches.

    Returns:
        ``(mean_loss, accuracy)``: the mean of the per-batch losses and the
        fraction of correctly classified samples, both computed over every
        batch of the epoch. Returns ``(0.0, 0.0)`` if the loader is empty.
    """
    model.train()
    total_loss, total_corrects, num_samples, num_batches = 0.0, 0, 0, 0
    # Losses seen since the last progress line; only used for logging.
    recent_losses = []
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(args.device), target.to(args.device)

        optimizer.zero_grad()
        outputs = model(data)
        loss = loss_fn(outputs, target)
        loss.backward()
        optimizer.step()

        # Accumulate statistics on every batch, so that batches after the
        # last logging step still count towards the returned loss/accuracy.
        batch_loss = loss.item()
        preds = outputs.detach().argmax(dim=1)
        total_corrects += (preds == target).sum().item()
        total_loss += batch_loss
        num_samples += data.size(0)
        num_batches += 1
        recent_losses.append(batch_loss)

        if batch_idx % log_interval == 0:
            print('Train: [{}/{} ({:.0f}%)]\tLoss: {:.6f} \tAccuracy: {}'.format(
                batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), np.mean(recent_losses),
                total_corrects / num_samples))
            recent_losses = []

    if num_batches == 0:
        # Nothing was processed; avoid dividing by zero.
        return 0.0, 0.0

    return total_loss / num_batches, total_corrects / num_samples

def test_classif_epoch(test_loader, model, loss_fn, args, log_interval=100):
    with torch.no_grad():
        model.eval()
        losses, corrects = [], 0
        num_samples = 0
        corrects = 0.
        for batch_idx, (data, target) in enumerate(test_loader):

            num_samples += data.size(0)
            data, target = data.to(args.device), target.to(args.device)

            outputs = model(data)

            loss = loss_fn(outputs, target)
            losses.append(loss.data.item())

            _,preds = torch.max(outputs.data,1)
            corrects += torch.sum(preds == target.data).cpu()

        accuracy = corrects.item()/num_samples
        return np.sum(losses)/(batch_idx + 1), accuracy

#### Training the baseline model for a while

In [0]:
start_epoch = 0

# When resuming from a later epoch, advance the schedule to where it would be.
for epoch in range(0, start_epoch):
    scheduler_simple.step()

train_losses_simple, val_losses_simple, val_accuracies_simple = [], [], []
for epoch in range(start_epoch, args.num_epochs):
    train_loss, train_accuracy = train_classif_epoch(train_loader, model_simple, loss_fn, optimizer_simple, args)

    message = 'Epoch: {}/{}. Train set: Average loss: {:.4f} Average accuracy: {:.4f}'.format(
        epoch + 1, args.num_epochs, train_loss, train_accuracy)

    val_loss, val_accuracy = test_classif_epoch(test_loader, model_simple, loss_fn, args)

    message += '\nEpoch: {}/{}. Validation set: Average loss: {:.4f}  Average accuracy: {:.4f}'.format(epoch + 1, args.num_epochs,
                                                                             val_loss, val_accuracy)
    print(message)
    train_losses_simple.append(train_loss)
    val_losses_simple.append(val_loss)
    val_accuracies_simple.append(val_accuracy)

    # Step the schedule only after this epoch's optimizer updates: stepping
    # first skips the initial learning-rate value (PyTorch >= 1.1).
    scheduler_simple.step()

#### Training the Dropout variant

In [0]:
start_epoch = 0

# When resuming from a later epoch, advance the schedule to where it would be.
for epoch in range(0, start_epoch):
    scheduler_dropout.step()

train_losses, val_losses, val_accuracies = [], [], []
for epoch in range(start_epoch, args.num_epochs):
    train_loss, train_accuracy = train_classif_epoch(train_loader, model_dropout, loss_fn, optimizer_dropout, args)

    message = 'Epoch: {}/{}. Train set: Average loss: {:.4f} Average accuracy: {:.4f}'.format(
        epoch + 1, args.num_epochs, train_loss, train_accuracy)

    val_loss, val_accuracy = test_classif_epoch(test_loader, model_dropout, loss_fn, args)

    message += '\nEpoch: {}/{}. Validation set: Average loss: {:.4f}  Average accuracy: {:.4f}'.format(epoch + 1, args.num_epochs,
                                                                             val_loss, val_accuracy)
    print(message)
    train_losses.append(train_loss)
    val_losses.append(val_loss)
    val_accuracies.append(val_accuracy)

    # Step the schedule only after this epoch's optimizer updates: stepping
    # first skips the initial learning-rate value (PyTorch >= 1.1).
    scheduler_dropout.step()



#### Plot results

In [0]:
def plot_comparison(baseline_values, dropout_values, ylabel, title):
    """Plot one per-epoch metric for the baseline and the dropout model.

    Each call draws on its own figure, so the three comparisons do not
    depend on pyplot's "current axes" state.

    Args:
        baseline_values: per-epoch values recorded for the model without dropout.
        dropout_values: per-epoch values recorded for the model with dropout.
        ylabel: label of the y axis.
        title: figure title.
    """
    fig, ax = plt.subplots()
    # The x axis follows the number of recorded values, so the plot stays
    # valid if fewer than args.num_epochs epochs were run.
    ax.plot(np.arange(len(baseline_values)), baseline_values, 'orange', lw=3, label='no dropout')
    ax.plot(np.arange(len(dropout_values)), dropout_values, 'green', lw=3, label='with dropout')
    ax.legend(loc='upper left')
    ax.set_xlabel('epoch')
    ax.set_ylabel(ylabel)
    ax.set_title(title)
    ax.grid(True)
    plt.show()


plot_comparison(train_losses_simple, train_losses, 'train loss', 'Train loss')
plot_comparison(val_losses_simple, val_losses, 'validation loss', 'Validation loss')
plot_comparison(val_accuracies_simple, val_accuracies, 'validation accuracy', 'Validation accuracy')


# 3. Exercises

1. Try out what happens if you change the dropout probabilities for layers 1 and 2. In particular, what happens if you switch the ones for both layers?
2. Increase the number of epochs and compare the results obtained when using dropout with those when not using it.
3. If changes are made to the model to make it more complex, such as adding hidden layer units, will the effect of using dropout to cope with overfitting be more obvious?
4. What happens if we apply dropout to the individual weights of the weight matrix rather than the activations?
