## 0. Setup

In [19]:
# Import dependencies
import sys
sys.path.append("notebooks")

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import random
import numpy as np
from torch.utils.data import DataLoader
from torchvision import datasets, transforms

In [20]:
# Pick the compute device: the first CUDA GPU when one is available, otherwise the CPU.
cuda = torch.cuda.is_available()
device = torch.device("cuda:0") if cuda else torch.device("cpu")

In [21]:
# Fix the random seed at 1008. Do not change the random seed.
# Python, NumPy and PyTorch each keep their own generator state, so every one
# of them has to be seeded for experiments to be repeatable.
seed = 1008

# CPU-side generators
torch.manual_seed(seed)
np.random.seed(seed)
random.seed(seed)

# GPU-side generators, plus cuDNN settings that trade speed for determinism
if cuda:
    torch.cuda.manual_seed_all(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

## 1. Data: MNIST
#### Load the MNIST training and test datasets using $\texttt{torch.utils.data.DataLoader}$ and $\texttt{torchvision.datasets}$. 

Hint: You might find Alf's notebook useful: https://github.com/Atcold/pytorch-Deep-Learning/blob/master/06-convnet.ipynb, or see some of the PyTorch tutorials.

### 1.1. Load Training Set [4 pts]

In [22]:
# Problem dimensions and the mini-batch size used by the data loaders.
input_size = 28 * 28  # images are 28x28 pixels (784 values when flattened)
output_size = 10      # there are 10 classes
batch_size = 128

In [23]:
# Training split of MNIST, stored under 'data/' and downloaded when missing.
# Batches are reshuffled on every pass over the data.
train_loader = DataLoader(
    dataset=datasets.MNIST(
        root='data/',
        train=True,
        download=True,
        transform=transforms.Compose([
            # Convert the image to a tensor, then standardize it with the
            # given per-channel mean and standard deviation.
            transforms.ToTensor(),
            transforms.Normalize(mean=(0.1307,), std=(0.3081,)),
        ]),
    ),
    batch_size=batch_size,
    shuffle=True,
)

### 1.2. Load Test Set [4 pts]

In [24]:
# Load the MNIST test set with batch size 128, apply data shuffling and normalization
test_loader = torch.utils.data.DataLoader(
    datasets.MNIST(
        'data/', 
        train=False, 
        transform=transforms.Compose([
                transforms.ToTensor(),
                transforms.Normalize((0.1307,), (0.3081,))
        ])
    ),
    batch_size=batch_size,
    shuffle=True
)

## 2. Models
#### You are going to define two convolutional neural networks which are trained to classify MNIST digits

### 2.1. CNN without Batch Norm [5 pts]

In [25]:
# Side length after a 5x5 convolution on a 28x28 input (no padding, stride 1):
# (W_in + 2*p - k) // s + 1
(28 + 2 * 0 - 5) // 1 + 1

24

In [26]:
# Layer sizes that make the networks below valid for MNIST data

conv1_in_ch = 1                # input images have a single channel
conv2_in_ch = 20               # must equal conv1's out_channels
fc1_in_features = 50 * 4 * 4   # flattened conv2 output after pooling: 50 channels of 4x4
fc2_in_features = 500          # must equal fc1's out_features
n_classes = output_size

## Tamaño de las capas:
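($W$ es el tamaño de una dimensión espacial cualquiera (h, w, d); $p$ es el padding, $k$ el tamaño del kernel y $s$ el stride)
- Después de MaxPool o Conv:
$$W_o=\left\lfloor\frac{W_i+2p-k}{s}\right\rfloor + 1 $$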
- Después de BatchNorm
$$W_o=W_i$$
Ver el [artículo](https://medium.com/mlreview/a-guide-to-receptive-field-arithmetic-for-convolutional-neural-networks-e0f514068807) o el [paper](https://arxiv.org/pdf/1603.07285.pdf)

In [27]:
class NetWithoutBatchNorm(nn.Module):
    """Two-convolution CNN classifier without batch normalization.

    Pipeline: conv 5x5 -> ReLU -> max-pool 2x2 -> conv 5x5 -> ReLU ->
    max-pool 2x2 -> flatten -> linear -> ReLU -> linear -> log-softmax.

    Layer sizes come from the module-level constants ``conv1_in_ch``,
    ``conv2_in_ch``, ``fc1_in_features``, ``fc2_in_features`` and ``n_classes``.
    """

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels=conv1_in_ch, out_channels=20, kernel_size=5, stride=1)
        self.conv2 = nn.Conv2d(in_channels=conv2_in_ch, out_channels=50, kernel_size=5, stride=1)
        self.fc1 = nn.Linear(in_features=fc1_in_features, out_features=500)
        self.fc2 = nn.Linear(in_features=fc2_in_features, out_features=n_classes)

    def forward(self, x):
        """Return per-class log-probabilities for a batch of images.

        Args:
            x: tensor laid out as N x C x H x W.

        Returns:
            Tensor with one row per sample and ``n_classes`` columns holding
            log-probabilities (suitable for ``F.nll_loss``).
        """
        # Shape comments omit the batch dimension N and assume 1x28x28 inputs.
        x = F.relu(self.conv1(x))                     # 1x28x28  -> 20x24x24
        x = F.max_pool2d(x, kernel_size=2, stride=2)  # 20x24x24 -> 20x12x12
        x = F.relu(self.conv2(x))                     # 20x12x12 -> 50x8x8
        x = F.max_pool2d(x, kernel_size=2, stride=2)  # 50x8x8   -> 50x4x4
        x = x.view(-1, fc1_in_features)               # 50x4x4   -> 800
        x = F.relu(self.fc1(x))                       # 800 -> 500
        x = self.fc2(x)                               # 500 -> 10
        # Normalize over the class axis explicitly: omitting `dim` is deprecated
        # and makes PyTorch emit a warning on every forward pass.
        return F.log_softmax(x, dim=1)

### 2.2. CNN with Batch Norm [5 pts]

In [28]:
# Number of features normalized by each batch-norm layer. Each value has to
# match the output size of the layer the batch norm is applied to.

conv1_bn_size = 20   # conv1 out_channels
conv2_bn_size = 50   # conv2 out_channels
fc1_bn_size = 500    # fc1 out_features

In [29]:
# Define the CNN with architecture explained in Part 2.2
class NetWithBatchNorm(nn.Module):
    """Two-convolution CNN classifier with batch normalization.

    Same layers as the network without batch norm, with a batch-norm layer
    inserted between each of conv1, conv2 and fc1 and its ReLU activation.

    Layer sizes come from the module-level constants ``conv1_in_ch``,
    ``conv2_in_ch``, ``fc1_in_features``, ``fc2_in_features``, ``n_classes``,
    ``conv1_bn_size``, ``conv2_bn_size`` and ``fc1_bn_size``.
    """

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels=conv1_in_ch, out_channels=20, kernel_size=5, stride=1)
        self.conv1_bn = nn.BatchNorm2d(conv1_bn_size)
        self.conv2 = nn.Conv2d(in_channels=conv2_in_ch, out_channels=50, kernel_size=5, stride=1)
        self.conv2_bn = nn.BatchNorm2d(conv2_bn_size)
        self.fc1 = nn.Linear(in_features=fc1_in_features, out_features=500)
        self.fc1_bn = nn.BatchNorm1d(fc1_bn_size)
        self.fc2 = nn.Linear(in_features=fc2_in_features, out_features=n_classes)

    def forward(self, x):
        """Return per-class log-probabilities for a batch of images.

        Args:
            x: tensor laid out as N x C x H x W.

        Returns:
            Tensor with one row per sample and ``n_classes`` columns holding
            log-probabilities (suitable for ``F.nll_loss``).
        """
        # Batch norm is applied to the raw layer output, before the ReLU.
        x = F.relu(self.conv1_bn(self.conv1(x)))
        x = F.max_pool2d(x, kernel_size=2, stride=2)
        x = F.relu(self.conv2_bn(self.conv2(x)))
        x = F.max_pool2d(x, kernel_size=2, stride=2)
        # Flatten every sample into a vector for the fully connected layers.
        x = x.view(-1, fc1_in_features)
        x = F.relu(self.fc1_bn(self.fc1(x)))
        x = self.fc2(x)
        # Normalize over the class axis explicitly: omitting `dim` is deprecated
        # and makes PyTorch emit a warning on every forward pass.
        return F.log_softmax(x, dim=1)

## 3. Training & Evaluation

### 3.1. Define training method [10 pts]

In [35]:
def train(model, device, train_loader, optimizer, epoch, log_interval = 100):
    """Run one pass over ``train_loader``, updating ``model`` after every batch.

    Args:
        model: network to optimize; its output is fed to ``F.nll_loss``, so it
            is expected to produce log-probabilities.
        device: device every batch is moved to before the forward pass.
        train_loader: iterable of ``(data, target)`` batches that supports
            ``len()`` and exposes a ``dataset`` attribute (both are used for
            the progress line).
        optimizer: optimizer whose ``zero_grad()`` and ``step()`` are called
            once per batch.
        epoch: epoch number, used only in the progress line.
        log_interval: a progress line is printed for every batch whose index
            is a multiple of this value, starting with batch 0.
    """
    # Put the model in training mode
    model.train()

    for batch_idx, (inputs, labels) in enumerate(train_loader):
        inputs, labels = inputs.to(device), labels.to(device)

        # Clear the gradients left over from the previous batch
        optimizer.zero_grad()

        # Forward pass and negative log likelihood loss
        loss = F.nll_loss(model(inputs), labels)

        # Backward pass followed by the parameter update
        loss.backward()
        optimizer.step()

        if batch_idx % log_interval != 0:
            continue

        # Progress is estimated from the batch index and the current batch size
        samples_seen = batch_idx * len(inputs)
        percent_done = 100. * batch_idx / len(train_loader)
        print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
            epoch, samples_seen, len(train_loader.dataset),
            percent_done, loss.item()))

### 3.2. Define test method [10 pts]

In [45]:
# Define test method
def test(model, device, test_loader):
    # Set model to evaluation mode
    model.eval()
    # Variable for the total loss 
    test_loss = 0
    # Counter for the correct predictions
    num_correct = 0
    
    # don't need autograd for eval
    with torch.no_grad():
        # Loop through data points
        for data, target in test_loader:
        
            # Send data to device
            data=data.to(device)
            target=target.to(device)
            
            # Pass data through model
            Y=model(data)
            
            # Compute the negative log likelihood loss with reduction='sum' and add to total test_loss
            test_loss+=F.nll_loss(Y,target,reduction="sum")
            
            # Get predictions from the model for each data point
            pred = Y.data.max(1, keepdim=True)[1]     

            # Add number of correct predictions to total num_correct 
            num_correct += pred.eq(target.data.view_as(pred)).cpu().sum().item()
    
    # Compute the average test_loss
    avg_test_loss = test_loss/len(test_loader.dataset)
    
    # Print loss (uncomment lines below once implemented)
    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        avg_test_loss, num_correct, len(test_loader.dataset),
        100. * num_correct / len(test_loader.dataset)))

### 3.3 Train NetWithoutBatchNorm() [5 pts]

In [46]:
# Deifne model and sent to device
# Build the model and move it to the selected device
model = NetWithoutBatchNorm().to(device)

# Optimizer: SGD with learning rate of 1e-2 and momentum of 0.5
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.5)

# Ten epochs: one pass over the training data, then an evaluation on the test set
for epoch in range(1, 11):
    train(model, device, train_loader, optimizer, epoch)
    test(model, device, test_loader)


Test set: Average loss: 0.1527, Accuracy: 9544/10000 (95%)


Test set: Average loss: 0.0948, Accuracy: 9692/10000 (97%)


Test set: Average loss: 0.0813, Accuracy: 9745/10000 (97%)


Test set: Average loss: 0.0573, Accuracy: 9815/10000 (98%)


Test set: Average loss: 0.0525, Accuracy: 9829/10000 (98%)


Test set: Average loss: 0.0462, Accuracy: 9854/10000 (99%)


Test set: Average loss: 0.0434, Accuracy: 9860/10000 (99%)


Test set: Average loss: 0.0377, Accuracy: 9886/10000 (99%)


Test set: Average loss: 0.0392, Accuracy: 9867/10000 (99%)


Test set: Average loss: 0.0384, Accuracy: 9874/10000 (99%)



### 3.4 Train NetWithBatchNorm() [5 pts]

In [48]:
# Deifne model and sent to device
# Build the model and move it to the selected device
model = NetWithBatchNorm().to(device)

# Optimizer: SGD with learning rate of 1e-2 and momentum of 0.5
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.5)

# Ten epochs: one pass over the training data, then an evaluation on the test set
for epoch in range(1, 11):
    train(model, device, train_loader, optimizer, epoch)
    test(model, device, test_loader)


Test set: Average loss: 0.0877, Accuracy: 9775/10000 (98%)


Test set: Average loss: 0.0608, Accuracy: 9844/10000 (98%)


Test set: Average loss: 0.0497, Accuracy: 9861/10000 (99%)


Test set: Average loss: 0.0417, Accuracy: 9871/10000 (99%)


Test set: Average loss: 0.0364, Accuracy: 9884/10000 (99%)


Test set: Average loss: 0.0335, Accuracy: 9901/10000 (99%)


Test set: Average loss: 0.0324, Accuracy: 9898/10000 (99%)


Test set: Average loss: 0.0325, Accuracy: 9895/10000 (99%)


Test set: Average loss: 0.0301, Accuracy: 9914/10000 (99%)


Test set: Average loss: 0.0288, Accuracy: 9910/10000 (99%)



## 4. Empirically, which of the models achieves higher accuracy faster? [2 pts]

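Answer: The network with batch normalization. After the first epoch it already reaches 97.75% test accuracy versus 95.44% for the network without batch norm, it passes 98.5% at epoch 3 instead of epoch 6, and it finishes at 99.10% versus 98.74% after 10 epochs.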