# Basics on how to build a simple Neural Network

## 0. Imports

In [70]:
import torch
import torch.nn as nn # Network Modules
import torch.optim as optim # Gradient Descent, SGD, Adam, ...
import torch.nn.functional as F # Activation functions

# The Data Loader gives us easier data set management
# allowing us to create mini batches and this kind of things easily
from torch.utils.data import DataLoader

# Datasets from torchvision: https://pytorch.org/vision/stable/datasets.html
import torchvision.datasets as datasets

# Transformations to perform on our data set (for data augmentation, for example)
import torchvision.transforms as transforms

# Already implemented & pre-trained models from torchvsion: https://pytorch.org/vision/stable/models.html
import torchvision.models

from tqdm import tqdm # progress bar

## 1. Create a Fully Connected Network

Model of the neural network:

In [43]:
class NN(nn.Module):
    def __init__(self, input_size, num_classes):
        # call the initialization of the nn.Module
        super(NN, self).__init__()

        # create here the NN modules that are going to be used
        self.fc1 = nn.Linear(input_size, 50)
        self.fc2 = nn.Linear(50, num_classes)
    
    def forward(self, x):
        # assembly the modules that participate on the forward propagation part
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x

To import the CNN:

In [56]:
from models.SimpleCNN import CNN

# # to make sure it runs correctly (should output torch.Size([64, 10])):
# model = CNN()
# x = torch.randn(64, 1, 28, 28)
# print(model(x).shape)

## 2. Set Device

In [57]:
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

## 3. Hyperparameters

In [58]:
INPUT_SIZE = 784
INPUT_CHANNELS = 1
NUM_CLASSES = 10
LEARNING_RATE = 0.001
BATCH_SIZE = 64
NUM_EPOCHS = 3
LOAD_MODEL = True
CHECKPOINT_NAME = "checkpoints/my_checkpoint.pth.tar"

## 4. Load Data

- `root`: Where the dataset is going to be downloaded.
- `train`: If True: download the training set. If False: Download the test set.
- `transform`: Transformations to perform on the dataset (from NumPy to Tensor to be run on PyTorch).

In [59]:
train_ds = datasets.MNIST(root='data', train=True, transform=transforms.ToTensor(), download=True)
test_ds = datasets.MNIST(root='data', train=False, transform=transforms.ToTensor(), download=True)

⚠️ Be careful not to shuffle the data if it has to follow an specific order, like in some NLP cases.

In [60]:
train_loader = DataLoader(dataset=train_ds, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(dataset=test_ds, batch_size=BATCH_SIZE, shuffle=True)

## 5. Initialize network

> To choose the model, just decomment it and comment the rest

Simple neural network (NN):

In [61]:
# model = NN(input_size=INPUT_SIZE, num_classes=NUM_CLASSES).to(DEVICE)

Convolutional neural network (CNN):

In [62]:
model = CNN(in_channels=INPUT_CHANNELS, num_classes=NUM_CLASSES).to(DEVICE)

VGG16:

Initial model summary:
```
VGG(
  (features): Sequential(
    (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU(inplace=True)
    (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (3): ReLU(inplace=True)
    (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    ...
    (28): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (29): ReLU(inplace=True)
    (30): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (avgpool): AdaptiveAvgPool2d(output_size=(7, 7))
  (classifier): Sequential(
    (0): Linear(in_features=25088, out_features=4096, bias=True)
    (1): ReLU(inplace=True)
    (2): Dropout(p=0.5, inplace=False)
    (3): Linear(in_features=4096, out_features=4096, bias=True)
    (4): ReLU(inplace=True)
    (5): Dropout(p=0.5, inplace=False)
    (6): Linear(in_features=4096, out_features=1000, bias=True)
  )
)

```

We don't want to perform any operation as avgpool. Therefore, we're going to create an Identity module that will leave the input as it is:

In [74]:
class Identity(nn.Module):
    def __init__(self):
        super(Identity, self).__init__()
    
    def forward(self, x):
        return x

We also don't want the classifier part to have an output of 1000 features, so we are gonna say that the classifier is just a Linear module with an output of 10:

In [79]:
def load_vgg16_model():
    model = torchvision.models.vgg16(pretrained=True)

    # We just want to perform backpropagation on the last layers. Therefore,
    # we're going to deactivate the grad of the parameters until now.
    # This will make the traning much more faster as it will only train the new
    # added layers!
    for param in model.parameters():
        param.requires_grad = False

    model.avgpool = Identity()
    # if we look at line 28 of the summary, we can see that there are 512 output_channels
    model.classifier = nn.Sequential(nn.Linear(512, 100),
                                    nn.ReLU(),
                                    nn.Linear(100, NUM_CLASSES))
    return model
    
# model = load_vgg16_model()

# print(model)        # model summary

Summary:
```
VGG(
  (features): Sequential(
    (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU(inplace=True)
    (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (3): ReLU(inplace=True)
    (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    ...
    (28): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (29): ReLU(inplace=True)
    (30): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (avgpool): Identity()
  (classifier): Sequential(
    (0): Linear(in_features=512, out_features=100, bias=True)
    (1): ReLU()
    (2): Linear(in_features=100, out_features=10, bias=True)
  )
)
```

## 6. Loss & Optimizer

In [63]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

## 7. Checkpoints & Model Loading

In [64]:
def save_checkpoint(state, filename="checkpoints/my_checkpoint.pth.tar"):
    print("=> Saving checkpoint")
    torch.save(state, filename)

def load_checkpoint(checkpoint):
    print("=> Loading checkpoint")
    model.load_state_dict(checkpoint['state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer'])

## 8. Train Network

In [69]:
if LOAD_MODEL:
    try:
        load_checkpoint(torch.load(CHECKPOINT_NAME))
    except:
        raise FileNotFoundError("No previous checkpoints were found.")

print("Checkpoint has been loaded correctly!")

=> Loading checkpoint


In [None]:
def reshape_if_simple_nn(data):
    if isinstance(model, NN):
        # Get to correct shape for the simple neural network: [64, 1, 28, 28] -> [64, 784]
        # - The Linear layer expects one input per neuron, therefore,
        #   we cannot introduce an array per neuron. We've first to convert it to only 1 value.
        data = data.reshape(data.shape[0], -1) # -1 flatten all the following layers
    return data

In [66]:
for epoch in range(NUM_EPOCHS):
    losses = []
    loop = tqdm(enumerate(train_loader), total=len(train_loader), leave=False)

    if epoch % 2 == 0: # save a checkpoint every two epochs
        checkpoint = {'state_dict': model.state_dict(), 'optimizer': optimizer.state_dict()}
        save_checkpoint(checkpoint, CHECKPOINT_NAME)

    for batch_idx, (data, targets) in loop:
        # Carry data to CUDA if possible
        data = data.to(device=DEVICE)
        targets = targets.to(device=DEVICE)

        data = reshape_if_simple_nn(data)

        ### Forward ###
        scores = model(data)
        loss = criterion(scores, targets)
        losses.append(loss.item())

        ### Backward ###

        # For each batch, set all the gradients to 0 to avoid using previous gradients
        # on a new batch and run through new problems
        optimizer.zero_grad()
        loss.backward()

        # perform the optimization
        optimizer.step()

        # update progress bar
        loop.set_description(f"Epoch [{epoch}/{NUM_EPOCHS}]")
        loop.set_postfix(loss = loss.item())
    
    mean_loss = sum(losses) / len(losses)
    print(f"Loss at epoch {epoch} was {mean_loss:.5f}")

=> Saving checkpoint
Loss at epoch 0 was 0.25962
Loss at epoch 1 was 0.07248
=> Saving checkpoint
Loss at epoch 2 was 0.05244


## 9. Accuracy & Test

In [67]:
def check_accuracy(loader, model):
    dataset_type = "training" if loader.dataset.train else 'test'
    print(f"Checking accuracy on {dataset_type} data")

    num_correct = 0
    num_samples = 0
    model.eval() # in other cases, it'll disable dropout and this kind of layers

    # with torch.no_grad() we avoid computing the gradients in the calculations
    with torch.no_grad():
        for x, y in loader:
            x = x.to(device=DEVICE)
            y = y.to(device=DEVICE)

            x = reshape_if_simple_nn(x)

            scores = model(x)
            # Remember we said that the output shape is gonna be nn.Linear(50, 10)
            # We want to take the greatest value, so just apply argmax
            predictions = scores.argmax(dim=1)

            # Remember, x, predictions & y are batches of 64 elements.
            # if we perform (predictions == y), we'll obtain a tensor like the following one:
            # tensor([True, False, True, True]).sum() = 4
            num_correct += (predictions == y).sum()
            num_samples += predictions.size(0)
        
        acc = (float(num_correct) / float(num_samples)) * 100
        print(f"Got {num_correct} / {num_samples} with accuracy {acc:.2f}")
    
    model.train() # to remove the model.eval() part
    return acc

In [68]:
check_accuracy(train_loader, model)
check_accuracy(test_loader, model)

Checking accuracy on training data
Got 59258 / 60000 with accuracy 98.76
Checking accuracy on test data
Got 9854 / 10000 with accuracy 98.54


98.54