In [0]:
import torch
import torchvision
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision.transforms as transforms
from torchvision import datasets

from torchsummary import summary

import matplotlib.pyplot as plt
import numpy as np

In [18]:
transforms = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5,0.5,0.5),(0.5,0.5,0.5))
])

batch_size = 4

trainset = torchvision.datasets.CIFAR10(root='./data', train=True,
                                     download=True, transform = transforms)
trainloader = torch.utils.data.DataLoader(trainset,
                                          batch_size=batch_size,
                                          shuffle = True,
                                          num_workers=2
                                         )

testset = torchvision.datasets.CIFAR10(root='./data', train=False,
                                    download = True, transform=transforms)
testloader = torch.utils.data.DataLoader(testset, 
                                         batch_size=batch_size,
                                         shuffle=False, 
                                         num_workers=2
                                        )

Files already downloaded and verified
Files already downloaded and verified


# ResNet 34

<p>
        <img src="https://neurohive.io/wp-content/uploads/2019/01/resnet-e1548261477164.png" alt="dominating_sets_example2"/>
        <br>
        <em>Fig. 1: ResNet Building Block</em>
</p>

According to paper, a building block is defined as:

$$ y = F(x, \{W_i\}) + x $$ 

Here $x$ and $y$ are the input and output vectors of the layers, $i$, considered. The function $F(x, \{W_i\})$ represents the residual mapping. 
The operation $F + x$ is performed by a shortcut connection and element-wise addition. After addition, nonlinearity (ReLU) is adopted.

> Implemented this in class BasicBlock, line $24$.

In [0]:
class BasicBlock(nn.Module):
  def __init__(self, in_channels, out_channels, stride=1, downsample = None):
    super(BasicBlock, self).__init__()
    
    self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=stride, padding=1)
    self.bn1 = nn.BatchNorm2d(out_channels)
    self.relu = nn.ReLU()
    
    self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1)
    self.bn2 = nn.BatchNorm2d(out_channels)
    self.downsample = downsample
    
  def forward(self, x):
    residual = x
    
    out = self.conv1(x)
    out = self.bn1(out)
    out = self.relu(out)
    
    out = self.conv2(out)
    out = self.bn2(out)
    
    if self.downsample: # Option B is chosen according to paper.
      residual = self.downsample(residual)
    
#   y = F(x,W_i) + x
    out += residual
    out = self.relu(out)
    
    return out  

## Note

According to the paper, when the dimensions increases, two options are considered:

1.  Adding extra entries by using zero padding.
2.  The projection shortcut, $x$, is used to match dimensions. So it is downsampled by using $1 \times 1$ convolution. 

> I chose option $2$. Code Line $27$ in Resnet class.


For both options, when the shortcuts go across feature maps of two sizes (Let's say from $64 \rightarrow 128$) they are performed with stride $2$.



In [20]:
class Resnet(nn.Module):
  def __init__(self, block, num_blocks, num_classes=10):
    super(Resnet, self).__init__()
    
    self.in_channels = 64
    
#     1st layer has 7x7 conv with stride 2. But CIFAR10 images are low resolution. So that's why I have used kernel_size = 3. 
#   Another way is to use resnet18. So the num_blocks will be [2,2,2,2].
    self.conv1 = nn.Conv2d(3, 64, kernel_size = 3, stride = 1, padding = 1)
    self.bn1 = nn.BatchNorm2d(64)
    self.relu = nn.ReLU()
    self.maxpool = nn.MaxPool2d(kernel_size = 3, stride = 1, padding = 1)
    
    self.layers1 = self._make_layer(block, 64, num_blocks[0], stride = 1)
    self.layers2 = self._make_layer(block, 128, num_blocks[1], stride = 2) # Why stride = 2? details below
    self.layers3 = self._make_layer(block, 256, num_blocks[2], stride = 2)
    self.layers4 = self._make_layer(block, 512, num_blocks[3], stride = 2)
    
#    before the fc layer avg pool is used and the output size is converted into (1x1)
    self.avgpool = nn.AvgPool2d(kernel_size = 4, stride = 1)
    
    self.fc = nn.Linear(512, num_classes)
    
  def _make_layer(self, block, out_channels, num_blocks, stride=1):
      downsample = None
      
#       According to paper, Option B in Section 3.3, the shortcut is done by 1x1 conv
#     When shortcuts go to feature maps of two sizes (64->128 / 128->256/ 256->512) stride = 2 is used
      if stride != 1 or self.in_channels != out_channels:
        downsample = nn.Sequential(
          nn.Conv2d(self.in_channels, out_channels, kernel_size = 1, stride=stride),
          nn.BatchNorm2d(out_channels)
        )
       
      netlayers = []
      
      netlayers.append(block(self.in_channels, out_channels, stride = stride, downsample = downsample))
      
      self.in_channels = out_channels
      
      for _ in range(1, num_blocks):
        netlayers.append(block(self.in_channels, out_channels))
        self.in_channels = out_channels
        
      return nn.Sequential(*netlayers)
  
  def forward(self, x):
      x = self.relu(self.bn1(self.conv1(x)))
      x = self.maxpool(x)
      
      x = self.layers1(x)
#       print("1.",x.shape)
      x = self.layers2(x)
#       print(x.shape)
      x = self.layers3(x)
#       print(x.shape)
      x = self.layers4(x)
#       print(x.shape)
      
      x = self.avgpool(x) # In paper, Avg pooling is used at the end
      x = torch.flatten(x, 1)
      x = self.fc(x)
      
      return x
    
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
#  according to paper, 3*2 + 4*2 + 6*2 + 3*3 = 34 layers in residual34 
model = Resnet(BasicBlock, num_blocks=[3, 4, 6, 3]).to(device)
img, label = trainset.__getitem__(0)
summary(model, input_size=img.size())

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1           [-1, 64, 32, 32]           1,792
       BatchNorm2d-2           [-1, 64, 32, 32]             128
              ReLU-3           [-1, 64, 32, 32]               0
         MaxPool2d-4           [-1, 64, 32, 32]               0
            Conv2d-5           [-1, 64, 32, 32]          36,928
       BatchNorm2d-6           [-1, 64, 32, 32]             128
              ReLU-7           [-1, 64, 32, 32]               0
            Conv2d-8           [-1, 64, 32, 32]          36,928
       BatchNorm2d-9           [-1, 64, 32, 32]             128
             ReLU-10           [-1, 64, 32, 32]               0
       BasicBlock-11           [-1, 64, 32, 32]               0
           Conv2d-12           [-1, 64, 32, 32]          36,928
      BatchNorm2d-13           [-1, 64, 32, 32]             128
             ReLU-14           [-1, 64,

In [0]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [22]:
EPOCHS = 2
print_every = 2000

for epoch in range(EPOCHS):
  
  running_loss = 0.0
  
  for iters, data in enumerate(trainloader, 0):
    inputs, labels = data[0].to(device), data[1].to(device) 
    
#     make gradients parameter zero
    optimizer.zero_grad()
    
#     forward propagation
    outputs = model(inputs)
#   calculate loss
    loss = criterion(outputs, labels)
#   do backward propagation
    loss.backward()
#   update weights
    optimizer.step()
   
#     print result
    running_loss += loss.item()
    if iters % print_every == 1999:
      print("epoch: %d, iterations: %5d, loss: %.3f" % 
           (epoch + 1, iters + 1, running_loss / print_every))
      
      running_loss = 0.0

print("Finished Training")

epoch: 1, iterations:  2000, loss: 2.033
epoch: 1, iterations:  4000, loss: 1.791
epoch: 1, iterations:  6000, loss: 1.596
epoch: 1, iterations:  8000, loss: 1.441
epoch: 1, iterations: 10000, loss: 1.306
epoch: 1, iterations: 12000, loss: 1.183
epoch: 2, iterations:  2000, loss: 1.051
epoch: 2, iterations:  4000, loss: 1.006
epoch: 2, iterations:  6000, loss: 0.909
epoch: 2, iterations:  8000, loss: 0.891
epoch: 2, iterations: 10000, loss: 0.848
epoch: 2, iterations: 12000, loss: 0.810
Finished Training


In [23]:
correct1 = 0
total = 0

with torch.no_grad():
  for data in testloader:
    images, labels = data[0].to(device), data[1].to(device)
    output1 = model(images)
    
    _, predicted = torch.max(output1.data, 1)
    
    total += labels.size(0)
    
    correct1 += (predicted == labels).sum().item()
    
print("Accuracy of the network: %2.3f %%" % (100 * correct1 / total))

Accuracy of the network: 72.290 %
