## Implementing a ResNet

In [1]:
import time

import torch
import torchvision
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import transforms

from tqdm.notebook import tqdm

### Loading the Data

In [2]:
batch_size = 16
data_root = './data/cifar10'
transform = transforms.Compose([
    transforms.ToTensor(),
    # scales pixel values to range [-1, 1]
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),                                      
])
train_size = 40_000
val_size = 10_000

In [3]:
dataset = torchvision.datasets.CIFAR10(
    root=data_root, 
    train=True, 
    download=True,
    transform=transform,
)
assert train_size + val_size <= len(dataset), "Trying to sample too many elements!" \
    "Please lower the train or validation set sizes."
train_set, val_set, _ = torch.utils.data.random_split(
    dataset, [train_size, val_size, len(dataset) - train_size - val_size]
)
test_set = torchvision.datasets.CIFAR10(
    root=data_root, 
    train=False, 
    download=True,
    transform=transform,
)
classes = (
    'plane', 'car', 'bird', 'cat', 'deer', 
    'dog', 'frog', 'horse', 'ship', 'truck',
)

Files already downloaded and verified
Files already downloaded and verified


In [4]:
train_loader = torch.utils.data.DataLoader(
    train_set,
    batch_size=batch_size,
    shuffle=True,
    num_workers=2,
)
val_loader = torch.utils.data.DataLoader(
    val_set,
    batch_size=batch_size,
    shuffle=True,
    num_workers=2,
)
test_loader = torch.utils.data.DataLoader(
    test_set,
    batch_size=batch_size,
    shuffle=True,
    num_workers=2,
)

In [5]:
import numpy as np
from matplotlib import pyplot as plt
plt.rcParams['figure.figsize'] = [7.5, 7.5]

def show_image(img):
    img = img / 2 + 0.5
    img = img.numpy()
    img = img.transpose([1, 2, 0])
    plt.imshow(img)
    plt.show()




In [6]:
images, labels = next(iter(train_loader))
nrow = 4
# print('showing training images')
# show_image(torchvision.utils.make_grid(images, nrow=nrow))
# print('\n... and their corresponding labels')

In [7]:
for idxs in np.array_split(np.arange(batch_size), nrow):
    print(' '.join(f'{classes[labels[j]]:8s}' for j in idxs))

ship     car      dog      truck   
plane    frog     bird     truck   
horse    horse    dog      horse   
bird     dog      ship     truck   


In [8]:
images, labels = next(iter(train_loader))
print(images.shape)

conv1 = torch.nn.Conv2d(3, 5, 5)
pool = torch.nn.MaxPool2d(2, 2)
conv2 = torch.nn.Conv2d(5, 16, 5)

x = conv1(images)
x = pool(x)
x = conv2(x)
x = pool(x)

print(x.shape)

torch.Size([16, 3, 32, 32])
torch.Size([16, 16, 5, 5])


### Fully Connected Neural Network

In [12]:
class FCNN(nn.Module):
    def __init__(self, layer_dims):
        '''
        Initializes the torch.nn.Module and define some layers for future ues

        layer_dims: list of integers, representing the width, len(layer_dims) == 3
        '''
        super(FCNN, self).__init__()
        self.fc1 = nn.Linear(layer_dims[0], layer_dims[1])
        self.fc2 = nn.Linear(layer_dims[1], layer_dims[2])

    def forward(self, x):
        '''
        Computes a forward pass of batch of images

        x: torch.Tensor of shape (batch_size, num_channels, width, height)
        returns: result of passing x through your network
        '''
        x = torch.flatten(x, 1)
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x

In [13]:
if torch.cuda.is_available():
    print(1)

1


In [14]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

def compute_accuracy(model, val_loader):
    total_correct = 0
    model = model.to(device)
    for inputs, labels in tqdm(val_loader, leave=False):
        inputs, labels = inputs.to(device), labels.to(device)
        outputs = model(inputs)
        outputs = outputs.argmax(1)
        correct = (outputs == labels)
        total_correct += correct.sum()
    return total_correct / len(val_loader.dataset)

def train(model, train_loader, val_loader, num_epochs, criterion, optimizer):
    print('beginning to train model')
    model = model.to(device)
    for epoch in tqdm(range(num_epochs)):
        total_loss = 0
        start_time = time.perf_counter()
        for inputs, labels in tqdm(train_loader, leave=False):
            inputs, labels = inputs.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss
        end_time = time.perf_counter()
        duration = end_time - start_time
        train_acc = compute_accuracy(model, train_loader)
        val_acc = compute_accuracy(model, val_loader)
        
        print(f'epoch {epoch:2}', 
              f'loss: {total_loss:.3f}',
              f'time: {duration:.3f}', 
              f'train acc: {train_acc:.4f}',
              f'val acc: {val_acc:.4f}', sep='\t')

def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

In [15]:
num_epochs = 15
layer_dims = [3072, 100, 10]
fcnn = FCNN(layer_dims)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(fcnn.parameters(), lr=1e-3, momentum=0.9)

In [16]:
train(fcnn, train_loader, val_loader, num_epochs, criterion, optimizer)

beginning to train model


  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/2500 [00:00<?, ?it/s]

  0%|          | 0/2500 [00:00<?, ?it/s]

  0%|          | 0/625 [00:00<?, ?it/s]

epoch  0	loss: 4356.633	time: 10.152	train acc: 0.4500	val acc: 0.4380


  0%|          | 0/2500 [00:00<?, ?it/s]

  0%|          | 0/2500 [00:00<?, ?it/s]

  0%|          | 0/625 [00:00<?, ?it/s]

epoch  1	loss: 3855.447	time: 7.663	train acc: 0.4851	val acc: 0.4538


  0%|          | 0/2500 [00:00<?, ?it/s]

  0%|          | 0/2500 [00:00<?, ?it/s]

  0%|          | 0/625 [00:00<?, ?it/s]

epoch  2	loss: 3652.396	time: 7.702	train acc: 0.5038	val acc: 0.4682


  0%|          | 0/2500 [00:00<?, ?it/s]

  0%|          | 0/2500 [00:00<?, ?it/s]

  0%|          | 0/625 [00:00<?, ?it/s]

epoch  3	loss: 3496.900	time: 7.597	train acc: 0.5419	val acc: 0.4856


  0%|          | 0/2500 [00:00<?, ?it/s]

  0%|          | 0/2500 [00:00<?, ?it/s]

  0%|          | 0/625 [00:00<?, ?it/s]

epoch  4	loss: 3367.062	time: 7.554	train acc: 0.5602	val acc: 0.5018


  0%|          | 0/2500 [00:00<?, ?it/s]

  0%|          | 0/2500 [00:00<?, ?it/s]

  0%|          | 0/625 [00:00<?, ?it/s]

epoch  5	loss: 3253.588	time: 7.555	train acc: 0.5707	val acc: 0.5022


  0%|          | 0/2500 [00:00<?, ?it/s]

  0%|          | 0/2500 [00:00<?, ?it/s]

  0%|          | 0/625 [00:00<?, ?it/s]

epoch  6	loss: 3161.642	time: 7.626	train acc: 0.5876	val acc: 0.5092


  0%|          | 0/2500 [00:00<?, ?it/s]

  0%|          | 0/2500 [00:00<?, ?it/s]

  0%|          | 0/625 [00:00<?, ?it/s]

epoch  7	loss: 3074.693	time: 7.670	train acc: 0.5911	val acc: 0.5028


  0%|          | 0/2500 [00:00<?, ?it/s]

  0%|          | 0/2500 [00:00<?, ?it/s]

  0%|          | 0/625 [00:00<?, ?it/s]

epoch  8	loss: 2998.370	time: 7.608	train acc: 0.6084	val acc: 0.5082


  0%|          | 0/2500 [00:00<?, ?it/s]

  0%|          | 0/2500 [00:00<?, ?it/s]

  0%|          | 0/625 [00:00<?, ?it/s]

epoch  9	loss: 2919.463	time: 7.637	train acc: 0.6149	val acc: 0.5083


  0%|          | 0/2500 [00:00<?, ?it/s]

  0%|          | 0/2500 [00:00<?, ?it/s]

  0%|          | 0/625 [00:00<?, ?it/s]

epoch 10	loss: 2855.868	time: 7.645	train acc: 0.6240	val acc: 0.5183


  0%|          | 0/2500 [00:00<?, ?it/s]

  0%|          | 0/2500 [00:00<?, ?it/s]

  0%|          | 0/625 [00:00<?, ?it/s]

epoch 11	loss: 2787.826	time: 7.642	train acc: 0.6365	val acc: 0.5130


  0%|          | 0/2500 [00:00<?, ?it/s]

  0%|          | 0/2500 [00:00<?, ?it/s]

  0%|          | 0/625 [00:00<?, ?it/s]

epoch 12	loss: 2733.296	time: 7.596	train acc: 0.6322	val acc: 0.5044


  0%|          | 0/2500 [00:00<?, ?it/s]

  0%|          | 0/2500 [00:00<?, ?it/s]

  0%|          | 0/625 [00:00<?, ?it/s]

epoch 13	loss: 2676.254	time: 7.634	train acc: 0.6470	val acc: 0.5101


  0%|          | 0/2500 [00:00<?, ?it/s]

  0%|          | 0/2500 [00:00<?, ?it/s]

  0%|          | 0/625 [00:00<?, ?it/s]

epoch 14	loss: 2618.630	time: 7.605	train acc: 0.6636	val acc: 0.5118


In [17]:
print(count_parameters(fcnn))

308310


### Basic Convolutional Neural Network

In [18]:
class CNN(nn.Module):
    def __init__(self):
        '''
        Initializes the torch.nn.Module and define some layers for future ues.
        
        - conv layer with 5 output channels and kernel size 5
        - max pooling on 2 by 2 blocks with stride of 2
        - conv layer with 16 output channels and kernel size 5
        - max pooling on 2 by 2 blocks with stride of 2
        - linear (fully-connected) layer with 120 output dims
        - linear (fully-connected) layer with 84 output dims
        - linear (fully-connected) layer with 10 output dims
        '''
        super(CNN, self).__init__()
        self.conv1 = nn.Conv2d(3, 5, 5)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(5, 16, 5)
        self.fc1 = nn.Linear(16*5*5, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)
    
    def forward(self, x):
        '''
        Computes a forward pass of batch of images. 

        x: torch.Tensor of shape (batch_size, num_channels, width, height)
        returns: result of passing x through your network
        '''

        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = x.view(-1, 16 * 5 * 5)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)

        return x

In [19]:
num_epochs = 15
cnn = CNN()
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(cnn.parameters(), lr=1e-3, momentum=0.9)

In [20]:
train(cnn, train_loader, val_loader, num_epochs, criterion, optimizer)

beginning to train model


  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/2500 [00:00<?, ?it/s]

  0%|          | 0/2500 [00:00<?, ?it/s]

  0%|          | 0/625 [00:00<?, ?it/s]

epoch  0	loss: 5360.196	time: 10.969	train acc: 0.3382	val acc: 0.3408


  0%|          | 0/2500 [00:00<?, ?it/s]

  0%|          | 0/2500 [00:00<?, ?it/s]

  0%|          | 0/625 [00:00<?, ?it/s]

epoch  1	loss: 4159.346	time: 9.364	train acc: 0.4341	val acc: 0.4321


  0%|          | 0/2500 [00:00<?, ?it/s]

  0%|          | 0/2500 [00:00<?, ?it/s]

  0%|          | 0/625 [00:00<?, ?it/s]

epoch  2	loss: 3755.297	time: 9.273	train acc: 0.4844	val acc: 0.4731


  0%|          | 0/2500 [00:00<?, ?it/s]

  0%|          | 0/2500 [00:00<?, ?it/s]

  0%|          | 0/625 [00:00<?, ?it/s]

epoch  3	loss: 3492.766	time: 9.262	train acc: 0.5186	val acc: 0.5053


  0%|          | 0/2500 [00:00<?, ?it/s]

  0%|          | 0/2500 [00:00<?, ?it/s]

  0%|          | 0/625 [00:00<?, ?it/s]

epoch  4	loss: 3282.693	time: 9.200	train acc: 0.5552	val acc: 0.5362


  0%|          | 0/2500 [00:00<?, ?it/s]

  0%|          | 0/2500 [00:00<?, ?it/s]

  0%|          | 0/625 [00:00<?, ?it/s]

epoch  5	loss: 3125.323	time: 9.210	train acc: 0.5797	val acc: 0.5496


  0%|          | 0/2500 [00:00<?, ?it/s]

  0%|          | 0/2500 [00:00<?, ?it/s]

  0%|          | 0/625 [00:00<?, ?it/s]

epoch  6	loss: 2975.193	time: 9.268	train acc: 0.6031	val acc: 0.5617


  0%|          | 0/2500 [00:00<?, ?it/s]

  0%|          | 0/2500 [00:00<?, ?it/s]

  0%|          | 0/625 [00:00<?, ?it/s]

epoch  7	loss: 2847.983	time: 9.216	train acc: 0.6123	val acc: 0.5654


  0%|          | 0/2500 [00:00<?, ?it/s]

  0%|          | 0/2500 [00:00<?, ?it/s]

  0%|          | 0/625 [00:00<?, ?it/s]

epoch  8	loss: 2719.635	time: 9.225	train acc: 0.6349	val acc: 0.5843


  0%|          | 0/2500 [00:00<?, ?it/s]

  0%|          | 0/2500 [00:00<?, ?it/s]

  0%|          | 0/625 [00:00<?, ?it/s]

epoch  9	loss: 2614.631	time: 9.311	train acc: 0.6573	val acc: 0.5926


  0%|          | 0/2500 [00:00<?, ?it/s]

  0%|          | 0/2500 [00:00<?, ?it/s]

  0%|          | 0/625 [00:00<?, ?it/s]

epoch 10	loss: 2500.850	time: 9.406	train acc: 0.6637	val acc: 0.6003


  0%|          | 0/2500 [00:00<?, ?it/s]

  0%|          | 0/2500 [00:00<?, ?it/s]

  0%|          | 0/625 [00:00<?, ?it/s]

epoch 11	loss: 2404.285	time: 9.349	train acc: 0.6873	val acc: 0.6063


  0%|          | 0/2500 [00:00<?, ?it/s]

  0%|          | 0/2500 [00:00<?, ?it/s]

  0%|          | 0/625 [00:00<?, ?it/s]

epoch 12	loss: 2317.855	time: 9.399	train acc: 0.6986	val acc: 0.6166


  0%|          | 0/2500 [00:00<?, ?it/s]

  0%|          | 0/2500 [00:00<?, ?it/s]

  0%|          | 0/625 [00:00<?, ?it/s]

epoch 13	loss: 2232.954	time: 9.338	train acc: 0.7172	val acc: 0.6154


  0%|          | 0/2500 [00:00<?, ?it/s]

  0%|          | 0/2500 [00:00<?, ?it/s]

  0%|          | 0/625 [00:00<?, ?it/s]

epoch 14	loss: 2143.760	time: 9.344	train acc: 0.7207	val acc: 0.6174


In [21]:
fcnn_num_params = count_parameters(fcnn)
cnn_num_params = count_parameters(cnn) 
print('FCNN Num Params:', fcnn_num_params)
print('CNN Num Params:', cnn_num_params)
print('Ratio:', cnn_num_params/fcnn_num_params)

FCNN Num Params: 308310
CNN Num Params: 61530
Ratio: 0.19957185949206968


### ResNet

In [22]:
class ResNetBlock(nn.Module):
    def __init__(self, in_channels, out_channels, stride=1):
        '''
        Creates a residual block for our ResNet18 architecture.

        - conv layer with  
            out_channels=out_channels, 3x3 kernel, stride=stride
        - batchnorm layer (Batchnorm2D)
        - conv layer with
            out_channels=out_channels, 3x3 kernel, stride=1
        - batchnorm layer (Batchnorm2D)
        - shortcut layer:
            if either the stride is not 1 or the out_channels is not equal to in_channels:
                the shortcut layer is composed of two steps:
                - conv layer with
                    in_channels=in_channels, out_channels=out_channels, 3x3 kernel, stride=stride
                - batchnorm layer (Batchnorm2D)
            else:
                the shortcut layer should be an no-op
        
        '''
        super(ResNetBlock, self).__init__()
        self.conv1 = self.conv(in_channels, out_channels, 3, stride)
        self.bn = nn.BatchNorm2d(out_channels)
        self.conv2 = self.conv(out_channels, out_channels, 3, 1)
        self.shortcut = nn.Sequential()
        if stride != 1 or in_channels != out_channels:
            self.shortcut = nn.Sequential(
                self.conv(in_channels, out_channels, kernel_size=3, stride=stride),
                nn.BatchNorm2d(out_channels),
            )
            
    
    def conv(self, in_channels, out_channels, kernel_size, stride):
        return nn.Conv2d(in_channels, out_channels, kernel_size, stride, padding=1, bias=False)

    def forward(self, x):
        '''
        Computes a forward pass of this batch of data on this residual block. 

        x: batch of images of shape (batch_size, num_channels, width, height)
        returns: result of passing x through this block
        '''
        out = self.conv1(x)
        out = self.bn(out)
        out = F.relu(out)
        out = self.conv2(out)
        out = self.bn(out)
        short = self.shortcut(x)
        out = F.relu(short + out)
        return out

class ResNet18(nn.Module):
    def __init__(self):
        num_classes = 10
        super(ResNet18, self).__init__()
        self.in_channels = 64
        self.conv1 = nn.Conv2d(in_channels=3, 
                               out_channels=64, 
                               kernel_size=3,
                               stride=1, 
                               padding=1, 
                               bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.layer1 = self.make_block(out_channels=64, stride=1)
        self.layer2 = self.make_block(out_channels=128, stride=2)
        self.layer3 = self.make_block(out_channels=256, stride=2)
        self.layer4 = self.make_block(out_channels=512, stride=2)
        self.linear = nn.Linear(512, num_classes)

    def make_block(self, out_channels, stride):
        layers = []
        for stride in [stride, 1]:
            layers.append(ResNetBlock(self.in_channels, out_channels, stride))
            self.in_channels = out_channels
        return nn.Sequential(*layers)

    def forward(self, x):
        x = F.relu(self.bn1(self.conv1(x)))
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)
        x = F.avg_pool2d(x, 4)
        x = torch.flatten(x, 1)
        x = self.linear(x)
        return x

In [23]:
num_epochs = 10
resnet = ResNet18()
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(resnet.parameters(), lr=1e-3, momentum=0.9)

This model will take a while (~1 hour) to train. Make sure that you are connected to a GPU runtime for Colab or, if you have GPU on your computer, consider training locally. Watch out for Colab's runtime disconnecting; it shouldn't be a big problem for this homework, but make sure to keep the browser tab open while the models are training.

The staff implementation got a validation accuracy of 66% after 1 training epoch. If your results are significantly worse than that, consider stopping your training run and debugging your model.

In [24]:
train(resnet, train_loader, val_loader, num_epochs, criterion, optimizer)

beginning to train model


  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/2500 [00:00<?, ?it/s]

  0%|          | 0/2500 [00:00<?, ?it/s]

  0%|          | 0/625 [00:00<?, ?it/s]

epoch  0	loss: 3245.016	time: 64.076	train acc: 0.6957	val acc: 0.6655


  0%|          | 0/2500 [00:00<?, ?it/s]

  0%|          | 0/2500 [00:00<?, ?it/s]

  0%|          | 0/625 [00:00<?, ?it/s]

epoch  1	loss: 1985.184	time: 65.496	train acc: 0.8115	val acc: 0.7489


  0%|          | 0/2500 [00:00<?, ?it/s]

  0%|          | 0/2500 [00:00<?, ?it/s]

  0%|          | 0/625 [00:00<?, ?it/s]

epoch  2	loss: 1427.313	time: 65.853	train acc: 0.8609	val acc: 0.7642


  0%|          | 0/2500 [00:00<?, ?it/s]

  0%|          | 0/2500 [00:00<?, ?it/s]

  0%|          | 0/625 [00:00<?, ?it/s]

epoch  3	loss: 1047.410	time: 64.798	train acc: 0.8919	val acc: 0.7712


  0%|          | 0/2500 [00:00<?, ?it/s]

  0%|          | 0/2500 [00:00<?, ?it/s]

  0%|          | 0/625 [00:00<?, ?it/s]

epoch  4	loss: 740.509	time: 64.971	train acc: 0.9275	val acc: 0.7744


  0%|          | 0/2500 [00:00<?, ?it/s]

  0%|          | 0/2500 [00:00<?, ?it/s]

  0%|          | 0/625 [00:00<?, ?it/s]

epoch  5	loss: 491.167	time: 69.533	train acc: 0.9539	val acc: 0.7870


  0%|          | 0/2500 [00:00<?, ?it/s]

  0%|          | 0/2500 [00:00<?, ?it/s]

  0%|          | 0/625 [00:00<?, ?it/s]

epoch  6	loss: 355.025	time: 64.960	train acc: 0.9702	val acc: 0.7900


  0%|          | 0/2500 [00:00<?, ?it/s]

  0%|          | 0/2500 [00:00<?, ?it/s]

  0%|          | 0/625 [00:00<?, ?it/s]

epoch  7	loss: 232.389	time: 65.863	train acc: 0.9757	val acc: 0.7904


  0%|          | 0/2500 [00:00<?, ?it/s]

  0%|          | 0/2500 [00:00<?, ?it/s]

  0%|          | 0/625 [00:00<?, ?it/s]

epoch  8	loss: 186.996	time: 65.120	train acc: 0.9791	val acc: 0.7932


  0%|          | 0/2500 [00:00<?, ?it/s]

  0%|          | 0/2500 [00:00<?, ?it/s]

  0%|          | 0/625 [00:00<?, ?it/s]

epoch  9	loss: 159.265	time: 65.428	train acc: 0.9833	val acc: 0.7929
