# Alethea's Attempt at ResNet-50

In [1]:
import torch
from torch import nn
import matplotlib
import torchvision
from torchvision import transforms
import matplotlib.pyplot as plt
import numpy as np
import os
from torch.utils.data import DataLoader

In [2]:
# Pick the compute device: the first CUDA GPU when one is visible, otherwise the CPU.
# Pattern borrowed from: https://stanford.edu/~shervine/blog/pytorch-how-to-generate-data-parallel
use_cuda = torch.cuda.is_available()
if use_cuda:
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
# cudnn.benchmark = True   (kept disabled)

## Let's Build the Model

I'm basing this on the ResNet-50 architecture diagram from: https://cv-tricks.com/keras/understand-implement-resnets/

In [3]:
class ReLU(nn.Module):
    """Thin ``nn.Module`` wrapper around ``nn.ReLU``; it owns no parameters."""

    def __init__(self):
        super().__init__()
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return a tensor shaped like ``x`` with negative entries set to zero."""
        rectified = self.relu(x)
        return rectified

In [4]:
class ConvModule(nn.Module):
    """A single 2-D convolution layer, with no activation applied afterwards.

    Every constructor argument is handed to ``nn.Conv2d`` unchanged, so the
    arguments have exactly the meaning documented there.
    """

    def __init__(self, in_channels, out_channels, kernel_size,
                 stride=1, padding=0, dilation=1, groups=1, bias=True,
                 padding_mode='zeros'):
        super().__init__()
        self.conv = nn.Conv2d(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            dilation=dilation,
            groups=groups,
            bias=bias,
            padding_mode=padding_mode,
        )

    def forward(self, x):
        """Run the convolution over ``x`` and return the raw (un-activated) result."""
        # Open question from the first draft: should a ReLU follow the
        # convolution here, e.g. ReLU().forward(self.conv(x))?
        convolved = self.conv(x)
        return convolved

In [5]:
# Sanity check: the wrapped convolution must expose its weight and bias as parameters.
tmpmodel = ConvModule(3, 3, 3)
print([*tmpmodel.parameters()])

[Parameter containing:
tensor([[[[-0.0433, -0.0336,  0.0493],
          [ 0.0224,  0.1019,  0.1001],
          [-0.0408, -0.1168, -0.0113]],

         [[ 0.0136,  0.0998, -0.0105],
          [ 0.0400, -0.0456,  0.0639],
          [ 0.0699, -0.0696,  0.1902]],

         [[ 0.0832, -0.1746, -0.1907],
          [-0.1049,  0.0691, -0.1871],
          [-0.0276,  0.1535,  0.1923]]],


        [[[-0.1066, -0.0549,  0.0300],
          [ 0.0261, -0.1431,  0.0460],
          [ 0.0217, -0.0153,  0.1679]],

         [[-0.0623, -0.1219, -0.0839],
          [-0.0378,  0.0910,  0.0437],
          [-0.1247,  0.1253,  0.1326]],

         [[-0.1290, -0.0888, -0.0208],
          [ 0.1539, -0.1096, -0.0183],
          [ 0.0043,  0.0917, -0.0418]]],


        [[[ 0.1515,  0.0251,  0.1516],
          [ 0.1792,  0.1006,  0.1411],
          [ 0.0843,  0.1919, -0.1548]],

         [[ 0.0478,  0.1888,  0.0375],
          [ 0.1686, -0.0645, -0.0321],
          [-0.0037, -0.1907,  0.1757]],

         [[-0.1782, -

In [6]:
class MaxPoolModule(nn.Module):
    """A single 2-D max-pooling layer.

    Every constructor argument is handed to ``nn.MaxPool2d`` unchanged.
    """

    def __init__(self, kernel_size, stride=None, padding=0, dilation=1,
                 return_indices=False, ceil_mode=False):
        super().__init__()
        self.maxpool = nn.MaxPool2d(
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            dilation=dilation,
            return_indices=return_indices,
            ceil_mode=ceil_mode,
        )

    def forward(self, x):
        """Return the max-pooled version of ``x``."""
        pooled = self.maxpool(x)
        return pooled

In [7]:
class AvgPoolModule(nn.Module):
    """A single 2-D average-pooling layer.

    Every constructor argument is handed to ``nn.AvgPool2d`` unchanged.
    """

    def __init__(self, kernel_size, stride=None, padding=0, ceil_mode=False,
                 count_include_pad=True, divisor_override=None):
        super().__init__()
        self.avgpool = nn.AvgPool2d(
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            ceil_mode=ceil_mode,
            count_include_pad=count_include_pad,
            divisor_override=divisor_override,
        )

    def forward(self, x):
        """Return the average-pooled version of ``x``."""
        pooled = self.avgpool(x)
        return pooled

In [8]:
class LinearModule(nn.Module):
    """A single fully connected layer; arguments are handed to ``nn.Linear`` unchanged."""

    def __init__(self, in_features, out_features, bias=True):
        super().__init__()
        self.linear = nn.Linear(
            in_features=in_features,
            out_features=out_features,
            bias=bias,
        )

    def forward(self, x):
        """Apply the affine map to the last dimension of ``x``."""
        projected = self.linear(x)
        return projected

In [9]:
# Same sanity check for the fully connected wrapper: weight and bias must be listed.
tmpmodel = LinearModule(100, 100)
print([*tmpmodel.parameters()])

[Parameter containing:
tensor([[-0.0288,  0.0595,  0.0290,  ..., -0.0109, -0.0751,  0.0465],
        [ 0.0518, -0.0331,  0.0077,  ...,  0.0485, -0.0016, -0.0042],
        [ 0.0934,  0.0565,  0.0011,  ..., -0.0836,  0.0887, -0.0487],
        ...,
        [ 0.0779, -0.0140,  0.0451,  ...,  0.0603,  0.0381,  0.0747],
        [ 0.0624,  0.0643,  0.0161,  ...,  0.0558,  0.0779,  0.0800],
        [-0.0760,  0.0754, -0.0955,  ...,  0.0783, -0.0525, -0.0843]],
       requires_grad=True), Parameter containing:
tensor([ 8.3013e-02, -8.4906e-02,  8.5836e-03,  7.4810e-02,  5.9445e-02,
         8.5487e-02, -7.7168e-02, -6.9145e-02, -5.3128e-02, -6.6627e-03,
        -7.6876e-02,  3.7216e-02, -6.5353e-02, -1.4852e-02,  3.0931e-02,
        -2.5520e-02,  7.2159e-02,  3.7897e-02,  9.0557e-02,  4.6501e-02,
         2.1900e-02,  3.0614e-02,  3.2165e-02, -4.0792e-02,  8.3559e-03,
        -6.6088e-02, -4.7993e-02, -6.6386e-02,  4.1887e-02, -4.2243e-02,
        -3.8193e-02,  8.1635e-02, -5.3732e-03, -9.7534e

In [10]:
class SoftmaxModule(nn.Module):
    """Softmax layer normalising along dimension ``dim`` (wraps ``nn.Softmax``)."""

    def __init__(self, dim):
        super().__init__()
        self.softmax = nn.Softmax(dim=dim)

    def forward(self, x):
        """Return ``x`` rescaled so that its entries along ``dim`` sum to one."""
        probabilities = self.softmax(x)
        return probabilities

In [11]:
class ResidualModule(nn.Module):
    """One bottleneck residual block (1x1 -> 3x3 -> 1x1 convolutions) of a ResNet-50 stage.

    The block computes ``ReLU(F(x) + shortcut(x))``, where ``F`` is the stack of
    three convolutions, each followed by batch normalisation, with a ReLU after
    the first two.

    Args:
        stage_num: Stage the block belongs to (1-4). It selects the bottleneck
            width (64, 128, 256 or 512); the block emits four times that many
            channels (256, 512, 1024 or 2048).
        in_channels: Number of channels of the incoming tensor. ``None`` (the
            default) means "same as the output", i.e. a block whose shortcut is
            the plain identity.
        stride: Stride of the 3x3 convolution. Pass 2 to halve height and width.

    Raises:
        KeyError: If ``stage_num`` is not 1, 2, 3 or 4.
    """

    # Bottleneck width (channels of the 1x1-reduce and 3x3 convolutions) per stage.
    STAGE_WIDTHS = {1: 64, 2: 128, 3: 256, 4: 512}
    # The closing 1x1 convolution widens the bottleneck by this factor.
    EXPANSION = 4

    def __init__(self, stage_num, in_channels=None, stride=1):
        super().__init__()

        width = self.STAGE_WIDTHS[stage_num]
        out_channels = width * self.EXPANSION
        if in_channels is None:
            in_channels = out_channels

        # Each sub-layer is stored as an attribute (not in a plain Python list)
        # so that nn.Module registers it and its parameters are trainable.
        # Biases are disabled because the batch norm that follows each
        # convolution has its own learnable shift.
        self.layer0 = ConvModule(in_channels, width, kernel_size=1, bias=False)
        self.norm0 = nn.BatchNorm2d(width)
        # Only the 3x3 convolution needs padding (1 keeps the spatial size at
        # stride 1); it also carries the stride, as in torchvision's ResNet.
        self.layer1 = ConvModule(width, width, kernel_size=3, stride=stride,
                                 padding=1, bias=False)
        self.norm1 = nn.BatchNorm2d(width)
        self.layer2 = ConvModule(width, out_channels, kernel_size=1, bias=False)
        self.norm2 = nn.BatchNorm2d(out_channels)
        self.relu = ReLU()

        # F(x) + x is only defined when both terms have the same shape. When
        # the block changes the channel count or the resolution, the input is
        # projected with a strided 1x1 convolution first.
        if stride != 1 or in_channels != out_channels:
            self.shortcut = nn.Sequential(
                ConvModule(in_channels, out_channels, kernel_size=1,
                           stride=stride, bias=False),
                nn.BatchNorm2d(out_channels),
            )
        else:
            self.shortcut = nn.Identity()

    def forward(self, x):
        """Return ``ReLU(F(x) + shortcut(x))`` for a batch ``x`` of feature maps."""
        out = self.relu(self.norm0(self.layer0(x)))
        out = self.relu(self.norm1(self.layer1(out)))
        out = self.norm2(self.layer2(out))
        return self.relu(out + self.shortcut(x))


In [None]:
class Stage(nn.Module):
    """Implements each of the four stages of residual blocks.
    One instance of this class is one stage.

    The first block of a stage adapts the incoming tensor: it is built as
    ``ResidualModule(stage_num, in_channels=..., stride=...)`` so that it
    changes the channel count and, for stages 2-4, halves the resolution.
    The remaining blocks are built as ``ResidualModule(stage_num)``.

    Args:
        stage_num: Which stage to build (1-4).

    Raises:
        KeyError: If ``stage_num`` is not 1, 2, 3 or 4.
    """

    # ResNet-50 uses 3, 4, 6 and 3 bottleneck blocks in its four stages.
    NUM_BLOCKS = {1: 3, 2: 4, 3: 6, 4: 3}
    # Channels entering each stage: 64 from the stem, then the previous
    # stage's output width.
    IN_CHANNELS = {1: 64, 2: 256, 3: 512, 4: 1024}

    def __init__(self, stage_num):
        super().__init__()
        self.stage_num = stage_num

        num_blocks = self.NUM_BLOCKS[stage_num]
        # Stage 1 follows the stride-2 max pool, so it keeps the resolution;
        # every later stage downsamples once, in its first block.
        first_stride = 1 if stage_num == 1 else 2

        blocks = [ResidualModule(stage_num,
                                 in_channels=self.IN_CHANNELS[stage_num],
                                 stride=first_stride)]
        blocks.extend(ResidualModule(stage_num) for _ in range(num_blocks - 1))

        # A plain Python list hides submodules from PyTorch, so their
        # parameters would never reach the optimizer. nn.ModuleList registers
        # every element.
        self.blocks = nn.ModuleList(blocks)

    def forward(self, x):
        """Pass ``x`` through every block of the stage in order."""
        a = x
        for block in self.blocks:
            a = block(a)
        return a

In [None]:
class ResNet50(nn.Module):
    """ResNet-50 image classifier.

    Layout: 7x7 convolution stem -> 3x3 max pool -> four residual stages ->
    global average pool -> fully connected layer.

    Args:
        num_labels: Number of output classes, i.e. logits per image.

    ``forward`` takes a batch of 3-channel images and returns raw,
    un-normalised logits of shape ``(batch, num_labels)``.
    """

    def __init__(self, num_labels):
        super().__init__()

        # 7x7 Conv. Padding 3 (= kernel_size // 2) makes the stride-2
        # convolution halve the resolution exactly, e.g. 224 -> 112.
        self.layer1 = ConvModule(in_channels=3, out_channels=64,
                                 kernel_size=7, stride=2, padding=3, bias=False)
        self.norm1 = nn.BatchNorm2d(64)
        self.relu = ReLU()

        # 3x3 MaxPool. PyTorch rejects padding larger than half the kernel
        # size, so padding must be 1 here; this halves again, e.g. 112 -> 56.
        self.layer2 = MaxPoolModule(kernel_size=3, stride=2, padding=1)

        # Residual Stages
        self.stage1 = Stage(1)
        self.stage2 = Stage(2)
        self.stage3 = Stage(3)
        self.stage4 = Stage(4)

        # Global average pool: collapses whatever spatial size is left to 1x1,
        # so the classifier does not depend on the input resolution.
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))

        # Fully Connected. Stage 4 emits 2048 channels, hence 2048 features.
        self.linear = LinearModule(in_features=2048, out_features=num_labels)

        # No softmax layer on purpose: nn.CrossEntropyLoss applies
        # log-softmax itself and must be fed raw logits.

    def forward(self, x):
        """Return the class logits for a batch of images ``x``."""
        a = self.relu(self.norm1(self.layer1(x)))
        a = self.layer2(a)
        a = self.stage1(a)
        a = self.stage2(a)
        a = self.stage3(a)
        a = self.stage4(a)
        a = self.avgpool(a)
        # (batch, 2048, 1, 1) -> (batch, 2048) so the linear layer can consume it.
        a = torch.flatten(a, 1)
        return self.linear(a)

## Load our data. 

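I'm following the advice from https://www.learnopencv.com/pytorch-for-beginners-image-classification-using-pre-trained-models/ on normalizing image data: resize, center-crop, convert to a tensor, then normalize each channel by its mean and standard deviation.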


In [None]:
# Preprocessing applied to every image: shrink the short side to 256 px, take
# the central 224x224 crop, convert to a float tensor, then normalise each
# channel with the mean/std values commonly used for ImageNet-trained models.
transform = transforms.Compose([transforms.Resize(256),
                                transforms.CenterCrop(224),
                                transforms.ToTensor(),
                                transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                                     std=[0.229, 0.224, 0.225])
                               ])

# Root of the dataset. Set the PET_DATA_DIR environment variable to run the
# notebook on another machine; the default is the original author's location.
datadir = os.environ.get("PET_DATA_DIR", "/home/apower/data/oxford-iiit-pet")
traindir = os.path.join(datadir, 'train')
devdir = os.path.join(datadir, 'dev')
testdir = os.path.join(datadir, 'test')

# It's better to pre-divide data into train/dev/test. That way it doesn't
# randomly shift between runs (which is what would happen if one ImageFolder
# were split with torch.utils.data.random_split on every run).
X_train = torchvision.datasets.ImageFolder(traindir, transform)
X_dev = torchvision.datasets.ImageFolder(devdir, transform)
X_test = torchvision.datasets.ImageFolder(testdir, transform)

In [None]:
# Report how many images ended up in each split.
print(
    'training_set:', len(X_train),
    '\ndev_set:', len(X_dev),
    '\ntest_set:', len(X_test),
)

In [None]:
# Converter from an image tensor back to a PIL image, used to eyeball samples.
# NOTE(review): the tensors were mean/std-normalised by `transform`, so the
# colours shown will presumably look off - confirm before trusting the preview.
to_pic = transforms.ToPILImage()
# Each dataset item is an (image, label) pair; show the first training image.
to_pic(X_train[0][0])

In [None]:
# Show the first image of the dev split (element 0 of the (image, label) pair).
to_pic(X_dev[0][0])

In [None]:
# Show the first image of the test split (element 0 of the (image, label) pair).
to_pic(X_test[0][0])

## Let's Do it


In [None]:
# One output per class folder found in the training split.
model = ResNet50(num_labels=len(X_train.classes))
# Move the model to the device selected at the top of the notebook. An
# unconditional .cuda() would raise on a machine without a GPU, even though
# the training loop already sends its batches to `device`.
model.to(device)

In [None]:
# Print the module tree to check which layers were registered.
print(model)


In [None]:
# Check that every layer's weights are registered. Names and shapes are enough
# for that; dumping the raw tensor values would flood the notebook output.
num_values = 0
for name, param in model.named_parameters():
    print(name, tuple(param.shape))
    num_values += param.numel()
print('parameter tensors listed above hold', num_values, 'values in total')

Woohoo!!!! We have parameters! :-D

In [None]:
# Loss and optimiser. CrossEntropyLoss works on raw class scores and integer
# class labels; the optimiser is plain SGD over all model parameters.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(params=model.parameters(), lr=1e-4)

In [None]:
# Images per optimisation step. Lower it if the device runs out of memory.
batch_size = 32
max_epochs = 100

train_loader = DataLoader(X_train, batch_size=batch_size, shuffle=True)
# The evaluation splits are not shuffled, so their order is the same on every run.
dev_loader = DataLoader(X_dev, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(X_test, batch_size=batch_size, shuffle=False)

# One Python float per optimisation step. Storing loss.item() instead of the
# loss tensor keeps the list from holding on to autograd state and device memory.
losses = []

model.train()
for epoch in range(max_epochs):
    epoch_losses = []
    for local_batch, local_labels in train_loader:
        # Transfer to the selected device (GPU when available).
        X, y = local_batch.to(device), local_labels.to(device)
        optimizer.zero_grad()
        # Call the module itself rather than .forward() so that hooks run.
        y_pred = model(X)
        loss = criterion(y_pred, y)
        loss.backward()
        optimizer.step()
        epoch_losses.append(loss.item())
    losses.extend(epoch_losses)
    # Report the mean over the epoch; a single (last) batch is too noisy, and
    # this also stays well-defined if the loader yields no batches.
    mean_loss = sum(epoch_losses) / len(epoch_losses) if epoch_losses else float('nan')
    print('epoch:', epoch, 'loss:', mean_loss)
