<a href="https://colab.research.google.com/github/ab7289-tandon-nyu/csgy6953_DeepLearning_Midterm/blob/oscar1/notebooks/try_residual_bottleneck_block.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install Our Team's Code

In [1]:
!git clone -b bottleneck "https://github.com/ab7289-tandon-nyu/csgy6953_DeepLearning_Midterm.git"

fatal: destination path 'csgy6953_DeepLearning_Midterm' already exists and is not an empty directory.


In [2]:
!cp -r /content/csgy6953_DeepLearning_Midterm/src/ .

Print the model source to check whether the latest commit was cloned:

In [3]:
# Print the copied module so the code version in use is visible in the notebook output.
!cat src/model.py

import torch
import torch.nn as nn
from enum import Enum 

from typing import List, Tuple, Optional

class ResidualBlockType(Enum):
    '''
    Enum class to represent the residual block type for ResNet
    '''
    BASIC = 0
    BOTTLENECK = 1
    
class ResidualBlock(nn.Module):
    '''
    Class representing a convolutional residual block 
    '''

    def __init__(self, num_channels: int, use_stem: bool = False, strides: int = 1):
        '''
        Creates a new instance of a Residual Block
        @param: num_channels (int) - the number of output channels for all convolutions in 
            the block
        @param: use_stem (bool) - whether a 1x1 convolution is needed to downsample the
            residual
        @param: strides (int) - the number of strides to use in the convolutions, defaults to 1
        '''
        super().__init__()
        self.num_channels = num_channels
        self.use_stem = use_stem
        self.strides = strides

        self.conv1 = nn.LazyConv2d(

# Install torchsummary

In [4]:
!pip install torch-summary==1.4.5

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [5]:
from torchsummary import summary

In [6]:
# View the function's parameters: evaluating the bare name displays its repr,
# which includes the full signature (parameter names, types and defaults).
summary

<function torchsummary.torchsummary.summary(model: torch.nn.modules.module.Module, input_data: Union[torch.Tensor, torch.Size, Sequence[torch.Tensor], Sequence[Union[int, Sequence[Any], torch.Size]], NoneType] = None, *args: Any, batch_dim: Union[int, NoneType] = 0, branching: bool = True, col_names: Union[Iterable[str], NoneType] = None, col_width: int = 25, depth: int = 3, device: Union[torch.device, NoneType] = None, dtypes: Union[List[torch.dtype], NoneType] = None, verbose: int = 1, **kwargs: Any) -> torchsummary.model_statistics.ModelStatistics>

# Load Transformed CIFAR-10 Data

In [7]:
## transforms.py

import torch
import torchvision.transforms as transforms

from typing import Tuple


def make_transforms(means: torch.Tensor, std_devs: torch.Tensor) -> Tuple:
    '''
    Builds the image preprocessing pipelines for training and for evaluation.

    Both pipelines convert the image to a tensor and normalize it with the supplied
    statistics. The training pipeline first applies random augmentation:
    `RandomRotation(5)`, `RandomHorizontalFlip(0.5)` and `RandomCrop(32, padding=2)`.

    @param: means - per-channel means forwarded to `transforms.Normalize`
    @param: std_devs - per-channel standard deviations forwarded to `transforms.Normalize`
    @return: tuple `(train_transforms, test_transforms)` of `transforms.Compose` pipelines
    '''
    # augmentation is applied to training images only
    train_steps = [
        transforms.RandomRotation(5),
        transforms.RandomHorizontalFlip(0.5),
        transforms.RandomCrop(32, padding=2),
        transforms.ToTensor(),
        transforms.Normalize(mean=means, std=std_devs),
    ]

    # evaluation images are only converted and normalized
    test_steps = [
        transforms.ToTensor(),
        transforms.Normalize(mean=means, std=std_devs),
    ]

    return transforms.Compose(train_steps), transforms.Compose(test_steps)


In [8]:
## data.py

import torchvision.datasets as datasets
import torch.utils.data as data
import torch
import copy
from typing import Tuple, Callable


## 2. Prepare to normalize data (the same way for TRAIN and TEST)
def summarize_train_data(train_data: datasets.cifar.CIFAR10) -> Tuple[torch.Tensor , torch.Tensor]:
    '''
    Computes the mean and standard deviation of `train_data.data` along the channel axis.

    The statistics are reduced over axes (0, 1, 2), leaving one value per entry of the
    last axis (R, G, B), and are divided by 255.

    NOTE(review): torchvision's CIFAR10 appears to store `.data` as a NumPy array, so the
    returned values are likely ndarrays rather than torch.Tensors as annotated - confirm.

    @param: train_data - dataset whose raw `.data` array is summarized
    @return: tuple `(means, stds)`, each presumably of shape (3,)
    '''
    pixels = train_data.data
    reduce_axes = (0, 1, 2)  # every axis except the trailing channel axis

    # dividing by 255 assumes 8-bit pixel values - TODO confirm
    channel_means = pixels.mean(axis=reduce_axes) / 255
    channel_stds = pixels.std(axis=reduce_axes) / 255
    return channel_means, channel_stds

## 4. Load and custom-partition data
def partition_train_data(train_data, valid_ratio, generator=None):
    '''
    Randomly partitions TRAIN data into a TRAIN subset and a VALID subset.

    @param: train_data - the dataset to split; must support `len()`
    @param: valid_ratio (float) - the share of `train_data` held out as VALID;
        must lie in [0, 1]
    @param: generator (torch.Generator, optional) - random generator forwarded to
        `random_split` to make the split reproducible; when None (default) the
        split uses `random_split`'s default generator, as before
    @return: tuple `(train_subset, valid_subset)`; the VALID subset is a deep copy
    @raises: ValueError if `valid_ratio` is outside [0, 1]
    '''
    # An out-of-range ratio would produce a negative length whose sum still equals
    # len(train_data), which random_split can accept and silently mis-partition.
    if not 0 <= valid_ratio <= 1:
        raise ValueError(
            f"valid_ratio must be between 0 and 1 (inclusive), got {valid_ratio!r}"
        )

    # the partition: len(train_data) == num_valid_examples + num_train_examples
    num_valid_examples = int(len(train_data) * valid_ratio)
    num_train_examples = len(train_data) - num_valid_examples
    lengths = [num_train_examples, num_valid_examples]

    # Only forward the generator when one was supplied, so the default call is unchanged.
    if generator is None:
        train_subset, valid_subset = data.random_split(train_data, lengths)
    else:
        train_subset, valid_subset = data.random_split(
            train_data, lengths, generator=generator
        )

    # Deep-copy the VALID subset so callers can change attributes of its underlying
    # dataset (e.g. the transform) without affecting the TRAIN subset.
    return train_subset, copy.deepcopy(valid_subset)


## 1. through 4.
def get_transformed_data(make_transforms: Callable, valid_ratio: float) -> Tuple[
    torch.utils.data.dataset.Subset,
    torch.utils.data.dataset.Subset,
    datasets.cifar.CIFAR10]:
    '''
    Downloads CIFAR-10 and returns transformed TRAIN, VALID and TEST datasets,
    where transform = augment & normalize.

    @param: make_transforms (Callable) - called with the TRAIN means and standard
        deviations; must return `(train_transforms, test_transforms)`
        (e.g. the function defined in `from src.transforms import make_transforms`)
    @param: valid_ratio (float) - the share of the TRAIN data redesignated as VALID data
    @return: tuple `(train_data, valid_data, test_data)`
    '''
    data_root = '.data'

    # Steps 1-2: load the raw TRAIN data (no transform) only to compute the
    # normalization statistics, which are shared by every split.
    raw_train = datasets.CIFAR10(root = data_root, train = True, download = True)
    channel_means, channel_stds = summarize_train_data(raw_train)

    # Step 3: build the augment+normalize (TRAIN) and normalize-only (TEST) pipelines.
    train_transforms, test_transforms = make_transforms(channel_means, channel_stds)

    # Step 4: load TRAIN and TEST with their transforms attached.
    full_train = datasets.CIFAR10(data_root, train=True,  download=True, transform=train_transforms)
    test_data  = datasets.CIFAR10(data_root, train=False, download=True, transform=test_transforms)

    # Custom partition: TRAIN -> VALID (valid_ratio) & TRAIN (1 - valid_ratio).
    train_data, valid_data = partition_train_data(full_train, valid_ratio)

    # VALID is evaluated like TEST, so it gets the TEST transforms. partition_train_data
    # returns a deep-copied VALID subset, so TRAIN keeps its own transforms.
    valid_data.dataset.transform = test_transforms

    return train_data, valid_data, test_data

## 5. Data loader
def make_data_loaders(
    train_data: torch.utils.data.dataset.Subset,
    valid_data: torch.utils.data.dataset.Subset, 
    test_data:  datasets.cifar.CIFAR10,
    batch_size: int) -> Tuple[
        torch.utils.data.dataloader.DataLoader,
        torch.utils.data.dataloader.DataLoader,
        torch.utils.data.dataloader.DataLoader]:
    '''
    Wraps the three datasets in `DataLoader`s that share one batch size.

    @param: train_data - TRAIN split; its loader shuffles
    @param: valid_data - VALID split; its loader does not shuffle
    @param: test_data - TEST split; its loader does not shuffle
    @param: batch_size (int) - batch size used by all three loaders
    @return: tuple `(train_iterator, valid_iterator, test_iterator)`
    '''
    loader_cls = torch.utils.data.DataLoader

    # only training shuffles its examples
    train_iterator = loader_cls(train_data, batch_size=batch_size, shuffle=True)

    # evaluation splits keep their order
    valid_iterator, test_iterator = [
        loader_cls(split, batch_size=batch_size, shuffle=False)
        for split in (valid_data, test_data)
    ]

    return train_iterator, valid_iterator, test_iterator


In [9]:
BATCH_SIZE  = 256  # examples per batch, passed to make_data_loaders for all three loaders
VALID_RATIO = 0.1  # 10% of TRAIN becomes VALID; 90% of TRAIN remains TRAIN

In [10]:
# Download CIFAR-10 (if needed) and build the transformed TRAIN / VALID / TEST datasets.
train_data, valid_data, test_data = get_transformed_data(
    make_transforms=make_transforms,
    valid_ratio=VALID_RATIO,
)

Files already downloaded and verified
Files already downloaded and verified
Files already downloaded and verified


In [11]:
# Wrap each split in a DataLoader using the shared batch size.
train_iterator, valid_iterator, test_iterator = make_data_loaders(
    train_data, valid_data, test_data, BATCH_SIZE
)

# Instantiate Model; Count Parameters

In [12]:
from src.model import ResidualBlockType

from src.model import ResNet, StemConfig
from src.utils import initialize_parameters, epoch_time



In [13]:
# Each architecture entry is (block_type, num_residuals, num_channels).
# NOTE(review): presumably these map onto the leading arguments of
#   create_block(self, block_type: ResidualBlockType, num_residuals: int,
#                num_channels: int, first_block: bool = False) -> nn.Sequential
# - confirm against src/model.py.
stem_config = StemConfig(
    num_channels=128,
    kernel_size=5,
    stride=1,
    padding=2,
)

model_architecture = (
    (ResidualBlockType.BASIC, 1, 128),
    (ResidualBlockType.BASIC, 2, 128),
    (ResidualBlockType.BASIC, 2, 128),
    (ResidualBlockType.BASIC, 2, 128),
    (ResidualBlockType.BASIC, 2, 196),
    (ResidualBlockType.BASIC, 2, 196),
)

model = ResNet(
    model_architecture,
    stem_config=stem_config,
    output_size=10,
)



In [14]:
import torch

In [15]:
# Push one random batch (256 images, 3 channels, 32x32) through the model.
# NOTE(review): src/model.py builds layers with nn.LazyConv2d, so this forward pass
# presumably exists to materialize the lazy parameters before `summary` counts them - confirm.
# NOTE(review): `input` shadows the Python builtin of the same name.
input = torch.rand(256, 3, 32, 32)
output = model(input)

In [16]:
# Called without input data, so the table lists layers and parameter counts only
# (no output-shape column).
summary(model)

Layer (type:depth-idx)                   Param #
├─Sequential: 1-1                        --
|    └─Conv2d: 2-1                       9,728
|    └─BatchNorm2d: 2-2                  256
|    └─ReLU: 2-3                         --
├─Sequential: 1-2                        --
|    └─AdaptiveAvgPool2d: 2-4            --
|    └─Flatten: 2-5                      --
|    └─Linear: 2-6                       1,970
├─Sequential: 1-3                        --
|    └─Sequential: 2-7                   --
|    |    └─ResidualBlock: 3-1           295,680
|    └─Sequential: 2-8                   --
|    |    └─ResidualBlock: 3-2           312,192
|    |    └─ResidualBlock: 3-3           295,680
|    └─Sequential: 2-9                   --
|    |    └─ResidualBlock: 3-4           312,192
|    |    └─ResidualBlock: 3-5           295,680
|    └─Sequential: 2-10                  --
|    |    └─ResidualBlock: 3-6           312,192
|    |    └─ResidualBlock: 3-7           295,680
|    └─Sequential: 2-11      

Layer (type:depth-idx)                   Param #
├─Sequential: 1-1                        --
|    └─Conv2d: 2-1                       9,728
|    └─BatchNorm2d: 2-2                  256
|    └─ReLU: 2-3                         --
├─Sequential: 1-2                        --
|    └─AdaptiveAvgPool2d: 2-4            --
|    └─Flatten: 2-5                      --
|    └─Linear: 2-6                       1,970
├─Sequential: 1-3                        --
|    └─Sequential: 2-7                   --
|    |    └─ResidualBlock: 3-1           295,680
|    └─Sequential: 2-8                   --
|    |    └─ResidualBlock: 3-2           312,192
|    |    └─ResidualBlock: 3-3           295,680
|    └─Sequential: 2-9                   --
|    |    └─ResidualBlock: 3-4           312,192
|    |    └─ResidualBlock: 3-5           295,680
|    └─Sequential: 2-10                  --
|    |    └─ResidualBlock: 3-6           312,192
|    |    └─ResidualBlock: 3-7           295,680
|    └─Sequential: 2-11      

```
num_channels, block_type = 128, ResidualBlockType.BASIC
|    └─ReLU: 2-3                         [-1, 128, 32, 32]         --
├─Sequential: 1-2                        [-1, 128, 32, 32]         --
|    └─Sequential: 2-4                   [-1, 128, 32, 32]         --
|    |    └─ResidualBlock: 3-1           [-1, 128, 32, 32]         295,680
```

```
num_channels, block_type = 128, ResidualBlockType.BOTTLENECK
|    └─ReLU: 2-3                         [-1, 128, 32, 32]         --
├─Sequential: 1-2                        [-1, 128, 32, 32]         --
|    └─Sequential: 2-4                   [-1, 128, 32, 32]         --
|    |    └─ResidualBottleNeck: 3-1      [-1, 128, 32, 32]         17,984
```
```
num_channels, block_type = 128*4, ResidualBlockType.BOTTLENECK
|    └─ReLU: 2-3                         [-1, 512, 32, 32]         --
├─Sequential: 1-2                        [-1, 512, 32, 32]         --
|    └─Sequential: 2-4                   [-1, 512, 32, 32]         --
|    |    └─ResidualBottleNeck: 3-1      [-1, 512, 32, 32]         280,832
```

```
num_channels1, num_channels2, block_type = 64, 128*4, ResidualBlockType.BOTTLENECK
                                         [-1, 3, 32, 32]           --
|    └─Conv2d: 2-1                                                 9,728
                                         [-1, 128, 32, 32]
|    └─BatchNorm2d: 2-2                                            256
                                         [-1, 128, 32, 32]
|    |    └─ResidualBottleNeck: 3-1                                17,984
                                         [-1, 128, 32, 32]
|    |    └─ResidualBottleNeck: 3-2                                297,728
                                         [-1, 512, 16, 16]
|    └─AdaptiveAvgPool2d: 2-6                                      --
                                         [-1, 512, 1, 1]
```
```
num_channels1, num_channels2, block_type = 64, 128*4, ResidualBlockType.BOTTLENECK
model_architecture = (
    (block_type, 2, num_channels1),
    (block_type, 2, num_channels2)
)                                        [-1, 3, 32, 32]           --
|    └─Conv2d: 2-1                                                 4,864
                                         [-1, 64, 32, 32]
|    └─BatchNorm2d: 2-2                                            128
                                         [-1, 64, 32, 32]
|    |    └─ResidualBottleNeck: 3-1                                4,640
                                         [-1, 64, 32, 32]
|    |    └─ResidualBottleNeck: 3-2                                4,640
                                         [-1, 64, 32, 32]
|    |    └─ResidualBottleNeck: 3-3                                256,768
                                         [-1, 512, 16, 16]
|    |    └─ResidualBottleNeck: 3-4                                280,832
                                         [-1, 512, 16, 16]
|    └─AdaptiveAvgPool2d: 2-6                                      --
                                         [-1, 512, 1, 1]
```
```
num_channels1, num_channels2, block_type = 64, 128, ResidualBlockType.BASIC
model_architecture = (
    (block_type, 2, num_channels1),
    (block_type, 2, num_channels2)
)
├─Sequential: 1-1                        [-1, 3, 32, 32]           --
|    └─Conv2d: 2-1                                                 4,864
                                         [-1, 64, 32, 32]
|    └─BatchNorm2d: 2-2                                            128
                                         [-1, 64, 32, 32]
|    |    └─ResidualBlock: 3-1                                     74,112
                                         [-1, 64, 32, 32]
|    |    └─ResidualBlock: 3-2                                     74,112
                                         [-1, 64, 32, 32]
|    |    └─ResidualBlock: 3-3                                     230,272
                                         [-1, 128, 16, 16]
|    |    └─ResidualBlock: 3-4                                     295,680
                                         [-1, 128, 16, 16]
|    └─AdaptiveAvgPool2d: 2-6                                      --
                                         [-1, 128, 1, 1]
```
```
num_channels1, num_channels2, block_type = 64, 128, ResidualBlockType.BOTTLENECK

model_architecture = (
    (block_type, 2, num_channels1),
    (block_type, 2, num_channels2)
)
                                         [-1, 3, 32, 32]
|    └─Conv2d: 2-1                                 4,864
                                         [-1, 64, 32, 32]
|    └─BatchNorm2d: 2-2                            128
                                         [-1, 64, 32, 32]
|    |    └─ResidualBottleNeck: 3-1                4,640
                                         [-1, 64, 32, 32]
|    |    └─ResidualBottleNeck: 3-2                4,640
                                         [-1, 64, 32, 32]
|    |    └─ResidualBottleNeck: 3-3               24,256
                                         [-1, 128, 16, 16]
|    |    └─ResidualBottleNeck: 3-4               17,984
                                         [-1, 128, 16, 16]
|    └─AdaptiveAvgPool2d: 2-6                       --
                                         [-1, 128, 1, 1]
```


Conclusion:
```
[-1,  64, 32, 32] --ResidualBlock     ( 74,112 params)--> [-1,  64, 32, 32]
[-1,  64, 32, 32] --ResidualBottleNeck(  4,640 params)--> [-1,  64, 32, 32]


[-1,  64, 32, 32] --ResidualBlock     (230,272 params)--> [-1, 128, 16, 16]
[-1,  64, 32, 32] --ResidualBottleNeck( 24,256 params)--> [-1, 128, 16, 16]
[-1,  64, 32, 32] --ResidualBottleNeck(256,768 params)--> [-1, 512, 16, 16]

[-1, 128, 16, 16] --ResidualBlock     (295,680 params)--> [-1, 128, 16, 16]
[-1, 128, 16, 16] --ResidualBottleNeck( 17,984 params)--> [-1, 128, 16, 16]
[-1, 512, 16, 16] --ResidualBottleNeck(280,832 params)--> [-1, 512, 16, 16]
```

In [23]:
# Two-stage bottleneck network: 2 blocks at num_channels1, then 2 blocks at num_channels2.
block_type = ResidualBlockType.BOTTLENECK
num_channels1, num_channels2 = 64, 128

model_architecture = (
    (block_type, 2, num_channels1),
    (block_type, 2, num_channels2),
)

# the stem outputs the same number of channels as the first stage
stem_config = StemConfig(
    num_channels=num_channels1,
    kernel_size=5,
    stride=1,
    padding=2,
)

model = ResNet(model_architecture, stem_config=stem_config, output_size=10)

# run one random batch through the model before summarizing it
input = torch.rand(256, 3, 32, 32)
output = model(input)

# passing the per-example input size adds the Output Shape column to the table
summary(model, (3, 32, 32), verbose=0)



Layer (type:depth-idx)                   Output Shape              Param #
├─Sequential: 1-1                        [-1, 64, 32, 32]          --
|    └─Conv2d: 2-1                       [-1, 64, 32, 32]          4,864
|    └─BatchNorm2d: 2-2                  [-1, 64, 32, 32]          128
|    └─ReLU: 2-3                         [-1, 64, 32, 32]          --
├─Sequential: 1-2                        [-1, 128, 16, 16]         --
|    └─Sequential: 2-4                   [-1, 64, 32, 32]          --
|    |    └─ResidualBottleNeck: 3-1      [-1, 64, 32, 32]          4,640
|    |    └─ResidualBottleNeck: 3-2      [-1, 64, 32, 32]          4,640
|    └─Sequential: 2-5                   [-1, 128, 16, 16]         --
|    |    └─ResidualBottleNeck: 3-3      [-1, 128, 16, 16]         24,256
|    |    └─ResidualBottleNeck: 3-4      [-1, 128, 16, 16]         17,984
├─Sequential: 1-3                        [-1, 10]                  --
|    └─AdaptiveAvgPool2d: 2-6            [-1, 128, 1, 1]           

In [18]:
# Single bottleneck block whose channel count matches the stem's output.
num_channels = 200

stem_config = StemConfig(
    num_channels=num_channels,
    kernel_size=5,
    stride=1,
    padding=2,
)

model_architecture = (
    (ResidualBlockType.BOTTLENECK, 1, num_channels),
)

model = ResNet(model_architecture, stem_config=stem_config, output_size=10)

# run one random batch through the model before summarizing it
input = torch.rand(256, 3, 32, 32)
output = model(input)

# passing the per-example input size adds the Output Shape column to the table
summary(model, (3, 32, 32), verbose=0)

Layer (type:depth-idx)                   Output Shape              Param #
├─Sequential: 1-1                        [-1, 200, 32, 32]         --
|    └─Conv2d: 2-1                       [-1, 200, 32, 32]         15,200
|    └─BatchNorm2d: 2-2                  [-1, 200, 32, 32]         400
|    └─ReLU: 2-3                         [-1, 200, 32, 32]         --
├─Sequential: 1-2                        [-1, 200, 32, 32]         --
|    └─Sequential: 2-4                   [-1, 200, 32, 32]         --
|    |    └─ResidualBottleNeck: 3-1      [-1, 200, 32, 32]         43,400
├─Sequential: 1-3                        [-1, 10]                  --
|    └─AdaptiveAvgPool2d: 2-5            [-1, 200, 1, 1]           --
|    └─Flatten: 2-6                      [-1, 200]                 --
|    └─Linear: 2-7                       [-1, 10]                  2,010
Total params: 61,010
Trainable params: 61,010
Non-trainable params: 0
Total mult-adds (M): 59.03
Input size (MB): 0.01
Forward/backward pass si