https://arxiv.org/abs/1512.03385  
https://arxiv.org/abs/1812.01187  
https://arxiv.org/abs/1706.02677  

In [None]:
# conclusion? 
# LayerNorm is better than batchNorm. (it came out as an improvement) 
# but, LayerNorm depends on the Emb size, whereas BatchNorm on BS, therefore its easy to implement a dynamic (image size independent) convlayers using batchnorm than layernorm
# https://arxiv.org/abs/1706.02677 init gamma of Normlayer just before the residual connection with 0 helps in learning, why think about it. Hint "residual"
# https://arxiv.org/abs/1812.01187 stem in the beginning and AvgPool replaces FC, helps to reduce computation and parameters respectively
# 1x1 convs used in bottleneck layers are faster than original residual block, on addition to increasing depth

In [1]:
import numpy
import matplotlib.pyplot as plt

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [3]:
from fastai.vision.all import *

In [4]:
# ConvLayer: Conv-Norm-Act
class ConvLayer(nn.Module):
    def __init__(self, ci, co, k=3, s=1, p=None, norm=True, norm_init_zero=False, act=True):
        super().__init__()
        if p is None: p = k//2
        self.conv = nn.Conv2d(ci, co, kernel_size=k, stride=s, padding=p)
        if norm:
            self.norm = nn.BatchNorm2d(co)
            if norm_init_zero:
                self.norm.weight.data.fill_(0)
                if self.norm.bias is not None:
                    self.norm.bias.data.fill_(0)
        else:
            self.norm = None
        self.act = nn.GELU() if act else None

    def forward(self, x):
        x = self.conv(x)
        if self.norm is not None:
            x = self.norm(x)
        if self.act is not None:
            x = self.act(x)
        return x

# ConvLayer(3, 32)

In [5]:
# https://arxiv.org/pdf/1812.01187
def _stem(*sizes):
    net = []
    for i in range(len(sizes)-1):
        net.append(ConvLayer(sizes[i], sizes[i+1], 3, s=2 if i==0 else 1))
    return nn.Sequential(*net, nn.MaxPool2d(kernel_size=3, stride=2, padding=1))

# _stem(3,32,32,64)

In [6]:
def _conv_block(ci, co, s=1, bottleneck=False):
    if bottleneck:
        layers = [
            ConvLayer(ci, co, k=1, s=s, p=0),
            ConvLayer(co, co, s=s),
            ConvLayer(co, co, k=1, s=s, p=0, norm_init_zero=True, act=False),
        ]
    else:
        layers = [
            ConvLayer(ci, co, s=s),
            ConvLayer(co, co, s=s, norm_init_zero=True, act=False),
        ]
    return nn.Sequential(*layers)

# _conv_block(64, 64)

In [7]:
# residual block
class noop(nn.Module):
    def __init__(self):
        super().__init__()
    def forward(self, x):
        return x

class ResBlock(nn.Module):
    def __init__(self, ci, co, s=1, bottle_neck=False):
        super().__init__()
        self.convs = _conv_block(ci, co, s=s, bottleneck=bottle_neck)
        self.idconv = noop() if ci==co else ConvLayer(ci, co, k=1, p=0, act=False) #residual
        self.pool = noop() if s==1 else nn.AvgPool2d(2, ceil_mode=True)
    def forward(self, x):
        out = self.convs(x)
        residual = self.idconv(self.pool(x))
#         if out.shape != residual.shape:
#             residual = self.pool(residual)
        return F.gelu(out + residual)

# ResBlock(30,64,128)

In [8]:
# checking above implementation--
# starting img, bs 128
x = torch.randn((128, 3, 128, 128))
print(x.shape)
x = _stem(3,32,32,64)(x)
print(x.shape) # stem out
x = ResBlock(64, 128)(x) # residual block (bottleneck)
print(x.shape)
# # now create a loop

torch.Size([128, 3, 128, 128])
torch.Size([128, 64, 32, 32])
torch.Size([128, 128, 32, 32])


In [9]:
# stem + [2,2,2,2] blocks
# 3c + 1p +  4 * 2 * 2c
# 3c + 16c
# 19 conv == resnet18

# input goes through stem
# if starting residual block, stride 1 else stride 2
# progressive depth of residual blocks
    # 64 -> 64 -> 128 -> 256 -> 512

# resnet
class ResNet(nn.Module):
    def __init__(self, layers, n_out, bottle_neck=False):
        super().__init__()
        self.stem = _stem(3,32,32,64)
        block_sz = [64,64,128,256,512]
        blocks = []
        for i in range(len(block_sz)-1):
            stride = 1 # if i==0 else 2 # but it sucks, to much compression, we already have Avgpool
            blocks.append(self._make_layer(block_sz[i], block_sz[i+1], layers[i], stride, bottle_neck))
        self.core = nn.Sequential(*blocks)
        ## below could be in Sequential as well, but nvm
        self.final_pool = nn.AdaptiveAvgPool2d(1)
        self.flat = nn.Flatten()
        self.fc = nn.Linear(block_sz[-1], n_out)
        
    def _make_layer(self, _in, _out, blocks, stride, bottle_neck=False):
        net = []
        for _ in range(1, blocks):
            net.append(ResBlock(_in,_out,s=stride, bottle_neck=bottle_neck))
        return nn.Sequential(*net)

    def forward(self, x):
#         import time
#         st = time.monotonic()
        x = self.stem(x)
        x = self.core(x)
        x = self.final_pool(x)
        x = self.flat(x)
        x = self.fc(x)
#         et = time.monotonic()
#         print((et-st)//1) # 13 sec single forward pass, bleh
        return x

x = torch.randn((128, 3, 128, 128))
resnet = ResNet([2,2,2,2], 1000)
resnet(x).shape

torch.Size([128, 1000])

# ResNet
- Init BN's gamma with 0's if the layer is just before the "+" (residual addition)

- stem
    - instead of directly starting with resblock, we start with few convs
    - reason: resblocks are computational expensive, initially when the image is original, it requires more computation
- 4 group of resnet blocks
    - filters size for each blocks are: 64, 128, 256, 512
    - except 1st (just after MaxPool layer), each has stride = 2


# ResNet-18, -34, -50 etc. what are these?
- the architecture contains 18 conv layers


1. get imagenette dataset from fastai
2. get things done  
    2.1 dataaugmentation-- mixup  
3. implement resnet-50 (i think good enough to fit in memory)
4. Train model  
    4.1 raw  
    4.2 label smoothing  1. get imagenette dataset from fastai
2. get things done  
    2.1 dataaugmentation-- mixup  
3. implement resnet-50 (i think good enough to fit in memory)
4. Train model  
    4.1 raw  
    4.2 label smoothing  

In [10]:
from fastai.vision.all import URLs

In [11]:
def get_data(url, presize, resize):
    path = untar_data(url)
    print(path)
    return DataBlock(
        blocks=(ImageBlock, CategoryBlock), get_items=get_image_files,
        splitter=GrandparentSplitter(valid_name='val'),
        get_y=parent_label, item_tfms=Resize(presize),
        batch_tfms=[*aug_transforms(min_scale=0.5, size=resize),
        Normalize.from_stats(*imagenet_stats)],
    ).dataloaders(path, bs=64)
dls = get_data(URLs.IMAGENETTE_320, presize=320, resize=224)

/root/.fastai/data/imagenette2-320


In [12]:
torch.cuda.empty_cache()

In [13]:
train,valid = dls.train, dls.valid

In [15]:
import tqdm

def train_model(model, dataloaders, criterion, optimizer, num_epochs=25, device='cuda'):
    model.to(device)
    for epoch in range(num_epochs):
        print(f'Epoch {epoch}/{num_epochs - 1}')
        for phase in ['train', 'valid']:
            if phase == 'train': model.train()
            else: model.eval()
            lossi = 0.0
            acci = 0

            for inp,targs in tqdm.tqdm(dataloaders[phase]):
                inputs = torch.tensor(inp.cpu().numpy()).to(device)
                labels = torch.tensor(targs.cpu().numpy()).to(device)
                model.zero_grad()
                with torch.set_grad_enabled(phase == 'train'):
                    out = model(inputs)
                    preds = torch.argmax(out.softmax(1), 1)
                    loss = criterion(out, labels)
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()
                lossi += loss.item() * inputs.size(0)
                acci  += torch.sum(preds == labels.data)
            
            epoch_loss = lossi / len(dataloaders[phase].dataset)
            epoch_acc = acci.double() / len(dataloaders[phase].dataset)
            print(f'{phase} Loss: {epoch_loss:.4f} Acc: {epoch_acc:.4f}')
    return model


model = ResNet([2, 2, 2, 2], n_out=10) #resnet18
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
dataloaders = {'train': dls.train, 'valid': dls.valid}

model = train_model(model, dataloaders, criterion, optimizer, num_epochs=20)

Epoch 0/19


100%|██████████| 147/147 [01:13<00:00,  2.00it/s]


train Loss: 2.0636 Acc: 0.2871


100%|██████████| 62/62 [00:14<00:00,  4.30it/s]


valid Loss: 1.9042 Acc: 0.3666
Epoch 1/19


100%|██████████| 147/147 [01:13<00:00,  2.00it/s]


train Loss: 1.8198 Acc: 0.3877


100%|██████████| 62/62 [00:14<00:00,  4.35it/s]


valid Loss: 1.7808 Acc: 0.4120
Epoch 2/19


100%|██████████| 147/147 [01:13<00:00,  2.01it/s]


train Loss: 1.7112 Acc: 0.4273


100%|██████████| 62/62 [00:14<00:00,  4.31it/s]


valid Loss: 1.6898 Acc: 0.4548
Epoch 3/19


100%|██████████| 147/147 [01:13<00:00,  2.01it/s]


train Loss: 1.6185 Acc: 0.4636


100%|██████████| 62/62 [00:14<00:00,  4.35it/s]


valid Loss: 1.6109 Acc: 0.4874
Epoch 4/19


100%|██████████| 147/147 [01:13<00:00,  2.00it/s]


train Loss: 1.5412 Acc: 0.4963


100%|██████████| 62/62 [00:14<00:00,  4.40it/s]


valid Loss: 1.5624 Acc: 0.5085
Epoch 5/19


100%|██████████| 147/147 [01:13<00:00,  2.00it/s]


train Loss: 1.4691 Acc: 0.5168


100%|██████████| 62/62 [00:14<00:00,  4.34it/s]


valid Loss: 1.4333 Acc: 0.5615
Epoch 6/19


100%|██████████| 147/147 [01:13<00:00,  2.00it/s]


train Loss: 1.4168 Acc: 0.5442


100%|██████████| 62/62 [00:14<00:00,  4.30it/s]


valid Loss: 1.3961 Acc: 0.5671
Epoch 7/19


100%|██████████| 147/147 [01:13<00:00,  2.01it/s]


train Loss: 1.3694 Acc: 0.5595


100%|██████████| 62/62 [00:14<00:00,  4.34it/s]


valid Loss: 1.3622 Acc: 0.5781
Epoch 8/19


100%|██████████| 147/147 [01:13<00:00,  2.00it/s]


train Loss: 1.3349 Acc: 0.5669


100%|██████████| 62/62 [00:14<00:00,  4.35it/s]


valid Loss: 1.3260 Acc: 0.5819
Epoch 9/19


100%|██████████| 147/147 [01:13<00:00,  2.00it/s]


train Loss: 1.3015 Acc: 0.5745


100%|██████████| 62/62 [00:14<00:00,  4.26it/s]


valid Loss: 1.2955 Acc: 0.5969
Epoch 10/19


100%|██████████| 147/147 [01:13<00:00,  2.00it/s]


train Loss: 1.2682 Acc: 0.5888


100%|██████████| 62/62 [00:14<00:00,  4.31it/s]


valid Loss: 1.2743 Acc: 0.6079
Epoch 11/19


100%|██████████| 147/147 [01:13<00:00,  2.01it/s]


train Loss: 1.2419 Acc: 0.5966


100%|██████████| 62/62 [00:14<00:00,  4.36it/s]


valid Loss: 1.2932 Acc: 0.6041
Epoch 12/19


100%|██████████| 147/147 [01:13<00:00,  2.01it/s]


train Loss: 1.2137 Acc: 0.6106


100%|██████████| 62/62 [00:14<00:00,  4.35it/s]


valid Loss: 1.3055 Acc: 0.5893
Epoch 13/19


100%|██████████| 147/147 [01:13<00:00,  2.00it/s]


train Loss: 1.1981 Acc: 0.6100


100%|██████████| 62/62 [00:14<00:00,  4.32it/s]


valid Loss: 1.2926 Acc: 0.6008
Epoch 14/19


100%|██████████| 147/147 [01:13<00:00,  2.00it/s]


train Loss: 1.1678 Acc: 0.6267


100%|██████████| 62/62 [00:14<00:00,  4.32it/s]


valid Loss: 1.2646 Acc: 0.6000
Epoch 15/19


100%|██████████| 147/147 [01:13<00:00,  2.00it/s]


train Loss: 1.1547 Acc: 0.6266


100%|██████████| 62/62 [00:14<00:00,  4.35it/s]


valid Loss: 1.3014 Acc: 0.5939
Epoch 16/19


100%|██████████| 147/147 [01:13<00:00,  2.00it/s]


train Loss: 1.1335 Acc: 0.6347


100%|██████████| 62/62 [00:14<00:00,  4.37it/s]


valid Loss: 1.2180 Acc: 0.6166
Epoch 17/19


100%|██████████| 147/147 [01:13<00:00,  2.00it/s]


train Loss: 1.1202 Acc: 0.6379


100%|██████████| 62/62 [00:14<00:00,  4.34it/s]


valid Loss: 1.1789 Acc: 0.6316
Epoch 18/19


100%|██████████| 147/147 [01:13<00:00,  2.00it/s]


train Loss: 1.0954 Acc: 0.6477


100%|██████████| 62/62 [00:14<00:00,  4.35it/s]


valid Loss: 1.2575 Acc: 0.6171
Epoch 19/19


100%|██████████| 147/147 [01:13<00:00,  2.00it/s]


train Loss: 1.0783 Acc: 0.6574


100%|██████████| 62/62 [00:14<00:00,  4.32it/s]

valid Loss: 1.1130 Acc: 0.6596



