In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
#export
from nb_002 import *

import typing
from typing import Dict, Any, AnyStr, List, Sequence, TypeVar, Tuple, Optional, Union

In [None]:
# Dataset locations: everything lives under ./data, and this notebook uses the
# `cifar10` sub-folder (the commented-out line points at an alternative folder).
DATA_PATH = Path('data')
# PATH = DATA_PATH/'cifar10_dog_air'
PATH = DATA_PATH/'cifar10'

# Training items come from `train/`, validation items from `test/`.
# The validation set is given the training set's `classes` as second argument
# (presumably so both share the same class list -- confirm against FilesDataset).
train_ds = FilesDataset(PATH/'train')
valid_ds = FilesDataset(PATH/'test', train_ds.classes)

# First element of the item at index 1 (presumably the image -- TODO confirm).
x = train_ds[1][0]
# Batch size used for the data loaders built further down.
bs=256
# Number of classes; used later as the output size of the models.
c = len(train_ds.classes)
# Last expression of the cell: the notebook displays the training set size.
len(train_ds)

# CIFAR augmentation

## Data

In [None]:
# Transforms applied to training images, in this order: `flip_lr` with p=0.5,
# `pad` with padding=4, then `crop` to size 32 with row/col position ranges (0,1).
# NOTE(review): presumably a random horizontal flip followed by a random 32x32 crop
# of the padded image -- confirm against the transform definitions in nb_002.
tfms = [flip_lr(p=0.5),
        pad(padding=4),
        crop(size=32, row_pct=(0,1.), col_pct=(0,1.))]

In [None]:
#export
class DatasetTfm(Dataset):
    """Dataset wrapper that runs `tfms` on the input part of every item of `ds`.

    Items of `ds` are expected to unpack into an `(x, y)` pair. Only `x` is
    transformed; `y` is returned untouched. Extra keyword arguments are stored
    and forwarded to `apply_tfms` on each access.
    """
    def __init__(self, ds:Dataset, tfms:Collection[Callable]=None, **kwargs):
        self.ds = ds
        self.tfms = tfms
        self.kwargs = kwargs

    def __len__(self):
        """Length of the wrapped dataset."""
        return len(self.ds)

    def __getitem__(self,idx):
        """Return `(x, y)` for `idx`; `x` goes through `tfms` unless `tfms` is None."""
        x,y = self.ds[idx]
        # Without transforms the wrapped item is passed through unchanged.
        if self.tfms is None: return x,y
        return apply_tfms(self.tfms, x, **self.kwargs), y

In [None]:
# Training set wrapped so that `tfms` is applied to the input each time an item is fetched.
train_tds = DatasetTfm(train_ds, tfms)

In [None]:
# Show item 1 of the transformed dataset in four side-by-side panels. The item is
# fetched anew for every panel, so the transforms run once per panel
# (presumably giving a different random augmentation each time -- confirm).
_,axes = plt.subplots(1,4, figsize=(12,9))
for ax in axes.flat: show_image(train_tds[1][0], ax)

## Normalization and training

The first step in training our network is to normalize the pixels of the input images. This makes the cost function faster and easier to optimize [(see Yann LeCun's paper, section 4.3)](http://yann.lecun.com/exdb/publis/pdf/lecun-98b.pdf).

Normalization is a pixel transform since it directly modifies the pixels of our input image.

In [None]:
#export
def normalize(x, mean,std):
    """Return `x` with `mean` subtracted and the result divided by `std`.

    `mean` and `std` get two trailing unit axes (`[...,None,None]`) so that they
    broadcast over the last two axes of `x`.
    """
    centered = x - mean[...,None,None]
    return centered / std[...,None,None]
def denormalize(x, mean,std):
    """Undo `normalize`: multiply `x` by `std`, then add `mean`.

    `mean` and `std` get two trailing unit axes (`[...,None,None]`) so that they
    broadcast over the last two axes of `x`.
    """
    rescaled = x * std[...,None,None]
    return rescaled + mean[...,None,None]

def normalize_batch(b, mean, std, do_y=False):
    """Normalize the input of batch `b` (an `(x, y)` pair) with `mean` and `std`.

    The target `y` is normalized with the same statistics only when `do_y` is
    true; otherwise it is returned unchanged.
    """
    inputs, targets = b
    inputs = normalize(inputs, mean, std)
    if not do_y: return inputs, targets
    return inputs, normalize(targets, mean, std)

def normalize_funcs(mean, std, do_y=False, device=None):
    """Build a `(normalize, denormalize)` pair of callables bound to `mean` and `std`.

    The first callable takes an `(x, y)` batch and normalizes it with copies of
    `mean`/`std` moved to `device` (`default_device` when `device` is None);
    `do_y` decides whether `y` is normalized too. The second callable undoes the
    normalization on a single tensor and uses `mean`/`std` exactly as passed in
    (they are not moved to `device`).
    """
    if device is None: device=default_device
    # `do_y` must be bound here, otherwise the caller's choice is silently dropped
    # and `normalize_batch` always runs with its own default.
    return (partial(normalize_batch, mean=mean.to(device),std=std.to(device), do_y=do_y),
            partial(denormalize,     mean=mean,           std=std))

@dataclass
class DeviceDataLoader():
    """Wrap a `DataLoader` so that every batch it yields is first sent to `device`.

    After the move, `tfms` (when set) is called on the whole batch and its return
    value is yielded instead. Iteration may be wrapped by `progress_func`.
    """
    dl: DataLoader
    device: torch.device
    # Called as `progress_func(batches, total=len(dl), leave=False)`; None disables it.
    progress_func:Callable=None
    # A single callable invoked on each batch (it is called directly, not iterated over).
    tfms: Callable=None

    def __len__(self):
        """Length of the wrapped `DataLoader`."""
        return len(self.dl)

    def proc_batch(self,b):
        """Send batch `b` to `self.device`, then pass it through `self.tfms` if one is set."""
        b = to_device(self.device,b)
        return b if self.tfms is None else self.tfms(b)

    def __iter__(self):
        """Return an iterator over processed batches; the underlying iterable is kept in `self.gen`."""
        # `map` is lazy: a batch is only processed when it is requested.
        self.gen = map(self.proc_batch, self.dl)
        if self.progress_func is not None:
            self.gen = self.progress_func(self.gen, total=len(self.dl), leave=False)
        return iter(self.gen)

    @classmethod
    def create(cls, *args, device=default_device, progress_func=tqdm, tfms=None, **kwargs):
        """Build the inner `DataLoader` from `*args`/`**kwargs` and wrap it.

        `tfms` defaults to None (no batch post-processing). It must not default to a
        module-level name: that would bind whatever global `tfms` exists when the
        class is defined and then try to call it on every batch.
        """
        return cls(DataLoader(*args, **kwargs), device=device, progress_func=progress_func, tfms=tfms)

In [None]:
# CIFAR-10 mean and std, looked up online rather than computed here
# (three values each -- presumably one per RGB channel).
cifar_mean,cifar_std = map(tensor, ([0.491, 0.482, 0.447], [0.247, 0.243, 0.261]))
# `cifar_norm` normalizes an (x, y) batch; `cifar_denorm` reverses it on a single tensor.
cifar_norm,cifar_denorm = normalize_funcs(cifar_mean,cifar_std)

In [None]:
# Shuffled loader over the un-augmented training set: each batch is sent to the
# default device and then normalized by `cifar_norm`; the progress display is turned off.
train_dl = DeviceDataLoader.create(train_ds, bs, tfms=cifar_norm, shuffle=True, progress_func=None)

In [None]:
# Sanity check on one normalized batch: print its value range, mean and std.
x,y = next(iter(train_dl))
# Bring the batch back to the CPU: `cifar_denorm` holds the mean/std as they were
# passed to `normalize_funcs`, i.e. not moved to the device.
x = x.cpu()
print(x.min(),x.max(),x.mean(),x.std())
# Undo the normalization before displaying the images.
x = cifar_denorm(x)
show_images(x,y,6,train_ds.classes, figsize=(9,10))

In [None]:
#export
class DataBunch():
    """Bundle a training and a validation `DeviceDataLoader` that share one device.

    The training loader shuffles and uses batch size `bs`; the validation loader
    does not shuffle and uses `bs*2`. Remaining keyword arguments are forwarded to
    `DeviceDataLoader.create` for both loaders.
    """
    def __init__(self, train_ds, valid_ds, bs=64, device=None, num_workers=4, **kwargs):
        self.device = default_device if device is None else device
        # Hand the resolved device to both loaders; without this a caller-supplied
        # `device` would only be stored here while batches still go to the loaders' default.
        self.train_dl = DeviceDataLoader.create(train_ds, bs,   shuffle=True,  device=self.device,
                                                num_workers=num_workers, **kwargs)
        self.valid_dl = DeviceDataLoader.create(valid_ds, bs*2, shuffle=False, device=self.device,
                                                num_workers=num_workers, **kwargs)

    @classmethod
    def create(cls, train_ds, valid_ds, train_tfm=None, valid_tfm=None, dl_tfms=None, **kwargs):
        """Wrap both datasets in `DatasetTfm` (with their own transforms) and build the bunch.

        `dl_tfms` is passed on as the loaders' `tfms`, i.e. it is applied per batch.
        """
        return cls(DatasetTfm(train_ds, train_tfm), DatasetTfm(valid_ds, valid_tfm), tfms=dl_tfms, **kwargs)

    @property
    def train_ds(self):
        """Dataset held by the training loader's inner `DataLoader`."""
        return self.train_dl.dl.dataset
    @property
    def valid_ds(self):
        """Dataset held by the validation loader's inner `DataLoader`."""
        return self.valid_dl.dl.dataset

In [None]:
# Training items get the `tfms` augmentations, validation items get none
# (`valid_tfm` is left at None); batches of both loaders are normalized by `cifar_norm`.
data = DataBunch.create(train_ds, valid_ds, bs=bs, train_tfm=tfms, dl_tfms=cifar_norm, num_workers=12)

In [None]:
# Same sanity check as before, this time on a batch coming out of the DataBunch.
x,y = next(iter(data.train_dl))
# Back to the CPU, since `cifar_denorm` holds mean/std that were not moved to the device.
x = x.cpu()
print(x.min(),x.max(),x.mean(),x.std())
# Undo the normalization before displaying the images.
x = cifar_denorm(x)
show_images(x,y,6,train_ds.classes, figsize=(9,10))

In [None]:
# Baseline: a small CNN from `simple_cnn`, ending in `c` outputs.
# NOTE(review): the three lists presumably are channel sizes, kernel sizes and
# strides -- confirm against `simple_cnn`.
learn = Learner(data, simple_cnn([3,16,16,c], [3,3,3], [2,2,2]))
# Optimizer factory: SGD with momentum 0.9; remaining arguments are supplied by the caller.
opt_fn = partial(optim.SGD, momentum=0.9)

In [None]:
# Train the baseline (presumably 1 epoch at learning rate 0.1 -- confirm against `Learner.fit`).
learn.fit(1, 0.1, opt_fn=opt_fn)

# Darknet

Now we are going to try our transforms on an architecture similar to [darknet-53](https://pjreddie.com/media/files/papers/yolo.pdf). Note that it is not the whole architecture, just the part of it that the authors pre-trained on ImageNet (see paper, section 2.2). This is the basis of any modern ResNet-based architecture and it is good for experimenting.

If you are interested in a full, step-by-step description of this architecture, please refer to a [video explanation](https://youtu.be/ondivPiwQho?t=0h11m07s) in Lesson 12 of Part 2 of the course or a [written transcript](https://medium.com/@hiromi_suenaga/deep-learning-2-part-2-lesson-12-215dfbf04a94), courtesy of @hiromi.

In [None]:
#export
def conv_layer(ni, nf, ks=3, stride=1):
    """Return a `nn.Sequential` of a bias-free conv, batch norm and LeakyReLU(0.1).

    The convolution maps `ni` to `nf` channels with kernel size `ks`, the given
    `stride` and a padding of `ks//2`. The activation works in place.
    """
    conv = nn.Conv2d(ni, nf, kernel_size=ks, bias=False, stride=stride, padding=ks//2)
    norm = nn.BatchNorm2d(nf)
    act = nn.LeakyReLU(negative_slope=0.1, inplace=True)
    return nn.Sequential(conv, norm, act)

class ResLayer(nn.Module):
    """Residual block: a 1x1 `conv_layer` halving the channels, a 3x3 one restoring them, plus a skip connection."""
    def __init__(self, ni):
        super().__init__()
        half = ni//2
        self.conv1 = conv_layer(ni, half, ks=1)
        self.conv2 = conv_layer(half, ni, ks=3)

    def forward(self, x):
        """Return `x` plus the output of the two stacked conv layers applied to `x`."""
        branch = self.conv1(x)
        branch = self.conv2(branch)
        return x + branch

class Darknet(nn.Module):
    """Darknet-style classifier: a stem conv, groups of residual layers, then pooling and a linear head."""
    def make_group_layer(self, ch_in, num_blocks, stride=1):
        """Return a list with one `conv_layer` doubling `ch_in`, followed by `num_blocks` `ResLayer`s."""
        ch_out = ch_in*2
        group = [conv_layer(ch_in, ch_out, stride=stride)]
        group.extend(ResLayer(ch_out) for _ in range(num_blocks))
        return group

    def __init__(self, num_blocks, num_classes, nf=32):
        """Build the network.

        `num_blocks` lists the number of `ResLayer`s per group, `num_classes` is
        the size of the final linear output and `nf` the width of the stem conv.
        The width doubles with every group.
        """
        super().__init__()
        width = nf
        layers = [conv_layer(3, width, ks=3, stride=1)]
        for idx,n_res in enumerate(num_blocks):
            # Every group uses stride 2 except the second one (index 1), which uses stride 1.
            stride = 1 if idx == 1 else 2
            layers.extend(self.make_group_layer(width, n_res, stride=stride))
            width = width*2
        layers.append(nn.AdaptiveAvgPool2d(1))
        layers.append(Flatten())
        layers.append(nn.Linear(width, num_classes))
        self.layers = nn.Sequential(*layers)

    def forward(self, x):
        """Run `x` through the sequential stack."""
        return self.layers(x)

In [None]:
# Smaller Darknet variant: five groups with 1, 2, 4, 4 and 2 residual layers and a
# stem width of 16. The commented-out line is a larger configuration.
model = Darknet([1, 2, 4, 4, 2], num_classes=c, nf=16)
# model = Darknet([1, 2, 4, 6, 3], num_classes=c, nf=32)
learner = Learner(data, model)
# Optimizer factory: SGD with momentum 0.9.
opt_fn = partial(optim.SGD, momentum=0.9)

In [None]:
# Train the Darknet model (presumably 1 epoch at learning rate 0.1 -- confirm against `Learner.fit`).
learner.fit(1, 0.1, opt_fn=opt_fn)

In [None]:
# for lr in (0.1,0.2,0.4,0.8,0.1,0.01):
#     momentum = 0.95 if lr<0.1 else 0.85 if lr>0.5 else 0.9
#     learner.fit(2, lr, opt_fn=partial(optim.SGD, momentum=momentum))

# Fin