# A basic training loop

## From the last notebook...

In [1]:
import pickle, gzip, torch, math, numpy as np, torch.nn.functional as F
from pathlib import Path
from IPython.core.debugger import set_trace
from dataclasses import dataclass

#support type hints
from typing import Any, Collection, Callable, NewType, List, Union, TypeVar, Optional

from functools import partial, reduce
from numbers import Number

from numpy import array
from torch import nn, optim, tensor, Tensor
from torch.utils.data import TensorDataset, Dataset, DataLoader

**Load the MNIST training images and labels**

The data is downloaded in notebook *_001a_nn_basics* so make sure you have run through that notebook first.

We print out the min and max of the features to get a feel for the range of feature values. In the case of the MNIST dataset, the features for a specific training example correspond to pixel values that, as we can see, range from 0 to ~1.

In [4]:
# Location of the gzipped MNIST pickle on disk.
DATA_PATH = Path('../data')
PATH = DATA_PATH/'mnist'

# NOTE(review): unpickling can execute arbitrary code, so only load this file from a trusted source.
with gzip.open(PATH/'mnist.pkl.gz', 'rb') as f:
    # The pickle unpacks into three entries; the third one is discarded here.
    ((x_train, y_train), (x_valid, y_valid), _) = pickle.load(f, encoding='latin-1')

# Convert the four loaded objects to torch tensors, then display the range of the training features.
x_train,y_train,x_valid,y_valid = map(torch.tensor, (x_train,y_train,x_valid,y_valid))
x_train.min(),x_train.max()

(tensor(0.), tensor(0.9961))

In [11]:
# Shape of the index tensor of non-zero entries in the first training example; its first dimension is their count.
x_train[0].nonzero().shape

torch.Size([166, 1])

**Set the batch size, number of epochs and learning rate**

The fast.ai library uses lots of abbreviations so as to make the code more concise. A reference for abbreviations can be found [here](https://github.com/fastai/fastai_v1/blob/master/docs/abbr.md).

In [13]:
# Hyperparameters shared by the cells below.
bs = 64      # batch size
epochs = 2   # number of epochs
lr = 0.2     # learning rate

**Load the training and validation datasets**

In this case we are passing in two tensors: training features and classification targets.
So each iteration of the TensorDataset will return a tuple of length two with the following form: (x_features, y_target)

In [12]:
# Pair features with targets so that indexing a dataset returns an (x, y) tuple.
train_ds = TensorDataset(x_train, y_train)
valid_ds = TensorDataset(x_valid, y_valid)

In [44]:
# `NewType` demo: the result is a distinct type for static checkers,
# while calling it at runtime simply returns the value passed in.
UserId = NewType('UserId', int)
UserId(524313)

524313

In [47]:
# Tensor type used for loss values. The string given to `NewType` must match the
# variable name, otherwise type checkers reject it and the reported name is misleading.
Rank0Tensor = NewType('Rank0Tensor', Tensor)
# A loss function takes two tensors (model output, targets) and returns a `Rank0Tensor`.
LossFunction = Callable[[Tensor, Tensor], Rank0Tensor]
# Any `nn.Module` is accepted as a model.
Model = nn.Module

In [46]:
def is_listy(x:Any)->bool:
    "Return `True` when `x` is a `list` or a `tuple` (subclasses included), `False` otherwise"
    for seq_type in (tuple, list):
        if isinstance(x, seq_type): return True
    return False

def loss_batch(model, xb, yb, loss_fn, opt=None):
    """Compute the loss of `model` on one batch and optionally take an optimizer step.

    `loss_fn` is called as `loss_fn(model(xb), yb)`. When `opt` is given, the loss is
    back-propagated, the optimizer is stepped, and its gradients are cleared.
    Returns `(loss as a Python number, len(xb))`.
    """
    batch_loss = loss_fn(model(xb), yb)
    should_update = opt is not None

    if should_update:
        batch_loss.backward()
        opt.step()
        # Clear gradients so they do not accumulate into the next batch.
        opt.zero_grad()

    batch_size = len(xb)
    return batch_loss.item(), batch_size

In [48]:
def fit(epochs:int, model:Model, loss_fn:LossFunction, 
        opt:optim.Optimizer, train_dl:DataLoader, valid_dl:DataLoader):
    "Train `model` for `epochs` epochs, printing the size-weighted validation loss after each one"
    for epoch in range(epochs):
        # Training pass: passing `opt` makes `loss_batch` update the weights.
        model.train()
        for xb,yb in train_dl:
            loss_batch(model, xb, yb, loss_fn, opt)

        # Validation pass: no optimizer and no gradient tracking.
        model.eval()
        with torch.no_grad():
            results = [loss_batch(model, xb, yb, loss_fn) for xb,yb in valid_dl]
        losses,nums = zip(*results)
        # Weight each batch loss by its batch size before averaging.
        weighted = np.multiply(losses, nums)
        val_loss = np.sum(weighted) / np.sum(nums)

        print(epoch, val_loss)

In [49]:
#export
class Lambda(nn.Module):
    "Wrap a callable `func` so it can be used as a layer, e.g. inside `nn.Sequential`"
    def __init__(self, func):
        super().__init__()
        self.func = func

    def forward(self, x):
        "Apply the wrapped callable to `x` and return its result"
        return self.func(x)

## Simplify nn.Sequential layers

**Function Composition** 

Function composition is a great way to capture and parameterize common operations as a single concept. You can see this with the *PoolFlatten* function below.

The *Lambda* layer we created in the last notebook makes it easy to quickly create Pytorch layers for different purposes.

In [50]:
#export
def ResizeBatch(*size):
    "Layer that views its input with shape `(-1, *size)`, so the leading dimension is inferred"
    target_shape = (-1,) + size
    return Lambda(lambda x: x.view(target_shape))
def Flatten():
    "Layer that keeps the first dimension and collapses all remaining ones into a single dimension"
    def _flatten(x): return x.view(x.size(0), -1)
    return Lambda(_flatten)
def PoolFlatten(): 
    "Return a layer that applies `nn.AdaptiveAvgPool2d` with output size 1 and then flattens the result"
    pool = nn.AdaptiveAvgPool2d(1)
    return nn.Sequential(pool, Flatten())

**Define the model**

Thanks to our named Pytorch nn.Modules above, the meaning and intention of each of the layers in our model is clearer and less prone to error when we make changes.

On kernel size: We will nearly always use small kernels of size 3 due to the reasons presented in section 2.3 in [this](https://arxiv.org/pdf/1409.1556.pdf) paper (mainly a few small kernels achieve a receptive field of the same dimension as one bigger kernel while at the same time achieving increased discriminative power and using fewer parameters). 

Next we have a stripped down CNN example:

In [51]:
# Stripped-down CNN: three stride-2 3x3 convolutions, each followed by ReLU, then global average pooling.
model = nn.Sequential(
    ResizeBatch(1,28,28),  # view the input as a batch of 1-channel 28x28 images
    nn.Conv2d(1,  16, kernel_size=3, stride=2, padding=1), nn.ReLU(),  # 28x28 -> 14x14
    nn.Conv2d(16, 16, kernel_size=3, stride=2, padding=1), nn.ReLU(),  # 14x14 -> 7x7
    nn.Conv2d(16, 10, kernel_size=3, stride=2, padding=1), nn.ReLU(),  # 7x7 -> 4x4, 10 channels
    PoolFlatten()  # average each channel down to one value and flatten: 10 outputs per example
)

**Define a *get_data* function**

It's often convenient to define a *get_data* function that encapsulates the work of setting up the training, validation and sometimes test data. Parameterizing *get_data* makes it easy to do things like change the batch size, etc.

Notice in this scenario that we shuffle the training dataloader but not the validation dataloader. We want the validation loss to be calculated the same way every time so that we can tell if we are still learning and not overfitting. Shuffling the training data helps prevent overfitting when calculating the gradients to be applied after each batch.

In [52]:
def get_data(train_ds, valid_ds, bs):
    "Return `(train_dl, valid_dl)`: a shuffled training loader and an unshuffled validation loader"
    train_loader = DataLoader(train_ds, batch_size=bs, shuffle=True)
    # Validation uses batches twice as large and keeps the default (unshuffled) order.
    valid_loader = DataLoader(valid_ds, batch_size=bs*2)
    return train_loader, valid_loader

# Build both loaders from the datasets and batch size defined earlier.
train_dl,valid_dl = get_data(train_ds, valid_ds, bs)

**Set loss function**

[Here](https://rdipietro.github.io/friendly-intro-to-cross-entropy-loss/) is a tutorial explaining why cross entropy is a reasonable loss function for classification tasks.

In [53]:
# Use PyTorch's functional cross-entropy as the loss.
loss_fn = F.cross_entropy

**Set optimizer**

We stick with stochastic gradient descent without momentum as our optimizer. This is a basic optimizer and it is [easy to understand](http://ruder.io/optimizing-gradient-descent/index.html#stochasticgradientdescent). We will move into better optimizers as we go forward.

In [54]:
# SGD over all model parameters; no momentum argument is passed.
opt = optim.SGD(model.parameters(), lr=lr)

**Test our loss function**

We try out our loss function on one batch of X features and y targets to make sure it's working correctly.

In [55]:
# Sanity check: loss of the model on the first `bs` validation examples.
loss_fn(model(x_valid[0:bs]), y_valid[0:bs])

tensor(2.3037, grad_fn=<NllLossBackward>)

**Fit**

Everything looks ready, so we call the fit function we developed earlier for five epochs to confirm that the model learns.

In [57]:
# Override the earlier `epochs` value and train for five epochs.
epochs=5
fit(epochs, model, loss_fn, opt, train_dl, valid_dl)

0 0.43291589860916135
1 0.6470388515472412
2 0.3106609450340271
3 0.27235843381881714
4 0.42025797719955443


## Transformations

We are going to refactor some of the data transformations out of the network and into a pipeline that is applied to the data being fed into the DataLoaders.

This is more flexible, simplifies the model, and will be useful later when we want to apply additional transformations for things like data augmentation.

**Define transformations**

In this example our only transformation will be *mnist2image*. This is a utility function to reshape our features into 28x28 arrays.

X is a batch of features where the first dimension is the number of samples in the batch and the remaining dimensions define the shape of the training example. y is the target variable to be learned; in this case it is an integer representing one of 10 image classes.

With MNIST data, each example starts out as a flat vector of 784 values and we want to convert it to a 1x28x28 image. This helper function does that for a single example; the dataset wrapper defined below applies it to every item as it is fetched, before the items are collated into a batch.

In [67]:
def mnist2image(b):
    "View a single flat MNIST example `b` (784 values) as a 1-channel image of shape (1, 28, 28)"
    image_shape = (1, 28, 28)
    return b.view(image_shape)

Similar to this
```
class FMNIST(Dataset):
    # Illustrative dataset: stores features and labels and applies optional `transforms` per item.
    def __init__(self,X,y,transforms=None):
        self.X = X[:,:,:,None] # n,28,28 to n,28,28,1
        self.y = y
        self.n = len(X)  # number of examples
        self.transforms = transforms
    
    def __getitem__(self,index):
        # Fetch one example and its label, transforming the example only when transforms were given.
        img = self.X[index]
        label = self.y[index]
        if self.transforms is not None:
            img = self.transforms(img)
        return (img,label)
    
    def __len__(self):
        return self.n
```

In [68]:
#export
# @dataclass
# class TfmDataset(Dataset):
#     ds: Dataset
#     tfm: Callable = None
        
#     def __len__(self): return len(self.ds)
    
#     def __getitem__(self,idx):
#         x,y = self.ds[idx]
#         if self.tfm is not None: x = self.tfm(x)
#         return x,y
    
class TfmDataset(Dataset):
    """Wrap dataset `ds` and apply `tfm` to the `x` of every `(x, y)` item it returns.

    `tfm` may be `None` (items are returned unchanged), a single callable, or a
    collection of callables that are applied to `x` in order. `y` is never modified.
    """
    def __init__(self,ds,tfm=None):
        # `ds` must return an `(x, y)` pair when indexed and support `len`.
        self.ds = ds
        self.tfm = tfm

    def _tfms(self):
        "Normalize `self.tfm` into a tuple of callables (empty when there is nothing to apply)"
        if self.tfm is None: return ()
        if callable(self.tfm): return (self.tfm,)
        return tuple(self.tfm)

    def __getitem__(self,idx):
        x,y = self.ds[idx]
        # `self.tfm` is read on every access, so reassigning it later takes effect.
        for tfm in self._tfms(): x = tfm(x)
        return x,y

    def __len__(self):
        return len(self.ds)

In [69]:
# Wrap both datasets so every example is reshaped by `mnist2image` when it is fetched.
train_tds = TfmDataset(train_ds, mnist2image)
valid_tds = TfmDataset(valid_ds, mnist2image)

In [42]:
def get_data(train_ds, valid_ds, bs):
    """Return `(train_dl, valid_dl)` whose examples are reshaped by `mnist2image`.

    The datasets passed in are the ones actually used: each is wrapped in a
    `TfmDataset` here instead of relying on module-level globals. The training
    loader shuffles with batch size `bs`; the validation loader keeps its order
    and uses batch size `bs*2`.
    """
    return (DataLoader(TfmDataset(train_ds, mnist2image), bs,   shuffle=True),
            DataLoader(TfmDataset(valid_ds, mnist2image), bs*2, shuffle=False))

In [43]:
# Rebuild the loaders; batches now come out as 1x28x28 images.
train_dl,valid_dl = get_data(train_ds, valid_ds, bs)

We make some checks to make sure that *mnist2image* is working correctly:
1. The input and output shapes are as expected
2. The input and output data (features) are the same

In [23]:
# Take the first batch from the validation loader.
x,y = next(iter(valid_dl))

In [26]:
# Shapes of the batch features and of the batch targets.
x.size()
y.size()

torch.Size([128, 1, 28, 28])

torch.Size([128])

In [24]:
# Shape of the first raw validation example next to the shape of the first example in the batch.
valid_ds[0][0].shape, x[0].shape

(torch.Size([784]), torch.Size([1, 28, 28]))

In [25]:
# Flattening the transformed example should give back the original feature values.
torch.allclose(valid_ds[0][0], x[0].view(-1))

True

## Refactor network

**Define layer types and loop over them**

When we use a layer type more than once in a contiguous fashion (one after the other), it makes sense to define a function for that layer type and then use that function to build our model function. 

That is what we do here with *conv2d_relu*, which lets us avoid repeating the three nearly identical convolution + ReLU lines from the earlier model definition (this saving becomes more significant in deeper networks).

In [58]:
def conv2d(ni:int, nf:int, ks:int=3, stride:int=1, padding:int=None, bias=False) -> nn.Conv2d:
    "Create `nn.Conv2d` layer: `ni` inputs, `nf` outputs, `ks` kernel size. `padding` defaults to `ks//2`"
    pad = ks//2 if padding is None else padding
    return nn.Conv2d(in_channels=ni, out_channels=nf, kernel_size=ks,
                     stride=stride, padding=pad, bias=bias)

def conv2d_relu(ni:int, nf:int, ks:int=3, stride:int=1, 
                padding:int=None, bn:bool=False) -> nn.Sequential:
    "Create a `conv2d` layer followed by `nn.ReLU`; with `bn`, append `nn.BatchNorm2d(nf)` after the ReLU"
    conv, act = conv2d(ni, nf, ks=ks, stride=stride, padding=padding), nn.ReLU()
    # The optional batch norm goes after the activation.
    extra = [nn.BatchNorm2d(nf)] if bn else []
    return nn.Sequential(conv, act, *extra)

def conv2d_trans(ni:int, nf:int, ks:int=2, stride:int=2, padding:int=0) -> nn.ConvTranspose2d:
    "Create `nn.ConvTranspose2d` layer: `ni` inputs, `nf` outputs, `ks` kernel size. `padding` defaults to 0"
    layer_kwargs = dict(kernel_size=ks, stride=stride, padding=padding)
    return nn.ConvTranspose2d(ni, nf, **layer_kwargs)

In [59]:
#export
# def conv2_relu(nif, nof, ks, stride):
#     return nn.Sequential(nn.Conv2d(nif, nof, ks, stride, padding=ks//2), nn.ReLU())

def simple_cnn(actns, kernel_szs, strides):
    "Stack one `conv2d_relu` per entry of `strides`, then finish with `PoolFlatten`"
    layers = []
    for i in range(len(strides)):
        # Layer `i` maps `actns[i]` channels to `actns[i+1]` channels.
        layers.append(conv2d_relu(actns[i], actns[i+1], kernel_szs[i], stride=strides[i]))
    return nn.Sequential(*layers, PoolFlatten())

In [45]:
def get_model():
    "Return a fresh `simple_cnn` and an SGD optimizer over its parameters, using the global `lr`"
    net = simple_cnn([1,16,16,10], [3,3,3], [2,2,2])
    sgd = optim.SGD(net.parameters(), lr=lr)
    return net, sgd

In [46]:
# Fresh model and matching optimizer.
model,opt = get_model()

In [47]:
%%time
# Train the refactored model; the cell magic above reports how long the whole cell takes.
fit(epochs, model, loss_fn, opt, train_dl, valid_dl)

0 1.4467840166091919
1 0.5012163402557374
CPU times: user 19.6 s, sys: 1.09 s, total: 20.7 s
Wall time: 9.87 s


## CUDA

**Run in GPU and add progress bar**

To run our Pytorch networks in the GPU we have to specify it in the code. This is done by setting *torch.device('cuda')*. We will also add a progress bar to keep track of the progress during training. This we accomplish with the *tqdm_notebook* module of the [tqdm](https://github.com/tqdm/tqdm) package.

We integrate both these features into a custom Dataloader which we build on top of the Pytorch Dataloader.

In [79]:
# #export
# from tqdm import tqdm, tqdm_notebook, trange, tnrange
# from ipykernel.kernelapp import IPKernelApp

# def in_notebook(): return IPKernelApp.initialized()

# def to_device(device, b): return [o.to(device) for o in b]
# default_device = torch.device('cuda')

# if in_notebook():
#     tqdm = tqdm_notebook
#     trange = tnrange




# class DeviceDataLoader():
#     '''
#     Custom Data Loader (GPU training + tqdm progress bar)
#     '''
#     def __init__(self,dl,device,progress_func = None):
#         self.dl,self.device,self.progress_func = dl,device,progress_func
        
#     def __len__(self): return len(self.dl)
#     def __iter__(self):
#         self.gen = (to_device(self.device,o) for o in self.dl) # each o is (X,y), and will turn into [X in cuda, y in cuda]
#         if self.progress_func is not None:
#             self.gen = self.progress_func(self.gen, total=len(self.dl), leave=False)
#         return iter(self.gen)


#     '''
#     Return DeviceDataLoader obj
#     '''
#     @classmethod
#     def create(cls, *args, device=default_device, progress_func=tqdm, **kwargs):
#         return cls(DataLoader(*args, **kwargs), device=device, progress_func=progress_func)

In [60]:
#export
def ifnone(a:Any,b:Any)->Any:
    "`a` if it's not None, otherwise `b`"
    # Only `None` triggers the fallback; falsy values such as 0 or False are returned as-is.
    return b if a is None else a

# Use the GPU when CUDA is available; otherwise fall back to the CPU.
default_device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# Recursive alias: a tensor, or a collection whose elements are themselves `Tensors`.
Tensors = Union[Tensor, Collection['Tensors']]

def to_device(b:Tensors, device:torch.device):
    "Move `b` to `device` (`default_device` when `device` is None), recursing into lists and tuples"
    target = ifnone(device, default_device)
    if not is_listy(b): return b.to(target)
    # Lists and tuples both come back as a list of moved items.
    return [to_device(item, target) for item in b]

@dataclass
class DeviceDataLoader():
    "Wrapper around a `DataLoader` whose batches are moved to `device` as they are produced"
    dl: DataLoader        # wrapped loader that yields the batches
    device: torch.device  # destination handed to `to_device` for every batch

    def __len__(self) -> int:
        "Number of batches, as reported by the wrapped loader"
        return len(self.dl)

    def proc_batch(self,b:Tensors):
        "Move one batch `b` to `self.device`"
        return to_device(b, self.device)

    def __iter__(self)->Tensors:
        "Iterate over `dl`, passing every batch through `proc_batch`"
        batches = map(self.proc_batch, self.dl)
        # The lazy `map` object is also kept on the instance as `gen`.
        self.gen = batches
        return iter(batches)

    @classmethod
    def create(cls, *args, device:torch.device=default_device, **kwargs): 
        "Build a `DataLoader` from `*args` and `**kwargs` and wrap it for `device`"
        loader = DataLoader(*args, **kwargs)
        return cls(loader, device=device)

In [70]:
def get_data(train_ds, valid_ds, bs):
    """Return `(train_dl, valid_dl)` as `DeviceDataLoader`s built from the given datasets.

    The datasets passed in are the ones actually used (previously the arguments were
    ignored in favor of module-level globals). The training loader shuffles with batch
    size `bs`; the validation loader keeps its order and uses batch size `bs*2`.
    """
    return (DeviceDataLoader.create(train_ds, bs,   shuffle=True),
            DeviceDataLoader.create(valid_ds, bs*2, shuffle=False))

In [71]:
# Rebuild the loaders from the transformed datasets, now wrapped in `DeviceDataLoader`.
train_dl,valid_dl = get_data(train_tds, valid_tds, bs)

In [74]:
#export
def fit(epochs:int, model:Model, loss_fn:LossFunction, 
        opt:optim.Optimizer, train_dl:DataLoader, valid_dl:DataLoader) -> None:
    "Train `model` for `epochs` with `loss_fn` and `opt`, printing the validation loss after each epoch"
    for epoch in range(epochs):
        # Training pass: passing `opt` makes `loss_batch` update the weights on every batch.
        model.train()
        for xb,yb in train_dl: loss_batch(model, xb, yb, loss_fn, opt)

        # Validation pass: no optimizer and no gradient tracking.
        model.eval()
        with torch.no_grad():
            losses,nums = zip(*[loss_batch(model, xb, yb, loss_fn)
                                for xb,yb in valid_dl])
        # Weight each batch loss by its batch size so a smaller batch is not over-counted.
        val_loss = np.sum(np.multiply(losses,nums)) / np.sum(nums)

        print(epoch, val_loss)

In [72]:
def get_model():
    "Return a fresh `simple_cnn` moved to `default_device`, plus an SGD optimizer using the global `lr`"
    net = simple_cnn([1,16,16,10], [3,3,3], [2,2,2]).to(default_device)
    sgd = optim.SGD(net.parameters(), lr=lr)
    return net, sgd

In [73]:
# Fresh model on the default device, with a matching optimizer.
model,opt = get_model()

In [75]:
%%time
# Train with the device-aware loaders; the cell magic above reports how long the whole cell takes.
fit(epochs, model, loss_fn, opt, train_dl, valid_dl)

0 0.9289371654510498
1 0.7716784593105316
2 0.5402628433704376
3 0.6262347139358521
4 0.2591162664413452
CPU times: user 8.47 s, sys: 340 ms, total: 8.81 s
Wall time: 9.16 s


## Learner

**Define learner**

Finally, we are missing a learner class to close the gap between our loaded data and our model. The learner class will receive our loaded data (after transformations) and the model and we will be able to call fit on it to start the training phase.

Note that we must define another fit function to track the progress of our training with the progress bar we included in the Dataloader.

In [63]:
# #export
# def fit(epochs, model, loss_fn, opt, train_dl, valid_dl):
#     for epoch in tnrange(epochs):
#         model.train()
#         for xb,yb in train_dl:
#             loss,_ = loss_batch(model, xb, yb, loss_fn, opt)
#             if train_dl.progress_func is not None: train_dl.gen.set_postfix_str(loss)

#         model.eval()
#         with torch.no_grad():
#             losses,nums = zip(*[loss_batch(model, xb, yb, loss_fn)
#                                 for xb,yb in valid_dl])
#         val_loss = np.sum(np.multiply(losses,nums)) / np.sum(nums)

#         print(epoch, val_loss)

In [80]:
# #export
# class DataBunch():
#     def __init__(self, train_ds, valid_ds, bs=64, device=None, train_tfm=None, valid_tfm=None):
#         self.device = default_device if device is None else device
#         self.train_dl = DeviceDataLoader.create(TfmDataset(train_ds,train_tfm), bs, shuffle=True)
#         self.valid_dl = DeviceDataLoader.create(TfmDataset(valid_ds, valid_tfm), bs*2, shuffle=False)
        

# class Learner():
#     '''
#     Encapsulate data (dataloader) and model, also to fit model given opt,loss fn, lr,epochs ...
#     '''
#     def __init__(self, data, model):
#         self.data,self.model = data,model.to(data.device)

#     def fit(self, epochs, lr, opt_fn=optim.SGD):
#         opt = opt_fn(self.model.parameters(), lr=lr)
#         loss_fn = F.cross_entropy
#         fit(epochs, self.model, loss_fn, opt, self.data.train_dl, self.data.valid_dl)

In [82]:
#export
# Type of an item flowing through a transform.
TItem = TypeVar('TItem')
# A transform takes an item and returns an item of the same type.
TfmCallable = Callable[[TItem],TItem]
# Either one transform or a collection of transforms.
TfmList = Union[TfmCallable, Collection[TfmCallable]]
# Optional variant: `None` is also accepted.
Tfms = Optional[TfmList]

@dataclass
class DataBunch():
    "Bind `train_dl`, `valid_dl` to `device`"
    train_dl:DataLoader
    valid_dl:DataLoader
    device:torch.device=None

    @classmethod
    def create(cls, train_ds:Dataset, valid_ds:Dataset, bs:int=64, 
               train_tfm:Tfms=None, valid_tfm:Tfms=None, device:torch.device=None, **kwargs):
        "Wrap both datasets in `TfmDataset`, build a `DeviceDataLoader` for each and bundle them"
        # Training data is shuffled with batch size `bs`.
        train_dl = DeviceDataLoader.create(TfmDataset(train_ds, train_tfm), bs,
                                           shuffle=True,  device=device, **kwargs)
        # Validation data keeps its order and uses batch size `bs*2`.
        valid_dl = DeviceDataLoader.create(TfmDataset(valid_ds, valid_tfm), bs*2,
                                           shuffle=False, device=device, **kwargs)
        return cls(train_dl, valid_dl, device=device)

class Learner():
    "Train `model` on `data` for `epochs` using learning rate `lr` and `opt_fn` to optimize training"
    def __init__(self, data:DataBunch, model:Model):
        # The model is moved to the same device the data bunch was created for.
        self.data,self.model = data,to_device(model, data.device)

    def fit(self, epochs, lr, opt_fn=optim.SGD, loss_fn=F.cross_entropy):
        """Train `self.model` for `epochs` epochs.

        A new optimizer is created on every call as `opt_fn(parameters, lr=lr)`.
        `loss_fn` defaults to `F.cross_entropy`, the loss that was previously
        hard-coded, so existing calls behave as before.
        """
        opt = opt_fn(self.model.parameters(), lr=lr)
        fit(epochs, self.model, loss_fn, opt, self.data.train_dl, self.data.valid_dl)

In [80]:
# Build loaders whose examples are reshaped by `mnist2image` for both training and validation.
data = DataBunch.create(train_ds, valid_ds, bs=bs, train_tfm=mnist2image, valid_tfm=mnist2image)
model = simple_cnn([1,16,16,10], [3,3,3], [2,2,2])
learner = Learner(data, model)
# SGD with momentum 0.9; `Learner.fit` supplies the parameters and the learning rate.
opt_fn = partial(optim.SGD, momentum=0.9)

In [81]:
# Four epochs at one fifth of the base learning rate.
learner.fit(4, lr/5, opt_fn=opt_fn)

0 0.457115455532074
1 0.3451181959152222
2 0.2594533142089844
3 0.2409851318359375


In [None]:
# Start again from a newly constructed model.
learner = Learner(data, simple_cnn([1,16,16,10], [3,3,3], [2,2,2]))

In [None]:
# Low / high / low learning rates; each call builds a new optimizer, so momentum state does not carry over.
learner.fit(1, lr/5, opt_fn=opt_fn)
learner.fit(2, lr, opt_fn=opt_fn)
learner.fit(1, lr/5, opt_fn=opt_fn)