In [None]:
%load_ext autoreload
%autoreload 2

# FP16

In [None]:
#export
from nb_004b import *

In [None]:
# Root data folder and the CIFAR-10 subfolder used by the dataset cells below.
DATA_PATH = Path('data')
PATH = DATA_PATH/'cifar10'

# Normalization statistics, three values each — presumably per-RGB-channel mean/std of CIFAR-10 (TODO confirm).
data_mean,data_std = map(tensor, ([0.491, 0.482, 0.447], [0.247, 0.243, 0.261]))
# Judging by the unpacking, `normalize_funcs` returns a (normalize, denormalize) pair of functions.
cifar_norm,cifar_denorm = normalize_funcs(data_mean, data_std)

# Training-time transforms: left/right flip with p=0.5, 4px padding, then a 32px crop
# (row_pct/col_pct presumably give the range the crop position is drawn from — TODO confirm).
train_tfms = [flip_lr(p=0.5),
              pad(padding=4),
              crop(size=32, row_pct=(0,1.), col_pct=(0,1.))]
# No transforms on the validation set.
valid_tfms = []

# Batch size handed to `DataBunch.create` below.
bs = 64

In [None]:
#export
def to_half(b):
    "Return `[b[0].half(), b[1]]`: the first batch element cast to FP16, the second passed through untouched"
    inputs, targets = b[0], b[1]
    return [inputs.half(), targets]

@dataclass
class DeviceDataLoader():
    "Wraps a `DataLoader` so that each batch is moved to `device`, passed through `tfms` and optionally cast to FP16"
    # The wrapped loader; its length is reported by `__len__`.
    dl: DataLoader
    # Device each batch is sent to via `to_device`.
    device: torch.device
    # A single callable or a list/tuple of callables, each taking a batch and returning a batch.
    tfms: List[Callable]=None
    # When True every batch goes through `to_half` (switched on by `MixedPrecision.on_train_begin`).
    half: bool = False

    def __len__(self): return len(self.dl)

    def proc_batch(self,b):
        "Move batch `b` to `self.device`, apply the transforms in order, then cast to half if `self.half`"
        b = to_device(self.device,b)
        if self.tfms is not None:
            # The field is annotated as a list but callers pass a bare callable; accept both forms.
            tfms = self.tfms if isinstance(self.tfms, (list,tuple)) else [self.tfms]
            for tfm in tfms: b = tfm(b)
        return to_half(b) if self.half else b

    def __iter__(self):
        "Lazily yield processed batches from the wrapped loader"
        # `self.gen` is kept as an attribute in case external code inspects it.
        self.gen = map(self.proc_batch, self.dl)
        return iter(self.gen)

    @classmethod
    def create(cls, *args, device=default_device, tfms=None, half=False, **kwargs):
        """Build the underlying `DataLoader` from `*args`/`**kwargs` and wrap it.

        `tfms` defaults to an explicit `None` instead of relying on the class-body binding of the
        `tfms` field, and `half` is now forwarded rather than being hard-coded to `False`.
        """
        return cls(DataLoader(*args, **kwargs), device=device, tfms=tfms, half=half)
    
# Rebind `DeviceDataLoader` inside the `nb_002b` module to the class defined in this cell.
# NOTE(review): presumably so that code resolving the name through that module (e.g. `DataBunch`)
# builds loaders that support the `half` flag — confirm against `nb_002b`.
import nb_002b
#nb_004b.DeviceDataLoader = DeviceDataLoader
nb_002b.DeviceDataLoader = DeviceDataLoader

In [None]:
# Displays the `DataBunch` class — presumably a leftover check that it is in scope (TODO confirm it is still needed).
DataBunch

In [None]:
# Two-class subset (airplane vs dog) read from the `train` and `test` folders under PATH.
train_ds = FilesDataset.from_folder(PATH/'train', classes=['airplane','dog'])
valid_ds = FilesDataset.from_folder(PATH/'test', classes=['airplane','dog'])
# `cifar_norm` is handed over as `dl_tfms`; presumably it becomes the `tfms` of each `DeviceDataLoader` (TODO confirm).
data = DataBunch.create(train_ds, valid_ds, bs=bs, num_workers=0, 
                        train_tfm=train_tfms, valid_tfm=valid_tfms, dl_tfms=cifar_norm)
# Lengths of the training and validation loaders.
len(data.train_dl), len(data.valid_dl)

In [None]:
# Full-precision setup: the model is not passed through `model2half` here.
model = Darknet([1, 2, 2, 2, 2], num_classes=2, nf=16)
learn = Learner(data, model)
learn.metrics = [accuracy]
# NOTE(review): `sched` is not passed to any `fit` call in the visible cells.
sched = OneCycleScheduler(learn, 0.1, 5)

# FP16

In [None]:
#export
def bn2float(module):
    "Cast every batchnorm layer reachable from `module` (itself included) with `.float()` and return `module`"
    batchnorm_base = torch.nn.modules.batchnorm._BatchNorm
    # Explicit stack instead of recursion; every module is still visited through `children()`.
    pending = [module]
    while pending:
        current = pending.pop()
        if isinstance(current, batchnorm_base): current.float()
        pending.extend(current.children())
    return module

def model2half(model):
    "Cast `model` to half precision, then restore its batchnorm layers to FP32 through `bn2float`"
    half_model = model.half()
    return bn2float(half_model)

Helper functions to keep a master copy of the model parameters in FP32, optionally flattened into a single tensor per layer group (apparently this helps with performance).

In [None]:
#export
from torch._utils import _unflatten_dense_tensors
from torch.nn.utils import parameters_to_vector

In [None]:
def vector_to_parameters1(vec, parameters):
    """Copy consecutive slices of the flat tensor `vec` into each tensor of `parameters`, in order.

    Each parameter receives as many elements as it holds, reshaped to its own size.
    Raises `TypeError` if `vec` is not a `torch.Tensor`; `_check_param_device` is called on every
    parameter with the device information returned for the previous one.
    """
    # This private torch helper was used without ever being imported (a star import does not bring in
    # underscore-prefixed names), so the function raised NameError. Import it where it is needed.
    from torch.nn.utils.convert_parameters import _check_param_device

    if not isinstance(vec, torch.Tensor):
        raise TypeError('expected torch.Tensor, but got: {}'
                        .format(torch.typename(vec)))
    param_device = None
    pointer = 0
    for param in parameters:
        param_device = _check_param_device(param, param_device)
        # Plain int element count instead of a product computed through a temporary LongTensor.
        num_param = param.numel()
        param.data.copy_(vec[pointer:pointer + num_param].view(param.size()).data)
        pointer += num_param

In [None]:
#export
def get_master(layer_groups:Collection[nn.Module], flat_master:bool=False) -> Tuple[List[List[Tensor]], List[List[Tensor]]]:
    """Returns two lists, one for the model parameters and one for their master copies in FP32

    Only parameters with `requires_grad` set are collected, grouped per layer group.
    With `flat_master`, each non-empty group gets a single flat FP32 `Parameter` (wrapped in a
    one-element list) whose `.grad` is pre-allocated; a group without trainable parameters gets
    an empty list. Otherwise each parameter gets its own detached FP32 copy that requires grad.
    """
    model_params = [[param for param in lg.parameters() if param.requires_grad] for lg in layer_groups]
    if flat_master:
        master_params = []
        for lg in model_params:
            if len(lg) == 0:
                # `parameters_to_vector([])` raises, so a fully frozen group maps to an empty master group.
                master_params.append([])
                continue
            mp = parameters_to_vector([param.data.float() for param in lg])
            mp = torch.nn.Parameter(mp, requires_grad=True)
            # Pre-allocate the gradient buffer with zeros rather than uninitialised memory.
            mp.grad = torch.zeros_like(mp.data)
            master_params.append([mp])
        return model_params, master_params
    master_params = [[param.detach().clone().float().requires_grad_(True) for param in lg] for lg in model_params]
    return model_params, master_params

def model_g2master_g(model_params:Sequence[Sequence[Tensor]], master_params:Sequence[Sequence[Tensor]], flat_master:bool=False):
    """Copies the model gradients to the master parameters for the optimizer step

    `model_params` and `master_params` are parallel lists of parameter groups.
    With `flat_master`, the gradients of a group are cast to FP32, flattened and copied into the
    gradient of the group's single master tensor; groups with no master tensor are skipped.
    Otherwise gradients are copied parameter by parameter, and a master gradient is reset to
    `None` when the matching model parameter has no gradient.
    """
    if flat_master:
        for model_group,master_group in zip(model_params,master_params):
            # A group without trainable parameters has no master tensor to index.
            if len(master_group) == 0: continue
            flat_grads = parameters_to_vector([p.grad.data.float() for p in model_group])
            master_group[0].grad.data.copy_(flat_grads)
    else:
        for model_group,master_group in zip(model_params,master_params):
            for model, master in zip(model_group, master_group):
                if model.grad is not None:
                    # `empty_like` keeps the exact shape (0-dim tensors included); the buffer is overwritten right below.
                    if master.grad is None: master.grad = torch.empty_like(master.data)
                    master.grad.data.copy_(model.grad.data)
                else: master.grad = None

def master2model(model_params:Sequence[Sequence[Tensor]], master_params:Sequence[Sequence[Tensor]], flat_master:bool=False):
    """Copy master parameters to model parameters

    `model_params` and `master_params` are parallel lists of parameter groups.
    With `flat_master`, the single flat master tensor of each group is split back into pieces shaped
    like the model parameters before copying; groups with no master tensor are skipped.
    Otherwise values are copied parameter by parameter.
    """
    if flat_master:
        for model_group,master_group in zip(model_params,master_params):
            # A group without trainable parameters has no master tensor to unflatten.
            if len(master_group) == 0: continue
            unflattened = _unflatten_dense_tensors(master_group[0].data, model_group)
            for model, master in zip(model_group, unflattened):
                model.data.copy_(master)
    else:
        for model_group,master_group in zip(model_params,master_params):
            for model, master in zip(model_group, master_group): model.data.copy_(master.data)

In [None]:
#export
from torch._utils import _unflatten_dense_tensors
from torch.nn.utils import parameters_to_vector

@dataclass
class MixedPrecision(Callback):
    """Callback that handles mixed-precision training

    The optimizer is rebuilt on FP32 master copies of the parameters (see `get_master`), the loss is
    multiplied by `loss_scale` before the backward pass, and after each step the master values are
    copied back into the model parameters.
    """
    learn:Learner
    # Factor the loss is multiplied by before backward; the master gradients are divided by it afterwards.
    loss_scale:float=512.
    # When True, each layer group uses one flat FP32 master tensor instead of one per parameter.
    flat_master:bool=False
    def __post_init__(self): assert torch.backends.cudnn.enabled, "Mixed precision training requires cudnn."

    def on_train_begin(self, **kwargs):
        "Switch the dataloaders to half precision and rebuild the optimizer on the FP32 master parameters"
        #Ensures the dataloaders are in half precision.
        self.learn.data.train_dl.half = True
        if getattr(self.learn.data, 'valid_dl', None) is not None:
            self.learn.data.valid_dl.half = True
        #Get a copy of the model params in FP32
        self.model_params, self.master_params = get_master(self.learn.layer_groups, self.flat_master)
        #Changes the optimizer so that the optimization step is done in FP32.
        opt = self.learn.opt
        #Hyper-parameters are read before the inner optimizer is replaced and written back afterwards.
        mom,wd,beta = opt.mom,opt.wd,opt.beta
        opt_params = [{'params': mp, 'lr': lr} for mp,lr in zip(self.master_params, opt._lr)]
        opt.opt = self.learn.opt_fn(opt_params)
        opt.mom,opt.wd,opt.beta = mom,wd,beta

    def on_loss_begin(self, last_output:Tensor, **kwargs) -> Tensor:
        "Cast the model output to FP32 before the loss is computed"
        #It's better to compute the loss in FP32, to avoid reduction overflow.
        return last_output.float()

    def on_backward_begin(self, last_loss:Rank0Tensor, **kwargs) -> Rank0Tensor:
        "Return the loss multiplied by `loss_scale`"
        #To avoid gradient underflow, we scale the gradients
        return last_loss * self.loss_scale

    def on_backward_end(self, **kwargs):
        "Copy the model gradients to the master parameters and undo the loss scaling"
        #Convert the gradients back to FP32 and divide them by the scale.
        model_g2master_g(self.model_params, self.master_params, self.flat_master)
        for group in self.master_params:
            for param in group:
                #In non-flat mode `model_g2master_g` sets the master grad to None when the model
                #parameter has no gradient; dividing unconditionally raised AttributeError.
                if param.grad is not None: param.grad.div_(self.loss_scale)

    def on_step_end(self, **kwargs):
        "Clear the model gradients and refresh the model parameters from the master copies"
        #Zeros the gradients of the model since the optimizer is disconnected.
        self.learn.model.zero_grad()
        #Update the params from master to model.
        master2model(self.model_params, self.master_params, self.flat_master)

In [None]:
# Same architecture as the FP32 setup, converted with `model2half` (batchnorm layers are cast back to float).
model = Darknet([1, 2, 2, 2, 2], num_classes=2, nf=16)
model = model2half(model)
learn = Learner(data, model)
learn.metrics = [accuracy]
# `MixedPrecision` uses one flat FP32 master tensor per layer group here (`flat_master=True`).
scheds = [MixedPrecision(learn, flat_master=True), OneCycleScheduler(learn, 0.1, 5)]

In [None]:
# Train with the mixed-precision and one-cycle callbacks (presumably 2 epochs at lr 1e-2 — TODO confirm `fit` signature).
learn.fit(2, 1e-2, callbacks=scheds)

In [None]:
# Tensor type of the first layer's weight; expected to be a half-precision type after `model2half` (TODO confirm layer kind).
learn.model.layers[0][0].weight.type()

In [None]:
# Size and tensor type of the first master parameter held by the `MixedPrecision` callback (built in FP32 by `get_master`).
scheds[0].master_params[0][0].size(),scheds[0].master_params[0][0].type()

Test with discriminative learning rates (the model is split into several layer groups)

In [None]:
# Fresh half-precision model, this time with explicit layer groups.
model = Darknet([1, 2, 2, 2, 2], num_classes=2, nf=16)
model = model2half(model)
# Split `model.layers` at indices 5 and 9 — presumably yielding three layer groups (TODO confirm `split_model`).
layer_groups = split_model(model.layers, [5,9])
learn = Learner(data, model)
# The groups are assigned after construction; `MixedPrecision.on_train_begin` reads `learn.layer_groups`.
learn.layer_groups = layer_groups
learn.metrics = [accuracy]
scheds = [MixedPrecision(learn, flat_master=True), OneCycleScheduler(learn, 0.1, 5)]

In [None]:
# Train with the layer-group setup (presumably 1 epoch at lr 1e-2 — TODO confirm `fit` signature).
learn.fit(1, 1e-2, callbacks=scheds)

In [None]:
# Tensor type of the first layer's weight after training with layer groups.
learn.model.layers[0][0].weight.type()

In [None]:
# One entry per layer group: print the size and tensor type of each group's first master tensor
# (with `flat_master=True`, `get_master` builds a single flat tensor per non-empty group).
for master in scheds[0].master_params:
    print(master[0].size(),master[0].type())