## CIFAR 10

In [1]:
%matplotlib inline
%reload_ext autoreload
%autoreload 2

In [2]:
from fastai.conv_learner import *
# from fastai.models.cifar10.wideresnet import wrn_22_cat, wrn_22, WideResNetConcat
# Turn on the cuDNN benchmark flag for the whole session.
torch.backends.cudnn.benchmark = True
# Dataset root under the user's home directory; torch_loader reads PATH/'train' and PATH/'test'.
PATH = Path.home()/"data/cifar10/"
# Create the directory if it is missing (no error when it already exists).
os.makedirs(PATH,exist_ok=True)

In [3]:
%pwd

'/home/paperspace/fastai/courses/dl2'

In [4]:
# Label names for the ten CIFAR-10 classes; not referenced again in the visible notebook.
classes = ('plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck')
# Two 3-element arrays (presumably per-channel mean and std -- TODO confirm).
# NOTE(review): torch_loader does not use this; it hard-codes its own Normalize constants,
# and its second tuple differs from stats[1].
stats = (np.array([ 0.4914 ,  0.48216,  0.44653]), np.array([ 0.24703,  0.24349,  0.26159]))
# Read as a module-level global by torch_loader: num_workers of every DataLoader it builds.
workers=7

In [5]:
import torchvision.transforms as transforms
import torchvision.datasets as datasets
from torch.utils.data import DataLoader
def pad(img, p=4, padding_mode='reflect'):
    """Pad the first two axes of an image by `p` pixels on every side.

    Arguments:
        img: anything `np.asarray` accepts. Axes 0 and 1 are padded; any
            further axes (e.g. a channel axis) are left untouched, so both
            2-D and 3-D arrays work.
        p (int, optional): number of pixels added on each side (default: 4)
        padding_mode (str, optional): `mode` forwarded to `np.pad`
            (default: 'reflect')
    Returns:
        The padded array wrapped with `Image.fromarray`.
    """
    arr = np.asarray(img)
    # One (before, after) pair per axis: pad the two leading axes, add nothing to the rest.
    pad_width = ((p, p), (p, p)) + ((0, 0),) * (arr.ndim - 2)
    return Image.fromarray(np.pad(arr, pad_width, padding_mode))

def torch_loader(data_path, size, bs, val_bs=None, prefetcher=True, padding=4):
    """Build a `ModelData` for an image-folder dataset laid out as `train/` and `test/`.

    Arguments:
        data_path (Path): dataset root; `data_path/'train'` and `data_path/'test'`
            are each opened with `datasets.ImageFolder`.
        size (int): side length passed to `transforms.RandomCrop`; also stored as `data.sz`.
        bs (int): batch size of the training and augmented loaders.
        val_bs (int, optional): batch size of the validation loader; falls back
            to `bs` when None (or 0).
        prefetcher (bool, optional): wrap every loader in `DataPrefetcher` (default: True)
        padding (int, optional): pixels of padding added by `pad` before the
            random crop (default: 4, the value previously hard-wired)
    Returns:
        `ModelData(data_path, train_loader, val_loader)` with two extra attributes:
        `sz` (= `size`) and `aug_dl`, a loader over the `test/` folder that applies
        the *training* augmentations without shuffling.

    The number of loader workers is read from the module-level `workers`.
    """
    from functools import partial

#     if not os.path.exists(data_path/'train'): download_cifar10(data_path)

    val_bs = val_bs or bs
    traindir = str(data_path/'train')
    valdir = str(data_path/'test')

    # Shared tail of both pipelines: tensor conversion followed by fixed normalisation constants.
    tfms = [transforms.ToTensor(), transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))]

    # Training-time augmentation: pad, random crop back to `size`, random horizontal flip.
    train_tfms = transforms.Compose([
        partial(pad, p=padding),
        transforms.RandomCrop(size),
        transforms.RandomHorizontalFlip(),
    ] + tfms)

    def _loader(folder, tfm, batch_size, shuffle):
        # Every loader uses the same worker count and pinned memory.
        return DataLoader(
            datasets.ImageFolder(folder, tfm), batch_size=batch_size, shuffle=shuffle,
            num_workers=workers, pin_memory=True)

    train_loader = _loader(traindir, train_tfms, bs, True)
    val_loader = _loader(valdir, transforms.Compose(tfms), val_bs, False)
    # Validation images with the training augmentations applied.
    aug_loader = _loader(valdir, train_tfms, bs, False)

    if prefetcher:
        train_loader = DataPrefetcher(train_loader)
        val_loader = DataPrefetcher(val_loader)
        aug_loader = DataPrefetcher(aug_loader)

    data = ModelData(data_path, train_loader, val_loader)
    data.sz = size
    data.aug_dl = aug_loader
    return data

# Seems to speed up training by ~2%
class DataPrefetcher():
    """Wrap a loader and move the upcoming batch to the GPU on a side CUDA stream.

    While the consumer works on one batch, the next one is fetched from the
    wrapped loader and its `.cuda(non_blocking=True)` copies are issued on
    `self.stream`.

    Arguments:
        loader: iterable with `__len__` and a `dataset` attribute whose items
            unpack into exactly two objects (input, target), each with a
            `.cuda` method.
        stop_after (int, optional): batch limit for one pass. The check runs
            after the yield with `count > stop_after`, so `stop_after=k`
            produces k+1 batches. Ignored unless it is exactly an `int`.
    """
    def __init__(self, loader, stop_after=None):
        self.loader = loader
        self.dataset = loader.dataset
        self.stream = torch.cuda.Stream()
        self.stop_after = stop_after
        self.next_input = None
        self.next_target = None

    def __len__(self):
        """Number of batches in the wrapped loader."""
        return len(self.loader)

    def preload(self):
        """Fetch the next batch and start copying it to the GPU.

        Sets `next_input` / `next_target` to None once `self.loaditer` is exhausted.
        """
        try:
            self.next_input, self.next_target = next(self.loaditer)
        except StopIteration:
            self.next_input = None
            self.next_target = None
            return
        with torch.cuda.stream(self.stream):
            # `non_blocking` is the current spelling of the old `async` flag;
            # `async` is a reserved word from Python 3.7 on and no longer parses.
            self.next_input = self.next_input.cuda(non_blocking=True)
            self.next_target = self.next_target.cuda(non_blocking=True)

    def __iter__(self):
        """Yield `(input, target)` pairs, prefetching one batch ahead."""
        count = 0
        self.loaditer = iter(self.loader)
        self.preload()
        while self.next_input is not None:
            # Make the consumer's stream wait for the copies issued on the side stream.
            torch.cuda.current_stream().wait_stream(self.stream)
            input = self.next_input
            target = self.next_target
            # Kick off the following batch before handing the current one out.
            self.preload()
            count += 1
            yield input, target
            if type(self.stop_after) is int and (count > self.stop_after):
                break

In [6]:
# m = WideResNetConcat(num_groups=3, N=3, num_classes=10, k=1, drop_p=0.)

In [7]:
from fastai.models.cifar10.wideresnet import wrn_22

In [8]:
# Experiment: wrn_22 after learn.half(), with learn.opt_fn left untouched
# (both opt_fn overrides in this cell are commented out).
# lr=0.5, wd=1e-4, one 30-epoch cycle, use_clr_beta=(20,20,0.95,0.85), loss_scale=512.
# The 512 passed to torch_loader is its val_bs argument.
bs=128
sz=32
data = torch_loader(PATH, sz, bs, 512)

# m = PreActResNet(PreActBlock, [2,2,2,2], concatpool=True)
# m = ResNet18()
m = wrn_22()

# m = FP16(m.cuda())
learn = Learner.from_model_data(m, data)
learn.half()
learn.crit = F.cross_entropy
# learn.opt_fn = optim.Adam
learn.metrics = [accuracy]
wd=1e-4
lr=.5
# learn.clip = 1e-2

# learn.opt_fn = partial(optim.SGD, nesterov=True, momentum=0.9)

# %time learn.fit(lr, 1, wds=wd, cycle_len=30, use_clr_beta=(20,20,0.95,0.85))
%time learn.fit(lr, 1, wds=wd, cycle_len=30, use_clr_beta=(20,20,0.95,0.85), loss_scale=512)

HBox(children=(IntProgress(value=0, description='Epoch', max=30), HTML(value='')))

epoch      trn_loss   val_loss   accuracy                   
    0      1.137288   1.385104   0.5512    
    1      0.837653   0.816871   0.7248                      
    2      0.649919   0.823135   0.7351                      
    3      0.567991   0.980156   0.7042                      
    4      0.514481   0.502497   0.826                       
    5      0.470965   0.757897   0.7564                      
    6      0.438445   0.548408   0.8212                      
    7      0.414006   0.632729   0.7944                      
    8      0.392887   0.457804   0.8457                      
    9      0.387933   0.451746   0.8473                      
    10     0.390795   0.458801   0.8485                      
    11     0.37432    0.649176   0.7969                      
    12     0.341124   0.512372   0.8353                      
    13     0.318512   0.571917   0.8077                      
    14     0.315084   0.453722   0.8449                      
    15     0.31077    0.420

[0.2057345703125, 0.9401999992370605]

In [8]:
# Experiment: same model and data as the previous run, but with optim.Adam
# (betas=(0.95,0.99)), lr=1e-3 and use_clr_beta=(10,7.5,0.95,0.85).
# The criterion is the nn.CrossEntropyLoss module rather than F.cross_entropy.
bs=128
sz=32
data = torch_loader(PATH, sz, bs, 512)

# m = PreActResNet(PreActBlock, [2,2,2,2], concatpool=True)
# m = ResNet18()
m = wrn_22()

# m = FP16(m.cuda())
learn = Learner.from_model_data(m, data)
learn.half()
learn.crit = torch.nn.CrossEntropyLoss()
learn.metrics = [accuracy]
wd=1e-4
lr = 1e-3
# learn.clip = 1e-2

learn.opt_fn = partial(optim.Adam, betas=(0.95,0.99))

# %time learn.fit(lr, 1, wds=wd, cycle_len=30, use_clr_beta=(10,7.5,0.95,0.85))
%time learn.fit(lr, 1, wds=wd, cycle_len=30, use_clr_beta=(10,7.5,0.95,0.85), loss_scale=512)

HBox(children=(IntProgress(value=0, description='Epoch', max=30), HTML(value='')))

epoch      trn_loss   val_loss   accuracy                   
    0      1.225538   1.17062    0.5796    
    1      0.931035   1.10479    0.6262                      
    2      0.771767   0.754736   0.7346                      
    3      0.667968   0.804059   0.7344                      
    4      0.570667   0.68123    0.7677                      
    5      0.524218   0.626479   0.7895                      
    6      0.498802   0.580072   0.8062                      
    7      0.459822   0.541853   0.8185                      
    8      0.425889   0.50452    0.8329                      
    9      0.414437   0.511546   0.8257                      
    10     0.417682   0.481747   0.841                       
    11     0.368253   0.529707   0.8303                      
    12     0.363285   0.591126   0.8169                      
    13     0.366431   0.471198   0.841                       
    14     0.318771   0.398132   0.8681                      
    15     0.291618   0.418

[0.2364404296875, 0.9358000004768372]

In [18]:
# Experiment: optim.Adam (betas=(0.95,0.99)) again, with the learning rate raised to 3e-3.
# Everything else matches the lr=1e-3 Adam cell.
bs=128
sz=32
data = torch_loader(PATH, sz, bs, 512)

# m = PreActResNet(PreActBlock, [2,2,2,2], concatpool=True)
# m = ResNet18()
m = wrn_22()

# m = FP16(m.cuda())
learn = Learner.from_model_data(m, data)
learn.half()
learn.crit = torch.nn.CrossEntropyLoss()
learn.metrics = [accuracy]
wd=1e-4
lr = 3e-3
# learn.clip = 1e-2

learn.opt_fn = partial(optim.Adam, betas=(0.95,0.99))

# %time learn.fit(lr, 1, wds=wd, cycle_len=30, use_clr_beta=(10,7.5,0.95,0.85))
%time learn.fit(lr, 1, wds=wd, cycle_len=30, use_clr_beta=(10,7.5,0.95,0.85), loss_scale=512)

HBox(children=(IntProgress(value=0, description='Epoch', max=30), HTML(value='')))

epoch      trn_loss   val_loss   accuracy                   
    0      1.112738   1.07002    0.6218    
    1      0.879547   1.095173   0.6402                      
    2      0.731016   0.783476   0.7252                      
    3      0.620737   0.639853   0.7835                      
    4      0.579675   0.732586   0.7497                      
    5      0.548672   0.598489   0.8019                      
    6      0.515663   0.624616   0.7836                      
    7      0.488925   0.712434   0.768                       
    8      0.481206   0.674869   0.7749                      
    9      0.454063   0.618895   0.8017                      
    10     0.443535   0.927334   0.7237                      
    11     0.434211   0.662073   0.782                       
    12     0.436364   0.527627   0.8264                      
    13     0.416736   0.444073   0.8533                      
    14     0.390637   0.423986   0.8596                      
    15     0.370236   0.481

[0.240930078125, 0.9287]

In [9]:
# Experiment: optim.Adam (betas=(0.95,0.99)), lr=3e-3, this time passing
# use_wd_sched=True to learn.fit. The criterion is back to F.cross_entropy.
bs=128
sz=32
data = torch_loader(PATH, sz, bs, 512)

# m = PreActResNet(PreActBlock, [2,2,2,2], concatpool=True)
# m = ResNet18()
m = wrn_22()

# m = FP16(m.cuda())
learn = Learner.from_model_data(m, data)
learn.half()
learn.crit = F.cross_entropy
# learn.opt_fn = optim.Adam
learn.metrics = [accuracy]
wd=1e-4
lr=3e-3
# learn.clip = 1e-2

learn.opt_fn = partial(optim.Adam, betas=(0.95,0.99))

%time learn.fit(lr, 1, wds=wd, cycle_len=30, use_clr_beta=(10,7.5,0.95,0.85), use_wd_sched=True, loss_scale=512)

HBox(children=(IntProgress(value=0, description='Epoch', max=30), HTML(value='')))

epoch      trn_loss   val_loss   accuracy                   
    0      1.083976   1.257236   0.5664    
    1      0.824857   0.992344   0.6511                      
    2      0.669112   0.697577   0.7641                      
    3      0.601613   0.690393   0.7651                      
    4      0.533504   0.57261    0.8082                      
    5      0.472117   0.515276   0.8264                      
    6      0.437557   0.522781   0.8267                      
    7      0.406608   0.485396   0.8367                      
    8      0.371274   0.431782   0.8596                      
    9      0.357207   0.4434     0.85                        
    10     0.334065   0.368199   0.878                       
    11     0.304465   0.518328   0.8437                      
    12     0.284924   0.552292   0.8254                      
    13     0.280122   0.362412   0.8813                      
    14     0.246771   0.397428   0.8786                      
    15     0.209029   0.338

[0.292287890625, 0.9383999992370605]

### Own Adam optimizer

In [None]:
# Peek at the optimizer namespace that the custom optimizer classes below subclass from.
# (The cell previously held the unfinished expression `torch.op`, which raises AttributeError.)
torch.optim

In [16]:
class AdamW(torch.optim.Optimizer):
    r"""Adam with decoupled weight decay that is NOT scaled by the learning rate.

    The moment estimates and the parameter update follow
    `Adam: A Method for Stochastic Optimization`_. `weight_decay` is never added
    to the gradient (there is no L2 penalty term); instead, after the Adam update,
    each parameter is shrunk in place: ``p <- p - weight_decay * p``.

    Arguments:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups
        lr (float, optional): learning rate (default: 1e-3)
        betas (Tuple[float, float], optional): coefficients used for computing
            running averages of gradient and its square (default: (0.9, 0.999))
        eps (float, optional): term added to the denominator to improve
            numerical stability (default: 1e-8)
        weight_decay (float, optional): decoupled decay factor applied directly
            to the weights after every step (default: 0)
        amsgrad (boolean, optional): whether to use the AMSGrad variant of this
            algorithm from the paper `On the Convergence of Adam and Beyond`_
    Raises:
        ValueError: if `lr`, `eps` or `weight_decay` is negative, or a beta is
            outside ``[0, 1)``.
    .. _Adam\: A Method for Stochastic Optimization:
        https://arxiv.org/abs/1412.6980
    .. _On the Convergence of Adam and Beyond:
        https://openreview.net/forum?id=ryQu7f-RZ
    """

    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8,
                 weight_decay=0, amsgrad=False):
        if not 0.0 <= lr:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if not 0.0 <= eps:
            raise ValueError("Invalid epsilon value: {}".format(eps))
        if not 0.0 <= betas[0] < 1.0:
            raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
        if not 0.0 <= betas[1] < 1.0:
            raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
        # A negative decay would grow the weights on every step.
        if not 0.0 <= weight_decay:
            raise ValueError("Invalid weight_decay value: {}".format(weight_decay))
        defaults = dict(lr=lr, betas=betas, eps=eps,
                        weight_decay=weight_decay, amsgrad=amsgrad)
        super().__init__(params, defaults)

    def __setstate__(self, state):
        super().__setstate__(state)
        # Restored groups that carry no 'amsgrad' entry fall back to False.
        for group in self.param_groups:
            group.setdefault('amsgrad', False)

    def step(self, closure=None):
        """Performs a single optimization step.
        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        Returns:
            The value returned by `closure`, or None when no closure is given.
        Raises:
            RuntimeError: if any parameter has a sparse gradient.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            beta1, beta2 = group['betas']
            amsgrad = group['amsgrad']

            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad.data
                if grad.is_sparse:
                    raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead')

                state = self.state[p]

                # State initialization
                if len(state) == 0:
                    state['step'] = 0
                    # Exponential moving average of gradient values
                    state['exp_avg'] = torch.zeros_like(p.data)
                    # Exponential moving average of squared gradient values
                    state['exp_avg_sq'] = torch.zeros_like(p.data)
                    if amsgrad:
                        # Maintains max of all exp. moving avg. of sq. grad. values
                        state['max_exp_avg_sq'] = torch.zeros_like(p.data)

                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
                state['step'] += 1

                # Deliberately no `grad + weight_decay * p` here: decay is applied to the
                # weights themselves further down, not folded into the gradient.

                # Decay the first and second moment running average coefficient
                exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
                if amsgrad:
                    max_exp_avg_sq = state['max_exp_avg_sq']
                    # Maintains the maximum of all 2nd moment running avg. till now
                    torch.max(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq)
                    # Use the max. for normalizing running avg. of gradient
                    denom = max_exp_avg_sq.sqrt().add_(group['eps'])
                else:
                    denom = exp_avg_sq.sqrt().add_(group['eps'])

                bias_correction1 = 1 - beta1 ** state['step']
                bias_correction2 = 1 - beta2 ** state['step']
                step_size = group['lr'] * math.sqrt(bias_correction2) / bias_correction1

                p.data.addcdiv_(exp_avg, denom, value=-step_size)

                # Decoupled decay, independent of lr: p <- p - weight_decay * p
                if group['weight_decay'] != 0:
                    p.data.add_(p.data, alpha=-group['weight_decay'])

        return loss

In [17]:
# Experiment: swap in the notebook's own AdamW class (weight decay applied directly to the
# weights, not scaled by lr), betas=(0.95,0.99), lr=3e-3.
# Otherwise identical to the lr=3e-3 optim.Adam runs; use_wd_sched is not passed.
bs=128
sz=32
data = torch_loader(PATH, sz, bs, 512)

# m = PreActResNet(PreActBlock, [2,2,2,2], concatpool=True)
# m = ResNet18()
m = wrn_22()

# m = FP16(m.cuda())
learn = Learner.from_model_data(m, data)
learn.half()
learn.crit = F.cross_entropy
# learn.opt_fn = optim.Adam
learn.metrics = [accuracy]
wd=1e-4
lr=3e-3
# learn.clip = 1e-2

learn.opt_fn = partial(AdamW, betas=(0.95,0.99))

%time learn.fit(lr, 1, wds=wd, cycle_len=30, use_clr_beta=(10,7.5,0.95,0.85), loss_scale=512)

HBox(children=(IntProgress(value=0, description='Epoch', max=30), HTML(value='')))

epoch      trn_loss   val_loss   accuracy                   
    0      1.125994   1.18783    0.577     
    1      0.86175    0.988443   0.6599                      
    2      0.726343   0.688727   0.7625                      
    3      0.608317   0.640625   0.7845                      
    4      0.534234   0.622087   0.787                       
    5      0.507067   0.515154   0.8278                      
    6      0.475229   0.533281   0.8187                      
    7      0.425627   0.524897   0.8208                      
    8      0.393281   0.468985   0.8407                      
    9      0.366347   0.44692    0.8448                      
    10     0.33129    0.484046   0.8456                      
    11     0.337636   0.442385   0.8597                      
    12     0.302043   0.566451   0.8247                      
    13     0.294848   0.397648   0.8679                      
    14     0.267296   0.37949    0.8755                      
    15     0.252204   0.325

[0.2254771484375, 0.940700000667572]

In [19]:
class AdamWLR(torch.optim.Optimizer):
    r"""Adam with decoupled weight decay scaled by the learning rate.

    The moment estimates and the parameter update follow
    `Adam: A Method for Stochastic Optimization`_. `weight_decay` is never added
    to the gradient (there is no L2 penalty term); instead, before the Adam update,
    each parameter is shrunk in place: ``p <- p - lr * weight_decay * p``.

    Arguments:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups
        lr (float, optional): learning rate (default: 1e-3)
        betas (Tuple[float, float], optional): coefficients used for computing
            running averages of gradient and its square (default: (0.9, 0.999))
        eps (float, optional): term added to the denominator to improve
            numerical stability (default: 1e-8)
        weight_decay (float, optional): decoupled decay factor; multiplied by
            the group's `lr` before being applied to the weights (default: 0)
        amsgrad (boolean, optional): whether to use the AMSGrad variant of this
            algorithm from the paper `On the Convergence of Adam and Beyond`_
    Raises:
        ValueError: if `lr`, `eps` or `weight_decay` is negative, or a beta is
            outside ``[0, 1)``.
    .. _Adam\: A Method for Stochastic Optimization:
        https://arxiv.org/abs/1412.6980
    .. _On the Convergence of Adam and Beyond:
        https://openreview.net/forum?id=ryQu7f-RZ
    """

    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8,
                 weight_decay=0, amsgrad=False):
        if not 0.0 <= lr:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if not 0.0 <= eps:
            raise ValueError("Invalid epsilon value: {}".format(eps))
        if not 0.0 <= betas[0] < 1.0:
            raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
        if not 0.0 <= betas[1] < 1.0:
            raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
        # A negative decay would grow the weights on every step.
        if not 0.0 <= weight_decay:
            raise ValueError("Invalid weight_decay value: {}".format(weight_decay))
        defaults = dict(lr=lr, betas=betas, eps=eps,
                        weight_decay=weight_decay, amsgrad=amsgrad)
        super().__init__(params, defaults)

    def __setstate__(self, state):
        super().__setstate__(state)
        # Restored groups that carry no 'amsgrad' entry fall back to False.
        for group in self.param_groups:
            group.setdefault('amsgrad', False)

    def step(self, closure=None):
        """Performs a single optimization step.
        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        Returns:
            The value returned by `closure`, or None when no closure is given.
        Raises:
            RuntimeError: if any parameter has a sparse gradient.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            beta1, beta2 = group['betas']
            amsgrad = group['amsgrad']

            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad.data
                if grad.is_sparse:
                    raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead')

                state = self.state[p]

                # State initialization
                if len(state) == 0:
                    state['step'] = 0
                    # Exponential moving average of gradient values
                    state['exp_avg'] = torch.zeros_like(p.data)
                    # Exponential moving average of squared gradient values
                    state['exp_avg_sq'] = torch.zeros_like(p.data)
                    if amsgrad:
                        # Maintains max of all exp. moving avg. of sq. grad. values
                        state['max_exp_avg_sq'] = torch.zeros_like(p.data)

                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
                state['step'] += 1

                # Decoupled decay, scaled by lr and applied before the Adam update:
                # p <- p - lr * weight_decay * p. The gradient itself is left untouched.
                if group['weight_decay'] != 0:
                    p.data.add_(p.data, alpha=-group['weight_decay'] * group['lr'])

                # Decay the first and second moment running average coefficient
                exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
                if amsgrad:
                    max_exp_avg_sq = state['max_exp_avg_sq']
                    # Maintains the maximum of all 2nd moment running avg. till now
                    torch.max(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq)
                    # Use the max. for normalizing running avg. of gradient
                    denom = max_exp_avg_sq.sqrt().add_(group['eps'])
                else:
                    denom = exp_avg_sq.sqrt().add_(group['eps'])

                bias_correction1 = 1 - beta1 ** state['step']
                bias_correction2 = 1 - beta2 ** state['step']
                step_size = group['lr'] * math.sqrt(bias_correction2) / bias_correction1

                p.data.addcdiv_(exp_avg, denom, value=-step_size)

        return loss

In [20]:
# Experiment: the notebook's AdamWLR class (weight decay multiplied by lr before being applied),
# betas=(0.95,0.99), lr=3e-3. All other settings match the AdamW run.
bs=128
sz=32
data = torch_loader(PATH, sz, bs, 512)

# m = PreActResNet(PreActBlock, [2,2,2,2], concatpool=True)
# m = ResNet18()
m = wrn_22()

# m = FP16(m.cuda())
learn = Learner.from_model_data(m, data)
learn.half()
learn.crit = F.cross_entropy
# learn.opt_fn = optim.Adam
learn.metrics = [accuracy]
wd=1e-4
lr=3e-3
# learn.clip = 1e-2

learn.opt_fn = partial(AdamWLR, betas=(0.95,0.99))

%time learn.fit(lr, 1, wds=wd, cycle_len=30, use_clr_beta=(10,7.5,0.95,0.85), loss_scale=512)

HBox(children=(IntProgress(value=0, description='Epoch', max=30), HTML(value='')))

epoch      trn_loss   val_loss   accuracy                   
    0      1.103384   1.053332   0.629     
    1      0.848033   1.030427   0.6593                      
    2      0.685922   0.71512    0.7574                      
    3      0.60255    0.70709    0.761                       
    4      0.513094   0.495721   0.8335                      
    5      0.4785     0.570843   0.8131                      
    6      0.445733   0.589849   0.8115                      
    7      0.410978   0.422468   0.8532                      
    8      0.377846   0.457348   0.8462                      
    9      0.344275   0.563315   0.822                       
    10     0.340491   0.437142   0.8626                      
    11     0.307948   0.466289   0.8481                      
    12     0.307655   0.395662   0.8743                      
    13     0.283491   0.435071   0.8646                      
    14     0.251735   0.339573   0.895                       
    15     0.200418   0.321

[0.300377734375, 0.9385000005722046]

## Trying AdamW again

In [21]:
# Repeat of the AdamW experiment with unchanged settings (betas=(0.95,0.99), lr=3e-3, wd=1e-4).
# NOTE(review): no random seed is set anywhere in the visible notebook, so repeated runs
# presumably differ only through run-to-run randomness.
bs=128
sz=32
data = torch_loader(PATH, sz, bs, 512)

# m = PreActResNet(PreActBlock, [2,2,2,2], concatpool=True)
# m = ResNet18()
m = wrn_22()

# m = FP16(m.cuda())
learn = Learner.from_model_data(m, data)
learn.half()
learn.crit = F.cross_entropy
# learn.opt_fn = optim.Adam
learn.metrics = [accuracy]
wd=1e-4
lr=3e-3
# learn.clip = 1e-2

learn.opt_fn = partial(AdamW, betas=(0.95,0.99))

%time learn.fit(lr, 1, wds=wd, cycle_len=30, use_clr_beta=(10,7.5,0.95,0.85), loss_scale=512)

HBox(children=(IntProgress(value=0, description='Epoch', max=30), HTML(value='')))

epoch      trn_loss   val_loss   accuracy                   
    0      1.08179    1.31085    0.528     
    1      0.823981   0.978336   0.6742                      
    2      0.685794   0.725625   0.7482                      
    3      0.574724   0.820043   0.7318                      
    4      0.534792   0.635923   0.7889                      
    5      0.477364   0.596087   0.7975                      
    6      0.471159   0.535218   0.8201                      
    7      0.430182   0.420409   0.8552                      
    8      0.387191   0.772662   0.7687                      
    9      0.373985   0.440674   0.8525                      
    10     0.357146   0.433321   0.857                       
    11     0.330867   0.389621   0.8711                      
    12     0.313393   0.468736   0.8486                      
    13     0.307693   0.377807   0.8741                      
    14     0.283999   0.366225   0.8765                      
    15     0.234759   0.369

[0.23341826171875, 0.939500000667572]

In [22]:
# Second repeat of the AdamW experiment; the code is identical to the previous cell.
# NOTE(review): no random seed is set anywhere in the visible notebook, so repeated runs
# presumably differ only through run-to-run randomness.
bs=128
sz=32
data = torch_loader(PATH, sz, bs, 512)

# m = PreActResNet(PreActBlock, [2,2,2,2], concatpool=True)
# m = ResNet18()
m = wrn_22()

# m = FP16(m.cuda())
learn = Learner.from_model_data(m, data)
learn.half()
learn.crit = F.cross_entropy
# learn.opt_fn = optim.Adam
learn.metrics = [accuracy]
wd=1e-4
lr=3e-3
# learn.clip = 1e-2

learn.opt_fn = partial(AdamW, betas=(0.95,0.99))

%time learn.fit(lr, 1, wds=wd, cycle_len=30, use_clr_beta=(10,7.5,0.95,0.85), loss_scale=512)

HBox(children=(IntProgress(value=0, description='Epoch', max=30), HTML(value='')))

epoch      trn_loss   val_loss   accuracy                   
    0      1.103511   1.215768   0.5703    
    1      0.841045   0.88111    0.6917                      
    2      0.69069    0.688277   0.7596                      
    3      0.591199   0.75532    0.7447                      
    4      0.547122   0.61999    0.7897                      
    5      0.505276   0.722243   0.7693                      
    6      0.45382    0.522073   0.8277                      
    7      0.423857   0.437215   0.8522                      
    8      0.409121   0.456454   0.8457                      
    9      0.366972   0.407509   0.8625                      
    10     0.357987   0.487602   0.8424                      
    11     0.348313   0.423933   0.8587                      
    12     0.326056   0.445858   0.8522                      
    13     0.310947   0.3891     0.8673                      
    14     0.277337   0.359866   0.8826                      
    15     0.254975   0.330

[0.224248046875, 0.9407000004768371]

### Back to fastai AdamW (using the `AdamWLR` class defined above)

In [23]:
# Repeat of the AdamWLR experiment (weight decay multiplied by lr), with settings unchanged
# from the earlier AdamWLR cell: betas=(0.95,0.99), lr=3e-3, wd=1e-4.
bs=128
sz=32
data = torch_loader(PATH, sz, bs, 512)

# m = PreActResNet(PreActBlock, [2,2,2,2], concatpool=True)
# m = ResNet18()
m = wrn_22()

# m = FP16(m.cuda())
learn = Learner.from_model_data(m, data)
learn.half()
learn.crit = F.cross_entropy
# learn.opt_fn = optim.Adam
learn.metrics = [accuracy]
wd=1e-4
lr=3e-3
# learn.clip = 1e-2

learn.opt_fn = partial(AdamWLR, betas=(0.95,0.99))

%time learn.fit(lr, 1, wds=wd, cycle_len=30, use_clr_beta=(10,7.5,0.95,0.85), loss_scale=512)

HBox(children=(IntProgress(value=0, description='Epoch', max=30), HTML(value='')))

epoch      trn_loss   val_loss   accuracy                   
    0      1.115529   1.183455   0.5851    
    1      0.84254    0.809567   0.7215                      
    2      0.674543   0.728618   0.7516                      
    3      0.610069   0.789974   0.7401                      
    4      0.552389   0.611062   0.7936                      
    5      0.489703   0.522124   0.8199                      
    6      0.450236   0.579267   0.8062                      
    7      0.411513   0.623684   0.8015                      
    8      0.393244   0.482875   0.8366                      
    9      0.354747   0.407091   0.8629                      
    10     0.32673    0.414187   0.8607                      
    11     0.315894   0.433505   0.861                       
    12     0.294576   0.479015   0.8528                      
    13     0.28874    0.357954   0.8825                      
    14     0.25295    0.321704   0.8945                      
    15     0.205413   0.319

[0.27680693359375, 0.9401999993324279]