In [1]:
import torch
from pathlib import Path
import os
import numpy as np
import torch.nn as nn
from datetime import datetime

import torch.nn.functional as F
import torchvision.transforms as transforms
import torchvision.datasets as datasets
from torch.utils.data import DataLoader
import argparse, os, shutil, time, warnings

from fp16util import *
from resnet import *
from PIL import Image
from torch.nn.parameter import Parameter

In [2]:

def get_parser():
    parser = argparse.ArgumentParser(description='PyTorch ImageNet Training')
    parser.add_argument('data', metavar='DIR', help='path to dataset')
    parser.add_argument('--save-dir', type=str, default=Path.cwd(), help='Directory to save logs and models.')
    parser.add_argument('-j', '--workers', default=8, type=int, metavar='N',
                        help='number of data loading workers (default: 4)')
    parser.add_argument('--momentum', default=0.9, type=float, metavar='M', help='momentum')
    parser.add_argument('--weight-decay', '--wd', default=5e-4, type=float,
                        metavar='W', help='weight decay (default: 1e-4)')
    parser.add_argument('-b', '--batch-size', default=256, type=int,
                        metavar='N', help='mini-batch size (default: 256)')
    parser.add_argument('--phases', default='[(0,2e-1,16),(2e-1,1e-2,16),(1e-2,0,5)]', type=str,
                    help='Should be a string formatted like this: [(start_lr,end_lr,num_epochs),(phase2...)]')
    parser.add_argument('--verbose', action='store_true', help='Verbose logging')
#     parser.add_argument('--init-bn0', action='store_true', help='Intialize running batch norm mean to 0')
    parser.add_argument('--print-freq', '-p', default=200, type=int,
                        metavar='N', help='print every this many steps (default: 5)')
#     parser.add_argument('--no-bn-wd', action='store_true', help='Remove batch norm from weight decay')
    parser.add_argument('--full-precision', action='store_true', help='Run model full precision mode. Default fp16')
    parser.add_argument('--loss-scale', type=float, default=512,
                        help='Loss scaling, positive power of 2 values can improve fp16 convergence.')
    parser.add_argument('--distributed', action='store_true', help='Run distributed training')
    parser.add_argument('--world-size', default=-1, type=int, 
                        help='total number of processes (machines*gpus)')
    parser.add_argument('--scale-lr', type=float, default=1, help='You should learning rate propotionally to world size')
    parser.add_argument('--dist-url', default='env://', type=str,
                        help='url used to set up distributed training')
    parser.add_argument('--dist-backend', default='nccl', type=str, help='distributed backend')
    parser.add_argument('--local_rank', default=0, type=int,
                        help='Used for multi-process training. Can either be manually set ' +
                        'or automatically set by using \'python -m multiproc\'.')
    return parser


In [3]:
args_input = [
    str(Path.home()/'data/cifar10/'),
    '--phases', '[(0,2e-1,15),(2e-1,1e-2,15),(1e-2,0,5)]',
#     '--phases', '[(0,2e-1,16),(2e-1,1e-2,16),(1e-2,0,5)]'
#     '--full-precision'
]

In [4]:
global args
args = get_parser().parse_args(args_input)
if args.full_precision: args.loss_scale = 1
torch.backends.cudnn.benchmark = True

## Model

In [5]:
# --
# Model definition
# Derived from models in `https://github.com/kuangliu/pytorch-cifar`
class PreActBlock(nn.Module):
    
    def __init__(self, in_channels, out_channels, stride=1):
        super().__init__()
        
        self.bn1   = nn.BatchNorm2d(in_channels)
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn2   = nn.BatchNorm2d(out_channels)
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1, bias=False)
        
        if stride != 1 or in_channels != out_channels:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=stride, bias=False)
            )
            
    def forward(self, x):
        out = F.relu(self.bn1(x))
        shortcut = self.shortcut(out) if hasattr(self, 'shortcut') else x
        out = self.conv1(out)
        out = self.conv2(F.relu(self.bn2(out)))
        return out + shortcut


class ResNet18(nn.Module):
    def __init__(self, num_blocks=[2, 2, 2, 2], num_classes=10):
        super().__init__()
        
        self.in_channels = 64
        
        self.prep = nn.Sequential(
            nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False),
            nn.BatchNorm2d(64),
            nn.ReLU()
        )
        
        self.layers = nn.Sequential(
            self._make_layer(64, 64, num_blocks[0], stride=1),
            self._make_layer(64, 128, num_blocks[1], stride=2),
            self._make_layer(128, 256, num_blocks[2], stride=2),
            self._make_layer(256, 256, num_blocks[3], stride=2),
        )
        
        self.classifier = nn.Linear(512, num_classes)
        
    def _make_layer(self, in_channels, out_channels, num_blocks, stride):
        
        strides = [stride] + [1] * (num_blocks-1)
        layers = []
        for stride in strides:
            layers.append(PreActBlock(in_channels=in_channels, out_channels=out_channels, stride=stride))
            in_channels = out_channels
        
        return nn.Sequential(*layers)
    
    def forward(self, x):
        x = self.prep(x)
        
        x = self.layers(x)
        
        x_avg = F.adaptive_avg_pool2d(x, (1, 1))
        x_avg = x_avg.view(x_avg.size(0), -1)
        
        x_max = F.adaptive_max_pool2d(x, (1, 1))
        x_max = x_max.view(x_max.size(0), -1)
        
        x = torch.cat([x_avg, x_max], dim=-1)
        
        x = self.classifier(x)
        
        return x

### Torch loader

In [6]:
def pad(img, p=4, padding_mode='reflect'):
    return Image.fromarray(np.pad(np.asarray(img), ((p, p), (p, p), (0, 0)), padding_mode))

def torch_loader(data_path, size, bs, val_bs=None):
    os.makedirs(data_path,exist_ok=True)

    val_bs = val_bs or bs
    # Data loading code
    tfms = [transforms.ToTensor(), transforms.Normalize((0.4914, 0.4822, 0.4465), (0.24703,0.24349,0.26159))]
    train_tfms = transforms.Compose([
        pad, # TODO: use `padding` rather than assuming 4
        transforms.RandomCrop(size),
        transforms.RandomHorizontalFlip(),
    ] + tfms)
    val_tfms = transforms.Compose(tfms)

    train_dataset = datasets.CIFAR10(root=data_path, train=True, download=(args.local_rank==0), transform=train_tfms)
    val_dataset  = datasets.CIFAR10(root=data_path, train=False, download=(args.local_rank==0), transform=val_tfms)

    train_sampler = (torch.utils.data.distributed.DistributedSampler(train_dataset) if args.distributed else None)
    # val_sampler = (torch.utils.data.distributed.DistributedSampler(val_dataset) if args.distributed else None)
    val_sampler = None

    train_loader = DataLoader(
        train_dataset, batch_size=bs, shuffle=(train_sampler is None),
        num_workers=args.workers, pin_memory=True,
        sampler=train_sampler)

    val_loader = DataLoader(
        val_dataset, batch_size=val_bs, shuffle=False,
        num_workers=args.workers, pin_memory=True,
        sampler=val_sampler)
    
    train_loader = DataPrefetcher(train_loader)
    val_loader = DataPrefetcher(val_loader)
    
    return train_loader, val_loader

# Seems to speed up training by ~2%
class DataPrefetcher():
    def __init__(self, loader, stop_after=None):
        self.loader = loader
        self.dataset = loader.dataset
        self.stream = torch.cuda.Stream()
        self.stop_after = stop_after
        self.next_input = None
        self.next_target = None

    def __len__(self):
        return len(self.loader)

    def preload(self):
        try:
            self.next_input, self.next_target = next(self.loaditer)
        except StopIteration:
            self.next_input = None
            self.next_target = None
            return
        with torch.cuda.stream(self.stream):
            self.next_input = self.next_input.cuda(async=True)
            self.next_target = self.next_target.cuda(async=True)

    def __iter__(self):
        count = 0
        self.loaditer = iter(self.loader)
        self.preload()
        while self.next_input is not None:
            torch.cuda.current_stream().wait_stream(self.stream)
            input = self.next_input
            target = self.next_target
            self.preload()
            count += 1
            yield input, target
            if type(self.stop_after) is int and (count > self.stop_after):
                break

### Learning rate scheduler

In [7]:
class Scheduler():
    def __init__(self, optimizer, phases=[(0,2e-1,15),(2e-1,1e-2,15),(1e-2,0,5)]):
        self.optimizer = optimizer
        self.current_lr = None
        self.phases = phases
        self.tot_epochs = sum([p[2] for p in phases])

    def linear_lr(self, start_lr, end_lr, epoch_curr, batch_curr, epoch_tot, batch_tot):
        if args.scale_lr != 1:
            start_lr *= args.scale_lr
            end_lr *= args.scale_lr
        step_tot = epoch_tot * batch_tot
        step_curr = epoch_curr * batch_tot + batch_curr
        step_size = (end_lr - start_lr)/step_tot
        return start_lr + step_curr * step_size
    
    def get_current_phase(self, epoch):
        epoch_accum = 0
        for phase in self.phases:
            start_lr,end_lr,num_epochs = phase
            if epoch <= epoch_accum+num_epochs: return start_lr, end_lr, num_epochs, epoch - epoch_accum
            epoch_accum += num_epochs
        raise Exception('Epoch out of range')
            
    def get_lr(self, epoch, batch_curr, batch_tot):
        start_lr, end_lr, num_epochs, relative_epoch = self.get_current_phase(epoch)
        return self.linear_lr(start_lr, end_lr, relative_epoch, batch_curr, num_epochs, batch_tot)

    def update_lr(self, epoch, batch_num, batch_tot):
        lr = self.get_lr(epoch, batch_num, batch_tot)
        if args.verbose and (self.current_lr != lr) and ((batch_num == 1) or (batch_num == batch_tot)): 
            print(f'Changing LR from {self.current_lr} to {lr}')

        self.current_lr = lr

        for param_group in self.optimizer.param_groups:
            lr_old = param_group['lr'] or lr
            param_group['lr'] = lr

In [8]:
# item() is a recent addition, so this helps with backward compatibility.
def to_python_float(t):
    if isinstance(t, float): return t
    if isinstance(t, int): return t
    if hasattr(t, 'item'): return t.item()
    else: return t[0]

def train(trn_loader, model, criterion, optimizer, scheduler, epoch):
    batch_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()

    # switch to train mode
    model.train()
    end = time.time()

    st = time.time()
    trn_len = len(trn_loader)

    # print('Begin training loop:', st)
    for i,(input,target) in enumerate(trn_loader):
        batch_size = input.size(0)
        batch_num = i+1
        
        # measure data loading time
        scheduler.update_lr(epoch, i+1, trn_len)

        # compute output
        output = model(input)
        loss = criterion(output, target)

        if args.distributed:
            # Must keep track of global batch size, since not all machines are guaranteed equal batches at the end of an epoch
            corr1 = correct(output.data, target)[0]
            metrics = torch.tensor([batch_size, loss, corr1]).float().cuda()
            batch_total, reduced_loss, corr1 = sum_tensor(metrics)
            reduced_loss = reduced_loss/dist.get_world_size()
            prec1 = corr1*(100.0/batch_total)
        else:
            reduced_loss = loss.data
            batch_total = input.size(0)
            prec1 = accuracy(output.data, target)[0] # measure accuracy and record loss
        losses.update(to_python_float(reduced_loss), to_python_float(batch_total))
        top1.update(to_python_float(prec1), to_python_float(batch_total))

        loss = loss*args.loss_scale
        
        # compute gradient and do SGD step
        if args.full_precision:
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        else:
            model.zero_grad()
            loss.backward()
            model_grads_to_master_grads(model_params, master_params)
            for param in master_params:
                param.grad.data = param.grad.data/args.loss_scale
            optimizer.step()
            master_params_to_model_params(model_params, master_params)
            torch.cuda.synchronize()

        batch_time.update(time.time() - end)
        end = time.time()

        should_print = (batch_num%args.print_freq == 0) or (batch_num==trn_len)
        if should_print: log_batch(epoch, batch_num, trn_len, batch_time, losses, top1)
    return top1.avg, losses.avg

def validate(val_loader, model, criterion, epoch, start_time):
    batch_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()

    model.eval()
    end = time.time()
    val_len = len(val_loader)

    for i,(input,target) in enumerate(val_loader):
        batch_num = i+1
        with torch.no_grad():
            output = model(input)
            loss = criterion(output, target).data
        batch_total = input.size(0)
        prec1 = accuracy(output.data, target)[0]
            
        losses.update(to_python_float(loss), batch_total)
        top1.update(to_python_float(prec1), batch_total)

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        should_print = (batch_num%args.print_freq == 0) or (batch_num==val_len)
        if should_print: log_batch(epoch, batch_num, val_len, batch_time, losses, top1)
            
    return top1.avg, losses.avg

def log_batch(epoch, batch_num, batch_len, batch_time, loss, top1):
    if args.local_rank==0 and args.verbose:
        output = ('Epoch: [{0}][{1}/{2}]\t' \
                + 'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' \
                + 'Loss {loss.val:.4f} ({loss.avg:.4f})\t' \
                + 'Prec@1 {top1.val:.3f} ({top1.avg:.3f})').format(
                epoch, batch_num, batch_len, batch_time=batch_time, loss=loss, top1=top1)
        print(output)
        with open(f'{args.save_dir}/full.log', 'a') as f:
            f.write(output + '\n')
            
class AverageMeter(object):
    """Computes and stores the average and current value"""
    def __init__(self):
        self.reset()
    def reset(self):
        self.val = self.avg = self.sum = self.count = 0
    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count

def accuracy(output, target, topk=(1,)):
    """Computes the precision@k for the specified values of k"""
    corrrect_ks = correct(output, target, topk)
    batch_size = target.size(0)
    return [correct_k.float().mul_(100.0 / batch_size) for correct_k in corrrect_ks]

def correct(output, target, topk=(1,)):
    """Computes the precision@k for the specified values of k"""
    maxk = max(topk)

    _, pred = output.topk(maxk, 1, True, True)
    pred = pred.t()
    correct = pred.eq(target.view(1, -1).expand_as(pred))

    res = []
    for k in topk:
        correct_k = correct[:k].view(-1).sum(0, keepdim=True)
        res.append(correct_k)
    return res


def sum_tensor(tensor):
    rt = tensor.clone()
    dist.all_reduce(rt, op=dist.reduce_op.SUM)
    return rt

def reduce_tensor(tensor):
    rt = tensor.clone()
    dist.all_reduce(rt, op=dist.reduce_op.SUM)
    rt /= args.world_size
    return rt


In [9]:
# Filter out batch norm parameters and remove them from weight decay - gets us higher accuracy 93.2 -> 93.48
# https://arxiv.org/pdf/1807.11205.pdf
def bnwd_optim_params(model, model_params, master_params):
    bn_params, remaining_params = split_bn_params(model, model_params, master_params)
    return [{'params':bn_params,'weight_decay':0}, {'params':remaining_params}]


def split_bn_params(model, model_params, master_params):
    def get_bn_params(module):
        if isinstance(module, torch.nn.modules.batchnorm._BatchNorm): return module.parameters()
        accum = set()
        for child in module.children(): [accum.add(p) for p in get_bn_params(child)]
        return accum
    
    mod_bn_params = get_bn_params(model)
    zipped_params = list(zip(model_params, master_params))

    mas_bn_params = [p_mast for p_mod,p_mast in zipped_params if p_mod in mod_bn_params]
    mas_rem_params = [p_mast for p_mod,p_mast in zipped_params if p_mod not in mod_bn_params]
    return mas_bn_params, mas_rem_params
    

In [10]:
# Our own implementation of lars
class LARS(torch.optim.Optimizer):
    # SGD https://raw.githubusercontent.com/pytorch/pytorch/master/torch/optim/sgd.py
    # η (eta) = "trust" coefficient
    def __init__(self, params, lr, momentum=0, dampening=0,
                 weight_decay=0, nesterov=False, eta=0.02, eps=1e-8, lars=True):
        self.lr = lr
        self.lars = True
        if lr < 0.0:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if momentum < 0.0:
            raise ValueError("Invalid momentum value: {}".format(momentum))
        if weight_decay < 0.0:
            raise ValueError("Invalid weight_decay value: {}".format(weight_decay))
        defaults = dict(lr=lr, momentum=momentum, dampening=dampening,
                        weight_decay=weight_decay, nesterov=nesterov, eta=eta)
        if nesterov and (momentum <= 0 or dampening != 0):
            raise ValueError("Nesterov momentum requires a momentum and zero dampening")
        super().__init__(params, defaults)

    def __setstate__(self, state):
        super().__setstate__(state)
        for group in self.param_groups:
            group.setdefault('nesterov', False)

    def step(self, closure=None):
        """Performs a single optimization step.
        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            weight_decay = group['weight_decay']
            momentum = group['momentum']
            dampening = group['dampening']
            nesterov = group['nesterov']
            eta = group['eta']

            for p in group['params']:
                if p.grad is None:
                    continue
                d_p = p.grad.data
                if weight_decay != 0:
                    d_p.add_(weight_decay, p.data)
                if lars: local_lr = eta * torch.norm(p.data) / torch.norm(d_p)
                if momentum != 0:
                    param_state = self.state[p]
                    if 'momentum_buffer' not in param_state:
                        buf = param_state['momentum_buffer'] = torch.zeros_like(p.data)
                        buf.mul_(momentum).add_(d_p)
                    else:
                        buf = param_state['momentum_buffer']
                        buf.mul_(momentum).add_(1 - dampening, d_p)
                    if nesterov:
                        d_p = d_p.add(momentum, buf)
                    else:
                        d_p = buf
                # print('Learning rate:', local_lr, local_lr*group['lr'])
                if lars: p.data.add_(-min(local_lr*group['lr'], group['lr']), d_p)
                else: p.data.add_(-group['lr'], d_p)

        return loss

In [11]:
import torch
from torch import nn
from torch.autograd import Variable
from torch.nn.parameter import Parameter

In [18]:
if args.distributed:
    print('Distributed: initializing process group')
    torch.cuda.set_device(args.local_rank)
    dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url, world_size=args.world_size)
    assert(args.world_size == dist.get_world_size())
    print("Distributed: success (%d/%d)"%(args.local_rank, args.world_size))

model = ResNet18()
model = model.cuda()


if not args.full_precision: model = network_to_half(model)
if args.distributed: model = nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank], output_device=args.local_rank)

# TESTING
args.full_precision = True
args.loss_scale = 1


# AS: todo: don't copy over weights as it seems to help accuracy
global model_params, master_params
if args.full_precision: model_params = master_params = model.parameters()
else: model_params, master_params = prep_param_lists(model)

# optim_params = bnwd_optim_params(model, model_params, master_params)
optim_params = master_params

# define loss function (criterion) and optimizer
criterion = F.cross_entropy
optimizer = LARS(optim_params, lr=0, nesterov=True, momentum=args.momentum, weight_decay=args.weight_decay)
# optimizer = LARS(optimizer)
scheduler = Scheduler(optimizer, phases=eval(args.phases))


sz = 32
trn_loader, val_loader = torch_loader(args.data, sz, args.batch_size, args.batch_size*2)

print(args)
print('\n\n')
print("epoch\t\tnum_batch\ttime (min)\ttrn_loss\tval_loss\taccuracy")
start_time = datetime.now() # Loading start to after everything is loaded
for epoch in range(scheduler.tot_epochs):
    trn_top1, trn_loss = train(trn_loader, model, criterion, optimizer, scheduler, epoch)
    val_top1, val_loss = validate(val_loader, model, criterion, epoch, start_time)

    time_diff = datetime.now()-start_time
    minutes = float(time_diff.total_seconds() / 60.0)
    # epoch   time   trn_loss   val_loss   accuracy     
    metrics = [str(round(i, 4)) for i in [epoch, len(trn_loader), minutes, trn_loss, val_loss, val_top1]]
    print('\t\t'.join(metrics))

Files already downloaded and verified
Files already downloaded and verified
Namespace(batch_size=256, data='/home/paperspace/data/cifar10', dist_backend='nccl', dist_url='env://', distributed=False, full_precision=True, local_rank=0, loss_scale=1, momentum=0.9, phases='[(0,2e-1,15),(2e-1,1e-2,15),(1e-2,0,5)]', print_freq=200, save_dir=PosixPath('/home/paperspace/cluster/pytorch-cifar'), scale_lr=1, verbose=False, weight_decay=0.0005, workers=8, world_size=-1)



epoch		num_batch	time (min)	trn_loss	val_loss	accuracy
0		196		0.2573		1.9591		1.5154		44.9
1		196		0.5009		1.3311		1.2488		55.02
2		196		0.7464		1.0384		1.2486		56.42
3		196		0.9902		0.8468		1.0063		65.54
4		196		1.2363		0.7105		1.0976		64.18
5		196		1.4804		0.6182		0.6572		77.66
6		196		1.7221		0.5649		0.9406		68.91
7		196		1.9676		0.5214		0.7907		73.62
8		196		2.2121		0.4827		0.7649		74.82
9		196		2.456		0.4577		0.7168		76.67
10		196		2.6996		0.4391		0.7836		74.04
11		196		2.9439		0.4256		0.6637		77.76
12		196		3.1893		0.412

In [19]:
if args.distributed:
    print('Distributed: initializing process group')
    torch.cuda.set_device(args.local_rank)
    dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url, world_size=args.world_size)
    assert(args.world_size == dist.get_world_size())
    print("Distributed: success (%d/%d)"%(args.local_rank, args.world_size))

model = ResNet18()
model = model.cuda()


if not args.full_precision: model = network_to_half(model)
if args.distributed: model = nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank], output_device=args.local_rank)

# TESTING
args.full_precision = True
args.loss_scale = 1


# AS: todo: don't copy over weights as it seems to help accuracy
global model_params, master_params
if args.full_precision: model_params = master_params = model.parameters()
else: model_params, master_params = prep_param_lists(model)

# optim_params = bnwd_optim_params(model, model_params, master_params)
optim_params = master_params

# define loss function (criterion) and optimizer
criterion = F.cross_entropy
optimizer = torch.optim.SGD(optim_params, lr=0, nesterov=True, momentum=args.momentum, weight_decay=args.weight_decay)
scheduler = Scheduler(optimizer, phases=eval(args.phases))


sz = 32
trn_loader, val_loader = torch_loader(args.data, sz, args.batch_size, args.batch_size*2)

print(args)
print('\n\n')
print("epoch\t\tnum_batch\ttime (min)\ttrn_loss\tval_loss\taccuracy")
start_time = datetime.now() # Loading start to after everything is loaded
for epoch in range(scheduler.tot_epochs):
    trn_top1, trn_loss = train(trn_loader, model, criterion, optimizer, scheduler, epoch)
    val_top1, val_loss = validate(val_loader, model, criterion, epoch, start_time)

    time_diff = datetime.now()-start_time
    minutes = float(time_diff.total_seconds() / 60.0)
    # epoch   time   trn_loss   val_loss   accuracy     
    metrics = [str(round(i, 4)) for i in [epoch, len(trn_loader), minutes, trn_loss, val_loss, val_top1]]
    print('\t\t'.join(metrics))

Files already downloaded and verified
Files already downloaded and verified
Namespace(batch_size=256, data='/home/paperspace/data/cifar10', dist_backend='nccl', dist_url='env://', distributed=False, full_precision=True, local_rank=0, loss_scale=1, momentum=0.9, phases='[(0,2e-1,15),(2e-1,1e-2,15),(1e-2,0,5)]', print_freq=200, save_dir=PosixPath('/home/paperspace/cluster/pytorch-cifar'), scale_lr=1, verbose=False, weight_decay=0.0005, workers=8, world_size=-1)



epoch		num_batch	time (min)	trn_loss	val_loss	accuracy
0		196		0.2095		1.7281		1.6887		43.98
1		196		0.4187		1.035		1.2223		59.3
2		196		0.6275		0.775		1.0875		66.09
3		196		0.8374		0.6364		1.2896		62.47
4		196		1.0474		0.5632		0.7241		76.67
5		196		1.2562		0.5082		0.6464		78.71
6		196		1.4674		0.4653		0.6051		79.5


Process Process-1240:
Process Process-1236:
Process Process-1234:
Process Process-1233:
Process Process-1235:
Process Process-1237:
Process Process-1239:
Traceback (most recent call last):
Traceback (most recent call last):
  File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
Traceback (most recent call last):
Traceback (most recent call last):
  File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/m

KeyboardInterrupt
KeyboardInterrupt
  File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/multiprocessing/connection.py", line 407, in _recv_bytes
    buf = self._recv(4)
  File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/multiprocessing/connection.py", line 379, in _recv
    chunk = read(handle, remaining)
  File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/site-packages/torchvision/transforms/transforms.py", line 49, in __call__
    img = t(img)
  File "<ipython-input-6-91dc9b208998>", line 2, in pad
    return Image.fromarray(np.pad(np.asarray(img), ((p, p), (p, p), (0, 0)), padding_mode))
  File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/site-packages/numpy/lib/arraypad.py", line 1301, in pad
    pad_width = _validate_lengths(narray, pad_width)


RuntimeError: DataLoader worker (pid 5515) exited unexpectedly with exit code 1.

  File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/site-packages/numpy/lib/arraypad.py", line 1082, in _validate_lengths
    chk = [1 if x is None else x for x in i]
  File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/site-packages/numpy/lib/arraypad.py", line 1082, in <listcomp>
    chk = [1 if x is None else x for x in i]
KeyboardInterrupt


In [12]:
if args.distributed:
    print('Distributed: initializing process group')
    torch.cuda.set_device(args.local_rank)
    dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url, world_size=args.world_size)
    assert(args.world_size == dist.get_world_size())
    print("Distributed: success (%d/%d)"%(args.local_rank, args.world_size))

model = ResNet18()
model = model.cuda()


if not args.full_precision: model = network_to_half(model)
if args.distributed: model = nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank], output_device=args.local_rank)

# TESTING
args.full_precision = True
args.loss_scale = 1


# AS: todo: don't copy over weights as it seems to help accuracy
global model_params, master_params
if args.full_precision: model_params = master_params = model.parameters()
else: model_params, master_params = prep_param_lists(model)

# optim_params = bnwd_optim_params(model, model_params, master_params)
optim_params = master_params

# define loss function (criterion) and optimizer
criterion = F.cross_entropy
optimizer = LARS(optim_params, lr=0, nesterov=True, eta=0.1, momentum=args.momentum, weight_decay=args.weight_decay)
# optimizer = LARS(optimizer)
scheduler = Scheduler(optimizer, phases=eval(args.phases))


sz = 32
trn_loader, val_loader = torch_loader(args.data, sz, args.batch_size, args.batch_size*2)

print(args)
print('\n\n')
print("epoch\t\tnum_batch\ttime (min)\ttrn_loss\tval_loss\taccuracy")
start_time = datetime.now() # Loading start to after everything is loaded
for epoch in range(scheduler.tot_epochs):
    trn_top1, trn_loss = train(trn_loader, model, criterion, optimizer, scheduler, epoch)
    val_top1, val_loss = validate(val_loader, model, criterion, epoch, start_time)

    time_diff = datetime.now()-start_time
    minutes = float(time_diff.total_seconds() / 60.0)
    # epoch   time   trn_loss   val_loss   accuracy     
    metrics = [str(round(i, 4)) for i in [epoch, len(trn_loader), minutes, trn_loss, val_loss, val_top1]]
    print('\t\t'.join(metrics))

Files already downloaded and verified
Files already downloaded and verified
Namespace(batch_size=256, data='/home/paperspace/data/cifar10', dist_backend='nccl', dist_url='env://', distributed=False, full_precision=True, local_rank=0, loss_scale=1, momentum=0.9, phases='[(0,2e-1,15),(2e-1,1e-2,15),(1e-2,0,5)]', print_freq=200, save_dir=PosixPath('/home/paperspace/cluster/pytorch-cifar'), scale_lr=1, verbose=False, weight_decay=0.0005, workers=8, world_size=-1)



epoch		num_batch	time (min)	trn_loss	val_loss	accuracy
0		196		0.1799		1.6529		1.5806		46.16
1		196		0.3402		1.0252		1.2427		57.87
2		196		0.5021		0.7953		1.0173		67.06
3		196		0.6651		0.6775		1.133		64.4
4		196		0.828		0.6355		1.2792		57.2
5		196		0.993		0.6217		0.7208		75.02
6		196		1.1539		0.6175		0.9251		71.35
7		196		1.3183		0.644		1.6398		55.06
8		196		1.4817		0.683		1.6067		56.29
9		196		1.6452		0.7297		1.0064		66.64
10		196		1.8063		0.7822		1.2154		58.1
11		196		1.9661		0.8309		2.1561		45.02
12		196		2.1274		0.8783		1.6

Process Process-213:
Process Process-214:
Process Process-216:
Traceback (most recent call last):
Traceback (most recent call last):
Process Process-210:
Process Process-211:
Process Process-215:
Process Process-212:
Traceback (most recent call last):
  File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
Process Process-209:
  File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
Traceback (most recent call last):
T

KeyboardInterrupt
  File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/site-packages/numpy/lib/arraypad.py", line 1446, in pad
    newmat = _pad_ref(newmat, (pad_before, pad_after), method, axis)
  File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/multiprocessing/connection.py", line 407, in _recv_bytes
    buf = self._recv(4)
KeyboardInterrupt
KeyboardInterrupt
  File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/multiprocessing/connection.py", line 379, in _recv
    chunk = read(handle, remaining)
KeyboardInterrupt
  File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/site-packages/torchvision/transforms/transforms.py", line 49, in __call__
    img = t(img)
  File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/site-packages/torchvision/transforms/transforms.py", line 76, in __call__
    return F.to_tensor(pic)
  File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/site-packages/torchvision/transforms/functional.py", line 83, in to_tensor
 

Traceback (most recent call last):
  File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2963, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-12-3605d70d6d69>", line 43, in <module>
    trn_top1, trn_loss = train(trn_loader, model, criterion, optimizer, scheduler, epoch)
  File "<ipython-input-8-7d0ced679dc0>", line 29, in train
    output = model(input)
  File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/site-packages/torch/nn/modules/module.py", line 491, in __call__
    result = self.forward(*input, **kwargs)
  File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/site-packages/torch/nn/modules/container.py", line 91, in forward
    input = module(input)
  File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/site-packages/torch/nn/modules/module.py", line 491, in __call__
    result = self.forward(*input, **kwargs)
  File "<ipython-input-5-08c899b22ea2>", line 6

KeyboardInterrupt: 

  File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/site-packages/numpy/lib/arraypad.py", line 838, in _pad_ref
    return np.concatenate((ref_chunk1, arr, ref_chunk2), axis=axis)
KeyboardInterrupt


In [13]:
if args.distributed:
    print('Distributed: initializing process group')
    torch.cuda.set_device(args.local_rank)
    dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url, world_size=args.world_size)
    assert(args.world_size == dist.get_world_size())
    print("Distributed: success (%d/%d)"%(args.local_rank, args.world_size))

model = ResNet18()
model = model.cuda()


if not args.full_precision: model = network_to_half(model)
if args.distributed: model = nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank], output_device=args.local_rank)

# TESTING
args.full_precision = True
args.loss_scale = 1


# AS: todo: don't copy over weights as it seems to help accuracy
global model_params, master_params
if args.full_precision: model_params = master_params = model.parameters()
else: model_params, master_params = prep_param_lists(model)

# optim_params = bnwd_optim_params(model, model_params, master_params)
optim_params = master_params

# define loss function (criterion) and optimizer
criterion = F.cross_entropy
optimizer = LARS(optim_params, lr=0, nesterov=True, eta=0.01, momentum=args.momentum, weight_decay=args.weight_decay)
# optimizer = LARS(optimizer)
scheduler = Scheduler(optimizer, phases=eval(args.phases))


sz = 32
trn_loader, val_loader = torch_loader(args.data, sz, args.batch_size, args.batch_size*2)

print(args)
print('\n\n')
print("epoch\t\tnum_batch\ttime (min)\ttrn_loss\tval_loss\taccuracy")
start_time = datetime.now() # Loading start to after everything is loaded
for epoch in range(scheduler.tot_epochs):
    trn_top1, trn_loss = train(trn_loader, model, criterion, optimizer, scheduler, epoch)
    val_top1, val_loss = validate(val_loader, model, criterion, epoch, start_time)

    time_diff = datetime.now()-start_time
    minutes = float(time_diff.total_seconds() / 60.0)
    # epoch   time   trn_loss   val_loss   accuracy     
    metrics = [str(round(i, 4)) for i in [epoch, len(trn_loader), minutes, trn_loss, val_loss, val_top1]]
    print('\t\t'.join(metrics))

Files already downloaded and verified
Files already downloaded and verified
Namespace(batch_size=256, data='/home/paperspace/data/cifar10', dist_backend='nccl', dist_url='env://', distributed=False, full_precision=True, local_rank=0, loss_scale=1, momentum=0.9, phases='[(0,2e-1,15),(2e-1,1e-2,15),(1e-2,0,5)]', print_freq=200, save_dir=PosixPath('/home/paperspace/cluster/pytorch-cifar'), scale_lr=1, verbose=False, weight_decay=0.0005, workers=8, world_size=-1)



epoch		num_batch	time (min)	trn_loss	val_loss	accuracy
0		196		0.2574		2.0941		1.6994		37.41
1		196		0.4985		1.5059		1.3106		52.06
2		196		0.7441		1.2128		1.0713		61.56
3		196		0.9884		1.0105		0.9797		65.11
4		196		1.2306		0.8554		0.8806		69.48
5		196		1.4749		0.7441		0.8309		71.5
6		196		1.7182		0.6497		0.6912		76.3
7		196		1.9626		0.5838		0.8357		73.27
8		196		2.2055		0.538		0.6456		78.43
9		196		2.4498		0.501		0.6228		78.89
10		196		2.6932		0.4629		0.5708		80.92
11		196		2.9372		0.4447		0.6431		78.99
12		196		3.1781		0.4206	

In [15]:
if args.distributed:
    print('Distributed: initializing process group')
    torch.cuda.set_device(args.local_rank)
    dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url, world_size=args.world_size)
    assert(args.world_size == dist.get_world_size())
    print("Distributed: success (%d/%d)"%(args.local_rank, args.world_size))

model = ResNet18()
model = model.cuda()


if not args.full_precision: model = network_to_half(model)
if args.distributed: model = nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank], output_device=args.local_rank)

# TESTING
args.full_precision = True
args.loss_scale = 1


# AS: todo: don't copy over weights as it seems to help accuracy
global model_params, master_params
if args.full_precision: model_params = master_params = model.parameters()
else: model_params, master_params = prep_param_lists(model)

# optim_params = bnwd_optim_params(model, model_params, master_params)
optim_params = master_params

# define loss function (criterion) and optimizer
criterion = F.cross_entropy
optimizer = LARS(optim_params, lr=0, nesterov=True, eta=0.03, momentum=args.momentum, weight_decay=args.weight_decay)
# optimizer = LARS(optimizer)
scheduler = Scheduler(optimizer, phases=eval(args.phases))


sz = 32
trn_loader, val_loader = torch_loader(args.data, sz, args.batch_size, args.batch_size*2)

print(args)
print('\n\n')
print("epoch\t\tnum_batch\ttime (min)\ttrn_loss\tval_loss\taccuracy")
start_time = datetime.now() # Loading start to after everything is loaded
for epoch in range(scheduler.tot_epochs):
    trn_top1, trn_loss = train(trn_loader, model, criterion, optimizer, scheduler, epoch)
    val_top1, val_loss = validate(val_loader, model, criterion, epoch, start_time)

    time_diff = datetime.now()-start_time
    minutes = float(time_diff.total_seconds() / 60.0)
    # epoch   time   trn_loss   val_loss   accuracy     
    metrics = [str(round(i, 4)) for i in [epoch, len(trn_loader), minutes, trn_loss, val_loss, val_top1]]
    print('\t\t'.join(metrics))

Files already downloaded and verified
Files already downloaded and verified
Namespace(batch_size=256, data='/home/paperspace/data/cifar10', dist_backend='nccl', dist_url='env://', distributed=False, full_precision=True, local_rank=0, loss_scale=1, momentum=0.9, phases='[(0,2e-1,15),(2e-1,1e-2,15),(1e-2,0,5)]', print_freq=200, save_dir=PosixPath('/home/paperspace/cluster/pytorch-cifar'), scale_lr=1, verbose=False, weight_decay=0.0005, workers=8, world_size=-1)



epoch		num_batch	time (min)	trn_loss	val_loss	accuracy
0		196		0.2456		1.877		1.5124		44.97
1		196		0.4912		1.2414		1.2029		57.96
2		196		0.736		0.9539		1.0196		65.09
3		196		0.9795		0.7618		0.7803		73.53
4		196		1.2221		0.6561		0.8809		71.3
5		196		1.4669		0.5838		0.8586		70.75
6		196		1.7135		0.5367		0.694		76.11
7		196		1.956		0.509		0.9822		69.12
8		196		2.1984		0.485		0.6546		77.93
9		196		2.4431		0.4718		0.7213		76.77
10		196		2.6843		0.461		0.6415		78.16
11		196		2.9286		0.4541		1.7203		57.35
12		196		3.1719		0.449		0.66

In [16]:
if args.distributed:
    print('Distributed: initializing process group')
    torch.cuda.set_device(args.local_rank)
    dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url, world_size=args.world_size)
    assert(args.world_size == dist.get_world_size())
    print("Distributed: success (%d/%d)"%(args.local_rank, args.world_size))

model = ResNet18()
model = model.cuda()


if not args.full_precision: model = network_to_half(model)
if args.distributed: model = nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank], output_device=args.local_rank)

# TESTING
args.full_precision = True
args.loss_scale = 1


# AS: todo: don't copy over weights as it seems to help accuracy
global model_params, master_params
if args.full_precision: model_params = master_params = model.parameters()
else: model_params, master_params = prep_param_lists(model)

# optim_params = bnwd_optim_params(model, model_params, master_params)
optim_params = master_params

# define loss function (criterion) and optimizer
criterion = F.cross_entropy
optimizer = LARS(optim_params, lr=0, nesterov=True, eta=0.02, momentum=args.momentum, weight_decay=args.weight_decay)
# optimizer = LARS(optimizer)
scheduler = Scheduler(optimizer, phases=[(0,4e-1,15),(4e-1,2e-2,15),(2e-2,0,5)])


sz = 32
trn_loader, val_loader = torch_loader(args.data, sz, 512, 512)

print(args)
print('\n\n')
print("epoch\t\tnum_batch\ttime (min)\ttrn_loss\tval_loss\taccuracy")
start_time = datetime.now() # Loading start to after everything is loaded
for epoch in range(scheduler.tot_epochs):
    trn_top1, trn_loss = train(trn_loader, model, criterion, optimizer, scheduler, epoch)
    val_top1, val_loss = validate(val_loader, model, criterion, epoch, start_time)

    time_diff = datetime.now()-start_time
    minutes = float(time_diff.total_seconds() / 60.0)
    # epoch   time   trn_loss   val_loss   accuracy     
    metrics = [str(round(i, 4)) for i in [epoch, len(trn_loader), minutes, trn_loss, val_loss, val_top1]]
    print('\t\t'.join(metrics))

Files already downloaded and verified
Files already downloaded and verified
Namespace(batch_size=256, data='/home/paperspace/data/cifar10', dist_backend='nccl', dist_url='env://', distributed=False, full_precision=True, local_rank=0, loss_scale=1, momentum=0.9, phases='[(0,2e-1,15),(2e-1,1e-2,15),(1e-2,0,5)]', print_freq=200, save_dir=PosixPath('/home/paperspace/cluster/pytorch-cifar'), scale_lr=1, verbose=False, weight_decay=0.0005, workers=8, world_size=-1)



epoch		num_batch	time (min)	trn_loss	val_loss	accuracy
0		98		0.2345		1.8791		1.4283		47.64
1		98		0.4587		1.2298		1.171		57.43


Exception ignored in: <function WeakValueDictionary.__init__.<locals>.remove at 0x7f47f27e2378>
Process Process-1395:
Traceback (most recent call last):
  File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/weakref.py", line 109, in remove
Process Process-1396:
Process Process-1399:
Traceback (most recent call last):
Process Process-1397:
Process Process-1400:
  File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
Process Process-1398:
Traceback (most recent call last):
    def remove(wr, selfref=ref(self), _atomic_removal=_remove_dead_weakref):
  File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
Traceback (most recent call last):
KeyboardInterrupt
  File "/home/paperspace/anaconda

KeyboardInterrupt
Traceback (most recent call last):
  File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 57, in _worker_loop
    samples = collate_fn([dataset[i] for i in batch_indices])
  File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 57, in <listcomp>
    samples = collate_fn([dataset[i] for i in batch_indices])
  File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/site-packages/torchvision/datasets/cifar.py", line 121, in __getitem__
    img = self.transform(img)
  File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/site-packages/numpy/lib/arraypad.py", line 1446, in pad

Traceback (most recent call last):
  File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2963, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-16-178544193674>", line 43, in <module>
    trn_top1, trn_loss = train(trn_loader, model, criterion, optimizer, scheduler, epoch)
  File "<ipython-input-8-7d0ced679dc0>", line 21, in train
    for i,(input,target) in enumerate(trn_loader):
  File "<ipython-input-6-91dc9b208998>", line 66, in __iter__
    self.preload()
  File "<ipython-input-6-91dc9b208998>", line 54, in preload
    self.next_input, self.next_target = next(self.loaditer)
  File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 280, in __next__
    idx, batch = self._get_batch()
  File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 259, in _get_batch
    return self.

KeyboardInterrupt: 

  File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/site-packages/numpy/lib/arraypad.py", line 989, in _normalize_shape
    def _normalize_shape(ndarray, shape, cast_to_int=True):
KeyboardInterrupt
