In [1]:
import torch
from pathlib import Path
import os
import numpy as np
import torch.nn as nn
from datetime import datetime

import torch.nn.functional as F
import torchvision.transforms as transforms
import torchvision.datasets as datasets
from torch.utils.data import DataLoader
import argparse, os, shutil, time, warnings

from fp16util import *
from resnet import *
from PIL import Image
from torch.nn.parameter import Parameter

In [2]:

def get_parser():
    """Build the command-line parser for the training run.

    Returns:
        argparse.ArgumentParser: parser with one positional argument (``data``,
        the dataset directory) plus options for optimisation, logging,
        precision and distributed training.

    Help strings that mention a default use argparse's ``%(default)s``
    placeholder so the documented value cannot drift from the real one.
    """
    parser = argparse.ArgumentParser(description='PyTorch ImageNet Training')
    parser.add_argument('data', metavar='DIR', help='path to dataset')
    # The default is a Path object: argparse only applies `type` to string defaults.
    parser.add_argument('--save-dir', type=str, default=Path.cwd(), help='Directory to save logs and models.')
    parser.add_argument('-j', '--workers', default=8, type=int, metavar='N',
                        help='number of data loading workers (default: %(default)s)')
    parser.add_argument('--momentum', default=0.9, type=float, metavar='M', help='momentum')
    parser.add_argument('--weight-decay', '--wd', default=5e-4, type=float,
                        metavar='W', help='weight decay (default: %(default)s)')
    parser.add_argument('-b', '--batch-size', default=256, type=int,
                        metavar='N', help='mini-batch size (default: %(default)s)')
    parser.add_argument('--phases', default='[(0,2e-1,16),(2e-1,1e-2,16),(1e-2,0,5)]', type=str,
                        help='Should be a string formatted like this: [(start_lr,end_lr,num_epochs),(phase2...)]')
    parser.add_argument('--verbose', action='store_true', help='Verbose logging')
#     parser.add_argument('--init-bn0', action='store_true', help='Initialize running batch norm mean to 0')
    parser.add_argument('--print-freq', '-p', default=200, type=int,
                        metavar='N', help='print every this many steps (default: %(default)s)')
#     parser.add_argument('--no-bn-wd', action='store_true', help='Remove batch norm from weight decay')
    parser.add_argument('--full-precision', action='store_true', help='Run model full precision mode. Default fp16')
    parser.add_argument('--loss-scale', type=float, default=512,
                        help='Loss scaling, positive power of 2 values can improve fp16 convergence.')
    parser.add_argument('--distributed', action='store_true', help='Run distributed training')
    parser.add_argument('--world-size', default=-1, type=int,
                        help='total number of processes (machines*gpus)')
    parser.add_argument('--scale-lr', type=float, default=1,
                        help='Multiplier for the learning rate; scale it proportionally to world size')
    parser.add_argument('--dist-url', default='env://', type=str,
                        help='url used to set up distributed training')
    parser.add_argument('--dist-backend', default='nccl', type=str, help='distributed backend')
    parser.add_argument('--local_rank', default=0, type=int,
                        help='Used for multi-process training. Can either be manually set ' +
                        'or automatically set by using \'python -m multiproc\'.')
    return parser


In [3]:
# Stand-in for sys.argv: the dataset directory followed by optional flags.
args_input = [str(Path.home().joinpath('data/cifar10/'))]
args_input.extend(['--phases', '[(0,2e-1,15),(2e-1,1e-2,15),(1e-2,0,5)]'])
# Alternatives kept for reference:
#     '--phases', '[(0,2e-1,16),(2e-1,1e-2,16),(1e-2,0,5)]'
#     '--full-precision'

In [4]:
# Parse the notebook's argument list into the module-level `args` namespace
# that the functions below read directly.
args = get_parser().parse_args(args_input)
if args.full_precision:
    # Loss scaling is only used on the fp16 path.
    args.loss_scale = 1
torch.backends.cudnn.benchmark = True

## Model

In [5]:
# --
# Model definition
# Derived from models in `https://github.com/kuangliu/pytorch-cifar`
class PreActBlock(nn.Module):
    """Pre-activation residual block: (BN -> ReLU -> 3x3 conv) twice, plus a skip path.

    When the block changes resolution (``stride != 1``) or width
    (``in_channels != out_channels``) the skip path is a 1x1 convolution applied
    to the first activation; otherwise the raw input is added back unchanged.
    """

    def __init__(self, in_channels, out_channels, stride=1):
        super().__init__()

        conv3x3 = dict(kernel_size=3, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(in_channels)
        self.conv1 = nn.Conv2d(in_channels, out_channels, stride=stride, **conv3x3)
        self.bn2 = nn.BatchNorm2d(out_channels)
        self.conv2 = nn.Conv2d(out_channels, out_channels, stride=1, **conv3x3)

        needs_projection = (stride != 1) or (in_channels != out_channels)
        if needs_projection:
            projection = nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=stride, bias=False)
            # The attribute only exists on blocks that need it; forward() checks for it.
            self.shortcut = nn.Sequential(projection)

    def forward(self, x):
        activated = F.relu(self.bn1(x))
        if hasattr(self, 'shortcut'):
            residual = self.shortcut(activated)
        else:
            residual = x
        hidden = self.conv1(activated)
        hidden = self.bn2(hidden)
        hidden = self.conv2(F.relu(hidden))
        return hidden + residual


class ResNet18(nn.Module):
    """Pre-activation ResNet: a conv stem, four stages of PreActBlock, and a linear head.

    The head consumes the concatenation of globally average-pooled and globally
    max-pooled features; the last stage has 256 channels, so the classifier
    input is 2 * 256 = 512 features.

    Args:
        num_blocks: number of residual blocks in each of the four stages.
        num_classes: size of the output layer.
    """

    def __init__(self, num_blocks=[2, 2, 2, 2], num_classes=10):
        super().__init__()

        self.in_channels = 64

        stem_conv = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False)
        self.prep = nn.Sequential(stem_conv, nn.BatchNorm2d(64), nn.ReLU())

        # (in_channels, out_channels, stride of the first block) per stage
        stage_specs = [(64, 64, 1), (64, 128, 2), (128, 256, 2), (256, 256, 2)]
        stages = [
            self._make_layer(c_in, c_out, num_blocks[idx], stride=first_stride)
            for idx, (c_in, c_out, first_stride) in enumerate(stage_specs)
        ]
        self.layers = nn.Sequential(*stages)

        self.classifier = nn.Linear(512, num_classes)

    def _make_layer(self, in_channels, out_channels, num_blocks, stride):
        """Stack blocks for one stage; only the first block uses `stride`."""
        per_block_strides = [stride] + [1] * (num_blocks - 1)
        blocks = []
        for position, block_stride in enumerate(per_block_strides):
            # Only the first block sees the stage's input width.
            width_in = in_channels if position == 0 else out_channels
            blocks.append(PreActBlock(in_channels=width_in, out_channels=out_channels, stride=block_stride))
        return nn.Sequential(*blocks)

    def forward(self, x):
        features = self.layers(self.prep(x))

        pooled = [
            pool(features, (1, 1))
            for pool in (F.adaptive_avg_pool2d, F.adaptive_max_pool2d)
        ]
        # Flatten each (N, C, 1, 1) map to (N, C), then join avg and max features.
        flattened = [t.view(t.size(0), -1) for t in pooled]
        combined = torch.cat(flattened, dim=-1)

        return self.classifier(combined)

### Torch loader

In [6]:
def pad(img, p=4, padding_mode='reflect'):
    """Pad an image by `p` pixels on each side of its first two axes.

    The pad widths are given for three axes, so `img` must convert to a 3-D
    array; the third axis is left unpadded. Returns a new PIL image.
    """
    pixels = np.asarray(img)
    border = ((p, p), (p, p), (0, 0))
    padded = np.pad(pixels, border, padding_mode)
    return Image.fromarray(padded)

def torch_loader(data_path, size, bs, val_bs=None):
    """Create prefetching CIFAR-10 train/validation loaders.

    Args:
        data_path: dataset root; created if missing.
        size: side length passed to the random crop of the training images.
        bs: training batch size.
        val_bs: validation batch size; falls back to `bs` when falsy.

    Returns:
        (train_loader, val_loader), each wrapped in a DataPrefetcher.

    Reads `args.local_rank`, `args.distributed` and `args.workers` from the
    module-level `args`. Only local rank 0 asks torchvision to download.
    """
    os.makedirs(data_path, exist_ok=True)

    if not val_bs:
        val_bs = bs

    # Transforms shared by both splits.
    base_tfms = [
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465), (0.24703,0.24349,0.26159)),
    ]
    augment_tfms = [
        pad, # TODO: use `padding` rather than assuming 4
        transforms.RandomCrop(size),
        transforms.RandomHorizontalFlip(),
    ]
    train_tfms = transforms.Compose(augment_tfms + base_tfms)
    val_tfms = transforms.Compose(base_tfms)

    is_primary = (args.local_rank == 0)
    train_dataset = datasets.CIFAR10(root=data_path, train=True, download=is_primary, transform=train_tfms)
    val_dataset = datasets.CIFAR10(root=data_path, train=False, download=is_primary, transform=val_tfms)

    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
    else:
        train_sampler = None
    # Validation is not sharded: every process evaluates the full set.
    val_sampler = None

    common = dict(num_workers=args.workers, pin_memory=True)
    train_loader = DataLoader(
        train_dataset, batch_size=bs, shuffle=(train_sampler is None),
        sampler=train_sampler, **common)
    val_loader = DataLoader(
        val_dataset, batch_size=val_bs, shuffle=False,
        sampler=val_sampler, **common)

    return DataPrefetcher(train_loader), DataPrefetcher(val_loader)

# Seems to speed up training by ~2%
class DataPrefetcher():
    """Iterate a loader while copying the next batch to the GPU on a side stream.

    Args:
        loader: iterable yielding ``(input, target)`` tensor pairs; must expose
            ``.dataset`` and support ``len()``.
        stop_after: optional int batch limit (see the note in ``__iter__``).
    """

    def __init__(self, loader, stop_after=None):
        self.loader = loader
        self.dataset = loader.dataset
        self.stream = torch.cuda.Stream()
        self.stop_after = stop_after
        self.next_input = None
        self.next_target = None

    def __len__(self):
        return len(self.loader)

    def preload(self):
        """Fetch the next batch and start its host-to-device copy; set both slots to None at the end."""
        try:
            self.next_input, self.next_target = next(self.loaditer)
        except StopIteration:
            self.next_input = None
            self.next_target = None
            return
        with torch.cuda.stream(self.stream):
            # `async` became a reserved word in Python 3.7, so `.cuda(async=True)`
            # no longer parses; `non_blocking` is the replacement keyword.
            self.next_input = self.next_input.cuda(non_blocking=True)
            self.next_target = self.next_target.cuda(non_blocking=True)

    def __iter__(self):
        count = 0
        self.loaditer = iter(self.loader)
        self.preload()
        while self.next_input is not None:
            # Make the consumer's stream wait for the copy issued on the side stream.
            torch.cuda.current_stream().wait_stream(self.stream)
            input = self.next_input
            target = self.next_target
            self.preload()
            count += 1
            yield input, target
            # NOTE(review): the check runs after the yield and uses `>`, so an int
            # `stop_after` lets `stop_after + 1` batches through - confirm intent.
            if type(self.stop_after) is int and (count > self.stop_after):
                break

### Learning rate scheduler

In [7]:
class Scheduler():
    """Piecewise-linear learning-rate schedule applied per batch.

    Args:
        optimizer: optimizer whose ``param_groups`` receive the learning rate.
        phases: sequence of ``(start_lr, end_lr, num_epochs)`` tuples. Within a
            phase the rate moves linearly from ``start_lr`` to ``end_lr``.

    Epochs are 0-based (the training loops iterate ``range(tot_epochs)``) and
    batches are 1-based, so the last batch of a phase lands exactly on ``end_lr``.
    Reads ``args.scale_lr`` and ``args.verbose`` from the module-level ``args``.
    """

    def __init__(self, optimizer, phases=[(0,2e-1,15),(2e-1,1e-2,15),(1e-2,0,5)]):
        self.optimizer = optimizer
        self.current_lr = None
        self.phases = phases
        self.tot_epochs = sum([p[2] for p in phases])

    def linear_lr(self, start_lr, end_lr, epoch_curr, batch_curr, epoch_tot, batch_tot):
        """Linearly interpolate between the (scaled) start and end rates at the given step."""
        if args.scale_lr != 1:
            start_lr *= args.scale_lr
            end_lr *= args.scale_lr
        step_tot = epoch_tot * batch_tot
        step_curr = epoch_curr * batch_tot + batch_curr
        step_size = (end_lr - start_lr)/step_tot
        return start_lr + step_curr * step_size

    def get_current_phase(self, epoch):
        """Return ``(start_lr, end_lr, num_epochs, epoch_within_phase)`` for a 0-based epoch.

        Raises:
            Exception: if ``epoch`` lies beyond the last phase.
        """
        epoch_accum = 0
        for phase in self.phases:
            start_lr,end_lr,num_epochs = phase
            # Strict `<`: a phase covers [epoch_accum, epoch_accum + num_epochs).
            # With `<=`, the first epoch of the next phase stayed in this one and
            # the rate was extrapolated past end_lr.
            if epoch < epoch_accum+num_epochs: return start_lr, end_lr, num_epochs, epoch - epoch_accum
            epoch_accum += num_epochs
        raise Exception('Epoch out of range')

    def get_lr(self, epoch, batch_curr, batch_tot):
        """Learning rate for batch ``batch_curr`` (1-based) of ``batch_tot`` in ``epoch``."""
        start_lr, end_lr, num_epochs, relative_epoch = self.get_current_phase(epoch)
        return self.linear_lr(start_lr, end_lr, relative_epoch, batch_curr, num_epochs, batch_tot)

    def update_lr(self, epoch, batch_num, batch_tot):
        """Compute the current rate and write it into every optimizer param group."""
        lr = self.get_lr(epoch, batch_num, batch_tot)
        # In verbose mode, report only at the first and last batch of an epoch.
        if args.verbose and (self.current_lr != lr) and ((batch_num == 1) or (batch_num == batch_tot)):
            print(f'Changing LR from {self.current_lr} to {lr}')

        self.current_lr = lr

        for param_group in self.optimizer.param_groups:
            param_group['lr'] = lr

In [8]:
# item() is a recent addition, so this helps with backward compatibility.
def to_python_float(t):
    """Unwrap a scalar: numbers pass through, objects with `.item()` use it, anything else is indexed at 0."""
    if isinstance(t, (float, int)):
        return t
    return t.item() if hasattr(t, 'item') else t[0]

def train(trn_loader, model, criterion, optimizer, scheduler, epoch):
    """Run one training epoch and return ``(top1.avg, losses.avg)``.

    The learning rate is updated before every batch through ``scheduler``.
    Depends on module-level state: ``args`` (distributed, loss_scale,
    full_precision, print_freq) and, on the fp16 path, ``model_params`` /
    ``master_params``. The distributed branch also uses ``dist``, which must be
    in scope - NOTE(review): no import of it is visible in this notebook.
    """
    batch_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()

    # switch to train mode
    model.train()
    end = time.time()

    st = time.time()  # only referenced by the commented-out print below
    trn_len = len(trn_loader)

    # print('Begin training loop:', st)
    for i,(input,target) in enumerate(trn_loader):
        batch_size = input.size(0)
        batch_num = i+1
        
        # set the learning rate for this step (batches are numbered from 1)
        scheduler.update_lr(epoch, i+1, trn_len)

        # compute output
        output = model(input)
        loss = criterion(output, target)

        if args.distributed:
            # Must keep track of global batch size, since not all machines are guaranteed equal batches at the end of an epoch
            corr1 = correct(output.data, target)[0]
            # pack the three scalars into one tensor so a single all-reduce sums them
            metrics = torch.tensor([batch_size, loss, corr1]).float().cuda()
            batch_total, reduced_loss, corr1 = sum_tensor(metrics)
            # summed loss -> mean over processes
            reduced_loss = reduced_loss/dist.get_world_size()
            prec1 = corr1*(100.0/batch_total)
        else:
            reduced_loss = loss.data
            batch_total = input.size(0)
            prec1 = accuracy(output.data, target)[0] # measure accuracy and record loss
        losses.update(to_python_float(reduced_loss), to_python_float(batch_total))
        top1.update(to_python_float(prec1), to_python_float(batch_total))

        # scale the loss before backward; the fp16 branch divides the gradients back
        loss = loss*args.loss_scale
        
        # compute gradient and do SGD step
        if args.full_precision:
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        else:
            # fp16 path: backward on the model, copy grads to the master params,
            # undo the loss scale, step the optimizer, then copy weights back
            model.zero_grad()
            loss.backward()
            model_grads_to_master_grads(model_params, master_params)
            for param in master_params:
                param.grad.data = param.grad.data/args.loss_scale
            optimizer.step()
            master_params_to_model_params(model_params, master_params)
            torch.cuda.synchronize()

        # measure elapsed time for this batch
        batch_time.update(time.time() - end)
        end = time.time()

        # log every `print_freq` batches and always on the last batch
        should_print = (batch_num%args.print_freq == 0) or (batch_num==trn_len)
        if should_print: log_batch(epoch, batch_num, trn_len, batch_time, losses, top1)
    return top1.avg, losses.avg

def validate(val_loader, model, criterion, epoch, start_time):
    """Evaluate `model` on `val_loader` and return ``(top1.avg, losses.avg)``.

    `start_time` is accepted but not used by this function. Logging follows
    the module-level ``args.print_freq``.
    """
    batch_time, losses, top1 = AverageMeter(), AverageMeter(), AverageMeter()

    model.eval()
    end = time.time()
    val_len = len(val_loader)

    for batch_num, (input, target) in enumerate(val_loader, start=1):
        with torch.no_grad():
            output = model(input)
            loss = criterion(output, target).data
        batch_total = input.size(0)
        prec1 = accuracy(output.data, target)[0]

        losses.update(to_python_float(loss), batch_total)
        top1.update(to_python_float(prec1), batch_total)

        # measure elapsed time
        now = time.time()
        batch_time.update(now - end)
        end = time.time()

        is_last = (batch_num == val_len)
        if (batch_num % args.print_freq == 0) or is_last:
            log_batch(epoch, batch_num, val_len, batch_time, losses, top1)

    return top1.avg, losses.avg

def log_batch(epoch, batch_num, batch_len, batch_time, loss, top1):
    """Print one progress line and append it to ``<save_dir>/full.log``.

    Does nothing unless this is local rank 0 and ``args.verbose`` is set.
    `batch_time`, `loss` and `top1` must expose ``.val`` and ``.avg``.
    """
    if args.local_rank != 0 or not args.verbose:
        return
    template = ('Epoch: [{0}][{1}/{2}]\t'
                'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                'Prec@1 {top1.val:.3f} ({top1.avg:.3f})')
    output = template.format(
        epoch, batch_num, batch_len, batch_time=batch_time, loss=loss, top1=top1)
    print(output)
    with open(f'{args.save_dir}/full.log', 'a') as f:
        f.write(output + '\n')
            
class AverageMeter(object):
    """Tracks the latest value and the weighted running mean of a metric.

    Attributes:
        val: most recent value passed to `update`.
        sum: weighted sum of all values.
        count: total weight seen.
        avg: ``sum / count``.
    """

    def __init__(self):
        self.reset()

    def reset(self):
        """Zero every statistic."""
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        """Record `val` with weight `n` and refresh the running mean."""
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count

def accuracy(output, target, topk=(1,)):
    """Return, for each k in `topk`, the top-k hit count as a percentage of the batch."""
    hits_per_k = correct(output, target, topk)
    batch_size = target.size(0)
    percentages = []
    for hits in hits_per_k:
        percentages.append(hits.float().mul_(100.0 / batch_size))
    return percentages

def correct(output, target, topk=(1,)):
    """Count, for each k in `topk`, the samples whose target is among the top-k predictions.

    Args:
        output: 2-D score tensor; the top-k is taken along dim 1.
        target: tensor of class indices, one per row of `output`.
        topk: iterable of k values.

    Returns:
        list of 1-element tensors holding the hit count for each k
        (counts, not percentages; `accuracy` does the conversion).
    """
    maxk = max(topk)

    _, pred = output.topk(maxk, 1, True, True)
    pred = pred.t()
    hits = pred.eq(target.view(1, -1).expand_as(pred))

    # `reshape` rather than `view`: the transposed slice is not guaranteed to be
    # contiguous, and `view(-1)` raises on such tensors when k > 1.
    return [hits[:k].reshape(-1).sum(0, keepdim=True) for k in topk]


def sum_tensor(tensor):
    """Return a copy of `tensor` all-reduced with SUM across processes; the input is left untouched."""
    total = tensor.clone()
    dist.all_reduce(total, op=dist.reduce_op.SUM)
    return total

def reduce_tensor(tensor):
    """Return the cross-process sum of `tensor` divided by ``args.world_size``."""
    averaged = sum_tensor(tensor)
    averaged /= args.world_size
    return averaged


In [9]:
# Filter out batch norm parameters and remove them from weight decay - gets us higher accuracy 93.2 -> 93.48
# https://arxiv.org/pdf/1807.11205.pdf
def bnwd_optim_params(model, model_params, master_params):
    """Build optimizer param groups with weight decay disabled for batch-norm parameters."""
    bn_group, other_group = split_bn_params(model, model_params, master_params)
    no_decay = {'params':bn_group,'weight_decay':0}
    default_decay = {'params':other_group}
    return [no_decay, default_decay]


def split_bn_params(model, model_params, master_params):
    """Split master parameters into batch-norm and non-batch-norm lists.

    Args:
        model: module tree to scan for ``_BatchNorm`` layers.
        model_params: iterable of the model's parameters.
        master_params: iterable of master copies, aligned one-to-one with
            `model_params`. It may be the very same object as `model_params`.

    Returns:
        (bn_masters, other_masters): master params whose model counterpart
        belongs / does not belong to a batch-norm layer, in input order.
    """
    bn_type = torch.nn.modules.batchnorm._BatchNorm
    # Identity-based membership: tensors must not be compared with `==`.
    # `modules()` includes `model` itself, so a bare batch-norm model works too.
    bn_param_ids = {
        id(param)
        for module in model.modules()
        if isinstance(module, bn_type)
        for param in module.parameters()
    }

    # Materialise before zipping: if both arguments are the same one-shot
    # generator, zipping it with itself would pair up consecutive elements.
    aliased = master_params is model_params
    model_params = list(model_params)
    master_params = model_params if aliased else list(master_params)

    bn_masters, other_masters = [], []
    for p_mod, p_mast in zip(model_params, master_params):
        bucket = bn_masters if id(p_mod) in bn_param_ids else other_masters
        bucket.append(p_mast)
    return bn_masters, other_masters
    

In [13]:
# Our own implementation of lars
class LARS(torch.optim.Optimizer):
    """SGD with momentum whose step is capped by a layer-wise adaptive rate (LARS).

    For every parameter tensor the local rate is
    ``eta * ||w|| / (||g|| + eps)``, where ``g`` already includes weight decay.
    The applied step size is ``min(local_rate * lr, lr)``, i.e. LARS can only
    shrink the scheduled learning rate. Momentum handling follows
    SGD https://raw.githubusercontent.com/pytorch/pytorch/master/torch/optim/sgd.py

    Args:
        params: iterable of parameters or param-group dicts.
        lr: global learning rate (>= 0).
        momentum, dampening, weight_decay, nesterov: as in ``torch.optim.SGD``.
        eta: LARS "trust" coefficient.
        eps: added to the gradient norm so a zero gradient cannot produce inf/NaN.
        lars: when False the optimizer takes plain SGD steps of size ``lr``.

    Raises:
        ValueError: on negative lr / momentum / weight_decay, or Nesterov
            without momentum or with dampening.
    """

    def __init__(self, params, lr, momentum=0, dampening=0,
                 weight_decay=0, nesterov=False, eta=0.02, eps=1e-8, lars=True):
        self.lr = lr
        # Honour the constructor arguments (previously `lars` was forced to True
        # and `eps` was never used).
        self.lars = lars
        self.eps = eps
        if lr < 0.0:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if momentum < 0.0:
            raise ValueError("Invalid momentum value: {}".format(momentum))
        if weight_decay < 0.0:
            raise ValueError("Invalid weight_decay value: {}".format(weight_decay))
        defaults = dict(lr=lr, momentum=momentum, dampening=dampening,
                        weight_decay=weight_decay, nesterov=nesterov, eta=eta)
        if nesterov and (momentum <= 0 or dampening != 0):
            raise ValueError("Nesterov momentum requires a momentum and zero dampening")
        super().__init__(params, defaults)

    def __setstate__(self, state):
        super().__setstate__(state)
        for group in self.param_groups:
            group.setdefault('nesterov', False)

    def step(self, closure=None):
        """Performs a single optimization step.
        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            weight_decay = group['weight_decay']
            momentum = group['momentum']
            dampening = group['dampening']
            nesterov = group['nesterov']
            eta = group['eta']
            lr = group['lr']

            for p in group['params']:
                if p.grad is None:
                    continue
                d_p = p.grad.data
                if weight_decay != 0:
                    # in place, as before: the decay term is folded into p.grad
                    d_p.add_(p.data, alpha=weight_decay)
                if self.lars:
                    # norms are taken before momentum is applied
                    local_lr = eta * torch.norm(p.data) / (torch.norm(d_p) + self.eps)
                if momentum != 0:
                    param_state = self.state[p]
                    if 'momentum_buffer' not in param_state:
                        buf = param_state['momentum_buffer'] = torch.zeros_like(p.data)
                        buf.mul_(momentum).add_(d_p)
                    else:
                        buf = param_state['momentum_buffer']
                        buf.mul_(momentum).add_(d_p, alpha=1 - dampening)
                    if nesterov:
                        d_p = d_p.add(buf, alpha=momentum)
                    else:
                        d_p = buf
                if self.lars:
                    # never exceed the scheduled learning rate
                    step_size = min(float(local_lr) * lr, lr)
                else:
                    step_size = lr
                p.data.add_(d_p, alpha=-step_size)

        return loss

In [14]:
import torch
from torch import nn
from torch.autograd import Variable
from torch.nn.parameter import Parameter

In [15]:
# Experiment: LARS (eta=0.02), batch size 1024, peak learning rate 0.8.
if args.distributed:
    print('Distributed: initializing process group')
    torch.cuda.set_device(args.local_rank)
    dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url, world_size=args.world_size)
    assert(args.world_size == dist.get_world_size())
    print("Distributed: success (%d/%d)"%(args.local_rank, args.world_size))

# TESTING: force full precision. This must happen before the model is built;
# otherwise a fresh kernel (full_precision=False) converts the model to half
# and then trains it through the fp32 code path.
args.full_precision = True
args.loss_scale = 1

model = ResNet18()
model = model.cuda()


if not args.full_precision: model = network_to_half(model)
if args.distributed: model = nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank], output_device=args.local_rank)


# AS: todo: don't copy over weights as it seems to help accuracy
# A list, not the one-shot generator from model.parameters(): both names stay
# usable after the optimizer has consumed the parameters.
if args.full_precision: model_params = master_params = list(model.parameters())
else: model_params, master_params = prep_param_lists(model)

# optim_params = bnwd_optim_params(model, model_params, master_params)
optim_params = master_params

# define loss function (criterion) and optimizer
criterion = F.cross_entropy
optimizer = LARS(optim_params, lr=0, nesterov=True, eta=0.02, momentum=args.momentum, weight_decay=args.weight_decay)
# optimizer = LARS(optimizer)
scheduler = Scheduler(optimizer, phases=[(0,8e-1,15),(8e-1,4e-2,15),(4e-2,0,5)])


sz = 32
trn_loader, val_loader = torch_loader(args.data, sz, 1024, 1024)

print(args)
print('\n\n')
print("epoch\t\tnum_batch\ttime (min)\ttrn_loss\tval_loss\taccuracy")
start_time = datetime.now() # Loading start to after everything is loaded
for epoch in range(scheduler.tot_epochs):
    trn_top1, trn_loss = train(trn_loader, model, criterion, optimizer, scheduler, epoch)
    val_top1, val_loss = validate(val_loader, model, criterion, epoch, start_time)

    time_diff = datetime.now()-start_time
    minutes = float(time_diff.total_seconds() / 60.0)
    # epoch   time   trn_loss   val_loss   accuracy     
    metrics = [str(round(i, 4)) for i in [epoch, len(trn_loader), minutes, trn_loss, val_loss, val_top1]]
    print('\t\t'.join(metrics))

Files already downloaded and verified
Files already downloaded and verified
Namespace(batch_size=256, data='/home/paperspace/data/cifar10', dist_backend='nccl', dist_url='env://', distributed=False, full_precision=True, local_rank=0, loss_scale=1, momentum=0.9, phases='[(0,2e-1,15),(2e-1,1e-2,15),(1e-2,0,5)]', print_freq=200, save_dir=PosixPath('/home/paperspace/cluster/pytorch-cifar'), scale_lr=1, verbose=False, weight_decay=0.0005, workers=8, world_size=-1)



epoch		num_batch	time (min)	trn_loss	val_loss	accuracy
0		49		0.2673		1.9511		1.993		30.29
1		49		0.4855		1.3614		1.2884		51.53
2		49		0.7049		1.0569		1.1027		61.67
3		49		0.9259		0.848		1.0236		65.64
4		49		1.1457		0.6976		0.8211		71.45
5		49		1.3664		0.6035		0.9812		68.81
6		49		1.5868		0.5398		1.007		67.83
7		49		1.8067		0.5032		0.8582		70.87
8		49		2.0294		0.4737		1.1209		68.29
9		49		2.2496		0.4595		0.8092		73.27
10		49		2.4701		0.4406		0.862		71.75
11		49		2.6914		0.4306		1.1109		64.34
12		49		2.9145		0.425		0.6093		79.13

In [16]:
# Experiment: plain SGD baseline, batch size 1024, peak learning rate 0.8 (first 6 epochs only).
if args.distributed:
    print('Distributed: initializing process group')
    torch.cuda.set_device(args.local_rank)
    dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url, world_size=args.world_size)
    assert(args.world_size == dist.get_world_size())
    print("Distributed: success (%d/%d)"%(args.local_rank, args.world_size))

# TESTING: force full precision. This must happen before the model is built;
# otherwise a fresh kernel (full_precision=False) converts the model to half
# and then trains it through the fp32 code path.
args.full_precision = True
args.loss_scale = 1

model = ResNet18()
model = model.cuda()


if not args.full_precision: model = network_to_half(model)
if args.distributed: model = nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank], output_device=args.local_rank)


# AS: todo: don't copy over weights as it seems to help accuracy
# A list, not the one-shot generator from model.parameters(): both names stay
# usable after the optimizer has consumed the parameters.
if args.full_precision: model_params = master_params = list(model.parameters())
else: model_params, master_params = prep_param_lists(model)

# optim_params = bnwd_optim_params(model, model_params, master_params)
optim_params = master_params

# define loss function (criterion) and optimizer
criterion = F.cross_entropy
optimizer = torch.optim.SGD(optim_params, lr=0, nesterov=True, momentum=args.momentum, weight_decay=args.weight_decay)
# optimizer = LARS(optim_params, lr=0, nesterov=True, eta=0.01, momentum=args.momentum, weight_decay=args.weight_decay)
scheduler = Scheduler(optimizer, phases=[(0,8e-1,15),(8e-1,4e-2,15),(4e-2,0,5)])


sz = 32
trn_loader, val_loader = torch_loader(args.data, sz, 1024, 1024)

print(args)
print('\n\n')
print("epoch\t\tnum_batch\ttime (min)\ttrn_loss\tval_loss\taccuracy")
start_time = datetime.now() # Loading start to after everything is loaded
for epoch in range(scheduler.tot_epochs):
    trn_top1, trn_loss = train(trn_loader, model, criterion, optimizer, scheduler, epoch)
    val_top1, val_loss = validate(val_loader, model, criterion, epoch, start_time)

    time_diff = datetime.now()-start_time
    minutes = float(time_diff.total_seconds() / 60.0)
    # epoch   time   trn_loss   val_loss   accuracy     
    metrics = [str(round(i, 4)) for i in [epoch, len(trn_loader), minutes, trn_loss, val_loss, val_top1]]
    print('\t\t'.join(metrics))
    if epoch == 5: break

Files already downloaded and verified
Files already downloaded and verified
Namespace(batch_size=256, data='/home/paperspace/data/cifar10', dist_backend='nccl', dist_url='env://', distributed=False, full_precision=True, local_rank=0, loss_scale=1, momentum=0.9, phases='[(0,2e-1,15),(2e-1,1e-2,15),(1e-2,0,5)]', print_freq=200, save_dir=PosixPath('/home/paperspace/cluster/pytorch-cifar'), scale_lr=1, verbose=False, weight_decay=0.0005, workers=8, world_size=-1)



epoch		num_batch	time (min)	trn_loss	val_loss	accuracy
0		49		0.2101		1.8789		1.702		40.16
1		49		0.4205		1.4068		2.5529		36.86
2		49		0.63		1.2009		1.109		61.09
3		49		0.8383		0.9147		2.582		39.32
4		49		1.0513		0.7977		0.7512		74.13
5		49		1.262		0.5942		0.9072		70.89


In [17]:
# Experiment: LARS (eta=0.02), batch size and learning rates scaled by 6 (first 6 epochs only).
if args.distributed:
    print('Distributed: initializing process group')
    torch.cuda.set_device(args.local_rank)
    dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url, world_size=args.world_size)
    assert(args.world_size == dist.get_world_size())
    print("Distributed: success (%d/%d)"%(args.local_rank, args.world_size))

# TESTING: force full precision. This must happen before the model is built;
# otherwise a fresh kernel (full_precision=False) converts the model to half
# and then trains it through the fp32 code path.
args.full_precision = True
args.loss_scale = 1

model = ResNet18()
model = model.cuda()


if not args.full_precision: model = network_to_half(model)
if args.distributed: model = nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank], output_device=args.local_rank)


# AS: todo: don't copy over weights as it seems to help accuracy
# A list, not the one-shot generator from model.parameters(): both names stay
# usable after the optimizer has consumed the parameters.
if args.full_precision: model_params = master_params = list(model.parameters())
else: model_params, master_params = prep_param_lists(model)

# optim_params = bnwd_optim_params(model, model_params, master_params)
optim_params = master_params

# Multiplier applied to both the batch size (256) and the phase learning rates.
scale = 6
# define loss function (criterion) and optimizer
criterion = F.cross_entropy
optimizer = LARS(optim_params, lr=0, nesterov=True, eta=0.02, momentum=args.momentum, weight_decay=args.weight_decay)
# optimizer = LARS(optimizer)
scheduler = Scheduler(optimizer, phases=[(0,2e-1*scale,15),(2e-1*scale,1e-2*scale,15),(1e-2*scale,0,5)])


sz = 32
trn_loader, val_loader = torch_loader(args.data, sz, 256*scale, 256*scale)

print(args)
print('\n\n')
print("epoch\t\tnum_batch\ttime (min)\ttrn_loss\tval_loss\taccuracy")
start_time = datetime.now() # Loading start to after everything is loaded
for epoch in range(scheduler.tot_epochs):
    trn_top1, trn_loss = train(trn_loader, model, criterion, optimizer, scheduler, epoch)
    val_top1, val_loss = validate(val_loader, model, criterion, epoch, start_time)

    time_diff = datetime.now()-start_time
    minutes = float(time_diff.total_seconds() / 60.0)
    # epoch   time   trn_loss   val_loss   accuracy     
    metrics = [str(round(i, 4)) for i in [epoch, len(trn_loader), minutes, trn_loss, val_loss, val_top1]]
    print('\t\t'.join(metrics))
    if epoch == 5: break

Files already downloaded and verified
Files already downloaded and verified
Namespace(batch_size=256, data='/home/paperspace/data/cifar10', dist_backend='nccl', dist_url='env://', distributed=False, full_precision=True, local_rank=0, loss_scale=1, momentum=0.9, phases='[(0,2e-1,15),(2e-1,1e-2,15),(1e-2,0,5)]', print_freq=200, save_dir=PosixPath('/home/paperspace/cluster/pytorch-cifar'), scale_lr=1, verbose=False, weight_decay=0.0005, workers=8, world_size=-1)



epoch		num_batch	time (min)	trn_loss	val_loss	accuracy
0		33		0.2587		2.0158		2.639		15.53
1		33		0.4773		1.4443		1.5488		47.52
2		33		0.6957		1.157		1.2442		55.74
3		33		0.9136		0.9473		1.5705		52.83
4		33		1.1369		0.7859		0.9895		66.9
5		33		1.3576		0.6761		1.2923		61.69


In [19]:
# Experiment: as the scale-6 LARS run, but the warm-up starts at 1e-3*scale instead of 0.
if args.distributed:
    print('Distributed: initializing process group')
    torch.cuda.set_device(args.local_rank)
    dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url, world_size=args.world_size)
    assert(args.world_size == dist.get_world_size())
    print("Distributed: success (%d/%d)"%(args.local_rank, args.world_size))

# TESTING: force full precision. This must happen before the model is built;
# otherwise a fresh kernel (full_precision=False) converts the model to half
# and then trains it through the fp32 code path.
args.full_precision = True
args.loss_scale = 1

model = ResNet18()
model = model.cuda()


if not args.full_precision: model = network_to_half(model)
if args.distributed: model = nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank], output_device=args.local_rank)


# AS: todo: don't copy over weights as it seems to help accuracy
# A list, not the one-shot generator from model.parameters(): both names stay
# usable after the optimizer has consumed the parameters.
if args.full_precision: model_params = master_params = list(model.parameters())
else: model_params, master_params = prep_param_lists(model)

# optim_params = bnwd_optim_params(model, model_params, master_params)
optim_params = master_params

# Multiplier applied to both the batch size (256) and the phase learning rates.
scale = 6
# define loss function (criterion) and optimizer
criterion = F.cross_entropy
optimizer = LARS(optim_params, lr=0, nesterov=True, eta=0.02, momentum=args.momentum, weight_decay=args.weight_decay)
# optimizer = LARS(optimizer)
scheduler = Scheduler(optimizer, phases=[(1e-3*scale,2e-1*scale,15),(2e-1*scale,1e-2*scale,15),(1e-2*scale,0,5)])


sz = 32
trn_loader, val_loader = torch_loader(args.data, sz, 256*scale, 256*scale)

print(args)
print('\n\n')
print("epoch\t\tnum_batch\ttime (min)\ttrn_loss\tval_loss\taccuracy")
start_time = datetime.now() # Loading start to after everything is loaded
for epoch in range(scheduler.tot_epochs):
    trn_top1, trn_loss = train(trn_loader, model, criterion, optimizer, scheduler, epoch)
    val_top1, val_loss = validate(val_loader, model, criterion, epoch, start_time)

    time_diff = datetime.now()-start_time
    minutes = float(time_diff.total_seconds() / 60.0)
    # epoch   time   trn_loss   val_loss   accuracy     
    metrics = [str(round(i, 4)) for i in [epoch, len(trn_loader), minutes, trn_loss, val_loss, val_top1]]
    print('\t\t'.join(metrics))
    if epoch == 2: break

Files already downloaded and verified
Files already downloaded and verified
Namespace(batch_size=256, data='/home/paperspace/data/cifar10', dist_backend='nccl', dist_url='env://', distributed=False, full_precision=True, local_rank=0, loss_scale=1, momentum=0.9, phases='[(0,2e-1,15),(2e-1,1e-2,15),(1e-2,0,5)]', print_freq=200, save_dir=PosixPath('/home/paperspace/cluster/pytorch-cifar'), scale_lr=1, verbose=False, weight_decay=0.0005, workers=8, world_size=-1)



epoch		num_batch	time (min)	trn_loss	val_loss	accuracy
0		33		0.221		1.9415		2.567		14.13
1		33		0.4392		1.39		1.4904		47.7
2		33		0.6574		1.1162		1.2648		57.25
3		33		0.8789		0.8961		1.1475		63.12
4		33		1.0967		0.7556		1.0378		65.95
5		33		1.3169		0.6413		0.9756		68.97


In [20]:
# Experiment: as the scale-6 LARS run, but with trust coefficient eta=0.04.
if args.distributed:
    print('Distributed: initializing process group')
    torch.cuda.set_device(args.local_rank)
    dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url, world_size=args.world_size)
    assert(args.world_size == dist.get_world_size())
    print("Distributed: success (%d/%d)"%(args.local_rank, args.world_size))

# TESTING: force full precision. This must happen before the model is built;
# otherwise a fresh kernel (full_precision=False) converts the model to half
# and then trains it through the fp32 code path.
args.full_precision = True
args.loss_scale = 1

model = ResNet18()
model = model.cuda()


if not args.full_precision: model = network_to_half(model)
if args.distributed: model = nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank], output_device=args.local_rank)


# AS: todo: don't copy over weights as it seems to help accuracy
# A list, not the one-shot generator from model.parameters(): both names stay
# usable after the optimizer has consumed the parameters.
if args.full_precision: model_params = master_params = list(model.parameters())
else: model_params, master_params = prep_param_lists(model)

# optim_params = bnwd_optim_params(model, model_params, master_params)
optim_params = master_params

# Multiplier applied to both the batch size (256) and the phase learning rates.
scale = 6
# define loss function (criterion) and optimizer
criterion = F.cross_entropy
optimizer = LARS(optim_params, lr=0, nesterov=True, eta=0.04, momentum=args.momentum, weight_decay=args.weight_decay)
# optimizer = LARS(optimizer)
scheduler = Scheduler(optimizer, phases=[(0,2e-1*scale,15),(2e-1*scale,1e-2*scale,15),(1e-2*scale,0,5)])


sz = 32
trn_loader, val_loader = torch_loader(args.data, sz, 256*scale, 256*scale)

print(args)
print('\n\n')
print("epoch\t\tnum_batch\ttime (min)\ttrn_loss\tval_loss\taccuracy")
start_time = datetime.now() # Loading start to after everything is loaded
for epoch in range(scheduler.tot_epochs):
    trn_top1, trn_loss = train(trn_loader, model, criterion, optimizer, scheduler, epoch)
    val_top1, val_loss = validate(val_loader, model, criterion, epoch, start_time)

    time_diff = datetime.now()-start_time
    minutes = float(time_diff.total_seconds() / 60.0)
    # epoch   time   trn_loss   val_loss   accuracy     
    metrics = [str(round(i, 4)) for i in [epoch, len(trn_loader), minutes, trn_loss, val_loss, val_top1]]
    print('\t\t'.join(metrics))
    if epoch == 2: break

Files already downloaded and verified
Files already downloaded and verified
Namespace(batch_size=256, data='/home/paperspace/data/cifar10', dist_backend='nccl', dist_url='env://', distributed=False, full_precision=True, local_rank=0, loss_scale=1, momentum=0.9, phases='[(0,2e-1,15),(2e-1,1e-2,15),(1e-2,0,5)]', print_freq=200, save_dir=PosixPath('/home/paperspace/cluster/pytorch-cifar'), scale_lr=1, verbose=False, weight_decay=0.0005, workers=8, world_size=-1)



epoch		num_batch	time (min)	trn_loss	val_loss	accuracy
0		33		0.2167		1.9092		2.2807		21.04
1		33		0.4329		1.3933		1.9223		36.69
2		33		0.6494		1.1547		2.047		43.63


In [21]:
# Experiment: ResNet18 trained with LARS (eta=0.01), warm-up starting at lr 0.
# Quick comparison run: stops after epoch 2.

# TESTING: force full precision without loss scaling. This must precede the
# model setup, otherwise `network_to_half` is decided on the stale flag value
# (the original order only worked when an earlier cell had already set it).
args.full_precision = True
args.loss_scale = 1

if args.distributed:
    print('Distributed: initializing process group')
    torch.cuda.set_device(args.local_rank)
    # NOTE(review): `dist` is presumably torch.distributed imported in an earlier cell - confirm.
    dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url, world_size=args.world_size)
    assert args.world_size == dist.get_world_size()
    print("Distributed: success (%d/%d)"%(args.local_rank, args.world_size))

model = ResNet18().cuda()

if not args.full_precision: model = network_to_half(model)
if args.distributed: model = nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank], output_device=args.local_rank)

# AS: todo: don't copy over weights as it seems to help accuracy
# Turn the parameter generator into a list: a generator shared by two names is
# exhausted as soon as the optimizer reads it, leaving `model_params` empty.
# (Module-level assignment already creates globals; `global` was a no-op.)
if args.full_precision: model_params = master_params = list(model.parameters())
else: model_params, master_params = prep_param_lists(model)

# optim_params = bnwd_optim_params(model, model_params, master_params)
optim_params = master_params

scale = 6  # multiplier applied to the phase learning rates and to the batch size
# define loss function (criterion) and optimizer
criterion = F.cross_entropy
# lr=0 is only the initial value - presumably the Scheduler sets the real LR; confirm.
optimizer = LARS(optim_params, lr=0, nesterov=True, eta=0.01, momentum=args.momentum, weight_decay=args.weight_decay)
# Phases presumably follow the --phases format: (start_lr, end_lr, num_epochs).
scheduler = Scheduler(optimizer, phases=[(0,2e-1*scale,15),(2e-1*scale,1e-2*scale,15),(1e-2*scale,0,5)])

sz = 32
# Last two arguments are presumably the train/val batch sizes - verify against torch_loader.
trn_loader, val_loader = torch_loader(args.data, sz, 256*scale, 256*scale)

print(args)
print('\n\n')
print("epoch\t\tnum_batch\ttime (min)\ttrn_loss\tval_loss\taccuracy")
start_time = datetime.now() # Loading start to after everything is loaded
for epoch in range(scheduler.tot_epochs):
    trn_top1, trn_loss = train(trn_loader, model, criterion, optimizer, scheduler, epoch)
    val_top1, val_loss = validate(val_loader, model, criterion, epoch, start_time)

    minutes = (datetime.now() - start_time).total_seconds() / 60.0
    # columns: epoch   num_batch   time   trn_loss   val_loss   accuracy
    metrics = [str(round(m, 4)) for m in [epoch, len(trn_loader), minutes, trn_loss, val_loss, val_top1]]
    print('\t\t'.join(metrics))
    if epoch == 2: break  # short run: only the first three epochs are compared

Files already downloaded and verified
Files already downloaded and verified
Namespace(batch_size=256, data='/home/paperspace/data/cifar10', dist_backend='nccl', dist_url='env://', distributed=False, full_precision=True, local_rank=0, loss_scale=1, momentum=0.9, phases='[(0,2e-1,15),(2e-1,1e-2,15),(1e-2,0,5)]', print_freq=200, save_dir=PosixPath('/home/paperspace/cluster/pytorch-cifar'), scale_lr=1, verbose=False, weight_decay=0.0005, workers=8, world_size=-1)



epoch		num_batch	time (min)	trn_loss	val_loss	accuracy
0		33		0.2167		2.0939		2.067		21.28
1		33		0.4337		1.5149		1.5595		43.83
2		33		0.6506		1.2226		1.1453		58.05


In [22]:
# Experiment: ResNet18 trained with LARS (eta=0.02) and a non-zero warm-up start.
# Quick comparison run: stops after epoch 2.

# TESTING: force full precision without loss scaling. Set before the model is
# built so the fp16 conversion check uses the overridden flag; in the original
# order the check ran first and could halve a model that is then trained as fp32.
args.full_precision = True
args.loss_scale = 1

if args.distributed:
    print('Distributed: initializing process group')
    torch.cuda.set_device(args.local_rank)
    # NOTE(review): `dist` is presumably torch.distributed imported in an earlier cell - confirm.
    dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url, world_size=args.world_size)
    assert args.world_size == dist.get_world_size()
    print("Distributed: success (%d/%d)"%(args.local_rank, args.world_size))

model = ResNet18().cuda()

if not args.full_precision: model = network_to_half(model)
if args.distributed: model = nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank], output_device=args.local_rank)

# AS: todo: don't copy over weights as it seems to help accuracy
# Keep the parameters in a list; the bare generator from `model.parameters()`
# can be consumed only once and was shared by both names.
# (The former module-level `global` statement had no effect and is dropped.)
if args.full_precision: model_params = master_params = list(model.parameters())
else: model_params, master_params = prep_param_lists(model)

# optim_params = bnwd_optim_params(model, model_params, master_params)
optim_params = master_params

scale = 6  # multiplier applied to the phase learning rates and to the batch size
# define loss function (criterion) and optimizer
criterion = F.cross_entropy
# lr=0 is only the initial value - presumably the Scheduler sets the real LR; confirm.
optimizer = LARS(optim_params, lr=0, nesterov=True, eta=0.02, momentum=args.momentum, weight_decay=args.weight_decay)
# Phases presumably follow the --phases format: (start_lr, end_lr, num_epochs).
# NOTE(review): the first start_lr (1e-3) is not multiplied by `scale`, unlike
# every other learning rate here - confirm this is intentional.
scheduler = Scheduler(optimizer, phases=[(1e-3,2e-1*scale,15),(2e-1*scale,1e-2*scale,15),(1e-2*scale,0,5)])

sz = 32
# Last two arguments are presumably the train/val batch sizes - verify against torch_loader.
trn_loader, val_loader = torch_loader(args.data, sz, 256*scale, 256*scale)

print(args)
print('\n\n')
print("epoch\t\tnum_batch\ttime (min)\ttrn_loss\tval_loss\taccuracy")
start_time = datetime.now() # Loading start to after everything is loaded
for epoch in range(scheduler.tot_epochs):
    trn_top1, trn_loss = train(trn_loader, model, criterion, optimizer, scheduler, epoch)
    val_top1, val_loss = validate(val_loader, model, criterion, epoch, start_time)

    minutes = (datetime.now() - start_time).total_seconds() / 60.0
    # columns: epoch   num_batch   time   trn_loss   val_loss   accuracy
    metrics = [str(round(m, 4)) for m in [epoch, len(trn_loader), minutes, trn_loss, val_loss, val_top1]]
    print('\t\t'.join(metrics))
    if epoch == 2: break  # short run: only the first three epochs are compared

Files already downloaded and verified
Files already downloaded and verified
Namespace(batch_size=256, data='/home/paperspace/data/cifar10', dist_backend='nccl', dist_url='env://', distributed=False, full_precision=True, local_rank=0, loss_scale=1, momentum=0.9, phases='[(0,2e-1,15),(2e-1,1e-2,15),(1e-2,0,5)]', print_freq=200, save_dir=PosixPath('/home/paperspace/cluster/pytorch-cifar'), scale_lr=1, verbose=False, weight_decay=0.0005, workers=8, world_size=-1)



epoch		num_batch	time (min)	trn_loss	val_loss	accuracy
0		33		0.216		1.977		2.6922		14.87
1		33		0.4303		1.4356		1.241		54.06
2		33		0.6451		1.1403		1.3122		54.57


In [23]:
# Experiment: ResNet18 trained with LARS (eta=0.02), warm-up starting at 2e-3*scale.
# Quick comparison run: stops after epoch 2.

# TESTING: force full precision without loss scaling. Done up front: the
# `network_to_half` check below reads this flag, and in the original order it
# read the value left over from argument parsing / earlier cells instead.
args.full_precision = True
args.loss_scale = 1

if args.distributed:
    print('Distributed: initializing process group')
    torch.cuda.set_device(args.local_rank)
    # NOTE(review): `dist` is presumably torch.distributed imported in an earlier cell - confirm.
    dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url, world_size=args.world_size)
    assert args.world_size == dist.get_world_size()
    print("Distributed: success (%d/%d)"%(args.local_rank, args.world_size))

model = ResNet18().cuda()

if not args.full_precision: model = network_to_half(model)
if args.distributed: model = nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank], output_device=args.local_rank)

# AS: todo: don't copy over weights as it seems to help accuracy
# list(...) so that `model_params`/`master_params` remain usable after the
# optimizer has iterated them; a shared generator would be left exhausted.
# (No `global` needed: assignments at cell level are already module globals.)
if args.full_precision: model_params = master_params = list(model.parameters())
else: model_params, master_params = prep_param_lists(model)

# optim_params = bnwd_optim_params(model, model_params, master_params)
optim_params = master_params

scale = 6  # multiplier applied to the phase learning rates and to the batch size
# define loss function (criterion) and optimizer
criterion = F.cross_entropy
# lr=0 is only the initial value - presumably the Scheduler sets the real LR; confirm.
optimizer = LARS(optim_params, lr=0, nesterov=True, eta=0.02, momentum=args.momentum, weight_decay=args.weight_decay)
# Phases presumably follow the --phases format: (start_lr, end_lr, num_epochs).
scheduler = Scheduler(optimizer, phases=[(2e-3*scale,2e-1*scale,15),(2e-1*scale,1e-2*scale,15),(1e-2*scale,0,5)])

sz = 32
# Last two arguments are presumably the train/val batch sizes - verify against torch_loader.
trn_loader, val_loader = torch_loader(args.data, sz, 256*scale, 256*scale)

print(args)
print('\n\n')
print("epoch\t\tnum_batch\ttime (min)\ttrn_loss\tval_loss\taccuracy")
start_time = datetime.now() # Loading start to after everything is loaded
for epoch in range(scheduler.tot_epochs):
    trn_top1, trn_loss = train(trn_loader, model, criterion, optimizer, scheduler, epoch)
    val_top1, val_loss = validate(val_loader, model, criterion, epoch, start_time)

    minutes = (datetime.now() - start_time).total_seconds() / 60.0
    # columns: epoch   num_batch   time   trn_loss   val_loss   accuracy
    metrics = [str(round(m, 4)) for m in [epoch, len(trn_loader), minutes, trn_loss, val_loss, val_top1]]
    print('\t\t'.join(metrics))
    if epoch == 2: break  # short run: only the first three epochs are compared

Files already downloaded and verified
Files already downloaded and verified
Namespace(batch_size=256, data='/home/paperspace/data/cifar10', dist_backend='nccl', dist_url='env://', distributed=False, full_precision=True, local_rank=0, loss_scale=1, momentum=0.9, phases='[(0,2e-1,15),(2e-1,1e-2,15),(1e-2,0,5)]', print_freq=200, save_dir=PosixPath('/home/paperspace/cluster/pytorch-cifar'), scale_lr=1, verbose=False, weight_decay=0.0005, workers=8, world_size=-1)



epoch		num_batch	time (min)	trn_loss	val_loss	accuracy
0		33		0.2152		1.9114		2.3392		19.86
1		33		0.429		1.3967		1.4133		49.85
2		33		0.6466		1.0961		1.3185		56.06


In [24]:
# Experiment: ResNet18 trained with LARS (eta=0.01), warm-up starting at 1e-3*scale.
# Quick comparison run: stops after epoch 2.

# TESTING: force full precision without loss scaling. Placed ahead of model
# construction so the half-precision conversion below honours the override
# (the original applied it afterwards, relying on state from earlier runs).
args.full_precision = True
args.loss_scale = 1

if args.distributed:
    print('Distributed: initializing process group')
    torch.cuda.set_device(args.local_rank)
    # NOTE(review): `dist` is presumably torch.distributed imported in an earlier cell - confirm.
    dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url, world_size=args.world_size)
    assert args.world_size == dist.get_world_size()
    print("Distributed: success (%d/%d)"%(args.local_rank, args.world_size))

model = ResNet18().cuda()

if not args.full_precision: model = network_to_half(model)
if args.distributed: model = nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank], output_device=args.local_rank)

# AS: todo: don't copy over weights as it seems to help accuracy
# Parameters are collected into a list because `model.parameters()` yields a
# single-use generator; sharing it would leave both names empty after the
# optimizer is built. (`global` at cell level was a no-op and is removed.)
if args.full_precision: model_params = master_params = list(model.parameters())
else: model_params, master_params = prep_param_lists(model)

# optim_params = bnwd_optim_params(model, model_params, master_params)
optim_params = master_params

scale = 6  # multiplier applied to the phase learning rates and to the batch size
# define loss function (criterion) and optimizer
criterion = F.cross_entropy
# lr=0 is only the initial value - presumably the Scheduler sets the real LR; confirm.
optimizer = LARS(optim_params, lr=0, nesterov=True, eta=0.01, momentum=args.momentum, weight_decay=args.weight_decay)
# Phases presumably follow the --phases format: (start_lr, end_lr, num_epochs).
scheduler = Scheduler(optimizer, phases=[(1e-3*scale,2e-1*scale,15),(2e-1*scale,1e-2*scale,15),(1e-2*scale,0,5)])

sz = 32
# Last two arguments are presumably the train/val batch sizes - verify against torch_loader.
trn_loader, val_loader = torch_loader(args.data, sz, 256*scale, 256*scale)

print(args)
print('\n\n')
print("epoch\t\tnum_batch\ttime (min)\ttrn_loss\tval_loss\taccuracy")
start_time = datetime.now() # Loading start to after everything is loaded
for epoch in range(scheduler.tot_epochs):
    trn_top1, trn_loss = train(trn_loader, model, criterion, optimizer, scheduler, epoch)
    val_top1, val_loss = validate(val_loader, model, criterion, epoch, start_time)

    minutes = (datetime.now() - start_time).total_seconds() / 60.0
    # columns: epoch   num_batch   time   trn_loss   val_loss   accuracy
    metrics = [str(round(m, 4)) for m in [epoch, len(trn_loader), minutes, trn_loss, val_loss, val_top1]]
    print('\t\t'.join(metrics))
    if epoch == 2: break  # short run: only the first three epochs are compared

Files already downloaded and verified
Files already downloaded and verified
Namespace(batch_size=256, data='/home/paperspace/data/cifar10', dist_backend='nccl', dist_url='env://', distributed=False, full_precision=True, local_rank=0, loss_scale=1, momentum=0.9, phases='[(0,2e-1,15),(2e-1,1e-2,15),(1e-2,0,5)]', print_freq=200, save_dir=PosixPath('/home/paperspace/cluster/pytorch-cifar'), scale_lr=1, verbose=False, weight_decay=0.0005, workers=8, world_size=-1)



epoch		num_batch	time (min)	trn_loss	val_loss	accuracy
0		33		0.2143		2.2842		2.2905		17.14
1		33		0.4322		2.2141		2.1701		25.19
2		33		0.6479		1.7023		1.6029		42.39


In [25]:
# Experiment: plain SGD (Nesterov) baseline on ResNet18 with the same scaled
# schedule and batch size as the LARS runs. Stops after epoch 5.

# TESTING: force full precision without loss scaling. Applied first so the
# fp16 conversion below is gated on the overridden value rather than on
# whatever an earlier cell left in `args`.
args.full_precision = True
args.loss_scale = 1

if args.distributed:
    print('Distributed: initializing process group')
    torch.cuda.set_device(args.local_rank)
    # NOTE(review): `dist` is presumably torch.distributed imported in an earlier cell - confirm.
    dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url, world_size=args.world_size)
    assert args.world_size == dist.get_world_size()
    print("Distributed: success (%d/%d)"%(args.local_rank, args.world_size))

model = ResNet18().cuda()

if not args.full_precision: model = network_to_half(model)
if args.distributed: model = nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank], output_device=args.local_rank)

# AS: todo: don't copy over weights as it seems to help accuracy
# `model.parameters()` is a one-shot generator: materialise it so both names
# still hold the parameters after the optimizer has iterated over them.
# (These are module-level names already, so no `global` statement is needed.)
if args.full_precision: model_params = master_params = list(model.parameters())
else: model_params, master_params = prep_param_lists(model)

# optim_params = bnwd_optim_params(model, model_params, master_params)
optim_params = master_params

# `scale` was previously not defined in this cell and only resolved because an
# earlier cell had left it in the kernel namespace; define it here so the cell
# runs on its own. 6 matches the value used by the neighbouring cells and the
# 33 batches per epoch in the recorded output.
scale = 6
# define loss function (criterion) and optimizer
criterion = F.cross_entropy
# lr=0 is only the initial value - presumably the Scheduler sets the real LR; confirm.
optimizer = torch.optim.SGD(optim_params, lr=0, nesterov=True, momentum=args.momentum, weight_decay=args.weight_decay)
# optimizer = LARS(optim_params, lr=0, nesterov=True, eta=0.01, momentum=args.momentum, weight_decay=args.weight_decay)
# Phases presumably follow the --phases format: (start_lr, end_lr, num_epochs).
scheduler = Scheduler(optimizer, phases=[(1e-3*scale,2e-1*scale,15),(2e-1*scale,1e-2*scale,15),(1e-2*scale,0,5)])

sz = 32
# Last two arguments are presumably the train/val batch sizes - verify against torch_loader.
trn_loader, val_loader = torch_loader(args.data, sz, 256*scale, 256*scale)

print(args)
print('\n\n')
print("epoch\t\tnum_batch\ttime (min)\ttrn_loss\tval_loss\taccuracy")
start_time = datetime.now() # Loading start to after everything is loaded
for epoch in range(scheduler.tot_epochs):
    trn_top1, trn_loss = train(trn_loader, model, criterion, optimizer, scheduler, epoch)
    val_top1, val_loss = validate(val_loader, model, criterion, epoch, start_time)

    minutes = (datetime.now() - start_time).total_seconds() / 60.0
    # columns: epoch   num_batch   time   trn_loss   val_loss   accuracy
    metrics = [str(round(m, 4)) for m in [epoch, len(trn_loader), minutes, trn_loss, val_loss, val_top1]]
    print('\t\t'.join(metrics))
    if epoch == 5: break  # short run: only the first six epochs are compared

Files already downloaded and verified
Files already downloaded and verified
Namespace(batch_size=256, data='/home/paperspace/data/cifar10', dist_backend='nccl', dist_url='env://', distributed=False, full_precision=True, local_rank=0, loss_scale=1, momentum=0.9, phases='[(0,2e-1,15),(2e-1,1e-2,15),(1e-2,0,5)]', print_freq=200, save_dir=PosixPath('/home/paperspace/cluster/pytorch-cifar'), scale_lr=1, verbose=False, weight_decay=0.0005, workers=8, world_size=-1)



epoch		num_batch	time (min)	trn_loss	val_loss	accuracy
0		33		0.2057		1.9578		1.6595		39.44
1		33		0.4138		1.7949		2.4499		26.0
2		33		0.6189		1.3921		1.7435		43.67
3		33		0.8255		1.2818		1.1975		59.5
4		33		1.0315		0.978		1.5696		52.38
5		33		1.2392		0.8514		1.3324		57.05


In [26]:
# Experiment: ResNet18 trained with LARS (eta=0.02), warm-up starting at 1e-4*scale.
# Stops after epoch 5.

# TESTING: force full precision without loss scaling. This must precede the
# model setup, otherwise `network_to_half` is decided on the stale flag value
# (the original order only worked when an earlier cell had already set it).
args.full_precision = True
args.loss_scale = 1

if args.distributed:
    print('Distributed: initializing process group')
    torch.cuda.set_device(args.local_rank)
    # NOTE(review): `dist` is presumably torch.distributed imported in an earlier cell - confirm.
    dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url, world_size=args.world_size)
    assert args.world_size == dist.get_world_size()
    print("Distributed: success (%d/%d)"%(args.local_rank, args.world_size))

model = ResNet18().cuda()

if not args.full_precision: model = network_to_half(model)
if args.distributed: model = nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank], output_device=args.local_rank)

# AS: todo: don't copy over weights as it seems to help accuracy
# Turn the parameter generator into a list: a generator shared by two names is
# exhausted as soon as the optimizer reads it, leaving `model_params` empty.
# (Module-level assignment already creates globals; `global` was a no-op.)
if args.full_precision: model_params = master_params = list(model.parameters())
else: model_params, master_params = prep_param_lists(model)

# optim_params = bnwd_optim_params(model, model_params, master_params)
optim_params = master_params

scale = 6  # multiplier applied to the phase learning rates and to the batch size
# define loss function (criterion) and optimizer
criterion = F.cross_entropy
# lr=0 is only the initial value - presumably the Scheduler sets the real LR; confirm.
optimizer = LARS(optim_params, lr=0, nesterov=True, eta=0.02, momentum=args.momentum, weight_decay=args.weight_decay)
# Phases presumably follow the --phases format: (start_lr, end_lr, num_epochs).
scheduler = Scheduler(optimizer, phases=[(1e-4*scale,2e-1*scale,15),(2e-1*scale,1e-2*scale,15),(1e-2*scale,0,5)])

sz = 32
# Last two arguments are presumably the train/val batch sizes - verify against torch_loader.
trn_loader, val_loader = torch_loader(args.data, sz, 256*scale, 256*scale)

print(args)
print('\n\n')
print("epoch\t\tnum_batch\ttime (min)\ttrn_loss\tval_loss\taccuracy")
start_time = datetime.now() # Loading start to after everything is loaded
for epoch in range(scheduler.tot_epochs):
    trn_top1, trn_loss = train(trn_loader, model, criterion, optimizer, scheduler, epoch)
    val_top1, val_loss = validate(val_loader, model, criterion, epoch, start_time)

    minutes = (datetime.now() - start_time).total_seconds() / 60.0
    # columns: epoch   num_batch   time   trn_loss   val_loss   accuracy
    metrics = [str(round(m, 4)) for m in [epoch, len(trn_loader), minutes, trn_loss, val_loss, val_top1]]
    print('\t\t'.join(metrics))
    if epoch == 5: break  # short run: only the first six epochs are compared

Files already downloaded and verified
Files already downloaded and verified
Namespace(batch_size=256, data='/home/paperspace/data/cifar10', dist_backend='nccl', dist_url='env://', distributed=False, full_precision=True, local_rank=0, loss_scale=1, momentum=0.9, phases='[(0,2e-1,15),(2e-1,1e-2,15),(1e-2,0,5)]', print_freq=200, save_dir=PosixPath('/home/paperspace/cluster/pytorch-cifar'), scale_lr=1, verbose=False, weight_decay=0.0005, workers=8, world_size=-1)



epoch		num_batch	time (min)	trn_loss	val_loss	accuracy
0		33		0.2156		1.967		2.9576		12.72
1		33		0.4298		1.4237		1.5918		43.63
2		33		0.6438		1.1468		2.1295		45.34
3		33		0.8596		0.9262		1.21		62.76
4		33		1.0772		0.7589		0.9506		68.88
5		33		1.293		0.6483		1.0823		65.35


In [28]:
# Experiment: ResNet18 trained with LARS (eta=0.02), warm-up starting at
# 1e-4*scale, run for the scheduler's full epoch count (no early break).

# TESTING: force full precision without loss scaling. Set before the model is
# built so the fp16 conversion check uses the overridden flag; in the original
# order the check ran first and could halve a model that is then trained as fp32.
args.full_precision = True
args.loss_scale = 1

if args.distributed:
    print('Distributed: initializing process group')
    torch.cuda.set_device(args.local_rank)
    # NOTE(review): `dist` is presumably torch.distributed imported in an earlier cell - confirm.
    dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url, world_size=args.world_size)
    assert args.world_size == dist.get_world_size()
    print("Distributed: success (%d/%d)"%(args.local_rank, args.world_size))

model = ResNet18().cuda()

if not args.full_precision: model = network_to_half(model)
if args.distributed: model = nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank], output_device=args.local_rank)

# AS: todo: don't copy over weights as it seems to help accuracy
# Keep the parameters in a list; the bare generator from `model.parameters()`
# can be consumed only once and was shared by both names.
# (The former module-level `global` statement had no effect and is dropped.)
if args.full_precision: model_params = master_params = list(model.parameters())
else: model_params, master_params = prep_param_lists(model)

# optim_params = bnwd_optim_params(model, model_params, master_params)
optim_params = master_params

scale = 6  # multiplier applied to the phase learning rates and to the batch size
# define loss function (criterion) and optimizer
criterion = F.cross_entropy
# lr=0 is only the initial value - presumably the Scheduler sets the real LR; confirm.
optimizer = LARS(optim_params, lr=0, nesterov=True, eta=0.02, momentum=args.momentum, weight_decay=args.weight_decay)
# Phases presumably follow the --phases format: (start_lr, end_lr, num_epochs).
scheduler = Scheduler(optimizer, phases=[(1e-4*scale,2e-1*scale,15),(2e-1*scale,1e-2*scale,15),(1e-2*scale,0,5)])

sz = 32
# Last two arguments are presumably the train/val batch sizes - verify against torch_loader.
trn_loader, val_loader = torch_loader(args.data, sz, 256*scale, 256*scale)

print(args)
print('\n\n')
print("epoch\t\tnum_batch\ttime (min)\ttrn_loss\tval_loss\taccuracy")
start_time = datetime.now() # Loading start to after everything is loaded
for epoch in range(scheduler.tot_epochs):
    trn_top1, trn_loss = train(trn_loader, model, criterion, optimizer, scheduler, epoch)
    val_top1, val_loss = validate(val_loader, model, criterion, epoch, start_time)

    minutes = (datetime.now() - start_time).total_seconds() / 60.0
    # columns: epoch   num_batch   time   trn_loss   val_loss   accuracy
    metrics = [str(round(m, 4)) for m in [epoch, len(trn_loader), minutes, trn_loss, val_loss, val_top1]]
    print('\t\t'.join(metrics))

Files already downloaded and verified
Files already downloaded and verified
Namespace(batch_size=256, data='/home/paperspace/data/cifar10', dist_backend='nccl', dist_url='env://', distributed=False, full_precision=True, local_rank=0, loss_scale=1, momentum=0.9, phases='[(0,2e-1,15),(2e-1,1e-2,15),(1e-2,0,5)]', print_freq=200, save_dir=PosixPath('/home/paperspace/cluster/pytorch-cifar'), scale_lr=1, verbose=False, weight_decay=0.0005, workers=8, world_size=-1)



epoch		num_batch	time (min)	trn_loss	val_loss	accuracy
0		33		0.2156		1.9826		2.3933		16.74
1		33		0.4323		1.4413		1.5176		47.87
2		33		0.6492		1.1644		1.5214		50.57
3		33		0.8688		0.9453		1.3104		56.48
4		33		1.0902		0.7748		0.9202		68.08
5		33		1.3072		0.668		0.7514		75.12
6		33		1.5251		0.5829		0.9944		68.46
7		33		1.7431		0.5426		1.2325		63.33
8		33		1.9636		0.5058		0.8011		72.93
9		33		2.1818		0.4776		1.1752		64.68
10		33		2.3996		0.4607		1.2616		60.52
11		33		2.6192		0.4648		0.9381		70.35
12		33		2.8375		0.4556		1.1569		6

In [None]:
# Experiment: ResNet18 trained with LARS (eta=0.02) for the full schedule, with
# the optimizer's `lars` flag switched off once epoch 12 has finished.

# TESTING: force full precision without loss scaling. Done up front: the
# `network_to_half` check below reads this flag, and in the original order it
# read the value left over from argument parsing / earlier cells instead.
args.full_precision = True
args.loss_scale = 1

if args.distributed:
    print('Distributed: initializing process group')
    torch.cuda.set_device(args.local_rank)
    # NOTE(review): `dist` is presumably torch.distributed imported in an earlier cell - confirm.
    dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url, world_size=args.world_size)
    assert args.world_size == dist.get_world_size()
    print("Distributed: success (%d/%d)"%(args.local_rank, args.world_size))

model = ResNet18().cuda()

if not args.full_precision: model = network_to_half(model)
if args.distributed: model = nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank], output_device=args.local_rank)

# AS: todo: don't copy over weights as it seems to help accuracy
# list(...) so that `model_params`/`master_params` remain usable after the
# optimizer has iterated them; a shared generator would be left exhausted.
# (No `global` needed: assignments at cell level are already module globals.)
if args.full_precision: model_params = master_params = list(model.parameters())
else: model_params, master_params = prep_param_lists(model)

# optim_params = bnwd_optim_params(model, model_params, master_params)
optim_params = master_params

scale = 6  # multiplier applied to the phase learning rates and to the batch size
# define loss function (criterion) and optimizer
criterion = F.cross_entropy
# lr=0 is only the initial value - presumably the Scheduler sets the real LR; confirm.
optimizer = LARS(optim_params, lr=0, nesterov=True, eta=0.02, momentum=args.momentum, weight_decay=args.weight_decay)
# Phases presumably follow the --phases format: (start_lr, end_lr, num_epochs).
scheduler = Scheduler(optimizer, phases=[(1e-4*scale,2e-1*scale,15),(2e-1*scale,1e-2*scale,15),(1e-2*scale,0,5)])

sz = 32
# Last two arguments are presumably the train/val batch sizes - verify against torch_loader.
trn_loader, val_loader = torch_loader(args.data, sz, 256*scale, 256*scale)

print(args)
print('\n\n')
print("epoch\t\tnum_batch\ttime (min)\ttrn_loss\tval_loss\taccuracy")
start_time = datetime.now() # Loading start to after everything is loaded
for epoch in range(scheduler.tot_epochs):
    trn_top1, trn_loss = train(trn_loader, model, criterion, optimizer, scheduler, epoch)
    val_top1, val_loss = validate(val_loader, model, criterion, epoch, start_time)

    minutes = (datetime.now() - start_time).total_seconds() / 60.0
    # columns: epoch   num_batch   time   trn_loss   val_loss   accuracy
    metrics = [str(round(m, 4)) for m in [epoch, len(trn_loader), minutes, trn_loss, val_loss, val_top1]]
    print('\t\t'.join(metrics))
    # From epoch 13 onwards the optimizer steps with its `lars` flag cleared.
    if epoch == 12: optimizer.lars = False

Files already downloaded and verified
Files already downloaded and verified
Namespace(batch_size=256, data='/home/paperspace/data/cifar10', dist_backend='nccl', dist_url='env://', distributed=False, full_precision=True, local_rank=0, loss_scale=1, momentum=0.9, phases='[(0,2e-1,15),(2e-1,1e-2,15),(1e-2,0,5)]', print_freq=200, save_dir=PosixPath('/home/paperspace/cluster/pytorch-cifar'), scale_lr=1, verbose=False, weight_decay=0.0005, workers=8, world_size=-1)



epoch		num_batch	time (min)	trn_loss	val_loss	accuracy
0		33		0.2134		1.9997		2.6457		18.03
1		33		0.4312		1.4163		1.2717		53.9
2		33		0.6472		1.1371		1.0698		62.29
3		33		0.8658		0.9396		1.3649		55.36
4		33		1.083		0.776		1.1178		64.91
5		33		1.3069		0.6605		0.9602		69.6
6		33		1.5287		0.5877		0.828		73.26
7		33		1.745		0.5419		0.6846		76.94
8		33		1.9657		0.5072		0.7818		74.05
9		33		2.1844		0.4825		1.5001		59.04
10		33		2.4011		0.4661		0.778		72.41
11		33		2.6209		0.4618		2.486		50.26
12		33		2.8391		0.476		0.9905		68.6
13		

Process Process-2266:
Process Process-2272:
Process Process-2268:
Process Process-2269:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Process Process-2270:
  File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
Process Process-2271:
  File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
Traceback (most recent call last):
Traceback (most recent call last):
Process Process-2267:
Process Process-2265:
Traceback (most recent call last):
  File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/site-packages/torch/utils/data/datalo

  File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/multiprocessing/connection.py", line 379, in _recv
    chunk = read(handle, remaining)
  File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/multiprocessing/connection.py", line 407, in _recv_bytes
    buf = self._recv(4)
  File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/multiprocessing/connection.py", line 379, in _recv
    chunk = read(handle, remaining)
KeyboardInterrupt
KeyboardInterrupt
KeyboardInterrupt
ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2963, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-29-467548b654a8>", line 44, in <module>
    trn_top1, trn_loss = train(trn_loader, model, criterion, optimizer, scheduler, epoch)
  File "<ipython-input-8-7d0ced679dc0>", line 52, in train
    optimizer.step()
  File "<ipython-input-13-8a8bea0f6e1b>", line 49, in step
    if self.lars: local_lr = eta * torch.norm(p.data) / torch.norm(d_p)
KeyboardInterrupt

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2903, in run_ast_nodes
    if self.run_code(code, result):
  File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 29

In [29]:
# Experiment: plain SGD (Nesterov) baseline on ResNet18 with the scaled
# schedule and batch size, run for the scheduler's full epoch count.

# TESTING: force full precision without loss scaling. Placed ahead of model
# construction so the half-precision conversion below honours the override
# (the original applied it afterwards, relying on state from earlier runs).
args.full_precision = True
args.loss_scale = 1

if args.distributed:
    print('Distributed: initializing process group')
    torch.cuda.set_device(args.local_rank)
    # NOTE(review): `dist` is presumably torch.distributed imported in an earlier cell - confirm.
    dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url, world_size=args.world_size)
    assert args.world_size == dist.get_world_size()
    print("Distributed: success (%d/%d)"%(args.local_rank, args.world_size))

model = ResNet18().cuda()

if not args.full_precision: model = network_to_half(model)
if args.distributed: model = nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank], output_device=args.local_rank)

# AS: todo: don't copy over weights as it seems to help accuracy
# Parameters are collected into a list because `model.parameters()` yields a
# single-use generator; sharing it would leave both names empty after the
# optimizer is built. (`global` at cell level was a no-op and is removed.)
if args.full_precision: model_params = master_params = list(model.parameters())
else: model_params, master_params = prep_param_lists(model)

# optim_params = bnwd_optim_params(model, model_params, master_params)
optim_params = master_params

# `scale` was used here without being assigned in this cell, so it depended on
# a value leaked from a previously executed cell. Define it explicitly; 6 is
# the value every sibling cell uses.
scale = 6
# define loss function (criterion) and optimizer
criterion = F.cross_entropy
# lr=0 is only the initial value - presumably the Scheduler sets the real LR; confirm.
optimizer = torch.optim.SGD(optim_params, lr=0, nesterov=True, momentum=args.momentum, weight_decay=args.weight_decay)
# optimizer = LARS(optim_params, lr=0, nesterov=True, eta=0.01, momentum=args.momentum, weight_decay=args.weight_decay)
# Phases presumably follow the --phases format: (start_lr, end_lr, num_epochs).
scheduler = Scheduler(optimizer, phases=[(1e-3*scale,2e-1*scale,15),(2e-1*scale,1e-2*scale,15),(1e-2*scale,0,5)])

sz = 32
# Last two arguments are presumably the train/val batch sizes - verify against torch_loader.
trn_loader, val_loader = torch_loader(args.data, sz, 256*scale, 256*scale)

print(args)
print('\n\n')
print("epoch\t\tnum_batch\ttime (min)\ttrn_loss\tval_loss\taccuracy")
start_time = datetime.now() # Loading start to after everything is loaded
for epoch in range(scheduler.tot_epochs):
    trn_top1, trn_loss = train(trn_loader, model, criterion, optimizer, scheduler, epoch)
    val_top1, val_loss = validate(val_loader, model, criterion, epoch, start_time)

    minutes = (datetime.now() - start_time).total_seconds() / 60.0
    # columns: epoch   num_batch   time   trn_loss   val_loss   accuracy
    metrics = [str(round(m, 4)) for m in [epoch, len(trn_loader), minutes, trn_loss, val_loss, val_top1]]
    print('\t\t'.join(metrics))

ERROR! Session/line number was not unique in database. History logging moved to new session 746
Files already downloaded and verified
Files already downloaded and verified
Namespace(batch_size=256, data='/home/paperspace/data/cifar10', dist_backend='nccl', dist_url='env://', distributed=False, full_precision=True, local_rank=0, loss_scale=1, momentum=0.9, phases='[(0,2e-1,15),(2e-1,1e-2,15),(1e-2,0,5)]', print_freq=200, save_dir=PosixPath('/home/paperspace/cluster/pytorch-cifar'), scale_lr=1, verbose=False, weight_decay=0.0005, workers=8, world_size=-1)



epoch		num_batch	time (min)	trn_loss	val_loss	accuracy


Process Process-2279:
Process Process-2277:
Process Process-2275:
Process Process-2276:
Process Process-2274:
Process Process-2278:
Process Process-2280:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Process Process-2273:
Traceback (most recent call last):
  File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
Traceback (most recent call last):
  File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/multiprocessing/process.py", line 93,

KeyboardInterrupt
KeyboardInterrupt
  File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/multiprocessing/connection.py", line 379, in _recv
    chunk = read(handle, remaining)
KeyboardInterrupt
  File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/multiprocessing/connection.py", line 407, in _recv_bytes
    buf = self._recv(4)
KeyboardInterrupt
  File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/multiprocessing/connection.py", line 379, in _recv
    chunk = read(handle, remaining)
KeyboardInterrupt


RuntimeError: DataLoader worker (pid 22995) exited unexpectedly with exit code 1.