In [1]:
import argparse, os, shutil, time, warnings
from datetime import datetime
from pathlib import Path
import numpy as np
import sys
import math

import torch
from torch.autograd import Variable
from torch.nn.parameter import Parameter
import torch.nn as nn
import torch.nn.parallel
import torch.backends.cudnn as cudnn
import torch.distributed as dist
import torch.optim
import torch.utils.data
import torch.utils.data.distributed

# import models
from fp16util import *
import gc

from fastai.models import resnet
# import resnet_sd as resnet

from dataloader import *

In [2]:
def get_parser():
    """Build the command-line parser for the ImageNet training run.

    Returns:
        argparse.ArgumentParser: parser with one positional argument (``data``)
        and the optional training, fp16 and distributed flags defined below.
    """
    parser = argparse.ArgumentParser(description='PyTorch ImageNet Training')
    parser.add_argument('data', metavar='DIR', help='path to dataset')
    parser.add_argument('--arch', '-a', metavar='ARCH', default='resnet50')
    parser.add_argument('--save-dir', type=str, default=Path.cwd(), help='Directory to save logs and models.')
    # Help texts below quote the actual defaults (they previously said 4 and 256).
    parser.add_argument('-j', '--workers', default=8, type=int, metavar='N',
                        help='number of data loading workers (default: 8)')
    parser.add_argument('--epochs', default=45, type=int, metavar='N',
                        help='number of total epochs to run')
    parser.add_argument('--start-epoch', default=0, type=int, metavar='N',
                        help='manual epoch number (useful on restarts)')
    parser.add_argument('-b', '--batch-size', default=192, type=int,
                        metavar='N', help='mini-batch size (default: 192)')
    parser.add_argument('--lr', '--learning-rate', default=0.1, type=float,
                        metavar='LR', help='initial learning rate')
    parser.add_argument('--momentum', default=0.9, type=float, metavar='M', help='momentum')
    parser.add_argument('--weight-decay', '--wd', default=1e-4, type=float,
                        metavar='W', help='weight decay (default: 1e-4)')
    # Both schedules are comma-separated fractions of the total epoch count.
    parser.add_argument('--resize-sched', default='0.4,0.92', type=str,
                        help='Scheduler to resize from 128 -> 224 -> 288')
    parser.add_argument('--lr-sched', default='0.1,0.47,0.78,0.95', type=str,
                        help='Learning rate scheduler warmup -> lr -> lr/10 -> lr/100 -> lr/1000')
    parser.add_argument('--init-bn0', action='store_true', help='Initialize running batch norm mean to 0')
    parser.add_argument('--print-freq', '-p', default=10, type=int,
                        metavar='N', help='print frequency (default: 10)')
    parser.add_argument('--resume', default='', type=str, metavar='PATH',
                        help='path to latest checkpoint (default: none)')
    parser.add_argument('-e', '--evaluate', dest='evaluate', action='store_true',
                        help='evaluate model on validation set')
    parser.add_argument('--pretrained', dest='pretrained', action='store_true', help='use pre-trained model')
    parser.add_argument('--fp16', action='store_true', help='Run model fp16 mode.')
    parser.add_argument('--loss-scale', type=float, default=1,
                        help='Loss scaling, positive power of 2 values can improve fp16 convergence.')
    parser.add_argument('--prof', dest='prof', action='store_true', help='Only run a few iters for profiling.')
    parser.add_argument('--val-ar', action='store_true', help='Do final validation by nearest aspect ratio')
    parser.add_argument('--distributed', action='store_true', help='Run distributed training')
    parser.add_argument('--world-size', default=-1, type=int, 
                        help='Number of gpus per machine. Param only needed for single machine training when using (faster) file sync')
    parser.add_argument('--dist-url', default='env://', type=str,
                        help='url used to set up distributed training')
    parser.add_argument('--dist-backend', default='nccl', type=str, help='distributed backend')
    parser.add_argument('--local_rank', default=0, type=int,
                        help='Used for multi-process training. Can either be manually set ' +
                        'or automatically set by using \'python -m multiproc\'.')
    return parser

In [3]:
# Enable cuDNN benchmark mode for this session.
cudnn.benchmark = True

# The notebook has no real command line, so a fixed argument list is parsed instead.
args = get_parser().parse_args([
    '/home/paperspace/data/imagenet',
    '--fp16',
    '--loss-scale', '512',
])

# Every process except local rank 0 redirects its stdout to a per-GPU log file.
if args.local_rank > 0:
    sys.stdout = open(f'{args.save_dir}/GPU_{args.local_rank}.log', 'w')

In [4]:
from functools import partial

In [17]:
class DataManager():
    """Holds the current train/validation loaders and swaps them on a resize schedule.

    Reads the module-level ``args`` (data, batch_size, epochs, workers,
    distributed, val_ar). ``resize_sched`` holds two fractions of the total
    epoch count at which the loaders are rebuilt with a larger image size.
    """
    def __init__(self, resize_sched=[0.4, 0.92]):
        self.resize_sched = resize_sched
        # Training starts on 128px images from the base data directory.
        self.load_data('', args.batch_size, 128)

    def set_epoch(self, epoch):
        """Rebuild the loaders if ``epoch`` is a resize point, then forward the epoch to the samplers."""
        def resize_epoch(idx):
            # Fraction of the run, rounded to the nearest whole epoch.
            return int(args.epochs*self.resize_sched[idx]+0.5)

        if epoch == resize_epoch(0):
            # Other directories/scales were tried here; the original author noted
            # lower validation accuracy for some of these variants.
            self.load_data('-sz/320', args.batch_size, 224, min_scale=0.0968)
        if epoch == resize_epoch(1):
            self.load_data('', 128, 288, min_scale=0.5, use_ar=args.val_ar)

        for sampler in (self.trn_smp, self.val_smp):
            if hasattr(sampler, 'set_epoch'):
                sampler.set_epoch(epoch)

    # For val_ar faster scheduler - [0.35,0.88]

    def get_trn_iter(self):
        """Create, remember and return a fresh iterator over the training loader."""
        fresh = iter(self.trn_dl)
        self.trn_iter = fresh
        return fresh

    def get_val_iter(self):
        """Create, remember and return a fresh iterator over the validation loader."""
        fresh = iter(self.val_dl)
        self.val_iter = fresh
        return fresh

    def load_data(self, dir_prefix, batch_size, image_size, **kwargs):
        """Build loaders for ``args.data + dir_prefix`` and install them on this object.

        Extra keyword arguments are passed straight through to ``get_loaders``.
        """
        datadir = args.data+dir_prefix
        print(f'Dataset changed. \nImage size: {image_size} \nBatch size: {batch_size} \nDirectory: {datadir}')
        raw_trn, raw_val, self.trn_smp, self.val_smp = get_loaders(
            datadir, bs=batch_size, sz=image_size, workers=args.workers,
            distributed=args.distributed, **kwargs)
        # Training batches go through mixup (alpha 0.6) before the prefetcher.
        self.trn_dl = DataPrefetcher(MixUpDataLoader(raw_trn, 0.6))
        self.val_dl = DataPrefetcher(raw_val, prefetch=False)
        self.trn_len, self.val_len = len(self.trn_dl), len(self.val_dl)
        # clear memory
        gc.collect()
        torch.cuda.empty_cache()

class Scheduler():
    """Per-batch learning-rate (and momentum) schedule for an optimizer.

    Reads the module-level ``args`` (lr, epochs, momentum, world_size, init_bn0).
    ``lr_sched`` holds four fractions of the total epoch count that delimit the
    phases: warmup -> lr -> lr/10 -> lr/100 -> lr/1000.
    """
    def __init__(self, optimizer, lr_sched=[0.1, 0.47, 0.78, 0.95]):
        self.optimizer = optimizer
        self.current_lr = None
        self.current_epoch = 0
        # Initialised here so the attribute exists before the first update_lr call.
        self.current_batch = 0
        self.lr_sched = lr_sched

    def bn0_lr_warmup(self, epoch, epoch_tot, batch_num, batch_tot):
        """Warmup used with --init-bn0.

        Starts at ``args.lr`` and adds ``args.lr / (epoch_tot * batch_tot)`` per
        batch; the result is scaled by 0.75 when ``args.world_size >= 64``.
        """
        world_size = args.world_size
        lr_step = args.lr / (epoch_tot * batch_tot)
        lr = args.lr + (epoch * batch_tot + batch_num) * lr_step
        if world_size >= 64: lr *= .75
        return lr

    def linear_lr_warmup(self, epoch, epoch_tot, batch_num, batch_tot):
        """Default warmup: the lr grows by a fixed step per epoch, interpolated per batch.

        The per-epoch step is ``(args.lr - args.lr/epoch_tot) / epoch_tot``.
        """
        starting_lr = args.lr/epoch_tot
        ending_lr = args.lr
        step_size = (ending_lr - starting_lr)/epoch_tot
        batch_step_size = step_size/batch_tot
        # A former world_size >= 32 branch rescaled starting_lr *after* lr had been
        # computed: it never changed the result and divided by zero at epoch 4.
        return step_size*(epoch+1) + batch_step_size*batch_num

    def get_lr(self, epoch, batch_num, batch_tot):
        """Return the learning rate for this epoch/batch according to ``lr_sched``."""
        # faster lr schedule [0.14, 0.43, 0.73, 0.94]
        # original lr schedule [0.1, 0.47, 0.78, 0.95]
        # The warmup length is the same truncated value that guards the branch, so
        # the warmup formulas see a whole number of epochs.
        warmup_epochs = int(args.epochs*self.lr_sched[0]+0.5)
        if epoch < warmup_epochs:
            warmup = self.bn0_lr_warmup if args.init_bn0 else self.linear_lr_warmup
            return warmup(epoch, warmup_epochs, batch_num, batch_tot)
        if epoch<int(args.epochs*self.lr_sched[1]+0.5): return args.lr/1
        if epoch<int(args.epochs*self.lr_sched[2]+0.5): return args.lr/10
        if epoch<int(args.epochs*self.lr_sched[3]+0.5): return args.lr/100
        return args.lr/1000

    def update_lr(self, epoch, batch_num, batch_tot):
        """Compute the current lr and write it (and momentum) into every param group."""
        lr = self.get_lr(epoch, batch_num, batch_tot)
        # Only announce changes at the first or last batch of an epoch.
        if (self.current_lr != lr) and ((batch_num == 0) or (batch_num+1 == batch_tot)): 
            print(f'Changing LR from {self.current_lr} to {lr}')

        self.current_lr = lr
        self.current_epoch = epoch
        self.current_batch = batch_num

        for param_group in self.optimizer.param_groups:
            # Falls back to the new lr when the stored one is falsy (0 or None).
            lr_old = param_group['lr'] or lr
            param_group['lr'] = lr

            # Trick 4: apply momentum correction when lr is updated
            # https://github.com/pytorch/examples/pull/262
            if lr > lr_old: param_group['momentum'] = lr / lr_old * args.momentum
            else: param_group['momentum'] = args.momentum


def init_dist_weights(model):
    """Re-initialise selected weights of a resnet in place.

    Does nothing unless ``args.arch`` starts with 'resnet'. Otherwise the last
    batch-norm weight of every BasicBlock (bn2) and Bottleneck (bn3) is replaced
    with zeros, and every nn.Linear weight is drawn from N(0, 0.01).

    References:
        https://arxiv.org/pdf/1706.02677.pdf
        https://github.com/pytorch/examples/pull/262
    """
    if not args.arch.startswith('resnet'):
        return
    for module in model.modules():
        if isinstance(module, resnet.BasicBlock):
            module.bn2.weight = Parameter(torch.zeros_like(module.bn2.weight))
        if isinstance(module, resnet.Bottleneck):
            module.bn3.weight = Parameter(torch.zeros_like(module.bn3.weight))
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(0, 0.01)

def str_to_num_array(argstr):
    """Parse a comma-separated string such as '0.4,0.92' into a list of floats."""
    return list(map(float, argstr.split(',')))

def to_python_float(t):
    """Extract a Python scalar from ``t``.

    item() is a recent addition, so objects without it fall back to ``t[0]``
    for backward compatibility.
    """
    return t.item() if hasattr(t, 'item') else t[0]

def train(trn_iter, trn_len, model, criterion, optimizer, scheduler, epoch):
    """Run one training epoch.

    Args:
        trn_iter: iterator yielding ``(input, target)`` batches.
        trn_len: number of batches per epoch (used for the lr schedule and logging).
        model: network to train; called as ``model(input)``.
        criterion: loss callable, ``criterion(output, target)``.
        optimizer: optimizer stepping the (master) parameters.
        scheduler: object with ``update_lr(epoch, batch_num, batch_tot)``.
        epoch: current epoch index.

    Reads the module-level ``args`` and, in fp16 mode, the module-level
    ``model_params`` / ``master_params`` lists.
    """
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    # Training accuracy is not computed (the accuracy call is disabled), so these
    # meters stay at zero; they are kept so the log line format does not change.
    top1 = AverageMeter()
    top5 = AverageMeter()

    # switch to train mode
    model.train()
    end = time.time()

    for i,(input,target) in enumerate(trn_iter):
        if args.prof and (i > 200): break

        # measure data loading time
        data_time.update(time.time() - end)
        scheduler.update_lr(epoch, i, trn_len)

        # compute output
        output = model(input)
        loss = criterion(output, target)

        # Only the loss is averaged across processes. The old code also reduced
        # prec1/prec5 here, which are never assigned and raised UnboundLocalError.
        if args.distributed:
            reduced_loss = reduce_tensor(loss.data)
        else:
            reduced_loss = loss.data

        losses.update(to_python_float(reduced_loss), input.size(0))

        # compute gradient and do SGD step
        if args.fp16:
            # Loss scaling belongs to the fp16 path only: gradients are divided by
            # the same factor below before the optimizer step.
            model.zero_grad()
            (loss*args.loss_scale).backward()
            model_grads_to_master_grads(model_params, master_params)
            for param in master_params:
                param.grad.data = param.grad.data/args.loss_scale
            optimizer.step()
            master_params_to_model_params(model_params, master_params)
            torch.cuda.synchronize()
        else:
            # No unscaling happens here, so the loss must not be scaled either;
            # scaling it would multiply the effective learning rate by loss_scale.
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - end)

        end = time.time()

        should_print = ((i+1) % args.print_freq == 0) or (i+1 == trn_len)
        if args.local_rank == 0 and should_print:
            log_line = ('Epoch: [{0}][{1}/{2}]\t' \
                    + 'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' \
                    + 'Data {data_time.val:.3f} ({data_time.avg:.3f})\t' \
                    + 'Loss {loss.val:.4f} ({loss.avg:.4f})\t' \
                    + 'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t' \
                    + 'Prec@5 {top5.val:.3f} ({top5.avg:.3f})').format(
                    epoch, i+1, trn_len, batch_time=batch_time,
                    data_time=data_time, loss=losses, top1=top1, top5=top5)
            print(log_line)
            with open(f'{args.save_dir}/full.log', 'a') as f:
                f.write(log_line + '\n')
    
def validate(val_iter, val_len, model, criterion, epoch, start_time):
    """Evaluate ``model`` over ``val_iter`` and return the average top-5 precision.

    In distributed mode each batch goes through ``distributed_predict``;
    otherwise the model is run locally under ``torch.no_grad()``. Progress lines
    are printed and appended to ``full.log`` by local rank 0, and a summary with
    the epoch, elapsed hours since ``start_time`` and top-5 is printed at the end.
    """
    batch_time, losses, top1, top5 = (AverageMeter() for _ in range(4))

    model.eval()
    end = time.time()

    for step, (input, target) in enumerate(val_iter, start=1):
        if not args.distributed:
            with torch.no_grad():
                output = model(input)
                loss = criterion(output, target).data
            tot_batch = input.size(0)
            prec1, prec5 = accuracy(output.data, target, topk=(1,5))
        else:
            prec1, prec5, loss, tot_batch = distributed_predict(input, target, model, criterion)

        # Each meter is weighted by the number of samples behind the value.
        for meter, value in ((losses, loss), (top1, prec1), (top5, prec5)):
            meter.update(to_python_float(value), tot_batch)

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        should_print = (step % args.print_freq == 0) or (step == val_len)
        if args.local_rank == 0 and should_print:
            output = ('Test: [{0}/{1}]\t' \
                    + 'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' \
                    + 'Loss {loss.val:.4f} ({loss.avg:.4f})\t' \
                    + 'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t' \
                    + 'Prec@5 {top5.val:.3f} ({top5.avg:.3f})').format(
                    step, val_len, batch_time=batch_time, loss=losses,
                    top1=top1, top5=top5)
            print(output)
            with open(f'{args.save_dir}/full.log', 'a') as f:
                f.write(output + '\n')

    time_diff = datetime.now()-start_time
    print(f'~~{epoch}\t{float(time_diff.total_seconds() / 3600.0)}\t{top5.avg:.3f}\n')
    print(' * Prec@1 {top1.avg:.3f} Prec@5 {top5.avg:.3f}'.format(top1=top1, top5=top5))

    return top5.avg

def distributed_predict(input, target, model, criterion):
    """Evaluate one validation batch and combine the results across processes.

    A process whose batch is empty (``input.size(0) == 0``) skips the forward
    pass but still takes part in every ``sum_tensor`` call below.

    Returns:
        (prec1, prec5, reduced_loss, tot_batch): top-1 / top-5 precision in
        percent over the summed batch, the summed loss divided by the number of
        processes that had data, and the summed batch size.
    """
    batch_size = input.size(0)
    # All five names start out bound to the same placeholder tensor ([0] on the GPU).
    # NOTE(review): the placeholder is an integer tensor of shape [1], while a loss
    # returned by criterion is presumably a floating-point scalar — confirm that
    # summing the two kinds across processes behaves as intended.
    output = loss = corr1 = corr5 = valid_batches = torch.tensor([0]).cuda()
    
    if batch_size:
        # compute output
        with torch.no_grad():
            # using module instead of model because DistributedDataParallel forward function has a sync point.
            # with distributed validation sampler, we don't always have data for each gpu
            assert(isinstance(model, nn.parallel.DistributedDataParallel))
            output = model.module(input)
            loss = criterion(output, target)
        # measure accuracy and record loss
        valid_batches = torch.tensor([1]).cuda()
        corr1, corr5 = correct(output.data, target, topk=(1, 5))
    # The reductions below must run in this order on every process.
    batch_tensor = torch.tensor([batch_size]).cuda()
    tot_batch = sum_tensor(batch_tensor).item()
    valid_batches = sum_tensor(valid_batches).item()
    # NOTE(review): valid_batches and tot_batch are 0 if no process had data,
    # which would make the divisions below fail — confirm this cannot happen.
    reduced_loss = sum_tensor(loss.data)/valid_batches

    corr1 = sum_tensor(corr1).float()
    corr5 = sum_tensor(corr5).float()
    prec1 = corr1*(100.0/tot_batch)
    prec5 = corr5*(100.0/tot_batch)
    return prec1, prec5, reduced_loss, tot_batch

def save_checkpoint(state, is_best, filename='checkpoint.pth.tar'):
    """Serialise ``state`` to ``filename``; when ``is_best``, also copy it to
    ``model_best.pth.tar`` inside ``args.save_dir``."""
    torch.save(state, filename)
    if not is_best:
        return
    best_path = f'{args.save_dir}/model_best.pth.tar'
    shutil.copyfile(filename, best_path)

class AverageMeter(object):
    """Tracks the latest value plus a weighted running sum, count and average."""
    def __init__(self):
        self.reset()

    def reset(self):
        """Zero every statistic."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` as the latest value, weighted by ``n`` in the average."""
        self.count += n
        self.sum += val * n
        self.val = val
        self.avg = self.sum / self.count

def accuracy_fastai(preds, targs):
    """Fraction of rows of ``preds`` whose highest-scoring column index equals ``targs``."""
    _, predicted = torch.max(preds, dim=1)
    hits = (predicted == targs).float()
    return hits.mean()

def accuracy(output, target, topk=(1,)):
    """Computes the precision@k (in percent) for the specified values of k.

    The per-k hit counts come from ``correct`` and are scaled by
    ``100 / batch size``, where the batch size is ``target.size(0)``.
    """
    hits_per_k = correct(output, target, topk)
    scale = 100.0 / target.size(0)
    return [hits.float().mul_(scale) for hits in hits_per_k]

def correct(output, target, topk=(1,)):
    """Count, for each k in ``topk``, the samples whose target is among the top-k predictions.

    Args:
        output: 2-D score tensor, one row per sample (top-k is taken along dim 1).
        target: 1-D tensor of class indices, one per row of ``output``.
        topk: iterable of k values.

    Returns:
        list of one-element tensors holding the number of hits for each k
        (counts, not percentages).
    """
    maxk = max(topk)

    _, pred = output.topk(maxk, 1, True, True)
    # Transpose so that row j holds every sample's (j+1)-th best prediction.
    pred = pred.t()
    hits = pred.eq(target.view(1, -1).expand_as(pred))

    # reshape, not view: the transposed layout is not contiguous, and view(-1) on a
    # multi-row slice of it raises on current PyTorch.
    return [hits[:k].reshape(-1).sum(0, keepdim=True) for k in topk]


def sum_tensor(tensor):
    """Return a copy of ``tensor`` after an all-reduce SUM over the process group.

    The input itself is left unmodified; the reduction is applied to a clone.
    """
    total = tensor.clone()
    dist.all_reduce(total, op=dist.reduce_op.SUM)
    return total

def reduce_tensor(tensor):
    """Return a copy of ``tensor`` averaged over the process group.

    The clone is all-reduced with SUM and then divided in place by
    ``args.world_size``; the input itself is left unmodified.
    """
    averaged = tensor.clone()
    dist.all_reduce(averaged, op=dist.reduce_op.SUM)
    averaged /= args.world_size
    return averaged

In [18]:
from functools import partial

In [19]:
class MixUpDataLoader(object):
    """
    Creates a new data loader with mixup from a given dataloader.
    
    Mixup is applied between a batch and a shuffled version of itself. 
    If we use a regular beta distribution, this can create near duplicates as some lines might be 
    1 * original + 0 * shuffled while others could be 0 * original + 1 * shuffled, this is why
    there is a trick where we take the maximum of lambda and 1-lambda.
    
    Arguments:
    dl (DataLoader): the data loader to mix up
    alpha (float): value of the parameter to use in the beta distribution.

    Each yielded item is ``(mixed_x, [y, y_shuffled, lambd])`` where ``lambd`` is
    the per-image mixing weight converted to half precision.
    """
    def __init__(self, dl, alpha):
        self.dl, self.alpha = dl, alpha
        
    def __len__(self):
        return len(self.dl)
    
    def __iter__(self):
        for (x, y) in iter(self.dl):
            n = y.size(0)
            #Taking one different lambda per image speeds up training 
            lambd = np.random.beta(self.alpha, self.alpha, n)
            #Trick to avoid near duplicates
            lambd = np.maximum(lambd, 1-lambd)
            lambd = torch.from_numpy(lambd).float()
            shuffle = torch.randperm(n)
            x = x.float()
            x1, y1 = x[shuffle], y[shuffle]
            # One weight per image, shaped (n,1,1,1) to broadcast over a 4-D batch.
            # (Leftover expression statements that built and discarded these
            # products a second time were removed.)
            weight = lambd.view(n,1,1,1)
            new_x = x * weight + x1 * (1-weight)
            yield (new_x, [y, y1, lambd.half()])

class MixUpLoss(nn.Module):
    """
    Wraps a loss function so it can consume mixup targets.

    The targets are class indices rather than one-hot vectors, so instead of
    mixing targets the two per-element losses are mixed, relying on the loss
    being linear in the target.

    Argument:
    crit: a zero-argument factory for the loss function. The loss it builds must
    return one value per element (e.g. created with reduce=False).
    """
    def __init__(self, crit):
        super().__init__()
        self.crit = crit()

    def forward(self, output, target):
        if isinstance(target, list):
            # target is [y, y_shuffled, lambd]: weight the two losses per element.
            loss_first = self.crit(output, target[0])
            loss_second = self.crit(output, target[1])
            mixed = loss_first * target[2] + loss_second * (1-target[2])
            return mixed.mean()
        # Plain targets: ordinary mean of the per-element loss.
        return self.crit(output, target).mean()

In [20]:
# Run set-up: echo the configuration, initialise distributed state if requested,
# and build the model. Statement order matters here.
print(args)
# Header for the per-epoch summary lines that validate() prints.
# NOTE(review): the header says top1Accuracy but validate() prints top5.avg in
# that column — confirm which one is intended.
print("~~epoch\thours\ttop1Accuracy\n")

# need to index validation directory before we start counting the time
if args.val_ar: sort_ar(args.data+'/validation')

# Reference point for the elapsed-hours column in the summary lines.
start_time = datetime.now()

if args.distributed:
    print('Distributed: initializing process group')
    # Select this process's GPU before the process group is created.
    torch.cuda.set_device(args.local_rank)
    dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url, world_size=args.world_size)
    assert(args.world_size == dist.get_world_size())

if args.fp16: assert torch.backends.cudnn.enabled, "fp16 mode requires cudnn backend to be enabled."

# NOTE(review): args.arch is not consulted here; a resnet50 is always built.
model = resnet.resnet50(pretrained=args.pretrained)
print("Loaded model")

Namespace(arch='resnet50', batch_size=192, data='/home/paperspace/data/imagenet', dist_backend='nccl', dist_url='env://', distributed=False, epochs=45, evaluate=False, fp16=True, init_bn0=False, local_rank=0, loss_scale=512.0, lr=0.1, lr_sched='0.1,0.47,0.78,0.95', momentum=0.9, pretrained=False, print_freq=10, prof=False, resize_sched='0.4,0.92', resume='', save_dir=PosixPath('/home/paperspace/fastai/courses/dl2'), start_epoch=0, val_ar=False, weight_decay=0.0001, workers=8, world_size=-1)
~~epoch	hours	top1Accuracy

Loaded model


In [21]:
%pdb on

Automatic pdb calling has been turned ON


In [None]:
# Move the model to the GPU, wrap it for fp16 / distributed use, then run the
# epoch loop with validation and best-model checkpointing.
model = model.cuda()
n_dev = torch.cuda.device_count()
if args.fp16: model = network_to_half(model)
if args.distributed:
    if args.init_bn0: init_dist_weights(model) # (AS) Performs pretty poorly for first 10 epochs when enabled
    model = nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank], output_device=args.local_rank)

# train() reads model_params / master_params as module-level names in fp16 mode.
# (The global statement has no effect at module level; it is kept as a marker.)
global model_params, master_params
if args.fp16:  model_params, master_params = prep_param_lists(model)
else: master_params = list(model.parameters())

# define loss function (criterion) and optimizer
# criterion = nn.CrossEntropyLoss().cuda()
criterion = MixUpLoss(partial(nn.CrossEntropyLoss, reduce=False)).cuda()

# The optimizer steps the master parameters, not the model's own parameters.
optimizer = torch.optim.SGD(master_params, args.lr, momentum=args.momentum, weight_decay=args.weight_decay)
scheduler = Scheduler(optimizer, str_to_num_array(args.lr_sched))

print("Defined loss and optimizer")

best_prec5 = 93 # only save models over 93%. Otherwise it stops to save every time
# optionally resume from a checkpoint
if args.resume:
    if os.path.isfile(args.resume):
        checkpoint = torch.load(args.resume, map_location = lambda storage, loc: storage.cuda(args.local_rank))
        args.start_epoch = checkpoint['epoch']
        # Checkpoints are written with the key 'best_prec5' (see the save below).
        # Reading 'best_prec1' raised KeyError and never restored the threshold.
        best_prec5 = checkpoint.get('best_prec5', best_prec5)
        model.load_state_dict(checkpoint['state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer'])
    else: print("=> no checkpoint found at '{}'".format(args.resume))

dm = DataManager(str_to_num_array(args.resize_sched))
print("Created data loaders")

print("Begin training")
estart = time.time()
for epoch in range(args.start_epoch, args.epochs):
    estart = time.time()
    # May rebuild the loaders at a new image size; also forwards the epoch to the samplers.
    dm.set_epoch(epoch)

    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=UserWarning)
        train(dm.get_trn_iter(), len(dm.trn_dl), model, criterion, optimizer, scheduler, epoch)

    if args.prof: break
    prec5 = validate(dm.get_val_iter(), len(dm.val_dl), model, criterion, epoch, start_time)

    # Only local rank 0 writes checkpoints, and only when top-5 improves.
    is_best = prec5 > best_prec5
    if args.local_rank == 0 and is_best:
        best_prec5 = max(prec5, best_prec5)
        save_checkpoint({
            'epoch': epoch + 1, 'arch': args.arch, 'state_dict': model.state_dict(),
            'best_prec5': best_prec5, 'optimizer' : optimizer.state_dict(),
        }, is_best)

# save script so we can reproduce from logs
# __file__ only exists when this runs as a script; inside a notebook the bare
# reference raised NameError after training had already finished.
if '__file__' in globals():
    shutil.copy2(os.path.realpath(__file__), f'{args.save_dir}')
else:
    print('Skipping script copy: __file__ is not defined (running inside a notebook).')

Defined loss and optimizer
Dataset changed. 
Image size: 128 
Batch size: 192 
Directory: /home/paperspace/data/imagenet
Created data loaders
Begin training
Changing LR from None to 0.016
Epoch: [0][10/6673]	Time 0.456 (0.920)	Data 0.350 (0.682)	Loss 7.3281 (7.1543)	Prec@1 0.000 (0.000)	Prec@5 0.000 (0.000)
Epoch: [0][20/6673]	Time 0.333 (0.676)	Data 0.227 (0.501)	Loss 7.4062 (7.2758)	Prec@1 0.000 (0.000)	Prec@5 0.000 (0.000)
Epoch: [0][30/6673]	Time 0.374 (0.597)	Data 0.268 (0.444)	Loss 7.3984 (7.3301)	Prec@1 0.000 (0.000)	Prec@5 0.000 (0.000)
Epoch: [0][40/6673]	Time 0.492 (0.559)	Data 0.373 (0.416)	Loss 7.3203 (7.3310)	Prec@1 0.000 (0.000)	Prec@5 0.000 (0.000)
Epoch: [0][50/6673]	Time 0.467 (0.538)	Data 0.357 (0.401)	Loss 7.2266 (7.3163)	Prec@1 0.000 (0.000)	Prec@5 0.000 (0.000)
Epoch: [0][60/6673]	Time 0.490 (0.528)	Data 0.384 (0.396)	Loss 7.1328 (7.3002)	Prec@1 0.000 (0.000)	Prec@5 0.000 (0.000)
Epoch: [0][70/6673]	Time 0.370 (0.519)	Data 0.247 (0.390)	Loss 7.0078 (7.2836)	Prec@1 

Epoch: [0][670/6673]	Time 0.395 (0.466)	Data 0.290 (0.352)	Loss 6.7891 (6.9250)	Prec@1 0.000 (0.000)	Prec@5 0.000 (0.000)
Epoch: [0][680/6673]	Time 0.402 (0.465)	Data 0.298 (0.351)	Loss 6.7344 (6.9226)	Prec@1 0.000 (0.000)	Prec@5 0.000 (0.000)
Epoch: [0][690/6673]	Time 0.420 (0.465)	Data 0.289 (0.351)	Loss 6.7227 (6.9202)	Prec@1 0.000 (0.000)	Prec@5 0.000 (0.000)
Epoch: [0][700/6673]	Time 0.488 (0.465)	Data 0.377 (0.350)	Loss 6.7773 (6.9182)	Prec@1 0.000 (0.000)	Prec@5 0.000 (0.000)
Epoch: [0][710/6673]	Time 0.486 (0.464)	Data 0.371 (0.350)	Loss 6.7383 (6.9158)	Prec@1 0.000 (0.000)	Prec@5 0.000 (0.000)
Epoch: [0][720/6673]	Time 0.423 (0.464)	Data 0.306 (0.350)	Loss 6.6953 (6.9133)	Prec@1 0.000 (0.000)	Prec@5 0.000 (0.000)
Epoch: [0][730/6673]	Time 0.477 (0.464)	Data 0.370 (0.350)	Loss 6.7383 (6.9106)	Prec@1 0.000 (0.000)	Prec@5 0.000 (0.000)
Epoch: [0][740/6673]	Time 0.447 (0.464)	Data 0.340 (0.350)	Loss 6.7188 (6.9082)	Prec@1 0.000 (0.000)	Prec@5 0.000 (0.000)
Epoch: [0][750/6673]	Tim

Epoch: [0][1340/6673]	Time 0.409 (0.455)	Data 0.295 (0.342)	Loss 6.4805 (6.7991)	Prec@1 0.000 (0.000)	Prec@5 0.000 (0.000)
Epoch: [0][1350/6673]	Time 0.416 (0.454)	Data 0.293 (0.342)	Loss 6.4805 (6.7975)	Prec@1 0.000 (0.000)	Prec@5 0.000 (0.000)
Epoch: [0][1360/6673]	Time 0.423 (0.455)	Data 0.311 (0.342)	Loss 6.5625 (6.7961)	Prec@1 0.000 (0.000)	Prec@5 0.000 (0.000)
Epoch: [0][1370/6673]	Time 0.515 (0.454)	Data 0.407 (0.341)	Loss 6.6289 (6.7947)	Prec@1 0.000 (0.000)	Prec@5 0.000 (0.000)
Epoch: [0][1380/6673]	Time 0.469 (0.454)	Data 0.364 (0.341)	Loss 6.6328 (6.7932)	Prec@1 0.000 (0.000)	Prec@5 0.000 (0.000)
Epoch: [0][1390/6673]	Time 0.514 (0.454)	Data 0.403 (0.341)	Loss 6.5938 (6.7918)	Prec@1 0.000 (0.000)	Prec@5 0.000 (0.000)
Epoch: [0][1400/6673]	Time 0.408 (0.454)	Data 0.301 (0.341)	Loss 6.6016 (6.7903)	Prec@1 0.000 (0.000)	Prec@5 0.000 (0.000)
Epoch: [0][1410/6673]	Time 0.405 (0.454)	Data 0.275 (0.341)	Loss 6.6406 (6.7886)	Prec@1 0.000 (0.000)	Prec@5 0.000 (0.000)
Epoch: [0][1420/

Epoch: [0][2010/6673]	Time 0.444 (0.451)	Data 0.335 (0.339)	Loss 6.4258 (6.7006)	Prec@1 0.000 (0.000)	Prec@5 0.000 (0.000)
Epoch: [0][2020/6673]	Time 0.438 (0.451)	Data 0.325 (0.338)	Loss 6.3164 (6.6991)	Prec@1 0.000 (0.000)	Prec@5 0.000 (0.000)
Epoch: [0][2030/6673]	Time 0.444 (0.451)	Data 0.325 (0.338)	Loss 6.4414 (6.6977)	Prec@1 0.000 (0.000)	Prec@5 0.000 (0.000)
Epoch: [0][2040/6673]	Time 0.451 (0.450)	Data 0.344 (0.338)	Loss 6.2266 (6.6960)	Prec@1 0.000 (0.000)	Prec@5 0.000 (0.000)
Epoch: [0][2050/6673]	Time 0.477 (0.450)	Data 0.355 (0.338)	Loss 6.4414 (6.6948)	Prec@1 0.000 (0.000)	Prec@5 0.000 (0.000)
Epoch: [0][2060/6673]	Time 0.452 (0.450)	Data 0.341 (0.338)	Loss 6.4688 (6.6935)	Prec@1 0.000 (0.000)	Prec@5 0.000 (0.000)
Epoch: [0][2070/6673]	Time 0.474 (0.450)	Data 0.369 (0.338)	Loss 6.3438 (6.6922)	Prec@1 0.000 (0.000)	Prec@5 0.000 (0.000)
Epoch: [0][2080/6673]	Time 0.390 (0.450)	Data 0.283 (0.338)	Loss 6.3828 (6.6907)	Prec@1 0.000 (0.000)	Prec@5 0.000 (0.000)
Epoch: [0][2090/

Epoch: [0][2680/6673]	Time 0.248 (0.432)	Data 0.143 (0.320)	Loss 6.0859 (6.6091)	Prec@1 0.000 (0.000)	Prec@5 0.000 (0.000)
Epoch: [0][2690/6673]	Time 0.244 (0.431)	Data 0.125 (0.319)	Loss 6.4102 (6.6080)	Prec@1 0.000 (0.000)	Prec@5 0.000 (0.000)
Epoch: [0][2700/6673]	Time 0.296 (0.431)	Data 0.191 (0.319)	Loss 6.2305 (6.6067)	Prec@1 0.000 (0.000)	Prec@5 0.000 (0.000)
Epoch: [0][2710/6673]	Time 0.278 (0.431)	Data 0.165 (0.319)	Loss 6.2383 (6.6055)	Prec@1 0.000 (0.000)	Prec@5 0.000 (0.000)
Epoch: [0][2720/6673]	Time 0.237 (0.430)	Data 0.133 (0.318)	Loss 6.3398 (6.6044)	Prec@1 0.000 (0.000)	Prec@5 0.000 (0.000)
Epoch: [0][2730/6673]	Time 0.295 (0.430)	Data 0.189 (0.318)	Loss 6.4102 (6.6031)	Prec@1 0.000 (0.000)	Prec@5 0.000 (0.000)
Epoch: [0][2740/6673]	Time 0.535 (0.430)	Data 0.427 (0.318)	Loss 6.3086 (6.6019)	Prec@1 0.000 (0.000)	Prec@5 0.000 (0.000)
Epoch: [0][2750/6673]	Time 0.314 (0.429)	Data 0.180 (0.317)	Loss 6.3281 (6.6006)	Prec@1 0.000 (0.000)	Prec@5 0.000 (0.000)
Epoch: [0][2760/

Epoch: [0][3350/6673]	Time 0.249 (0.412)	Data 0.129 (0.300)	Loss 6.0547 (6.5300)	Prec@1 0.000 (0.000)	Prec@5 0.000 (0.000)
Epoch: [0][3360/6673]	Time 0.232 (0.412)	Data 0.127 (0.299)	Loss 6.2500 (6.5289)	Prec@1 0.000 (0.000)	Prec@5 0.000 (0.000)
Epoch: [0][3370/6673]	Time 0.257 (0.411)	Data 0.143 (0.299)	Loss 6.1562 (6.5279)	Prec@1 0.000 (0.000)	Prec@5 0.000 (0.000)
Epoch: [0][3380/6673]	Time 0.285 (0.411)	Data 0.168 (0.299)	Loss 6.1641 (6.5268)	Prec@1 0.000 (0.000)	Prec@5 0.000 (0.000)
Epoch: [0][3390/6673]	Time 0.294 (0.411)	Data 0.175 (0.299)	Loss 6.1445 (6.5257)	Prec@1 0.000 (0.000)	Prec@5 0.000 (0.000)
Epoch: [0][3400/6673]	Time 0.298 (0.410)	Data 0.172 (0.298)	Loss 6.2188 (6.5244)	Prec@1 0.000 (0.000)	Prec@5 0.000 (0.000)
Epoch: [0][3410/6673]	Time 0.275 (0.410)	Data 0.153 (0.298)	Loss 6.1172 (6.5233)	Prec@1 0.000 (0.000)	Prec@5 0.000 (0.000)
Epoch: [0][3420/6673]	Time 0.286 (0.410)	Data 0.172 (0.298)	Loss 6.2188 (6.5221)	Prec@1 0.000 (0.000)	Prec@5 0.000 (0.000)
Epoch: [0][3430/

Epoch: [0][4020/6673]	Time 0.313 (0.402)	Data 0.208 (0.290)	Loss 6.1562 (6.4577)	Prec@1 0.000 (0.000)	Prec@5 0.000 (0.000)
Epoch: [0][4030/6673]	Time 0.377 (0.402)	Data 0.245 (0.290)	Loss 5.9141 (6.4567)	Prec@1 0.000 (0.000)	Prec@5 0.000 (0.000)
Epoch: [0][4040/6673]	Time 0.342 (0.402)	Data 0.228 (0.289)	Loss 5.9531 (6.4557)	Prec@1 0.000 (0.000)	Prec@5 0.000 (0.000)
Epoch: [0][4050/6673]	Time 0.515 (0.402)	Data 0.389 (0.289)	Loss 5.9844 (6.4546)	Prec@1 0.000 (0.000)	Prec@5 0.000 (0.000)
Epoch: [0][4060/6673]	Time 0.283 (0.401)	Data 0.160 (0.289)	Loss 5.9688 (6.4537)	Prec@1 0.000 (0.000)	Prec@5 0.000 (0.000)
Epoch: [0][4070/6673]	Time 0.351 (0.401)	Data 0.243 (0.289)	Loss 5.9336 (6.4527)	Prec@1 0.000 (0.000)	Prec@5 0.000 (0.000)
Epoch: [0][4080/6673]	Time 0.387 (0.401)	Data 0.268 (0.289)	Loss 6.0859 (6.4517)	Prec@1 0.000 (0.000)	Prec@5 0.000 (0.000)
Epoch: [0][4090/6673]	Time 0.364 (0.401)	Data 0.252 (0.289)	Loss 6.0547 (6.4507)	Prec@1 0.000 (0.000)	Prec@5 0.000 (0.000)
Epoch: [0][4100/

Epoch: [0][4690/6673]	Time 0.318 (0.396)	Data 0.210 (0.284)	Loss 5.9844 (6.3924)	Prec@1 0.000 (0.000)	Prec@5 0.000 (0.000)
Epoch: [0][4700/6673]	Time 0.361 (0.396)	Data 0.245 (0.284)	Loss 5.9648 (6.3915)	Prec@1 0.000 (0.000)	Prec@5 0.000 (0.000)
Epoch: [0][4710/6673]	Time 0.356 (0.396)	Data 0.247 (0.284)	Loss 5.8945 (6.3906)	Prec@1 0.000 (0.000)	Prec@5 0.000 (0.000)
Epoch: [0][4720/6673]	Time 0.442 (0.396)	Data 0.332 (0.284)	Loss 6.0000 (6.3897)	Prec@1 0.000 (0.000)	Prec@5 0.000 (0.000)
Epoch: [0][4730/6673]	Time 0.273 (0.396)	Data 0.161 (0.284)	Loss 6.0820 (6.3888)	Prec@1 0.000 (0.000)	Prec@5 0.000 (0.000)
Epoch: [0][4740/6673]	Time 0.366 (0.396)	Data 0.239 (0.284)	Loss 5.9609 (6.3879)	Prec@1 0.000 (0.000)	Prec@5 0.000 (0.000)
Epoch: [0][4750/6673]	Time 0.321 (0.396)	Data 0.204 (0.284)	Loss 5.9492 (6.3871)	Prec@1 0.000 (0.000)	Prec@5 0.000 (0.000)
Epoch: [0][4760/6673]	Time 0.365 (0.396)	Data 0.245 (0.284)	Loss 6.1055 (6.3862)	Prec@1 0.000 (0.000)	Prec@5 0.000 (0.000)
Epoch: [0][4770/

Epoch: [0][5360/6673]	Time 0.318 (0.393)	Data 0.203 (0.280)	Loss 5.8750 (6.3333)	Prec@1 0.000 (0.000)	Prec@5 0.000 (0.000)
Epoch: [0][5370/6673]	Time 0.347 (0.393)	Data 0.232 (0.280)	Loss 5.8672 (6.3324)	Prec@1 0.000 (0.000)	Prec@5 0.000 (0.000)
Epoch: [0][5380/6673]	Time 0.294 (0.393)	Data 0.184 (0.280)	Loss 5.8867 (6.3316)	Prec@1 0.000 (0.000)	Prec@5 0.000 (0.000)
Epoch: [0][5390/6673]	Time 0.306 (0.393)	Data 0.191 (0.280)	Loss 5.7930 (6.3307)	Prec@1 0.000 (0.000)	Prec@5 0.000 (0.000)
Epoch: [0][5400/6673]	Time 0.231 (0.392)	Data 0.127 (0.280)	Loss 5.7930 (6.3298)	Prec@1 0.000 (0.000)	Prec@5 0.000 (0.000)
Epoch: [0][5410/6673]	Time 0.381 (0.392)	Data 0.268 (0.280)	Loss 5.9453 (6.3290)	Prec@1 0.000 (0.000)	Prec@5 0.000 (0.000)
Epoch: [0][5420/6673]	Time 0.285 (0.392)	Data 0.178 (0.280)	Loss 5.9648 (6.3282)	Prec@1 0.000 (0.000)	Prec@5 0.000 (0.000)
Epoch: [0][5430/6673]	Time 0.328 (0.392)	Data 0.207 (0.280)	Loss 5.9141 (6.3274)	Prec@1 0.000 (0.000)	Prec@5 0.000 (0.000)
Epoch: [0][5440/

Epoch: [0][6030/6673]	Time 0.354 (0.390)	Data 0.222 (0.278)	Loss 5.8320 (6.2776)	Prec@1 0.000 (0.000)	Prec@5 0.000 (0.000)
Epoch: [0][6040/6673]	Time 0.359 (0.390)	Data 0.238 (0.278)	Loss 5.9453 (6.2768)	Prec@1 0.000 (0.000)	Prec@5 0.000 (0.000)
Epoch: [0][6050/6673]	Time 0.403 (0.390)	Data 0.279 (0.278)	Loss 5.7695 (6.2760)	Prec@1 0.000 (0.000)	Prec@5 0.000 (0.000)
Epoch: [0][6060/6673]	Time 0.379 (0.390)	Data 0.259 (0.278)	Loss 5.8203 (6.2753)	Prec@1 0.000 (0.000)	Prec@5 0.000 (0.000)
Epoch: [0][6070/6673]	Time 0.396 (0.390)	Data 0.291 (0.278)	Loss 5.6875 (6.2746)	Prec@1 0.000 (0.000)	Prec@5 0.000 (0.000)
Epoch: [0][6080/6673]	Time 0.367 (0.390)	Data 0.258 (0.278)	Loss 5.7734 (6.2737)	Prec@1 0.000 (0.000)	Prec@5 0.000 (0.000)
Epoch: [0][6090/6673]	Time 0.320 (0.390)	Data 0.215 (0.278)	Loss 5.6055 (6.2728)	Prec@1 0.000 (0.000)	Prec@5 0.000 (0.000)
Epoch: [0][6100/6673]	Time 0.323 (0.390)	Data 0.206 (0.278)	Loss 5.8828 (6.2721)	Prec@1 0.000 (0.000)	Prec@5 0.000 (0.000)
Epoch: [0][6110/



Test: [10/261]	Time 0.229 (0.536)	Loss 5.7578 (4.7227)	Prec@1 1.042 (12.865)	Prec@5 4.167 (30.417)
Test: [20/261]	Time 0.035 (0.375)	Loss 4.5195 (4.9164)	Prec@1 10.938 (10.130)	Prec@5 33.333 (25.990)
Test: [30/261]	Time 0.035 (0.317)	Loss 5.3945 (4.8714)	Prec@1 9.375 (10.816)	Prec@5 16.667 (27.031)
Test: [40/261]	Time 0.845 (0.310)	Loss 5.0938 (4.8487)	Prec@1 7.292 (11.263)	Prec@5 23.438 (27.617)
Test: [50/261]	Time 0.075 (0.307)	Loss 5.2539 (4.9007)	Prec@1 2.604 (9.844)	Prec@5 15.625 (25.458)
Test: [60/261]	Time 0.035 (0.290)	Loss 5.2930 (4.9426)	Prec@1 5.729 (9.280)	Prec@5 15.625 (23.845)
Test: [70/261]	Time 0.035 (0.276)	Loss 6.2422 (4.9499)	Prec@1 0.000 (8.624)	Prec@5 0.000 (23.281)
Test: [80/261]	Time 0.423 (0.277)	Loss 4.5938 (4.9333)	Prec@1 4.167 (8.639)	Prec@5 31.771 (23.893)
Test: [90/261]	Time 0.822 (0.285)	Loss 5.2734 (4.9117)	Prec@1 6.771 (9.051)	Prec@5 15.104 (24.259)
Test: [100/261]	Time 0.035 (0.277)	Loss 4.6211 (4.9259)	Prec@1 7.812 (8.844)	Prec@5 31.771 (23.964)
Test: 

Epoch: [1][460/6673]	Time 0.441 (0.435)	Data 0.336 (0.324)	Loss 5.5547 (5.7330)	Prec@1 0.000 (0.000)	Prec@5 0.000 (0.000)
Epoch: [1][470/6673]	Time 0.472 (0.435)	Data 0.366 (0.324)	Loss 5.5820 (5.7317)	Prec@1 0.000 (0.000)	Prec@5 0.000 (0.000)
Epoch: [1][480/6673]	Time 0.406 (0.434)	Data 0.301 (0.323)	Loss 5.7617 (5.7301)	Prec@1 0.000 (0.000)	Prec@5 0.000 (0.000)
Epoch: [1][490/6673]	Time 0.468 (0.434)	Data 0.353 (0.323)	Loss 5.7227 (5.7293)	Prec@1 0.000 (0.000)	Prec@5 0.000 (0.000)
Epoch: [1][500/6673]	Time 0.374 (0.434)	Data 0.269 (0.323)	Loss 5.6562 (5.7286)	Prec@1 0.000 (0.000)	Prec@5 0.000 (0.000)
Epoch: [1][510/6673]	Time 0.346 (0.433)	Data 0.239 (0.322)	Loss 5.7031 (5.7276)	Prec@1 0.000 (0.000)	Prec@5 0.000 (0.000)
Epoch: [1][520/6673]	Time 0.414 (0.433)	Data 0.301 (0.322)	Loss 5.5977 (5.7266)	Prec@1 0.000 (0.000)	Prec@5 0.000 (0.000)
Epoch: [1][530/6673]	Time 0.425 (0.433)	Data 0.301 (0.322)	Loss 5.6602 (5.7252)	Prec@1 0.000 (0.000)	Prec@5 0.000 (0.000)
Epoch: [1][540/6673]	Tim

Epoch: [1][1140/6673]	Time 0.467 (0.433)	Data 0.363 (0.322)	Loss 5.5898 (5.6785)	Prec@1 0.000 (0.000)	Prec@5 0.000 (0.000)
Epoch: [1][1150/6673]	Time 0.414 (0.433)	Data 0.302 (0.322)	Loss 5.6016 (5.6779)	Prec@1 0.000 (0.000)	Prec@5 0.000 (0.000)
Epoch: [1][1160/6673]	Time 0.450 (0.433)	Data 0.345 (0.322)	Loss 5.5039 (5.6769)	Prec@1 0.000 (0.000)	Prec@5 0.000 (0.000)
Epoch: [1][1170/6673]	Time 0.362 (0.433)	Data 0.251 (0.322)	Loss 5.5586 (5.6758)	Prec@1 0.000 (0.000)	Prec@5 0.000 (0.000)
Epoch: [1][1180/6673]	Time 0.386 (0.432)	Data 0.267 (0.321)	Loss 5.5703 (5.6753)	Prec@1 0.000 (0.000)	Prec@5 0.000 (0.000)
Epoch: [1][1190/6673]	Time 0.348 (0.432)	Data 0.242 (0.321)	Loss 5.5469 (5.6745)	Prec@1 0.000 (0.000)	Prec@5 0.000 (0.000)
Epoch: [1][1200/6673]	Time 0.342 (0.431)	Data 0.238 (0.320)	Loss 5.5156 (5.6738)	Prec@1 0.000 (0.000)	Prec@5 0.000 (0.000)
Epoch: [1][1210/6673]	Time 0.363 (0.431)	Data 0.258 (0.320)	Loss 5.5859 (5.6731)	Prec@1 0.000 (0.000)	Prec@5 0.000 (0.000)
Epoch: [1][1220/