Kaggle benchmark (#179)
* Jan07_update_benchmark

* Jan07_update_unittest

* [fix] Kaggle benchmark (#179)

* [fix] kaggle_benchmark merge master (#179)

* update version mxnet-cu100==1.6.0b20191013

* fix classifier.predict func

* fix hpo.md timed out

* fix enas default_train_fn func

* fix hpo.md timed out

* fix update_params func

* fix hpo.md timeout

* fix hpo.md mobilenet time out

* fix model_name isinstance str

* turn off classification tricks

* update classification/README.md

Co-authored-by: Hang Zhang <hzaws@amazon.com>
Sun Yue and Hang Zhang committed Jan 14, 2020
1 parent 9913f32 commit 80a00ce
Showing 26 changed files with 1,362 additions and 90 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -121,3 +121,4 @@ train/*
tutorials/checkpoint/*
tutorials/train/*
autogluon/version.py
examples/image_classification/*/
3 changes: 2 additions & 1 deletion autogluon/contrib/enas/enas_scheduler.py
@@ -117,7 +117,8 @@ def run(self):
# sample network configuration
config = self.controller.pre_sample()[0]
self.supernet.sample(**config)
self.train_fn(self.supernet, batch, **self.train_args)
# self.train_fn(self.supernet, batch, **self.train_args)
self.train_fn(epoch, self.epochs, self.supernet, batch, **self.train_args)
mx.nd.waitall()
if epoch >= self.warmup_epochs and (idx % self.update_arch_frequency) == 0:
self.train_controller()
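
The hunk above changes the ENAS scheduler to pass the current epoch and the total epoch count through to the user-supplied train function. Below is a minimal sketch of a train function compatible with the new call site; the function name and the extra keyword arguments are illustrative assumptions, not the repository's actual default_train_fn (which, per the pipeline diff further down, takes many more parameters).

import mxnet as mx
from mxnet import autograd, gluon

# Hypothetical train_fn matching the new ENAS call site:
#   self.train_fn(epoch, self.epochs, self.supernet, batch, **self.train_args)
# trainer / criterion / ctx keyword arguments are assumptions for illustration.
def simple_train_fn(epoch, num_epochs, supernet, batch,
                    trainer=None, criterion=None, batch_size=32, ctx=None, **kwargs):
    # epoch / num_epochs are now available for epoch-dependent behaviour
    # (e.g. switching tricks on or off late in training).
    ctx = ctx or [mx.cpu()]
    criterion = criterion or gluon.loss.SoftmaxCrossEntropyLoss()
    data = gluon.utils.split_and_load(batch[0], ctx_list=ctx, batch_axis=0)
    label = gluon.utils.split_and_load(batch[1], ctx_list=ctx, batch_axis=0)
    with autograd.record():
        losses = [criterion(supernet(x), y) for x, y in zip(data, label)]
    for loss in losses:
        loss.backward()
    trainer.step(batch_size)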
8 changes: 4 additions & 4 deletions autogluon/task/base/base_predictor.py
@@ -146,10 +146,10 @@ def fit_summary(self, output_directory=None, verbosity=2):
print("Information about each trial: ")
print("Trial ID: %s" % trial_id)
print(self.results['trial_info'][trial_id])

# Create plot summaries:
plot_summary_of_models(self.results, output_directory)
plot_performance_vs_trials(self.results, output_directory)
if verbosity > 3:
# Create plot summaries:
plot_summary_of_models(self.results, output_directory)
plot_performance_vs_trials(self.results, output_directory)
return self.results

def _createResults(self):
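
With this change, fit_summary generates the plot summaries only at high verbosity. A hedged usage sketch; the classifier object and output path are placeholders:

# Plot summaries (model summary, performance vs. trials) are now written only
# when verbosity > 3; at the default verbosity the call just returns results.
results = classifier.fit_summary(output_directory='ag_output', verbosity=4)
print(sorted(results.keys()))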
6 changes: 3 additions & 3 deletions autogluon/task/base/base_task.py
@@ -47,11 +47,11 @@ def run_fit(cls, train_fn, search_strategy, scheduler_options):
best_config = scheduler.get_best_config()
args = train_fn.args
args.final_fit = True
# final fit
results = scheduler.run_with_config(best_config)
total_time = time.time() - start_time
if plot_results and in_ipynb():
scheduler.get_training_curves(plot=True, use_legend=False)
if plot_results or in_ipynb():
plot_training_curves = scheduler_options['checkpoint'].replace('exp1.ag', 'plot_training_curves.png')
scheduler.get_training_curves(filename=plot_training_curves, plot=True, use_legend=False)
results.update(best_reward=best_reward, best_config=best_config,
total_time=total_time, metadata=scheduler.metadata,
training_history=scheduler.training_history,
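
Instead of only plotting inline in notebooks, run_fit now also writes the training curves to a PNG whose name is derived from the checkpoint path. A tiny sketch of that derivation, assuming the default checkpoint name 'exp1.ag' under an output directory 'checkpoint':

checkpoint = 'checkpoint/exp1.ag'
plot_training_curves = checkpoint.replace('exp1.ag', 'plot_training_curves.png')
print(plot_training_curves)  # checkpoint/plot_training_curves.png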
17 changes: 8 additions & 9 deletions autogluon/task/image_classification/classifier.py
@@ -111,13 +111,11 @@ def predict(self, X, input_size=224, plot=True):
transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])
def predict_img(img):
# load and display the image
proba = self.predict_proba(img)
ind = mx.nd.argmax(proba, axis=1).astype('int')
idx = mx.nd.stack(mx.nd.arange(proba.shape[0], ctx=proba.context),
ind.astype('float32'))
idx = mx.nd.stack(mx.nd.arange(proba.shape[0], ctx=proba.context), ind.astype('float32'))
probai = mx.nd.gather_nd(proba, idx)
return ind, probai
return ind, probai, proba
if isinstance(X, str) and os.path.isfile(X):
img = self.loader(X)
if plot:
@@ -127,12 +125,13 @@ def predict_img(img):
return predict_img(img)
if isinstance(X, AutoGluonObject):
X = X.init()
inds, probas = [], []
inds, probas, probals_all = [], [],[]
for x in X:
ind, proba = predict_img(x[0])
inds.append(ind)
probas.append(proba)
return inds, probas
ind, proba, proba_all= predict_img(x[0])
inds.append(ind.asscalar())
probas.append(proba.asnumpy())
probals_all.append(proba_all.asnumpy().flatten())
return inds, probas, probals_all

@staticmethod
def loader(path):
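
Classifier.predict now returns three parallel results: the predicted class index, the probability of that class, and the full class-probability vector (converted to Python/NumPy values when the input is a dataset). A hedged caller-side sketch; the dataset path and variable names are assumptions, and 'classifier' is assumed to come from an earlier task.fit call:

from autogluon import ImageClassification as task

test_dataset = task.Dataset('~/data/test', train=False)
inds, probas, probas_all = classifier.predict(test_dataset)
for cls_id, conf, dist in zip(inds, probas, probas_all):
    # cls_id: class index, conf: probability of that class,
    # dist: flattened probability vector over all classes
    print(int(cls_id), float(conf), dist.shape)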
89 changes: 80 additions & 9 deletions autogluon/task/image_classification/image_classification.py
@@ -1,10 +1,8 @@
import os
import copy
import logging

import mxnet as mx
from mxnet import gluon, nd

from ...core.optimizer import *
from ...core.loss import *
from ...core import *
@@ -61,7 +59,7 @@ def Dataset(*args, **kwargs):
def fit(dataset,
net=Categorical('ResNet50_v1b', 'ResNet18_v1b'),
optimizer= SGD(learning_rate=Real(1e-3, 1e-2, log=True),
wd=Real(1e-4, 1e-3, log=True)),
wd=Real(1e-4, 1e-3, log=True), multi_precision=False),
lr_scheduler='cosine',
loss=SoftmaxCrossEntropyLoss(),
split_ratio=0.8,
@@ -84,8 +82,32 @@ def fit(dataset,
num_trials=2,
dist_ip_addrs=[],
grace_period=None,
auto_search=True):
"""Fit image classification models to a given dataset.

auto_search=True,
lr_config=Dict(
lr_mode='cosine',
lr_decay=0.1,
lr_decay_period=0,
lr_decay_epoch='40,80',
warmup_lr=0.0,
warmup_epochs=0),
tricks=Dict(
last_gamma=False,#True
use_pretrained=False,#True
use_se=False,
mixup=False,
mixup_alpha=0.2,
mixup_off_epoch= 0,
label_smoothing=False,#True
no_wd=False,#True
teacher_name=None,
temperature=20.0,
hard_weight=0.5,
batch_norm=False,
use_gn=False)
):
"""
Fit image classification models to a given dataset.
Parameters
----------
@@ -160,6 +182,52 @@ def fit(dataset,
>>> num_trials = 4)
>>> test_data = task.Dataset('~/data/test', train=False)
>>> test_acc = classifier.evaluate(test_data)
Bag of tricks are used on image classification dataset
lr_config
----------
lr-mode : type=str, default='step'.
learning rate scheduler mode. options are step, poly and cosine.
lr-decay : type=float, default=0.1.
decay rate of learning rate. default is 0.1.
lr-decay-period : type=int, default=0.
interval for periodic learning rate decays. default is 0 to disable.
lr-decay-epoch : type=str, default='10,20,30'.
epochs at which learning rate decays. epochs=40, default is 10, 20, 30.
warmup-lr : type=float, default=0.0.
starting warmup learning rate. default is 0.0.
warmup-epochs : type=int, default=0.
number of warmup epochs.
tricks
----------
last-gamma', default= True.
whether to init gamma of the last BN layer in each bottleneck to 0.
use-pretrained', default= True.
enable using pretrained model from gluon.
use_se', default= False.
use SE layers or not in resnext. default is false.
mixup', default= False.
whether train the model with mix-up. default is false.
mixup-alpha', type=float, default=0.2.
beta distribution parameter for mixup sampling, default is 0.2.
mixup-off-epoch', type=int, default=0.
how many last epochs to train without mixup, default is 0.
label-smoothing', default= True.
use label smoothing or not in training. default is false.
no-wd', default= True.
whether to remove weight decay on bias, and beta/gamma for batchnorm layers.
teacher', type=str, default=None.
teacher model for distillation training
temperature', type=float, default=20.
temperature parameter for distillation teacher model
hard-weight', type=float, default=0.5.
weight for the loss of one-hot label for distillation training
batch-norm', default= True.
enable batch normalization or not in vgg. default is false.
use-gn', default= False.
whether to use group norm.
"""
checkpoint = os.path.join(output_directory, 'exp1.ag')
if auto_search:
@@ -185,7 +253,10 @@
verbose=verbose,
num_workers=nthreads_per_trial,
hybridize=hybridize,
final_fit=False)
final_fit=False,
tricks=tricks,
lr_config=lr_config
)

scheduler_options = {
'resource': {'num_cpus': nthreads_per_trial, 'num_gpus': ngpus_per_trial},
@@ -210,9 +281,9 @@
results = BaseTask.run_fit(train_image_classification, search_strategy,
scheduler_options)
args = sample_config(train_image_classification.args, results['best_config'])

model = get_network(args.net, results['num_classes'], mx.cpu(0))
update_params(model, results.pop('model_params'))
multi_precision = optimizer.kwvars['multi_precision'] if 'multi_precision' in optimizer.kwvars else False
update_params(model, results.pop('model_params'), multi_precision)
if ensemble > 1:
models = [model]
if isinstance(search_strategy, str):
@@ -225,7 +296,7 @@
for i in range(1, ensemble):
resultsi = scheduler.run_with_config(results['best_config'])
model = get_network(args.net, resultsi['num_classes'], mx.cpu(0))
update_params(model, resultsi.pop('model_params'))
update_params(model, resultsi.pop('model_params'), multi_precision)
models.append(model)
model = Ensemble(models)
return Classifier(model, results, default_val_fn, checkpoint, args)
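
fit now exposes the learning-rate schedule and the bag-of-tricks switches as Dict arguments. A hedged example of overriding them at call time, based on the defaults in the signature above; the exact Dict constructor location, the dataset path, and the chosen values are assumptions for the autogluon version of this era:

import autogluon as ag
from autogluon import ImageClassification as task

# ag.Dict may live under ag.space.Dict depending on the autogluon version.
dataset = task.Dataset('~/data/train')
classifier = task.fit(dataset,
                      epochs=40,
                      num_trials=2,
                      lr_config=ag.Dict(lr_mode='cosine',
                                        lr_decay=0.1,
                                        lr_decay_period=0,
                                        lr_decay_epoch='40,80',
                                        warmup_lr=0.0,
                                        warmup_epochs=5),
                      tricks=ag.Dict(last_gamma=True,
                                     use_pretrained=True,
                                     use_se=False,
                                     mixup=False,
                                     mixup_alpha=0.2,
                                     mixup_off_epoch=0,
                                     label_smoothing=True,
                                     no_wd=True,
                                     teacher_name=None,
                                     temperature=20.0,
                                     hard_weight=0.5,
                                     batch_norm=False,
                                     use_gn=False))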
11 changes: 6 additions & 5 deletions autogluon/task/image_classification/nets.py
@@ -90,16 +90,17 @@ def auto_suggest_network(dataset, net):
logger.info('Auto suggesting network net for dataset {}'.format(net, dataset_name))
return net

def get_network(net, num_classes, ctx):
def get_network(net, **kwargs):
if type(net) == str:
net = get_built_in_network(net, num_classes, ctx=ctx)
net = get_built_in_network(net, **kwargs)
else:
net.initialize(ctx=ctx)
net.initialize(ctx=kwargs['ctx'])
return net

def get_built_in_network(name, *args, **kwargs):
def _get_finetune_network(model_name, num_classes, ctx, *args, **kwargs):
finetune_net = get_model(model_name, *args, pretrained=True, **kwargs)
def _get_finetune_network(model_name, num_classes, ctx, **kwargs):
kwargs['pretrained'] = True
finetune_net = get_model(model_name, **kwargs)
# change the last fully connected layer to match the number of classes
with finetune_net.name_scope():
if hasattr(finetune_net, 'output'):
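
get_network now takes only the net name positionally and forwards everything else as keyword arguments (with _get_finetune_network forcing pretrained=True). A minimal hedged sketch of a call under the new signature; the import path and forwarded keyword names are assumptions:

import mxnet as mx
from autogluon.task.image_classification.nets import get_network

# Before this change: get_network(net, num_classes, ctx)
# After this change:  everything past the net name travels as **kwargs,
# so num_classes and ctx are passed by keyword.
net = get_network('ResNet18_v1b', num_classes=10, ctx=mx.cpu(0))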
100 changes: 70 additions & 30 deletions autogluon/task/image_classification/pipeline.py
@@ -1,12 +1,11 @@
import warnings
import logging

import mxnet as mx
from mxnet.gluon import nn
from mxnet import gluon, init, autograd, nd
from mxnet.gluon.data.vision import transforms
from gluoncv.model_zoo import get_model

from gluoncv.loss import DistillationSoftmaxCrossEntropyLoss
from .metrics import get_metric_instance
from ...core.optimizer import SGD, NAG
from ...core import *
@@ -15,64 +14,105 @@
from ...utils.mxutils import collect_params
from .nets import get_network
from .utils import *
from .processing_params import Sample_params, Getmodel_kwargs
from ...utils.learning_rate import LR_params

__all__ = ['train_image_classification']


lr_schedulers = {
'poly': mx.lr_scheduler.PolyScheduler,
'cosine': mx.lr_scheduler.CosineScheduler
}

@args()
def train_image_classification(args, reporter):
logging.basicConfig()
logger = logging.getLogger(__name__)
if args.verbose:
logger.setLevel(logging.INFO)
logger.info(args)
batch_size = args.batch_size * max(args.num_gpus, 1)
ctx = [mx.gpu(i) for i in range(args.num_gpus)] if args.num_gpus > 0 else [mx.cpu()]

num_classes = args.dataset.num_classes if hasattr(args.dataset, 'num_classes') else None
net = get_network(args.net, num_classes, ctx)
if args.hybridize:
net.hybridize(static_alloc=True, static_shape=True)

target_params = Sample_params(args.batch_size, args.num_gpus, args.num_workers)
batch_size = target_params.get_batchsize
ctx = target_params.get_context
classes = args.dataset.num_classes if hasattr(args.dataset, 'num_classes') else None
target_kwargs = Getmodel_kwargs(ctx,
classes,
args.net,
args.tricks.teacher_name,
args.tricks.hard_weight,
args.optimizer.multi_precision,
args.hybridize,
args.tricks.use_pretrained,
args.tricks.use_gn,
args.tricks.last_gamma,
args.tricks.batch_norm,
args.tricks.use_se)
distillation = target_kwargs.distillation
net = target_kwargs.get_net
input_size = net.input_size if hasattr(net, 'input_size') else args.input_size
train_data, val_data, batch_fn, num_batches = get_data_loader(
args.dataset, input_size, batch_size, args.num_workers, args.final_fit,
args.split_ratio)

if isinstance(args.lr_scheduler, str):
lr_scheduler = lr_schedulers[args.lr_scheduler](num_batches * args.epochs,
base_lr=args.optimizer.lr)

if args.tricks.no_wd:
for k, v in net.collect_params('.*beta|.*gamma|.*bias').items():
v.wd_mult = 0.0
if args.tricks.label_smoothing or args.tricks.mixup:
sparse_label_loss = False
else:
sparse_label_loss = True
if distillation:
teacher = target_kwargs.get_teacher
def teacher_prob(data):
teacher_prob = [nd.softmax(teacher(X.astype(target_kwargs.dtype, copy=False)) / args.tricks.temperature) \
for X in data]
return teacher_prob
L = DistillationSoftmaxCrossEntropyLoss(temperature=args.tricks.temperature,
hard_weight=args.tricks.hard_weight,
sparse_label=sparse_label_loss)
else:
lr_scheduler = args.lr_scheduler
L = gluon.loss.SoftmaxCrossEntropyLoss(sparse_label=sparse_label_loss)
teacher_prob = None
if args.tricks.mixup:
metric = get_metric_instance('rmse')
else:
metric = get_metric_instance(args.metric)

train_data, val_data, batch_fn, num_batches = \
get_data_loader(args.dataset, input_size, batch_size, args.num_workers, args.final_fit, args.split_ratio)

if isinstance(args.lr_config.lr_mode, str):
target_lr = LR_params(args.optimizer.lr, args.lr_config.lr_mode, args.epochs, num_batches,
args.lr_config.lr_decay_epoch,
args.lr_config.lr_decay ,
args.lr_config.lr_decay_period,
args.lr_config.warmup_epochs,
args.lr_config.warmup_lr)
lr_scheduler = target_lr.get_lr_scheduler
else:
lr_scheduler = args.lr_config.lr_mode
args.optimizer.lr_scheduler = lr_scheduler
trainer = gluon.Trainer(net.collect_params(), args.optimizer)

metric = get_metric_instance(args.metric)
def train(epoch):
trainer = gluon.Trainer(net.collect_params(), args.optimizer)
def train(epoch, num_epochs, metric):
for i, batch in enumerate(train_data):
default_train_fn(net, batch, batch_size, args.loss, trainer, batch_fn, ctx)
metric = default_train_fn(epoch, num_epochs, net, batch, batch_size, L, trainer,
batch_fn, ctx, args.tricks.mixup, args.tricks.label_smoothing,
distillation, args.tricks.mixup_alpha, args.tricks.mixup_off_epoch,
classes,target_kwargs.dtype, metric, teacher_prob)
mx.nd.waitall()
return metric

def test(epoch):
metric.reset()
for i, batch in enumerate(val_data):
default_val_fn(net, batch, batch_fn, metric, ctx)
default_val_fn(net, batch, batch_fn, metric, ctx, target_kwargs.dtype)
_, reward = metric.get()
reporter(epoch=epoch, classification_reward=reward)
return reward

tbar = tqdm(range(1, args.epochs + 1))
for epoch in tbar:
train(epoch)
metric = train(epoch, args.epochs, metric)
train_metric_name, train_metric_score = metric.get()
tbar.set_description('[Epoch %d] training: %s=%.3f' %(epoch, train_metric_name, train_metric_score))
if not args.final_fit:
reward = test(epoch)
tbar.set_description('[Epoch {}] Validation: {:.3f}'.format(epoch, reward))

if args.final_fit:
return {'model_params': collect_params(net),
'num_classes': num_classes}
'num_classes': classes}
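
Of the tricks wired through the pipeline above, no_wd is simple enough to show in isolation: it zeroes weight decay on bias and batch-norm beta/gamma parameters before the trainer is created. A standalone sketch; the network choice and hyperparameters are just examples:

import mxnet as mx
from mxnet import gluon
from gluoncv.model_zoo import get_model

net = get_model('ResNet18_v1b', classes=10)
net.initialize(mx.init.Xavier(), ctx=mx.cpu())

# no_wd trick: exclude bias and batch-norm beta/gamma from weight decay,
# mirroring the collect_params('.*beta|.*gamma|.*bias') loop in the diff.
for _, param in net.collect_params('.*beta|.*gamma|.*bias').items():
    param.wd_mult = 0.0

trainer = gluon.Trainer(net.collect_params(), 'sgd',
                        {'learning_rate': 0.01, 'momentum': 0.9, 'wd': 1e-4})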
