New callback interface for training visualization.
Neural style transfer notebook as an example of (refactored) live notebook charting.

Trivial notebook that runs train_cifar10 is working.

Notebook structure in place for live charting.
Needs module callback change.

Module callbacks almost working; epoch numbers are NaN.

Module / CNN example for notebook charts.

CIFAR10 training notebook for module works again.

Matching callback code to notebook.

Add new notebook

Small fix

Change notebook name

Clean merge

Small bug fix

Import cleaning

Update with master

Point submodule to newest

Clean import statement for notebook

Fix callback function error

Separate doc

Fix pylint
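
The message above describes the point of the commit: route per-batch training metrics through module callbacks so a notebook can chart progress live. As a rough sketch of that idea (hypothetical code, not from this commit; `LiveChart` is an invented name), a batch-end callback only needs to be a callable that accepts MXNet's `BatchEndParam`:

class LiveChart(object):
    # Collects (epoch, batch, metric name, metric value) points that a
    # notebook cell can re-plot after every batch.
    def __init__(self):
        self.points = []

    def __call__(self, param):
        # MXNet invokes batch-end callbacks with a BatchEndParam namedtuple
        # carrying at least epoch, nbatch, and eval_metric.
        if param.eval_metric is not None:
            name, value = param.eval_metric.get()  # running metric so far this epoch
            self.points.append((param.epoch, param.nbatch, name, value))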
Leo Dirac authored and Wang committed Nov 22, 2016
1 parent 2cf3b74 commit 9cd7193
Showing 16 changed files with 2,148 additions and 588 deletions.
31 changes: 31 additions & 0 deletions example/image-classification/symbol_lenet.py
@@ -0,0 +1,31 @@
"""
LeCun, Yann, Leon Bottou, Yoshua Bengio, and Patrick
Haffner. "Gradient-based learning applied to document recognition."
Proceedings of the IEEE (1998)
"""
import mxnet as mx

def get_symbol(num_classes = 1000, add_stn=False):
    data = mx.symbol.Variable('data')
    if(add_stn):
        data = mx.sym.SpatialTransformer(data=data, loc=get_loc(data), target_shape = (28,28),
                                         transform_type="affine", sampler_type="bilinear")
    # first conv
    conv1 = mx.symbol.Convolution(data=data, kernel=(5,5), num_filter=20)
    tanh1 = mx.symbol.Activation(data=conv1, act_type="tanh")
    pool1 = mx.symbol.Pooling(data=tanh1, pool_type="max",
                              kernel=(2,2), stride=(2,2))
    # second conv
    conv2 = mx.symbol.Convolution(data=pool1, kernel=(5,5), num_filter=50)
    tanh2 = mx.symbol.Activation(data=conv2, act_type="tanh")
    pool2 = mx.symbol.Pooling(data=tanh2, pool_type="max",
                              kernel=(2,2), stride=(2,2))
    # first fullc
    flatten = mx.symbol.Flatten(data=pool2)
    fc1 = mx.symbol.FullyConnected(data=flatten, num_hidden=500)
    tanh3 = mx.symbol.Activation(data=fc1, act_type="tanh")
    # second fullc
    fc2 = mx.symbol.FullyConnected(data=tanh3, num_hidden=num_classes)
    # loss
    lenet = mx.symbol.SoftmaxOutput(data=fc2, name='softmax')
    return lenet
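
Note that `get_symbol` calls `get_loc` on the spatial-transformer path, but this file does not define it, so only the default `add_stn=False` path is self-contained. A minimal localization-network sketch (hypothetical, not part of this diff; an affine STN localization net just has to end in a 6-unit fully connected layer emitting the transform parameters):

def get_loc(data):
    # Sketch: a small conv net that regresses the 6 affine transform parameters.
    loc = mx.symbol.Convolution(data=data, num_filter=30, kernel=(5, 5), stride=(2, 2))
    loc = mx.symbol.Activation(data=loc, act_type="relu")
    loc = mx.symbol.Pooling(data=loc, kernel=(2, 2), stride=(2, 2), pool_type="max")
    loc = mx.symbol.Flatten(data=loc)
    loc = mx.symbol.FullyConnected(data=loc, num_hidden=6, name="stn_loc")
    return loc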
593 changes: 593 additions & 0 deletions example/module/train_cifar10.ipynb

Large diffs are not rendered by default.

237 changes: 133 additions & 104 deletions example/module/train_cifar10.py
@@ -1,3 +1,6 @@
"""Train CIFAR-10 classifier in MXNet.
Demonstrates using the Module class.
"""
import logging
import os, sys
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "image-classification")))
@@ -6,43 +9,46 @@
import mxnet as mx
import argparse
import train_model
import importlib
import platform


def command_line_args(defaults=False):
    parser = argparse.ArgumentParser(description=__doc__,
                                     formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument('--network', type=str, default='inception-bn-28-small',
                        help = 'which CNN style to use')
    my_dir = os.path.dirname(__file__)
    default_data_dir = os.path.abspath(os.path.join(my_dir, '..', 'image-classification', 'cifar10')) + '/'
    parser.add_argument('--data-dir', type=str, default=default_data_dir,
                        help='the input data directory')
    parser.add_argument('--gpus', type=str,
                        help='the gpus will be used, e.g "0,1,2,3"')
    parser.add_argument('--num-examples', type=int, default=60000,
                        help='the number of training examples')
    parser.add_argument('--batch-size', type=int, default=128,
                        help='the batch size')
    parser.add_argument('--lr', type=float, default=.05,
                        help='the initial learning rate')
    parser.add_argument('--lr-factor', type=float, default=1,
                        help='times the lr with a factor for every lr-factor-epoch epoch')
    parser.add_argument('--lr-factor-epoch', type=float, default=1,
                        help='the number of epoch to factor the lr, could be .5')
    parser.add_argument('--model-prefix', type=str,
                        help='the prefix of the model to load')
    parser.add_argument('--save-model-prefix', type=str,
                        help='the prefix of the model to save')
    parser.add_argument('--num-epochs', type=int, default=20,
                        help='the number of training epochs')
    parser.add_argument('--load-epoch', type=int,
                        help="load the model on an epoch using the model-prefix")
    parser.add_argument('--kv-store', type=str, default='local',
                        help='the kvstore type')
    if defaults:
        return parser.parse_args([])
    else:
        return parser.parse_args()

my_dir = os.path.dirname(__file__)
default_data_dir = os.path.abspath(os.path.join(my_dir, '..', 'image-classification', 'cifar10')) + '/'

parser = argparse.ArgumentParser(description='train an image classifer on cifar10')
parser.add_argument('--network', type=str, default='inception-bn-28-small',
                    help = 'the cnn to use')
parser.add_argument('--data-dir', type=str, default=default_data_dir,
                    help='the input data directory')
parser.add_argument('--gpus', type=str,
                    help='the gpus will be used, e.g "0,1,2,3"')
parser.add_argument('--num-examples', type=int, default=60000,
                    help='the number of training examples')
parser.add_argument('--batch-size', type=int, default=128,
                    help='the batch size')
parser.add_argument('--lr', type=float, default=.05,
                    help='the initial learning rate')
parser.add_argument('--lr-factor', type=float, default=1,
                    help='times the lr with a factor for every lr-factor-epoch epoch')
parser.add_argument('--lr-factor-epoch', type=float, default=1,
                    help='the number of epoch to factor the lr, could be .5')
parser.add_argument('--model-prefix', type=str,
                    help='the prefix of the model to load')
parser.add_argument('--save-model-prefix', type=str,
                    help='the prefix of the model to save')
parser.add_argument('--num-epochs', type=int, default=20,
                    help='the number of training epochs')
parser.add_argument('--load-epoch', type=int,
                    help="load the model on an epoch using the model-prefix")
parser.add_argument('--kv-store', type=str, default='local',
                    help='the kvstore type')
args = parser.parse_args()

if args.model_prefix is not None:
    args.model_prefix = os.path.abspath(os.path.join(my_dir, args.model_prefix))
if args.save_model_prefix is not None:
    args.save_model_prefix = os.path.abspath(os.path.join(my_dir, args.save_model_prefix))

# download data if necessary
def _download(data_dir):
@@ -66,10 +72,6 @@ def _download(data_dir):
        os.rmdir(os.path.join(dirname, "cifar"))
    os.chdir(cwd)

# network
import importlib
net = importlib.import_module('symbol_' + args.network).get_symbol(10)

# data
def get_iterator(args, kv):
    data_shape = (3, 28, 28)
@@ -102,68 +104,95 @@ def get_iterator(args, kv):
    return (train, val)


################################################################################
# train
################################################################################

# kvstore
kv = mx.kvstore.create(args.kv_store)
def do_train(args, callback_args=None):
    # network
    net = importlib.import_module('symbol_' + args.network).get_symbol(10)

    my_dir = os.path.dirname(__file__)
    if args.model_prefix is not None:
        args.model_prefix = os.path.abspath(os.path.join(my_dir, args.model_prefix))
    if args.save_model_prefix is not None:
        args.save_model_prefix = os.path.abspath(os.path.join(my_dir, args.save_model_prefix))


    ################################################################################
    # train
    ################################################################################

    # kvstore
    kv = mx.kvstore.create(args.kv_store)

    # logging
    head = '%(asctime)-15s Node[' + str(kv.rank) + '] %(message)s'
    logging.basicConfig(level=logging.DEBUG, format=head)
    logging.info('start with arguments %s', args)

    logging.info('running on %s', platform.node())

    (train, val) = get_iterator(args, kv)

    if args.gpus is None or args.gpus == '':
        devs = mx.cpu()
    elif type(args.gpus) == str:
        devs = [mx.gpu(int(i)) for i in args.gpus.split(',')]
    else:
        devs = mx.gpu(int(args.gpus))
    logging.info('Starting with devices %s', devs)

    mod = mx.mod.Module(net, context=devs)

    # load model
    model_prefix = args.model_prefix

    if args.load_epoch is not None:
        assert model_prefix is not None
        logging.info('loading model from %s-%d...' % (model_prefix, args.load_epoch))
        sym, arg_params, aux_params = mx.model.load_checkpoint(model_prefix, args.load_epoch)
    else:
        arg_params = None
        aux_params = None

    # save model
    save_model_prefix = args.save_model_prefix
    if save_model_prefix is None:
        save_model_prefix = model_prefix
    checkpoint = None if save_model_prefix is None else mx.callback.do_checkpoint(save_model_prefix)

    # batches per epoch; the FactorScheduler below needs this
    epoch_size = args.num_examples / args.batch_size

    optim_args = {'learning_rate': args.lr, 'wd': 0.00001, 'momentum': 0.9}
    if 'lr_factor' in args and args.lr_factor < 1:
        optim_args['lr_scheduler'] = mx.lr_scheduler.FactorScheduler(
            step = max(int(epoch_size * args.lr_factor_epoch), 1),
            factor = args.lr_factor)

    if 'clip_gradient' in args and args.clip_gradient is not None:
        optim_args['clip_gradient'] = args.clip_gradient

    eval_metrics = ['accuracy']
    ## TopKAccuracy only allows top_k > 1
    for top_k in [5, 10, 20]:
        eval_metrics.append(mx.metric.create('top_k_accuracy', top_k = top_k))

    if args.load_epoch:
        begin_epoch = args.load_epoch+1
    else:
        begin_epoch = 0

    if not callback_args:
        callback_args = {
            'batch_end_callback': mx.callback.Speedometer(args.batch_size, 50),
            'epoch_end_callback': checkpoint,
        }
    else:
        pass
        #TODO: add checkpoint back in

    logging.info('start training for %d epochs...', args.num_epochs)
    mod.fit(train, eval_data=val, optimizer_params=optim_args,
            eval_metric=eval_metrics, num_epoch=args.num_epochs,
            arg_params=arg_params, aux_params=aux_params, begin_epoch=begin_epoch,
            **callback_args)

if __name__ == "__main__":
    args = command_line_args()
    do_train(args)

# logging
head = '%(asctime)-15s Node[' + str(kv.rank) + '] %(message)s'
logging.basicConfig(level=logging.DEBUG, format=head)
logging.info('start with arguments %s', args)

import platform
logging.info('running on %s', platform.node())

(train, val) = get_iterator(args, kv)

devs = mx.cpu() if (args.gpus is None or args.gpus == '') else [
    mx.gpu(int(i)) for i in args.gpus.split(',')]
logging.info('Starting with devices %s', devs)

mod = mx.mod.Module(net, context=devs)

# load model
model_prefix = args.model_prefix

if args.load_epoch is not None:
    assert model_prefix is not None
    logging.info('loading model from %s-%d...' % (model_prefix, args.load_epoch))
    sym, arg_params, aux_params = mx.model.load_checkpoint(model_prefix, args.load_epoch)
else:
    arg_params = None
    aux_params = None

# save model
save_model_prefix = args.save_model_prefix
if save_model_prefix is None:
    save_model_prefix = model_prefix
checkpoint = None if save_model_prefix is None else mx.callback.do_checkpoint(save_model_prefix)

optim_args = {'learning_rate': args.lr, 'wd': 0.00001, 'momentum': 0.9}
if 'lr_factor' in args and args.lr_factor < 1:
    optim_args['lr_scheduler'] = mx.lr_scheduler.FactorScheduler(
        step = max(int(epoch_size * args.lr_factor_epoch), 1),
        factor = args.lr_factor)

if 'clip_gradient' in args and args.clip_gradient is not None:
    optim_args['clip_gradient'] = args.clip_gradient

eval_metrics = ['accuracy']
## TopKAccuracy only allows top_k > 1
for top_k in [5, 10, 20]:
    eval_metrics.append(mx.metric.create('top_k_accuracy', top_k = top_k))

if args.load_epoch:
    begin_epoch = args.load_epoch+1
else:
    begin_epoch = 0

logging.info('start training for %d epochs...', args.num_epochs)
mod.fit(train, eval_data=val, optimizer_params=optim_args,
        eval_metric=eval_metrics, num_epoch=args.num_epochs,
        arg_params=arg_params, aux_params=aux_params, begin_epoch=begin_epoch,
        batch_end_callback=mx.callback.Speedometer(args.batch_size, 50),
        epoch_end_callback=checkpoint)
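
With this refactor, a notebook can build a default configuration via `command_line_args(defaults=True)` (which parses an empty argument list instead of `sys.argv`), override fields, and inject its own callbacks through `callback_args`. A hypothetical driver cell, reusing the `LiveChart` sketch from above (note the TODO in `do_train`: custom `callback_args` currently bypass checkpointing):

import train_cifar10

args = train_cifar10.command_line_args(defaults=True)
args.num_epochs = 2                      # override any defaults before training
chart = LiveChart()                      # hypothetical callback sketched earlier
train_cifar10.do_train(args, callback_args={'batch_end_callback': chart})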
2 changes: 1 addition & 1 deletion example/neural-style/README.md
@@ -8,7 +8,7 @@ A. Gatys, Alexander S. Ecker, and Matthias Bethge.

First use `download.sh` to download pre-trained model and sample inputs

Then run `python run.py`, use `-h` to see more options
Then run `python nstyle.py`, use `-h` to see more options

## Sample results

