Skip to content

Commit

Permalink
Group tensorboard metrics (#39)
Browse files: browse the repository at this point in the history
* Training groupings

* validation grouping

* steps vs samples

* iteration time (speed -> samples or iterations per second)

* tensorboard group time (from `log_timers_to_tensorboard`)

* comment on the writing condition

* Update megatron/global_vars.py

Co-authored-by: Stas Bekman <stas00@users.noreply.github.com>

* Update megatron/training.py

Co-authored-by: Stas Bekman <stas00@users.noreply.github.com>

* Update megatron/training.py

Co-authored-by: Stas Bekman <stas00@users.noreply.github.com>

* Update megatron/training.py

Co-authored-by: Stas Bekman <stas00@users.noreply.github.com>

* Update megatron/training.py

Co-authored-by: Stas Bekman <stas00@users.noreply.github.com>

* link to the bug-fix issue on the Megatron-LM side

Co-authored-by: Stas Bekman <stas00@users.noreply.github.com>
  • Loading branch information
VictorSanh and stas00 committed Aug 5, 2021
1 parent 752e958 commit 9e75429
Show file tree
Hide file tree
Showing 2 changed files with 29 additions and 23 deletions.
2 changes: 1 addition & 1 deletion megatron/global_vars.py
Original file line number Diff line number Diff line change
Expand Up @@ -244,7 +244,7 @@ def write(self, names, writer, iteration, normalizer=1.0, reset=False):
assert normalizer > 0.0
for name in names:
value = self.timers[name].elapsed(reset=reset) / normalizer
writer.add_scalar(name + '-time', value, iteration)
writer.add_scalar(f'time/{name}-time', value, iteration)

def log(self, names, normalizer=1.0, reset=True):
"""Log a group of timers."""
Expand Down
50 changes: 28 additions & 22 deletions megatron/training.py
Original file line number Diff line number Diff line change
Expand Up @@ -547,33 +547,35 @@ def add_to_logging(name):
# Tensorboard values.
if writer and (iteration % args.tensorboard_log_interval == 0 ) and \
is_last_rank():
writer.add_scalar('steps-vs-samples/y=steps,x=samples', iteration, args.consumed_train_samples)
writer.add_scalar('steps-vs-samples/y=samples,x=steps', args.consumed_train_samples, iteration)
if args.log_learning_rate_to_tensorboard:
writer.add_scalar('learning-rate', learning_rate, iteration)
writer.add_scalar('learning-rate vs samples', learning_rate,
writer.add_scalar('learning-rate/learning-rate', learning_rate, iteration)
writer.add_scalar('learning-rate/learning-rate vs samples', learning_rate,
args.consumed_train_samples)
if args.log_batch_size_to_tensorboard:
writer.add_scalar('batch-size', batch_size, iteration)
writer.add_scalar('batch-size vs samples', batch_size,
writer.add_scalar('batch-size/batch-size', batch_size, iteration)
writer.add_scalar('batch-size/batch-size vs samples', batch_size,
args.consumed_train_samples)
for key in loss_dict:
writer.add_scalar(key , loss_dict[key], iteration)
writer.add_scalar(key + ' vs samples', loss_dict[key],
writer.add_scalar(f"lm-loss-training/{key}", loss_dict[key], iteration)
writer.add_scalar(f"lm-loss-training/{key}" + ' vs samples', loss_dict[key],
args.consumed_train_samples)
if args.log_loss_scale_to_tensorboard:
writer.add_scalar('loss-scale', loss_scale, iteration)
writer.add_scalar('loss-scale vs samples', loss_scale,
writer.add_scalar('loss-scale/loss-scale', loss_scale, iteration)
writer.add_scalar('loss-scale/loss-scale vs samples', loss_scale,
args.consumed_train_samples)
if grad_norm is not None:
writer.add_scalar('grad-norm', grad_norm, iteration)
writer.add_scalar('grad-norm vs samples', grad_norm,
writer.add_scalar('grad-norm/grad-norm', grad_norm, iteration)
writer.add_scalar('grad-norm/grad-norm vs samples', grad_norm,
args.consumed_train_samples)
if num_zeros_in_grad is not None:
writer.add_scalar('num-zeros', num_zeros_in_grad, iteration)
writer.add_scalar('num-zeros vs samples', num_zeros_in_grad,
writer.add_scalar('num-zeros/num-zeros', num_zeros_in_grad, iteration)
writer.add_scalar('num-zeros/num-zeros vs samples', num_zeros_in_grad,
args.consumed_train_samples)
if params_norm is not None:
writer.add_scalar('params-norm', params_norm, iteration)
writer.add_scalar('params-norm vs samples', params_norm,
writer.add_scalar('params-norm/params-norm', params_norm, iteration)
writer.add_scalar('params-norm/params-norm vs samples', params_norm,
args.consumed_train_samples)
if args.log_timers_to_tensorboard:
timers.write(timers_to_log, writer, iteration,
Expand All @@ -582,10 +584,14 @@ def add_to_logging(name):
if iteration % args.log_interval == 0:
elapsed_time = timers('interval-time').elapsed()
elapsed_time_per_iteration = elapsed_time / total_iterations
if writer and torch.distributed.get_rank() == 0:
# Changed `torch.distributed.get_rank() == 0` to `is_last_rank()` as a writing condition.
# Bug fix as detailed in this issue: https://github.com/NVIDIA/Megatron-LM/issues/129
if writer and is_last_rank():
if args.log_timers_to_tensorboard:
writer.add_scalar('iteration-time',
writer.add_scalar('iteration-time/iteration-time (s)',
elapsed_time_per_iteration, iteration)
writer.add_scalar('iteration-time/iteration-time (s) vs samples',
elapsed_time_per_iteration, args.consumed_train_samples)
log_string = ' iteration {:8d}/{:8d} |'.format(
iteration, args.train_iters)
log_string += ' consumed samples: {:12d} |'.format(
Expand Down Expand Up @@ -772,7 +778,7 @@ def evaluate(forward_step_func, data_iterator, model, verbose=False):
forward_backward_func = forward_backward_pipelining_without_interleaving
else:
forward_backward_func = forward_backward_no_pipelining

if args.deepspeed:
# DeepSpeed uses eval_batch() and already aggregates losses.
assert isinstance(model, list) and len(model) == 1
Expand All @@ -782,7 +788,7 @@ def evaluate(forward_step_func, data_iterator, model, verbose=False):
loss_dicts = forward_backward_func(
forward_step_func, data_iterator, model, optimizer=None,
timers=None, forward_only=True)

if mpu.is_pipeline_last_stage(ignore_virtual=True):
# Reduce across processes.
for loss_dict in loss_dicts:
Expand Down Expand Up @@ -816,16 +822,16 @@ def evaluate_and_print_results(prefix, forward_step_func,
ppl = math.exp(min(20, total_loss_dict[key].item()))
string += '{} PPL: {:.6E} | '.format(key, ppl)
if writer and is_last_rank():
writer.add_scalar('{} validation'.format(key),
writer.add_scalar(f'lm-loss-validation/{key} validation',
total_loss_dict[key].item(),
iteration)
writer.add_scalar('{} validation vs samples'.format(key),
writer.add_scalar(f'lm-loss-validation/{key} validation vs samples',
total_loss_dict[key].item(),
args.consumed_train_samples)
if args.log_validation_ppl_to_tensorboard:
writer.add_scalar('{} validation ppl'.format(key), ppl,
writer.add_scalar(f'lm-loss-validation/{key} validation ppl', ppl,
iteration)
writer.add_scalar('{} validation ppl vs samples'.format(key),
writer.add_scalar(f'lm-loss-validation/{key} validation ppl vs samples',
ppl, args.consumed_train_samples)

length = len(string) + 1
Expand Down

0 comments on commit 9e75429

Please sign in to comment.