Skip to content

Commit

Permalink
fixed training resume
Browse files (browse the repository at this point in the history)
  • Loading branch information
artyom committed Sep 21, 2019
1 parent e45de18 commit c58a314
Show file tree
Hide file tree
Showing 2 changed files with 18 additions and 9 deletions.
25 changes: 17 additions & 8 deletions models/official/detection/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,14 +153,23 @@ def terminate_eval():
elif FLAGS.mode == 'train_and_eval':
save_config(params, params.model_dir)
executor.prepare_evaluation()
num_cycles = int(params.train.total_steps / params.eval.num_steps_per_eval)

# FIXME: this doesn't work with resuming
for cycle in range(num_cycles):
tf.logging.info('Start training cycle %d.' % cycle)
current_cycle_last_train_step = ((cycle + 1)
* params.eval.num_steps_per_eval)
executor.train(train_input_fn, current_cycle_last_train_step)

last_save = tf.train.latest_checkpoint(params.model_dir)
base_step = 0

if last_save is not None:
base_step = int(last_save.split('-')[-1])

# num_cycles = int(params.train.total_steps / params.eval.num_steps_per_eval)
# for cycle in range(num_cycles):

for step in range(base_step, params.train.total_steps, params.eval.num_steps_per_eval):
cycle = step // params.eval.num_steps_per_eval
tf.logging.info('Starting training from cycle %d (step %d).' % (cycle, step))

# current_cycle_last_train_step = ((cycle + 1)
# * params.eval.num_steps_per_eval)
executor.train(train_input_fn, step + params.eval.num_steps_per_eval)
executor.evaluate(
eval_input_fn,
params.eval.eval_samples // params.predict.predict_batch_size)
Expand Down
2 changes: 1 addition & 1 deletion scripts/howto.txt
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ chkp.print_tensors_in_checkpoint_file("/tmp/model.ckpt", tensor_name='', all_ten

Monitor training:

clear ; cat training.log | grep -oE '(.AP50.:[ .0-9]+|Restoring.*)' | uniq
cat training.log | grep -oE '(.AP50.:[ .0-9]+|Restoring.*)' | uniq

tensorboard --logdir gs://ap_tpu_storage/saved/1.0.0

Expand Down

0 comments on commit c58a314

Please sign in to comment.