Skip to content

Commit

Permalink
add configuration option
Browse files (browse the repository at this point in the history)
  • Loading branch information
epwalsh committed Nov 6, 2023
1 parent a1c32e9 commit 4ed81c6
Show file tree
Hide file tree
Showing 2 changed files with 9 additions and 1 deletion.
7 changes: 7 additions & 0 deletions olmo/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -878,6 +878,13 @@ class TrainConfig(BaseConfig):
to write out a final checkpoint.
"""

extra_steps_after_cancel: int = 10
"""
Under certain conditions, when a run is canceled, we train for a few extra steps after saving
the final checkpoint so that, when the run is restarted from the latest checkpoint, we have some
overlap in metrics.
"""

early_stopping_factor: Optional[float] = None

save_data_indices: bool = True
Expand Down
3 changes: 2 additions & 1 deletion olmo/train.py
Original file line number Diff line number Diff line change
Expand Up @@ -667,7 +667,7 @@ def check_if_cancelled(self) -> Tuple[bool, int]:
# First check if we've reached the training time limit.
should_cancel = True
cancel_reason = "time limit reached"
- extra_steps = 10 # train for 10 extra steps so we get an overlap in metrics when we restart
+ extra_steps = self.cfg.extra_steps_after_cancel
elif (
self.cfg.early_stopping_factor is not None
and self.global_step > self.cfg.scheduler.t_warmup
Expand All @@ -688,6 +688,7 @@ def check_if_cancelled(self) -> Tuple[bool, int]:
if tag.lower() in {"cancel", "canceled", "cancelled"}:
should_cancel = True
cancel_reason = "Weights & Biases tag"
extra_steps = self.cfg.extra_steps_after_cancel
break
except RequestException:
pass
Expand Down

0 comments on commit 4ed81c6

Please sign in to comment.