Skip to content

Commit

Permalink
fix saving unsharded checkpoints
Browse files: browse the repository at this point in the history.
  • Loading branch information
epwalsh committed Sep 5, 2023
1 parent 5fff93a commit e46b988
Showing 1 changed file with 0 additions and 6 deletions.
6 changes: 0 additions & 6 deletions olmo/train.py
Original file line number | Diff line number | Diff line change
Expand Up
@@ -423,12 +423,6 @@ def save_unsharded_checkpoint(self) -> Path:
latest_path.unlink(missing_ok=True)
latest_path.symlink_to(checkpoint_dir.name, target_is_directory=True)

# In the cases where we're using a shared NFS drive between ranks to save checkpoints,
# replacing the temp directory with the final directory from rank 0 might not be immediately
# realized in the file systems of the other ranks.
# So we wait here across all ranks until that final checkpoint directory is visible.
wait_on(lambda: checkpoint_dir.exists(), "Waiting for checkpoint directory", timeout=10.0)

# Remove old checkpoints.
if self.cfg.save_num_unsharded_checkpoints_to_keep > 0:
while len(self.unsharded_checkpoints) > self.cfg.save_num_unsharded_checkpoints_to_keep:
Expand Down

0 comments on commit e46b988

Please sign in to comment.