Skip to content

Commit

Permalink
handle race conditions when saving to NFS on cirrascale (#255)
Browse files (browse the repository at this point in the history)
  • Loading branch information
epwalsh authored Sep 7, 2023
1 parent b4a1491 commit 6b977d0
Showing 1 changed file with 16 additions and 2 deletions.
18 changes: 16 additions & 2 deletions olmo/train.py
Original file line number Diff line number Diff line change
Expand Up @@ -261,12 +261,26 @@ def save_sharded_checkpoint(self) -> Path:

if get_fs_local_rank() == 0:
# Replace temp directory with target checkpoint directory.
checkpoint_dir_tmp.replace(checkpoint_dir)
try:
checkpoint_dir_tmp.replace(checkpoint_dir)
except FileNotFoundError:
# Caught when another (file-system) local rank 0 has already replaced the tmp directory.
# This can happen when nodes are saving to a common NFS drive but otherwise have distinct
# file-systems.
if not checkpoint_dir.exists():
raise

# Link to 'latest'.
latest_path = Path(self.cfg.save_folder) / "latest"
latest_path.unlink(missing_ok=True)
latest_path.symlink_to(checkpoint_dir.name, target_is_directory=True)
try:
latest_path.symlink_to(checkpoint_dir.name, target_is_directory=True)
except FileExistsError:
# Same as above, caught when another (file-system) local rank 0 has already made the 'latest' symlink.
# This can happen when nodes are saving to a common NFS drive but otherwise have distinct
# file-systems.
if latest_path.resolve().name != checkpoint_dir.name:
raise

# In the cases where we're using a shared NFS drive between ranks to save checkpoints,
# replacing the temp directory with the final directory from rank 0 might not be immediately
Expand Down

0 comments on commit 6b977d0

Please sign in to comment.