From e46b98856395ce28fae086c6ec53b971d6e7d929 Mon Sep 17 00:00:00 2001 From: epwalsh Date: Tue, 5 Sep 2023 09:57:09 -0700 Subject: [PATCH] fix saving unsharded checkpoints --- olmo/train.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/olmo/train.py b/olmo/train.py index c518cc77d..6988a81ad 100644 --- a/olmo/train.py +++ b/olmo/train.py @@ -423,12 +423,6 @@ def save_unsharded_checkpoint(self) -> Path: latest_path.unlink(missing_ok=True) latest_path.symlink_to(checkpoint_dir.name, target_is_directory=True) - # In the cases where we're using a shared NFS drive between ranks to save checkpoints, - # replacing the temp directory with the final directory from rank 0 might not be immediately - # realized in the file systems of the other ranks. - # So we wait here across all ranks until that final checkpoint directory is visible. - wait_on(lambda: checkpoint_dir.exists(), "Waiting for checkpoint directory", timeout=10.0) - # Remove old checkpoints. if self.cfg.save_num_unsharded_checkpoints_to_keep > 0: while len(self.unsharded_checkpoints) > self.cfg.save_num_unsharded_checkpoints_to_keep: