Skip to content

Commit

Permalink
Merge branch 'main' into ResetOptimizerState
Browse files Browse the repository at this point in the history
  • Loading branch information
dirkgr authored Oct 5, 2023
2 parents 54b624a + e8bd122 commit 434dc94
Show file tree
Hide file tree
Showing 2 changed files with 7 additions and 5 deletions.
10 changes: 6 additions & 4 deletions olmo/train.py
Original file line number Diff line number Diff line change
Expand Up @@ -703,13 +703,15 @@ def train_batch(self, batch: Dict[str, Any]) -> Tuple[torch.Tensor, Optional[tor

 del logits
 
-# Check for nan.
-if torch.isnan(loss):
-    raise ValueError("nan loss encountered")
-
 # Run backward pass.
 loss.backward()
 
+# Check for nan.
+if torch.isnan(ce_batch_loss):
+    raise ValueError("nan loss encountered")
+if z_batch_loss is not None and torch.isnan(z_batch_loss):
+    raise ValueError("nan loss encountered")
+
 return ce_batch_loss, z_batch_loss

def train_step(self, batch: Dict[str, Any], reduce_global_loss: bool = True) -> Dict[str, float]:
Expand Down
2 changes: 1 addition & 1 deletion scripts/v1-mix-medium-on-lumi.sh
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ export ROCM_PATH=/opt/rocm
export SINGULARITYENV_LD_LIBRARY_PATH=/usr/local/lib:/opt/cray/libfabric/1.15.2.0/lib64

# Try playing with max_split_size_mb if you run into OOM errors.
-export PYTORCH_HIP_ALLOC_CONF=max_split_size_mb:128
+#export PYTORCH_HIP_ALLOC_CONF=max_split_size_mb:128

srun \
--cpus-per-task=$SLURM_CPUS_PER_TASK \
Expand Down

0 comments on commit 434dc94

Please sign in to comment.