forked from meta-llama/llama-recipes
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
4 changed files
with
96 additions
and
29 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,12 @@
.DS_Store
__pycache__
.ipynb_checkpoints
*.pickle
profiles/
PATH/
_memory_viz.py
llama-2-13b-hf/
llama-2-7b-hf/
*.out
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,36 +1,42 @@ | ||
#!/bin/bash
# Copyright (c) Meta Platforms, Inc. and affiliates.
# This software may be used and distributed according to the terms of the GNU General Public License version 3.
#
# Slurm batch script: multi-node fine-tuning of Llama 2 via torchrun (FSDP + PEFT/LoRA).
# FIX: the shebang must be the very first line of the file to take effect; the
# original placed the copyright header above it, leaving the shebang inert.
#
# NOTE(review): reconstructed from a diff view — the SBATCH values below reflect
# the post-commit ("new") side of the diff; verify against the repository.

#SBATCH --job-name=llama2
#SBATCH --partition=train
#SBATCH --nodes=2
#SBATCH --ntasks=2
#SBATCH --gpus-per-task=8
#SBATCH --cpus-per-task=96

# Resolve the allocated hostnames into a bash array and use the first node as
# the rendezvous host. FIX: the original `nodes_array=($nodes)` expands only
# the FIRST element of `nodes` (it worked only because index 0 is all that is
# read later); copy the whole array instead.
nodes=( $(scontrol show hostnames "$SLURM_JOB_NODELIST") )
nodes_array=( "${nodes[@]}" )
head_node=${nodes_array[0]}
# Ask Slurm to run `hostname --ip-address` on the head node to get its IP.
head_node_ip=$(srun --nodes=1 --ntasks=1 -w "$head_node" hostname --ip-address)

echo "Node IP: $head_node_ip"
export LOGLEVEL=INFO

# Enable for A100 nodes with AWS EFA networking (libfabric provider + RDMA).
export FI_PROVIDER="efa"
export FI_EFA_USE_DEVICE_RDMA=1
export NCCL_ALGO=ring

# debugging flags (optional)
export NCCL_DEBUG=WARN
export NCCL_DEBUG_SUBSYS=WARN
export PYTHONFAULTHANDLER=1

# EFA and locally-built libraries must be visible to the dynamic linker.
export LD_LIBRARY_PATH=/opt/amazon/efa/lib:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=/usr/local/lib/:$LD_LIBRARY_PATH

echo "$LD_LIBRARY_PATH"
export CUDA_LAUNCH_BLOCKING=0

# on your cluster you might need these:
# set the network interface (NCCL picks the first match from this list)
export NCCL_SOCKET_IFNAME="eth0,en,eth,em,bond"

# export TORCH_NCCL_AVOID_RECORD_STREAMS=1

# srun launches one torchrun per allocated task; all ranks rendezvous via the
# c10d backend on the head node, port 29500.
srun torchrun --nproc_per_node 4 \
    --rdzv_id "$RANDOM" \
    --rdzv_backend c10d \
    --rdzv_endpoint "$head_node_ip:29500" \
    llama_finetuning.py --enable_fsdp --use_peft --peft_method lora
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters