# Training
1. Python
2. Torchrun
3. Accelerate
4. DeepSpeed
5. DeepSpeed + Torch.Distributed.Run
6. Srun + DeepSpeed + Torch.Distributed.Run
7. SLURM batch script

```
***** train metrics *****
  epoch                    =        3.0
  train_loss               =      0.948
  train_runtime            = 1:25:14.22
  train_samples_per_second =      2.933
  train_steps_per_second   =      0.092
Figure saved: saves/LLaMA2-7B-Chat/lora/04_sft18/training_loss.png
01/03/2024 12:40:35 - WARNING - llmtuner.extras.ploting - No metric eval_loss to plot.
```

## 初始環境設定

In [None]:
# Initial environment setup: make user-local and site-specific executables
# visible to the shell commands launched from this notebook.
import os
from pathlib import Path

HOME = str(Path.home())
# Extra executable directories, kept as a ':'-joined string.
Add_Binarry_Path = HOME + '/.local/bin:/usr/ubuntu_bin'

# Append only the entries that are not on PATH yet, so re-running this cell
# does not keep growing PATH with duplicate entries.
_path_entries = os.environ.get('PATH', '').split(':')
for _extra_dir in Add_Binarry_Path.split(':'):
    if _extra_dir not in _path_entries:
        _path_entries.append(_extra_dir)
os.environ['PATH'] = ':'.join(_path_entries)

# Working directory of the kernel, obtained without spawning a shell.
current_foldr = os.getcwd()
current_foldr

## 套件

In [None]:
# Install the notebook's dependencies.
# `%pip` is used instead of `!pip` so the packages are installed for the
# interpreter that runs this kernel, and the extras spec "black[jupyter]" is
# quoted so the shell cannot treat the brackets as a glob pattern.
%pip install -q cohere gdown kaleido langchain openai pyngrok pypdf python-dotenv sentence-transformers tiktoken
%pip install -q accelerate bitsandbytes hf_transfer huggingface_hub optimum transformers==4.36.2
%pip install -q appdirs black "black[jupyter]" datasets fire loralib sentencepiece gradio==3.48.0
%pip install -q fastapi jieba matplotlib nltk peft==0.7.0 protobuf pydantic rouge-chinese scipy sse-starlette trl==0.7.6 uvicorn
%pip install -q deepspeed

## Method01 Python (singularity/notebook)
- singularity: change kernel to Python3 (ipykernel)
- notebook: change kernel to pytorch_2.1.2-cuda11.8-cudnn8-devel_opencv

In [None]:
%%bash
## Method01 Python (singularity/notebook): single-process LoRA SFT run on GPU 0.
## NOTE: GPUS_PER_NODE, MASTER_ADDR and MASTER_PORT are exported here but are not
## referenced by the plain `python` launch below.
export GPUS_PER_NODE=1
export MASTER_ADDR=$(hostname -s)
export MASTER_PORT=$(python -c "import socket; s = socket.socket(socket.AF_INET, socket.SOCK_STREAM); s.bind(('', 0)); addr = s.getsockname(); s.close(); print(addr[1])")
export MODEL_ID="/work/u00cjz00/slurm_jobs/github/models/Llama-2-7b-chat-hf"
export DATASET="alpaca_gpt4_zh"
export OUTPUT_DIR="saves/LLaMA2-7B-Chat/lora/01_sft"
export MAX_SAMPLE=5000
## The launch below cd's into this checkout, so OUTPUT_DIR is relative to it.
export PROJECT_DIR="/work/g00cjz00/github/LLaMA-Factory"

## CLEAN UP
## Kill only this user's leftover train_bash.py processes (not every process
## whose command line happens to contain "train").
pkill -9 -u "$(id -u)" -f 'train_bash\.py'
## Remove the previous output where training actually writes it; `:?` aborts
## instead of deleting an unintended path if either variable is empty.
rm -rf -- "${PROJECT_DIR:?}/${OUTPUT_DIR:?}"

## RUN (change kernel to Python3 or your Image kernel)
## Image kernel : comment out the singularity line below.
## Python kernel: keep the singularity line below active.
/work/opt/ohpc/Taiwania3/libs/singularity/3.10.2/bin/singularity run --nv --cleanenv -B /work -B /work/g00cjz00/libraryFolder/S_work-genai11_pytorch_2.1.2-cuda11.8-cudnn8-devel_opencv/local/lib:/home/g00cjz00/.local/lib -B /work/g00cjz00/libraryFolder/S_work-genai11_pytorch_2.1.2-cuda11.8-cudnn8-devel_opencv/local/bin:/home/g00cjz00/.local/bin -B /work/u00cjz00/os/ubuntu/bin:/usr/ubuntu_bin /work/u00cjz00/nvidia/pytorch_2.1.2-cuda11.8-cudnn8-devel_opencv.sif \
bash -c "export PATH=\$PATH:\$HOME/.local/bin; \
    cd ${PROJECT_DIR}; \
    CUDA_VISIBLE_DEVICES=0 python \
    src/train_bash.py \
    --stage sft \
    --do_train \
    --model_name_or_path ${MODEL_ID} \
    --dataset ${DATASET} \
    --template default \
    --finetuning_type lora \
    --lora_target q_proj,v_proj \
    --overwrite_cache \
    --per_device_train_batch_size 4 \
    --gradient_accumulation_steps 4 \
    --lr_scheduler_type cosine \
    --logging_steps 10 \
    --save_steps 1000 \
    --learning_rate 5e-5 \
    --num_train_epochs 3.0 \
    --plot_loss \
    --fp16 \
    --max_samples ${MAX_SAMPLE} \
    --output_dir ${OUTPUT_DIR}
"

## Method2 Torch.Run (singularity/notebook)

In [None]:
%%bash
## Method2 Torch.Run (singularity/notebook): single-node multi-GPU run via torchrun.
## NOTE: MASTER_ADDR and SLURM_PROCID are exported here but are not passed to
## the torchrun command below.
export GPUS_PER_NODE=8
export MASTER_ADDR=$(hostname -s)
export MASTER_PORT=$(python -c "import socket; s = socket.socket(socket.AF_INET, socket.SOCK_STREAM); s.bind(('', 0)); addr = s.getsockname(); s.close(); print(addr[1])")
export MODEL_ID="/work/u00cjz00/slurm_jobs/github/models/Llama-2-7b-chat-hf"
export DATASET="alpaca_gpt4_zh"
export OUTPUT_DIR="saves/LLaMA2-7B-Chat/lora/01_sft"
export MAX_SAMPLE=5000
## The launch below cd's into this checkout, so OUTPUT_DIR is relative to it.
export PROJECT_DIR="/work/g00cjz00/github/LLaMA-Factory"

## CLEAN UP
## Kill only this user's leftover train_bash.py processes (not every process
## whose command line happens to contain "train").
pkill -9 -u "$(id -u)" -f 'train_bash\.py'
## Remove the previous output where training actually writes it; `:?` aborts
## instead of deleting an unintended path if either variable is empty.
rm -rf -- "${PROJECT_DIR:?}/${OUTPUT_DIR:?}"

## RUN (change kernel to Python3 or your Image kernel)
## Image kernel : keep the two SLURM_* exports below and leave the singularity line commented out.
## Python kernel: comment out the two SLURM_* exports and uncomment the singularity line.

export SLURM_NNODES=1
export SLURM_PROCID=0
#/work/opt/ohpc/Taiwania3/libs/singularity/3.10.2/bin/singularity run --nv --cleanenv -B /work -B /work/g00cjz00/libraryFolder/S_work-genai11_pytorch_2.1.2-cuda11.8-cudnn8-devel_opencv/local/lib:/home/g00cjz00/.local/lib -B /work/g00cjz00/libraryFolder/S_work-genai11_pytorch_2.1.2-cuda11.8-cudnn8-devel_opencv/local/bin:/home/g00cjz00/.local/bin -B /work/u00cjz00/os/ubuntu/bin:/usr/ubuntu_bin /work/u00cjz00/nvidia/pytorch_2.1.2-cuda11.8-cudnn8-devel_opencv.sif \
bash -c "export PATH=\$PATH:\$HOME/.local/bin; \
    cd ${PROJECT_DIR}; \
    torchrun --nproc_per_node=${GPUS_PER_NODE} --nnodes=${SLURM_NNODES} --master_port=${MASTER_PORT} \
    src/train_bash.py \
    --stage sft \
    --do_train \
    --model_name_or_path ${MODEL_ID} \
    --dataset ${DATASET} \
    --template default \
    --finetuning_type lora \
    --lora_target q_proj,v_proj \
    --overwrite_cache \
    --per_device_train_batch_size 4 \
    --gradient_accumulation_steps 4 \
    --lr_scheduler_type cosine \
    --logging_steps 10 \
    --save_steps 1000 \
    --learning_rate 5e-5 \
    --num_train_epochs 3.0 \
    --plot_loss \
    --fp16 \
    --max_samples ${MAX_SAMPLE} \
    --output_dir ${OUTPUT_DIR}
"

## Method3 ACCELERATE (singularity/notebook)
- singularity: change kernel to Python3 (ipykernel)
- notebook: change kernel to pytorch_2.1.2-cuda11.8-cudnn8-devel_opencv

In [1]:
%%bash
# Write the Accelerate launch configuration into the notebook's working directory.
# The heredoc delimiter is quoted, so the body is written verbatim (no expansion).
cat > accelerate.yml << 'EOF'

compute_environment: LOCAL_MACHINE
distributed_type: MULTI_GPU
downcast_bf16: 'no'
machine_rank: 0
main_training_function: main
mixed_precision: fp16
gpu_ids: 0,1,2,3,4,5,6,7
num_machines: 1
num_processes: 8
rdzv_backend: static
same_network: true
tpu_env: []
tpu_use_cluster: false
tpu_use_sudo: false
use_cpu: false

EOF

In [None]:
%%bash
## Method3 ACCELERATE (singularity/notebook): multi-GPU run via `accelerate launch`.
## NOTE: GPUS_PER_NODE, MASTER_ADDR and MASTER_PORT are exported here but are not
## passed to the accelerate command below; the process count comes from the config file.
export GPUS_PER_NODE=8
export MASTER_ADDR=$(hostname -s)
export MASTER_PORT=$(python -c "import socket; s = socket.socket(socket.AF_INET, socket.SOCK_STREAM); s.bind(('', 0)); addr = s.getsockname(); s.close(); print(addr[1])")
## accelerate.yml is created in the notebook's working directory, but the launch
## cd's elsewhere first, so reference the config by absolute path.
## NOTE(review): with the singularity line enabled, this path must be visible
## inside the container (e.g. under a bound directory) - confirm.
export AC_CONFIG="${PWD}/accelerate.yml"
export MODEL_ID="/work/u00cjz00/slurm_jobs/github/models/Llama-2-7b-chat-hf"
export DATASET="alpaca_gpt4_zh"
export OUTPUT_DIR="saves/LLaMA2-7B-Chat/lora/01_sft"
export MAX_SAMPLE=5000
## The launch below cd's into this checkout, so OUTPUT_DIR is relative to it.
export PROJECT_DIR="/work/g00cjz00/github/LLaMA-Factory"

## CLEAN UP
## Kill only this user's leftover train_bash.py processes (not every process
## whose command line happens to contain "train").
pkill -9 -u "$(id -u)" -f 'train_bash\.py'
## Remove the previous output where training actually writes it; `:?` aborts
## instead of deleting an unintended path if either variable is empty.
rm -rf -- "${PROJECT_DIR:?}/${OUTPUT_DIR:?}"

## RUN (change kernel to Python3 or your Image kernel)
## Image kernel : leave the singularity line below commented out.
## Python kernel: uncomment the singularity line below.

#/work/opt/ohpc/Taiwania3/libs/singularity/3.10.2/bin/singularity run --nv --cleanenv -B /work -B /work/g00cjz00/libraryFolder/S_work-genai11_pytorch_2.1.2-cuda11.8-cudnn8-devel_opencv/local/lib:/home/g00cjz00/.local/lib -B /work/g00cjz00/libraryFolder/S_work-genai11_pytorch_2.1.2-cuda11.8-cudnn8-devel_opencv/local/bin:/home/g00cjz00/.local/bin -B /work/u00cjz00/os/ubuntu/bin:/usr/ubuntu_bin /work/u00cjz00/nvidia/pytorch_2.1.2-cuda11.8-cudnn8-devel_opencv.sif \
bash -c "export PATH=\$PATH:\$HOME/.local/bin; \
    cd ${PROJECT_DIR}; \
    accelerate launch \
    --config_file ${AC_CONFIG} \
    src/train_bash.py \
    --stage sft \
    --do_train \
    --model_name_or_path ${MODEL_ID} \
    --dataset ${DATASET} \
    --template default \
    --finetuning_type lora \
    --lora_target q_proj,v_proj \
    --overwrite_cache \
    --per_device_train_batch_size 4 \
    --gradient_accumulation_steps 4 \
    --lr_scheduler_type cosine \
    --logging_steps 10 \
    --save_steps 1000 \
    --learning_rate 5e-5 \
    --num_train_epochs 3.0 \
    --plot_loss \
    --fp16 \
    --max_samples ${MAX_SAMPLE} \
    --output_dir ${OUTPUT_DIR}
"

## Method4 DEEPSPEED (singularity/notebook)
- singularity: change kernel to Python3 (ipykernel)
- notebook: change kernel to pytorch_2.1.2-cuda11.8-cudnn8-devel_opencv

In [None]:
%%bash
## Method4 DEEPSPEED (singularity/notebook): run through the `deepspeed` launcher.
## NOTE: GPUS_PER_NODE, MASTER_ADDR and MASTER_PORT are exported here but are not
## passed to the deepspeed command below, which only receives --hostfile.
export GPUS_PER_NODE=2
export MASTER_ADDR=$(hostname -s)
export MASTER_PORT=$(python -c "import socket; s = socket.socket(socket.AF_INET, socket.SOCK_STREAM); s.bind(('', 0)); addr = s.getsockname(); s.close(); print(addr[1])")
export DS_CONFIG="ds_config_zero3.json"
export MODEL_ID="/work/u00cjz00/slurm_jobs/github/models/Llama-2-7b-chat-hf"
export DATASET="alpaca_gpt4_zh"
export OUTPUT_DIR="saves/LLaMA2-7B-Chat/lora/01_sft"
export MAX_SAMPLE=5000
## The launch below cd's into this checkout, so OUTPUT_DIR, DS_CONFIG and
## ./hostfile are all resolved relative to it.
export PROJECT_DIR="/work/g00cjz00/github/LLaMA-Factory"

## CLEAN UP
## Kill only this user's leftover train_bash.py processes (not every process
## whose command line happens to contain "train").
pkill -9 -u "$(id -u)" -f 'train_bash\.py'
## Remove the previous output where training actually writes it; `:?` aborts
## instead of deleting an unintended path if either variable is empty.
rm -rf -- "${PROJECT_DIR:?}/${OUTPUT_DIR:?}"

## RUN (change kernel to Python3 or your Image kernel)
## Image kernel : leave the singularity line below commented out.
## Python kernel: uncomment the singularity line below.

#/work/opt/ohpc/Taiwania3/libs/singularity/3.10.2/bin/singularity run --nv --cleanenv -B /work -B /work/g00cjz00/libraryFolder/S_work-genai11_pytorch_2.1.2-cuda11.8-cudnn8-devel_opencv/local/lib:/home/g00cjz00/.local/lib -B /work/g00cjz00/libraryFolder/S_work-genai11_pytorch_2.1.2-cuda11.8-cudnn8-devel_opencv/local/bin:/home/g00cjz00/.local/bin -B /work/u00cjz00/os/ubuntu/bin:/usr/ubuntu_bin /work/u00cjz00/nvidia/pytorch_2.1.2-cuda11.8-cudnn8-devel_opencv.sif \
bash -c "export PATH=\$PATH:\$HOME/.local/bin; \
    cd ${PROJECT_DIR}; \
    deepspeed \
    --hostfile=./hostfile \
    src/train_bash.py \
    --deepspeed ${DS_CONFIG} \
    --stage sft \
    --do_train \
    --model_name_or_path ${MODEL_ID} \
    --dataset ${DATASET} \
    --template default \
    --finetuning_type lora \
    --lora_target q_proj,v_proj \
    --overwrite_cache \
    --per_device_train_batch_size 4 \
    --gradient_accumulation_steps 4 \
    --lr_scheduler_type cosine \
    --logging_steps 10 \
    --save_steps 1000 \
    --learning_rate 5e-5 \
    --num_train_epochs 3.0 \
    --plot_loss \
    --fp16 \
    --max_samples ${MAX_SAMPLE} \
    --output_dir ${OUTPUT_DIR}
"

## Method5 DEEPSPEED + Torch.Distributed.Run (singularity/notebook)

In [None]:
%%bash
## Method5 DEEPSPEED + Torch.Distributed.Run (singularity/notebook):
## single-node run launched with `python3 -m torch.distributed.run` and a DeepSpeed config.
export GPUS_PER_NODE=8
export MASTER_ADDR=$(hostname -s)
export MASTER_PORT=$(python -c "import socket; s = socket.socket(socket.AF_INET, socket.SOCK_STREAM); s.bind(('', 0)); addr = s.getsockname(); s.close(); print(addr[1])")
export DS_CONFIG="ds_config_zero3.json"
export MODEL_ID="/work/u00cjz00/slurm_jobs/github/models/Llama-2-7b-chat-hf"
export DATASET="alpaca_gpt4_zh"
export OUTPUT_DIR="saves/LLaMA2-7B-Chat/lora/01_sft"
export MAX_SAMPLE=5000
## The launch below cd's into this checkout, so OUTPUT_DIR and DS_CONFIG are
## resolved relative to it.
export PROJECT_DIR="/work/g00cjz00/github/LLaMA-Factory"

## CLEAN UP
## Kill only this user's leftover train_bash.py processes (not every process
## whose command line happens to contain "train").
pkill -9 -u "$(id -u)" -f 'train_bash\.py'
## Remove the previous output where training actually writes it; `:?` aborts
## instead of deleting an unintended path if either variable is empty.
rm -rf -- "${PROJECT_DIR:?}/${OUTPUT_DIR:?}"

## RUN (change kernel to Python3 or your Image kernel)
## Image kernel : keep the two SLURM_* exports below and leave the singularity line commented out.
## Python kernel: comment out the two SLURM_* exports and uncomment the singularity line.

export SLURM_NNODES=1
export SLURM_PROCID=0
#/work/opt/ohpc/Taiwania3/libs/singularity/3.10.2/bin/singularity run --nv --cleanenv -B /work -B /work/g00cjz00/libraryFolder/S_work-genai11_pytorch_2.1.2-cuda11.8-cudnn8-devel_opencv/local/lib:/home/g00cjz00/.local/lib -B /work/g00cjz00/libraryFolder/S_work-genai11_pytorch_2.1.2-cuda11.8-cudnn8-devel_opencv/local/bin:/home/g00cjz00/.local/bin -B /work/u00cjz00/os/ubuntu/bin:/usr/ubuntu_bin /work/u00cjz00/nvidia/pytorch_2.1.2-cuda11.8-cudnn8-devel_opencv.sif \
bash -c "export PATH=\$PATH:\$HOME/.local/bin; \
    cd ${PROJECT_DIR}; \
    python3 -m torch.distributed.run \
    --nproc_per_node ${GPUS_PER_NODE} \
    --nnodes ${SLURM_NNODES} \
    --node_rank ${SLURM_PROCID} \
    --master_addr ${MASTER_ADDR} \
    --master_port ${MASTER_PORT} \
    src/train_bash.py \
    --deepspeed ${DS_CONFIG} \
    --stage sft \
    --do_train \
    --model_name_or_path ${MODEL_ID} \
    --dataset ${DATASET} \
    --template default \
    --finetuning_type lora \
    --lora_target q_proj,v_proj \
    --overwrite_cache \
    --per_device_train_batch_size 32 \
    --gradient_accumulation_steps 4 \
    --lr_scheduler_type cosine \
    --logging_steps 10 \
    --save_steps 100 \
    --learning_rate 2e-5 \
    --num_train_epochs 3.0 \
    --plot_loss \
    --fp16 \
    --max_samples ${MAX_SAMPLE} \
    --output_dir ${OUTPUT_DIR}
"





## Method6 SRUN +  DEEPSPEED + Torch.Distributed.Run (singularity/notebook)

In [None]:
%%bash
## Show the SLURM context visible to this notebook, then probe a second node.
echo $SLURM_NNODES
echo $SLURM_PROCID
echo $SLURM_JOBID

## Use the job this notebook runs in; fall back to the previously hard-coded id
## when SLURM_JOBID is not set.
JOB_ID="${SLURM_JOBID:-557377}"
TARGET_NODE="gn0815"

## Single quotes defer variable expansion to the shell started by srun; with
## double quotes the notebook's own value would be substituted before srun runs.
#srun --jobid "$JOB_ID" -w "$TARGET_NODE" bash -c 'echo $HOSTNAME'
srun --jobid "$JOB_ID" -w "$TARGET_NODE" bash -c 'echo $SLURM_PROCID'


In [None]:
%%bash
## Method6 SRUN + DEEPSPEED + Torch.Distributed.Run (singularity/notebook):
## one node's share of a manual two-node run. This cell launches node rank
## ${SLURM_PROCID} as hard-coded below.
export GPUS_PER_NODE=2
export DS_CONFIG="ds_config_zero3.json"
export MODEL_ID="/work/u00cjz00/slurm_jobs/github/models/Llama-2-7b-chat-hf"
export DATASET="alpaca_gpt4_zh"
export OUTPUT_DIR="saves/LLaMA2-7B-Chat/lora/01_sft"
export MAX_SAMPLE=5000
## The launch below cd's into this checkout, so OUTPUT_DIR and DS_CONFIG are
## resolved relative to it.
export PROJECT_DIR="/work/g00cjz00/github/LLaMA-Factory"

## Rendezvous endpoint, fixed by hand. The hostname/free-port lookups that used
## to precede these assignments were overwritten immediately, so they are gone.
export MASTER_ADDR="gn0814"
export MASTER_PORT=6000

## CLEAN UP
## Kill only this user's leftover train_bash.py processes (not every process
## whose command line happens to contain "train").
pkill -9 -u "$(id -u)" -f 'train_bash\.py'
## Remove the previous output where training actually writes it; `:?` aborts
## instead of deleting an unintended path if either variable is empty.
rm -rf -- "${PROJECT_DIR:?}/${OUTPUT_DIR:?}"

## RUN (change kernel to Python3 or your Image kernel)
## Image kernel : keep the two SLURM_* exports below and comment out the singularity line.
## Python kernel: comment out the two SLURM_* exports and keep the singularity line.

export SLURM_NNODES=2
export SLURM_PROCID=1

#srun --jobid $SLURM_JOBID \
/work/opt/ohpc/Taiwania3/libs/singularity/3.10.2/bin/singularity run --nv --cleanenv -B /work -B /work/g00cjz00/libraryFolder/S_work-genai11_pytorch_2.1.2-cuda11.8-cudnn8-devel_opencv/local/lib:/home/g00cjz00/.local/lib -B /work/g00cjz00/libraryFolder/S_work-genai11_pytorch_2.1.2-cuda11.8-cudnn8-devel_opencv/local/bin:/home/g00cjz00/.local/bin -B /work/u00cjz00/os/ubuntu/bin:/usr/ubuntu_bin /work/u00cjz00/nvidia/pytorch_2.1.2-cuda11.8-cudnn8-devel_opencv.sif \
bash -c "export PATH=\$PATH:\$HOME/.local/bin; \
    cd ${PROJECT_DIR}; \
    python3 -m torch.distributed.run \
    --nproc_per_node ${GPUS_PER_NODE} \
    --nnodes ${SLURM_NNODES} \
    --node_rank ${SLURM_PROCID} \
    --master_addr ${MASTER_ADDR} \
    --master_port ${MASTER_PORT} \
    src/train_bash.py \
    --deepspeed ${DS_CONFIG} \
    --stage sft \
    --do_train \
    --model_name_or_path ${MODEL_ID} \
    --dataset ${DATASET} \
    --template default \
    --finetuning_type lora \
    --lora_target q_proj,v_proj \
    --overwrite_cache \
    --per_device_train_batch_size 4 \
    --gradient_accumulation_steps 4 \
    --lr_scheduler_type cosine \
    --logging_steps 10 \
    --save_steps 100 \
    --learning_rate 5e-5 \
    --num_train_epochs 3.0 \
    --plot_loss \
    --fp16 \
    --max_samples ${MAX_SAMPLE} \
    --output_dir ${OUTPUT_DIR}
"



## SLURM

In [None]:
#!/work/u00cjz00/binary/bash5.0/bin/bash
#SBATCH -A GOV109189                                                    ### Project number, e.g. MST109178
#SBATCH -J _t2demo_                                                     ### Job name, e.g. jupyterlab
#SBATCH -p gp4d                                                         ### Partition name, e.g. ngs1gpu
#SBATCH --nodes=2                                                       ### Nodes, default 1
#SBATCH --ntasks-per-node=1                                             ### Tasks per node, default 1
#SBATCH -c 8                                                            ### Cores assigned to each task, e.g. 4
#SBATCH --gres=gpu:2                                                    ### GPU number, e.g. gpu:1
#SBATCH --time=0-1:00:00                                                ### Running time, days-hours:minutes:seconds or hours:minutes:seconds
#SBATCH -o genai_%j.out                                                 ### Stdout log file; %j is the job ID
#SBATCH -e genai_%j.err                                                 ### Stderr log file; %j is the job ID


## Job body: generate one launcher script per allocated node and start them all
## in parallel, one srun step per node.

## Allocated hosts, one per line; the first one is the rendezvous master.
NODELIST=$(scontrol show hostname $SLURM_JOB_NODELIST)
MASTER_NODE=$(head -n 1 <<< "$NODELIST")
NODE_NUM=$(wc -l <<< "$NODELIST")

echo $SLURM_NODEID
echo $NODELIST
echo $MASTER_NODE
echo $NODE_NUM

## NOTE(review): singularity is started with --cleanenv below, so these exports
## are presumably not visible inside the container - confirm whether they need
## to be written into the generated per-node script instead.
export NCCL_IB_DISABLE=1
export NCCL_SOCKET_IFNAME=bond0
export CC=/opt/hpcx/ompi/bin/mpicc
export CUDA_LAUNCH_BLOCKING=1

export GPUS_PER_NODE=2
## Ask the OS for a currently free TCP port on the host running this script.
export MASTER_PORT=$(python -c "import socket; s = socket.socket(socket.AF_INET, socket.SOCK_STREAM); s.bind(('', 0)); addr = s.getsockname(); s.close(); print(addr[1])")
export DS_CONFIG="ds_config_zero3.json"
export MODEL_ID="/work/u00cjz00/slurm_jobs/github/models/Llama-2-7b-chat-hf"
export DATASET="alpaca_gpt4_zh"
export OUTPUT_DIR="saves/LLaMA2-7B-Chat/lora/01_sft"
export MAX_SAMPLE=5000

## Start from a clean output directory.
rm -rf /work/g00cjz00/github/LLaMA-Factory/${OUTPUT_DIR}

## Container invocation shared by every node.
SINGULARITY_RUN=(
    /work/opt/ohpc/Taiwania3/libs/singularity/3.10.2/bin/singularity run --nv --cleanenv
    -B /work
    -B /work/g00cjz00/libraryFolder/S_work-genai11_pytorch_2.1.2-cuda11.8-cudnn8-devel_opencv/local/lib:/home/g00cjz00/.local/lib
    -B /work/g00cjz00/libraryFolder/S_work-genai11_pytorch_2.1.2-cuda11.8-cudnn8-devel_opencv/local/bin:/home/g00cjz00/.local/bin
    -B /work/u00cjz00/os/ubuntu/bin:/usr/ubuntu_bin
    /work/u00cjz00/nvidia/pytorch_2.1.2-cuda11.8-cudnn8-devel_opencv.sif
)

## MASTER_NODE is the first entry of NODELIST, so a counter that starts at 0 and
## grows by one per node gives the master rank 0 and the rest 1, 2, ...
NODE_RANK=0
for NODE in $NODELIST; do

## The heredoc delimiter is unquoted on purpose: ${...} values are baked in now,
## while \$PATH / \$HOME are left for the generated script to expand at run time.
cat << EOF >  ${NODE}.sh
#!/bin/bash
# RUN: ${NODE}.sh

export PATH=\$PATH:\$HOME/.local/bin
echo \$PATH
cd /work/g00cjz00/github/LLaMA-Factory
python3 -m torch.distributed.run \
--nproc_per_node ${GPUS_PER_NODE} \
--nnodes ${SLURM_NNODES} \
--node_rank ${NODE_RANK} \
--master_addr ${MASTER_NODE} \
--master_port ${MASTER_PORT} \
src/train_bash.py \
--deepspeed ${DS_CONFIG} \
--stage sft \
--do_train \
--model_name_or_path ${MODEL_ID} \
--dataset ${DATASET} \
--template default \
--finetuning_type lora \
--lora_target q_proj,v_proj \
--overwrite_cache \
--per_device_train_batch_size 4 \
--gradient_accumulation_steps 4 \
--lr_scheduler_type cosine \
--logging_steps 10 \
--save_steps 100 \
--learning_rate 5e-5 \
--num_train_epochs 3.0 \
--plot_loss \
--fp16 \
--max_samples ${MAX_SAMPLE} \
--output_dir ${OUTPUT_DIR}

EOF

    chmod 755 ${NODE}.sh
    sleep 5
    ## Run this node's launcher in the background so all nodes start together.
    srun --jobid $SLURM_JOBID --nodes=1 --ntasks=1 -w $NODE \
        "${SINGULARITY_RUN[@]}" \
        ./${NODE}.sh &

    NODE_RANK=$((NODE_RANK + 1))
done
## Block until every per-node step has finished.
wait

## test

In [None]:
## Template: prepare output folders, a hostfile and the master address/port for
## a SLURM job. Values such as /path/to/output, "[slurm node name]",
## NODE_NAME and xx.xx.xx are placeholders to be replaced for a real cluster.
OUTPUT=/path/to/output
export GPUS_PER_NODE=8

TASK_NAME="task_name"
export CKPT="$OUTPUT/$TASK_NAME"
mkdir -p $OUTPUT $CKPT 

echo "SLURM_NTASKS=$SLURM_NTASKS"
export HOSTFILE="./hostfile"
echo "[slurm node name] slots=$SLURM_NTASKS" > $HOSTFILE

### Map every node name of the cluster to its IP address (associative array).
declare -A IB0
## Highest node index to generate; previously the literal word "node_count" was
## passed to seq, which is not a number. Defaults to 1 when not set by the caller.
node_count=${node_count:-1}
for i in $(seq 0 "$node_count"); do
    key=$(printf "NODE_NAME%02d" "$i")
    value=$(printf "xx.xx.xx.%02d" $((i)))
    IB0["$key"]="$value"
done

### Derive the master node of the current job from the first number in the node list.
regex="[0-9]+"
if [[ $SLURM_JOB_NODELIST =~ $regex ]]; then
    extracted_number="${BASH_REMATCH[0]}"
    echo "Extracted number: $extracted_number"
else
    echo "No match found"
fi

## NOTE(review): the map keys are zero-padded to two digits, while
## extracted_number is used as matched - confirm they line up on the real cluster.
MASTER_KEY="NODE_NAME$extracted_number"

### Master address and port.
export MASTER_ADDR=${IB0[$MASTER_KEY]}
export MASTER_PORT=31217


In [None]:
%%bash
## Scratch check of the node-name -> IP formatting for a single node index.
declare -A IB0
node_count="1"
## Use one explicit index for both key and value; `i` was previously never set,
## so the value was formatted from an empty variable while the key used 1.
i="$node_count"
key=$(printf "NODE_NAME%02d" "$i")
value=$(printf "xx.xx.xx.%02d" $((i)))
IB0["$key"]="$value"
echo $value

In [None]:
#!/bin/bash 
#SBATCH -J gpt_zh_en                     # Job name
#SBATCH -p A800:8                         # Partition (machine) to use; this one contains two nodes, gpu1 and gpu2
#SBATCH -w gpu2                           # Run specifically on node gpu2; note that --nodes=1 must then not be used as well
#SBATCH --ntasks-per-node=12        # Number of tasks srun starts on each compute node
#SBATCH --time 720:00:00               # Run time limit: 720 h, roughly one month
#SBATCH --mem=240G                    # Memory request: 240G
#SBATCH --comment=BASE              # Additional comment attached to the job
#SBATCH -o /public1/home/amzhou/slurm/std.out.%j     # Stdout file path, includes the job ID
#SBATCH -e /public1/home/amzhou/slurm/std.err.%j      # Stderr file path, includes the job ID
#SBATCH --gres=gpu:8 


In [None]:
%%bash
## Collect basic SLURM job facts, log them to a file under WORK_DIR, and echo them.
WORK_DIR=/tmp
### Set basic var   ### MARK_slurm2pbs
JOBID=$SLURM_JOB_ID                                  ### slurm2pbs   # job ID
NP=$SLURM_NPROCS                                     ### slurm2pbs   # number of processes
NNODE=$(srun hostname | sort -u | wc -l)             ### slurm2pbs   # number of distinct nodes
LOG_FILE=$WORK_DIR/job_${JOB_NAME}_${JOBID}.log                      # log file path
#HOST_FILE=$WORK_DIR/job_${JOB_NAME}_${JOBID}_${NP}c_${NNODE}n.ma    # host file path
#srun hostname | sort | uniq -c |awk '{print $2":"$1}' > $HOST_FILE  ### slurm2pbs    # write "node:count" lines
### Write basic job informations (shown on stdout and appended to the log)
{
    echo -e "The start time is: $(date +"%Y-%m-%d %H:%M:%S") \n"
    echo -e "My job ID is: $JOBID \n"
    echo -e "The total cores is: $NP \n"
    echo -e "The hosts is: \n"
    #cat $HOST_FILE
    echo -e "\n"
} | tee -a $LOG_FILE
### Run APP   (your own commands go below)
# MARK_CMD  #Don't delete this line!!!
#!/bin/bash   
for job_fact in "$JOBID" "$NP" "$NNODE" "$LOG_FILE" "$JOBID"; do
    echo $job_fact
done
#echo $HOST_FILE

In [None]:
# Count the distinct hosts that srun runs on.
!srun hostname | sort -u | wc -l

In [None]:
# Template: start torch.distributed.run on every node of the job through srun.
# `your_program.py <normal cl args>` is a placeholder to be replaced.
export GPUS_PER_NODE=8
# First hostname of the job's node list acts as the rendezvous master.
export MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)
export MASTER_PORT=9901

# The command is single-quoted, so its $VARIABLES are expanded by the shell
# that srun starts rather than by this one.
srun --jobid $SLURM_JOBID bash -c 'python -m torch.distributed.run \
 --nproc_per_node $GPUS_PER_NODE --nnodes $SLURM_NNODES --node_rank $SLURM_PROCID \
 --master_addr $MASTER_ADDR --master_port $MASTER_PORT \
your_program.py <normal cl args> --deepspeed ds_config.json'

In [None]:
# Manual two-node run, node rank 0 (the rank-1 cell must be started on the other node).
# Kill only this user's leftover train_bash.py processes instead of everything matching "train".
!pkill -9 -u $(id -u) -f train_bash.py
!rm -rf saves/LLaMA2-7B-Chat/lora/01_sft
# --max_samples is 5000 here so both ranks load the same amount of data (the rank-1 cell uses 5000).
!python3 -m torch.distributed.run --nproc_per_node 2 --nnodes 2 --node_rank 0 --master_addr gpn3001 --master_port 45797 src/train_bash.py --deepspeed ds_config_zero3.json --stage sft --do_train --model_name_or_path /work/u00cjz00/slurm_jobs/github/models/Llama-2-7b-chat-hf --dataset alpaca_gpt4_zh --template default --finetuning_type lora --lora_target q_proj,v_proj --overwrite_cache --per_device_train_batch_size 4 --gradient_accumulation_steps 4 --lr_scheduler_type cosine --logging_steps 10 --save_steps 100 --learning_rate 5e-5 --num_train_epochs 3.0 --plot_loss --fp16 --max_samples 5000 --output_dir saves/LLaMA2-7B-Chat/lora/01_sft


In [None]:
# Manual two-node run, node rank 1 (the rank-0 cell must be started on the master node).
# Kill only this user's leftover train_bash.py processes instead of everything matching "train".
!pkill -9 -u $(id -u) -f train_bash.py
!rm -rf saves/LLaMA2-7B-Chat/lora/01_sft
# NOTE(review): this node starts 1 process while the rank-0 cell starts 2 - confirm this is intended.
!python3 -m torch.distributed.run --nproc_per_node 1 --nnodes 2 --node_rank 1 --master_addr gpn3001 --master_port 45797 src/train_bash.py --deepspeed ds_config_zero3.json --stage sft --do_train --model_name_or_path /work/u00cjz00/slurm_jobs/github/models/Llama-2-7b-chat-hf --dataset alpaca_gpt4_zh --template default --finetuning_type lora --lora_target q_proj,v_proj --overwrite_cache --per_device_train_batch_size 4 --gradient_accumulation_steps 4 --lr_scheduler_type cosine --logging_steps 10 --save_steps 100 --learning_rate 5e-5 --num_train_epochs 3.0 --plot_loss --fp16 --max_samples 5000 --output_dir saves/LLaMA2-7B-Chat/lora/01_sft
