# Using LLama Factory finetune on SageMaker - HyperPod 集群
# 7. 在HyperPod集群提交训练任务 - 多节点多GPU - deepspeed

## 7.1. 多节点多GPU 分布式训练 

#### 先决条件：完成04.04.llama_factory_finetune_on_SageMaker_multi_node中数据和yml配置准备部分

In [2]:
import boto3
import sagemaker
import os
# Resolve the SageMaker execution role, session, default bucket and region once,
# then print them together with the SDK versions for reference.
role = sagemaker.get_execution_role()  # execution role of the current notebook environment
sess = sagemaker.session.Session()  # sagemaker session for interacting with different AWS APIs
bucket = sess.default_bucket()  # bucket to house artifacts
# Use the public property rather than the private `sess._region_name` attribute.
region = sess.boto_region_name  # region name of the current SageMaker session


# Reuse the values resolved above instead of querying the session again.
print(f"sagemaker role arn: {role}")
print(f"sagemaker bucket: {bucket}")
print(f"sagemaker session region: {region}")
print(f"boto3 version: {boto3.__version__}")
print(f"sagemaker version: {sagemaker.__version__}")

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml
sagemaker role arn: arn:aws:iam::434444145045:role/notebook-hyperpod-ExecutionRole-xHaRX2L05qHQ
sagemaker bucket: sagemaker-us-west-2-434444145045
sagemaker session region: us-west-2
boto3 version: 1.34.123
sagemaker version: 2.222.0


### 准备LLaMA-Factory 的 训练配置yaml文件
### 从LLaMA-Factory/examples/train_lora/目录中复制出llama3_lora_sft_ds3.yaml，并修改
- 本次实验是使用Lora训练
- 如果用全量微调，则使用LLaMA-Factory/examples/train_full/目录下的配置文件（例如llama3_full_sft_ds3.yaml）

- ！如果需要full training，请进入LLaMA-Factory/examples/查看其他的示例配置文件

In [5]:
#load template
import yaml
# Parse the LoRA + DeepSpeed ZeRO-3 example config into a plain dict (`doc`).
file_name = './LLaMA-Factory/examples/train_lora/llama3_lora_sft_ds3.yaml'
with open(file_name) as template_file:
    doc = yaml.safe_load(template_file)

- 本次实验我们使用原始精度的LLaMA-3-8b， 从hf的repo 'unsloth/llama-3-8b-Instruct' 下载模型

In [6]:
# Hugging Face repo id of the base model; assigned to doc['model_name_or_path'] below.
model_id = 'unsloth/llama-3-8b-Instruct'

In [7]:
# Apply this experiment's overrides on top of the loaded template in one call.
# Keys are listed in the same order as before, so new keys are appended identically.
doc.update({
    'model_name_or_path': model_id,
    'output_dir': '/home/ubuntu/finetuned_model',
    'num_train_epochs': 3,
    'warmup_steps': 10,
    'per_device_train_batch_size': 1,
    'gradient_accumulation_steps': 2,
    # 'lora_target': 'all',
    'cutoff_len': 2048,
    # To keep the experiment short, train on only the first 1000 samples.
    'max_samples': 1000,
    'dataset': 'identity,ruozhiba',
    'eval_steps': 500,
})
# Display the resulting configuration as the cell output.
doc

{'model_name_or_path': 'unsloth/llama-3-8b-Instruct',
 'stage': 'sft',
 'do_train': True,
 'finetuning_type': 'lora',
 'lora_target': 'all',
 'deepspeed': 'examples/deepspeed/ds_z3_config.json',
 'dataset': 'identity,ruozhiba',
 'template': 'llama3',
 'cutoff_len': 2048,
 'max_samples': 1000,
 'overwrite_cache': True,
 'preprocessing_num_workers': 16,
 'output_dir': '/home/ubuntu/finetuned_model',
 'logging_steps': 10,
 'save_steps': 500,
 'plot_loss': True,
 'overwrite_output_dir': True,
 'per_device_train_batch_size': 1,
 'gradient_accumulation_steps': 2,
 'learning_rate': 0.0001,
 'num_train_epochs': 3,
 'lr_scheduler_type': 'cosine',
 'warmup_ratio': 0.1,
 'fp16': True,
 'ddp_timeout': 180000000,
 'val_size': 0.1,
 'per_device_eval_batch_size': 1,
 'eval_strategy': 'steps',
 'eval_steps': 500,
 'warmup_steps': 10}

In [8]:
# Serialize the edited configuration into the LLaMA-Factory checkout; the
# training script refers to this file by the same name.
sg_config = 'sg_config_multl_node_lora_ds.yaml'
config_path = './LLaMA-Factory/' + sg_config
with open(config_path, 'w') as config_file:
    yaml.safe_dump(doc, config_file)

### 准备训练脚本1
- ❌ 准备训练启动脚本 注意把s3 bucket 替换成自己账号的地址
- 使用节点自动恢复脚本提交任务 https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-hyperpod-resiliency.html#sagemaker-hyperpod-resiliency-auto-resume

In [32]:
%%writefile hyperpod-scripts/train_multi_ds.sh
#!/bin/bash
#
# Runs one node's share of a multi-node LLaMA-Factory training job under Slurm.
#
# Usage (from the LLaMA-Factory checkout, started on every node by Slurm):
#   srun -N2 bash train_multi_ds.sh
#
# Relative paths (../miniconda3, ./s5cmd, the YAML config) are resolved against
# the current working directory.
#
# Exit status: non-zero when the script is not started by Slurm, when
# `llamafactory-cli train` fails (its status is passed through), or when the
# model upload on node 0 fails; 0 otherwise.

sg_config=sg_config_multl_node_lora_ds.yaml
echo "sg_config=$sg_config"

# Replace with the S3 bucket of your own account. The dataset source and the
# model destination are both derived from this single value.
S3_BUCKET=sagemaker-us-west-2-434444145045
DATASET_S3_GLOB="s3://${S3_BUCKET}/dataset-for-training/train/*"
OUTPUT_MODEL_S3_PATH="s3://${S3_BUCKET}/hyperpod/llama3-8b-ds/"

# Fail fast when not started by Slurm: rank, node count, master address and
# master port below are all derived from SLURM_* variables.
: "${SLURM_JOBID:?must be launched through Slurm, e.g. srun -N2 bash train_multi_ds.sh}"
: "${SLURM_NODEID:?must be launched through Slurm, e.g. srun -N2 bash train_multi_ds.sh}"

source  ../miniconda3/bin/activate

conda activate py310

pip install torch==2.3.0

chmod +x ./s5cmd

# Download the training dataset. The wildcard is quoted so that it is passed
# to s5cmd verbatim instead of being subject to local pathname expansion.
./s5cmd sync "$DATASET_S3_GLOB" /home/ubuntu/LLaMA-Factory/data/ \
    || echo "WARNING: dataset sync from $DATASET_S3_GLOB failed" >&2

NODE_RANK=$SLURM_NODEID
echo "NODE_RANK=$NODE_RANK"

# NOTE(review): this value is passed as NNODES below, which assumes exactly
# one Slurm task per node -- confirm against the srun invocation.
WORLD_SIZE_JOB=$SLURM_NTASKS
echo "WORLD_SIZE_JOB=$WORLD_SIZE_JOB"

# The first hostname of the job's node list acts as the rendezvous master.
MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
# Port = 10000 + last four characters of the job id. The 10# prefix forces
# base 10 so that a tail such as "0042" is not parsed as an octal number.
job_id_tail=$(printf '%s' "$SLURM_JOBID" | tail -c 4)
MASTER_PORT=$(( 10000 + 10#$job_id_tail ))

echo "MASTER_ADDR=$MASTER_ADDR"
echo "MASTER_PORT=$MASTER_PORT"

# get gpu count (nvidia-smi prints one line per GPU)
gpu_count=$(nvidia-smi --query-gpu=gpu_name --format=csv,noheader | wc -l)

DEVICES=""

# Build the comma-separated device list "0,1,...,gpu_count-1".
for ((i=0; i<gpu_count; i++)); do
    DEVICES+="$i"
    if ((i < gpu_count - 1)); then
        DEVICES+=","
    fi
done

echo "DEVICES=$DEVICES"

export NCCL_IB_DISABLE=1
# The network interface here is not eth0; log in to a worker node of the
# cluster and check the actual name with `ip addr show`.
export NCCL_SOCKET_IFNAME=ens6
export DS_BUILD_FUSED_ADAM=1
export NCCL_PROTO=simple
# export NCCL_DEBUG=INFO
export HCCL_OVER_OFI=1
# enable EFA
export FI_PROVIDER=efa
export NCCL_IGNORE_DISABLED_P2P=1

# Every node runs the same command and differs only in RANK. This script is
# meant to be started on each node (e.g. via srun); NNODES follows the job size.
CUDA_VISIBLE_DEVICES="$DEVICES" NNODES="$WORLD_SIZE_JOB" RANK="$NODE_RANK" \
    MASTER_ADDR="$MASTER_ADDR" MASTER_PORT="$MASTER_PORT" \
    llamafactory-cli train "$sg_config"
train_status=$?

# Propagate a training failure instead of uploading and exiting 0, so that the
# job is reported as failed to Slurm.
if (( train_status != 0 )); then
    echo "llamafactory-cli train failed on node $NODE_RANK (exit status $train_status)" >&2
    exit "$train_status"
fi

# Only node 0 uploads the fine-tuned model.
if [[ "$NODE_RANK" == "0" ]]; then
    echo "*****************finished training, start cp finetuned model*****************************"
    ./s5cmd sync "/home/ubuntu/finetuned_model" "$OUTPUT_MODEL_S3_PATH" || exit
    echo '-----finished cp-------'
fi

Overwriting hyperpod-scripts/train_multi_ds.sh


#### 上传训练脚本到S3 bucket中，之后S3 bucket会挂载到集群所有节点中，这样所有计算节点都可以访问训练代码

In [33]:
# Sync the local LLaMA-Factory checkout to s3://{bucket}/hyperpod/ with s5cmd.
!./s5cmd sync ./LLaMA-Factory s3://{bucket}/hyperpod/
# Recursively copy the helper scripts in hyperpod-scripts/ into the LLaMA-Factory prefix on S3.
!aws s3 cp --recursive hyperpod-scripts/ s3://{bucket}/hyperpod/LLaMA-Factory/

upload: hyperpod-scripts/train_batch.sh to s3://sagemaker-us-west-2-434444145045/hyperpod/LLaMA-Factory/train_batch.sh
upload: hyperpod-scripts/llama_factory_setup.sh to s3://sagemaker-us-west-2-434444145045/hyperpod/LLaMA-Factory/llama_factory_setup.sh
upload: hyperpod-scripts/train_single_lora.sh to s3://sagemaker-us-west-2-434444145045/hyperpod/LLaMA-Factory/train_single_lora.sh
upload: hyperpod-scripts/train_multi_ds.sh to s3://sagemaker-us-west-2-434444145045/hyperpod/LLaMA-Factory/train_multi_ds.sh


#### 将S3 bucket目录下的代码更新到本地目录
```bash
# Switch to the ubuntu user and enter the LLaMA-Factory checkout in its home directory.
sudo su ubuntu
cd ~/LLaMA-Factory
# Through srun with -N2, copy dataset_info.json and the training script from
# ../mnt/hyperpod/LLaMA-Factory/ into the current directory.
srun -N2 "cp" "-r" "../mnt/hyperpod/LLaMA-Factory/data/dataset_info.json" "./data/dataset_info.json"
srun -N2 "cp" "-r" "../mnt/hyperpod/LLaMA-Factory/train_multi_ds.sh" "./train_multi_ds.sh"
```

#### 提交训练
```bash
# Setup script invocation, currently commented out.
# NOTE(review): presumably only needed before the first run -- confirm.
# srun -N2 "bash" "llama_factory_setup.sh"
# Run the training script through srun with -N2.
srun -N2 "bash" "train_multi_ds.sh"
```

## 训练完成之后，删除集群


In [48]:
# Delete the HyperPod cluster named hyperpod-cluster-2.

!aws sagemaker delete-cluster --cluster-name hyperpod-cluster-2

{
    "ClusterArn": "arn:aws:sagemaker:us-west-2:434444145045:cluster/3h9ogxf9ru7r"
}
