# Fine-tuning with LLaMA-Factory on SageMaker
# 1. Single GPU QLoRA - 本地notebook实例训练

## 安装依赖包

In [31]:
# Install/upgrade the packages this notebook imports (-U upgrades, -q keeps pip quiet).
# Only `datasets` is pinned to an exact version here.
%pip install -Uq sagemaker boto3 datasets==2.21.0 huggingface-hub 

Note: you may need to restart the kernel to use updated packages.


In [None]:
# Install PyTorch 2.3.0, the same version passed as framework_version to the PyTorch estimator later on.
%pip install torch==2.3.0

In [3]:
import os
import glob
import json
import boto3
import pprint
from tqdm import tqdm
import sagemaker
from sagemaker.collection import Collection
from sagemaker.utils import name_from_base

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


In [4]:
# Create the SageMaker session, then derive the region, default bucket and
# execution role that the rest of the notebook relies on.
sagemaker_session = sagemaker.session.Session()
region = sagemaker_session.boto_region_name
role = sagemaker.get_execution_role()
default_bucket = sagemaker_session.default_bucket()

# Low-level SageMaker API client bound to the same region as the session.
sm_client = boto3.client(service_name='sagemaker', region_name=region)

## 准备数据集

### 数据集1. 从huggingface上下载ruozhiba数据集
- 该数据集有近5k条数据，本次实验只需取其中一部分做训练（后面的训练配置中通过 `max_samples` 设置为200条）

In [5]:
from datasets import load_dataset
from random import randrange

# Fetch the training split from the Hugging Face hub, pinned to a fixed
# revision so repeated runs read the same data.
dataset_name = "hfl/ruozhiba_gpt4"
train_dataset = load_dataset(
    dataset_name,
    split="train",
    revision='41d2c61beb86c8d4c61916cc656c39d018c40ce5',
)

# Report the dataset size and print one randomly chosen record.
print(f"Training size: {len(train_dataset)}")
print("\nTraining sample:\n")
print(train_dataset[randrange(len(train_dataset))])

0000.parquet:   0%|          | 0.00/2.08M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Training size: 4898

Training sample:

{'input': '', 'instruction': '三个人里面有三个是弱智你能分辨出他们之中谁不是弱智吗？ 能吗？', 'output': "这个问题本身是自相矛盾的，因为它先假设了'三个人里面有三个是弱智'，这意味着所有三个人都是弱智。因此，在逻辑上讲，不存在'他们之中谁不是弱智'这一选项。如果所有人都具有相同的属性（在本例中是'弱智'），那么就不可能从中挑选出一个与众不同的人。这可能是一个设计来考查逻辑思维或是一种幽默表达的题目。"}


### 保存到本地json文件

In [6]:
# Collect every record of the dataset and write them as a single
# pretty-printed UTF-8 JSON array under ./train.
os.makedirs('./train', exist_ok=True)
all_examples = [example for example in train_dataset]

with open('./train/ruozhiba.json', mode='w', encoding='utf-8') as out_file:
    json.dump(all_examples, out_file, ensure_ascii=False, indent=2)

### 数据集2. 身份数据集
```json
[{"instruction": "hi",
  "input": "",
  "output": "Hello! I am {{name}}, an AI assistant developed by {{author}}. How can I assist you today?"},
 {"instruction": "hello",
  "input": "",
  "output": "Hello! I am {{name}}, an AI assistant developed by {{author}}. How can I assist you today?"},
 {"instruction": "Who are you?",
  "input": "",
  "output": "I am {{name}}, an AI assistant developed by {{author}}. How can I assist you today?"}]
```
把其中的name和author替换成您自己想替换的值，这样微调完成之后，问模型“你是谁，谁创造的你？”这类的身份问题，模型就会按这个新的值来回答

In [7]:
def format_identity(origin_obj,name,author):
    """Fill the ``{{name}}`` / ``{{author}}`` placeholders of identity records.

    Args:
        origin_obj: Iterable of dict records, each with an ``'output'`` string.
        name: Text substituted for every ``{{name}}`` placeholder.
        author: Text substituted for every ``{{author}}`` placeholder.

    Returns:
        A new list of new dicts with the placeholders in ``'output'`` replaced.
        The input records are left untouched, so the same template can be
        formatted again with different values.

    Raises:
        KeyError: If a record has no ``'output'`` key.
    """
    ret = []
    for ele in origin_obj:
        # Shallow copy: only 'output' is rewritten, so the caller's record
        # keeps its placeholders.
        filled = dict(ele)
        filled['output'] = ele['output'].replace("{{name}}",name).replace("{{author}}",author)
        ret.append(filled)
    return ret

- 替换成您自己的设定

In [None]:
# Identity values substituted for the {{name}} and {{author}} placeholders.
# Replace both strings with your own values before running the next cells.
NAME = "Riverbot"
AUTHOR = "Riverbot"

In [9]:
# Print the current directory, then switch to the project root so the
# relative ./LLaMA-Factory and ./train paths used by later cells resolve.
!pwd
%cd ~/SageMaker/Easy_Fintune_LLM_using_SageMaker_with_LLama_Factory

/home/ec2-user/SageMaker/Easy_Fintune_LLM_using_SageMaker_with_LLama_Factory
/home/ec2-user/SageMaker/Easy_Fintune_LLM_using_SageMaker_with_LLama_Factory


In [10]:
import json

# Load LLaMA-Factory's identity template and fill in NAME / AUTHOR.
# The encoding is given explicitly so decoding does not depend on the
# machine's locale (matching the UTF-8 handling used for ruozhiba.json).
file_name = './LLaMA-Factory/data/identity.json'
with open(file_name, encoding='utf-8') as f:
    identity = json.load(f)
identity_2 = format_identity(identity,name=NAME,author=AUTHOR)

# Preview the first two formatted records.
identity_2[:2]

[{'instruction': 'hi',
  'input': '',
  'output': 'Hello! I am Riverbot, an AI assistant developed by Riverbot. How can I assist you today?'},
 {'instruction': 'hello',
  'input': '',
  'output': 'Hello! I am Riverbot, an AI assistant developed by Riverbot. How can I assist you today?'}]

In [11]:
# Store the customised identity records next to ruozhiba.json in ./train.
with open('./train/identity_2.json', mode='w') as out_file:
    json.dump(obj=identity_2, fp=out_file)

### 把数据copy至S3

In [12]:
# Dataset root inside the default bucket, and the train/ prefix beneath it.
s3_data_uri = "s3://{}/dataset-for-training".format(default_bucket)
training_input_path = s3_data_uri + '/train'

In [13]:
# Upload both local training files to the same S3 prefix.
for local_file in ("./train/ruozhiba.json", "./train/identity_2.json"):
    sagemaker.s3.S3Uploader.upload(
        local_path=local_file,
        desired_s3_uri=training_input_path,
        sagemaker_session=sagemaker_session,
    )

print(f"saving training dataset to: {training_input_path}")


saving training dataset to: s3://sagemaker-us-east-1-434444145045/dataset-for-training/train


### 下载基础模型到本地，并上传到s3

In [32]:
from huggingface_hub import snapshot_download
from pathlib import Path

# Hugging Face model repo to fine-tune; replace with another hub model id if needed.
model_name = "TechxGenus/Meta-Llama-3-8B-Instruct-AWQ"
# Local folder passed to snapshot_download as its cache directory.
local_model_path = Path("./Llama-3-8B-Instruct-AWQ")

local_model_path.mkdir(exist_ok=True)
# Last expression of the cell: the returned value is shown as the cell output.
snapshot_download(repo_id=model_name, cache_dir=local_model_path)

Fetching 10 files:   0%|          | 0/10 [00:00<?, ?it/s]

config.json:   0%|          | 0.00/885 [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/152 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/63.5k [00:00<?, ?B/s]

.gitattributes:   0%|          | 0.00/1.52k [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.68G [00:00<?, ?B/s]

README.md:   0%|          | 0.00/37.2k [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.05G [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/301 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.9k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

'Llama-3-8B-Instruct-AWQ/models--TechxGenus--Meta-Llama-3-8B-Instruct-AWQ/snapshots/129d90727841a07bcdb3173ed4165d1353b44386'

In [33]:
# Key prefix under the default bucket where the base model files are stored.
s3_model_prefix = "Meta-Llama-3-8B-Instruct-AWQ"  
# First directory matching <cache>/**/snapshots/*.
# NOTE(review): raises IndexError if nothing was downloaded, and picks an
# arbitrary entry if several snapshots exist — confirm only one is present.
model_snapshot_path = list(local_model_path.glob("**/snapshots/*"))[0]
s3_model_path =  f"s3://{default_bucket}/{s3_model_prefix}/"
# Sync the snapshot folder to the same S3 location that s3_model_path names.
!aws s3 sync {model_snapshot_path}/ s3://{default_bucket}/{s3_model_prefix}/
print(f"uploaded to s3_model_path:{s3_model_path}")

upload: Llama-3-8B-Instruct-AWQ/models--TechxGenus--Meta-Llama-3-8B-Instruct-AWQ/snapshots/129d90727841a07bcdb3173ed4165d1353b44386/.gitattributes to s3://sagemaker-us-east-1-434444145045/Meta-Llama-3-8B-Instruct-AWQ/.gitattributes
upload: Llama-3-8B-Instruct-AWQ/models--TechxGenus--Meta-Llama-3-8B-Instruct-AWQ/snapshots/129d90727841a07bcdb3173ed4165d1353b44386/generation_config.json to s3://sagemaker-us-east-1-434444145045/Meta-Llama-3-8B-Instruct-AWQ/generation_config.json
upload: Llama-3-8B-Instruct-AWQ/models--TechxGenus--Meta-Llama-3-8B-Instruct-AWQ/snapshots/129d90727841a07bcdb3173ed4165d1353b44386/tokenizer_config.json to s3://sagemaker-us-east-1-434444145045/Meta-Llama-3-8B-Instruct-AWQ/tokenizer_config.json
upload: Llama-3-8B-Instruct-AWQ/models--TechxGenus--Meta-Llama-3-8B-Instruct-AWQ/snapshots/129d90727841a07bcdb3173ed4165d1353b44386/special_tokens_map.json to s3://sagemaker-us-east-1-434444145045/Meta-Llama-3-8B-Instruct-AWQ/special_tokens_map.json
upload: Llama-3-8B-Instr

## 准备LLaMA-Factory 的 dataset info

In [34]:
import json

In [35]:
# Read LLaMA-Factory's dataset registry so the custom datasets can be added to it.
file_name = './LLaMA-Factory/data/dataset_info.json'
with open(file_name, mode='r') as registry_file:
    datainfo = json.load(registry_file)

In [36]:
# Point the 'identity' dataset entry at the customised identity file.
datainfo['identity'] = dict(file_name='identity_2.json')

In [37]:
# Register ruozhiba.json and map its instruction/input/output fields onto
# the prompt/query/response column names.
datainfo['ruozhiba'] = dict(
    file_name='ruozhiba.json',
    columns=dict(prompt="instruction", query="input", response="output"),
)

In [38]:
# Write the updated registry back over the original dataset_info.json.
with open('./LLaMA-Factory/data/dataset_info.json', mode='w') as registry_file:
    json.dump(datainfo, registry_file)

## 准备LLaMA-Factory 的 训练配置yaml文件
###  从LLaMA-Factory/examples/train_qlora/目录中复制出llama3_lora_sft_awq.yaml，并修改

In [39]:
import yaml

# Use LLaMA-Factory's AWQ QLoRA SFT example as the starting configuration.
file_name = './LLaMA-Factory/examples/train_qlora/llama3_lora_sft_awq.yaml'
with open(file_name, mode='r') as template_file:
    doc = yaml.safe_load(template_file)

# Display the loaded template.
doc

{'model_name_or_path': 'TechxGenus/Meta-Llama-3-8B-Instruct-AWQ',
 'stage': 'sft',
 'do_train': True,
 'finetuning_type': 'lora',
 'lora_target': 'all',
 'dataset': 'identity,alpaca_en_demo',
 'template': 'llama3',
 'cutoff_len': 1024,
 'max_samples': 1000,
 'overwrite_cache': True,
 'preprocessing_num_workers': 16,
 'output_dir': 'saves/llama3-8b/lora/sft',
 'logging_steps': 10,
 'save_steps': 500,
 'plot_loss': True,
 'overwrite_output_dir': True,
 'per_device_train_batch_size': 1,
 'gradient_accumulation_steps': 8,
 'learning_rate': 0.0001,
 'num_train_epochs': 3.0,
 'lr_scheduler_type': 'cosine',
 'warmup_ratio': 0.1,
 'bf16': True,
 'ddp_timeout': 180000000,
 'val_size': 0.1,
 'per_device_eval_batch_size': 1,
 'eval_strategy': 'steps',
 'eval_steps': 500}

In [40]:
# Model output directory on this notebook instance.
# To write the model there instead of /tmp, set doc['output_dir'] = save_dir.
save_dir = '/home/ec2-user/SageMaker/Easy_Fintune_LLM_using_SageMaker_with_LLama_Factory/finetuned_model'

# Overrides applied on top of the template ('lora_target' keeps the template's value).
# NOTE(review): the template also sets warmup_ratio; in Hugging Face
# TrainingArguments warmup_steps is documented to take precedence — confirm
# that is the intended schedule.
doc.update({
    # Output path used when training through SageMaker.
    'output_dir': '/tmp/finetuned_model',
    'per_device_train_batch_size': 1,
    'gradient_accumulation_steps': 8,
    'cutoff_len': 2048,
    'num_train_epochs': 3.0,
    'warmup_steps': 10,
    # Keep the experiment short: train on only 200 samples.
    'max_samples': 200,
    # Datasets to train on, as registered in dataset_info.json.
    'dataset': 'identity,ruozhiba',
})

### 保存为训练配置文件

In [41]:
# Write the customised configuration into the LLaMA-Factory folder.
sg_config = 'sg_config_qlora.yaml'
with open('./LLaMA-Factory/' + sg_config, mode='w') as config_file:
    yaml.safe_dump(doc, stream=config_file)

# Display the final configuration.
doc

{'model_name_or_path': 'TechxGenus/Meta-Llama-3-8B-Instruct-AWQ',
 'stage': 'sft',
 'do_train': True,
 'finetuning_type': 'lora',
 'lora_target': 'all',
 'dataset': 'identity,ruozhiba',
 'template': 'llama3',
 'cutoff_len': 2048,
 'max_samples': 200,
 'overwrite_cache': True,
 'preprocessing_num_workers': 16,
 'output_dir': '/tmp/finetuned_model',
 'logging_steps': 10,
 'save_steps': 500,
 'plot_loss': True,
 'overwrite_output_dir': True,
 'per_device_train_batch_size': 1,
 'gradient_accumulation_steps': 8,
 'learning_rate': 0.0001,
 'num_train_epochs': 3.0,
 'lr_scheduler_type': 'cosine',
 'warmup_ratio': 0.1,
 'bf16': True,
 'ddp_timeout': 180000000,
 'val_size': 0.1,
 'per_device_eval_batch_size': 1,
 'eval_strategy': 'steps',
 'eval_steps': 500,
 'warmup_steps': 10}

In [None]:
# NOTE(review): identical to the preceding save cell — it rewrites the same
# ./LLaMA-Factory/sg_config_qlora.yaml from the current `doc`.
sg_config = 'sg_config_qlora.yaml'
with open(f'./LLaMA-Factory/{sg_config}', 'w') as f:
    yaml.safe_dump(doc, f)
doc

## 本地GPU测试提交 Training job

### 由于我们的实验环境限制，无法提交Training Job，所以在本次实验是在notebook实例中进行训练
### 如果您在自己的AWS环境中，且有SageMaker Training Job 所需GPU实例的quota，则可以用如下代码提交，instance_type改成'ml.g5.2xlarge' 

```python
from sagemaker.estimator import Estimator
from sagemaker.pytorch import PyTorch
from datetime import datetime

instance_count = 1
instance_type = 'local_gpu' 
max_time = 3600*24

# Get the current time
current_time = datetime.now()

# wandb.sagemaker_auth(path="./")
# Format the current time as a string
formatted_time = current_time.strftime("%Y%m%d%H%M%S")
print(formatted_time)

base_job_name = 'llama3-8b-qlora-finetune'
environment = {
    "s3_model_path":s3_model_path,
    'NODE_NUMBER':str(instance_count),
    "s3_data_paths":f"{training_input_path}",
    "sg_config":sg_config,
    'OUTPUT_MODEL_S3_PATH': f's3://{default_bucket}/llama3-8b-qlora/', # destination
    'WANDB_DISABLED':"true"
}

estimator = PyTorch(entry_point='entry_single_lora.py',
                            source_dir='./LLaMA-Factory/',
                            role=role,
                            base_job_name=base_job_name,
                            environment=environment,
                            framework_version='2.3.0',
                            py_version='py311',
                            script_mode=True,
                            instance_count=instance_count,
                            instance_type=instance_type,
                            # enable_remote_debug=True,
                            # keep_alive_period_in_seconds=600,
                            max_run=max_time)

estimator.fit()

```


In [46]:
from sagemaker.estimator import Estimator
from sagemaker.pytorch import PyTorch
from datetime import datetime

# Single node. 'local_gpu' trains on this machine; another instance type
# such as ml.g5.2xlarge can be given instead.
instance_count = 1
instance_type = 'local_gpu'
max_time = 3600*24

# Print a timestamp (YYYYmmddHHMMSS) for this run.
current_time = datetime.now()
formatted_time = current_time.strftime("%Y%m%d%H%M%S")
print(formatted_time)

base_job_name = 'llama3-8b-qlora-finetune'

# Environment variables handed to the training container.
environment = dict(
    s3_model_path=s3_model_path,
    NODE_NUMBER=str(instance_count),
    s3_data_paths=training_input_path,
    sg_config=sg_config,
    OUTPUT_MODEL_S3_PATH=f's3://{default_bucket}/llama3-8b-qlora/',  # destination
    WANDB_DISABLED="true",
)

# PyTorch estimator running entry_single_lora.py from the LLaMA-Factory folder.
estimator = PyTorch(
    entry_point='entry_single_lora.py',
    source_dir='./LLaMA-Factory/',
    role=role,
    base_job_name=base_job_name,
    environment=environment,
    framework_version='2.3.0',
    py_version='py311',
    script_mode=True,
    instance_count=instance_count,
    instance_type=instance_type,
    max_run=max_time,
)

INFO:botocore.credentials:Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole


20241102112658


- 开始训练

In [47]:
# Start the training job; the container log is streamed into the cell output until the job completes.
estimator.fit()

INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker:Creating training-job with name: llama3-8b-qlora-finetune-2024-11-02-11-26-59-557
INFO:sagemaker.telemetry.telemetry_logging:SageMaker Python SDK will collect telemetry to help us better understand our user's needs, diagnose issues, and deliver additional features.
To opt out of telemetry, please disable via TelemetryOptOut parameter in SDK defaults config. For more information, refer to https://sagemaker.readthedocs.io/en/stable/overview.html#configuring-and-using-defaults-with-the-sagemaker-python-sdk.
INFO:sagemaker.local.image:'Docker Compose' is not installed. Proceeding to check for 'docker-compose' CLI.
INFO:sagemaker.local.image:'Docker Compose' found using Docker Compose CLI.
INFO:sagemaker.local.local_session:Starting training job
INFO:botocore.credentials:Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole
INFO:sagemaker.local.imag

 Container f4qy70hcyr-algo-1-une3w  Creating
 Container f4qy70hcyr-algo-1-une3w  Created
Attaching to f4qy70hcyr-algo-1-une3w
f4qy70hcyr-algo-1-une3w  |   "cipher": algorithms.TripleDES,
f4qy70hcyr-algo-1-une3w  |   "class": algorithms.TripleDES,
f4qy70hcyr-algo-1-une3w  | 2024-11-02 11:27:11,171 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training
f4qy70hcyr-algo-1-une3w  | 2024-11-02 11:27:11,194 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)
f4qy70hcyr-algo-1-une3w  | 2024-11-02 11:27:11,201 sagemaker-training-toolkit INFO     instance_groups entry not present in resource_config
f4qy70hcyr-algo-1-une3w  | 2024-11-02 11:27:11,204 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.
f4qy70hcyr-algo-1-une3w  | 2024-11-02 11:27:11,206 sagemaker_pytorch_container.training INFO     Invoking user training script.
f4qy70hcyr-algo-1-une3w  | 2024-11-02 11:27:12,358 botocore.cre

INFO:sagemaker.local.image:===== Job Complete =====


f4qy70hcyr-algo-1-une3w exited with code 0
Aborting on container exit...
 Container f4qy70hcyr-algo-1-une3w  Stopping
 Container f4qy70hcyr-algo-1-une3w  Stopped


## 至此步，本章节结束
- 模型已经在本地的training job上训练完成，并上传至s3

### 以下是可选步骤，直接在本地使用LLaMA-Factory cli进行训练
### 本地运行LLaMA-Factory cli

In [None]:
# Guard for the optional local-CLI section: stop here unless the user
# explicitly confirms, so "Run All" does not start a second training run.
confirm = input("Are you sure you want to continue? (y/n) ")
if confirm.strip().lower() not in ("y", "yes"):
    raise RuntimeError("Aborted by user: the optional local LLaMA-Factory steps were not confirmed.")

In [20]:
# Switch the working directory to LLaMA-Factory.
# Skipped when already inside it, so re-running this cell does not fail.
if os.path.basename(os.getcwd()) != 'LLaMA-Factory':
    os.chdir('LLaMA-Factory')

In [None]:
# Install LLaMA-Factory in editable mode without its dependencies.
# Running pip through the kernel's own interpreter installs into the
# environment this notebook uses, and check_call raises if the install fails.
import subprocess
import sys

subprocess.check_call([sys.executable, "-m", "pip", "install", "--no-deps", "-e", "."])

In [None]:
# Install LLaMA-Factory's requirements with the kernel's own interpreter;
# check_call raises if the install fails instead of silently returning a code.
import subprocess
import sys

subprocess.check_call([sys.executable, "-m", "pip", "install", "-r", "requirements.txt"])

In [30]:
# Download the datasets: make the s5cmd binary in this directory executable,
# then sync every file under the S3 training prefix into data/.
os.system("chmod +x ./s5cmd")
os.system(f"./s5cmd sync {training_input_path}/* data/")

cp s3://sagemaker-us-east-1-434444145045/dataset-for-training/train/identity_2.json data/identity_2.json
cp s3://sagemaker-us-east-1-434444145045/dataset-for-training/train/ruozhiba.json data/ruozhiba.json


0

### 启动训练
本次训练过程大约需要15分钟

In [None]:
# Restrict training to GPU 0 and launch the LLaMA-Factory CLI with the saved config.
DEVICES=0
os.system("CUDA_VISIBLE_DEVICES={} llamafactory-cli train {}".format(DEVICES, sg_config))

### 上传Lora模型文件至S3保存

In [18]:
# Upload the directory the training run actually wrote to. The saved config
# trains into doc['output_dir'], which is not save_dir unless the user
# changed it, so syncing save_dir could upload a missing or stale folder.
os.system("./s5cmd sync {0} {1}".format(doc['output_dir'], f's3://{default_bucket}/llama3-8b-qlora/'))

0

In [19]:
# Report the S3 prefix that holds the uploaded LoRA files.
print("Lora model file saved s3://{}/llama3-8b-qlora/".format(default_bucket))

Lora model file saved s3://sagemaker-us-east-1-434444145045/llama3-8b-qlora/
