# Using LLama Factory finetune on SageMaker 
# 2. 使用vLLM进行本地推理

## 安装依赖包

In [8]:
!pip install -q vllm==0.6.1 bitsandbytes transformers==4.45.2

### 从s3下载模型文件到本地

In [1]:
import boto3
import pprint
from tqdm import tqdm
import sagemaker
sagemaker_session =  sagemaker.session.Session() #sagemaker.session.Session()
region = sagemaker_session.boto_region_name
role = sagemaker.get_execution_role()
default_bucket = sagemaker_session.default_bucket()

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


In [3]:
!aws s3 sync s3://{default_bucket}/llama3-8b-qlora/ ./local_model

download: s3://sagemaker-us-east-1-901658937252/llama3-8b-qlora/finetuned_model/adapter_config.json to local_model/finetuned_model/adapter_config.json
download: s3://sagemaker-us-east-1-901658937252/llama3-8b-qlora/finetuned_model/all_results.json to local_model/finetuned_model/all_results.json
download: s3://sagemaker-us-east-1-901658937252/llama3-8b-qlora/finetuned_model/README.md to local_model/finetuned_model/README.md
download: s3://sagemaker-us-east-1-901658937252/llama3-8b-qlora/finetuned_model/checkpoint-96/README.md to local_model/finetuned_model/checkpoint-96/README.md
download: s3://sagemaker-us-east-1-901658937252/llama3-8b-qlora/finetuned_model/checkpoint-96/scheduler.pt to local_model/finetuned_model/checkpoint-96/scheduler.pt
download: s3://sagemaker-us-east-1-901658937252/llama3-8b-qlora/finetuned_model/checkpoint-96/adapter_config.json to local_model/finetuned_model/checkpoint-96/adapter_config.json
download: s3://sagemaker-us-east-1-901658937252/llama3-8b-qlora/finetu

## 加载模型tokenizer

In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

In [3]:
model_id = 'TechxGenus/Meta-Llama-3-8B-Instruct-AWQ'
tokenizer = AutoTokenizer.from_pretrained(model_id)

## 加载sample数据，用于对比

In [4]:
from datasets import load_dataset
from random import randrange
# dataset_name = "zxbsmk/webnovel_cn"
dataset_name = "hfl/ruozhiba_gpt4"
# Load dataset from the hub
train_dataset = load_dataset(dataset_name, split="train",revision='41d2c61beb86c8d4c61916cc656c39d018c40ce5')

In [5]:

print(f"Training size: {len(train_dataset)}")
print("\nTraining sample:\n")
num_samples = 200
print(train_dataset[randrange(num_samples)])

Training size: 4898

Training sample:

{'input': '', 'instruction': '明天学校要办运动会了，但是我感觉学校根本就是自不量力！ 学校里只有400m的跑道，怎么会有1000米2000米的比赛呢', 'output': '虽然学校只有400米的跑道，但这并不意味着无法进行1000米或2000米的比赛。实际上，这种情况非常常见。通常来说，比赛选手在400米的跑道上完成多圈即可，比如1000米比赛跑2.5圈，2000米比赛则跑5圈。将跑道的距离累积起来达到预定的比赛距离，是学校和其他赛事组织部门常用的方式。这种安排方式也有利于观众集中在一个区域观看比赛而不需要移动位置。'}


In [6]:
sql_lora_path = './local_model/finetuned_model'

## 使用本地的vLLM部署

In [7]:
from vllm.lora.request import LoRARequest
from vllm import LLM,SamplingParams
from transformers import AutoModelForCausalLM, AutoTokenizer




In [8]:
from pathlib import Path
local_model_path = Path("./Llama-3-8B-Instruct-AWQ")
model_snapshot_path = list(local_model_path.glob("**/snapshots/*"))[0]


In [13]:
model_id = 'TechxGenus/Meta-Llama-3-8B-Instruct-AWQ'
tokenizer = AutoTokenizer.from_pretrained(model_id)

In [14]:
llm = LLM(model=model_id,max_model_len=4096,enable_lora=True,download_dir=str(model_snapshot_path))

INFO 11-02 14:46:30 awq_marlin.py:89] The model is convertible to awq_marlin during runtime. Using awq_marlin kernel.
INFO 11-02 14:46:31 llm_engine.py:232] Initializing an LLM engine (v0.6.1) with config: model='TechxGenus/Meta-Llama-3-8B-Instruct-AWQ', speculative_config=None, tokenizer='TechxGenus/Meta-Llama-3-8B-Instruct-AWQ', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=4096, download_dir='Llama-3-8B-Instruct-AWQ/models--TechxGenus--Meta-Llama-3-8B-Instruct-AWQ/snapshots/129d90727841a07bcdb3173ed4165d1353b44386', load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=awq_marlin, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability

model-00001-of-00002.safetensors:  16%|#5        | 744M/4.68G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:  71%|#######   | 744M/1.05G [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/63.5k [00:00<?, ?B/s]

Loading safetensors checkpoint shards:   0% Completed | 0/2 [00:00<?, ?it/s]


INFO 11-02 14:52:52 model_runner.py:1008] Loading model weights took 5.3492 GB
INFO 11-02 14:52:59 gpu_executor.py:122] # GPU blocks: 6466, # CPU blocks: 2048
INFO 11-02 14:53:02 model_runner.py:1309] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 11-02 14:53:02 model_runner.py:1313] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 11-02 14:53:27 model_runner.py:1428] Graph capturing finished in 25 secs.


In [15]:
#测试第一个消息
messages = [
    {"role": "system", "content":"请始终用中文回答"},
     {"role": "user", "content": "你是谁？你是干嘛的"},
]

#测试第二个消息
# messages = [
#     {"role": "system", "content":"请始终用中文回答"},
#      {"role": "user", "content": "睡觉时被女鬼压床我该怎么办？"},
# ]


inputs = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True
)

### 使用原始模型进行推理

In [16]:
sampling_params = SamplingParams(temperature=0.1, top_p=0.95,max_tokens=512)

outputs = llm.generate(inputs, sampling_params)
for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs[0].text
    print(f"Prompt:\n{prompt!r}")
    print(f"Response:\n{generated_text!r}")


Processed prompts: 100%|██████████| 1/1 [00:01<00:00,  1.81s/it, est. speed input: 16.57 toks/s, output: 71.25 toks/s]

Prompt:
'<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n请始终用中文回答<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n你是谁？你是干嘛的<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n'
Response:
'我是 LLaMA，一个由 Meta AI 开发的语言模型。我是一个人工智能语言模型，旨在与用户交谈，回答问题，提供信息和帮助。我的能力包括：\n\n* 回答问题：我可以回答各种问题，包括历史、科学、技术、文化、娱乐等领域。\n* 提供信息：我可以提供相关信息，帮助用户了解某个主题或问题。\n* 对话：我可以与用户进行对话，回答问题，提供建议和帮助。\n\n我是一个机器人，旨在帮助用户获取信息，解决问题和提高语言能力。'





### 加载Lora进行推理

In [17]:
sql_lora_path = './local_model/finetuned_model'

In [18]:
outputs = llm.generate(inputs, sampling_params,lora_request=LoRARequest("adapter", 1, sql_lora_path))

# Print the outputs.
for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs[0].text
    print(f"Prompt:\n{prompt!r}")
    print(f"Response:\n{generated_text!r}")

  outputs = llm.generate(inputs, sampling_params,lora_request=LoRARequest("adapter", 1, sql_lora_path))
Processed prompts: 100%|██████████| 1/1 [00:01<00:00,  1.20s/it, est. speed input: 25.11 toks/s, output: 28.45 toks/s]

Prompt:
'<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n请始终用中文回答<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n你是谁？你是干嘛的<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n'
Response:
'您好，我是 Riverbot，一个由 Riverbot 开发的人工智能助手。我可以回答各种问题、提供信息和解决方案来帮助用户。'



