# Fine-tuning with LLaMA-Factory on SageMaker
# 2. 使用vLLM进行本地推理

## 安装依赖包

In [8]:
!pip install vllm==0.5.5 bitsandbytes

Collecting vllm==0.4.3
  Downloading vllm-0.4.3-cp310-cp310-manylinux1_x86_64.whl.metadata (7.8 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.43.1-py3-none-manylinux_2_24_x86_64.whl.metadata (2.2 kB)
Collecting transformers>=4.40.0 (from vllm==0.4.3)
  Downloading transformers-4.41.2-py3-none-any.whl.metadata (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.8/43.8 kB[0m [31m706.0 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting tokenizers>=0.19.1 (from vllm==0.4.3)
  Downloading tokenizers-0.19.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting fastapi (from vllm==0.4.3)
  Downloading fastapi-0.111.0-py3-none-any.whl.metadata (25 kB)
Collecting openai (from vllm==0.4.3)
  Downloading openai-1.35.4-py3-none-any.whl.metadata (21 kB)
Collecting prometheus-fastapi-instrumentator>=7.0.0 (from vllm==0.4.3)
  Downloading prometheus_fastapi_instrumentator-7.0.0-py3-none-any.whl.metadata (13 kB)
Collect

In [None]:
### 从 S3 下载模型文件到本地

In [9]:
import boto3
import pprint
from tqdm import tqdm
import sagemaker
# One SageMaker session supplies the AWS region and the default S3 bucket;
# the execution role is looked up separately. default_bucket is interpolated
# into the `aws s3 sync` command in the next cell.
sagemaker_session = sagemaker.Session()
region = sagemaker_session.boto_region_name
role = sagemaker.get_execution_role()
default_bucket = sagemaker_session.default_bucket()

In [10]:
# Sync everything under the llama3-8b-qlora/ prefix of the default SageMaker bucket
# into ./local_model; the bucket name is interpolated from the Python variable
# default_bucket defined in the previous cell.
!aws s3 sync s3://{default_bucket}/llama3-8b-qlora/ ./local_model

## 加载模型tokenizer

In [11]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

In [12]:
# Hugging Face Hub ID of the AWQ-quantized Llama-3-8B-Instruct base model.
model_id = "TechxGenus/Meta-Llama-3-8B-Instruct-AWQ"
# Tokenizer of the base model; later cells use its chat template to build prompts.
tokenizer = AutoTokenizer.from_pretrained(
    model_id,
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


## 加载sample数据，用于对比

In [14]:
from datasets import load_dataset
from random import randrange
# Dataset to sample from on the Hugging Face Hub.
# Alternative kept for reference: dataset_name = "zxbsmk/webnovel_cn"
dataset_name = "hfl/ruozhiba_gpt4"

# Load the train split at a fixed commit so the data does not change between runs.
train_dataset = load_dataset(
    dataset_name,
    split="train",
    revision="41d2c61beb86c8d4c61916cc656c39d018c40ce5",
)

Downloading data:   0%|          | 0.00/2.08M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [15]:

# Report the dataset size, then print one randomly chosen record as a sanity check.
print(f"Training size: {len(train_dataset)}")
print("\nTraining sample:\n")
# The record is drawn from the first `num_samples` rows. The bound is capped at the
# dataset length so a dataset with fewer than 200 rows cannot raise IndexError.
num_samples = 200
print(train_dataset[randrange(min(num_samples, len(train_dataset)))])

Training size: 4898

Training sample:

{'input': '', 'instruction': '烟盒上写着"吸烟有害健康" 和小视频网站上写着"缓冲"是不是一个性质', 'output': '这两个文本信息在性质上是不同的。烟盒上写着"吸烟有害健康"是一种健康警示，目的是告知消费者吸烟的潜在健康风险，起到提醒和警示作用。而小视频网站上写着"缓冲"是一种状态提示，告诉用户当前视频正在加载，以便用户了解当前的情况。这两种信息虽然都是向用户传达某种内容，但它们的性质和目的不同。前者涉及公共健康安全教育，后者则是用户体验中的技术状态提示。'}


In [16]:
# Local directory of the fine-tuned LoRA adapter, inside the ./local_model folder
# that the `aws s3 sync` cell writes to. Passed to LoRARequest further below.
sql_lora_path = './local_model/finetuned_model'

## 使用本地的vLLM部署

In [17]:
from vllm.lora.request import LoRARequest
from vllm import LLM,SamplingParams
from transformers import AutoModelForCausalLM, AutoTokenizer


In [18]:
# Base model served by vLLM in the next cell. This repeats the model_id/tokenizer
# definitions from the tokenizer-loading cell above with the same values.
model_id = "TechxGenus/Meta-Llama-3-8B-Instruct-AWQ"
tokenizer = AutoTokenizer.from_pretrained(
    model_id,
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [19]:
# Start an in-process vLLM engine for the base model with a 4096-token context
# limit. enable_lora=True allows a LoRA adapter to be passed to generate() later.
llm = LLM(
    model=model_id,
    max_model_len=4096,
    enable_lora=True,
)



config.json:   0%|          | 0.00/885 [00:00<?, ?B/s]

INFO 06-26 14:50:32 llm_engine.py:161] Initializing an LLM engine (v0.4.3) with config: model='TechxGenus/Meta-Llama-3-8B-Instruct-AWQ', speculative_config=None, tokenizer='TechxGenus/Meta-Llama-3-8B-Instruct-AWQ', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, rope_scaling=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=4096, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, disable_custom_all_reduce=False, quantization=awq, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), seed=0, served_model_name=TechxGenus/Meta-Llama-3-8B-Instruct-AWQ)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


generation_config.json:   0%|          | 0.00/152 [00:00<?, ?B/s]

INFO 06-26 14:50:34 weight_utils.py:207] Using model weights format ['*.safetensors']


model-00002-of-00002.safetensors:   0%|          | 0.00/1.05G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.68G [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/63.5k [00:00<?, ?B/s]

INFO 06-26 14:50:47 model_runner.py:146] Loading model weights took 5.3479 GB
INFO 06-26 14:50:50 gpu_executor.py:83] # GPU blocks: 6586, # CPU blocks: 2048
INFO 06-26 14:50:53 model_runner.py:854] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 06-26 14:50:53 model_runner.py:858] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 06-26 14:51:01 model_runner.py:924] Graph capturing finished in 8 secs.


In [24]:
# Test message 1
messages = [
    {"role": "system", "content": "请始终用中文回答"},
    {"role": "user", "content": "你是谁？你是干嘛的"},
]

# Test message 2 (uncomment to use it instead of message 1)
# messages = [
#     {"role": "system", "content":"请始终用中文回答"},
#      {"role": "user", "content": "睡觉时被女鬼压床我该怎么办？"},
# ]

# Render the conversation with the tokenizer's chat template. tokenize=False returns
# the prompt as text, and add_generation_prompt=True appends the assistant header so
# the model continues with its reply.
inputs = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

### 使用原始模型进行推理

In [25]:
# Low-temperature sampling (0.1) with top-p 0.95, limited to 512 generated tokens.
sampling_params = SamplingParams(temperature=0.1, top_p=0.95, max_tokens=512)

# Baseline run: the base model only, with no LoRA request attached.
outputs = llm.generate(inputs, sampling_params)
for request_output in outputs:
    # Print the first completion of each request; !r keeps newlines and special
    # tokens visible in the printed text.
    print(f"Prompt:\n{request_output.prompt!r}")
    print(f"Response:\n{request_output.outputs[0].text!r}")


Processed prompts: 100%|██████████| 1/1 [00:02<00:00,  2.13s/it, Generation Speed: 66.21 toks/s]

Prompt:
'<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n请始终用中文回答<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n你是谁？你是干嘛的<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n'
Response:
'我是 LLaMA，一个由 Meta 开发的基于人工智能的语言模型。我可以理解和生成自然语言，帮助用户回答问题、完成任务、甚至进行对话。我的能力包括：\n\n* 理解和生成文本\n* 回答问题\n* 提供信息\n* 完成任务\n* 对话\n\n我可以用来帮助用户完成各种任务，例如：\n\n* 提供信息和知识\n* 完成文本生成任务\n* 对话和聊天\n* 翻译和语言处理\n\n我是一个机器人，我的目的是帮助用户更好地使用语言和信息，提高沟通效率和质量。'





### 加载 LoRA 进行推理

In [26]:
# Path of the fine-tuned LoRA adapter handed to LoRARequest in the next cell
# (same value as the earlier sql_lora_path assignment).
sql_lora_path = './local_model/finetuned_model'

In [27]:
# Same prompt and sampling settings as the base-model run, but with the fine-tuned
# LoRA adapter attached to this request (adapter name "adapter", integer id 1).
lora_request = LoRARequest("adapter", 1, sql_lora_path)
outputs = llm.generate(inputs, sampling_params, lora_request=lora_request)

# Print each prompt/response pair; !r shows newlines and special tokens verbatim.
for result in outputs:
    prompt_text, response_text = result.prompt, result.outputs[0].text
    print(f"Prompt:\n{prompt_text!r}")
    print(f"Response:\n{response_text!r}")

Processed prompts: 100%|██████████| 1/1 [00:00<00:00,  1.82it/s, Generation Speed: 62.11 toks/s]

Prompt:
'<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n请始终用中文回答<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n你是谁？你是干嘛的<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n'
Response:
'您好，我是 RiverBot，一个由 GOGOGO 开发的人工智能助手，我可以回答各种问题，提供实用的建议和帮助。'



