From 2406022c2aeb0dd74552659724bc2792ba6e363b Mon Sep 17 00:00:00 2001
From: hmp <505030475@qq.com>
Date: Thu, 11 Apr 2024 22:00:07 +0800
Subject: [PATCH] add vllm support

---
 docs/use_vllm.md               | 46 ++++++++++++++++++++++++++++++++++
 request_llms/bridge_all.py     | 23 +++++++++++++++++
 request_llms/bridge_chatgpt.py |  9 +++++--
 3 files changed, 76 insertions(+), 2 deletions(-)
 create mode 100644 docs/use_vllm.md

diff --git a/docs/use_vllm.md b/docs/use_vllm.md
new file mode 100644
index 000000000..88abeb4c8
--- /dev/null
+++ b/docs/use_vllm.md
@@ -0,0 +1,46 @@
+# Using vLLM
+
+
+## 1. Start vLLM with a model of your choice
+
+```
+python -m vllm.entrypoints.openai.api_server --model /home/hmp/llm/cache/Qwen1___5-32B-Chat --tensor-parallel-size 2 --dtype=half
+```
+
+This example uses a local model stored at `/home/hmp/llm/cache/Qwen1___5-32B-Chat`; change the path to match your own setup.
+
+## 2. Test vLLM
+
+```
+curl http://localhost:8000/v1/chat/completions \
+-H "Content-Type: application/json" \
+-d '{
+    "model": "/home/hmp/llm/cache/Qwen1___5-32B-Chat",
+    "messages": [
+        {"role": "system", "content": "You are a helpful assistant."},
+        {"role": "user", "content": "How do I implement a decentralized controller?"}
+    ]
+}'
+```
+
+## 3. Configure this project
+
+```
+API_KEY = "sk-123456789xxxxxxxxxxxxxxxxxxxxxxxxxxxxxx123456789"
+LLM_MODEL = "vllm-/home/hmp/llm/cache/Qwen1___5-32B-Chat(max_token=4096)"
+API_URL_REDIRECT = {"https://api.openai.com/v1/chat/completions": "http://localhost:8000/v1/chat/completions"}
+```
+
+```
+"vllm-/home/hmp/llm/cache/Qwen1___5-32B-Chat(max_token=4096)"
+where
+    "vllm-"                                     is the prefix (required)
+    "/home/hmp/llm/cache/Qwen1___5-32B-Chat"    is the model name (required)
+    "(max_token=4096)"                          sets the maximum token count (optional)
+```
+
+## 4. Launch!
+
+```
+python main.py
+```
diff --git a/request_llms/bridge_all.py b/request_llms/bridge_all.py
index eabecd87b..d187ef1c1 100644
--- a/request_llms/bridge_all.py
+++ b/request_llms/bridge_all.py
@@ -784,6 +784,29 @@ def decode(self, *args, **kwargs):
             "token_cnt": get_token_num_gpt35,
         },
     })
+# -=-=-=-=-=-=- vllm alignment support -=-=-=-=-=-=-
+for model in [m for m in AVAIL_LLM_MODELS if m.startswith("vllm-")]:
+    # This interface allows flexible access to models served by vllm, e.g. AVAIL_LLM_MODELS = ["vllm-/home/hmp/llm/cache/Qwen1___5-32B-Chat(max_token=6666)"]
+    # where
+    #   "vllm-"                                     is the prefix (required)
+    #   "/home/hmp/llm/cache/Qwen1___5-32B-Chat"    is the model name (required)
+    #   "(max_token=6666)"                          sets the maximum token count (optional)
+    try:
+        _, max_token_tmp = read_one_api_model_name(model)
+    except:
+        print(f"The max_token setting of vllm model {model} is not an integer, please check the configuration file.")
+        continue
+    model_info.update({
+        model: {
+            "fn_with_ui": chatgpt_ui,
+            "fn_without_ui": chatgpt_noui,
+            "can_multi_thread": True,
+            "endpoint": openai_endpoint,
+            "max_token": max_token_tmp,
+            "tokenizer": tokenizer_gpt35,
+            "token_cnt": get_token_num_gpt35,
+        },
+    })
 
 
 # -=-=-=-=-=-=- azure model alignment support -=-=-=-=-=-=-
diff --git a/request_llms/bridge_chatgpt.py b/request_llms/bridge_chatgpt.py
index 1be5d4321..2e4ebe3e4 100644
--- a/request_llms/bridge_chatgpt.py
+++ b/request_llms/bridge_chatgpt.py
@@ -323,7 +323,10 @@ def generate_payload(inputs, llm_kwargs, history, system_prompt, stream):
     if not is_any_api_key(llm_kwargs['api_key']):
         raise AssertionError("You have provided an incorrect API_KEY.\n\n1. Temporary fix: type the api_key directly into the input area, then press Enter to submit.\n\n2. Permanent fix: configure it in config.py.")
 
-    api_key = select_api_key(llm_kwargs['api_key'], llm_kwargs['llm_model'])
+    if llm_kwargs['llm_model'].startswith('vllm-'):
+        api_key = 'no-api-key'
+    else:
+        api_key = select_api_key(llm_kwargs['api_key'], llm_kwargs['llm_model'])
 
     headers = {
         "Content-Type": "application/json",
@@ -365,7 +368,9 @@ def generate_payload(inputs, llm_kwargs, history, system_prompt, stream):
     if llm_kwargs['llm_model'].startswith('one-api-'):
         model = llm_kwargs['llm_model'][len('one-api-'):]
         model, _ = read_one_api_model_name(model)
-
+    if llm_kwargs['llm_model'].startswith('vllm-'):
+        model = llm_kwargs['llm_model'][len('vllm-'):]
+        model, _ = read_one_api_model_name(model)
     if model == "gpt-3.5-random": # pick randomly to get around openai rate limits
         model = random.choice([
             "gpt-3.5-turbo",
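
For reference, the `"vllm-<model>(max_token=N)"` naming convention used above can be illustrated with a short standalone sketch. The helper name `split_vllm_model_entry` below is hypothetical, and it only approximates what `read_one_api_model_name` is assumed to do (strip the optional trailing `(max_token=...)` suffix and return the bare model name plus the limit); it is not the project's actual implementation.

```
# Hypothetical sketch of the "vllm-<model>(max_token=N)" convention used in this patch.
# Not the project's read_one_api_model_name; for illustration only.
import re

def split_vllm_model_entry(entry: str, default_max_token: int = 4096):
    """Split e.g. 'vllm-/home/hmp/llm/cache/Qwen1___5-32B-Chat(max_token=4096)'
    into ('/home/hmp/llm/cache/Qwen1___5-32B-Chat', 4096)."""
    assert entry.startswith("vllm-"), "the 'vllm-' prefix is required"
    name = entry[len("vllm-"):]
    match = re.search(r"\(max_token=(\d+)\)$", name)
    if match:  # the '(max_token=...)' suffix is optional
        return name[:match.start()], int(match.group(1))
    return name, default_max_token

if __name__ == "__main__":
    print(split_vllm_model_entry("vllm-/home/hmp/llm/cache/Qwen1___5-32B-Chat(max_token=4096)"))
    # -> ('/home/hmp/llm/cache/Qwen1___5-32B-Chat', 4096)
```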