From 2406022c2aeb0dd74552659724bc2792ba6e363b Mon Sep 17 00:00:00 2001
From: hmp <505030475@qq.com>
Date: Thu, 11 Apr 2024 22:00:07 +0800
Subject: [PATCH] add vllm support

---
 docs/use_vllm.md               | 46 ++++++++++++++++++++++++++++++++++
 request_llms/bridge_all.py     | 23 +++++++++++++++++
 request_llms/bridge_chatgpt.py |  9 +++++--
 3 files changed, 76 insertions(+), 2 deletions(-)
 create mode 100644 docs/use_vllm.md

diff --git a/docs/use_vllm.md b/docs/use_vllm.md
new file mode 100644
index 000000000..88abeb4c8
--- /dev/null
+++ b/docs/use_vllm.md
@@ -0,0 +1,46 @@
+# Using vLLM
+
+
+## 1. Start vLLM with a model of your choice
+
+```
+python -m vllm.entrypoints.openai.api_server --model /home/hmp/llm/cache/Qwen1___5-32B-Chat --tensor-parallel-size 2 --dtype=half
+```
+
+This example uses a local model stored at `/home/hmp/llm/cache/Qwen1___5-32B-Chat`; change the path to match your own setup.
+
+## 2. Test vLLM
+
+```
+curl http://localhost:8000/v1/chat/completions \
+-H "Content-Type: application/json" \
+-d '{
+    "model": "/home/hmp/llm/cache/Qwen1___5-32B-Chat",
+    "messages": [
+        {"role": "system", "content": "You are a helpful assistant."},
+        {"role": "user", "content": "How do I implement a decentralized controller?"}
+    ]
+}'
+```
+
+## 3. Configure this project
+
+```
+API_KEY = "sk-123456789xxxxxxxxxxxxxxxxxxxxxxxxxxxxxx123456789"
+LLM_MODEL = "vllm-/home/hmp/llm/cache/Qwen1___5-32B-Chat(max_token=4096)"
+API_URL_REDIRECT = {"https://api.openai.com/v1/chat/completions": "http://localhost:8000/v1/chat/completions"}
+```
+
+```
+"vllm-/home/hmp/llm/cache/Qwen1___5-32B-Chat(max_token=4096)"
+where
+    "vllm-"                                     is the prefix (required)
+    "/home/hmp/llm/cache/Qwen1___5-32B-Chat"    is the model name (required)
+    "(max_token=4096)"                          sets the maximum token count (optional)
+```
+
+## 4. Launch!
+
+```
+python main.py
+```
diff --git a/request_llms/bridge_all.py b/request_llms/bridge_all.py
index eabecd87b..d187ef1c1 100644
--- a/request_llms/bridge_all.py
+++ b/request_llms/bridge_all.py
@@ -784,6 +784,29 @@ def decode(self, *args, **kwargs):
             "token_cnt": get_token_num_gpt35,
         },
     })
+# -=-=-=-=-=-=- vllm alignment support -=-=-=-=-=-=-
+for model in [m for m in AVAIL_LLM_MODELS if m.startswith("vllm-")]:
+    # This interface allows flexible access to models served by vllm, e.g. AVAIL_LLM_MODELS = ["vllm-/home/hmp/llm/cache/Qwen1___5-32B-Chat(max_token=6666)"]
+    # where
+    #   "vllm-"                                     is the prefix (required)
+    #   "/home/hmp/llm/cache/Qwen1___5-32B-Chat"    is the model name (required)
+    #   "(max_token=6666)"                          sets the maximum token count (optional)
+    try:
+        _, max_token_tmp = read_one_api_model_name(model)
+    except:
+        print(f"The max_token setting of vllm model {model} is not an integer, please check the configuration file.")
+        continue
+    model_info.update({
+        model: {
+            "fn_with_ui": chatgpt_ui,
+            "fn_without_ui": chatgpt_noui,
+            "can_multi_thread": True,
+            "endpoint": openai_endpoint,
+            "max_token": max_token_tmp,
+            "tokenizer": tokenizer_gpt35,
+            "token_cnt": get_token_num_gpt35,
+        },
+    })
 
 
 # -=-=-=-=-=-=- azure model alignment support -=-=-=-=-=-=-
diff --git a/request_llms/bridge_chatgpt.py b/request_llms/bridge_chatgpt.py
index 1be5d4321..2e4ebe3e4 100644
--- a/request_llms/bridge_chatgpt.py
+++ b/request_llms/bridge_chatgpt.py
@@ -323,7 +323,10 @@ def generate_payload(inputs, llm_kwargs, history, system_prompt, stream):
     if not is_any_api_key(llm_kwargs['api_key']):
         raise AssertionError("You have provided an incorrect API_KEY.\n\n1. Temporary fix: type the api_key directly into the input area, then press Enter to submit.\n\n2. Permanent fix: configure it in config.py.")
 
-    api_key = select_api_key(llm_kwargs['api_key'], llm_kwargs['llm_model'])
+    if llm_kwargs['llm_model'].startswith('vllm-'):
+        api_key = 'no-api-key'
+    else:
+        api_key = select_api_key(llm_kwargs['api_key'], llm_kwargs['llm_model'])
 
     headers = {
         "Content-Type": "application/json",
@@ -365,7 +368,9 @@ def generate_payload(inputs, llm_kwargs, history, system_prompt, stream):
     if llm_kwargs['llm_model'].startswith('one-api-'):
         model = llm_kwargs['llm_model'][len('one-api-'):]
         model, _ = read_one_api_model_name(model)
-
+    if llm_kwargs['llm_model'].startswith('vllm-'):
+        model = llm_kwargs['llm_model'][len('vllm-'):]
+        model, _ = read_one_api_model_name(model)
     if model == "gpt-3.5-random": # pick randomly to get around openai rate limits
         model = random.choice([
             "gpt-3.5-turbo",
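
For reference, the `"vllm-<model>(max_token=N)"` naming convention used above can be illustrated with a short standalone sketch. The helper name `split_vllm_model_entry` below is hypothetical, and it only approximates what `read_one_api_model_name` is assumed to do (strip the optional trailing `(max_token=...)` suffix and return the bare model name plus the limit); it is not the project's actual implementation.

```
# Hypothetical sketch of the "vllm-<model>(max_token=N)" convention used in this patch.
# Not the project's read_one_api_model_name; for illustration only.
import re

def split_vllm_model_entry(entry: str, default_max_token: int = 4096):
    """Split e.g. 'vllm-/home/hmp/llm/cache/Qwen1___5-32B-Chat(max_token=4096)'
    into ('/home/hmp/llm/cache/Qwen1___5-32B-Chat', 4096)."""
    assert entry.startswith("vllm-"), "the 'vllm-' prefix is required"
    name = entry[len("vllm-"):]
    match = re.search(r"\(max_token=(\d+)\)$", name)
    if match:  # the '(max_token=...)' suffix is optional
        return name[:match.start()], int(match.group(1))
    return name, default_max_token

if __name__ == "__main__":
    print(split_vllm_model_entry("vllm-/home/hmp/llm/cache/Qwen1___5-32B-Chat(max_token=4096)"))
    # -> ('/home/hmp/llm/cache/Qwen1___5-32B-Chat', 4096)
```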