feat: serve adapter layers (#52)
aarnphm committed Jun 23, 2023
1 parent 5981e49 commit dfca956
Showing 33 changed files with 1,895 additions and 495 deletions.
2 changes: 2 additions & 0 deletions .gitattributes
@@ -0,0 +1,2 @@
nightly-requirements.txt linguist-generated=true
* text=auto eol=lf
7 changes: 3 additions & 4 deletions README.md
@@ -346,10 +346,9 @@ async def prompt(input_text: str) -> str:

OpenLLM seamlessly integrates with Hugging Face Agents.

> **Warning** The Hugging Face Agent is still in the experimental stage. It is
> recommended to OpenLLM with
> `pip install -r nightly-requirements.generated.txt` to get the latest API
> update for Hugging Face agent.
> **Warning** The HuggingFace Agent is still in the experimental stage. It is
> recommended to install OpenLLM with `pip install -r nightly-requirements.txt`
> to get the latest API updates for the HuggingFace agent.
```python
import transformers
45 changes: 45 additions & 0 deletions changelog.d/52.feature.md
@@ -0,0 +1,45 @@
#### Serving LLMs with fine-tuned LoRA and QLoRA adapter layers

The given fine-tuned weights can then be served with the model via
`openllm start`:

```bash
openllm start opt --model-id facebook/opt-6.7b --adapter-id /path/to/adapters
```

If you just wish to try a pretrained adapter checkpoint, you can pass its
Hugging Face Hub ID to `--adapter-id`:

```bash
openllm start opt --model-id facebook/opt-6.7b --adapter-id aarnphm/opt-6.7b-lora
```

To serve multiple adapters, pass `--adapter-id` multiple times, optionally
naming each layer with the `<adapter-id>:<adapter-name>` format:

```bash
openllm start opt --model-id facebook/opt-6.7b --adapter-id aarnphm/opt-6.7b-lora --adapter-id aarnphm/opt-6.7b-lora:french_lora
```

By default, the first `adapter-id` will be the default LoRA layer, but users
can optionally change which LoRA layer to use for inference via
`/v1/adapters`:

```bash
curl -X POST http://localhost:3000/v1/adapters --json '{"adapter_name": "vn_lora"}'
```
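The same switch can also be made from Python; below is a minimal sketch using
the `requests` library, assuming a server listening on `localhost:3000` as in
the curl example above:

```python
import requests

# Switch the active LoRA layer to "vn_lora" for subsequent inference requests.
response = requests.post(
    "http://localhost:3000/v1/adapters",
    json={"adapter_name": "vn_lora"},
)
response.raise_for_status()
```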

> Note that when using multiple `adapter-name` and `adapter-id` pairs, it is
> recommended to switch back to the default adapter before sending inference
> requests, to avoid any performance degradation.

To include the adapters in the Bento, one can also provide `--adapter-id` to
`openllm build`:

```bash
openllm build opt --model-id facebook/opt-6.7b --adapter-id ...
```

### Rework

The configuration builder has been separated out, making it more flexible for
future configuration generation.
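
As a rough, hypothetical illustration of that direction (the names below are
not the actual OpenLLM internals), a separated builder keeps field
accumulation apart from the final configuration generation:

```python
from __future__ import annotations

from dataclasses import dataclass, field
from typing import Any


@dataclass
class ConfigBuilder:
    """Hypothetical sketch: accumulate fields, then generate a config mapping."""

    _fields: dict[str, Any] = field(default_factory=dict)

    def with_field(self, key: str, value: Any) -> ConfigBuilder:
        self._fields[key] = value
        return self

    def build(self) -> dict[str, Any]:
        # Generation lives in one place, so new output targets can be added
        # later without touching the accumulation API.
        return dict(self._fields)


config = (
    ConfigBuilder()
    .with_field("model_id", "facebook/opt-6.7b")
    .with_field("adapter_id", "aarnphm/opt-6.7b-lora")
    .build()
)
```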


4 changes: 2 additions & 2 deletions pyproject.toml
@@ -58,15 +58,15 @@ requires-python = ">=3.8"
# NOTE: Don't modify project.optional-dependencies
# as it is managed by ./tools/update-optional-dependencies.py
[project.optional-dependencies]
agents = ["transformers[agents]", "diffusers", "soundfile"]
agents = ["transformers[agents]>=4.30", "diffusers", "soundfile"]
all = [
"openllm[chatglm]",
"openllm[starcoder]",
"openllm[falcon]",
"openllm[agents]",
"openllm[flan-t5]",
"openllm[fine-tune]",
"openllm[openai]",
"openllm[flan-t5]",
]
chatglm = ["cpm_kernels", "sentencepiece"]
falcon = ["einops", "xformers", "safetensors"]
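For reference, these optional dependency groups use pip's extras syntax; the
`agents` extra (which now requires `transformers[agents]>=4.30`) would
typically be installed as:

```bash
pip install "openllm[agents]"
```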
20 changes: 20 additions & 0 deletions src/openllm/__init__.py
@@ -26,7 +26,9 @@
from __future__ import annotations

import logging
import os
import typing as t
import warnings

from . import utils as utils
from .__about__ import __version__ as __version__
@@ -39,6 +41,24 @@

utils.configure_logging()
logging.basicConfig(level=logging.NOTSET)
else:
# configuration for bitsandbytes before import
os.environ["BITSANDBYTES_NOWELCOME"] = os.environ.get("BITSANDBYTES_NOWELCOME", "1")
# The following warnings come from bitsandbytes and are probably not that
# important for users to see when DEBUG is False
warnings.filterwarnings(
"ignore", message="MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization"
)
warnings.filterwarnings(
"ignore", message="MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization"
)
warnings.filterwarnings(
"ignore",
message=(
"The installed version of bitsandbytes was compiled without GPU support. 8-bit optimizers and GPU quantization"
" are unavailable."
),
)


_import_structure = {
