Skip to content

Commit

Permalink
feat: continuous batching with vLLM (#349)
Browse files Browse the repository at this point in the history
Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>

* feat: continuous batching

Signed-off-by: paperspace <29749331+aarnphm@users.noreply.github.com>

* chore: add changelog

Signed-off-by: paperspace <29749331+aarnphm@users.noreply.github.com>

* chore: add one shot generation

Signed-off-by: paperspace <29749331+aarnphm@users.noreply.github.com>

---------

Signed-off-by: paperspace <29749331+aarnphm@users.noreply.github.com>
Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>
  • Loading branch information
aarnphm committed Sep 14, 2023
1 parent e35e143 commit ad91079
Show file tree
Hide file tree
Showing 22 changed files with 340 additions and 236 deletions.
2 changes: 2 additions & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -60,12 +60,14 @@ repos:
openllm-client/src/openllm_client/pb.*|
.github/.*|
cz.py |
bench.py |
hatch_build.py
)$
additional_dependencies:
- click==8.1.3
- peft
- bentoml==1.1.1
- build==0.10.0
- transformers>=4.31.0
- pandas-stubs
- types-psutil
Expand Down
139 changes: 139 additions & 0 deletions bench.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
from __future__ import annotations
import asyncio
import json

import aiohttp

import openllm

async def send_request(url, prompt, session, model, **attrs):
    """Send a single generation request to *url* and print the response text.

    Builds the LLM config for *model* from **attrs via openllm's AutoConfig,
    POSTs the JSON payload through the shared aiohttp *session*, and echoes
    the prompt/generation pair to stdout.
    """
    request_headers = {'accept': 'application/json', 'Content-Type': 'application/json'}
    llm_config = openllm.AutoConfig.for_model(model).model_construct_env(**attrs).model_dump()
    payload = json.dumps({'prompt': prompt, 'llm_config': llm_config, 'adapter_name': None})
    async with session.post(url, headers=request_headers, data=payload) as resp:
        body = await resp.text()
        separator = '-' * 10
        print(separator + '\n\n prompt:', prompt, '\nGeneration:', body, '\n\n' + separator)

async def main():
    """Fire all benchmark prompts at the local generate_stream endpoint concurrently.

    Opens one shared aiohttp session and dispatches every prompt at once via
    asyncio.gather to exercise the server's continuous batching.
    """
    url = 'http://localhost:3000/v1/generate_stream'
    # Fixed batch of 114 benchmark prompts sent concurrently.
    prompts = [
        'What is the meaning of life?',
        'Explain the concept of quantum entanglement.',
        'Describe the process of photosynthesis.',
        'What are the benefits of regular exercise?',
        'How does the internet work?',
        'Discuss the impact of climate change on ecosystems.',
        'Explain the principles of supply and demand in economics.',
        'What is the history of the Roman Empire?',
        'Describe the structure of a cell.',
        'Discuss the pros and cons of renewable energy sources.',
        'Explain the theory of relativity.',
        'What is the role of DNA in genetics?',
        'Describe the art movement of the Renaissance.',
        'Discuss the causes of World War I.',
        'What are the major functions of the human brain?',
        'Explain the process of evolution by natural selection.',
        'Describe the cultural significance of the Great Wall of China.',
        'What is the impact of social media on society?',
        'Discuss the life and works of Shakespeare.',
        'Explain the concept of artificial intelligence.',
        'What are the different types of chemical reactions?',
        "Describe the structure of the Earth's atmosphere.",
        'Discuss the history of the civil rights movement.',
        'What are the economic implications of globalization?',
        'Explain the principles of good nutrition.',
        'Describe the major functions of the immune system.',
        'Discuss the impact of colonialism on Africa.',
        'What is the process of cellular respiration?',
        'Explain the importance of biodiversity.',
        'Discuss the causes and consequences of the Industrial Revolution.',
        'What are the fundamental principles of democracy?',
        'Describe the major components of a computer.',
        'Explain the concept of human rights.',
        'What is the role of enzymes in biological reactions?',
        'Discuss the history of space exploration.',
        'What are the ethical considerations in medical research?',
        'Describe the cultural significance of the Pyramids of Egypt.',
        'Explain the principles of classical physics.',
        'What is the impact of climate change on weather patterns?',
        'Discuss the major events of the American Revolution.',
        'What are the effects of pollution on the environment?',
        'Describe the process of protein synthesis.',
        'Explain the concept of sustainable agriculture.',
        'What is the history of the European Union?',
        'Discuss the impact of the Renaissance on art and culture.',
        'What are the key principles of marketing?',
        'Explain the structure of the periodic table.',
        'Describe the major types of renewable energy.',
        'Discuss the causes and consequences of the French Revolution.',
        'What is the role of the United Nations in international relations?',
        'Explain the principles of game theory in economics.',
        'What are the stages of human development?',
        'Describe the cultural significance of the Taj Mahal.',
        'Discuss the major themes in the works of Ernest Hemingway.',
        'What is the impact of automation on the workforce?',
        'Explain the concept of genetic engineering.',
        'What are the different types of chemical bonds?',
        "Describe the layers of the Earth's atmosphere.",
        "Discuss the history of the women's suffrage movement.",
        'What are the economic factors influencing consumer behavior?',
        'Explain the principles of conflict resolution.',
        'What is the role of neurotransmitters in the nervous system?',
        'Discuss the impact of colonialism on India.',
        'What is the process of mitosis?',
        'Explain the importance of water conservation.',
        'Describe the cultural significance of the Acropolis in Athens.',
        'Discuss the major philosophical ideas of Plato.',
        'What are the principles of investment in finance?',
        'Explain the structure of a virus.',
        'What is the history of the United Nations?',
        'Discuss the impact of technology on modern art.',
        'What are the key concepts in cognitive psychology?',
        'Describe the major types of non-renewable energy sources.',
        'Explain the causes and consequences of the Russian Revolution.',
        'What is the role of the World Health Organization in global health?',
        'Discuss the principles of ethics in business.',
        'What are the stages of the water cycle?',
        'Explain the concept of social justice.',
        'What is the impact of deforestation on climate change?',
        'Describe the process of meiosis.',
        'Discuss the cultural significance of the Sistine Chapel ceiling.',
        'What are the major themes in the novels of Jane Austen?',
        'Explain the role of branding in marketing.',
        'What is the history of the Internet?',
        'Discuss the impact of artificial intelligence on society.',
        'What are the principles of statistical analysis in research?',
        'Explain the structure of an atom.',
        'What is the significance of the Theory of Evolution by Charles Darwin?',
        'Describe the major types of renewable energy.',
        'Discuss the causes and consequences of the American Civil War.',
        'What is the role of the International Monetary Fund in global economics?',
        'Explain the principles of environmental conservation.',
        'What are the stages of the rock cycle?',
        'Describe the concept of cultural relativism.',
        'Discuss the major contributions of Leonardo da Vinci to art and science.',
        'What is the impact of globalization on cultural diversity?',
        'Explain the process of genetic inheritance.',
        'What are the different forms of government in the world?',
        'Describe the major types of pollution.',
        'Discuss the history of the labor movement.',
        'What are the principles of sustainable urban planning?',
        'Explain the role of hormones in the endocrine system.',
        'What is the cultural significance of the Great Barrier Reef?',
        'Discuss the major ideas of Friedrich Nietzsche.',
        'What is the impact of social media on political movements?',
        'Explain the concept of quantum computing.',
        'What are the principles of international diplomacy?',
        'Describe the major types of ocean ecosystems.',
        'Discuss the causes and consequences of the Cold War.',
        'What is the role of the World Trade Organization in global trade?',
        'Explain the principles of behavioral psychology.',
        'What are the stages of the nitrogen cycle?',
        'Describe the concept of cultural appropriation.',
        'Discuss the major works of Vincent van Gogh.',
    ]
    async with aiohttp.ClientSession() as session:
        # Iterate the prompts directly — the original `enumerate` produced an
        # index that was never used.
        await asyncio.gather(*[send_request(url, prompt, session, 'llama', max_new_tokens=4096, top_p=0.21) for prompt in prompts])

if __name__ == '__main__':
    # Script entry point: run the async benchmark driver to completion.
    asyncio.run(main())
3 changes: 3 additions & 0 deletions changelog.d/349.feat.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
Added support for continuous batching via vLLM

Current benchmarks show that 100 concurrent requests achieve around 1218 TPS on one A100 running meta-llama/Llama-2-13b-chat-hf
8 changes: 0 additions & 8 deletions openllm-core/src/openllm_core/_configuration.py
Original file line number Diff line number Diff line change
Expand Up @@ -465,7 +465,6 @@ class ModelSettings(t.TypedDict, total=False):

# meta
url: str
requires_gpu: bool
trust_remote_code: bool
service_name: NotRequired[str]
requirements: t.Optional[ListStr]
Expand Down Expand Up @@ -523,7 +522,6 @@ def default(cls) -> _ModelSettingsAttr:
'cpu': 'pt', 'nvidia.com/gpu': 'pt'
},
name_type='dasherize',
requires_gpu=False,
url='',
model_type='causal_lm',
trust_remote_code=False,
Expand All @@ -541,7 +539,6 @@ def default(cls) -> _ModelSettingsAttr:
architecture: str
default_backend: t.Dict[LiteralResourceSpec, LiteralBackend]
url: str
requires_gpu: bool
trust_remote_code: bool
service_name: str
requirements: t.Optional[ListStr]
Expand Down Expand Up @@ -737,8 +734,6 @@ def __attrs_init__(self, *args: t.Any, **attrs: t.Any) -> None:
'''The default backend to run LLM based on available accelerator. By default, it will be PyTorch (pt) for most models. For some models, such as Llama, it will use `vllm` or `flax`. It is a dictionary of key as the accelerator spec in k8s ('cpu', 'nvidia.com/gpu', 'amd.com/gpu', 'cloud-tpus.google.com/v2', ...) and the values as supported OpenLLM backend ('flax', 'tf', 'pt', 'vllm', 'ggml', 'mlc')'''
__openllm_url__: str = Field(None)
'''The resolved url for this LLMConfig.'''
__openllm_requires_gpu__: bool = Field(None)
'''Determines if this model is only available on GPU. By default it supports GPU and fallback to CPU.'''
__openllm_trust_remote_code__: bool = Field(None)
'''Whether to always trust remote code'''
__openllm_service_name__: str = Field(None)
Expand Down Expand Up @@ -932,7 +927,6 @@ class FalconConfig(openllm.LLMConfig):
__config__ = {
"name_type": "lowercase",
"trust_remote_code": True,
"requires_gpu": True,
"timeout": 3600000,
"url": "https://falconllm.tii.ae/",
"requirements": ["einops", "xformers", "safetensors"],
Expand Down Expand Up @@ -1108,8 +1102,6 @@ def __getitem__(self, item: t.Literal['default_backend']) -> t.Dict[LiteralResou
@overload
def __getitem__(self, item: t.Literal['url']) -> str: ...
@overload
def __getitem__(self, item: t.Literal['requires_gpu']) -> bool: ...
@overload
def __getitem__(self, item: t.Literal['trust_remote_code']) -> bool: ...
@overload
def __getitem__(self, item: t.Literal['service_name']) -> str: ...
Expand Down
10 changes: 7 additions & 3 deletions openllm-core/src/openllm_core/_typing_compat.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@

from .utils.lazy import VersionInfo

M = t.TypeVar('M', bound='t.Union[transformers.PreTrainedModel, transformers.Pipeline, transformers.TFPreTrainedModel, transformers.FlaxPreTrainedModel, vllm.LLMEngine, peft.PeftModel]')
M = t.TypeVar('M', bound='t.Union[transformers.PreTrainedModel, transformers.Pipeline, transformers.TFPreTrainedModel, transformers.FlaxPreTrainedModel, vllm.AsyncLLMEngine, peft.PeftModel]')
T = t.TypeVar('T', bound='t.Union[transformers.PreTrainedTokenizerFast, transformers.PreTrainedTokenizer, transformers.PreTrainedTokenizerBase]')

def get_literal_args(typ: t.Any) -> tuple[str, ...]:
Expand Down Expand Up @@ -94,7 +94,9 @@ class LLMRunnable(bentoml.Runnable, t.Generic[M, T]):
embeddings: RunnableMethod[LLMRunnable[M, T], [list[str]], EmbeddingsOutput]
generate: RunnableMethod[LLMRunnable[M, T], [str], list[t.Any]]
generate_one: RunnableMethod[LLMRunnable[M, T], [str, list[str]], t.Sequence[dict[t.Literal['generated_text'], str]]]
generate_iterator: RunnableMethod[LLMRunnable[M, T], [str], t.Generator[str, None, str]]
generate_iterator: RunnableMethod[LLMRunnable[M, T], [str], t.Iterator[t.Any]]
vllm_generate: RunnableMethod[LLMRunnable[M, T], [str], list[t.Any]]
vllm_generate_iterator: RunnableMethod[LLMRunnable[M, T], [str], t.AsyncGenerator[str, None]]

class LLMRunner(bentoml.Runner, t.Generic[M, T]):
__doc__: str
Expand All @@ -111,7 +113,9 @@ class LLMRunner(bentoml.Runner, t.Generic[M, T]):
embeddings: RunnerMethod[LLMRunnable[M, T], [list[str]], t.Sequence[EmbeddingsOutput]]
generate: RunnerMethod[LLMRunnable[M, T], [str], list[t.Any]]
generate_one: RunnerMethod[LLMRunnable[M, T], [str, list[str]], t.Sequence[dict[t.Literal['generated_text'], str]]]
generate_iterator: RunnerMethod[LLMRunnable[M, T], [str], t.Generator[str, None, str]]
generate_iterator: RunnerMethod[LLMRunnable[M, T], [str], t.Iterator[t.Any]]
vllm_generate: RunnerMethod[LLMRunnable[M, T], [str], list[t.Any]]
vllm_generate_iterator: RunnerMethod[LLMRunnable[M, T], [str], t.AsyncGenerator[str, None]]

def __init__(self,
runnable_class: type[LLMRunnable[M, T]],
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,6 @@ class BaichuanConfig(openllm_core.LLMConfig):
'name_type': 'lowercase',
'trust_remote_code': True,
'timeout': 3600000,
'requires_gpu': True,
'url': 'https://github.com/baichuan-inc/Baichuan-7B',
'requirements': ['cpm-kernels', 'sentencepiece'],
'architecture': 'BaiChuanForCausalLM',
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,6 @@ class ChatGLMConfig(openllm_core.LLMConfig):
'name_type': 'lowercase',
'trust_remote_code': True,
'timeout': 3600000,
'requires_gpu': True,
'url': 'https://github.com/THUDM/ChatGLM-6B',
'requirements': ['cpm-kernels', 'sentencepiece'],
'architecture': 'ChatGLMForConditionalGeneration',
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,6 @@ class FalconConfig(openllm_core.LLMConfig):
__config__ = {
'name_type': 'lowercase',
'trust_remote_code': True,
'requires_gpu': True,
'timeout': int(36e6),
'url': 'https://falconllm.tii.ae/',
'requirements': ['einops', 'xformers'],
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,6 @@ class GPTNeoXConfig(openllm_core.LLMConfig):
__config__ = {
'model_name': 'gpt_neox',
'start_name': 'gpt-neox',
'requires_gpu': True,
'architecture': 'GPTNeoXForCausalLM',
'url': 'https://github.com/EleutherAI/gpt-neox',
'default_id': 'eleutherai/gpt-neox-20b',
Expand Down
2 changes: 1 addition & 1 deletion openllm-core/src/openllm_core/config/configuration_opt.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
Run a LLMServer for OPT model.
\b
> See more information about falcon at [facebook/opt-66b](https://huggingface.co/facebook/opt-66b)
> See more information about OPT at [facebook/opt-66b](https://huggingface.co/facebook/opt-66b)
\b
## Usage
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,11 +36,9 @@ class StarCoderConfig(openllm_core.LLMConfig):
"""
__config__ = {
'name_type': 'lowercase',
'requires_gpu': True,
'url': 'https://github.com/bigcode-project/starcoder',
'architecture': 'GPTBigCodeForCausalLM',
'requirements': ['bitsandbytes'],
'workers_per_resource': 0.5,
'default_id': 'bigcode/starcoder',
'model_ids': ['bigcode/starcoder', 'bigcode/starcoderbase']
}
Expand Down
2 changes: 1 addition & 1 deletion openllm-python/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,7 @@ openai = ["openai", "tiktoken"]
opt = ["flax>=0.7", "jax", "jaxlib", "tensorflow", "keras"]
playground = ["jupyter", "notebook", "ipython", "jupytext", "nbformat"]
starcoder = ["bitsandbytes"]
vllm = ["vllm>=0.1.6", "ray"]
vllm = ["vllm>=0.1.7", "ray"]

[tool.hatch.version]
fallback-version = "0.0.0"
Expand Down
Loading

0 comments on commit ad91079

Please sign in to comment.