Skip to content

Commit

Permalink
feat: continuous batching with vLLM (#349)
Browse files Browse the repository at this point in the history
Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>

* feat: continuous batching

Signed-off-by: paperspace <29749331+aarnphm@users.noreply.github.com>

* chore: add changelog

Signed-off-by: paperspace <29749331+aarnphm@users.noreply.github.com>

* chore: add one shot generation

Signed-off-by: paperspace <29749331+aarnphm@users.noreply.github.com>

---------

Signed-off-by: paperspace <29749331+aarnphm@users.noreply.github.com>
Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>
  • Loading branch information
aarnphm committed Sep 14, 2023
1 parent e35e143 commit ad91079
Show file tree
Hide file tree
Showing 22 changed files with 340 additions and 236 deletions.
2 changes: 2 additions & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -60,12 +60,14 @@ repos:
openllm-client/src/openllm_client/pb.*|
.github/.*|
cz.py |
bench.py |
hatch_build.py
)$
additional_dependencies:
- click==8.1.3
- peft
- bentoml==1.1.1
- build==0.10.0
- transformers>=4.31.0
- pandas-stubs
- types-psutil
Expand Down
139 changes: 139 additions & 0 deletions bench.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
from __future__ import annotations
import asyncio
import json

import aiohttp

import openllm

async def send_request(url, prompt, session, model, **attrs):
    """Send a single generation request to *url* and print the response text.

    Builds the LLM config for *model* from **attrs via openllm's AutoConfig,
    POSTs the JSON payload through the shared aiohttp *session*, and echoes
    the prompt/generation pair to stdout.
    """
    request_headers = {'accept': 'application/json', 'Content-Type': 'application/json'}
    llm_config = openllm.AutoConfig.for_model(model).model_construct_env(**attrs).model_dump()
    payload = json.dumps({'prompt': prompt, 'llm_config': llm_config, 'adapter_name': None})
    async with session.post(url, headers=request_headers, data=payload) as resp:
        body = await resp.text()
        separator = '-' * 10
        print(separator + '\n\n prompt:', prompt, '\nGeneration:', body, '\n\n' + separator)

async def main():
    """Fire all benchmark prompts at the local generate_stream endpoint concurrently.

    Opens one shared aiohttp session and dispatches every prompt at once via
    asyncio.gather to exercise the server's continuous batching.
    """
    url = 'http://localhost:3000/v1/generate_stream'
    # Fixed batch of 114 benchmark prompts sent concurrently.
    prompts = [
        'What is the meaning of life?',
        'Explain the concept of quantum entanglement.',
        'Describe the process of photosynthesis.',
        'What are the benefits of regular exercise?',
        'How does the internet work?',
        'Discuss the impact of climate change on ecosystems.',
        'Explain the principles of supply and demand in economics.',
        'What is the history of the Roman Empire?',
        'Describe the structure of a cell.',
        'Discuss the pros and cons of renewable energy sources.',
        'Explain the theory of relativity.',
        'What is the role of DNA in genetics?',
        'Describe the art movement of the Renaissance.',
        'Discuss the causes of World War I.',
        'What are the major functions of the human brain?',
        'Explain the process of evolution by natural selection.',
        'Describe the cultural significance of the Great Wall of China.',
        'What is the impact of social media on society?',
        'Discuss the life and works of Shakespeare.',
        'Explain the concept of artificial intelligence.',
        'What are the different types of chemical reactions?',
        "Describe the structure of the Earth's atmosphere.",
        'Discuss the history of the civil rights movement.',
        'What are the economic implications of globalization?',
        'Explain the principles of good nutrition.',
        'Describe the major functions of the immune system.',
        'Discuss the impact of colonialism on Africa.',
        'What is the process of cellular respiration?',
        'Explain the importance of biodiversity.',
        'Discuss the causes and consequences of the Industrial Revolution.',
        'What are the fundamental principles of democracy?',
        'Describe the major components of a computer.',
        'Explain the concept of human rights.',
        'What is the role of enzymes in biological reactions?',
        'Discuss the history of space exploration.',
        'What are the ethical considerations in medical research?',
        'Describe the cultural significance of the Pyramids of Egypt.',
        'Explain the principles of classical physics.',
        'What is the impact of climate change on weather patterns?',
        'Discuss the major events of the American Revolution.',
        'What are the effects of pollution on the environment?',
        'Describe the process of protein synthesis.',
        'Explain the concept of sustainable agriculture.',
        'What is the history of the European Union?',
        'Discuss the impact of the Renaissance on art and culture.',
        'What are the key principles of marketing?',
        'Explain the structure of the periodic table.',
        'Describe the major types of renewable energy.',
        'Discuss the causes and consequences of the French Revolution.',
        'What is the role of the United Nations in international relations?',
        'Explain the principles of game theory in economics.',
        'What are the stages of human development?',
        'Describe the cultural significance of the Taj Mahal.',
        'Discuss the major themes in the works of Ernest Hemingway.',
        'What is the impact of automation on the workforce?',
        'Explain the concept of genetic engineering.',
        'What are the different types of chemical bonds?',
        "Describe the layers of the Earth's atmosphere.",
        "Discuss the history of the women's suffrage movement.",
        'What are the economic factors influencing consumer behavior?',
        'Explain the principles of conflict resolution.',
        'What is the role of neurotransmitters in the nervous system?',
        'Discuss the impact of colonialism on India.',
        'What is the process of mitosis?',
        'Explain the importance of water conservation.',
        'Describe the cultural significance of the Acropolis in Athens.',
        'Discuss the major philosophical ideas of Plato.',
        'What are the principles of investment in finance?',
        'Explain the structure of a virus.',
        'What is the history of the United Nations?',
        'Discuss the impact of technology on modern art.',
        'What are the key concepts in cognitive psychology?',
        'Describe the major types of non-renewable energy sources.',
        'Explain the causes and consequences of the Russian Revolution.',
        'What is the role of the World Health Organization in global health?',
        'Discuss the principles of ethics in business.',
        'What are the stages of the water cycle?',
        'Explain the concept of social justice.',
        'What is the impact of deforestation on climate change?',
        'Describe the process of meiosis.',
        'Discuss the cultural significance of the Sistine Chapel ceiling.',
        'What are the major themes in the novels of Jane Austen?',
        'Explain the role of branding in marketing.',
        'What is the history of the Internet?',
        'Discuss the impact of artificial intelligence on society.',
        'What are the principles of statistical analysis in research?',
        'Explain the structure of an atom.',
        'What is the significance of the Theory of Evolution by Charles Darwin?',
        'Describe the major types of renewable energy.',
        'Discuss the causes and consequences of the American Civil War.',
        'What is the role of the International Monetary Fund in global economics?',
        'Explain the principles of environmental conservation.',
        'What are the stages of the rock cycle?',
        'Describe the concept of cultural relativism.',
        'Discuss the major contributions of Leonardo da Vinci to art and science.',
        'What is the impact of globalization on cultural diversity?',
        'Explain the process of genetic inheritance.',
        'What are the different forms of government in the world?',
        'Describe the major types of pollution.',
        'Discuss the history of the labor movement.',
        'What are the principles of sustainable urban planning?',
        'Explain the role of hormones in the endocrine system.',
        'What is the cultural significance of the Great Barrier Reef?',
        'Discuss the major ideas of Friedrich Nietzsche.',
        'What is the impact of social media on political movements?',
        'Explain the concept of quantum computing.',
        'What are the principles of international diplomacy?',
        'Describe the major types of ocean ecosystems.',
        'Discuss the causes and consequences of the Cold War.',
        'What is the role of the World Trade Organization in global trade?',
        'Explain the principles of behavioral psychology.',
        'What are the stages of the nitrogen cycle?',
        'Describe the concept of cultural appropriation.',
        'Discuss the major works of Vincent van Gogh.',
    ]
    async with aiohttp.ClientSession() as session:
        # Iterate the prompts directly — the original `enumerate` produced an
        # index that was never used.
        await asyncio.gather(*[send_request(url, prompt, session, 'llama', max_new_tokens=4096, top_p=0.21) for prompt in prompts])

if __name__ == '__main__':
    # Script entry point: run the async benchmark driver to completion.
    asyncio.run(main())
3 changes: 3 additions & 0 deletions changelog.d/349.feat.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
Added support for continuous batching via vLLM

Current benchmarks show that 100 concurrent requests achieve around 1218 TPS on one A100 running meta-llama/Llama-2-13b-chat-hf
8 changes: 0 additions & 8 deletions openllm-core/src/openllm_core/_configuration.py
Original file line number Diff line number Diff line change
Expand Up @@ -465,7 +465,6 @@ class ModelSettings(t.TypedDict, total=False):

# meta
url: str
requires_gpu: bool
trust_remote_code: bool
service_name: NotRequired[str]
requirements: t.Optional[ListStr]
Expand Down Expand Up @@ -523,7 +522,6 @@ def default(cls) -> _ModelSettingsAttr:
'cpu': 'pt', 'nvidia.com/gpu': 'pt'
},
name_type='dasherize',
requires_gpu=False,
url='',
model_type='causal_lm',
trust_remote_code=False,
Expand All @@ -541,7 +539,6 @@ def default(cls) -> _ModelSettingsAttr:
architecture: str
default_backend: t.Dict[LiteralResourceSpec, LiteralBackend]
url: str
requires_gpu: bool
trust_remote_code: bool
service_name: str
requirements: t.Optional[ListStr]
Expand Down Expand Up @@ -737,8 +734,6 @@ def __attrs_init__(self, *args: t.Any, **attrs: t.Any) -> None:
'''The default backend to run LLM based on available accelerator. By default, it will be PyTorch (pt) for most models. For some models, such as Llama, it will use `vllm` or `flax`. It is a dictionary of key as the accelerator spec in k8s ('cpu', 'nvidia.com/gpu', 'amd.com/gpu', 'cloud-tpus.google.com/v2', ...) and the values as supported OpenLLM backend ('flax', 'tf', 'pt', 'vllm', 'ggml', 'mlc')'''
__openllm_url__: str = Field(None)
'''The resolved url for this LLMConfig.'''
__openllm_requires_gpu__: bool = Field(None)
'''Determines if this model is only available on GPU. By default it supports GPU and fallback to CPU.'''
__openllm_trust_remote_code__: bool = Field(None)
'''Whether to always trust remote code'''
__openllm_service_name__: str = Field(None)
Expand Down Expand Up @@ -932,7 +927,6 @@ class FalconConfig(openllm.LLMConfig):
__config__ = {
"name_type": "lowercase",
"trust_remote_code": True,
"requires_gpu": True,
"timeout": 3600000,
"url": "https://falconllm.tii.ae/",
"requirements": ["einops", "xformers", "safetensors"],
Expand Down Expand Up @@ -1108,8 +1102,6 @@ def __getitem__(self, item: t.Literal['default_backend']) -> t.Dict[LiteralResou
@overload
def __getitem__(self, item: t.Literal['url']) -> str: ...
@overload
def __getitem__(self, item: t.Literal['requires_gpu']) -> bool: ...
@overload
def __getitem__(self, item: t.Literal['trust_remote_code']) -> bool: ...
@overload
def __getitem__(self, item: t.Literal['service_name']) -> str: ...
Expand Down
10 changes: 7 additions & 3 deletions openllm-core/src/openllm_core/_typing_compat.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@

from .utils.lazy import VersionInfo

M = t.TypeVar('M', bound='t.Union[transformers.PreTrainedModel, transformers.Pipeline, transformers.TFPreTrainedModel, transformers.FlaxPreTrainedModel, vllm.LLMEngine, peft.PeftModel]')
M = t.TypeVar('M', bound='t.Union[transformers.PreTrainedModel, transformers.Pipeline, transformers.TFPreTrainedModel, transformers.FlaxPreTrainedModel, vllm.AsyncLLMEngine, peft.PeftModel]')
T = t.TypeVar('T', bound='t.Union[transformers.PreTrainedTokenizerFast, transformers.PreTrainedTokenizer, transformers.PreTrainedTokenizerBase]')

def get_literal_args(typ: t.Any) -> tuple[str, ...]:
Expand Down Expand Up @@ -94,7 +94,9 @@ class LLMRunnable(bentoml.Runnable, t.Generic[M, T]):
embeddings: RunnableMethod[LLMRunnable[M, T], [list[str]], EmbeddingsOutput]
generate: RunnableMethod[LLMRunnable[M, T], [str], list[t.Any]]
generate_one: RunnableMethod[LLMRunnable[M, T], [str, list[str]], t.Sequence[dict[t.Literal['generated_text'], str]]]
generate_iterator: RunnableMethod[LLMRunnable[M, T], [str], t.Generator[str, None, str]]
generate_iterator: RunnableMethod[LLMRunnable[M, T], [str], t.Iterator[t.Any]]
vllm_generate: RunnableMethod[LLMRunnable[M, T], [str], list[t.Any]]
vllm_generate_iterator: RunnableMethod[LLMRunnable[M, T], [str], t.AsyncGenerator[str, None]]

class LLMRunner(bentoml.Runner, t.Generic[M, T]):
__doc__: str
Expand All @@ -111,7 +113,9 @@ class LLMRunner(bentoml.Runner, t.Generic[M, T]):
embeddings: RunnerMethod[LLMRunnable[M, T], [list[str]], t.Sequence[EmbeddingsOutput]]
generate: RunnerMethod[LLMRunnable[M, T], [str], list[t.Any]]
generate_one: RunnerMethod[LLMRunnable[M, T], [str, list[str]], t.Sequence[dict[t.Literal['generated_text'], str]]]
generate_iterator: RunnerMethod[LLMRunnable[M, T], [str], t.Generator[str, None, str]]
generate_iterator: RunnerMethod[LLMRunnable[M, T], [str], t.Iterator[t.Any]]
vllm_generate: RunnerMethod[LLMRunnable[M, T], [str], list[t.Any]]
vllm_generate_iterator: RunnerMethod[LLMRunnable[M, T], [str], t.AsyncGenerator[str, None]]

def __init__(self,
runnable_class: type[LLMRunnable[M, T]],
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,6 @@ class BaichuanConfig(openllm_core.LLMConfig):
'name_type': 'lowercase',
'trust_remote_code': True,
'timeout': 3600000,
'requires_gpu': True,
'url': 'https://github.com/baichuan-inc/Baichuan-7B',
'requirements': ['cpm-kernels', 'sentencepiece'],
'architecture': 'BaiChuanForCausalLM',
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,6 @@ class ChatGLMConfig(openllm_core.LLMConfig):
'name_type': 'lowercase',
'trust_remote_code': True,
'timeout': 3600000,
'requires_gpu': True,
'url': 'https://github.com/THUDM/ChatGLM-6B',
'requirements': ['cpm-kernels', 'sentencepiece'],
'architecture': 'ChatGLMForConditionalGeneration',
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,6 @@ class FalconConfig(openllm_core.LLMConfig):
__config__ = {
'name_type': 'lowercase',
'trust_remote_code': True,
'requires_gpu': True,
'timeout': int(36e6),
'url': 'https://falconllm.tii.ae/',
'requirements': ['einops', 'xformers'],
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,6 @@ class GPTNeoXConfig(openllm_core.LLMConfig):
__config__ = {
'model_name': 'gpt_neox',
'start_name': 'gpt-neox',
'requires_gpu': True,
'architecture': 'GPTNeoXForCausalLM',
'url': 'https://github.com/EleutherAI/gpt-neox',
'default_id': 'eleutherai/gpt-neox-20b',
Expand Down
2 changes: 1 addition & 1 deletion openllm-core/src/openllm_core/config/configuration_opt.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
Run a LLMServer for OPT model.
\b
> See more information about falcon at [facebook/opt-66b](https://huggingface.co/facebook/opt-66b)
> See more information about OPT at [facebook/opt-66b](https://huggingface.co/facebook/opt-66b)
\b
## Usage
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,11 +36,9 @@ class StarCoderConfig(openllm_core.LLMConfig):
"""
__config__ = {
'name_type': 'lowercase',
'requires_gpu': True,
'url': 'https://github.com/bigcode-project/starcoder',
'architecture': 'GPTBigCodeForCausalLM',
'requirements': ['bitsandbytes'],
'workers_per_resource': 0.5,
'default_id': 'bigcode/starcoder',
'model_ids': ['bigcode/starcoder', 'bigcode/starcoderbase']
}
Expand Down
2 changes: 1 addition & 1 deletion openllm-python/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,7 @@ openai = ["openai", "tiktoken"]
opt = ["flax>=0.7", "jax", "jaxlib", "tensorflow", "keras"]
playground = ["jupyter", "notebook", "ipython", "jupytext", "nbformat"]
starcoder = ["bitsandbytes"]
vllm = ["vllm>=0.1.6", "ray"]
vllm = ["vllm>=0.1.7", "ray"]

[tool.hatch.version]
fallback-version = "0.0.0"
Expand Down
Loading

0 comments on commit ad91079

Please sign in to comment.