diff --git a/.flake8 b/.flake8
index a2d1129..1276a9a 100644
--- a/.flake8
+++ b/.flake8
@@ -1,3 +1,3 @@
 [flake8]
 max-line-length = 120
-ignore = F401, W503
+ignore = E731,E231,E203,E501,F401,W503
diff --git a/.gitignore b/.gitignore
index 39aabc4..5786ca0 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,4 +6,5 @@ rsync_exclude.txt
 __pycache__/
 temp/
 dist/
+outputs/
 poetry.lock
diff --git a/docs/release-notes.md b/docs/release-notes.md
index 10c16a5..20b97b7 100644
--- a/docs/release-notes.md
+++ b/docs/release-notes.md
@@ -1,10 +1,19 @@
 # Release Notes
 
+## Release v1.2.0
+### What's changed
+#### Added features
+* New LLM wrapper: VLLM for batched local inference
+
+**Full Changelog**: [here](https://github.com/finitearth/promptolution/compare/v1.1.1...v1.2.0)
+
 ## Release v1.1.1
 ### What's Changed
 #### Further Changes:
 - deleted poetry.lock
-- updated transformers dependency: bumped from 4.46.3 to 4.48.0
+- updated transformers dependency: bumped from 4.46.3 to 4.48.0
+
+**Full Changelog**: [here](https://github.com/finitearth/promptolution/compare/v1.1.0...v1.1.1)
 
 ## Release v1.1.0
 ### What's changed
@@ -16,6 +25,8 @@
 * improved opros meta-prompt
 * added support for python versions from 3.9 onwards (previously 3.11)
 
+**Full Changelog**: [here](https://github.com/finitearth/promptolution/compare/v1.0.1...v1.1.0)
+
 ## Release v1.0.1
 ### What's changed
 #### Added features
@@ -24,6 +35,8 @@
 #### Further Changes:
 * fixed release notes
 
+**Full Changelog**: [here](https://github.com/finitearth/promptolution/compare/v1.0.0...v1.0.1)
+
 ## Release v1.0.0
 ### What's changed
 #### Added Features:
diff --git a/promptolution/llms/__init__.py b/promptolution/llms/__init__.py
index e4ca64e..95a22bd 100644
--- a/promptolution/llms/__init__.py
+++ b/promptolution/llms/__init__.py
@@ -3,6 +3,7 @@
 from .api_llm import APILLM
 from .base_llm import DummyLLM
 from .local_llm import LocalLLM
+from .vllm import VLLM
 
 
 def get_llm(model_id: str, *args, **kwargs):
@@ -10,13 +11,15 @@ def get_llm(model_id: str, *args, **kwargs):
 
-    This function supports three types of language models:
+    This function supports four types of language models:
     1. DummyLLM: A mock LLM for testing purposes.
-    2. LocalLLM: For running models locally (identified by 'local' in the model_id).
-    3. APILLM: For API-based models (default if not matching other types).
+    2. LocalLLM: For running models locally.
+    3. VLLM: For running models using the vLLM library.
+    4. APILLM: For API-based models (default if not matching other types).
 
     Args:
         model_id (str): Identifier for the model to use. Special cases:
             - "dummy" for DummyLLM
             - "local-{model_name}" for LocalLLM
+            - "vllm-{model_name}" for VLLM
             - Any other string for APILLM
         *args: Variable length argument list passed to the LLM constructor.
         **kwargs: Arbitrary keyword arguments passed to the LLM constructor.
@@ -29,4 +32,7 @@ def get_llm(model_id: str, *args, **kwargs):
     if "local" in model_id:
         model_id = "-".join(model_id.split("-")[1:])
         return LocalLLM(model_id, *args, **kwargs)
+    if "vllm" in model_id:
+        model_id = "-".join(model_id.split("-")[1:])
+        return VLLM(model_id, *args, **kwargs)
     return APILLM(model_id, *args, **kwargs)
diff --git a/promptolution/llms/api_llm.py b/promptolution/llms/api_llm.py
index 1c34709..cf966bf 100644
--- a/promptolution/llms/api_llm.py
+++ b/promptolution/llms/api_llm.py
@@ -13,6 +13,8 @@
 from langchain_core.messages import HumanMessage
 from langchain_openai import ChatOpenAI
 
+from promptolution.llms.base_llm import BaseLLM
+
 logger = Logger(__name__)
 logger.setLevel(INFO)
 
@@ -46,7 +48,7 @@ async def invoke_model(prompt, model, semaphore):
         await asyncio.sleep(delay)
 
 
-class APILLM:
+class APILLM(BaseLLM):
     """A class to interface with various language models through their respective APIs.
 
     This class supports Claude (Anthropic), GPT (OpenAI), and LLaMA (DeepInfra) models.
diff --git a/promptolution/llms/local_llm.py b/promptolution/llms/local_llm.py
index 1cfb616..074bf01 100644
--- a/promptolution/llms/local_llm.py
+++ b/promptolution/llms/local_llm.py
@@ -8,8 +8,10 @@
     logger = logging.getLogger(__name__)
     logger.warning(f"Could not import torch or transformers in local_llm.py: {e}")
 
+from promptolution.llms.base_llm import BaseLLM
 
-class LocalLLM:
+
+class LocalLLM(BaseLLM):
     """A class for running language models locally using the Hugging Face Transformers library.
 
     This class sets up a text generation pipeline with specified model parameters
diff --git a/promptolution/llms/vllm.py b/promptolution/llms/vllm.py
new file mode 100644
index 0000000..d99c542
--- /dev/null
+++ b/promptolution/llms/vllm.py
@@ -0,0 +1,135 @@
+"""Module for running language models locally using the vLLM library."""
+
+
+from logging import INFO, Logger
+
+try:
+    import torch
+    from transformers import AutoTokenizer
+    from vllm import LLM, SamplingParams
+except ImportError as e:
+    import logging
+
+    logger = logging.getLogger(__name__)
+    logger.warning(f"Could not import vllm, torch or transformers in vllm.py: {e}")
+
+from promptolution.llms.base_llm import BaseLLM
+
+logger = Logger(__name__)
+logger.setLevel(INFO)
+
+
+class VLLM(BaseLLM):
+    """A class for running language models using the vLLM library.
+
+    This class sets up a vLLM inference engine with specified model parameters
+    and provides a method to generate responses for given prompts.
+
+    Attributes:
+        llm (vllm.LLM): The vLLM inference engine.
+        tokenizer (transformers.PreTrainedTokenizer): The tokenizer for the model.
+        sampling_params (vllm.SamplingParams): Parameters for text generation.
+
+    Methods:
+        get_response: Generate responses for a list of prompts.
+    """
+
+    def __init__(
+        self,
+        model_id: str,
+        batch_size: int = 64,
+        max_generated_tokens: int = 256,
+        temperature: float = 0.1,
+        top_p: float = 0.9,
+        model_storage_path: str = None,
+        token: str = None,
+        dtype: str = "auto",
+        tensor_parallel_size: int = 1,
+        gpu_memory_utilization: float = 0.95,
+        max_model_len: int = 2048,
+        trust_remote_code: bool = False,
+    ):
+        """Initialize the VLLM with a specific model.
+
+        Args:
+            model_id (str): The identifier of the model to use.
+            batch_size (int, optional): The batch size for text generation. Defaults to 64.
+            max_generated_tokens (int, optional): Maximum number of tokens to generate. Defaults to 256.
+            temperature (float, optional): Sampling temperature. Defaults to 0.1.
+            top_p (float, optional): Top-p sampling parameter. Defaults to 0.9.
+            model_storage_path (str, optional): Directory to store the model. Defaults to None.
+            token (str, optional): Token for accessing the model - not used in the implementation yet.
+            dtype (str, optional): Data type for model weights. Defaults to "auto".
+            tensor_parallel_size (int, optional): Number of GPUs for tensor parallelism. Defaults to 1.
+            gpu_memory_utilization (float, optional): Fraction of GPU memory to use. Defaults to 0.95.
+            max_model_len (int, optional): Maximum sequence length for the model. Defaults to 2048.
+            trust_remote_code (bool, optional): Whether to trust remote code. Defaults to False.
+
+        Note:
+            This method sets up a vLLM engine with the specified parameters for efficient inference.
+        """
+        self.batch_size = batch_size
+        self.dtype = dtype
+        self.tensor_parallel_size = tensor_parallel_size
+        self.gpu_memory_utilization = gpu_memory_utilization
+        self.max_model_len = max_model_len
+        self.trust_remote_code = trust_remote_code
+
+        # Configure sampling parameters
+        self.sampling_params = SamplingParams(temperature=temperature, top_p=top_p, max_tokens=max_generated_tokens)
+
+        # Initialize the vLLM engine
+        self.llm = LLM(
+            model=model_id,
+            tokenizer=model_id,
+            dtype=self.dtype,
+            tensor_parallel_size=self.tensor_parallel_size,
+            gpu_memory_utilization=self.gpu_memory_utilization,
+            max_model_len=self.max_model_len,
+            download_dir=model_storage_path,
+            trust_remote_code=self.trust_remote_code,
+        )
+
+        # Initialize the tokenizer separately for prompt pre-processing (chat templating)
+        self.tokenizer = AutoTokenizer.from_pretrained(model_id)
+
+    def get_response(self, inputs: list[str]):
+        """Generate responses for a list of prompts using the vLLM engine.
+
+        Args:
+            inputs (list[str]): A list of input prompts.
+
+        Returns:
+            list[str]: A list of generated responses corresponding to the input prompts.
+
+        Note:
+            This method uses vLLM's batched generation capabilities for efficient inference.
+        """
+        prompts = [
+            self.tokenizer.apply_chat_template(
+                [
+                    {
+                        "role": "system",
+                        "content": "You are a helpful assistant.",
+                    },
+                    {"role": "user", "content": input},
+                ],
+                tokenize=False,
+            )
+            for input in inputs
+        ]
+
+        # Generate responses for self.batch_size prompts at a time
+        all_responses = []
+        for i in range(0, len(prompts), self.batch_size):
+            batch = prompts[i : i + self.batch_size]
+            outputs = self.llm.generate(batch, self.sampling_params)
+            responses = [output.outputs[0].text for output in outputs]
+            all_responses.extend(responses)
+
+        return all_responses
+
+    def __del__(self):
+        """Cleanup method to delete the LLM instance and free up GPU memory."""
+        del self.llm
+        torch.cuda.empty_cache()
diff --git a/pyproject.toml b/pyproject.toml
index e933ab3..e4f5be3 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "promptolution"
-version = "1.1.1"
+version = "1.2.0"
 description = ""
 authors = ["Tom Zehle, Moritz Schlager, Timo Heiß"]
 readme = "README.md"
@@ -15,6 +15,7 @@ langchain-community = "^0.2.12"
 pandas = "^2.2.2"
 tqdm = "^4.66.5"
 scikit-learn = "^1.5.2"
+vllm = "^0.7.3"
 
 [tool.poetry.group.dev.dependencies]
 matplotlib = "^3.9.2"
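
For reference, a minimal usage sketch of the new wrapper, assuming `vllm` is installed and a GPU is available. The model identifier and the sampling settings below are illustrative only; they are not part of this change.

```python
from promptolution.llms import get_llm

# The "vllm-" prefix routes get_llm to the new VLLM backend; the remainder of
# the string is forwarded to vLLM as the model name (example model shown).
llm = get_llm(
    "vllm-meta-llama/Meta-Llama-3-8B-Instruct",
    batch_size=64,              # prompts sent to vLLM per generate() call
    max_generated_tokens=256,   # cap on new tokens per response
    temperature=0.1,
    top_p=0.9,
)

# get_response applies the model's chat template to each input,
# then generates in batches of `batch_size`.
responses = llm.get_response(["Classify the sentiment of: 'The movie was great!'"])
print(responses[0])
```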