2 changes: 1 addition & 1 deletion .flake8
@@ -1,3 +1,3 @@
[flake8]
max-line-length = 120
ignore = F401, W503
ignore = E731,E231,E203,E501,F401,W503
1 change: 1 addition & 0 deletions .gitignore
@@ -6,4 +6,5 @@ rsync_exclude.txt
__pycache__/
temp/
dist/
outputs/
poetry.lock
15 changes: 14 additions & 1 deletion docs/release-notes.md
@@ -1,10 +1,19 @@
# Release Notes

## Release v1.2.0
### What's changed
#### Added features
* New LLM wrapper: VLLM for local inference with batches

**Full Changelog**: [here](https://github.com/finitearth/promptolution/compare/v1.1.1...v1.2.0)

## Release v1.1.1
### What's Changed
#### Further Changes:
- deleted poetry.lock
- updated transformers dependency: bumped from 4.46.3 to 4.48.0

**Full Changelog**: [here](https://github.com/finitearth/promptolution/compare/v1.1.0...v1.1.1)

## Release v1.1.0
### What's changed
@@ -16,6 +25,8 @@
* improved opros meta-prompt
* added support for python versions from 3.9 onwards (previously 3.11)

**Full Changelog**: [here](https://github.com/finitearth/promptolution/compare/v1.0.1...v1.1.0)

## Release v1.0.1
### What's changed
#### Added features
@@ -24,6 +35,8 @@
#### Further Changes:
* fixed release notes

**Full Changelog**: [here](https://github.com/finitearth/promptolution/compare/v1.0.0...v1.0.1)

## Release v1.0.0
### What's changed
#### Added Features:
10 changes: 8 additions & 2 deletions promptolution/llms/__init__.py
@@ -3,20 +3,23 @@
from .api_llm import APILLM
from .base_llm import DummyLLM
from .local_llm import LocalLLM
from .vllm import VLLM


def get_llm(model_id: str, *args, **kwargs):
"""Factory function to create and return a language model instance based on the provided model_id.

This function supports four types of language models:
1. DummyLLM: A mock LLM for testing purposes.
2. LocalLLM: For running models locally (identified by 'local' in the model_id).
3. APILLM: For API-based models (default if not matching other types).
2. LocalLLM: For running models locally.
3. VLLM: For running models using the vLLM library.
4. APILLM: For API-based models (default if not matching other types).

Args:
model_id (str): Identifier for the model to use. Special cases:
- "dummy" for DummyLLM
- "local-{model_name}" for LocalLLM
- "vllm-{model_name}" for VLLM
- Any other string for APILLM
*args: Variable length argument list passed to the LLM constructor.
**kwargs: Arbitrary keyword arguments passed to the LLM constructor.
@@ -29,4 +32,7 @@ def get_llm(model_id: str, *args, **kwargs):
if "local" in model_id:
model_id = "-".join(model_id.split("-")[1:])
return LocalLLM(model_id, *args, **kwargs)
if "vllm" in model_id:
model_id = "-".join(model_id.split("-")[1:])
return VLLM(model_id, *args, **kwargs)
return APILLM(model_id, *args, **kwargs)
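
A minimal usage sketch of the extended factory (the model ids below are hypothetical placeholders and assume a vLLM-compatible checkpoint is reachable on the Hugging Face Hub or a local path):

```python
from promptolution.llms import get_llm

# A "vllm-" prefix routes to the new VLLM wrapper; the prefix is stripped and
# the remainder (here a hypothetical Hub id) is used as the model identifier.
vllm_llm = get_llm("vllm-meta-llama/Meta-Llama-3-8B-Instruct")

# Existing behaviour is unchanged: "local-..." still yields LocalLLM,
# "dummy" yields DummyLLM, and any other id falls through to APILLM.
local_llm = get_llm("local-meta-llama/Meta-Llama-3-8B-Instruct")

responses = vllm_llm.get_response(["Summarize the promptolution library in one sentence."])
print(responses[0])
```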
4 changes: 3 additions & 1 deletion promptolution/llms/api_llm.py
@@ -13,6 +13,8 @@
from langchain_core.messages import HumanMessage
from langchain_openai import ChatOpenAI

from promptolution.llms.base_llm import BaseLLM

logger = Logger(__name__)
logger.setLevel(INFO)

@@ -46,7 +48,7 @@ async def invoke_model(prompt, model, semaphore):
await asyncio.sleep(delay)


class APILLM:
class APILLM(BaseLLM):
"""A class to interface with various language models through their respective APIs.

This class supports Claude (Anthropic), GPT (OpenAI), and LLaMA (DeepInfra) models.
4 changes: 3 additions & 1 deletion promptolution/llms/local_llm.py
@@ -8,8 +8,10 @@
logger = logging.getLogger(__name__)
logger.warning(f"Could not import torch or transformers in local_llm.py: {e}")

from promptolution.llms.base_llm import BaseLLM

class LocalLLM:

class LocalLLM(BaseLLM):
"""A class for running language models locally using the Hugging Face Transformers library.

This class sets up a text generation pipeline with specified model parameters
135 changes: 135 additions & 0 deletions promptolution/llms/vllm.py
@@ -0,0 +1,135 @@
"""Module for running language models locally using the vLLM library."""


from logging import INFO, Logger

try:
import torch
from transformers import AutoTokenizer
from vllm import LLM, SamplingParams
except ImportError as e:
import logging

logger = logging.getLogger(__name__)
logger.warning(f"Could not import vllm, torch or transformers in vllm.py: {e}")

from promptolution.llms.base_llm import BaseLLM

logger = Logger(__name__)
logger.setLevel(INFO)


class VLLM(BaseLLM):
"""A class for running language models using the vLLM library.

This class sets up a vLLM inference engine with specified model parameters
and provides a method to generate responses for given prompts.

Attributes:
llm (vllm.LLM): The vLLM inference engine.
tokenizer (transformers.PreTrainedTokenizer): The tokenizer for the model.
sampling_params (vllm.SamplingParams): Parameters for text generation.

Methods:
get_response: Generate responses for a list of prompts.
"""

def __init__(
self,
model_id: str,
batch_size: int = 64,
max_generated_tokens: int = 256,
temperature: float = 0.1,
top_p: float = 0.9,
model_storage_path: str = None,
token: str = None,
dtype: str = "auto",
tensor_parallel_size: int = 1,
gpu_memory_utilization: float = 0.95,
max_model_len: int = 2048,
trust_remote_code: bool = False,
):
"""Initialize the VLLM with a specific model.

Args:
model_id (str): The identifier of the model to use.
batch_size (int, optional): The batch size for text generation. Defaults to 64.
max_generated_tokens (int, optional): Maximum number of tokens to generate. Defaults to 256.
temperature (float, optional): Sampling temperature. Defaults to 0.1.
top_p (float, optional): Top-p sampling parameter. Defaults to 0.9.
model_storage_path (str, optional): Directory to store the model. Defaults to None.
token (str, optional): Token for accessing the model - not used in the implementation yet.
dtype (str, optional): Data type for model weights. Defaults to "auto".
tensor_parallel_size (int, optional): Number of GPUs for tensor parallelism. Defaults to 1.
gpu_memory_utilization (float, optional): Fraction of GPU memory to use. Defaults to 0.95.
max_model_len (int, optional): Maximum sequence length for the model. Defaults to 2048.
trust_remote_code (bool, optional): Whether to trust remote code. Defaults to False.

Note:
This method sets up a vLLM engine with specified parameters for efficient inference.
"""
self.batch_size = batch_size
self.dtype = dtype
self.tensor_parallel_size = tensor_parallel_size
self.gpu_memory_utilization = gpu_memory_utilization
self.max_model_len = max_model_len
self.trust_remote_code = trust_remote_code

# Configure sampling parameters
self.sampling_params = SamplingParams(temperature=temperature, top_p=top_p, max_tokens=max_generated_tokens)

# Initialize the vLLM engine
self.llm = LLM(
model=model_id,
tokenizer=model_id,
dtype=self.dtype,
tensor_parallel_size=self.tensor_parallel_size,
gpu_memory_utilization=self.gpu_memory_utilization,
max_model_len=self.max_model_len,
download_dir=model_storage_path,
trust_remote_code=self.trust_remote_code,
)

# Initialize tokenizer separately for potential pre-processing
self.tokenizer = AutoTokenizer.from_pretrained(model_id)

def get_response(self, inputs: list[str]):
"""Generate responses for a list of prompts using the vLLM engine.

Args:
inputs (list[str]): A list of input prompts.

Returns:
list[str]: A list of generated responses corresponding to the input prompts.

Note:
This method uses vLLM's batched generation capabilities for efficient inference.
"""
prompts = [
self.tokenizer.apply_chat_template(
[
{
"role": "system",
"content": "You are a helpful assistant.",
},
{"role": "user", "content": input},
],
tokenize=False,
)
for input in inputs
]

# generate responses for self.batch_size prompts at the same time
all_responses = []
for i in range(0, len(prompts), self.batch_size):
batch = prompts[i : i + self.batch_size]
outputs = self.llm.generate(batch, self.sampling_params)
responses = [output.outputs[0].text for output in outputs]
all_responses.extend(responses)

return all_responses

def __del__(self):
"""Cleanup method to delete the LLM instance and free up GPU memory."""
del self.llm
torch.cuda.empty_cache()
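
A self-contained sketch of the new wrapper used directly, with illustrative settings for a single-GPU machine (model id, batch size, and memory figures are assumptions, not values prescribed by this PR):

```python
from promptolution.llms.vllm import VLLM

llm = VLLM(
    model_id="meta-llama/Meta-Llama-3-8B-Instruct",  # hypothetical Hub id
    batch_size=32,              # prompts sent to llm.generate() per batch
    max_generated_tokens=128,
    temperature=0.1,
    top_p=0.9,
    gpu_memory_utilization=0.90,
    max_model_len=2048,
)

prompts = [
    "Classify the sentiment of: 'The movie was wonderful.'",
    "Classify the sentiment of: 'The plot made no sense.'",
]

# Each input is wrapped in a chat template, then generated in batches of `batch_size`.
print(llm.get_response(prompts))
```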
3 changes: 2 additions & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "promptolution"
version = "1.1.1"
version = "1.2.0"
description = ""
authors = ["Tom Zehle, Moritz Schlager, Timo Heiß"]
readme = "README.md"
@@ -15,6 +15,7 @@ langchain-community = "^0.2.12"
pandas = "^2.2.2"
tqdm = "^4.66.5"
scikit-learn = "^1.5.2"
vllm = "^0.7.3"

[tool.poetry.group.dev.dependencies]
matplotlib = "^3.9.2"