From 683d9261bf185bd71dbd87bd00eea32204264710 Mon Sep 17 00:00:00 2001 From: mo374z Date: Mon, 3 Mar 2025 18:32:33 +0100 Subject: [PATCH 01/19] Add vllm as feature and a llm_test_run_script --- .gitignore | 1 + promptolution/llms/__init__.py | 10 +++- promptolution/llms/vllm.py | 98 ++++++++++++++++++++++++++++++++++ pyproject.toml | 1 + scripts/llm_test_run.py | 70 ++++++++++++++++++++++++ 5 files changed, 178 insertions(+), 2 deletions(-) create mode 100644 promptolution/llms/vllm.py create mode 100644 scripts/llm_test_run.py diff --git a/.gitignore b/.gitignore index 39aabc4..5786ca0 100644 --- a/.gitignore +++ b/.gitignore @@ -6,4 +6,5 @@ rsync_exclude.txt __pycache__/ temp/ dist/ +outputs/ poetry.lock diff --git a/promptolution/llms/__init__.py b/promptolution/llms/__init__.py index e4ca64e..95a22bd 100644 --- a/promptolution/llms/__init__.py +++ b/promptolution/llms/__init__.py @@ -3,6 +3,7 @@ from .api_llm import APILLM from .base_llm import DummyLLM from .local_llm import LocalLLM +from .vllm import VLLM def get_llm(model_id: str, *args, **kwargs): @@ -10,13 +11,15 @@ def get_llm(model_id: str, *args, **kwargs): This function supports three types of language models: 1. DummyLLM: A mock LLM for testing purposes. - 2. LocalLLM: For running models locally (identified by 'local' in the model_id). - 3. APILLM: For API-based models (default if not matching other types). + 2. LocalLLM: For running models locally. + 3. VLLM: For running models using the vLLM library. + 4. APILLM: For API-based models (default if not matching other types). Args: model_id (str): Identifier for the model to use. Special cases: - "dummy" for DummyLLM - "local-{model_name}" for LocalLLM + - "vllm-{model_name}" for VLLM - Any other string for APILLM *args: Variable length argument list passed to the LLM constructor. **kwargs: Arbitrary keyword arguments passed to the LLM constructor. @@ -29,4 +32,7 @@ def get_llm(model_id: str, *args, **kwargs): if "local" in model_id: model_id = "-".join(model_id.split("-")[1:]) return LocalLLM(model_id, *args, **kwargs) + if "vllm" in model_id: + model_id = "-".join(model_id.split("-")[1:]) + return VLLM(model_id, *args, **kwargs) return APILLM(model_id, *args, **kwargs) diff --git a/promptolution/llms/vllm.py b/promptolution/llms/vllm.py new file mode 100644 index 0000000..dd3a489 --- /dev/null +++ b/promptolution/llms/vllm.py @@ -0,0 +1,98 @@ +"""Module for running language models locally using the vLLM library.""" + + +from logging import INFO, Logger + +import torch +from transformers import AutoTokenizer +from vllm import LLM, SamplingParams + +logger = Logger(__name__) +logger.setLevel(INFO) + + +class VLLM: + """A class for running language models using the vLLM library. + + This class sets up a vLLM inference engine with specified model parameters + and provides a method to generate responses for given prompts. + + Attributes: + llm (vllm.LLM): The vLLM inference engine. + tokenizer (transformers.PreTrainedTokenizer): The tokenizer for the model. + sampling_params (vllm.SamplingParams): Parameters for text generation. + + Methods: + get_response: Generate responses for a list of prompts. + """ + + def __init__( + self, model_id: str, batch_size=8, max_tokens=256, temperature=0.1, top_p=0.9, model_storage_path=None + ): + """Initialize the VLLM with a specific model. + + Args: + model_id (str): The identifier of the model to use. + batch_size (int, optional): The batch size for text generation. Defaults to 8. 
+ max_tokens (int, optional): Maximum number of tokens to generate. Defaults to 256. + temperature (float, optional): Sampling temperature. Defaults to 0.1. + top_p (float, optional): Top-p sampling parameter. Defaults to 0.9. + model_storage_path (str, optional): Directory to store the model. Defaults to None. + + Note: + This method sets up a vLLM engine with specified parameters for efficient inference. + """ + # Configure sampling parameters + self.sampling_params = SamplingParams(temperature=temperature, top_p=top_p, max_tokens=max_tokens) + + # Initialize the vLLM engine + self.llm = LLM( + model=model_id, + tokenizer=model_id, + dtype="float16", + tensor_parallel_size=1, + gpu_memory_utilization=0.95, + max_model_len=2048, + download_dir=model_storage_path, + trust_remote_code=True, + ) + + # Initialize tokenizer separately for potential pre-processing + self.tokenizer = AutoTokenizer.from_pretrained(model_id) + self.batch_size = batch_size + + def get_response(self, inputs: list[str]): + """Generate responses for a list of prompts using the vLLM engine. + + Args: + prompts (list[str]): A list of input prompts. + + Returns: + list[str]: A list of generated responses corresponding to the input prompts. + + Note: + This method uses vLLM's batched generation capabilities for efficient inference. + """ + prompts = [ + self.tokenizer.apply_chat_template( + [ + { + "role": "system", + "content": "You are a helpful, harmless, and honest assistant. " + "You answer the user's questions accurately and fairly.", + }, + {"role": "user", "content": input}, + ], + tokenize=False, + ) + for input in inputs + ] + outputs = self.llm.generate(prompts, self.sampling_params) + responses = [output.outputs[0].text for output in outputs] + + return responses + + def __del__(self): + """Cleanup method to delete the LLM instance and free up GPU memory.""" + del self.llm + torch.cuda.empty_cache() diff --git a/pyproject.toml b/pyproject.toml index e933ab3..b96bc55 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -15,6 +15,7 @@ langchain-community = "^0.2.12" pandas = "^2.2.2" tqdm = "^4.66.5" scikit-learn = "^1.5.2" +vllm = "^0.7.3" [tool.poetry.group.dev.dependencies] matplotlib = "^3.9.2" diff --git a/scripts/llm_test_run.py b/scripts/llm_test_run.py new file mode 100644 index 0000000..61fff98 --- /dev/null +++ b/scripts/llm_test_run.py @@ -0,0 +1,70 @@ +"""Test script for measuring raw LLM inference performance on a dataset.""" +import time +import json +from logging import Logger +import argparse +import pandas as pd +import numpy as np + +from promptolution.tasks import get_task +from promptolution.config import Config +from promptolution.predictors import Classificator +from promptolution.llms import get_llm + +logger = Logger(__name__) + + +def main(): + """Run inference test on a dataset using a specified LLM.""" + parser = argparse.ArgumentParser(description="Test LLM inference performance") + parser.add_argument("--model", type=str) + parser.add_argument("--output", type=str) + parser.add_argument("--dataset", type=str, default="agnews") + parser.add_argument("--token", type=str, default=None) + parser.add_argument("--model-storage-path", type=str, default=None) + args = parser.parse_args() + + config = Config( + evaluation_llm=args.model, + ds_path=f"data_sets/cls/{args.dataset}/", + task_name=args.dataset, + api_token=args.token, + n_eval_samples=200, + ) + + start_time = time.time() + + task = get_task(config, split="dev") + llm = get_llm(config.evaluation_llm, token=config.api_token) + + 
predictor = Classificator(llm, classes=task.classes) + + prompt = task.initial_population[0] + + xs = task.xs[:config.n_eval_samples] + ys = task.ys[:config.n_eval_samples] + + preds, seqs = predictor.predict(prompt, xs, return_seq=True) + + scores = [] + for i in range(len(xs)): + scores.append(1 if preds[0][i] == ys[i] else 0) + + # clean up the sequences + seqs = [seq.replace("\n", "").strip() for seq in seqs] + + df = pd.DataFrame(dict(prompt=task.initial_population[0], seq=seqs, score=scores)) + + total_inference_time = time.time() - start_time + + accuracy = np.array(scores).mean() + + print(f"Overall Acc {accuracy:.4f}") + print(f"Used model {args.model} on dataset {args.dataset}") + print(f"Total inference took {total_inference_time:.2f} seconds") + + df.to_csv(args.output, index=False) + + +if __name__ == "__main__": + main() From 69837fa17cbcfc9e85e676cb8e22f9fc822c2f3a Mon Sep 17 00:00:00 2001 From: mo374z Date: Mon, 3 Mar 2025 18:46:05 +0100 Subject: [PATCH 02/19] small fixes in vllm class --- promptolution/llms/vllm.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/promptolution/llms/vllm.py b/promptolution/llms/vllm.py index dd3a489..aeb1bb1 100644 --- a/promptolution/llms/vllm.py +++ b/promptolution/llms/vllm.py @@ -27,7 +27,14 @@ class VLLM: """ def __init__( - self, model_id: str, batch_size=8, max_tokens=256, temperature=0.1, top_p=0.9, model_storage_path=None + self, + model_id: str, + batch_size: int = 8, + max_tokens: int = 256, + temperature: float = 0.1, + top_p: float = 0.9, + model_storage_path: str = None, + token: str = None, ): """Initialize the VLLM with a specific model. @@ -38,6 +45,7 @@ def __init__( temperature (float, optional): Sampling temperature. Defaults to 0.1. top_p (float, optional): Top-p sampling parameter. Defaults to 0.9. model_storage_path (str, optional): Directory to store the model. Defaults to None. + token: (str, optional): Token for accessing the model - not used in implementation yet. Note: This method sets up a vLLM engine with specified parameters for efficient inference. 
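For orientation, a minimal usage sketch of the backend introduced so far: get_llm strips the "vllm-" prefix and hands the remainder to the VLLM wrapper, which chat-templates each input and generates responses in bulk. The model id and cache path below are placeholders, not values taken from this series.

# Sketch only: model id and storage path are placeholders.
from promptolution.llms import get_llm

# The "vllm-" prefix selects the VLLM wrapper; the prefix is stripped, so vLLM
# receives "meta-llama/Meta-Llama-3-8B-Instruct" as its model id.
llm = get_llm(
    "vllm-meta-llama/Meta-Llama-3-8B-Instruct",
    model_storage_path="/path/to/model/cache",  # forwarded to vLLM's download_dir
)

# get_response applies the chat template to every input and returns one string per input.
responses = llm.get_response(["Classify the topic of: Stocks rallied on Friday."])
print(responses[0])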
From 7563712bce41432af9abc7bb5f6e415412fb1360 Mon Sep 17 00:00:00 2001 From: mo374z Date: Mon, 3 Mar 2025 21:35:57 +0100 Subject: [PATCH 03/19] differentiate between vllm and api inference --- scripts/llm_test_run.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/scripts/llm_test_run.py b/scripts/llm_test_run.py index 61fff98..ed9242e 100644 --- a/scripts/llm_test_run.py +++ b/scripts/llm_test_run.py @@ -35,7 +35,13 @@ def main(): start_time = time.time() task = get_task(config, split="dev") - llm = get_llm(config.evaluation_llm, token=config.api_token) + if "vllm" in args.model: + llm = get_llm( + config.evaluation_llm, + model_storage_path=args.model_storage_path, + ) + else: + llm = get_llm(config.evaluation_llm, token=config.api_token) predictor = Classificator(llm, classes=task.classes) From af6f9f8230e896a03a2dfd5d381f883f1e985136 Mon Sep 17 00:00:00 2001 From: mo374z Date: Mon, 3 Mar 2025 22:10:45 +0100 Subject: [PATCH 04/19] set up experiment over multiple tasks and prompts --- scripts/llm_test_run.py | 76 +++++++++++++++++++++++++---------------- 1 file changed, 46 insertions(+), 30 deletions(-) diff --git a/scripts/llm_test_run.py b/scripts/llm_test_run.py index ed9242e..876433e 100644 --- a/scripts/llm_test_run.py +++ b/scripts/llm_test_run.py @@ -1,6 +1,6 @@ """Test script for measuring raw LLM inference performance on a dataset.""" import time -import json +from tqdm import tqdm from logging import Logger import argparse import pandas as pd @@ -16,60 +16,76 @@ def main(): """Run inference test on a dataset using a specified LLM.""" - parser = argparse.ArgumentParser(description="Test LLM inference performance") + parser = argparse.ArgumentParser() parser.add_argument("--model", type=str) parser.add_argument("--output", type=str) - parser.add_argument("--dataset", type=str, default="agnews") + parser.add_argument("--datasets", type=list, default=["agnews", "subj"]) parser.add_argument("--token", type=str, default=None) parser.add_argument("--model-storage-path", type=str, default=None) args = parser.parse_args() - config = Config( - evaluation_llm=args.model, - ds_path=f"data_sets/cls/{args.dataset}/", - task_name=args.dataset, - api_token=args.token, - n_eval_samples=200, - ) - start_time = time.time() - task = get_task(config, split="dev") if "vllm" in args.model: llm = get_llm( - config.evaluation_llm, + args.model, model_storage_path=args.model_storage_path, ) else: - llm = get_llm(config.evaluation_llm, token=config.api_token) + llm = get_llm(args.model, args.token) - predictor = Classificator(llm, classes=task.classes) + results = pd.DataFrame() - prompt = task.initial_population[0] + for dataset in args.datasets: + config = Config( + evaluation_llm=args.model, + ds_path=f"data_sets/cls/{dataset}/", + task_name=dataset, + api_token=args.token, + n_eval_samples=200, + ) - xs = task.xs[:config.n_eval_samples] - ys = task.ys[:config.n_eval_samples] + task = get_task(config, split="dev") + predictor = Classificator(llm, classes=task.classes) - preds, seqs = predictor.predict(prompt, xs, return_seq=True) + prompt = task.initial_population - scores = [] - for i in range(len(xs)): - scores.append(1 if preds[0][i] == ys[i] else 0) + xs = task.xs[:config.n_eval_samples] + ys = task.ys[:config.n_eval_samples] - # clean up the sequences - seqs = [seq.replace("\n", "").strip() for seq in seqs] + for prompt in tqdm(task.initial_population): + preds, seqs = predictor.predict(prompt, xs, return_seq=True) - df = 
pd.DataFrame(dict(prompt=task.initial_population[0], seq=seqs, score=scores)) + scores = [] + for i in range(len(xs)): + scores.append(1 if preds[0][i] == ys[i] else 0) - total_inference_time = time.time() - start_time + # clean up the sequences + seqs = [seq.replace("\n", "").strip() for seq in seqs] - accuracy = np.array(scores).mean() + # if single prompts should be stored + # df = pd.DataFrame(dict(prompt=prompt, seq=seqs, score=scores)) + # df.to_csv(args.output + "_detailed", index=False) - print(f"Overall Acc {accuracy:.4f}") - print(f"Used model {args.model} on dataset {args.dataset}") + accuracy = np.array(scores).mean() + + results = pd.concat([results, + pd.DataFrame( + dict( + model=args.model, + dataset=dataset, + prompt=prompt, + accuracy=accuracy, + n_samples=len(xs), + ), + index=[0], + )] + ) + + total_inference_time = time.time() - start_time print(f"Total inference took {total_inference_time:.2f} seconds") - df.to_csv(args.output, index=False) + results.to_csv(args.output, mode="a", header=False, index=False) if __name__ == "__main__": From bc9997a6221a54aa0b503bd8712f9bc90fc4e468 Mon Sep 17 00:00:00 2001 From: mo374z Date: Mon, 3 Mar 2025 22:21:37 +0100 Subject: [PATCH 05/19] change csv saving --- scripts/llm_test_run.py | 25 +++++++++++-------------- 1 file changed, 11 insertions(+), 14 deletions(-) diff --git a/scripts/llm_test_run.py b/scripts/llm_test_run.py index 876433e..b54c908 100644 --- a/scripts/llm_test_run.py +++ b/scripts/llm_test_run.py @@ -69,24 +69,21 @@ def main(): accuracy = np.array(scores).mean() - results = pd.concat([results, - pd.DataFrame( - dict( - model=args.model, - dataset=dataset, - prompt=prompt, - accuracy=accuracy, - n_samples=len(xs), - ), - index=[0], - )] - ) + results = pd.DataFrame( + dict( + model=args.model, + dataset=dataset, + prompt=prompt, + accuracy=accuracy, + n_samples=len(xs), + ), + index=[0], + ) + results.to_csv(args.output, mode="a", header=False, index=False) total_inference_time = time.time() - start_time print(f"Total inference took {total_inference_time:.2f} seconds") - results.to_csv(args.output, mode="a", header=False, index=False) - if __name__ == "__main__": main() From 7958b8614ddae5fdd0bf3adfab9306c76c4be2c8 Mon Sep 17 00:00:00 2001 From: mo374z Date: Tue, 4 Mar 2025 17:07:05 +0100 Subject: [PATCH 06/19] add base llm super class --- promptolution/llms/vllm.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/promptolution/llms/vllm.py b/promptolution/llms/vllm.py index aeb1bb1..6f2d3fd 100644 --- a/promptolution/llms/vllm.py +++ b/promptolution/llms/vllm.py @@ -7,11 +7,13 @@ from transformers import AutoTokenizer from vllm import LLM, SamplingParams +from promptolution.llms.base_llm import BaseLLM + logger = Logger(__name__) logger.setLevel(INFO) -class VLLM: +class VLLM(BaseLLM): """A class for running language models using the vLLM library. 
This class sets up a vLLM inference engine with specified model parameters From e82db3563f3907e86239c41e39e02666cea251c3 Mon Sep 17 00:00:00 2001 From: mo374z Date: Wed, 5 Mar 2025 00:43:04 +0100 Subject: [PATCH 07/19] add changes from PR review --- promptolution/llms/api_llm.py | 4 ++- promptolution/llms/local_llm.py | 4 ++- promptolution/llms/vllm.py | 45 +++++++++++++++++++++++++-------- 3 files changed, 40 insertions(+), 13 deletions(-) diff --git a/promptolution/llms/api_llm.py b/promptolution/llms/api_llm.py index 1c34709..cf966bf 100644 --- a/promptolution/llms/api_llm.py +++ b/promptolution/llms/api_llm.py @@ -13,6 +13,8 @@ from langchain_core.messages import HumanMessage from langchain_openai import ChatOpenAI +from promptolution.llms.base_llm import BaseLLM + logger = Logger(__name__) logger.setLevel(INFO) @@ -46,7 +48,7 @@ async def invoke_model(prompt, model, semaphore): await asyncio.sleep(delay) -class APILLM: +class APILLM(BaseLLM): """A class to interface with various language models through their respective APIs. This class supports Claude (Anthropic), GPT (OpenAI), and LLaMA (DeepInfra) models. diff --git a/promptolution/llms/local_llm.py b/promptolution/llms/local_llm.py index 1cfb616..074bf01 100644 --- a/promptolution/llms/local_llm.py +++ b/promptolution/llms/local_llm.py @@ -8,8 +8,10 @@ logger = logging.getLogger(__name__) logger.warning(f"Could not import torch or transformers in local_llm.py: {e}") +from promptolution.llms.base_llm import BaseLLM -class LocalLLM: + +class LocalLLM(BaseLLM): """A class for running language models locally using the Hugging Face Transformers library. This class sets up a text generation pipeline with specified model parameters diff --git a/promptolution/llms/vllm.py b/promptolution/llms/vllm.py index 6f2d3fd..53983d8 100644 --- a/promptolution/llms/vllm.py +++ b/promptolution/llms/vllm.py @@ -3,9 +3,15 @@ from logging import INFO, Logger -import torch -from transformers import AutoTokenizer -from vllm import LLM, SamplingParams +try: + import torch + from transformers import AutoTokenizer + from vllm import LLM, SamplingParams +except ImportError as e: + import logging + + logger = logging.getLogger(__name__) + logger.warning(f"Could not import vllm, torch or transformers in vllm.py: {e}") from promptolution.llms.base_llm import BaseLLM @@ -32,39 +38,56 @@ def __init__( self, model_id: str, batch_size: int = 8, - max_tokens: int = 256, + max_generated_tokens: int = 256, temperature: float = 0.1, top_p: float = 0.9, model_storage_path: str = None, token: str = None, + dtype: str = "float16", + tensor_parallel_size: int = 1, + gpu_memory_utilization: float = 0.95, + max_model_len: int = 2048, + trust_remote_code: bool = False, ): """Initialize the VLLM with a specific model. Args: model_id (str): The identifier of the model to use. batch_size (int, optional): The batch size for text generation. Defaults to 8. - max_tokens (int, optional): Maximum number of tokens to generate. Defaults to 256. + max_generated_tokens (int, optional): Maximum number of tokens to generate. Defaults to 256. temperature (float, optional): Sampling temperature. Defaults to 0.1. top_p (float, optional): Top-p sampling parameter. Defaults to 0.9. model_storage_path (str, optional): Directory to store the model. Defaults to None. token: (str, optional): Token for accessing the model - not used in implementation yet. + dtype (str, optional): Data type for model weights. Defaults to "float16". 
+ tensor_parallel_size (int, optional): Number of GPUs for tensor parallelism. Defaults to 1. + gpu_memory_utilization (float, optional): Fraction of GPU memory to use. Defaults to 0.95. + max_model_len (int, optional): Maximum sequence length for the model. Defaults to 2048. + trust_remote_code (bool, optional): Whether to trust remote code. Defaults to False. Note: This method sets up a vLLM engine with specified parameters for efficient inference. """ + self.batch_size = batch_size + self.dtype = dtype + self.tensor_parallel_size = tensor_parallel_size + self.gpu_memory_utilization = gpu_memory_utilization + self.max_model_len = max_model_len + self.trust_remote_code = trust_remote_code + # Configure sampling parameters - self.sampling_params = SamplingParams(temperature=temperature, top_p=top_p, max_tokens=max_tokens) + self.sampling_params = SamplingParams(temperature=temperature, top_p=top_p, max_tokens=max_generated_tokens) # Initialize the vLLM engine self.llm = LLM( model=model_id, tokenizer=model_id, - dtype="float16", - tensor_parallel_size=1, - gpu_memory_utilization=0.95, - max_model_len=2048, + dtype=self.dtype, + tensor_parallel_size=self.tensor_parallel_size, + gpu_memory_utilization=self.gpu_memory_utilization, + max_model_len=self.max_model_len, download_dir=model_storage_path, - trust_remote_code=True, + trust_remote_code=self.trust_remote_code, ) # Initialize tokenizer separately for potential pre-processing From 0045de7122c171ccac6382f87c150c43a19c897a Mon Sep 17 00:00:00 2001 From: mo374z Date: Wed, 5 Mar 2025 18:51:53 +0100 Subject: [PATCH 08/19] change some VLLM params --- promptolution/llms/vllm.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/promptolution/llms/vllm.py b/promptolution/llms/vllm.py index 53983d8..f9b8a36 100644 --- a/promptolution/llms/vllm.py +++ b/promptolution/llms/vllm.py @@ -43,10 +43,10 @@ def __init__( top_p: float = 0.9, model_storage_path: str = None, token: str = None, - dtype: str = "float16", - tensor_parallel_size: int = 1, + dtype: str = "auto", + tensor_parallel_size: int = None, gpu_memory_utilization: float = 0.95, - max_model_len: int = 2048, + max_model_len: int = 1024, trust_remote_code: bool = False, ): """Initialize the VLLM with a specific model. 
From 0b3c7cb028af085916ebbffaa56644cda935ab07 Mon Sep 17 00:00:00 2001 From: mo374z Date: Wed, 5 Mar 2025 18:54:49 +0100 Subject: [PATCH 09/19] fix tensor parallel size to 1 --- promptolution/llms/vllm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/promptolution/llms/vllm.py b/promptolution/llms/vllm.py index f9b8a36..34658b2 100644 --- a/promptolution/llms/vllm.py +++ b/promptolution/llms/vllm.py @@ -44,7 +44,7 @@ def __init__( model_storage_path: str = None, token: str = None, dtype: str = "auto", - tensor_parallel_size: int = None, + tensor_parallel_size: int = 1, gpu_memory_utilization: float = 0.95, max_model_len: int = 1024, trust_remote_code: bool = False, From a73c378426e15379676044a9dc477f84aec8f978 Mon Sep 17 00:00:00 2001 From: mo374z Date: Wed, 5 Mar 2025 19:04:13 +0100 Subject: [PATCH 10/19] experiment with batch size --- promptolution/llms/vllm.py | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/promptolution/llms/vllm.py b/promptolution/llms/vllm.py index 34658b2..8c07fc0 100644 --- a/promptolution/llms/vllm.py +++ b/promptolution/llms/vllm.py @@ -46,7 +46,7 @@ def __init__( dtype: str = "auto", tensor_parallel_size: int = 1, gpu_memory_utilization: float = 0.95, - max_model_len: int = 1024, + max_model_len: int = 2048, trust_remote_code: bool = False, ): """Initialize the VLLM with a specific model. @@ -120,8 +120,20 @@ def get_response(self, inputs: list[str]): ) for input in inputs ] - outputs = self.llm.generate(prompts, self.sampling_params) - responses = [output.outputs[0].text for output in outputs] + # outputs = self.llm.generate(prompts, self.sampling_params) + # responses = [output.outputs[0].text for output in outputs] + optimal_batch_size = 100 + + responses = [] + for i in range(0, len(prompts), optimal_batch_size): + batch = prompts[i : i + optimal_batch_size] # noqa: E203 + outputs = self.llm.generate(batch, self.sampling_params) + batch_responses = [output.outputs[0].text for output in outputs] + responses.extend(batch_responses) + + # Explicitly clean up between batches + if i + optimal_batch_size < len(prompts): + torch.cuda.empty_cache() return responses From 1f6841098664a497970ba6921a3332c10b6f3138 Mon Sep 17 00:00:00 2001 From: mo374z Date: Wed, 5 Mar 2025 19:11:38 +0100 Subject: [PATCH 11/19] experiment with larger batch sizes --- promptolution/llms/vllm.py | 21 +++++++-------------- 1 file changed, 7 insertions(+), 14 deletions(-) diff --git a/promptolution/llms/vllm.py b/promptolution/llms/vllm.py index 8c07fc0..1872b95 100644 --- a/promptolution/llms/vllm.py +++ b/promptolution/llms/vllm.py @@ -120,20 +120,13 @@ def get_response(self, inputs: list[str]): ) for input in inputs ] - # outputs = self.llm.generate(prompts, self.sampling_params) - # responses = [output.outputs[0].text for output in outputs] - optimal_batch_size = 100 - - responses = [] - for i in range(0, len(prompts), optimal_batch_size): - batch = prompts[i : i + optimal_batch_size] # noqa: E203 - outputs = self.llm.generate(batch, self.sampling_params) - batch_responses = [output.outputs[0].text for output in outputs] - responses.extend(batch_responses) - - # Explicitly clean up between batches - if i + optimal_batch_size < len(prompts): - torch.cuda.empty_cache() + + prompts_2 = prompts.copy() + + prompts_all = prompts + prompts_2 + + outputs = self.llm.generate(prompts_all, self.sampling_params) + responses = [output.outputs[0].text for output in outputs] return responses From f5fe188b2ee4436e8276e15bffcd72f730f55d95 Mon Sep 
17 00:00:00 2001 From: mo374z Date: Wed, 5 Mar 2025 19:37:50 +0100 Subject: [PATCH 12/19] add continuous batch llm --- promptolution/llms/__init__.py | 4 + promptolution/llms/cb_vllm.py | 235 +++++++++++++++++++++++++++++++++ scripts/llm_test_run.py | 9 +- 3 files changed, 247 insertions(+), 1 deletion(-) create mode 100644 promptolution/llms/cb_vllm.py diff --git a/promptolution/llms/__init__.py b/promptolution/llms/__init__.py index 95a22bd..ac774ad 100644 --- a/promptolution/llms/__init__.py +++ b/promptolution/llms/__init__.py @@ -2,6 +2,7 @@ from .api_llm import APILLM from .base_llm import DummyLLM +from .cb_vllm import ContinuousBatchVLLM from .local_llm import LocalLLM from .vllm import VLLM @@ -32,6 +33,9 @@ def get_llm(model_id: str, *args, **kwargs): if "local" in model_id: model_id = "-".join(model_id.split("-")[1:]) return LocalLLM(model_id, *args, **kwargs) + if "cbvllm" in model_id: + model_id = "-".join(model_id.split("-")[1:]) + return ContinuousBatchVLLM(model_id, *args, **kwargs) if "vllm" in model_id: model_id = "-".join(model_id.split("-")[1:]) return VLLM(model_id, *args, **kwargs) diff --git a/promptolution/llms/cb_vllm.py b/promptolution/llms/cb_vllm.py new file mode 100644 index 0000000..810d829 --- /dev/null +++ b/promptolution/llms/cb_vllm.py @@ -0,0 +1,235 @@ +"""Module for running language models using vLLM with continuous batching.""" + +import time +from concurrent.futures import ThreadPoolExecutor +from logging import INFO, Logger +from queue import Queue +from threading import Lock +from typing import List + +try: + import torch + from transformers import AutoTokenizer + from vllm import LLM, SamplingParams +except ImportError as e: + import logging + + logger = logging.getLogger(__name__) + logger.warning(f"Could not import vllm, torch or transformers in vllm.py: {e}") + +from promptolution.llms.base_llm import BaseLLM + +logger = Logger(__name__) +logger.setLevel(INFO) + + +class ContinuousBatchVLLM(BaseLLM): + """A class for running language models using vLLM with continuous batching.""" + + def __init__( + self, + model_id: str, + concurrent_requests: int = 8, + max_generated_tokens: int = 256, + temperature: float = 0.1, + top_p: float = 0.9, + model_storage_path: str = None, + token: str = None, + dtype: str = "auto", + tensor_parallel_size: int = 1, + gpu_memory_utilization: float = 0.95, + max_model_len: int = 2048, + trust_remote_code: bool = False, + block_size: int = 16, + ): + """Initialize the continuous batching vLLM engine. + + Args: + model_id (str): The identifier of the model to use. + concurrent_requests (int, optional): Number of requests to process concurrently. Defaults to 8. + max_generated_tokens (int, optional): Maximum number of tokens to generate. Defaults to 256. + temperature (float, optional): Sampling temperature. Defaults to 0.1. + top_p (float, optional): Top-p sampling parameter. Defaults to 0.9. + model_storage_path (str, optional): Directory to store the model. Defaults to None. + token (str, optional): Token for accessing the model. Defaults to None. + dtype (str, optional): Data type for model weights. Defaults to "auto". + tensor_parallel_size (int, optional): Number of GPUs for tensor parallelism. Defaults to 1. + gpu_memory_utilization (float, optional): Fraction of GPU memory to use. Defaults to 0.95. + max_model_len (int, optional): Maximum sequence length for the model. Defaults to 2048. + trust_remote_code (bool, optional): Whether to trust remote code. Defaults to False. 
+ block_size (int, optional): KV cache block size. Smaller values can improve performance. Defaults to 16. + """ + self.model_id = model_id + self.concurrent_requests = concurrent_requests + self.dtype = dtype + self.tensor_parallel_size = tensor_parallel_size + self.gpu_memory_utilization = gpu_memory_utilization + self.max_model_len = max_model_len + self.trust_remote_code = trust_remote_code + self.block_size = block_size + + self.sampling_params = SamplingParams( + temperature=temperature, + top_p=top_p, + max_tokens=max_generated_tokens, + early_stopping=True, + ) + + logger.info(f"Initializing continuous batching vLLM with model {model_id}") + start_time = time.time() + + self.llm = LLM( + model=model_id, + tokenizer=model_id, + dtype=self.dtype, + tensor_parallel_size=self.tensor_parallel_size, + gpu_memory_utilization=self.gpu_memory_utilization, + max_model_len=self.max_model_len, + download_dir=model_storage_path, + trust_remote_code=self.trust_remote_code, + block_size=self.block_size, + ) + + logger.info(f"vLLM initialization took {time.time() - start_time:.2f} seconds") + + self.tokenizer = AutoTokenizer.from_pretrained(model_id) + + self.executor = ThreadPoolExecutor(max_workers=1) + self.request_queue = Queue() + self.result_map = {} + self.result_lock = Lock() + + self._warm_up_model() + + self.is_running = True + self.executor.submit(self._continuous_batch_worker) + + def _warm_up_model(self): + logger.info("Warming up model...") + start_time = time.time() + + warmup_prompt = self.tokenizer.apply_chat_template( + [ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "Hello, how are you?"}, + ], + tokenize=False, + ) + + self.llm.generate([warmup_prompt], self.sampling_params) + + torch.cuda.empty_cache() + logger.info(f"Model warm-up completed in {time.time() - start_time:.2f} seconds") + + def _continuous_batch_worker(self): + logger.info("Starting continuous batching worker thread") + + active_requests = {} + + while self.is_running: + while not self.request_queue.empty() and len(active_requests) < self.concurrent_requests: + try: + request_id, prompt = self.request_queue.get_nowait() + active_requests[request_id] = prompt + except Exception: + break + + if active_requests: + try: + request_ids = list(active_requests.keys()) + prompts = list(active_requests.values()) + + logger.info(f"Processing batch of {len(prompts)} prompts") + start_time = time.time() + + outputs = self.llm.generate(prompts, self.sampling_params) + + elapsed = time.time() - start_time + logger.info(f"Batch processed in {elapsed:.3f}s ({len(prompts)/elapsed:.1f} prompts/sec)") + + with self.result_lock: + for request_id, output in zip(request_ids, outputs): + self.result_map[request_id] = output.outputs[0].text + + active_requests.clear() + + except Exception as e: + logger.error(f"Error in continuous batching worker: {e}") + active_requests.clear() + + time.sleep(0.01) + + def get_response(self, inputs: List[str]) -> List[str]: + """Generate responses for a list of prompts using the continuous batching vLLM engine. + + This method queues the input prompts for processing by the background worker thread + and waits for the results to be available. + + Args: + inputs (List[str]): A list of input prompts. + + Returns: + List[str]: A list of generated responses corresponding to the input prompts. + """ + prompts = [ + self.tokenizer.apply_chat_template( + [ + { + "role": "system", + "content": "You are a helpful, harmless, and honest assistant. 
" + "You answer the user's questions accurately and fairly.", + }, + {"role": "user", "content": input_text}, + ], + tokenize=False, + ) + for input_text in inputs + ] + + request_ids = [f"req_{int(time.time() * 1000)}_{i}" for i in range(len(prompts))] + + for request_id, prompt in zip(request_ids, prompts): + self.request_queue.put((request_id, prompt)) + + max_wait_time = 60 + start_time = time.time() + + results = [None] * len(request_ids) + remaining_ids = set(request_ids) + + while remaining_ids and (time.time() - start_time) < max_wait_time: + with self.result_lock: + for i, request_id in enumerate(request_ids): + if request_id in self.result_map and request_id in remaining_ids: + results[i] = self.result_map[request_id] + remaining_ids.remove(request_id) + del self.result_map[request_id] + + if remaining_ids: + time.sleep(0.1) + + if remaining_ids: + logger.warning(f"Timed out waiting for {len(remaining_ids)} requests") + for i, request_id in enumerate(request_ids): + if results[i] is None: + results[i] = "Error: Request timed out" + + return results + + def __del__(self): + """Cleanup method to stop the worker thread and free resources. + + This magic method is called when the object is about to be destroyed. + It ensures proper shutdown of the background worker thread and + releases GPU resources. + """ + self.is_running = False + + if hasattr(self, "executor"): + self.executor.shutdown(wait=False) + + if hasattr(self, "llm"): + del self.llm + + if torch.cuda.is_available(): + torch.cuda.empty_cache() diff --git a/scripts/llm_test_run.py b/scripts/llm_test_run.py index b54c908..b82f828 100644 --- a/scripts/llm_test_run.py +++ b/scripts/llm_test_run.py @@ -22,11 +22,18 @@ def main(): parser.add_argument("--datasets", type=list, default=["agnews", "subj"]) parser.add_argument("--token", type=str, default=None) parser.add_argument("--model-storage-path", type=str, default=None) + parser.add_argument("--concurrent-requests", type=int, default=8) args = parser.parse_args() start_time = time.time() - if "vllm" in args.model: + if "cbvllm" in args.model: + llm = get_llm( + args.model, + model_storage_path=args.model_storage_path, + concurrent_requests=args.concurrent_requests, + ) + elif "vllm" in args.model: llm = get_llm( args.model, model_storage_path=args.model_storage_path, From 1330a9e2f55b4089c92f4a658255a4aa3879088c Mon Sep 17 00:00:00 2001 From: mo374z Date: Wed, 5 Mar 2025 19:39:34 +0100 Subject: [PATCH 13/19] remove arg --- promptolution/llms/cb_vllm.py | 1 - 1 file changed, 1 deletion(-) diff --git a/promptolution/llms/cb_vllm.py b/promptolution/llms/cb_vllm.py index 810d829..c28b6bc 100644 --- a/promptolution/llms/cb_vllm.py +++ b/promptolution/llms/cb_vllm.py @@ -72,7 +72,6 @@ def __init__( temperature=temperature, top_p=top_p, max_tokens=max_generated_tokens, - early_stopping=True, ) logger.info(f"Initializing continuous batching vLLM with model {model_id}") From c6dbb7be85942f9f6569c7daec103dae68231b60 Mon Sep 17 00:00:00 2001 From: mo374z Date: Wed, 5 Mar 2025 19:54:50 +0100 Subject: [PATCH 14/19] remove continuous batch inference try --- promptolution/llms/__init__.py | 4 - promptolution/llms/cb_vllm.py | 234 --------------------------------- promptolution/llms/vllm.py | 6 +- scripts/llm_test_run.py | 9 +- 4 files changed, 2 insertions(+), 251 deletions(-) delete mode 100644 promptolution/llms/cb_vllm.py diff --git a/promptolution/llms/__init__.py b/promptolution/llms/__init__.py index ac774ad..95a22bd 100644 --- a/promptolution/llms/__init__.py +++ 
b/promptolution/llms/__init__.py @@ -2,7 +2,6 @@ from .api_llm import APILLM from .base_llm import DummyLLM -from .cb_vllm import ContinuousBatchVLLM from .local_llm import LocalLLM from .vllm import VLLM @@ -33,9 +32,6 @@ def get_llm(model_id: str, *args, **kwargs): if "local" in model_id: model_id = "-".join(model_id.split("-")[1:]) return LocalLLM(model_id, *args, **kwargs) - if "cbvllm" in model_id: - model_id = "-".join(model_id.split("-")[1:]) - return ContinuousBatchVLLM(model_id, *args, **kwargs) if "vllm" in model_id: model_id = "-".join(model_id.split("-")[1:]) return VLLM(model_id, *args, **kwargs) diff --git a/promptolution/llms/cb_vllm.py b/promptolution/llms/cb_vllm.py deleted file mode 100644 index c28b6bc..0000000 --- a/promptolution/llms/cb_vllm.py +++ /dev/null @@ -1,234 +0,0 @@ -"""Module for running language models using vLLM with continuous batching.""" - -import time -from concurrent.futures import ThreadPoolExecutor -from logging import INFO, Logger -from queue import Queue -from threading import Lock -from typing import List - -try: - import torch - from transformers import AutoTokenizer - from vllm import LLM, SamplingParams -except ImportError as e: - import logging - - logger = logging.getLogger(__name__) - logger.warning(f"Could not import vllm, torch or transformers in vllm.py: {e}") - -from promptolution.llms.base_llm import BaseLLM - -logger = Logger(__name__) -logger.setLevel(INFO) - - -class ContinuousBatchVLLM(BaseLLM): - """A class for running language models using vLLM with continuous batching.""" - - def __init__( - self, - model_id: str, - concurrent_requests: int = 8, - max_generated_tokens: int = 256, - temperature: float = 0.1, - top_p: float = 0.9, - model_storage_path: str = None, - token: str = None, - dtype: str = "auto", - tensor_parallel_size: int = 1, - gpu_memory_utilization: float = 0.95, - max_model_len: int = 2048, - trust_remote_code: bool = False, - block_size: int = 16, - ): - """Initialize the continuous batching vLLM engine. - - Args: - model_id (str): The identifier of the model to use. - concurrent_requests (int, optional): Number of requests to process concurrently. Defaults to 8. - max_generated_tokens (int, optional): Maximum number of tokens to generate. Defaults to 256. - temperature (float, optional): Sampling temperature. Defaults to 0.1. - top_p (float, optional): Top-p sampling parameter. Defaults to 0.9. - model_storage_path (str, optional): Directory to store the model. Defaults to None. - token (str, optional): Token for accessing the model. Defaults to None. - dtype (str, optional): Data type for model weights. Defaults to "auto". - tensor_parallel_size (int, optional): Number of GPUs for tensor parallelism. Defaults to 1. - gpu_memory_utilization (float, optional): Fraction of GPU memory to use. Defaults to 0.95. - max_model_len (int, optional): Maximum sequence length for the model. Defaults to 2048. - trust_remote_code (bool, optional): Whether to trust remote code. Defaults to False. - block_size (int, optional): KV cache block size. Smaller values can improve performance. Defaults to 16. 
- """ - self.model_id = model_id - self.concurrent_requests = concurrent_requests - self.dtype = dtype - self.tensor_parallel_size = tensor_parallel_size - self.gpu_memory_utilization = gpu_memory_utilization - self.max_model_len = max_model_len - self.trust_remote_code = trust_remote_code - self.block_size = block_size - - self.sampling_params = SamplingParams( - temperature=temperature, - top_p=top_p, - max_tokens=max_generated_tokens, - ) - - logger.info(f"Initializing continuous batching vLLM with model {model_id}") - start_time = time.time() - - self.llm = LLM( - model=model_id, - tokenizer=model_id, - dtype=self.dtype, - tensor_parallel_size=self.tensor_parallel_size, - gpu_memory_utilization=self.gpu_memory_utilization, - max_model_len=self.max_model_len, - download_dir=model_storage_path, - trust_remote_code=self.trust_remote_code, - block_size=self.block_size, - ) - - logger.info(f"vLLM initialization took {time.time() - start_time:.2f} seconds") - - self.tokenizer = AutoTokenizer.from_pretrained(model_id) - - self.executor = ThreadPoolExecutor(max_workers=1) - self.request_queue = Queue() - self.result_map = {} - self.result_lock = Lock() - - self._warm_up_model() - - self.is_running = True - self.executor.submit(self._continuous_batch_worker) - - def _warm_up_model(self): - logger.info("Warming up model...") - start_time = time.time() - - warmup_prompt = self.tokenizer.apply_chat_template( - [ - {"role": "system", "content": "You are a helpful assistant."}, - {"role": "user", "content": "Hello, how are you?"}, - ], - tokenize=False, - ) - - self.llm.generate([warmup_prompt], self.sampling_params) - - torch.cuda.empty_cache() - logger.info(f"Model warm-up completed in {time.time() - start_time:.2f} seconds") - - def _continuous_batch_worker(self): - logger.info("Starting continuous batching worker thread") - - active_requests = {} - - while self.is_running: - while not self.request_queue.empty() and len(active_requests) < self.concurrent_requests: - try: - request_id, prompt = self.request_queue.get_nowait() - active_requests[request_id] = prompt - except Exception: - break - - if active_requests: - try: - request_ids = list(active_requests.keys()) - prompts = list(active_requests.values()) - - logger.info(f"Processing batch of {len(prompts)} prompts") - start_time = time.time() - - outputs = self.llm.generate(prompts, self.sampling_params) - - elapsed = time.time() - start_time - logger.info(f"Batch processed in {elapsed:.3f}s ({len(prompts)/elapsed:.1f} prompts/sec)") - - with self.result_lock: - for request_id, output in zip(request_ids, outputs): - self.result_map[request_id] = output.outputs[0].text - - active_requests.clear() - - except Exception as e: - logger.error(f"Error in continuous batching worker: {e}") - active_requests.clear() - - time.sleep(0.01) - - def get_response(self, inputs: List[str]) -> List[str]: - """Generate responses for a list of prompts using the continuous batching vLLM engine. - - This method queues the input prompts for processing by the background worker thread - and waits for the results to be available. - - Args: - inputs (List[str]): A list of input prompts. - - Returns: - List[str]: A list of generated responses corresponding to the input prompts. - """ - prompts = [ - self.tokenizer.apply_chat_template( - [ - { - "role": "system", - "content": "You are a helpful, harmless, and honest assistant. 
" - "You answer the user's questions accurately and fairly.", - }, - {"role": "user", "content": input_text}, - ], - tokenize=False, - ) - for input_text in inputs - ] - - request_ids = [f"req_{int(time.time() * 1000)}_{i}" for i in range(len(prompts))] - - for request_id, prompt in zip(request_ids, prompts): - self.request_queue.put((request_id, prompt)) - - max_wait_time = 60 - start_time = time.time() - - results = [None] * len(request_ids) - remaining_ids = set(request_ids) - - while remaining_ids and (time.time() - start_time) < max_wait_time: - with self.result_lock: - for i, request_id in enumerate(request_ids): - if request_id in self.result_map and request_id in remaining_ids: - results[i] = self.result_map[request_id] - remaining_ids.remove(request_id) - del self.result_map[request_id] - - if remaining_ids: - time.sleep(0.1) - - if remaining_ids: - logger.warning(f"Timed out waiting for {len(remaining_ids)} requests") - for i, request_id in enumerate(request_ids): - if results[i] is None: - results[i] = "Error: Request timed out" - - return results - - def __del__(self): - """Cleanup method to stop the worker thread and free resources. - - This magic method is called when the object is about to be destroyed. - It ensures proper shutdown of the background worker thread and - releases GPU resources. - """ - self.is_running = False - - if hasattr(self, "executor"): - self.executor.shutdown(wait=False) - - if hasattr(self, "llm"): - del self.llm - - if torch.cuda.is_available(): - torch.cuda.empty_cache() diff --git a/promptolution/llms/vllm.py b/promptolution/llms/vllm.py index 1872b95..3de8c4b 100644 --- a/promptolution/llms/vllm.py +++ b/promptolution/llms/vllm.py @@ -121,11 +121,7 @@ def get_response(self, inputs: list[str]): for input in inputs ] - prompts_2 = prompts.copy() - - prompts_all = prompts + prompts_2 - - outputs = self.llm.generate(prompts_all, self.sampling_params) + outputs = self.llm.generate(prompts, self.sampling_params) responses = [output.outputs[0].text for output in outputs] return responses diff --git a/scripts/llm_test_run.py b/scripts/llm_test_run.py index b82f828..b54c908 100644 --- a/scripts/llm_test_run.py +++ b/scripts/llm_test_run.py @@ -22,18 +22,11 @@ def main(): parser.add_argument("--datasets", type=list, default=["agnews", "subj"]) parser.add_argument("--token", type=str, default=None) parser.add_argument("--model-storage-path", type=str, default=None) - parser.add_argument("--concurrent-requests", type=int, default=8) args = parser.parse_args() start_time = time.time() - if "cbvllm" in args.model: - llm = get_llm( - args.model, - model_storage_path=args.model_storage_path, - concurrent_requests=args.concurrent_requests, - ) - elif "vllm" in args.model: + if "vllm" in args.model: llm = get_llm( args.model, model_storage_path=args.model_storage_path, From 42ab6c969cfc46097ee9cf4d98fdfd0f4e797b08 Mon Sep 17 00:00:00 2001 From: mo374z Date: Wed, 5 Mar 2025 20:58:30 +0100 Subject: [PATCH 15/19] add batching to vllm --- .flake8 | 2 +- promptolution/llms/vllm.py | 14 +++++++++----- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/.flake8 b/.flake8 index a2d1129..1276a9a 100644 --- a/.flake8 +++ b/.flake8 @@ -1,3 +1,3 @@ [flake8] max-line-length = 120 -ignore = F401, W503 +ignore = E731,E231,E203,E501,F401,W503 diff --git a/promptolution/llms/vllm.py b/promptolution/llms/vllm.py index 3de8c4b..0acd01e 100644 --- a/promptolution/llms/vllm.py +++ b/promptolution/llms/vllm.py @@ -37,7 +37,7 @@ class VLLM(BaseLLM): def __init__( self, 
model_id: str, - batch_size: int = 8, + batch_size: int = 64, max_generated_tokens: int = 256, temperature: float = 0.1, top_p: float = 0.9, @@ -92,7 +92,6 @@ def __init__( # Initialize tokenizer separately for potential pre-processing self.tokenizer = AutoTokenizer.from_pretrained(model_id) - self.batch_size = batch_size def get_response(self, inputs: list[str]): """Generate responses for a list of prompts using the vLLM engine. @@ -121,10 +120,15 @@ def get_response(self, inputs: list[str]): for input in inputs ] - outputs = self.llm.generate(prompts, self.sampling_params) - responses = [output.outputs[0].text for output in outputs] + # generate responses for self.batch_size prompts at the same time + all_responses = [] + for i in range(0, len(prompts), self.batch_size): + batch = prompts[i : i + self.batch_size] + outputs = self.llm.generate(batch, self.sampling_params) + responses = [output.outputs[0].text for output in outputs] + all_responses.extend(responses) - return responses + return all_responses def __del__(self): """Cleanup method to delete the LLM instance and free up GPU memory.""" From 0be3d064c596043c6a356ba170ae0ec24c1dd2ec Mon Sep 17 00:00:00 2001 From: mo374z Date: Wed, 5 Mar 2025 21:00:00 +0100 Subject: [PATCH 16/19] add batching in script --- scripts/llm_test_run.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scripts/llm_test_run.py b/scripts/llm_test_run.py index b54c908..aea7ccd 100644 --- a/scripts/llm_test_run.py +++ b/scripts/llm_test_run.py @@ -21,6 +21,7 @@ def main(): parser.add_argument("--output", type=str) parser.add_argument("--datasets", type=list, default=["agnews", "subj"]) parser.add_argument("--token", type=str, default=None) + parser.add_argument("--batch-size", type=int, default=64) parser.add_argument("--model-storage-path", type=str, default=None) args = parser.parse_args() @@ -29,6 +30,7 @@ def main(): if "vllm" in args.model: llm = get_llm( args.model, + batch_size=args.batch_size, model_storage_path=args.model_storage_path, ) else: From c5ac1015d461411f083f3ec03f1157d1a86f5b8e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timo=20Hei=C3=9F?= Date: Wed, 5 Mar 2025 22:01:48 +0100 Subject: [PATCH 17/19] Add release notes and increase version number --- docs/release-notes.md | 15 ++++++++++++++- pyproject.toml | 2 +- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/docs/release-notes.md b/docs/release-notes.md index 10c16a5..20b97b7 100644 --- a/docs/release-notes.md +++ b/docs/release-notes.md @@ -1,10 +1,19 @@ # Release Notes +## Release v1.2.0 +### What's changed +#### Added features +* New LLM wrapper: VLLM for local inference with batches + +**Full Changelog**: [here](https://github.com/finitearth/promptolution/compare/v1.1.1...v1.2.0) + ## Release v1.1.1 ### What's Changed #### Further Changes: - deleted poetry.lock -- updated transformers dependency: bumped from 4.46.3 to 4.48.0 +- updated transformers dependency: bumped from 4.46.3 to 4.48.0 + +**Full Changelog**: [here](https://github.com/finitearth/promptolution/compare/v1.1.0...v1.1.1) ## Release v1.1.0 ### What's changed @@ -16,6 +25,8 @@ * improved opros meta-prompt * added support for python versions from 3.9 onwards (previously 3.11) +**Full Changelog**: [here](https://github.com/finitearth/promptolution/compare/v1.0.1...v1.1.0) + ## Release v1.0.1 ### What's changed #### Added features @@ -24,6 +35,8 @@ #### Further Changes: * fixed release notes +**Full Changelog**: [here](https://github.com/finitearth/promptolution/compare/v1.0.0...v1.0.1) + ## Release v1.0.0 ### 
What's changed #### Added Features: diff --git a/pyproject.toml b/pyproject.toml index b96bc55..e4f5be3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "promptolution" -version = "1.1.1" +version = "1.2.0" description = "" authors = ["Tom Zehle, Moritz Schlager, Timo Heiß"] readme = "README.md" From 0eb701b20a1040108bd417a55ef0a613f578af82 Mon Sep 17 00:00:00 2001 From: mo374z Date: Wed, 5 Mar 2025 22:06:10 +0100 Subject: [PATCH 18/19] remove llm_test_run.py script --- scripts/llm_test_run.py | 91 ----------------------------------------- 1 file changed, 91 deletions(-) delete mode 100644 scripts/llm_test_run.py diff --git a/scripts/llm_test_run.py b/scripts/llm_test_run.py deleted file mode 100644 index aea7ccd..0000000 --- a/scripts/llm_test_run.py +++ /dev/null @@ -1,91 +0,0 @@ -"""Test script for measuring raw LLM inference performance on a dataset.""" -import time -from tqdm import tqdm -from logging import Logger -import argparse -import pandas as pd -import numpy as np - -from promptolution.tasks import get_task -from promptolution.config import Config -from promptolution.predictors import Classificator -from promptolution.llms import get_llm - -logger = Logger(__name__) - - -def main(): - """Run inference test on a dataset using a specified LLM.""" - parser = argparse.ArgumentParser() - parser.add_argument("--model", type=str) - parser.add_argument("--output", type=str) - parser.add_argument("--datasets", type=list, default=["agnews", "subj"]) - parser.add_argument("--token", type=str, default=None) - parser.add_argument("--batch-size", type=int, default=64) - parser.add_argument("--model-storage-path", type=str, default=None) - args = parser.parse_args() - - start_time = time.time() - - if "vllm" in args.model: - llm = get_llm( - args.model, - batch_size=args.batch_size, - model_storage_path=args.model_storage_path, - ) - else: - llm = get_llm(args.model, args.token) - - results = pd.DataFrame() - - for dataset in args.datasets: - config = Config( - evaluation_llm=args.model, - ds_path=f"data_sets/cls/{dataset}/", - task_name=dataset, - api_token=args.token, - n_eval_samples=200, - ) - - task = get_task(config, split="dev") - predictor = Classificator(llm, classes=task.classes) - - prompt = task.initial_population - - xs = task.xs[:config.n_eval_samples] - ys = task.ys[:config.n_eval_samples] - - for prompt in tqdm(task.initial_population): - preds, seqs = predictor.predict(prompt, xs, return_seq=True) - - scores = [] - for i in range(len(xs)): - scores.append(1 if preds[0][i] == ys[i] else 0) - - # clean up the sequences - seqs = [seq.replace("\n", "").strip() for seq in seqs] - - # if single prompts should be stored - # df = pd.DataFrame(dict(prompt=prompt, seq=seqs, score=scores)) - # df.to_csv(args.output + "_detailed", index=False) - - accuracy = np.array(scores).mean() - - results = pd.DataFrame( - dict( - model=args.model, - dataset=dataset, - prompt=prompt, - accuracy=accuracy, - n_samples=len(xs), - ), - index=[0], - ) - results.to_csv(args.output, mode="a", header=False, index=False) - - total_inference_time = time.time() - start_time - print(f"Total inference took {total_inference_time:.2f} seconds") - - -if __name__ == "__main__": - main() From fae011336f57b724310ea55fef8a3c759d925d6a Mon Sep 17 00:00:00 2001 From: mo374z Date: Wed, 5 Mar 2025 23:29:37 +0100 Subject: [PATCH 19/19] change system prompt --- promptolution/llms/vllm.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/promptolution/llms/vllm.py 
b/promptolution/llms/vllm.py index 0acd01e..d99c542 100644 --- a/promptolution/llms/vllm.py +++ b/promptolution/llms/vllm.py @@ -110,8 +110,7 @@ def get_response(self, inputs: list[str]): [ { "role": "system", - "content": "You are a helpful, harmless, and honest assistant. " - "You answer the user's questions accurately and fairly.", + "content": "You are a helpful assistant.", }, {"role": "user", "content": input}, ],
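For reference, a direct-construction sketch of the wrapper in its final form, using the defaults from VLLM.__init__; the model id and cache path are again placeholders rather than values used anywhere in this series.

# Sketch only: model id and storage path are placeholders.
from promptolution.llms.vllm import VLLM

llm = VLLM(
    "meta-llama/Meta-Llama-3-8B-Instruct",
    batch_size=64,                 # prompts sent to llm.generate() per chunk (PATCH 15)
    max_generated_tokens=256,
    temperature=0.1,
    top_p=0.9,
    dtype="auto",                  # relaxed from "float16" in PATCH 08
    tensor_parallel_size=1,
    gpu_memory_utilization=0.95,
    max_model_len=2048,
    model_storage_path="/path/to/model/cache",
)

# Inputs are wrapped with the "You are a helpful assistant." system prompt (PATCH 19)
# before batched generation.
responses = llm.get_response(["Summarize: vLLM enables fast batched inference."])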