From 683d9261bf185bd71dbd87bd00eea32204264710 Mon Sep 17 00:00:00 2001 From: mo374z Date: Mon, 3 Mar 2025 18:32:33 +0100 Subject: [PATCH 01/19] Add vllm as feature and a llm_test_run_script --- .gitignore | 1 + promptolution/llms/__init__.py | 10 +++- promptolution/llms/vllm.py | 98 ++++++++++++++++++++++++++++++++++ pyproject.toml | 1 + scripts/llm_test_run.py | 70 ++++++++++++++++++++++++ 5 files changed, 178 insertions(+), 2 deletions(-) create mode 100644 promptolution/llms/vllm.py create mode 100644 scripts/llm_test_run.py diff --git a/.gitignore b/.gitignore index 39aabc4..5786ca0 100644 --- a/.gitignore +++ b/.gitignore @@ -6,4 +6,5 @@ rsync_exclude.txt __pycache__/ temp/ dist/ +outputs/ poetry.lock diff --git a/promptolution/llms/__init__.py b/promptolution/llms/__init__.py index e4ca64e..95a22bd 100644 --- a/promptolution/llms/__init__.py +++ b/promptolution/llms/__init__.py @@ -3,6 +3,7 @@ from .api_llm import APILLM from .base_llm import DummyLLM from .local_llm import LocalLLM +from .vllm import VLLM def get_llm(model_id: str, *args, **kwargs): @@ -10,13 +11,15 @@ def get_llm(model_id: str, *args, **kwargs): This function supports three types of language models: 1. DummyLLM: A mock LLM for testing purposes. - 2. LocalLLM: For running models locally (identified by 'local' in the model_id). - 3. APILLM: For API-based models (default if not matching other types). + 2. LocalLLM: For running models locally. + 3. VLLM: For running models using the vLLM library. + 4. APILLM: For API-based models (default if not matching other types). Args: model_id (str): Identifier for the model to use. Special cases: - "dummy" for DummyLLM - "local-{model_name}" for LocalLLM + - "vllm-{model_name}" for VLLM - Any other string for APILLM *args: Variable length argument list passed to the LLM constructor. **kwargs: Arbitrary keyword arguments passed to the LLM constructor. @@ -29,4 +32,7 @@ def get_llm(model_id: str, *args, **kwargs): if "local" in model_id: model_id = "-".join(model_id.split("-")[1:]) return LocalLLM(model_id, *args, **kwargs) + if "vllm" in model_id: + model_id = "-".join(model_id.split("-")[1:]) + return VLLM(model_id, *args, **kwargs) return APILLM(model_id, *args, **kwargs) diff --git a/promptolution/llms/vllm.py b/promptolution/llms/vllm.py new file mode 100644 index 0000000..dd3a489 --- /dev/null +++ b/promptolution/llms/vllm.py @@ -0,0 +1,98 @@ +"""Module for running language models locally using the vLLM library.""" + + +from logging import INFO, Logger + +import torch +from transformers import AutoTokenizer +from vllm import LLM, SamplingParams + +logger = Logger(__name__) +logger.setLevel(INFO) + + +class VLLM: + """A class for running language models using the vLLM library. + + This class sets up a vLLM inference engine with specified model parameters + and provides a method to generate responses for given prompts. + + Attributes: + llm (vllm.LLM): The vLLM inference engine. + tokenizer (transformers.PreTrainedTokenizer): The tokenizer for the model. + sampling_params (vllm.SamplingParams): Parameters for text generation. + + Methods: + get_response: Generate responses for a list of prompts. + """ + + def __init__( + self, model_id: str, batch_size=8, max_tokens=256, temperature=0.1, top_p=0.9, model_storage_path=None + ): + """Initialize the VLLM with a specific model. + + Args: + model_id (str): The identifier of the model to use. + batch_size (int, optional): The batch size for text generation. Defaults to 8. 
+ max_tokens (int, optional): Maximum number of tokens to generate. Defaults to 256. + temperature (float, optional): Sampling temperature. Defaults to 0.1. + top_p (float, optional): Top-p sampling parameter. Defaults to 0.9. + model_storage_path (str, optional): Directory to store the model. Defaults to None. + + Note: + This method sets up a vLLM engine with specified parameters for efficient inference. + """ + # Configure sampling parameters + self.sampling_params = SamplingParams(temperature=temperature, top_p=top_p, max_tokens=max_tokens) + + # Initialize the vLLM engine + self.llm = LLM( + model=model_id, + tokenizer=model_id, + dtype="float16", + tensor_parallel_size=1, + gpu_memory_utilization=0.95, + max_model_len=2048, + download_dir=model_storage_path, + trust_remote_code=True, + ) + + # Initialize tokenizer separately for potential pre-processing + self.tokenizer = AutoTokenizer.from_pretrained(model_id) + self.batch_size = batch_size + + def get_response(self, inputs: list[str]): + """Generate responses for a list of prompts using the vLLM engine. + + Args: + prompts (list[str]): A list of input prompts. + + Returns: + list[str]: A list of generated responses corresponding to the input prompts. + + Note: + This method uses vLLM's batched generation capabilities for efficient inference. + """ + prompts = [ + self.tokenizer.apply_chat_template( + [ + { + "role": "system", + "content": "You are a helpful, harmless, and honest assistant. " + "You answer the user's questions accurately and fairly.", + }, + {"role": "user", "content": input}, + ], + tokenize=False, + ) + for input in inputs + ] + outputs = self.llm.generate(prompts, self.sampling_params) + responses = [output.outputs[0].text for output in outputs] + + return responses + + def __del__(self): + """Cleanup method to delete the LLM instance and free up GPU memory.""" + del self.llm + torch.cuda.empty_cache() diff --git a/pyproject.toml b/pyproject.toml index e933ab3..b96bc55 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -15,6 +15,7 @@ langchain-community = "^0.2.12" pandas = "^2.2.2" tqdm = "^4.66.5" scikit-learn = "^1.5.2" +vllm = "^0.7.3" [tool.poetry.group.dev.dependencies] matplotlib = "^3.9.2" diff --git a/scripts/llm_test_run.py b/scripts/llm_test_run.py new file mode 100644 index 0000000..61fff98 --- /dev/null +++ b/scripts/llm_test_run.py @@ -0,0 +1,70 @@ +"""Test script for measuring raw LLM inference performance on a dataset.""" +import time +import json +from logging import Logger +import argparse +import pandas as pd +import numpy as np + +from promptolution.tasks import get_task +from promptolution.config import Config +from promptolution.predictors import Classificator +from promptolution.llms import get_llm + +logger = Logger(__name__) + + +def main(): + """Run inference test on a dataset using a specified LLM.""" + parser = argparse.ArgumentParser(description="Test LLM inference performance") + parser.add_argument("--model", type=str) + parser.add_argument("--output", type=str) + parser.add_argument("--dataset", type=str, default="agnews") + parser.add_argument("--token", type=str, default=None) + parser.add_argument("--model-storage-path", type=str, default=None) + args = parser.parse_args() + + config = Config( + evaluation_llm=args.model, + ds_path=f"data_sets/cls/{args.dataset}/", + task_name=args.dataset, + api_token=args.token, + n_eval_samples=200, + ) + + start_time = time.time() + + task = get_task(config, split="dev") + llm = get_llm(config.evaluation_llm, token=config.api_token) + + 
predictor = Classificator(llm, classes=task.classes) + + prompt = task.initial_population[0] + + xs = task.xs[:config.n_eval_samples] + ys = task.ys[:config.n_eval_samples] + + preds, seqs = predictor.predict(prompt, xs, return_seq=True) + + scores = [] + for i in range(len(xs)): + scores.append(1 if preds[0][i] == ys[i] else 0) + + # clean up the sequences + seqs = [seq.replace("\n", "").strip() for seq in seqs] + + df = pd.DataFrame(dict(prompt=task.initial_population[0], seq=seqs, score=scores)) + + total_inference_time = time.time() - start_time + + accuracy = np.array(scores).mean() + + print(f"Overall Acc {accuracy:.4f}") + print(f"Used model {args.model} on dataset {args.dataset}") + print(f"Total inference took {total_inference_time:.2f} seconds") + + df.to_csv(args.output, index=False) + + +if __name__ == "__main__": + main() From 69837fa17cbcfc9e85e676cb8e22f9fc822c2f3a Mon Sep 17 00:00:00 2001 From: mo374z Date: Mon, 3 Mar 2025 18:46:05 +0100 Subject: [PATCH 02/19] small fixes in vllm class --- promptolution/llms/vllm.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/promptolution/llms/vllm.py b/promptolution/llms/vllm.py index dd3a489..aeb1bb1 100644 --- a/promptolution/llms/vllm.py +++ b/promptolution/llms/vllm.py @@ -27,7 +27,14 @@ class VLLM: """ def __init__( - self, model_id: str, batch_size=8, max_tokens=256, temperature=0.1, top_p=0.9, model_storage_path=None + self, + model_id: str, + batch_size: int = 8, + max_tokens: int = 256, + temperature: float = 0.1, + top_p: float = 0.9, + model_storage_path: str = None, + token: str = None, ): """Initialize the VLLM with a specific model. @@ -38,6 +45,7 @@ def __init__( temperature (float, optional): Sampling temperature. Defaults to 0.1. top_p (float, optional): Top-p sampling parameter. Defaults to 0.9. model_storage_path (str, optional): Directory to store the model. Defaults to None. + token: (str, optional): Token for accessing the model - not used in implementation yet. Note: This method sets up a vLLM engine with specified parameters for efficient inference. 
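For orientation, a minimal usage sketch of the backend introduced so far: get_llm strips the "vllm-" prefix and hands the remainder to the VLLM wrapper, which chat-templates each input and generates responses in bulk. The model id and cache path below are placeholders, not values taken from this series.

# Sketch only: model id and storage path are placeholders.
from promptolution.llms import get_llm

# The "vllm-" prefix selects the VLLM wrapper; the prefix is stripped, so vLLM
# receives "meta-llama/Meta-Llama-3-8B-Instruct" as its model id.
llm = get_llm(
    "vllm-meta-llama/Meta-Llama-3-8B-Instruct",
    model_storage_path="/path/to/model/cache",  # forwarded to vLLM's download_dir
)

# get_response applies the chat template to every input and returns one string per input.
responses = llm.get_response(["Classify the topic of: Stocks rallied on Friday."])
print(responses[0])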
From 7563712bce41432af9abc7bb5f6e415412fb1360 Mon Sep 17 00:00:00 2001 From: mo374z Date: Mon, 3 Mar 2025 21:35:57 +0100 Subject: [PATCH 03/19] differentiate between vllm and api inference --- scripts/llm_test_run.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/scripts/llm_test_run.py b/scripts/llm_test_run.py index 61fff98..ed9242e 100644 --- a/scripts/llm_test_run.py +++ b/scripts/llm_test_run.py @@ -35,7 +35,13 @@ def main(): start_time = time.time() task = get_task(config, split="dev") - llm = get_llm(config.evaluation_llm, token=config.api_token) + if "vllm" in args.model: + llm = get_llm( + config.evaluation_llm, + model_storage_path=args.model_storage_path, + ) + else: + llm = get_llm(config.evaluation_llm, token=config.api_token) predictor = Classificator(llm, classes=task.classes) From af6f9f8230e896a03a2dfd5d381f883f1e985136 Mon Sep 17 00:00:00 2001 From: mo374z Date: Mon, 3 Mar 2025 22:10:45 +0100 Subject: [PATCH 04/19] set up experiment over multiple tasks and prompts --- scripts/llm_test_run.py | 76 +++++++++++++++++++++++++---------------- 1 file changed, 46 insertions(+), 30 deletions(-) diff --git a/scripts/llm_test_run.py b/scripts/llm_test_run.py index ed9242e..876433e 100644 --- a/scripts/llm_test_run.py +++ b/scripts/llm_test_run.py @@ -1,6 +1,6 @@ """Test script for measuring raw LLM inference performance on a dataset.""" import time -import json +from tqdm import tqdm from logging import Logger import argparse import pandas as pd @@ -16,60 +16,76 @@ def main(): """Run inference test on a dataset using a specified LLM.""" - parser = argparse.ArgumentParser(description="Test LLM inference performance") + parser = argparse.ArgumentParser() parser.add_argument("--model", type=str) parser.add_argument("--output", type=str) - parser.add_argument("--dataset", type=str, default="agnews") + parser.add_argument("--datasets", type=list, default=["agnews", "subj"]) parser.add_argument("--token", type=str, default=None) parser.add_argument("--model-storage-path", type=str, default=None) args = parser.parse_args() - config = Config( - evaluation_llm=args.model, - ds_path=f"data_sets/cls/{args.dataset}/", - task_name=args.dataset, - api_token=args.token, - n_eval_samples=200, - ) - start_time = time.time() - task = get_task(config, split="dev") if "vllm" in args.model: llm = get_llm( - config.evaluation_llm, + args.model, model_storage_path=args.model_storage_path, ) else: - llm = get_llm(config.evaluation_llm, token=config.api_token) + llm = get_llm(args.model, args.token) - predictor = Classificator(llm, classes=task.classes) + results = pd.DataFrame() - prompt = task.initial_population[0] + for dataset in args.datasets: + config = Config( + evaluation_llm=args.model, + ds_path=f"data_sets/cls/{dataset}/", + task_name=dataset, + api_token=args.token, + n_eval_samples=200, + ) - xs = task.xs[:config.n_eval_samples] - ys = task.ys[:config.n_eval_samples] + task = get_task(config, split="dev") + predictor = Classificator(llm, classes=task.classes) - preds, seqs = predictor.predict(prompt, xs, return_seq=True) + prompt = task.initial_population - scores = [] - for i in range(len(xs)): - scores.append(1 if preds[0][i] == ys[i] else 0) + xs = task.xs[:config.n_eval_samples] + ys = task.ys[:config.n_eval_samples] - # clean up the sequences - seqs = [seq.replace("\n", "").strip() for seq in seqs] + for prompt in tqdm(task.initial_population): + preds, seqs = predictor.predict(prompt, xs, return_seq=True) - df = 
pd.DataFrame(dict(prompt=task.initial_population[0], seq=seqs, score=scores)) + scores = [] + for i in range(len(xs)): + scores.append(1 if preds[0][i] == ys[i] else 0) - total_inference_time = time.time() - start_time + # clean up the sequences + seqs = [seq.replace("\n", "").strip() for seq in seqs] - accuracy = np.array(scores).mean() + # if single prompts should be stored + # df = pd.DataFrame(dict(prompt=prompt, seq=seqs, score=scores)) + # df.to_csv(args.output + "_detailed", index=False) - print(f"Overall Acc {accuracy:.4f}") - print(f"Used model {args.model} on dataset {args.dataset}") + accuracy = np.array(scores).mean() + + results = pd.concat([results, + pd.DataFrame( + dict( + model=args.model, + dataset=dataset, + prompt=prompt, + accuracy=accuracy, + n_samples=len(xs), + ), + index=[0], + )] + ) + + total_inference_time = time.time() - start_time print(f"Total inference took {total_inference_time:.2f} seconds") - df.to_csv(args.output, index=False) + results.to_csv(args.output, mode="a", header=False, index=False) if __name__ == "__main__": From bc9997a6221a54aa0b503bd8712f9bc90fc4e468 Mon Sep 17 00:00:00 2001 From: mo374z Date: Mon, 3 Mar 2025 22:21:37 +0100 Subject: [PATCH 05/19] change csv saving --- scripts/llm_test_run.py | 25 +++++++++++-------------- 1 file changed, 11 insertions(+), 14 deletions(-) diff --git a/scripts/llm_test_run.py b/scripts/llm_test_run.py index 876433e..b54c908 100644 --- a/scripts/llm_test_run.py +++ b/scripts/llm_test_run.py @@ -69,24 +69,21 @@ def main(): accuracy = np.array(scores).mean() - results = pd.concat([results, - pd.DataFrame( - dict( - model=args.model, - dataset=dataset, - prompt=prompt, - accuracy=accuracy, - n_samples=len(xs), - ), - index=[0], - )] - ) + results = pd.DataFrame( + dict( + model=args.model, + dataset=dataset, + prompt=prompt, + accuracy=accuracy, + n_samples=len(xs), + ), + index=[0], + ) + results.to_csv(args.output, mode="a", header=False, index=False) total_inference_time = time.time() - start_time print(f"Total inference took {total_inference_time:.2f} seconds") - results.to_csv(args.output, mode="a", header=False, index=False) - if __name__ == "__main__": main() From 7958b8614ddae5fdd0bf3adfab9306c76c4be2c8 Mon Sep 17 00:00:00 2001 From: mo374z Date: Tue, 4 Mar 2025 17:07:05 +0100 Subject: [PATCH 06/19] add base llm super class --- promptolution/llms/vllm.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/promptolution/llms/vllm.py b/promptolution/llms/vllm.py index aeb1bb1..6f2d3fd 100644 --- a/promptolution/llms/vllm.py +++ b/promptolution/llms/vllm.py @@ -7,11 +7,13 @@ from transformers import AutoTokenizer from vllm import LLM, SamplingParams +from promptolution.llms.base_llm import BaseLLM + logger = Logger(__name__) logger.setLevel(INFO) -class VLLM: +class VLLM(BaseLLM): """A class for running language models using the vLLM library. 
This class sets up a vLLM inference engine with specified model parameters From e82db3563f3907e86239c41e39e02666cea251c3 Mon Sep 17 00:00:00 2001 From: mo374z Date: Wed, 5 Mar 2025 00:43:04 +0100 Subject: [PATCH 07/19] add changes from PR review --- promptolution/llms/api_llm.py | 4 ++- promptolution/llms/local_llm.py | 4 ++- promptolution/llms/vllm.py | 45 +++++++++++++++++++++++++-------- 3 files changed, 40 insertions(+), 13 deletions(-) diff --git a/promptolution/llms/api_llm.py b/promptolution/llms/api_llm.py index 1c34709..cf966bf 100644 --- a/promptolution/llms/api_llm.py +++ b/promptolution/llms/api_llm.py @@ -13,6 +13,8 @@ from langchain_core.messages import HumanMessage from langchain_openai import ChatOpenAI +from promptolution.llms.base_llm import BaseLLM + logger = Logger(__name__) logger.setLevel(INFO) @@ -46,7 +48,7 @@ async def invoke_model(prompt, model, semaphore): await asyncio.sleep(delay) -class APILLM: +class APILLM(BaseLLM): """A class to interface with various language models through their respective APIs. This class supports Claude (Anthropic), GPT (OpenAI), and LLaMA (DeepInfra) models. diff --git a/promptolution/llms/local_llm.py b/promptolution/llms/local_llm.py index 1cfb616..074bf01 100644 --- a/promptolution/llms/local_llm.py +++ b/promptolution/llms/local_llm.py @@ -8,8 +8,10 @@ logger = logging.getLogger(__name__) logger.warning(f"Could not import torch or transformers in local_llm.py: {e}") +from promptolution.llms.base_llm import BaseLLM -class LocalLLM: + +class LocalLLM(BaseLLM): """A class for running language models locally using the Hugging Face Transformers library. This class sets up a text generation pipeline with specified model parameters diff --git a/promptolution/llms/vllm.py b/promptolution/llms/vllm.py index 6f2d3fd..53983d8 100644 --- a/promptolution/llms/vllm.py +++ b/promptolution/llms/vllm.py @@ -3,9 +3,15 @@ from logging import INFO, Logger -import torch -from transformers import AutoTokenizer -from vllm import LLM, SamplingParams +try: + import torch + from transformers import AutoTokenizer + from vllm import LLM, SamplingParams +except ImportError as e: + import logging + + logger = logging.getLogger(__name__) + logger.warning(f"Could not import vllm, torch or transformers in vllm.py: {e}") from promptolution.llms.base_llm import BaseLLM @@ -32,39 +38,56 @@ def __init__( self, model_id: str, batch_size: int = 8, - max_tokens: int = 256, + max_generated_tokens: int = 256, temperature: float = 0.1, top_p: float = 0.9, model_storage_path: str = None, token: str = None, + dtype: str = "float16", + tensor_parallel_size: int = 1, + gpu_memory_utilization: float = 0.95, + max_model_len: int = 2048, + trust_remote_code: bool = False, ): """Initialize the VLLM with a specific model. Args: model_id (str): The identifier of the model to use. batch_size (int, optional): The batch size for text generation. Defaults to 8. - max_tokens (int, optional): Maximum number of tokens to generate. Defaults to 256. + max_generated_tokens (int, optional): Maximum number of tokens to generate. Defaults to 256. temperature (float, optional): Sampling temperature. Defaults to 0.1. top_p (float, optional): Top-p sampling parameter. Defaults to 0.9. model_storage_path (str, optional): Directory to store the model. Defaults to None. token: (str, optional): Token for accessing the model - not used in implementation yet. + dtype (str, optional): Data type for model weights. Defaults to "float16". 
+ tensor_parallel_size (int, optional): Number of GPUs for tensor parallelism. Defaults to 1. + gpu_memory_utilization (float, optional): Fraction of GPU memory to use. Defaults to 0.95. + max_model_len (int, optional): Maximum sequence length for the model. Defaults to 2048. + trust_remote_code (bool, optional): Whether to trust remote code. Defaults to False. Note: This method sets up a vLLM engine with specified parameters for efficient inference. """ + self.batch_size = batch_size + self.dtype = dtype + self.tensor_parallel_size = tensor_parallel_size + self.gpu_memory_utilization = gpu_memory_utilization + self.max_model_len = max_model_len + self.trust_remote_code = trust_remote_code + # Configure sampling parameters - self.sampling_params = SamplingParams(temperature=temperature, top_p=top_p, max_tokens=max_tokens) + self.sampling_params = SamplingParams(temperature=temperature, top_p=top_p, max_tokens=max_generated_tokens) # Initialize the vLLM engine self.llm = LLM( model=model_id, tokenizer=model_id, - dtype="float16", - tensor_parallel_size=1, - gpu_memory_utilization=0.95, - max_model_len=2048, + dtype=self.dtype, + tensor_parallel_size=self.tensor_parallel_size, + gpu_memory_utilization=self.gpu_memory_utilization, + max_model_len=self.max_model_len, download_dir=model_storage_path, - trust_remote_code=True, + trust_remote_code=self.trust_remote_code, ) # Initialize tokenizer separately for potential pre-processing From 0045de7122c171ccac6382f87c150c43a19c897a Mon Sep 17 00:00:00 2001 From: mo374z Date: Wed, 5 Mar 2025 18:51:53 +0100 Subject: [PATCH 08/19] change some VLLM params --- promptolution/llms/vllm.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/promptolution/llms/vllm.py b/promptolution/llms/vllm.py index 53983d8..f9b8a36 100644 --- a/promptolution/llms/vllm.py +++ b/promptolution/llms/vllm.py @@ -43,10 +43,10 @@ def __init__( top_p: float = 0.9, model_storage_path: str = None, token: str = None, - dtype: str = "float16", - tensor_parallel_size: int = 1, + dtype: str = "auto", + tensor_parallel_size: int = None, gpu_memory_utilization: float = 0.95, - max_model_len: int = 2048, + max_model_len: int = 1024, trust_remote_code: bool = False, ): """Initialize the VLLM with a specific model. 
From 0b3c7cb028af085916ebbffaa56644cda935ab07 Mon Sep 17 00:00:00 2001 From: mo374z Date: Wed, 5 Mar 2025 18:54:49 +0100 Subject: [PATCH 09/19] fix tensor parallel size to 1 --- promptolution/llms/vllm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/promptolution/llms/vllm.py b/promptolution/llms/vllm.py index f9b8a36..34658b2 100644 --- a/promptolution/llms/vllm.py +++ b/promptolution/llms/vllm.py @@ -44,7 +44,7 @@ def __init__( model_storage_path: str = None, token: str = None, dtype: str = "auto", - tensor_parallel_size: int = None, + tensor_parallel_size: int = 1, gpu_memory_utilization: float = 0.95, max_model_len: int = 1024, trust_remote_code: bool = False, From a73c378426e15379676044a9dc477f84aec8f978 Mon Sep 17 00:00:00 2001 From: mo374z Date: Wed, 5 Mar 2025 19:04:13 +0100 Subject: [PATCH 10/19] experiment with batch size --- promptolution/llms/vllm.py | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/promptolution/llms/vllm.py b/promptolution/llms/vllm.py index 34658b2..8c07fc0 100644 --- a/promptolution/llms/vllm.py +++ b/promptolution/llms/vllm.py @@ -46,7 +46,7 @@ def __init__( dtype: str = "auto", tensor_parallel_size: int = 1, gpu_memory_utilization: float = 0.95, - max_model_len: int = 1024, + max_model_len: int = 2048, trust_remote_code: bool = False, ): """Initialize the VLLM with a specific model. @@ -120,8 +120,20 @@ def get_response(self, inputs: list[str]): ) for input in inputs ] - outputs = self.llm.generate(prompts, self.sampling_params) - responses = [output.outputs[0].text for output in outputs] + # outputs = self.llm.generate(prompts, self.sampling_params) + # responses = [output.outputs[0].text for output in outputs] + optimal_batch_size = 100 + + responses = [] + for i in range(0, len(prompts), optimal_batch_size): + batch = prompts[i : i + optimal_batch_size] # noqa: E203 + outputs = self.llm.generate(batch, self.sampling_params) + batch_responses = [output.outputs[0].text for output in outputs] + responses.extend(batch_responses) + + # Explicitly clean up between batches + if i + optimal_batch_size < len(prompts): + torch.cuda.empty_cache() return responses From 1f6841098664a497970ba6921a3332c10b6f3138 Mon Sep 17 00:00:00 2001 From: mo374z Date: Wed, 5 Mar 2025 19:11:38 +0100 Subject: [PATCH 11/19] experiment with larger batch sizes --- promptolution/llms/vllm.py | 21 +++++++-------------- 1 file changed, 7 insertions(+), 14 deletions(-) diff --git a/promptolution/llms/vllm.py b/promptolution/llms/vllm.py index 8c07fc0..1872b95 100644 --- a/promptolution/llms/vllm.py +++ b/promptolution/llms/vllm.py @@ -120,20 +120,13 @@ def get_response(self, inputs: list[str]): ) for input in inputs ] - # outputs = self.llm.generate(prompts, self.sampling_params) - # responses = [output.outputs[0].text for output in outputs] - optimal_batch_size = 100 - - responses = [] - for i in range(0, len(prompts), optimal_batch_size): - batch = prompts[i : i + optimal_batch_size] # noqa: E203 - outputs = self.llm.generate(batch, self.sampling_params) - batch_responses = [output.outputs[0].text for output in outputs] - responses.extend(batch_responses) - - # Explicitly clean up between batches - if i + optimal_batch_size < len(prompts): - torch.cuda.empty_cache() + + prompts_2 = prompts.copy() + + prompts_all = prompts + prompts_2 + + outputs = self.llm.generate(prompts_all, self.sampling_params) + responses = [output.outputs[0].text for output in outputs] return responses From f5fe188b2ee4436e8276e15bffcd72f730f55d95 Mon Sep 
17 00:00:00 2001 From: mo374z Date: Wed, 5 Mar 2025 19:37:50 +0100 Subject: [PATCH 12/19] add continuous batch llm --- promptolution/llms/__init__.py | 4 + promptolution/llms/cb_vllm.py | 235 +++++++++++++++++++++++++++++++++ scripts/llm_test_run.py | 9 +- 3 files changed, 247 insertions(+), 1 deletion(-) create mode 100644 promptolution/llms/cb_vllm.py diff --git a/promptolution/llms/__init__.py b/promptolution/llms/__init__.py index 95a22bd..ac774ad 100644 --- a/promptolution/llms/__init__.py +++ b/promptolution/llms/__init__.py @@ -2,6 +2,7 @@ from .api_llm import APILLM from .base_llm import DummyLLM +from .cb_vllm import ContinuousBatchVLLM from .local_llm import LocalLLM from .vllm import VLLM @@ -32,6 +33,9 @@ def get_llm(model_id: str, *args, **kwargs): if "local" in model_id: model_id = "-".join(model_id.split("-")[1:]) return LocalLLM(model_id, *args, **kwargs) + if "cbvllm" in model_id: + model_id = "-".join(model_id.split("-")[1:]) + return ContinuousBatchVLLM(model_id, *args, **kwargs) if "vllm" in model_id: model_id = "-".join(model_id.split("-")[1:]) return VLLM(model_id, *args, **kwargs) diff --git a/promptolution/llms/cb_vllm.py b/promptolution/llms/cb_vllm.py new file mode 100644 index 0000000..810d829 --- /dev/null +++ b/promptolution/llms/cb_vllm.py @@ -0,0 +1,235 @@ +"""Module for running language models using vLLM with continuous batching.""" + +import time +from concurrent.futures import ThreadPoolExecutor +from logging import INFO, Logger +from queue import Queue +from threading import Lock +from typing import List + +try: + import torch + from transformers import AutoTokenizer + from vllm import LLM, SamplingParams +except ImportError as e: + import logging + + logger = logging.getLogger(__name__) + logger.warning(f"Could not import vllm, torch or transformers in vllm.py: {e}") + +from promptolution.llms.base_llm import BaseLLM + +logger = Logger(__name__) +logger.setLevel(INFO) + + +class ContinuousBatchVLLM(BaseLLM): + """A class for running language models using vLLM with continuous batching.""" + + def __init__( + self, + model_id: str, + concurrent_requests: int = 8, + max_generated_tokens: int = 256, + temperature: float = 0.1, + top_p: float = 0.9, + model_storage_path: str = None, + token: str = None, + dtype: str = "auto", + tensor_parallel_size: int = 1, + gpu_memory_utilization: float = 0.95, + max_model_len: int = 2048, + trust_remote_code: bool = False, + block_size: int = 16, + ): + """Initialize the continuous batching vLLM engine. + + Args: + model_id (str): The identifier of the model to use. + concurrent_requests (int, optional): Number of requests to process concurrently. Defaults to 8. + max_generated_tokens (int, optional): Maximum number of tokens to generate. Defaults to 256. + temperature (float, optional): Sampling temperature. Defaults to 0.1. + top_p (float, optional): Top-p sampling parameter. Defaults to 0.9. + model_storage_path (str, optional): Directory to store the model. Defaults to None. + token (str, optional): Token for accessing the model. Defaults to None. + dtype (str, optional): Data type for model weights. Defaults to "auto". + tensor_parallel_size (int, optional): Number of GPUs for tensor parallelism. Defaults to 1. + gpu_memory_utilization (float, optional): Fraction of GPU memory to use. Defaults to 0.95. + max_model_len (int, optional): Maximum sequence length for the model. Defaults to 2048. + trust_remote_code (bool, optional): Whether to trust remote code. Defaults to False. 
+ block_size (int, optional): KV cache block size. Smaller values can improve performance. Defaults to 16. + """ + self.model_id = model_id + self.concurrent_requests = concurrent_requests + self.dtype = dtype + self.tensor_parallel_size = tensor_parallel_size + self.gpu_memory_utilization = gpu_memory_utilization + self.max_model_len = max_model_len + self.trust_remote_code = trust_remote_code + self.block_size = block_size + + self.sampling_params = SamplingParams( + temperature=temperature, + top_p=top_p, + max_tokens=max_generated_tokens, + early_stopping=True, + ) + + logger.info(f"Initializing continuous batching vLLM with model {model_id}") + start_time = time.time() + + self.llm = LLM( + model=model_id, + tokenizer=model_id, + dtype=self.dtype, + tensor_parallel_size=self.tensor_parallel_size, + gpu_memory_utilization=self.gpu_memory_utilization, + max_model_len=self.max_model_len, + download_dir=model_storage_path, + trust_remote_code=self.trust_remote_code, + block_size=self.block_size, + ) + + logger.info(f"vLLM initialization took {time.time() - start_time:.2f} seconds") + + self.tokenizer = AutoTokenizer.from_pretrained(model_id) + + self.executor = ThreadPoolExecutor(max_workers=1) + self.request_queue = Queue() + self.result_map = {} + self.result_lock = Lock() + + self._warm_up_model() + + self.is_running = True + self.executor.submit(self._continuous_batch_worker) + + def _warm_up_model(self): + logger.info("Warming up model...") + start_time = time.time() + + warmup_prompt = self.tokenizer.apply_chat_template( + [ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "Hello, how are you?"}, + ], + tokenize=False, + ) + + self.llm.generate([warmup_prompt], self.sampling_params) + + torch.cuda.empty_cache() + logger.info(f"Model warm-up completed in {time.time() - start_time:.2f} seconds") + + def _continuous_batch_worker(self): + logger.info("Starting continuous batching worker thread") + + active_requests = {} + + while self.is_running: + while not self.request_queue.empty() and len(active_requests) < self.concurrent_requests: + try: + request_id, prompt = self.request_queue.get_nowait() + active_requests[request_id] = prompt + except Exception: + break + + if active_requests: + try: + request_ids = list(active_requests.keys()) + prompts = list(active_requests.values()) + + logger.info(f"Processing batch of {len(prompts)} prompts") + start_time = time.time() + + outputs = self.llm.generate(prompts, self.sampling_params) + + elapsed = time.time() - start_time + logger.info(f"Batch processed in {elapsed:.3f}s ({len(prompts)/elapsed:.1f} prompts/sec)") + + with self.result_lock: + for request_id, output in zip(request_ids, outputs): + self.result_map[request_id] = output.outputs[0].text + + active_requests.clear() + + except Exception as e: + logger.error(f"Error in continuous batching worker: {e}") + active_requests.clear() + + time.sleep(0.01) + + def get_response(self, inputs: List[str]) -> List[str]: + """Generate responses for a list of prompts using the continuous batching vLLM engine. + + This method queues the input prompts for processing by the background worker thread + and waits for the results to be available. + + Args: + inputs (List[str]): A list of input prompts. + + Returns: + List[str]: A list of generated responses corresponding to the input prompts. + """ + prompts = [ + self.tokenizer.apply_chat_template( + [ + { + "role": "system", + "content": "You are a helpful, harmless, and honest assistant. 
" + "You answer the user's questions accurately and fairly.", + }, + {"role": "user", "content": input_text}, + ], + tokenize=False, + ) + for input_text in inputs + ] + + request_ids = [f"req_{int(time.time() * 1000)}_{i}" for i in range(len(prompts))] + + for request_id, prompt in zip(request_ids, prompts): + self.request_queue.put((request_id, prompt)) + + max_wait_time = 60 + start_time = time.time() + + results = [None] * len(request_ids) + remaining_ids = set(request_ids) + + while remaining_ids and (time.time() - start_time) < max_wait_time: + with self.result_lock: + for i, request_id in enumerate(request_ids): + if request_id in self.result_map and request_id in remaining_ids: + results[i] = self.result_map[request_id] + remaining_ids.remove(request_id) + del self.result_map[request_id] + + if remaining_ids: + time.sleep(0.1) + + if remaining_ids: + logger.warning(f"Timed out waiting for {len(remaining_ids)} requests") + for i, request_id in enumerate(request_ids): + if results[i] is None: + results[i] = "Error: Request timed out" + + return results + + def __del__(self): + """Cleanup method to stop the worker thread and free resources. + + This magic method is called when the object is about to be destroyed. + It ensures proper shutdown of the background worker thread and + releases GPU resources. + """ + self.is_running = False + + if hasattr(self, "executor"): + self.executor.shutdown(wait=False) + + if hasattr(self, "llm"): + del self.llm + + if torch.cuda.is_available(): + torch.cuda.empty_cache() diff --git a/scripts/llm_test_run.py b/scripts/llm_test_run.py index b54c908..b82f828 100644 --- a/scripts/llm_test_run.py +++ b/scripts/llm_test_run.py @@ -22,11 +22,18 @@ def main(): parser.add_argument("--datasets", type=list, default=["agnews", "subj"]) parser.add_argument("--token", type=str, default=None) parser.add_argument("--model-storage-path", type=str, default=None) + parser.add_argument("--concurrent-requests", type=int, default=8) args = parser.parse_args() start_time = time.time() - if "vllm" in args.model: + if "cbvllm" in args.model: + llm = get_llm( + args.model, + model_storage_path=args.model_storage_path, + concurrent_requests=args.concurrent_requests, + ) + elif "vllm" in args.model: llm = get_llm( args.model, model_storage_path=args.model_storage_path, From 1330a9e2f55b4089c92f4a658255a4aa3879088c Mon Sep 17 00:00:00 2001 From: mo374z Date: Wed, 5 Mar 2025 19:39:34 +0100 Subject: [PATCH 13/19] remove arg --- promptolution/llms/cb_vllm.py | 1 - 1 file changed, 1 deletion(-) diff --git a/promptolution/llms/cb_vllm.py b/promptolution/llms/cb_vllm.py index 810d829..c28b6bc 100644 --- a/promptolution/llms/cb_vllm.py +++ b/promptolution/llms/cb_vllm.py @@ -72,7 +72,6 @@ def __init__( temperature=temperature, top_p=top_p, max_tokens=max_generated_tokens, - early_stopping=True, ) logger.info(f"Initializing continuous batching vLLM with model {model_id}") From c6dbb7be85942f9f6569c7daec103dae68231b60 Mon Sep 17 00:00:00 2001 From: mo374z Date: Wed, 5 Mar 2025 19:54:50 +0100 Subject: [PATCH 14/19] remove continuous batch inference try --- promptolution/llms/__init__.py | 4 - promptolution/llms/cb_vllm.py | 234 --------------------------------- promptolution/llms/vllm.py | 6 +- scripts/llm_test_run.py | 9 +- 4 files changed, 2 insertions(+), 251 deletions(-) delete mode 100644 promptolution/llms/cb_vllm.py diff --git a/promptolution/llms/__init__.py b/promptolution/llms/__init__.py index ac774ad..95a22bd 100644 --- a/promptolution/llms/__init__.py +++ 
b/promptolution/llms/__init__.py @@ -2,7 +2,6 @@ from .api_llm import APILLM from .base_llm import DummyLLM -from .cb_vllm import ContinuousBatchVLLM from .local_llm import LocalLLM from .vllm import VLLM @@ -33,9 +32,6 @@ def get_llm(model_id: str, *args, **kwargs): if "local" in model_id: model_id = "-".join(model_id.split("-")[1:]) return LocalLLM(model_id, *args, **kwargs) - if "cbvllm" in model_id: - model_id = "-".join(model_id.split("-")[1:]) - return ContinuousBatchVLLM(model_id, *args, **kwargs) if "vllm" in model_id: model_id = "-".join(model_id.split("-")[1:]) return VLLM(model_id, *args, **kwargs) diff --git a/promptolution/llms/cb_vllm.py b/promptolution/llms/cb_vllm.py deleted file mode 100644 index c28b6bc..0000000 --- a/promptolution/llms/cb_vllm.py +++ /dev/null @@ -1,234 +0,0 @@ -"""Module for running language models using vLLM with continuous batching.""" - -import time -from concurrent.futures import ThreadPoolExecutor -from logging import INFO, Logger -from queue import Queue -from threading import Lock -from typing import List - -try: - import torch - from transformers import AutoTokenizer - from vllm import LLM, SamplingParams -except ImportError as e: - import logging - - logger = logging.getLogger(__name__) - logger.warning(f"Could not import vllm, torch or transformers in vllm.py: {e}") - -from promptolution.llms.base_llm import BaseLLM - -logger = Logger(__name__) -logger.setLevel(INFO) - - -class ContinuousBatchVLLM(BaseLLM): - """A class for running language models using vLLM with continuous batching.""" - - def __init__( - self, - model_id: str, - concurrent_requests: int = 8, - max_generated_tokens: int = 256, - temperature: float = 0.1, - top_p: float = 0.9, - model_storage_path: str = None, - token: str = None, - dtype: str = "auto", - tensor_parallel_size: int = 1, - gpu_memory_utilization: float = 0.95, - max_model_len: int = 2048, - trust_remote_code: bool = False, - block_size: int = 16, - ): - """Initialize the continuous batching vLLM engine. - - Args: - model_id (str): The identifier of the model to use. - concurrent_requests (int, optional): Number of requests to process concurrently. Defaults to 8. - max_generated_tokens (int, optional): Maximum number of tokens to generate. Defaults to 256. - temperature (float, optional): Sampling temperature. Defaults to 0.1. - top_p (float, optional): Top-p sampling parameter. Defaults to 0.9. - model_storage_path (str, optional): Directory to store the model. Defaults to None. - token (str, optional): Token for accessing the model. Defaults to None. - dtype (str, optional): Data type for model weights. Defaults to "auto". - tensor_parallel_size (int, optional): Number of GPUs for tensor parallelism. Defaults to 1. - gpu_memory_utilization (float, optional): Fraction of GPU memory to use. Defaults to 0.95. - max_model_len (int, optional): Maximum sequence length for the model. Defaults to 2048. - trust_remote_code (bool, optional): Whether to trust remote code. Defaults to False. - block_size (int, optional): KV cache block size. Smaller values can improve performance. Defaults to 16. 
- """ - self.model_id = model_id - self.concurrent_requests = concurrent_requests - self.dtype = dtype - self.tensor_parallel_size = tensor_parallel_size - self.gpu_memory_utilization = gpu_memory_utilization - self.max_model_len = max_model_len - self.trust_remote_code = trust_remote_code - self.block_size = block_size - - self.sampling_params = SamplingParams( - temperature=temperature, - top_p=top_p, - max_tokens=max_generated_tokens, - ) - - logger.info(f"Initializing continuous batching vLLM with model {model_id}") - start_time = time.time() - - self.llm = LLM( - model=model_id, - tokenizer=model_id, - dtype=self.dtype, - tensor_parallel_size=self.tensor_parallel_size, - gpu_memory_utilization=self.gpu_memory_utilization, - max_model_len=self.max_model_len, - download_dir=model_storage_path, - trust_remote_code=self.trust_remote_code, - block_size=self.block_size, - ) - - logger.info(f"vLLM initialization took {time.time() - start_time:.2f} seconds") - - self.tokenizer = AutoTokenizer.from_pretrained(model_id) - - self.executor = ThreadPoolExecutor(max_workers=1) - self.request_queue = Queue() - self.result_map = {} - self.result_lock = Lock() - - self._warm_up_model() - - self.is_running = True - self.executor.submit(self._continuous_batch_worker) - - def _warm_up_model(self): - logger.info("Warming up model...") - start_time = time.time() - - warmup_prompt = self.tokenizer.apply_chat_template( - [ - {"role": "system", "content": "You are a helpful assistant."}, - {"role": "user", "content": "Hello, how are you?"}, - ], - tokenize=False, - ) - - self.llm.generate([warmup_prompt], self.sampling_params) - - torch.cuda.empty_cache() - logger.info(f"Model warm-up completed in {time.time() - start_time:.2f} seconds") - - def _continuous_batch_worker(self): - logger.info("Starting continuous batching worker thread") - - active_requests = {} - - while self.is_running: - while not self.request_queue.empty() and len(active_requests) < self.concurrent_requests: - try: - request_id, prompt = self.request_queue.get_nowait() - active_requests[request_id] = prompt - except Exception: - break - - if active_requests: - try: - request_ids = list(active_requests.keys()) - prompts = list(active_requests.values()) - - logger.info(f"Processing batch of {len(prompts)} prompts") - start_time = time.time() - - outputs = self.llm.generate(prompts, self.sampling_params) - - elapsed = time.time() - start_time - logger.info(f"Batch processed in {elapsed:.3f}s ({len(prompts)/elapsed:.1f} prompts/sec)") - - with self.result_lock: - for request_id, output in zip(request_ids, outputs): - self.result_map[request_id] = output.outputs[0].text - - active_requests.clear() - - except Exception as e: - logger.error(f"Error in continuous batching worker: {e}") - active_requests.clear() - - time.sleep(0.01) - - def get_response(self, inputs: List[str]) -> List[str]: - """Generate responses for a list of prompts using the continuous batching vLLM engine. - - This method queues the input prompts for processing by the background worker thread - and waits for the results to be available. - - Args: - inputs (List[str]): A list of input prompts. - - Returns: - List[str]: A list of generated responses corresponding to the input prompts. - """ - prompts = [ - self.tokenizer.apply_chat_template( - [ - { - "role": "system", - "content": "You are a helpful, harmless, and honest assistant. 
" - "You answer the user's questions accurately and fairly.", - }, - {"role": "user", "content": input_text}, - ], - tokenize=False, - ) - for input_text in inputs - ] - - request_ids = [f"req_{int(time.time() * 1000)}_{i}" for i in range(len(prompts))] - - for request_id, prompt in zip(request_ids, prompts): - self.request_queue.put((request_id, prompt)) - - max_wait_time = 60 - start_time = time.time() - - results = [None] * len(request_ids) - remaining_ids = set(request_ids) - - while remaining_ids and (time.time() - start_time) < max_wait_time: - with self.result_lock: - for i, request_id in enumerate(request_ids): - if request_id in self.result_map and request_id in remaining_ids: - results[i] = self.result_map[request_id] - remaining_ids.remove(request_id) - del self.result_map[request_id] - - if remaining_ids: - time.sleep(0.1) - - if remaining_ids: - logger.warning(f"Timed out waiting for {len(remaining_ids)} requests") - for i, request_id in enumerate(request_ids): - if results[i] is None: - results[i] = "Error: Request timed out" - - return results - - def __del__(self): - """Cleanup method to stop the worker thread and free resources. - - This magic method is called when the object is about to be destroyed. - It ensures proper shutdown of the background worker thread and - releases GPU resources. - """ - self.is_running = False - - if hasattr(self, "executor"): - self.executor.shutdown(wait=False) - - if hasattr(self, "llm"): - del self.llm - - if torch.cuda.is_available(): - torch.cuda.empty_cache() diff --git a/promptolution/llms/vllm.py b/promptolution/llms/vllm.py index 1872b95..3de8c4b 100644 --- a/promptolution/llms/vllm.py +++ b/promptolution/llms/vllm.py @@ -121,11 +121,7 @@ def get_response(self, inputs: list[str]): for input in inputs ] - prompts_2 = prompts.copy() - - prompts_all = prompts + prompts_2 - - outputs = self.llm.generate(prompts_all, self.sampling_params) + outputs = self.llm.generate(prompts, self.sampling_params) responses = [output.outputs[0].text for output in outputs] return responses diff --git a/scripts/llm_test_run.py b/scripts/llm_test_run.py index b82f828..b54c908 100644 --- a/scripts/llm_test_run.py +++ b/scripts/llm_test_run.py @@ -22,18 +22,11 @@ def main(): parser.add_argument("--datasets", type=list, default=["agnews", "subj"]) parser.add_argument("--token", type=str, default=None) parser.add_argument("--model-storage-path", type=str, default=None) - parser.add_argument("--concurrent-requests", type=int, default=8) args = parser.parse_args() start_time = time.time() - if "cbvllm" in args.model: - llm = get_llm( - args.model, - model_storage_path=args.model_storage_path, - concurrent_requests=args.concurrent_requests, - ) - elif "vllm" in args.model: + if "vllm" in args.model: llm = get_llm( args.model, model_storage_path=args.model_storage_path, From 42ab6c969cfc46097ee9cf4d98fdfd0f4e797b08 Mon Sep 17 00:00:00 2001 From: mo374z Date: Wed, 5 Mar 2025 20:58:30 +0100 Subject: [PATCH 15/19] add batching to vllm --- .flake8 | 2 +- promptolution/llms/vllm.py | 14 +++++++++----- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/.flake8 b/.flake8 index a2d1129..1276a9a 100644 --- a/.flake8 +++ b/.flake8 @@ -1,3 +1,3 @@ [flake8] max-line-length = 120 -ignore = F401, W503 +ignore = E731,E231,E203,E501,F401,W503 diff --git a/promptolution/llms/vllm.py b/promptolution/llms/vllm.py index 3de8c4b..0acd01e 100644 --- a/promptolution/llms/vllm.py +++ b/promptolution/llms/vllm.py @@ -37,7 +37,7 @@ class VLLM(BaseLLM): def __init__( self, 
model_id: str, - batch_size: int = 8, + batch_size: int = 64, max_generated_tokens: int = 256, temperature: float = 0.1, top_p: float = 0.9, @@ -92,7 +92,6 @@ def __init__( # Initialize tokenizer separately for potential pre-processing self.tokenizer = AutoTokenizer.from_pretrained(model_id) - self.batch_size = batch_size def get_response(self, inputs: list[str]): """Generate responses for a list of prompts using the vLLM engine. @@ -121,10 +120,15 @@ def get_response(self, inputs: list[str]): for input in inputs ] - outputs = self.llm.generate(prompts, self.sampling_params) - responses = [output.outputs[0].text for output in outputs] + # generate responses for self.batch_size prompts at the same time + all_responses = [] + for i in range(0, len(prompts), self.batch_size): + batch = prompts[i : i + self.batch_size] + outputs = self.llm.generate(batch, self.sampling_params) + responses = [output.outputs[0].text for output in outputs] + all_responses.extend(responses) - return responses + return all_responses def __del__(self): """Cleanup method to delete the LLM instance and free up GPU memory.""" From 0be3d064c596043c6a356ba170ae0ec24c1dd2ec Mon Sep 17 00:00:00 2001 From: mo374z Date: Wed, 5 Mar 2025 21:00:00 +0100 Subject: [PATCH 16/19] add batching in script --- scripts/llm_test_run.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scripts/llm_test_run.py b/scripts/llm_test_run.py index b54c908..aea7ccd 100644 --- a/scripts/llm_test_run.py +++ b/scripts/llm_test_run.py @@ -21,6 +21,7 @@ def main(): parser.add_argument("--output", type=str) parser.add_argument("--datasets", type=list, default=["agnews", "subj"]) parser.add_argument("--token", type=str, default=None) + parser.add_argument("--batch-size", type=int, default=64) parser.add_argument("--model-storage-path", type=str, default=None) args = parser.parse_args() @@ -29,6 +30,7 @@ def main(): if "vllm" in args.model: llm = get_llm( args.model, + batch_size=args.batch_size, model_storage_path=args.model_storage_path, ) else: From c5ac1015d461411f083f3ec03f1157d1a86f5b8e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timo=20Hei=C3=9F?= Date: Wed, 5 Mar 2025 22:01:48 +0100 Subject: [PATCH 17/19] Add release notes and increase version number --- docs/release-notes.md | 15 ++++++++++++++- pyproject.toml | 2 +- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/docs/release-notes.md b/docs/release-notes.md index 10c16a5..20b97b7 100644 --- a/docs/release-notes.md +++ b/docs/release-notes.md @@ -1,10 +1,19 @@ # Release Notes +## Release v1.2.0 +### What's changed +#### Added features +* New LLM wrapper: VLLM for local inference with batches + +**Full Changelog**: [here](https://github.com/finitearth/promptolution/compare/v1.1.1...v1.2.0) + ## Release v1.1.1 ### What's Changed #### Further Changes: - deleted poetry.lock -- updated transformers dependency: bumped from 4.46.3 to 4.48.0 +- updated transformers dependency: bumped from 4.46.3 to 4.48.0 + +**Full Changelog**: [here](https://github.com/finitearth/promptolution/compare/v1.1.0...v1.1.1) ## Release v1.1.0 ### What's changed @@ -16,6 +25,8 @@ * improved opros meta-prompt * added support for python versions from 3.9 onwards (previously 3.11) +**Full Changelog**: [here](https://github.com/finitearth/promptolution/compare/v1.0.1...v1.1.0) + ## Release v1.0.1 ### What's changed #### Added features @@ -24,6 +35,8 @@ #### Further Changes: * fixed release notes +**Full Changelog**: [here](https://github.com/finitearth/promptolution/compare/v1.0.0...v1.0.1) + ## Release v1.0.0 ### 
What's changed #### Added Features: diff --git a/pyproject.toml b/pyproject.toml index b96bc55..e4f5be3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "promptolution" -version = "1.1.1" +version = "1.2.0" description = "" authors = ["Tom Zehle, Moritz Schlager, Timo Heiß"] readme = "README.md" From 0eb701b20a1040108bd417a55ef0a613f578af82 Mon Sep 17 00:00:00 2001 From: mo374z Date: Wed, 5 Mar 2025 22:06:10 +0100 Subject: [PATCH 18/19] remove llm_test_run.py script --- scripts/llm_test_run.py | 91 ----------------------------------------- 1 file changed, 91 deletions(-) delete mode 100644 scripts/llm_test_run.py diff --git a/scripts/llm_test_run.py b/scripts/llm_test_run.py deleted file mode 100644 index aea7ccd..0000000 --- a/scripts/llm_test_run.py +++ /dev/null @@ -1,91 +0,0 @@ -"""Test script for measuring raw LLM inference performance on a dataset.""" -import time -from tqdm import tqdm -from logging import Logger -import argparse -import pandas as pd -import numpy as np - -from promptolution.tasks import get_task -from promptolution.config import Config -from promptolution.predictors import Classificator -from promptolution.llms import get_llm - -logger = Logger(__name__) - - -def main(): - """Run inference test on a dataset using a specified LLM.""" - parser = argparse.ArgumentParser() - parser.add_argument("--model", type=str) - parser.add_argument("--output", type=str) - parser.add_argument("--datasets", type=list, default=["agnews", "subj"]) - parser.add_argument("--token", type=str, default=None) - parser.add_argument("--batch-size", type=int, default=64) - parser.add_argument("--model-storage-path", type=str, default=None) - args = parser.parse_args() - - start_time = time.time() - - if "vllm" in args.model: - llm = get_llm( - args.model, - batch_size=args.batch_size, - model_storage_path=args.model_storage_path, - ) - else: - llm = get_llm(args.model, args.token) - - results = pd.DataFrame() - - for dataset in args.datasets: - config = Config( - evaluation_llm=args.model, - ds_path=f"data_sets/cls/{dataset}/", - task_name=dataset, - api_token=args.token, - n_eval_samples=200, - ) - - task = get_task(config, split="dev") - predictor = Classificator(llm, classes=task.classes) - - prompt = task.initial_population - - xs = task.xs[:config.n_eval_samples] - ys = task.ys[:config.n_eval_samples] - - for prompt in tqdm(task.initial_population): - preds, seqs = predictor.predict(prompt, xs, return_seq=True) - - scores = [] - for i in range(len(xs)): - scores.append(1 if preds[0][i] == ys[i] else 0) - - # clean up the sequences - seqs = [seq.replace("\n", "").strip() for seq in seqs] - - # if single prompts should be stored - # df = pd.DataFrame(dict(prompt=prompt, seq=seqs, score=scores)) - # df.to_csv(args.output + "_detailed", index=False) - - accuracy = np.array(scores).mean() - - results = pd.DataFrame( - dict( - model=args.model, - dataset=dataset, - prompt=prompt, - accuracy=accuracy, - n_samples=len(xs), - ), - index=[0], - ) - results.to_csv(args.output, mode="a", header=False, index=False) - - total_inference_time = time.time() - start_time - print(f"Total inference took {total_inference_time:.2f} seconds") - - -if __name__ == "__main__": - main() From fae011336f57b724310ea55fef8a3c759d925d6a Mon Sep 17 00:00:00 2001 From: mo374z Date: Wed, 5 Mar 2025 23:29:37 +0100 Subject: [PATCH 19/19] change system prompt --- promptolution/llms/vllm.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/promptolution/llms/vllm.py 
b/promptolution/llms/vllm.py index 0acd01e..d99c542 100644 --- a/promptolution/llms/vllm.py +++ b/promptolution/llms/vllm.py @@ -110,8 +110,7 @@ def get_response(self, inputs: list[str]): [ { "role": "system", - "content": "You are a helpful, harmless, and honest assistant. " - "You answer the user's questions accurately and fairly.", + "content": "You are a helpful assistant.", }, {"role": "user", "content": input}, ],
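For reference, a direct-construction sketch of the wrapper in its final form, using the defaults from VLLM.__init__; the model id and cache path are again placeholders rather than values used anywhere in this series.

# Sketch only: model id and storage path are placeholders.
from promptolution.llms.vllm import VLLM

llm = VLLM(
    "meta-llama/Meta-Llama-3-8B-Instruct",
    batch_size=64,                 # prompts sent to llm.generate() per chunk (PATCH 15)
    max_generated_tokens=256,
    temperature=0.1,
    top_p=0.9,
    dtype="auto",                  # relaxed from "float16" in PATCH 08
    tensor_parallel_size=1,
    gpu_memory_utilization=0.95,
    max_model_len=2048,
    model_storage_path="/path/to/model/cache",
)

# Inputs are wrapped with the "You are a helpful assistant." system prompt (PATCH 19)
# before batched generation.
responses = llm.get_response(["Summarize: vLLM enables fast batched inference."])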