From f9f1d40bd71508ea8d2ebe6c894a877c1712e82e Mon Sep 17 00:00:00 2001 From: mo374z Date: Thu, 6 Mar 2025 22:41:31 +0100 Subject: [PATCH 01/41] add token count, flexible batch size and kwargs to vllm class --- promptolution/llms/vllm.py | 77 ++++++++++++++++++++++++++++++-------- 1 file changed, 62 insertions(+), 15 deletions(-) diff --git a/promptolution/llms/vllm.py b/promptolution/llms/vllm.py index d99c542..ddd60fa 100644 --- a/promptolution/llms/vllm.py +++ b/promptolution/llms/vllm.py @@ -32,22 +32,25 @@ class VLLM(BaseLLM): Methods: get_response: Generate responses for a list of prompts. + get_token_count: Get the current count of input and output tokens. + reset_token_count: Reset the token counters to zero. """ def __init__( self, model_id: str, - batch_size: int = 64, + batch_size: int | None = None, max_generated_tokens: int = 256, temperature: float = 0.1, top_p: float = 0.9, - model_storage_path: str = None, - token: str = None, + model_storage_path: str | None = None, + token: str | None = None, dtype: str = "auto", tensor_parallel_size: int = 1, gpu_memory_utilization: float = 0.95, max_model_len: int = 2048, trust_remote_code: bool = False, + **kwargs, ): """Initialize the VLLM with a specific model. @@ -64,31 +67,45 @@ def __init__( gpu_memory_utilization (float, optional): Fraction of GPU memory to use. Defaults to 0.95. max_model_len (int, optional): Maximum sequence length for the model. Defaults to 2048. trust_remote_code (bool, optional): Whether to trust remote code. Defaults to False. + **kwargs: Additional keyword arguments to pass to the LLM class initialization. Note: This method sets up a vLLM engine with specified parameters for efficient inference. """ - self.batch_size = batch_size self.dtype = dtype self.tensor_parallel_size = tensor_parallel_size self.gpu_memory_utilization = gpu_memory_utilization self.max_model_len = max_model_len self.trust_remote_code = trust_remote_code + # Initialize token counters + self.input_token_count = 0 + self.output_token_count = 0 + # Configure sampling parameters self.sampling_params = SamplingParams(temperature=temperature, top_p=top_p, max_tokens=max_generated_tokens) - # Initialize the vLLM engine - self.llm = LLM( - model=model_id, - tokenizer=model_id, - dtype=self.dtype, - tensor_parallel_size=self.tensor_parallel_size, - gpu_memory_utilization=self.gpu_memory_utilization, - max_model_len=self.max_model_len, - download_dir=model_storage_path, - trust_remote_code=self.trust_remote_code, - ) + # Initialize the vLLM engine with both explicit parameters and any additional kwargs + llm_params = { + "model": model_id, + "tokenizer": model_id, + "dtype": self.dtype, + "tensor_parallel_size": self.tensor_parallel_size, + "gpu_memory_utilization": self.gpu_memory_utilization, + "max_model_len": self.max_model_len, + "download_dir": model_storage_path, + "trust_remote_code": self.trust_remote_code, + **kwargs, + } + + self.llm = LLM(**llm_params) + + if batch_size is None: + gpu_blocks = self.llm.llm_engine.model_executor.cache_config.num_gpu_blocks + block_size = self.llm.llm_engine.model_executor.cache_config.block_size + self.batch_size = (gpu_blocks * block_size / self.max_model_len) * 0.95 + else: + self.batch_size = batch_size # Initialize tokenizer separately for potential pre-processing self.tokenizer = AutoTokenizer.from_pretrained(model_id) @@ -104,6 +121,7 @@ def get_response(self, inputs: list[str]): Note: This method uses vLLM's batched generation capabilities for efficient inference. 
+ It also counts input and output tokens. """ prompts = [ self.tokenizer.apply_chat_template( @@ -119,16 +137,45 @@ def get_response(self, inputs: list[str]): for input in inputs ] + # Count input tokens + for prompt in prompts: + input_tokens = self.tokenizer.encode(prompt) + self.input_token_count += len(input_tokens) + # generate responses for self.batch_size prompts at the same time all_responses = [] for i in range(0, len(prompts), self.batch_size): batch = prompts[i : i + self.batch_size] outputs = self.llm.generate(batch, self.sampling_params) responses = [output.outputs[0].text for output in outputs] + + # Count output tokens + for response in responses: + output_tokens = self.tokenizer.encode(response) + self.output_token_count += len(output_tokens) + all_responses.extend(responses) return all_responses + def get_token_count(self): + """Get the current count of input and output tokens. + + Returns: + dict: A dictionary containing the input and output token counts. + """ + return { + "input_tokens": self.input_token_count, + "output_tokens": self.output_token_count, + "total_tokens": self.input_token_count + self.output_token_count, + } + + def reset_token_count(self): + """Reset the token counters to zero.""" + self.input_token_count = 0 + self.output_token_count = 0 + logger.info("Token counters have been reset.") + def __del__(self): """Cleanup method to delete the LLM instance and free up GPU memory.""" del self.llm From b20495fdc7cfc05999300ee5ec29cca28a1cfde2 Mon Sep 17 00:00:00 2001 From: mo374z Date: Thu, 6 Mar 2025 22:55:52 +0100 Subject: [PATCH 02/41] add testing script for implementation --- scripts/llm_test_run.py | 94 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 94 insertions(+) create mode 100644 scripts/llm_test_run.py diff --git a/scripts/llm_test_run.py b/scripts/llm_test_run.py new file mode 100644 index 0000000..65930c4 --- /dev/null +++ b/scripts/llm_test_run.py @@ -0,0 +1,94 @@ +"""Test script for measuring raw LLM inference performance on a dataset.""" +import argparse +import time +from logging import Logger + +import numpy as np +import pandas as pd +from promptolution.config import Config +from promptolution.llms import get_llm +from promptolution.predictors import Classificator +from promptolution.tasks import get_task +from tqdm import tqdm + +logger = Logger(__name__) + + +def main(): + """Run inference test on a dataset using a specified LLM.""" + parser = argparse.ArgumentParser() + parser.add_argument("--model") + parser.add_argument("--output") + parser.add_argument("--datasets", default=["agnews", "subj"]) + parser.add_argument("--token", default=None) + parser.add_argument("--batch-size", default=None) + parser.add_argument("--model-storage-path", default=None) + args = parser.parse_args() + + start_time = time.time() + + if "vllm" in args.model: + llm = get_llm( + args.model, + batch_size=args.batch_size, + model_storage_path=args.model_storage_path, + revision="main", + ) + else: + llm = get_llm(args.model, args.token) + + results = pd.DataFrame() + + for dataset in args.datasets: + config = Config( + evaluation_llm=args.model, + ds_path=f"data_sets/cls/{dataset}/", + task_name=dataset, + api_token=args.token, + n_eval_samples=200, + ) + + task = get_task(config, split="dev") + predictor = Classificator(llm, classes=task.classes) + + prompt = task.initial_population + + xs = task.xs[: config.n_eval_samples] + ys = task.ys[: config.n_eval_samples] + + for prompt in tqdm(task.initial_population): + preds, seqs = 
predictor.predict(prompt, xs, return_seq=True) + + scores = [] + for i in range(len(xs)): + scores.append(1 if preds[0][i] == ys[i] else 0) + + # clean up the sequences + seqs = [seq.replace("\n", "").strip() for seq in seqs] + + # if single prompts should be stored + # df = pd.DataFrame(dict(prompt=prompt, seq=seqs, score=scores)) + # df.to_csv(args.output + "_detailed", index=False) + + accuracy = np.array(scores).mean() + + results = pd.DataFrame( + dict( + model=args.model, + dataset=dataset, + prompt=prompt, + accuracy=accuracy, + n_samples=len(xs), + ), + index=[0], + ) + results.to_csv(args.output, mode="a", header=False, index=False) + print(llm.get_token_count()) + llm.reset_token_count() + + total_inference_time = time.time() - start_time + print(f"Total inference took {total_inference_time:.2f} seconds") + + +if __name__ == "__main__": + main() From e27fa6ce0f084d6cee25a2d76f5823ffc248cbaa Mon Sep 17 00:00:00 2001 From: mo374z Date: Thu, 6 Mar 2025 23:06:42 +0100 Subject: [PATCH 03/41] fix batch size calculation --- promptolution/llms/vllm.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/promptolution/llms/vllm.py b/promptolution/llms/vllm.py index ddd60fa..30d78da 100644 --- a/promptolution/llms/vllm.py +++ b/promptolution/llms/vllm.py @@ -103,7 +103,8 @@ def __init__( if batch_size is None: gpu_blocks = self.llm.llm_engine.model_executor.cache_config.num_gpu_blocks block_size = self.llm.llm_engine.model_executor.cache_config.block_size - self.batch_size = (gpu_blocks * block_size / self.max_model_len) * 0.95 + self.batch_size = int((gpu_blocks * block_size / self.max_model_len) * 0.95) + logger.info(f"Batch size set to {self.batch_size} based on GPU memory.") else: self.batch_size = batch_size From 01eeb6d30874a8e43c7486859e5abbde17bd5f8b Mon Sep 17 00:00:00 2001 From: mo374z Date: Thu, 6 Mar 2025 23:21:16 +0100 Subject: [PATCH 04/41] small changes --- scripts/llm_test_run.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/scripts/llm_test_run.py b/scripts/llm_test_run.py index 65930c4..40b9938 100644 --- a/scripts/llm_test_run.py +++ b/scripts/llm_test_run.py @@ -83,11 +83,10 @@ def main(): index=[0], ) results.to_csv(args.output, mode="a", header=False, index=False) - print(llm.get_token_count()) - llm.reset_token_count() total_inference_time = time.time() - start_time - print(f"Total inference took {total_inference_time:.2f} seconds") + print(f"Total inference took {total_inference_time:.2f} seconds and required {llm.get_token_count()} tokens.") + print(f"Results saved to {args.output}") if __name__ == "__main__": From 045ffb8722be7c5b6e78180bf8e02415bc9e9c35 Mon Sep 17 00:00:00 2001 From: mo374z Date: Thu, 6 Mar 2025 23:27:02 +0100 Subject: [PATCH 05/41] add revision test --- scripts/llm_test_run.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/llm_test_run.py b/scripts/llm_test_run.py index 40b9938..4de7131 100644 --- a/scripts/llm_test_run.py +++ b/scripts/llm_test_run.py @@ -22,6 +22,7 @@ def main(): parser.add_argument("--datasets", default=["agnews", "subj"]) parser.add_argument("--token", default=None) parser.add_argument("--batch-size", default=None) + parser.add_argument("--revision", default="main") parser.add_argument("--model-storage-path", default=None) args = parser.parse_args() @@ -32,7 +33,7 @@ def main(): args.model, batch_size=args.batch_size, model_storage_path=args.model_storage_path, - revision="main", + revision=args.revision, ) else: llm = get_llm(args.model, args.token) From 
ad54496f0f4d64b96a3399764ad494378ab8b986 Mon Sep 17 00:00:00 2001 From: mo374z Date: Thu, 6 Mar 2025 23:28:15 +0100 Subject: [PATCH 06/41] add argument to parser --- scripts/llm_test_run.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scripts/llm_test_run.py b/scripts/llm_test_run.py index 4de7131..69d2328 100644 --- a/scripts/llm_test_run.py +++ b/scripts/llm_test_run.py @@ -23,6 +23,7 @@ def main(): parser.add_argument("--token", default=None) parser.add_argument("--batch-size", default=None) parser.add_argument("--revision", default="main") + parser.add_argument("--max-model-len", default=None) parser.add_argument("--model-storage-path", default=None) args = parser.parse_args() @@ -32,6 +33,7 @@ def main(): llm = get_llm( args.model, batch_size=args.batch_size, + max_model_len=args.max_model_len, model_storage_path=args.model_storage_path, revision=args.revision, ) From fc8d7790b2c21151e231509ece92338fc1783ea8 Mon Sep 17 00:00:00 2001 From: mo374z Date: Thu, 6 Mar 2025 23:41:34 +0100 Subject: [PATCH 07/41] max model len to int --- scripts/llm_test_run.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/scripts/llm_test_run.py b/scripts/llm_test_run.py index 69d2328..35519eb 100644 --- a/scripts/llm_test_run.py +++ b/scripts/llm_test_run.py @@ -29,11 +29,14 @@ def main(): start_time = time.time() + if args.max_model_len is not None: + max_model_len = int(args.max_model_len) + if "vllm" in args.model: llm = get_llm( args.model, batch_size=args.batch_size, - max_model_len=args.max_model_len, + max_model_len=max_model_len, model_storage_path=args.model_storage_path, revision=args.revision, ) From 469117c4b9b4b08c703c79fb95a6697e5fb42dbf Mon Sep 17 00:00:00 2001 From: mo374z Date: Thu, 6 Mar 2025 23:49:37 +0100 Subject: [PATCH 08/41] remove script --- scripts/llm_test_run.py | 99 ----------------------------------------- 1 file changed, 99 deletions(-) delete mode 100644 scripts/llm_test_run.py diff --git a/scripts/llm_test_run.py b/scripts/llm_test_run.py deleted file mode 100644 index 35519eb..0000000 --- a/scripts/llm_test_run.py +++ /dev/null @@ -1,99 +0,0 @@ -"""Test script for measuring raw LLM inference performance on a dataset.""" -import argparse -import time -from logging import Logger - -import numpy as np -import pandas as pd -from promptolution.config import Config -from promptolution.llms import get_llm -from promptolution.predictors import Classificator -from promptolution.tasks import get_task -from tqdm import tqdm - -logger = Logger(__name__) - - -def main(): - """Run inference test on a dataset using a specified LLM.""" - parser = argparse.ArgumentParser() - parser.add_argument("--model") - parser.add_argument("--output") - parser.add_argument("--datasets", default=["agnews", "subj"]) - parser.add_argument("--token", default=None) - parser.add_argument("--batch-size", default=None) - parser.add_argument("--revision", default="main") - parser.add_argument("--max-model-len", default=None) - parser.add_argument("--model-storage-path", default=None) - args = parser.parse_args() - - start_time = time.time() - - if args.max_model_len is not None: - max_model_len = int(args.max_model_len) - - if "vllm" in args.model: - llm = get_llm( - args.model, - batch_size=args.batch_size, - max_model_len=max_model_len, - model_storage_path=args.model_storage_path, - revision=args.revision, - ) - else: - llm = get_llm(args.model, args.token) - - results = pd.DataFrame() - - for dataset in args.datasets: - config = Config( - evaluation_llm=args.model, - 
ds_path=f"data_sets/cls/{dataset}/", - task_name=dataset, - api_token=args.token, - n_eval_samples=200, - ) - - task = get_task(config, split="dev") - predictor = Classificator(llm, classes=task.classes) - - prompt = task.initial_population - - xs = task.xs[: config.n_eval_samples] - ys = task.ys[: config.n_eval_samples] - - for prompt in tqdm(task.initial_population): - preds, seqs = predictor.predict(prompt, xs, return_seq=True) - - scores = [] - for i in range(len(xs)): - scores.append(1 if preds[0][i] == ys[i] else 0) - - # clean up the sequences - seqs = [seq.replace("\n", "").strip() for seq in seqs] - - # if single prompts should be stored - # df = pd.DataFrame(dict(prompt=prompt, seq=seqs, score=scores)) - # df.to_csv(args.output + "_detailed", index=False) - - accuracy = np.array(scores).mean() - - results = pd.DataFrame( - dict( - model=args.model, - dataset=dataset, - prompt=prompt, - accuracy=accuracy, - n_samples=len(xs), - ), - index=[0], - ) - results.to_csv(args.output, mode="a", header=False, index=False) - - total_inference_time = time.time() - start_time - print(f"Total inference took {total_inference_time:.2f} seconds and required {llm.get_token_count()} tokens.") - print(f"Results saved to {args.output}") - - -if __name__ == "__main__": - main() From 6b543fa9ad73d24106e9f1383f13d77f3d7349af Mon Sep 17 00:00:00 2001 From: mo374z Date: Fri, 7 Mar 2025 00:21:55 +0100 Subject: [PATCH 09/41] Change version and Release notes --- docs/release-notes.md | 5 +++++ pyproject.toml | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/docs/release-notes.md b/docs/release-notes.md index 20b97b7..7be8e79 100644 --- a/docs/release-notes.md +++ b/docs/release-notes.md @@ -1,5 +1,10 @@ # Release Notes +## Release v1.2.1 +### What's changed +#### Added features +* New features for the VLLM Wrapper (automatic batch size determination, accepting kwargs and token count) + ## Release v1.2.0 ### What's changed #### Added features diff --git a/pyproject.toml b/pyproject.toml index e4f5be3..06cbcfc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "promptolution" -version = "1.2.0" +version = "1.2.1" description = "" authors = ["Tom Zehle, Moritz Schlager, Timo Heiß"] readme = "README.md" From 619ce65e374dbfa4849919e33f94e7fd7f4ec26d Mon Sep 17 00:00:00 2001 From: finitearth Date: Fri, 7 Mar 2025 11:41:04 +0100 Subject: [PATCH 10/41] changed callback behaviour and impelemented token count callback --- promptolution/callbacks.py | 47 ++++++++++++++++-- promptolution/llms/api_llm.py | 2 +- promptolution/llms/base_llm.py | 55 +++++++++++++++++++++- promptolution/llms/local_llm.py | 2 +- promptolution/llms/vllm.py | 29 +----------- promptolution/optimizers/base_optimizer.py | 13 +++-- promptolution/optimizers/evoprompt_de.py | 6 ++- promptolution/optimizers/evoprompt_ga.py | 5 +- promptolution/optimizers/opro.py | 4 +- promptolution/utils/prompt_creation.py | 10 ++-- 10 files changed, 126 insertions(+), 47 deletions(-) diff --git a/promptolution/callbacks.py b/promptolution/callbacks.py index fe655d6..b4f75af 100644 --- a/promptolution/callbacks.py +++ b/promptolution/callbacks.py @@ -14,24 +14,33 @@ def on_step_end(self, optimizer): Args: optimizer: The optimizer object that called the callback. + + Returns: + Bool: True if the optimization should continue, False if it should stop. """ - pass + return True def on_epoch_end(self, optimizer): """Called at the end of each optimization epoch. 
Args: optimizer: The optimizer object that called the callback. + + Returns: + Bool: True if the optimization should continue, False if it should stop. """ - pass + return True def on_train_end(self, optimizer): """Called at the end of the entire optimization process. Args: optimizer: The optimizer object that called the callback. + + Returns: + Bool: True if the optimization should continue, False if it should stop. """ - pass + return True class LoggerCallback(Callback): @@ -57,6 +66,8 @@ def on_step_end(self, optimizer): self.logger.critical(f"*** Prompt {i}: Score: {score}") self.logger.critical(f"{prompt}") + return True + def on_train_end(self, optimizer, logs=None): """Log information at the end of training. @@ -66,6 +77,8 @@ def on_train_end(self, optimizer, logs=None): """ self.logger.critical(f"Training ended - {logs}") + return True + class CSVCallback(Callback): """Callback for saving optimization progress to a CSV file. @@ -105,13 +118,15 @@ def on_step_end(self, optimizer): ) df.to_csv(self.path, mode="a", header=False, index=False) + return True + def on_train_end(self, optimizer): """Called at the end of training. Args: optimizer: The optimizer object that called the callback. """ - pass + return True class BestPromptCallback(Callback): @@ -139,6 +154,8 @@ def on_step_end(self, optimizer): self.best_score = optimizer.scores[0] self.best_prompt = optimizer.prompts[0] + return True + def get_best_prompt(self): """Get the best prompt and score achieved during optimization. @@ -173,6 +190,8 @@ def on_step_end(self, optimizer): """ self.pbar.update(1) + return True + def on_train_end(self, optimizer): """Close the progress bar at the end of training. @@ -180,3 +199,23 @@ def on_train_end(self, optimizer): optimizer: The optimizer object that called the callback. """ self.pbar.close() + + return True + + +class TokenCountCallback(Callback): + """Callback for stopping optimization based on the total token count.""" + + def __init__(self, max_tokens_for_termination): + """Initialize the TokenCountCallback.""" + self.max_tokens_for_termination = max_tokens_for_termination + + def on_step_end(self, optimizer): + """Check if the total token count exceeds the maximum allowed. If so, stop the optimization.""" + token_counts = optimizer.predictor.llm.get_token_count() + total_token_count = token_counts["total_tokens"] + + if total_token_count > self.max_tokens_for_termination: + return False + + return True diff --git a/promptolution/llms/api_llm.py b/promptolution/llms/api_llm.py index cf966bf..db920de 100644 --- a/promptolution/llms/api_llm.py +++ b/promptolution/llms/api_llm.py @@ -80,7 +80,7 @@ def __init__(self, model_id: str, token: str = None): else: self.model = ChatDeepInfra(model_name=model_id, deepinfra_api_token=token) - def get_response(self, prompts: List[str]) -> List[str]: + def _get_response(self, prompts: List[str]) -> List[str]: """Get responses for a list of prompts in a synchronous manner. This method includes retry logic for handling connection errors and rate limits. diff --git a/promptolution/llms/base_llm.py b/promptolution/llms/base_llm.py index 7f0e95d..c222b6d 100644 --- a/promptolution/llms/base_llm.py +++ b/promptolution/llms/base_llm.py @@ -18,10 +18,61 @@ class BaseLLM(ABC): def __init__(self, *args, **kwargs): """Initialize the LLM.""" - pass + self.input_token_count = 0 + self.output_token_count = 0 + + def get_token_count(self): + """Get the current count of input and output tokens. 
+ + Returns: + dict: A dictionary containing the input and output token counts. + """ + return { + "input_tokens": self.input_token_count, + "output_tokens": self.output_token_count, + "total_tokens": self.input_token_count + self.output_token_count, + } + + def reset_token_count(self): + """Reset the token counters to zero.""" + self.input_token_count = 0 + self.output_token_count = 0 + + def update_token_count(self, inputs: List[str], outputs: List[str]): + """Update the token count based on the given inputs and outputs. + + Args: + inputs (List[str]): A list of input prompts. + outputs (List[str]): A list of generated responses. + """ + input_tokens = sum([len(i.split()) for i in inputs]) + output_tokens = sum([len(o.split()) for o in outputs]) + self.input_token_count += input_tokens + self.output_token_count += output_tokens + + def get_response(self, prompts: str) -> str: + """Generate responses for the given prompts. + + This method calls the _get_response method to generate responses + for the given prompts. It also updates the token count for the + input and output tokens. + + Args: + prompts (str or List[str]): Input prompt(s). If a single string is provided, + it's converted to a list containing that string. + + Returns: + List[str]: A list of generated responses, one for each input prompt. + """ + if isinstance(prompts, str): + prompts = [prompts] + responses = self._get_response(prompts) + self.update_token_count(prompts, responses) + + return responses @abstractmethod - def get_response(self, prompts: List[str]) -> List[str]: + def _get_response(self, prompts: List[str]) -> List[str]: """Generate responses for the given prompts. This method should be implemented by subclasses to define how diff --git a/promptolution/llms/local_llm.py b/promptolution/llms/local_llm.py index 074bf01..a58675e 100644 --- a/promptolution/llms/local_llm.py +++ b/promptolution/llms/local_llm.py @@ -48,7 +48,7 @@ def __init__(self, model_id: str, batch_size=8): self.pipeline.tokenizer.pad_token_id = self.pipeline.tokenizer.eos_token_id self.pipeline.tokenizer.padding_side = "left" - def get_response(self, prompts: list[str]): + def _get_response(self, prompts: list[str]): """Generate responses for a list of prompts using the local language model. Args: diff --git a/promptolution/llms/vllm.py b/promptolution/llms/vllm.py index 30d78da..39157a1 100644 --- a/promptolution/llms/vllm.py +++ b/promptolution/llms/vllm.py @@ -78,10 +78,6 @@ def __init__( self.max_model_len = max_model_len self.trust_remote_code = trust_remote_code - # Initialize token counters - self.input_token_count = 0 - self.output_token_count = 0 - # Configure sampling parameters self.sampling_params = SamplingParams(temperature=temperature, top_p=top_p, max_tokens=max_generated_tokens) @@ -111,7 +107,7 @@ def __init__( # Initialize tokenizer separately for potential pre-processing self.tokenizer = AutoTokenizer.from_pretrained(model_id) - def get_response(self, inputs: list[str]): + def _get_response(self, inputs: list[str]): """Generate responses for a list of prompts using the vLLM engine. 
Args: @@ -150,33 +146,10 @@ def get_response(self, inputs: list[str]): outputs = self.llm.generate(batch, self.sampling_params) responses = [output.outputs[0].text for output in outputs] - # Count output tokens - for response in responses: - output_tokens = self.tokenizer.encode(response) - self.output_token_count += len(output_tokens) - all_responses.extend(responses) return all_responses - def get_token_count(self): - """Get the current count of input and output tokens. - - Returns: - dict: A dictionary containing the input and output token counts. - """ - return { - "input_tokens": self.input_token_count, - "output_tokens": self.output_token_count, - "total_tokens": self.input_token_count + self.output_token_count, - } - - def reset_token_count(self): - """Reset the token counters to zero.""" - self.input_token_count = 0 - self.output_token_count = 0 - logger.info("Token counters have been reset.") - def __del__(self): """Cleanup method to delete the LLM instance and free up GPU memory.""" del self.llm diff --git a/promptolution/optimizers/base_optimizer.py b/promptolution/optimizers/base_optimizer.py index 2cac685..95ec7c2 100644 --- a/promptolution/optimizers/base_optimizer.py +++ b/promptolution/optimizers/base_optimizer.py @@ -61,18 +61,24 @@ def optimize(self, n_steps: int) -> List[str]: def _on_step_end(self): """Call all registered callbacks at the end of each optimization step.""" + continue_optimization = True for callback in self.callbacks: - callback.on_step_end(self) + continue_optimization &= callback.on_step_end(self) # if any callback returns False, end the optimization + + return continue_optimization def _on_epoch_end(self): """Call all registered callbacks at the end of each optimization epoch.""" + continue_optimization = True for callback in self.callbacks: - callback.on_epoch_end(self) + continue_optimization &= callback._on_epoch_end(self) # if any callback returns False, end the optimization + + return continue_optimization def _on_train_end(self): """Call all registered callbacks at the end of the entire optimization process.""" for callback in self.callbacks: - callback.on_train_end(self) + callback._on_train_end(self) class DummyOptimizer(BaseOptimizer): @@ -111,4 +117,5 @@ def optimize(self, n_steps) -> list[str]: self._on_step_end() self._on_epoch_end() self._on_train_end() + return self.prompts diff --git a/promptolution/optimizers/evoprompt_de.py b/promptolution/optimizers/evoprompt_de.py index 17d74b3..f44556e 100644 --- a/promptolution/optimizers/evoprompt_de.py +++ b/promptolution/optimizers/evoprompt_de.py @@ -89,7 +89,11 @@ def optimize(self, n_steps: int) -> List[str]: self.prompts[i] = child_prompts[i] self.scores[i] = child_scores[i] - self._on_step_end() + continue_optimization = self._on_step_end() + + if not continue_optimization: + break self._on_train_end() + return self.prompts diff --git a/promptolution/optimizers/evoprompt_ga.py b/promptolution/optimizers/evoprompt_ga.py index 2ec789b..f6efcb8 100644 --- a/promptolution/optimizers/evoprompt_ga.py +++ b/promptolution/optimizers/evoprompt_ga.py @@ -77,7 +77,10 @@ def optimize(self, n_steps: int) -> List[str]: self.prompts = [prompt for _, prompt in sorted(zip(scores, prompts), reverse=True)][: len(self.prompts)] self.scores = sorted(scores, reverse=True)[: len(self.prompts)] - self._on_step_end() + continue_optimization = self._on_step_end() + if not continue_optimization: + break + return self.prompts def _crossover(self, prompts, scores) -> str: diff --git a/promptolution/optimizers/opro.py 
b/promptolution/optimizers/opro.py index 3c71f4e..ef6f6fd 100644 --- a/promptolution/optimizers/opro.py +++ b/promptolution/optimizers/opro.py @@ -89,7 +89,9 @@ def optimize(self, n_steps: int) -> List[str]: self.prompts.append(prompt) self.scores.append(score) - self._on_step_end() + continue_optimization = self._on_step_end() + if not continue_optimization: + break self._on_epoch_end() diff --git a/promptolution/utils/prompt_creation.py b/promptolution/utils/prompt_creation.py index d85edd9..e0c7c0b 100644 --- a/promptolution/utils/prompt_creation.py +++ b/promptolution/utils/prompt_creation.py @@ -56,7 +56,7 @@ def create_prompts_from_samples(task: BaseTask, llm: BaseLLM, meta_prompt: str = """ if isinstance(task, ClassificationTask): # if classification task sample such that all classes are represented - unique_classes, counts = np.unique(task.ys, return_counts=True) + unique_labels, counts = np.unique(task.ys, return_counts=True) proportions = counts / len(task.ys) samples_per_class = np.round(proportions * n_samples).astype(int) samples_per_class = np.maximum(samples_per_class, 1) @@ -64,8 +64,8 @@ def create_prompts_from_samples(task: BaseTask, llm: BaseLLM, meta_prompt: str = # sample xs = [] ys = [] - for cls, n_samples in zip(unique_classes, samples_per_class): - indices = np.where(task.ys == cls)[0] + for label, n_samples in zip(unique_labels, samples_per_class): + indices = np.where(task.ys == label)[0] indices = np.random.choice(indices, n_samples, replace=False) xs.extend(task.xs[indices]) ys.extend(task.ys[indices]) @@ -78,9 +78,9 @@ def create_prompts_from_samples(task: BaseTask, llm: BaseLLM, meta_prompt: str = meta_prompt = PROMPT_CREATION_TEMPLATE if meta_prompt is None else meta_prompt examples = "\n\n".join([f"Input: {x}\nOutput: {y}" for x, y in zip(xs, ys)]) - meta_prompt = meta_prompt.replace("", examples) prompt = llm.get_response([meta_prompt])[0] prompt = prompt.split("")[0].split("")[-1] + prompt = prompt.strip() return prompt From 2588664f2caf0bdbf8046b72841fa28992a51d95 Mon Sep 17 00:00:00 2001 From: finitearth Date: Fri, 7 Mar 2025 12:41:27 +0100 Subject: [PATCH 11/41] added super inits --- promptolution/llms/api_llm.py | 1 + promptolution/llms/local_llm.py | 2 + promptolution/llms/vllm.py | 2 + scripts/llm_test_run.py | 98 +++++++++++++++++++++++++++++++++ 4 files changed, 103 insertions(+) create mode 100644 scripts/llm_test_run.py diff --git a/promptolution/llms/api_llm.py b/promptolution/llms/api_llm.py index db920de..14a70da 100644 --- a/promptolution/llms/api_llm.py +++ b/promptolution/llms/api_llm.py @@ -73,6 +73,7 @@ def __init__(self, model_id: str, token: str = None): Raises: ValueError: If an unknown model identifier is provided. """ + super().__init__() if "claude" in model_id: self.model = ChatAnthropic(model=model_id, api_key=token) elif "gpt" in model_id: diff --git a/promptolution/llms/local_llm.py b/promptolution/llms/local_llm.py index a58675e..577d4a0 100644 --- a/promptolution/llms/local_llm.py +++ b/promptolution/llms/local_llm.py @@ -35,6 +35,8 @@ def __init__(self, model_id: str, batch_size=8): This method sets up a text generation pipeline with bfloat16 precision, automatic device mapping, and specific generation parameters. 
""" + super().__init__() + self.pipeline = transformers.pipeline( "text-generation", model=model_id, diff --git a/promptolution/llms/vllm.py b/promptolution/llms/vllm.py index 39157a1..5380e87 100644 --- a/promptolution/llms/vllm.py +++ b/promptolution/llms/vllm.py @@ -72,6 +72,8 @@ def __init__( Note: This method sets up a vLLM engine with specified parameters for efficient inference. """ + super().__init__() + self.dtype = dtype self.tensor_parallel_size = tensor_parallel_size self.gpu_memory_utilization = gpu_memory_utilization diff --git a/scripts/llm_test_run.py b/scripts/llm_test_run.py new file mode 100644 index 0000000..15b1203 --- /dev/null +++ b/scripts/llm_test_run.py @@ -0,0 +1,98 @@ +"""Test script for measuring raw LLM inference performance on a dataset.""" +import argparse +import time +from logging import Logger + +import numpy as np +import pandas as pd +from promptolution.config import Config +from promptolution.llms import get_llm +from promptolution.predictors import Classificator +from promptolution.tasks import get_task +from tqdm import tqdm + +logger = Logger(__name__) + +# TODO: Align this script with how we import datasets in capo + + +"""Run inference test on a dataset using a specified LLM.""" +parser = argparse.ArgumentParser() +parser.add_argument("--model") +parser.add_argument("--output") +parser.add_argument("--datasets", default=["subj"]) +parser.add_argument("--token", default=None) +parser.add_argument("--batch-size", default=None) +parser.add_argument("--revision", default="main") +parser.add_argument("--max-model-len", default=None) +parser.add_argument("--model-storage-path", default=None) +args = parser.parse_args() + +start_time = time.time() + +if args.max_model_len is not None: + max_model_len = int(args.max_model_len) + +if "vllm" in args.model: + llm = get_llm( + args.model, + batch_size=args.batch_size, + max_model_len=max_model_len, + model_storage_path=args.model_storage_path, + revision=args.revision, + ) +else: + llm = get_llm(args.model, args.token) + +results = pd.DataFrame() + +for dataset in args.datasets: + config = Config( + evaluation_llm=args.model, + ds_path=f"data_sets/cls/{dataset}/", + task_name=dataset, + api_token=args.token, + n_eval_samples=200, + ) + + task = get_task(config, split="dev") + predictor = Classificator(llm, classes=task.classes) + + prompts = [task.initial_population[0]] + + xs = task.xs[: config.n_eval_samples] + ys = task.ys[: config.n_eval_samples] + + for prompt in tqdm(prompts): + preds, seqs = predictor.predict(prompt, xs, return_seq=True) + + scores = [] + for i in range(len(xs)): + scores.append(1 if preds[0][i] == ys[i] else 0) + + # clean up the sequences + seqs = [seq.replace("\n", "").strip() for seq in seqs] + + # if single prompts should be stored + # df = pd.DataFrame(dict(prompt=prompt, seq=seqs, score=scores)) + # df.to_csv(args.output + "_detailed", index=False) + + accuracy = np.array(scores).mean() + + results = pd.DataFrame( + dict( + model=args.model, + dataset=dataset, + prompt=prompt, + accuracy=accuracy, + n_samples=len(xs), + ), + index=[0], + ) + results.to_csv(args.output, mode="a", header=False, index=False) + +total_inference_time = time.time() - start_time +print( + f"Total inference took {total_inference_time:.2f} seconds and required {llm.get_token_count()} tokens." 
+) +print(f"Results saved to {args.output}") From 8c365c72c3753bb37ddd9f5572f828398b859802 Mon Sep 17 00:00:00 2001 From: Tom Zehle Date: Sat, 8 Mar 2025 13:09:55 +0100 Subject: [PATCH 12/41] allow for splits not based on white space (such as new line break etc) --- promptolution/predictors/classificator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/promptolution/predictors/classificator.py b/promptolution/predictors/classificator.py index f33bfc6..c23278a 100644 --- a/promptolution/predictors/classificator.py +++ b/promptolution/predictors/classificator.py @@ -44,7 +44,7 @@ def _extract_preds(self, preds: List[str], shape: Tuple[int, int]) -> np.ndarray response = [] for pred in preds: predicted_class = self.classes[0] # use first class as default pred - for word in pred.split(" "): + for word in pred.split(): word = "".join([c for c in word if c.isalnum()]) if word in self.classes: predicted_class = word From 7e7d2b57aafb158b37dc50b9cf0eb1c4b0878e89 Mon Sep 17 00:00:00 2001 From: finitearth Date: Sat, 8 Mar 2025 17:55:49 +0100 Subject: [PATCH 13/41] include task descriptions --- promptolution/predictors/classificator.py | 57 ++++++++++++++++++++++- promptolution/templates.py | 8 ++++ promptolution/utils/prompt_creation.py | 12 +++-- 3 files changed, 73 insertions(+), 4 deletions(-) diff --git a/promptolution/predictors/classificator.py b/promptolution/predictors/classificator.py index c23278a..89eb5d4 100644 --- a/promptolution/predictors/classificator.py +++ b/promptolution/predictors/classificator.py @@ -7,7 +7,7 @@ from promptolution.predictors.base_predictor import BasePredictor -class Classificator(BasePredictor): +class FirstOccurrenceClassificator(BasePredictor): """A predictor class for classification tasks using language models. This class takes a language model and a list of classes, and provides a method @@ -33,6 +33,10 @@ def __init__(self, llm, classes, *args, **kwargs): """ super().__init__(llm) self.classes = classes + self.extraction_description = ( + f"The task is to classify the texts into one of those classes: {', '.join(classes)}." + "The first occurrence of a valid class label in the prediction is used as the predicted class." + ) def _extract_preds(self, preds: List[str], shape: Tuple[int, int]) -> np.ndarray: """Extract class labels from the predictions, based on the list of valid class labels. @@ -54,3 +58,54 @@ def _extract_preds(self, preds: List[str], shape: Tuple[int, int]) -> np.ndarray response = np.array(response).reshape(*shape) return response + + +class MarkerBasedClassificator(BasePredictor): + """A predictor class for classification tasks using language models. + + This class takes a language model and a list of classes, and provides a method + to predict classes for given prompts and input data. The class labels are extracted. + + Attributes: + llm: The language model used for generating predictions. + classes (List[str]): The list of valid class labels. + marker (str): The marker to use for extracting the class label. + + Inherits from: + BasePredictor: The base class for predictors in the promptolution library. + """ + + def __init__(self, llm, classes, marker="", *args, **kwargs): + """Initialize the Classificator. + + Args: + llm: The language model to use for predictions. + classes (List[str]): The list of valid class labels. + marker (str): The marker to use for extracting the class label. + *args, **kwargs: Additional arguments for the BasePredictor. 
+ """ + super().__init__(llm) + self.classes = classes + self.marker = marker + self.extraction_description = ( + f"The task is to classify the texts into one of those classes: {','.join(classes)}." + f"The class label is extracted from the text following the marker: {marker}." + ) + + def _extract_preds(self, preds: List[str], shape: Tuple[int, int]) -> np.ndarray: + """Extract class labels from the predictions, by extracting the text following the marker. + + Args: + preds: The raw predictions from the language model. + shape: The shape of the output array: (n_prompts, n_samples). + """ + response = [] + for pred in preds: + predicted_class = pred.split(self.marker)[-1].strip() + if predicted_class not in self.classes: + predicted_class = self.classes[0] + + response.append(predicted_class) + + response = np.array(response).reshape(*shape) + return response diff --git a/promptolution/templates.py b/promptolution/templates.py index 05d7ae3..18c0765 100644 --- a/promptolution/templates.py +++ b/promptolution/templates.py @@ -114,3 +114,11 @@ The instruction was""" + +PROMPT_CREATION_TEMPLATE_TD = """You are asked to give the corresponding prompt that gives the following outputs given these inputs for the following task: . +Return it starting with and ending with tags. +Include the name of the output classes in the prompt. + + + +The instruction was""" diff --git a/promptolution/utils/prompt_creation.py b/promptolution/utils/prompt_creation.py index e0c7c0b..07f8c16 100644 --- a/promptolution/utils/prompt_creation.py +++ b/promptolution/utils/prompt_creation.py @@ -7,7 +7,7 @@ from promptolution.llms.base_llm import BaseLLM from promptolution.tasks.base_task import BaseTask from promptolution.tasks.classification_tasks import ClassificationTask -from promptolution.templates import PROMPT_CREATION_TEMPLATE, PROMPT_VARIATION_TEMPLATE +from promptolution.templates import PROMPT_CREATION_TEMPLATE, PROMPT_CREATION_TEMPLATE_TD, PROMPT_VARIATION_TEMPLATE def create_prompt_variation(prompt: Union[List[str], str], llm: BaseLLM, meta_prompt: str = None) -> List[str]: @@ -35,7 +35,9 @@ def create_prompt_variation(prompt: Union[List[str], str], llm: BaseLLM, meta_pr return varied_prompts -def create_prompts_from_samples(task: BaseTask, llm: BaseLLM, meta_prompt: str = None, n_samples: int = 3) -> List[str]: +def create_prompts_from_samples( + task: BaseTask, llm: BaseLLM, meta_prompt: str = None, n_samples: int = 3, task_description: str = None +) -> List[str]: """Generate a set of prompts from dataset examples sampled from a given task. Idea taken from the paper Zhou et al. (2021) https://arxiv.org/pdf/2211.01910 @@ -50,6 +52,7 @@ def create_prompts_from_samples(task: BaseTask, llm: BaseLLM, meta_prompt: str = meta_prompt (str): The meta prompt to use for generating the prompts. If None, a default meta prompt is used. n_samples (int): The number of samples to use for generating prompts. + task_description (str): The description of the task to include in the prompt. Returns: List[str]: A list of generated prompts. 
@@ -76,7 +79,10 @@ def create_prompts_from_samples(task: BaseTask, llm: BaseLLM, meta_prompt: str = xs = task.xs[indices].tolist() ys = task.ys[indices].tolist() - meta_prompt = PROMPT_CREATION_TEMPLATE if meta_prompt is None else meta_prompt + if meta_prompt is None: + meta_prompt = PROMPT_CREATION_TEMPLATE + if task_description is None: + meta_prompt = PROMPT_CREATION_TEMPLATE_TD.replace("", task_description) examples = "\n\n".join([f"Input: {x}\nOutput: {y}" for x, y in zip(xs, ys)]) meta_prompt = meta_prompt.replace("", examples) prompt = llm.get_response([meta_prompt])[0] From edcd28dc3d7b3a4ddab99edadf51ccd7c1aaa272 Mon Sep 17 00:00:00 2001 From: mo374z Date: Sat, 8 Mar 2025 19:33:27 +0100 Subject: [PATCH 14/41] add tokenizer based token count to vllm class --- promptolution/llms/vllm.py | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/promptolution/llms/vllm.py b/promptolution/llms/vllm.py index 5380e87..f558458 100644 --- a/promptolution/llms/vllm.py +++ b/promptolution/llms/vllm.py @@ -2,6 +2,7 @@ from logging import INFO, Logger +from typing import List try: import torch @@ -32,8 +33,7 @@ class VLLM(BaseLLM): Methods: get_response: Generate responses for a list of prompts. - get_token_count: Get the current count of input and output tokens. - reset_token_count: Reset the token counters to zero. + update_token_count: Update the token count based on the given inputs and outputs. """ def __init__( @@ -152,6 +152,21 @@ def _get_response(self, inputs: list[str]): return all_responses + def update_token_count(self, inputs: List[str], outputs: List[str]): + """Update the token count based on the given inputs and outputs. + + Uses the tokenizer to count the tokens. + + Args: + inputs (List[str]): A list of input prompts. + outputs (List[str]): A list of generated responses. 
+ """ + for input in inputs: + self.input_token_count += len(self.tokenizer.encode(input)) + + for output in outputs: + self.output_token_count += len(self.tokenizer.encode(output)) + def __del__(self): """Cleanup method to delete the LLM instance and free up GPU memory.""" del self.llm From f2d73d4c8a65defdc6546a2f24ae9f775a1921ab Mon Sep 17 00:00:00 2001 From: mo374z Date: Sat, 8 Mar 2025 21:55:49 +0100 Subject: [PATCH 15/41] update test run script --- scripts/opro_test_run.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/scripts/opro_test_run.py b/scripts/opro_test_run.py index 474af3e..854f543 100644 --- a/scripts/opro_test_run.py +++ b/scripts/opro_test_run.py @@ -16,13 +16,13 @@ def main(): """Run a test run for the Opro optimizer.""" config = Config( - meta_llm="meta-llama/Meta-Llama-3-8B-Instruct", + meta_llm="vllm-shuyuej/Llama-3.3-70B-Instruct-GPTQ", ds_path="data_sets/agnews", task_name="agnews", n_steps=10, optimizer="opro", - downstream_llm="meta-llama/Meta-Llama-3-8B-Instruct", - evaluation_llm="meta-llama/Meta-Llama-3-8B-Instruct", + downstream_llm="vllm-shuyuej/Llama-3.3-70B-Instruct-GPTQ", + evaluation_llm="vllm-shuyuej/Llama-3.3-70B-Instruct-GPTQ", ) task = get_task(config, split="dev") @@ -37,7 +37,7 @@ def main(): callbacks=[LoggerCallback(logger)], n_samples=5, ) - prompts = optimizer.optimize(n_steps=10) + prompts = optimizer.optimize(n_steps=2) logger.info(f"Optimized prompts: {prompts}") From a725384a3c536bd9e12ebe9756008ef302c20158 Mon Sep 17 00:00:00 2001 From: mo374z Date: Sat, 8 Mar 2025 22:13:43 +0100 Subject: [PATCH 16/41] use classifiers accordingly --- promptolution/predictors/__init__.py | 16 ++++++++++------ scripts/llm_test_run.py | 4 ++-- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/promptolution/predictors/__init__.py b/promptolution/predictors/__init__.py index d850759..65705c0 100644 --- a/promptolution/predictors/__init__.py +++ b/promptolution/predictors/__init__.py @@ -3,25 +3,26 @@ from promptolution.llms import get_llm from .base_predictor import DummyPredictor -from .classificator import Classificator +from .classificator import FirstOccurrenceClassificator, MarkerBasedClassificator -def get_predictor(name, *args, **kwargs): +def get_predictor(name, type: str = "first_occurrence", *args, **kwargs): """Factory function to create and return a predictor instance based on the provided name. This function supports two types of predictors: 1. DummyPredictor: A mock predictor for testing purposes. - 2. Classificator: A real predictor using a language model for classification tasks. + 2. FirstOccurrenceClassificator: A real predictor using a language model for classification tasks. Args: name (str): Identifier for the predictor to use. Special case: - "dummy" for DummyPredictor - - Any other string for Classificator with the specified LLM + - Any other string for FirstOccurrenceClassificator with the specified LLM + type () *args: Variable length argument list passed to the predictor constructor. **kwargs: Arbitrary keyword arguments passed to the predictor constructor. Returns: - An instance of DummyPredictor or Classificator based on the name. + An instance of DummyPredictor or FirstOccurrenceClassificator based on the name. Notes: - For non-dummy predictors, this function calls get_llm to obtain the language model. 
@@ -36,4 +37,7 @@ def get_predictor(name, *args, **kwargs): downstream_llm = get_llm(name) - return Classificator(downstream_llm, *args, **kwargs) + if type == "first_occurrence": + return FirstOccurrenceClassificator(downstream_llm, *args, **kwargs) + elif type == "marker": + return MarkerBasedClassificator(downstream_llm, *args, **kwargs) diff --git a/scripts/llm_test_run.py b/scripts/llm_test_run.py index 15b1203..442475a 100644 --- a/scripts/llm_test_run.py +++ b/scripts/llm_test_run.py @@ -7,7 +7,7 @@ import pandas as pd from promptolution.config import Config from promptolution.llms import get_llm -from promptolution.predictors import Classificator +from promptolution.predictors import FirstOccurrenceClassificator from promptolution.tasks import get_task from tqdm import tqdm @@ -56,7 +56,7 @@ ) task = get_task(config, split="dev") - predictor = Classificator(llm, classes=task.classes) + predictor = FirstOccurrenceClassificator(llm, classes=task.classes) prompts = [task.initial_population[0]] From b0f7931fada3116823f00fa72fa5b324037e57cc Mon Sep 17 00:00:00 2001 From: mo374z Date: Sat, 8 Mar 2025 22:15:21 +0100 Subject: [PATCH 17/41] small fix --- scripts/opro_test_run.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/opro_test_run.py b/scripts/opro_test_run.py index 854f543..dc343df 100644 --- a/scripts/opro_test_run.py +++ b/scripts/opro_test_run.py @@ -17,7 +17,7 @@ def main(): """Run a test run for the Opro optimizer.""" config = Config( meta_llm="vllm-shuyuej/Llama-3.3-70B-Instruct-GPTQ", - ds_path="data_sets/agnews", + ds_path="data_sets/cls/agnews", task_name="agnews", n_steps=10, optimizer="opro", From 30e171282936970e3498f2f7a4ea72f93df443af Mon Sep 17 00:00:00 2001 From: mo374z Date: Sat, 8 Mar 2025 22:17:04 +0100 Subject: [PATCH 18/41] add storage path --- scripts/opro_test_run.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/opro_test_run.py b/scripts/opro_test_run.py index dc343df..db71ba3 100644 --- a/scripts/opro_test_run.py +++ b/scripts/opro_test_run.py @@ -28,7 +28,7 @@ def main(): task = get_task(config, split="dev") predictor = get_predictor(config.evaluation_llm, classes=task.classes) - llm = get_llm(config.meta_llm) + llm = get_llm(config.meta_llm, model_storage_path="../models/") optimizer = Opro( llm, initial_prompts=task.initial_population, From 80b19d2ef2ef8e3f4cac54a8c3c445c9757d2a67 Mon Sep 17 00:00:00 2001 From: mo374z Date: Sat, 8 Mar 2025 22:21:33 +0100 Subject: [PATCH 19/41] helpers should use classificator --- promptolution/helpers.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/promptolution/helpers.py b/promptolution/helpers.py index 9d776a9..345d849 100644 --- a/promptolution/helpers.py +++ b/promptolution/helpers.py @@ -9,7 +9,7 @@ from promptolution.exemplar_selectors import get_exemplar_selector from promptolution.llms import get_llm from promptolution.optimizers import get_optimizer -from promptolution.predictors import Classificator +from promptolution.predictors import FirstOccurrenceClassificator from promptolution.tasks import get_task @@ -38,7 +38,7 @@ def run_optimization(config: Config): """ task = get_task(config) llm = get_llm(config.meta_llm, token=config.api_token) - predictor = Classificator(llm, classes=task.classes) + predictor = FirstOccurrenceClassificator(llm, classes=task.classes) if config.init_pop_size: init_pop = np.random.choice(task.initial_population, size=config.init_pop_size, replace=True) @@ -76,7 +76,7 @@ def run_evaluation(config: 
Config, prompts: List[str]): task = get_task(config, split="test") llm = get_llm(config.evaluation_llm, token=config.api_token) - predictor = Classificator(llm, classes=task.classes) + predictor = FirstOccurrenceClassificator(llm, classes=task.classes) scores = task.evaluate(prompts, predictor, subsample=True, n_samples=config.n_eval_samples) df = pd.DataFrame(dict(prompt=prompts, score=scores)) From ec4861ae5a008d62e12aba57f38582ea1f96fdca Mon Sep 17 00:00:00 2001 From: mo374z Date: Sat, 8 Mar 2025 22:40:28 +0100 Subject: [PATCH 20/41] use different model --- scripts/opro_test_run.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/opro_test_run.py b/scripts/opro_test_run.py index db71ba3..6a5b1f6 100644 --- a/scripts/opro_test_run.py +++ b/scripts/opro_test_run.py @@ -16,13 +16,13 @@ def main(): """Run a test run for the Opro optimizer.""" config = Config( - meta_llm="vllm-shuyuej/Llama-3.3-70B-Instruct-GPTQ", + meta_llm="vllm-Qwen/Qwen2.5-14B-Instruct-GPTQ-Int4", ds_path="data_sets/cls/agnews", task_name="agnews", n_steps=10, optimizer="opro", - downstream_llm="vllm-shuyuej/Llama-3.3-70B-Instruct-GPTQ", - evaluation_llm="vllm-shuyuej/Llama-3.3-70B-Instruct-GPTQ", + downstream_llm="vllm-Qwen/Qwen2.5-14B-Instruct-GPTQ-Int4", + evaluation_llm="vllm-Qwen/Qwen2.5-14B-Instruct-GPTQ-Int4", ) task = get_task(config, split="dev") From bf7f1df50fc79bc3d9cf2fcf558ae709142a1e8b Mon Sep 17 00:00:00 2001 From: mo374z Date: Sat, 8 Mar 2025 22:45:14 +0100 Subject: [PATCH 21/41] changes in opro test --- scripts/opro_test_run.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/scripts/opro_test_run.py b/scripts/opro_test_run.py index 6a5b1f6..f7cf0c0 100644 --- a/scripts/opro_test_run.py +++ b/scripts/opro_test_run.py @@ -28,7 +28,12 @@ def main(): task = get_task(config, split="dev") predictor = get_predictor(config.evaluation_llm, classes=task.classes) - llm = get_llm(config.meta_llm, model_storage_path="../models/") + llm = get_llm( + config.meta_llm, + max_model_len=512, + model_storage_path="../models/", + revision="main" + ) optimizer = Opro( llm, initial_prompts=task.initial_population, From 3969e03b8fe6f5bf9366b3629deebd82b371b975 Mon Sep 17 00:00:00 2001 From: mo374z Date: Sat, 8 Mar 2025 22:57:40 +0100 Subject: [PATCH 22/41] change get_predictor function --- promptolution/predictors/__init__.py | 6 ++---- scripts/opro_test_run.py | 10 ++++++---- scripts/prompt_creation_run.py | 2 +- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/promptolution/predictors/__init__.py b/promptolution/predictors/__init__.py index 65705c0..9d4d5b4 100644 --- a/promptolution/predictors/__init__.py +++ b/promptolution/predictors/__init__.py @@ -6,7 +6,7 @@ from .classificator import FirstOccurrenceClassificator, MarkerBasedClassificator -def get_predictor(name, type: str = "first_occurrence", *args, **kwargs): +def get_predictor(downstream_llm=None, type: str = "first_occurrence", *args, **kwargs): """Factory function to create and return a predictor instance based on the provided name. 
This function supports two types of predictors: @@ -32,11 +32,9 @@ def get_predictor(name, type: str = "first_occurrence", *args, **kwargs): >>> dummy_pred = get_predictor("dummy", classes=["A", "B", "C"]) >>> real_pred = get_predictor("gpt-3.5-turbo", classes=["positive", "negative"]) """ - if name == "dummy": + if downstream_llm is None: return DummyPredictor("", *args, **kwargs) - downstream_llm = get_llm(name) - if type == "first_occurrence": return FirstOccurrenceClassificator(downstream_llm, *args, **kwargs) elif type == "marker": diff --git a/scripts/opro_test_run.py b/scripts/opro_test_run.py index f7cf0c0..81e670d 100644 --- a/scripts/opro_test_run.py +++ b/scripts/opro_test_run.py @@ -15,18 +15,19 @@ def main(): """Run a test run for the Opro optimizer.""" + llm_name = "vllm-Qwen/Qwen2.5-14B-Instruct-GPTQ-Int4" + config = Config( - meta_llm="vllm-Qwen/Qwen2.5-14B-Instruct-GPTQ-Int4", + meta_llm=llm_name, ds_path="data_sets/cls/agnews", task_name="agnews", n_steps=10, optimizer="opro", - downstream_llm="vllm-Qwen/Qwen2.5-14B-Instruct-GPTQ-Int4", - evaluation_llm="vllm-Qwen/Qwen2.5-14B-Instruct-GPTQ-Int4", + downstream_llm=llm_name, + evaluation_llm=llm_name, ) task = get_task(config, split="dev") - predictor = get_predictor(config.evaluation_llm, classes=task.classes) llm = get_llm( config.meta_llm, @@ -34,6 +35,7 @@ def main(): model_storage_path="../models/", revision="main" ) + predictor = get_predictor(llm, classes=task.classes) optimizer = Opro( llm, initial_prompts=task.initial_population, diff --git a/scripts/prompt_creation_run.py b/scripts/prompt_creation_run.py index 4c17694..f7d54c3 100644 --- a/scripts/prompt_creation_run.py +++ b/scripts/prompt_creation_run.py @@ -21,7 +21,7 @@ def main(): llm = get_llm("meta-llama/Meta-Llama-3-8B-Instruct") task = get_task(config, split="dev") - predictor = get_predictor("meta-llama/Meta-Llama-3-8B-Instruct", classes=task.classes) + predictor = get_predictor(llm, classes=task.classes) init_prompts = create_prompts_from_samples(task, llm) logger.critical(f"Initial prompts: {init_prompts}") From bd05cd80cee6c3b5491a6bbf19336edff8089c9c Mon Sep 17 00:00:00 2001 From: mo374z Date: Sat, 8 Mar 2025 23:04:58 +0100 Subject: [PATCH 23/41] fix callback calling --- promptolution/optimizers/base_optimizer.py | 4 ++-- scripts/opro_test_run.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/promptolution/optimizers/base_optimizer.py b/promptolution/optimizers/base_optimizer.py index 95ec7c2..e150c1a 100644 --- a/promptolution/optimizers/base_optimizer.py +++ b/promptolution/optimizers/base_optimizer.py @@ -71,14 +71,14 @@ def _on_epoch_end(self): """Call all registered callbacks at the end of each optimization epoch.""" continue_optimization = True for callback in self.callbacks: - continue_optimization &= callback._on_epoch_end(self) # if any callback returns False, end the optimization + continue_optimization &= callback.on_epoch_end(self) # if any callback returns False, end the optimization return continue_optimization def _on_train_end(self): """Call all registered callbacks at the end of the entire optimization process.""" for callback in self.callbacks: - callback._on_train_end(self) + callback.on_train_end(self) class DummyOptimizer(BaseOptimizer): diff --git a/scripts/opro_test_run.py b/scripts/opro_test_run.py index 81e670d..e6654a0 100644 --- a/scripts/opro_test_run.py +++ b/scripts/opro_test_run.py @@ -31,7 +31,7 @@ def main(): llm = get_llm( config.meta_llm, - max_model_len=512, + max_model_len=2000, 
model_storage_path="../models/", revision="main" ) From 96e1bf613b3c100b1725aa9f040903181d4f3266 Mon Sep 17 00:00:00 2001 From: mo374z Date: Sat, 8 Mar 2025 23:11:02 +0100 Subject: [PATCH 24/41] change optimizer test run script --- scripts/opro_test_run.py | 53 ------------------------- scripts/optimizer_test_run.py | 73 +++++++++++++++++++++++++++++++++++ 2 files changed, 73 insertions(+), 53 deletions(-) delete mode 100644 scripts/opro_test_run.py create mode 100644 scripts/optimizer_test_run.py diff --git a/scripts/opro_test_run.py b/scripts/opro_test_run.py deleted file mode 100644 index e6654a0..0000000 --- a/scripts/opro_test_run.py +++ /dev/null @@ -1,53 +0,0 @@ -"""Test run for the Opro optimizer.""" - -from logging import Logger - -from promptolution.callbacks import LoggerCallback -from promptolution.llms import get_llm -from promptolution.optimizers import Opro -from promptolution.predictors import get_predictor -from promptolution.tasks import get_task - -from promptolution.config import Config - -logger = Logger(__name__) - - -def main(): - """Run a test run for the Opro optimizer.""" - llm_name = "vllm-Qwen/Qwen2.5-14B-Instruct-GPTQ-Int4" - - config = Config( - meta_llm=llm_name, - ds_path="data_sets/cls/agnews", - task_name="agnews", - n_steps=10, - optimizer="opro", - downstream_llm=llm_name, - evaluation_llm=llm_name, - - ) - task = get_task(config, split="dev") - - llm = get_llm( - config.meta_llm, - max_model_len=2000, - model_storage_path="../models/", - revision="main" - ) - predictor = get_predictor(llm, classes=task.classes) - optimizer = Opro( - llm, - initial_prompts=task.initial_population, - task=task, - predictor=predictor, - callbacks=[LoggerCallback(logger)], - n_samples=5, - ) - prompts = optimizer.optimize(n_steps=2) - - logger.info(f"Optimized prompts: {prompts}") - - -if __name__ == "__main__": - main() diff --git a/scripts/optimizer_test_run.py b/scripts/optimizer_test_run.py new file mode 100644 index 0000000..03fc79e --- /dev/null +++ b/scripts/optimizer_test_run.py @@ -0,0 +1,73 @@ +"""Test run for the Opro optimizer.""" +import argparse +from logging import Logger + +from promptolution.callbacks import LoggerCallback +from promptolution.llms import get_llm +from promptolution.optimizers import Opro, EvoPromptDE, EvoPromptGA +from promptolution.predictors import get_predictor +from promptolution.tasks import get_task + +from promptolution.config import Config + +logger = Logger(__name__) + +"""Run a test run for any of the implemented optimizers.""" +parser = argparse.ArgumentParser() +parser.add_argument("--model") +parser.add_argument("--model-storage-path", default="../models/") +parser.add_argument("--optimizer", default="evoprompt_de") +parser.add_argument("--n-steps", default=10) +args = parser.parse_args() + +config = Config( + meta_llm=args.model, + ds_path="data_sets/cls/agnews", + task_name="agnews", + n_steps=10, + optimizer="opro", + downstream_llm=args.model, + evaluation_llm=args.model, + +) +task = get_task(config, split="dev") + +llm = get_llm( + config.meta_llm, + max_model_len=2000, + model_storage_path=args.model_storage_path, + revision="main" +) +predictor = get_predictor(llm, classes=task.classes) + +if args.optimizer == "evoprompt_de": + optimizer = EvoPromptDE( + llm, + initial_prompts=task.initial_population, + task=task, + predictor=predictor, + callbacks=[LoggerCallback(logger)], + n_samples=5, + ) +elif args.optimizer == "evoprompt_ga": + optimizer = EvoPromptGA( + llm, + initial_prompts=task.initial_population, + task=task, 
+ predictor=predictor, + callbacks=[LoggerCallback(logger)], + n_samples=5, + ) +else: + optimizer = Opro( + llm, + initial_prompts=task.initial_population, + task=task, + predictor=predictor, + callbacks=[LoggerCallback(logger)], + n_samples=5, + ) + +prompts = optimizer.optimize(n_steps=args.n_steps) + +logger.info(f"Optimized prompts: {prompts}") From 62c8de79c2eec0418f23d1f7612b4296b7e6986d Mon Sep 17 00:00:00 2001 From: mo374z Date: Sat, 8 Mar 2025 23:15:41 +0100 Subject: [PATCH 25/41] small alignments --- scripts/optimizer_test_run.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/optimizer_test_run.py b/scripts/optimizer_test_run.py index 03fc79e..e165993 100644 --- a/scripts/optimizer_test_run.py +++ b/scripts/optimizer_test_run.py @@ -42,7 +42,7 @@ if args.optimizer == "evoprompt_de": optimizer = EvoPromptDE( - llm, + meta_llm=llm, initial_prompts=task.initial_population, task=task, predictor=predictor, @@ -51,7 +51,7 @@ ) elif args.optimizer == "evoprompt_ga": optimizer = EvoPromptGA( - llm, + meta_llm=llm, initial_prompts=task.initial_population, task=task, predictor=predictor, @@ -60,7 +60,7 @@ ) else: optimizer = Opro( - llm, + meta_llm=llm, initial_prompts=task.initial_population, task=task, predictor=predictor, From 1aa56067e289a8b0f4e7efb7a42205444edfbc27 Mon Sep 17 00:00:00 2001 From: mo374z Date: Sat, 8 Mar 2025 23:17:27 +0100 Subject: [PATCH 26/41] small alignments --- scripts/optimizer_test_run.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/scripts/optimizer_test_run.py b/scripts/optimizer_test_run.py index e165993..9abf794 100644 --- a/scripts/optimizer_test_run.py +++ b/scripts/optimizer_test_run.py @@ -47,7 +47,6 @@ task=task, predictor=predictor, callbacks=[LoggerCallback(logger)], - n_samples=5, ) elif args.optimizer == "evoprompt_ga": optimizer = EvoPromptGA( @@ -56,7 +55,6 @@ task=task, predictor=predictor, callbacks=[LoggerCallback(logger)], - n_samples=5, ) else: optimizer = Opro( From 7214658b25e3bbc3e6f3bf00b709d9f5863036ea Mon Sep 17 00:00:00 2001 From: mo374z Date: Sat, 8 Mar 2025 23:19:57 +0100 Subject: [PATCH 27/41] small alignments --- scripts/optimizer_test_run.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/optimizer_test_run.py b/scripts/optimizer_test_run.py index 9abf794..8676fda 100644 --- a/scripts/optimizer_test_run.py +++ b/scripts/optimizer_test_run.py @@ -17,7 +17,7 @@ parser.add_argument("--model") parser.add_argument("--model-storage-path", default="../models/") parser.add_argument("--optimizer", default="evoprompt_de") -parser.add_argument("--n-steps", default=10) +parser.add_argument("--n-steps", type=int, default=10) args = parser.parse_args() config = Config( From 0b15410863d4d036fa7113dd1820825932acdf04 Mon Sep 17 00:00:00 2001 From: mo374z Date: Sun, 9 Mar 2025 00:05:49 +0100 Subject: [PATCH 28/41] some changes to match the current optimizer implementation --- promptolution/config.py | 1 + promptolution/helpers.py | 5 +-- promptolution/llms/api_llm.py | 4 +-- promptolution/optimizers/__init__.py | 10 ++++-- scripts/optimizer_test_run.py | 51 +++++----------------------- 5 files changed, 21 insertions(+), 50 deletions(-) diff --git a/promptolution/config.py b/promptolution/config.py index dac2d9a..ca07522 100644 --- a/promptolution/config.py +++ b/promptolution/config.py @@ -56,6 +56,7 @@ class Config: include_task_desc: bool = True donor_random: bool = False random_seed: int = 42 + model_storage_path: Optional[Path] = Path("../models/") selection_mode: 
Optional[Literal["random", "wheel", "tour"]] = "random" meta_bs: Optional[int] = None downstream_bs: Optional[int] = None diff --git a/promptolution/helpers.py b/promptolution/helpers.py index 345d849..0420b17 100644 --- a/promptolution/helpers.py +++ b/promptolution/helpers.py @@ -27,7 +27,7 @@ def run_experiment(config: Config): return df -def run_optimization(config: Config): +def run_optimization(config: Config, callbacks: List = None): """Run the optimization phase of the experiment. Args: @@ -37,7 +37,7 @@ def run_optimization(config: Config): List[str]: The optimized list of prompts. """ task = get_task(config) - llm = get_llm(config.meta_llm, token=config.api_token) + llm = get_llm(config.meta_llm, token=config.api_token, model_storage_path=config.model_storage_path) predictor = FirstOccurrenceClassificator(llm, classes=task.classes) if config.init_pop_size: @@ -52,6 +52,7 @@ def run_optimization(config: Config): task=task, predictor=predictor, n_eval_samples=config.n_eval_samples, + callbacks=callbacks, ) prompts = optimizer.optimize(n_steps=config.n_steps) diff --git a/promptolution/llms/api_llm.py b/promptolution/llms/api_llm.py index 14a70da..91c9942 100644 --- a/promptolution/llms/api_llm.py +++ b/promptolution/llms/api_llm.py @@ -3,7 +3,7 @@ import asyncio import time from logging import INFO, Logger -from typing import List +from typing import Any, List import nest_asyncio import openai @@ -63,7 +63,7 @@ class APILLM(BaseLLM): get_response_async: Asynchronously get responses for a list of prompts. """ - def __init__(self, model_id: str, token: str = None): + def __init__(self, model_id: str, token: str = None, **kwargs: Any): """Initialize the APILLM with a specific model. Args: diff --git a/promptolution/optimizers/__init__.py b/promptolution/optimizers/__init__.py index 7e386a2..acde31e 100644 --- a/promptolution/optimizers/__init__.py +++ b/promptolution/optimizers/__init__.py @@ -51,9 +51,13 @@ def get_optimizer( if optimizer == "dummy": return DummyOptimizer(*args, **kwargs) if config.optimizer == "evopromptde": - return EvoPromptDE(donor_random=config.donor_random, *args, **kwargs) + if include_task_desc: + return EvoPromptDE(prompt_template=EVOPROMPT_DE_TEMPLATE_TD, *args, **kwargs) + return EvoPromptDE(prompt_template=EVOPROMPT_DE_TEMPLATE, *args, **kwargs) if config.optimizer == "evopromptga": - return EvoPromptGA(selection_mode=config.selection_mode, *args, **kwargs) + if include_task_desc: + return EvoPromptGA(prompt_template=EVOPROMPT_GA_TEMPLATE_TD, *args, **kwargs) + return EvoPromptGA(prompt_template=EVOPROMPT_GA_TEMPLATE, *args, **kwargs) if config.optimizer == "opro": - return Opro(*args, **kwargs) + return Opro(prompt_template=OPRO_TEMPLATE, *args, **kwargs) raise ValueError(f"Unknown optimizer: {config.optimizer}") diff --git a/scripts/optimizer_test_run.py b/scripts/optimizer_test_run.py index 8676fda..4f1a851 100644 --- a/scripts/optimizer_test_run.py +++ b/scripts/optimizer_test_run.py @@ -3,10 +3,7 @@ from logging import Logger from promptolution.callbacks import LoggerCallback -from promptolution.llms import get_llm -from promptolution.optimizers import Opro, EvoPromptDE, EvoPromptGA -from promptolution.predictors import get_predictor -from promptolution.tasks import get_task +from promptolution.helpers import run_optimization from promptolution.config import Config @@ -16,56 +13,24 @@ parser = argparse.ArgumentParser() parser.add_argument("--model") parser.add_argument("--model-storage-path", default="../models/") -parser.add_argument("--optimizer", 
default="evoprompt_de") +parser.add_argument("--optimizer", default="evopromptde") parser.add_argument("--n-steps", type=int, default=10) +parser.add_argument("--token", default=None) args = parser.parse_args() config = Config( meta_llm=args.model, ds_path="data_sets/cls/agnews", task_name="agnews", - n_steps=10, - optimizer="opro", + n_steps=args.n_steps, + optimizer=args.optimizer, downstream_llm=args.model, evaluation_llm=args.model, - -) -task = get_task(config, split="dev") - -llm = get_llm( - config.meta_llm, - max_model_len=2000, + include_task_desc=True, + api_token=args.token, model_storage_path=args.model_storage_path, - revision="main" ) -predictor = get_predictor(llm, classes=task.classes) - -if args.optimizer == "evoprompt_de": - optimizer = EvoPromptDE( - meta_llm=llm, - initial_prompts=task.initial_population, - task=task, - predictor=predictor, - callbacks=[LoggerCallback(logger)], - ) -elif args.optimizer == "evoprompt_ga": - optimizer = EvoPromptGA( - meta_llm=llm, - initial_prompts=task.initial_population, - task=task, - predictor=predictor, - callbacks=[LoggerCallback(logger)], - ) -else: - optimizer = Opro( - meta_llm=llm, - initial_prompts=task.initial_population, - task=task, - predictor=predictor, - callbacks=[LoggerCallback(logger)], - n_samples=5, - ) -prompts = optimizer.optimize(n_steps=args.n_steps) +prompts = run_optimization(config, callbacks=[LoggerCallback(logger)]) logger.info(f"Optimized prompts: {prompts}") From 39679788e437d1fd6ab86db5aecea62ec1833092 Mon Sep 17 00:00:00 2001 From: finitearth Date: Sun, 9 Mar 2025 11:38:08 +0100 Subject: [PATCH 29/41] changes in template and config --- promptolution/config.py | 7 +++++-- promptolution/helpers.py | 10 ++++++++-- promptolution/optimizers/__init__.py | 22 +++++++++++++++------- promptolution/optimizers/opro.py | 1 - promptolution/templates.py | 17 +++++++++++++++-- scripts/optimizer_test_run.py | 2 +- 6 files changed, 44 insertions(+), 15 deletions(-) diff --git a/promptolution/config.py b/promptolution/config.py index ca07522..25e254c 100644 --- a/promptolution/config.py +++ b/promptolution/config.py @@ -17,15 +17,17 @@ class Config: ds_path (str): Path to the dataset. Should not be None if used. n_steps (int): Number of optimization steps. Should not be None if used. optimizer (str): Name of the optimizer to use. Should not be None if used. + predictor (str): Name of the predictor to use. Defaults to "FirstOccurenceClassificator". meta_llm (str): Name of the meta language model. Should not be None if used. downstream_llm (str): Name of the downstream language model. Should not be None if used. evaluation_llm (str): Name of the evaluation language model. Should not be None if used. init_pop_size (int): Initial population size. Defaults to 10. logging_dir (str): Directory for logging. Defaults to "logs/run.csv". experiment_name (str): Name of the experiment. Defaults to "experiment". - include_task_desc (bool): Whether to include task description. Defaults to False. + task_description (str): Task Description fed to the optimizer. Defaults to None. donor_random (bool): Whether to use random donor prompts for EvoPromptDE. Defaults to False. random_seed (int): Random seed for reproducibility. Defaults to 42. + model_storage_path (str): Path to the model storage directory (used for VLLM). Defaults to "../models/". selection_mode (str): Selection mode for EvoPromptGA. Defaults to "random". meta_bs (int): Batch size for local meta LLM. Should not be None if llm is run locally. Defaults to None. 
downstream_bs (int): Batch size for local downstream LLM. @@ -46,6 +48,7 @@ class Config: task_name: str = None ds_path: Path = None optimizer: str = None + predictor: Literal["MarkerBasedClassificator", "FirstOccurenceClassificator"] = "FirstOccurenceClassificator" meta_llm: str = None downstream_llm: str = None evaluation_llm: str = None @@ -53,7 +56,7 @@ class Config: init_pop_size: int = None logging_dir: Path = Path("logs/run.csv") experiment_name: str = "experiment" - include_task_desc: bool = True + task_description: str = None donor_random: bool = False random_seed: int = 42 model_storage_path: Optional[Path] = Path("../models/") diff --git a/promptolution/helpers.py b/promptolution/helpers.py index 0420b17..52472ea 100644 --- a/promptolution/helpers.py +++ b/promptolution/helpers.py @@ -9,7 +9,7 @@ from promptolution.exemplar_selectors import get_exemplar_selector from promptolution.llms import get_llm from promptolution.optimizers import get_optimizer -from promptolution.predictors import FirstOccurrenceClassificator +from promptolution.predictors import FirstOccurrenceClassificator, MarkerBasedClassificator from promptolution.tasks import get_task @@ -38,7 +38,12 @@ def run_optimization(config: Config, callbacks: List = None): """ task = get_task(config) llm = get_llm(config.meta_llm, token=config.api_token, model_storage_path=config.model_storage_path) - predictor = FirstOccurrenceClassificator(llm, classes=task.classes) + if config.predictor == "MarkerBasedClassificator": + predictor = MarkerBasedClassificator(llm, classes=task.classes) + elif config.predictor == "FirstOccurenceClassificator": + predictor = FirstOccurrenceClassificator(llm, classes=task.classes) + else: + raise ValueError(f"Predictor {config.predictor} not supported.") if config.init_pop_size: init_pop = np.random.choice(task.initial_population, size=config.init_pop_size, replace=True) @@ -53,6 +58,7 @@ def run_optimization(config: Config, callbacks: List = None): predictor=predictor, n_eval_samples=config.n_eval_samples, callbacks=callbacks, + task_description=predictor.extraction_description, ) prompts = optimizer.optimize(n_steps=config.n_steps) diff --git a/promptolution/optimizers/__init__.py b/promptolution/optimizers/__init__.py index acde31e..09c57fa 100644 --- a/promptolution/optimizers/__init__.py +++ b/promptolution/optimizers/__init__.py @@ -6,6 +6,7 @@ EVOPROMPT_GA_TEMPLATE, EVOPROMPT_GA_TEMPLATE_TD, OPRO_TEMPLATE, + OPRO_TEMPLATE_TD, ) from .base_optimizer import DummyOptimizer @@ -15,7 +16,7 @@ def get_optimizer( - config=None, optimizer: str = None, include_task_desc: bool = None, meta_prompt: str = None, *args, **kwargs + config=None, optimizer: str = None, meta_prompt: str = None, task_description: str = None, *args, **kwargs ): """Factory function to create and return an optimizer instance based on the provided configuration. @@ -30,6 +31,7 @@ def get_optimizer( - Any other string for the specified optimizer class include_task_desc (bool): Flag to include task description in the prompt. meta_prompt (str): Meta prompt for the optimizer. + task_description (str): Task description for the optimizer. *args: Variable length argument list passed to the optimizer constructor. 
**kwargs: Arbitrary keyword arguments passed to the optimizer constructor @@ -42,8 +44,8 @@ def get_optimizer( if optimizer is None: optimizer = config.optimizer - if include_task_desc is None: - include_task_desc = config.include_task_desc + if task_description is None: + task_description = config.task_description if config is not None and meta_prompt is None: meta_prompt = config.meta_prompt @@ -51,13 +53,19 @@ def get_optimizer( if optimizer == "dummy": return DummyOptimizer(*args, **kwargs) if config.optimizer == "evopromptde": - if include_task_desc: - return EvoPromptDE(prompt_template=EVOPROMPT_DE_TEMPLATE_TD, *args, **kwargs) + if task_description is not None: + return EvoPromptDE( + prompt_template=EVOPROMPT_DE_TEMPLATE_TD.replace("", task_description), *args, **kwargs + ) return EvoPromptDE(prompt_template=EVOPROMPT_DE_TEMPLATE, *args, **kwargs) if config.optimizer == "evopromptga": - if include_task_desc: - return EvoPromptGA(prompt_template=EVOPROMPT_GA_TEMPLATE_TD, *args, **kwargs) + if task_description is not None: + return EvoPromptGA( + prompt_template=EVOPROMPT_GA_TEMPLATE_TD.replace("", task_description), *args, **kwargs + ) return EvoPromptGA(prompt_template=EVOPROMPT_GA_TEMPLATE, *args, **kwargs) if config.optimizer == "opro": + if task_description is not None: + return Opro(prompt_template=OPRO_TEMPLATE_TD.replace("", task_description), *args, **kwargs) return Opro(prompt_template=OPRO_TEMPLATE, *args, **kwargs) raise ValueError(f"Unknown optimizer: {config.optimizer}") diff --git a/promptolution/optimizers/opro.py b/promptolution/optimizers/opro.py index ef6f6fd..7ef3616 100644 --- a/promptolution/optimizers/opro.py +++ b/promptolution/optimizers/opro.py @@ -36,7 +36,6 @@ def __init__(self, meta_llm: BaseLLM, n_samples: int = 2, prompt_template: str = self.meta_prompt = prompt_template if prompt_template else OPRO_TEMPLATE super().__init__(**args) - self.meta_prompt = self.meta_prompt.replace("", self.task.description) self.scores = [ self.task.evaluate(p, self.predictor, subsample=True, n_samples=self.n_eval_samples)[0] diff --git a/promptolution/templates.py b/promptolution/templates.py index 18c0765..6cbc39e 100644 --- a/promptolution/templates.py +++ b/promptolution/templates.py @@ -86,8 +86,21 @@ 1.""" -OPRO_TEMPLATE = """Your task is to generate an instruction for the following task: - +OPRO_TEMPLATE = """Your task is to generate an instruction. + +Below are some previous instructions with their scores. The score ranges from 0 to 100. + + + +Here are some examples of the target dataset: + + +Generate a new instruction bracketed with and ending it with that is different from all the instructions above and has a higher score than all the instructions above. The instruction should be concise, effective, and generally applicable to the task described. + +Your new instruction:""" + +OPRO_TEMPLATE_TD = """Your task is to generate an instruction for the following task: + Below are some previous instructions with their scores. The score ranges from 0 to 100. 
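[Editor's note — not part of the patch] For orientation, a condensed sketch of how the refactored factory above is meant to be driven, mirroring the run_optimization helper changed in this same patch; the config, llm, task, and predictor objects are assumed to be constructed as in promptolution/helpers.py, and the description string is invented for illustration:

    from promptolution.optimizers import get_optimizer

    # Sketch only: `config`, `llm`, `task`, and `predictor` are assumed to exist,
    # built as in run_optimization() in promptolution/helpers.py.
    optimizer = get_optimizer(
        config,
        meta_llm=llm,
        initial_prompts=task.initial_population,
        task=task,
        predictor=predictor,
        task_description="Classify each text into one of the listed classes.",  # invented example
    )
    prompts = optimizer.optimize(n_steps=config.n_steps)

When task_description is given (or taken from config), the factory substitutes it into the *_TD meta-prompt template; otherwise the plain template without a task description is used.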
diff --git a/scripts/optimizer_test_run.py b/scripts/optimizer_test_run.py index 4f1a851..eaa683b 100644 --- a/scripts/optimizer_test_run.py +++ b/scripts/optimizer_test_run.py @@ -22,11 +22,11 @@ meta_llm=args.model, ds_path="data_sets/cls/agnews", task_name="agnews", + predictor = "MarkerBasedClassificator", n_steps=args.n_steps, optimizer=args.optimizer, downstream_llm=args.model, evaluation_llm=args.model, - include_task_desc=True, api_token=args.token, model_storage_path=args.model_storage_path, ) From 9f8c0b6080ee0a791c51ec06aca62e14aca49f33 Mon Sep 17 00:00:00 2001 From: finitearth Date: Sun, 9 Mar 2025 12:15:48 +0100 Subject: [PATCH 30/41] allow for batching of prompt creation --- promptolution/utils/prompt_creation.py | 76 ++++++++++++++------------ 1 file changed, 42 insertions(+), 34 deletions(-) diff --git a/promptolution/utils/prompt_creation.py b/promptolution/utils/prompt_creation.py index 07f8c16..08e88dd 100644 --- a/promptolution/utils/prompt_creation.py +++ b/promptolution/utils/prompt_creation.py @@ -36,7 +36,12 @@ def create_prompt_variation(prompt: Union[List[str], str], llm: BaseLLM, meta_pr def create_prompts_from_samples( - task: BaseTask, llm: BaseLLM, meta_prompt: str = None, n_samples: int = 3, task_description: str = None + task: BaseTask, + llm: BaseLLM, + meta_prompt: str = None, + n_samples: int = 3, + task_description: str = None, + n_prompts: int = 1, ) -> List[str]: """Generate a set of prompts from dataset examples sampled from a given task. @@ -53,40 +58,43 @@ def create_prompts_from_samples( If None, a default meta prompt is used. n_samples (int): The number of samples to use for generating prompts. task_description (str): The description of the task to include in the prompt. + n_prompts (int): The number of prompts to generate. Returns: List[str]: A list of generated prompts. 
""" - if isinstance(task, ClassificationTask): - # if classification task sample such that all classes are represented - unique_labels, counts = np.unique(task.ys, return_counts=True) - proportions = counts / len(task.ys) - samples_per_class = np.round(proportions * n_samples).astype(int) - samples_per_class = np.maximum(samples_per_class, 1) - - # sample - xs = [] - ys = [] - for label, n_samples in zip(unique_labels, samples_per_class): - indices = np.where(task.ys == label)[0] - indices = np.random.choice(indices, n_samples, replace=False) - xs.extend(task.xs[indices]) - ys.extend(task.ys[indices]) - - else: - # if not classification task, sample randomly - indices = np.random.choice(len(task.xs), n_samples, replace=False) - xs = task.xs[indices].tolist() - ys = task.ys[indices].tolist() - - if meta_prompt is None: - meta_prompt = PROMPT_CREATION_TEMPLATE - if task_description is None: - meta_prompt = PROMPT_CREATION_TEMPLATE_TD.replace("", task_description) - examples = "\n\n".join([f"Input: {x}\nOutput: {y}" for x, y in zip(xs, ys)]) - meta_prompt = meta_prompt.replace("", examples) - prompt = llm.get_response([meta_prompt])[0] - prompt = prompt.split("")[0].split("")[-1] - prompt = prompt.strip() - - return prompt + meta_prompts = [] + for _ in range(n_prompts): + if isinstance(task, ClassificationTask): + # if classification task sample such that all classes are represented + unique_labels, counts = np.unique(task.ys, return_counts=True) + proportions = counts / len(task.ys) + samples_per_class = np.round(proportions * n_samples).astype(int) + samples_per_class = np.maximum(samples_per_class, 1) + + # sample + xs = [] + ys = [] + for label, n_samples in zip(unique_labels, samples_per_class): + indices = np.where(task.ys == label)[0] + indices = np.random.choice(indices, n_samples, replace=False) + xs.extend(task.xs[indices]) + ys.extend(task.ys[indices]) + + else: + # if not classification task, sample randomly + indices = np.random.choice(len(task.xs), n_samples, replace=False) + xs = task.xs[indices].tolist() + ys = task.ys[indices].tolist() + + if meta_prompt is None: + meta_prompt = PROMPT_CREATION_TEMPLATE + if task_description is None: + meta_prompt = PROMPT_CREATION_TEMPLATE_TD.replace("", task_description) + examples = "\n\n".join([f"Input: {x}\nOutput: {y}" for x, y in zip(xs, ys)]) + meta_prompt = meta_prompt.replace("", examples) + meta_prompts.append(meta_prompt) + prompts = llm.get_response(meta_prompts) + prompts = [prompt.split("")[0].split("")[-1].strip() for prompt in prompts] + + return prompts From 8ecc6a8bfdef8835d69e29b9c9de49eaa2ef0838 Mon Sep 17 00:00:00 2001 From: Moritz Schlager <87517800+mo374z@users.noreply.github.com> Date: Sun, 9 Mar 2025 21:45:49 +0100 Subject: [PATCH 31/41] v1.3.0 (#34) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Feature/workflows (#8) * chore: add codeowners file * chore: add python poetry action and docs workflow * chore: update pre-commit file * chore: update docs * chore: update logo * chore: add cicd pipeline for automated deployment * chore: update poetry version * chore: fix action versioning * chore: add gitattributes to ignore line count in jupyter notebooks * chore: add and update docstrings * chore: fix end of files * chore: update action versions * Update README.md --------- Co-authored-by: mo374z * Fix/workflows (#11) * chore: fix workflow execution * chore: fix version check in CICD pipeline * Opro implementation (#7) * update gitignore * initial implementation of opro * 
formatting of prompt template * added opro test run * opro refinements * fixed sampling error * add docs to opro * fix pre commit issues# * fix pre commit issues# * fixed end of line * Patch/pre commit config (#10) * fixed pre commit config and removed end of file line breaks in tempaltes * added / * Feature/prompt generation (#12) * added prompt_creation.py * change version * Create LICENSE (#14) * Refactor/remove deepinfra (#16) * Remove deepinfra file * change langchain-community version * Usability patches (#15) * renamed get_tasks to get_task and change functionality accordingly. moved templates and data_sets * init * move templates to templates.py * Add nested asyncio to make it useable in notebooks * Update README.md * changed getting_started.ipynb and created helper functions * added sampling of initial population * fixed config * fixed callbacks * adjust runs * fix run evaluation api token * fix naming convention in opro, remove on epoch end for logger callback, fixed to allow for numeric values in class names * Update promptolution/llms/api_llm.py Co-authored-by: Timo Heiß <87521684+timo282@users.noreply.github.com> * fixed comments * Update pyproject.toml * resolve comments --------- Co-authored-by: mo374z Co-authored-by: Timo Heiß <87521684+timo282@users.noreply.github.com> Co-authored-by: Moritz Schlager <87517800+mo374z@users.noreply.github.com> * Feature/examplar selection (#17) * implemented random selector * added random search selector * increased version count * fix typos * Update promptolution/predictors/base_predictor.py Co-authored-by: Timo Heiß <87521684+timo282@users.noreply.github.com> * Update promptolution/tasks/classification_tasks.py Co-authored-by: Timo Heiß <87521684+timo282@users.noreply.github.com> * resolve comments * resolve comments --------- Co-authored-by: Timo Heiß <87521684+timo282@users.noreply.github.com> * Chore/docs release notes (#18) * Update release-notes.md * Fix release note links * revert Chore/docs release notes (#18)" This reverts commit e23dd743cf7e1eefc89746409619dc947bd6d349. 
* revert last commit * updated release notes and read me * Feature/read from df (#21) * Delete Experiment files * Removed config necessities * improved opro meta-prompts * added read from data frame feature * changed required python version to 3.9 * Update pyproject.toml * Update release-notes.md * merge * merge * resolve merge mistakes * delete duplicated lines * Update release-notes.md (#24) * Fix/dependencies (#28) * delete poetry.lock and upgrade transformers dependency * Update release-notes.md * Add vllm as feature and a llm_test_run_script * small fixes in vllm class * differentiate between vllm and api inference * set up experiment over multiple tasks and prompts * change csv saving * add base llm super class * add changes from PR review * change some VLLM params * fix tensor parallel size to 1 * experiment with batch size * experiment with larger batch sizes * add continuous batch llm * remove arg * remove continuous batch inference try * add batching to vllm * add batching in script * Add release notes and increase version number * remove llm_test_run.py script * change system prompt * Fix/vllm (#33) * add token count, flexible batch size and kwargs to vllm class * add testing script for implementation * fix batch size calculation * small changes * add revision test * add argument to parser * max model len to int * remove script * Change version and Release notes * changed callback behaviour and impelemented token count callback * added super inits * allow for splits not based on white space (such as new line break etc) * include task descriptions * add tokenizer based token count to vllm class * update test run script * use classifiers accordingly * small fix * add storage path * helpers should use classificator * use different model * changes in opro test * change get_predictor function * fix callback calling * change optimizer test run script * small alignments * small alignments * small alignments * some changes to match the current optimizer implementation * changes in template and config * allow for batching of prompt creation * update release notes and version * extend csvcallback functionality * change callback csv export * change step time calculation * small changes * remove llm_test_run script * update release notes * fix issues in token stepswise calculation * small fix --------- Co-authored-by: finitearth * implement changes from review * add typing to token count callback --------- Co-authored-by: Timo Heiß <87521684+timo282@users.noreply.github.com> Co-authored-by: Tom Zehle Co-authored-by: Timo Heiß --- .gitignore | 1 + docs/release-notes.md | 14 +++ promptolution/callbacks.py | 121 ++++++++++++++++++--- promptolution/config.py | 8 +- promptolution/helpers.py | 17 ++- promptolution/llms/api_llm.py | 7 +- promptolution/llms/base_llm.py | 59 +++++++++- promptolution/llms/local_llm.py | 4 +- promptolution/llms/vllm.py | 70 +++++++++--- promptolution/optimizers/__init__.py | 24 +++- promptolution/optimizers/base_optimizer.py | 12 +- promptolution/optimizers/evoprompt_de.py | 6 +- promptolution/optimizers/evoprompt_ga.py | 5 +- promptolution/optimizers/opro.py | 5 +- promptolution/predictors/__init__.py | 45 ++++---- promptolution/predictors/classificator.py | 59 +++++++++- promptolution/templates.py | 25 ++++- promptolution/utils/prompt_creation.py | 78 +++++++------ pyproject.toml | 2 +- scripts/opro_test_run.py | 46 -------- scripts/optimizer_test_run.py | 36 ++++++ scripts/prompt_creation_run.py | 2 +- 22 files changed, 480 insertions(+), 166 deletions(-) delete 
mode 100644 scripts/opro_test_run.py create mode 100644 scripts/optimizer_test_run.py diff --git a/.gitignore b/.gitignore index 5786ca0..088f43a 100644 --- a/.gitignore +++ b/.gitignore @@ -7,4 +7,5 @@ __pycache__/ temp/ dist/ outputs/ +results/ poetry.lock diff --git a/docs/release-notes.md b/docs/release-notes.md index 20b97b7..8ea09ca 100644 --- a/docs/release-notes.md +++ b/docs/release-notes.md @@ -1,5 +1,19 @@ # Release Notes +## Release v1.3.0 +### What's changed +#### Added features +* new features for the VLLM Wrapper (automatic batch size determination, accepting kwargs) +* allow callbacks to terminate optimization run +* add token count functionality +* renamed "Classificator"-Predictor to "FirstOccurenceClassificator" +* introduced "MarkerBasedClassifcator" +* automatic task description creation +* use task description in prompt creation +* implement CSV callbacks + +**Full Changelog**: [here](https://github.com/finitearth/promptolution/compare/v1.2.0...v1.3.0) + ## Release v1.2.0 ### What's changed #### Added features diff --git a/promptolution/callbacks.py b/promptolution/callbacks.py index fe655d6..48a9b3e 100644 --- a/promptolution/callbacks.py +++ b/promptolution/callbacks.py @@ -1,7 +1,10 @@ """Callback classes for logging, saving, and tracking optimization progress.""" import os +import time +from typing import Literal +import numpy as np import pandas as pd from tqdm import tqdm @@ -14,24 +17,33 @@ def on_step_end(self, optimizer): Args: optimizer: The optimizer object that called the callback. + + Returns: + Bool: True if the optimization should continue, False if it should stop. """ - pass + return True def on_epoch_end(self, optimizer): """Called at the end of each optimization epoch. Args: optimizer: The optimizer object that called the callback. + + Returns: + Bool: True if the optimization should continue, False if it should stop. """ - pass + return True def on_train_end(self, optimizer): """Called at the end of the entire optimization process. Args: optimizer: The optimizer object that called the callback. + + Returns: + Bool: True if the optimization should continue, False if it should stop. """ - pass + return True class LoggerCallback(Callback): @@ -57,6 +69,8 @@ def on_step_end(self, optimizer): self.logger.critical(f"*** Prompt {i}: Score: {score}") self.logger.critical(f"{prompt}") + return True + def on_train_end(self, optimizer, logs=None): """Log information at the end of training. @@ -64,7 +78,12 @@ def on_train_end(self, optimizer, logs=None): optimizer: The optimizer object that called the callback. logs: Additional information to log. """ - self.logger.critical(f"Training ended - {logs}") + if logs is None: + self.logger.critical("Training ended") + else: + self.logger.critical(f"Training ended - {logs}") + + return True class CSVCallback(Callback): @@ -73,25 +92,25 @@ class CSVCallback(Callback): This callback saves prompts and scores at each step to a CSV file. Attributes: - path (str): The path to the CSV file. + dir (str): Directory the CSV file is saved to. step (int): The current step number. """ - def __init__(self, path): + def __init__(self, dir): """Initialize the CSVCallback. Args: - path (str): The path to the CSV file. + dir (str): Directory the CSV file is saved to. 
""" - # if dir does not exist - if not os.path.exists(os.path.dirname(path)): - os.makedirs(os.path.dirname(path)) - - # create file in path with header: "step,prompt,score" - with open(path, "w") as f: - f.write("step,prompt,score\n") - self.path = path + if not os.path.exists(dir): + os.makedirs(dir) + + self.dir = dir self.step = 0 + self.input_tokens = 0 + self.output_tokens = 0 + self.start_time = time.time() + self.step_time = time.time() def on_step_end(self, optimizer): """Save prompts and scores to csv. @@ -101,9 +120,25 @@ def on_step_end(self, optimizer): """ self.step += 1 df = pd.DataFrame( - {"step": [self.step] * len(optimizer.prompts), "prompt": optimizer.prompts, "score": optimizer.scores} + { + "step": [self.step] * len(optimizer.prompts), + "input_tokens": [optimizer.meta_llm.input_token_count - self.input_tokens] * len(optimizer.prompts), + "output_tokens": [optimizer.meta_llm.output_token_count - self.output_tokens] * len(optimizer.prompts), + "time_elapsed": [time.time() - self.step_time] * len(optimizer.prompts), + "score": optimizer.scores, + "prompt": optimizer.prompts, + } ) - df.to_csv(self.path, mode="a", header=False, index=False) + self.step_time = time.time() + self.input_tokens = optimizer.meta_llm.input_token_count + self.output_tokens = optimizer.meta_llm.output_token_count + + if not os.path.exists(self.dir + "step_results.csv"): + df.to_csv(self.dir + "step_results.csv", index=False) + else: + df.to_csv(self.dir + "step_results.csv", mode="a", header=False, index=False) + + return True def on_train_end(self, optimizer): """Called at the end of training. @@ -111,7 +146,24 @@ def on_train_end(self, optimizer): Args: optimizer: The optimizer object that called the callback. """ - pass + df = pd.DataFrame( + dict( + steps=self.step, + input_tokens=optimizer.meta_llm.input_token_count, + output_tokens=optimizer.meta_llm.output_token_count, + time_elapsed=time.time() - self.start_time, + score=np.array(optimizer.scores).mean(), + best_prompts=str(optimizer.prompts), + ), + index=[0], + ) + + if not os.path.exists(self.dir + "train_results.csv"): + df.to_csv(self.dir + "train_results.csv", index=False) + else: + df.to_csv(self.dir + "train_results.csv", mode="a", header=False, index=False) + + return True class BestPromptCallback(Callback): @@ -139,6 +191,8 @@ def on_step_end(self, optimizer): self.best_score = optimizer.scores[0] self.best_prompt = optimizer.prompts[0] + return True + def get_best_prompt(self): """Get the best prompt and score achieved during optimization. @@ -173,6 +227,8 @@ def on_step_end(self, optimizer): """ self.pbar.update(1) + return True + def on_train_end(self, optimizer): """Close the progress bar at the end of training. @@ -180,3 +236,32 @@ def on_train_end(self, optimizer): optimizer: The optimizer object that called the callback. """ self.pbar.close() + + return True + + +class TokenCountCallback(Callback): + """Callback for stopping optimization based on the total token count.""" + + def __init__( + self, + max_tokens_for_termination: int, + token_type_for_termination: Literal["input_tokens", "output_tokens", "total_tokens"], + ): + """Initialize the TokenCountCallback. + + Args: + max_tokens_for_termination (int): Maximum number of tokens which is allowed befor the algorithm is stopped. + token_type_for_termination (str): Can be one of either "input_tokens", "output_tokens" or "total_tokens". 
+ """ + self.max_tokens_for_termination = max_tokens_for_termination + self.token_type_for_termination = token_type_for_termination + + def on_step_end(self, optimizer): + """Check if the total token count exceeds the maximum allowed. If so, stop the optimization.""" + token_counts = optimizer.predictor.llm.get_token_count() + + if token_counts[self.token_type_for_termination] > self.max_tokens_for_termination: + return False + + return True diff --git a/promptolution/config.py b/promptolution/config.py index dac2d9a..25e254c 100644 --- a/promptolution/config.py +++ b/promptolution/config.py @@ -17,15 +17,17 @@ class Config: ds_path (str): Path to the dataset. Should not be None if used. n_steps (int): Number of optimization steps. Should not be None if used. optimizer (str): Name of the optimizer to use. Should not be None if used. + predictor (str): Name of the predictor to use. Defaults to "FirstOccurenceClassificator". meta_llm (str): Name of the meta language model. Should not be None if used. downstream_llm (str): Name of the downstream language model. Should not be None if used. evaluation_llm (str): Name of the evaluation language model. Should not be None if used. init_pop_size (int): Initial population size. Defaults to 10. logging_dir (str): Directory for logging. Defaults to "logs/run.csv". experiment_name (str): Name of the experiment. Defaults to "experiment". - include_task_desc (bool): Whether to include task description. Defaults to False. + task_description (str): Task Description fed to the optimizer. Defaults to None. donor_random (bool): Whether to use random donor prompts for EvoPromptDE. Defaults to False. random_seed (int): Random seed for reproducibility. Defaults to 42. + model_storage_path (str): Path to the model storage directory (used for VLLM). Defaults to "../models/". selection_mode (str): Selection mode for EvoPromptGA. Defaults to "random". meta_bs (int): Batch size for local meta LLM. Should not be None if llm is run locally. Defaults to None. downstream_bs (int): Batch size for local downstream LLM. @@ -46,6 +48,7 @@ class Config: task_name: str = None ds_path: Path = None optimizer: str = None + predictor: Literal["MarkerBasedClassificator", "FirstOccurenceClassificator"] = "FirstOccurenceClassificator" meta_llm: str = None downstream_llm: str = None evaluation_llm: str = None @@ -53,9 +56,10 @@ class Config: init_pop_size: int = None logging_dir: Path = Path("logs/run.csv") experiment_name: str = "experiment" - include_task_desc: bool = True + task_description: str = None donor_random: bool = False random_seed: int = 42 + model_storage_path: Optional[Path] = Path("../models/") selection_mode: Optional[Literal["random", "wheel", "tour"]] = "random" meta_bs: Optional[int] = None downstream_bs: Optional[int] = None diff --git a/promptolution/helpers.py b/promptolution/helpers.py index 9d776a9..52472ea 100644 --- a/promptolution/helpers.py +++ b/promptolution/helpers.py @@ -9,7 +9,7 @@ from promptolution.exemplar_selectors import get_exemplar_selector from promptolution.llms import get_llm from promptolution.optimizers import get_optimizer -from promptolution.predictors import Classificator +from promptolution.predictors import FirstOccurrenceClassificator, MarkerBasedClassificator from promptolution.tasks import get_task @@ -27,7 +27,7 @@ def run_experiment(config: Config): return df -def run_optimization(config: Config): +def run_optimization(config: Config, callbacks: List = None): """Run the optimization phase of the experiment. 
Args: @@ -37,8 +37,13 @@ def run_optimization(config: Config): List[str]: The optimized list of prompts. """ task = get_task(config) - llm = get_llm(config.meta_llm, token=config.api_token) - predictor = Classificator(llm, classes=task.classes) + llm = get_llm(config.meta_llm, token=config.api_token, model_storage_path=config.model_storage_path) + if config.predictor == "MarkerBasedClassificator": + predictor = MarkerBasedClassificator(llm, classes=task.classes) + elif config.predictor == "FirstOccurenceClassificator": + predictor = FirstOccurrenceClassificator(llm, classes=task.classes) + else: + raise ValueError(f"Predictor {config.predictor} not supported.") if config.init_pop_size: init_pop = np.random.choice(task.initial_population, size=config.init_pop_size, replace=True) @@ -52,6 +57,8 @@ def run_optimization(config: Config): task=task, predictor=predictor, n_eval_samples=config.n_eval_samples, + callbacks=callbacks, + task_description=predictor.extraction_description, ) prompts = optimizer.optimize(n_steps=config.n_steps) @@ -76,7 +83,7 @@ def run_evaluation(config: Config, prompts: List[str]): task = get_task(config, split="test") llm = get_llm(config.evaluation_llm, token=config.api_token) - predictor = Classificator(llm, classes=task.classes) + predictor = FirstOccurrenceClassificator(llm, classes=task.classes) scores = task.evaluate(prompts, predictor, subsample=True, n_samples=config.n_eval_samples) df = pd.DataFrame(dict(prompt=prompts, score=scores)) diff --git a/promptolution/llms/api_llm.py b/promptolution/llms/api_llm.py index cf966bf..91c9942 100644 --- a/promptolution/llms/api_llm.py +++ b/promptolution/llms/api_llm.py @@ -3,7 +3,7 @@ import asyncio import time from logging import INFO, Logger -from typing import List +from typing import Any, List import nest_asyncio import openai @@ -63,7 +63,7 @@ class APILLM(BaseLLM): get_response_async: Asynchronously get responses for a list of prompts. """ - def __init__(self, model_id: str, token: str = None): + def __init__(self, model_id: str, token: str = None, **kwargs: Any): """Initialize the APILLM with a specific model. Args: @@ -73,6 +73,7 @@ def __init__(self, model_id: str, token: str = None): Raises: ValueError: If an unknown model identifier is provided. """ + super().__init__() if "claude" in model_id: self.model = ChatAnthropic(model=model_id, api_key=token) elif "gpt" in model_id: @@ -80,7 +81,7 @@ def __init__(self, model_id: str, token: str = None): else: self.model = ChatDeepInfra(model_name=model_id, deepinfra_api_token=token) - def get_response(self, prompts: List[str]) -> List[str]: + def _get_response(self, prompts: List[str]) -> List[str]: """Get responses for a list of prompts in a synchronous manner. This method includes retry logic for handling connection errors and rate limits. diff --git a/promptolution/llms/base_llm.py b/promptolution/llms/base_llm.py index 7f0e95d..dbe26fa 100644 --- a/promptolution/llms/base_llm.py +++ b/promptolution/llms/base_llm.py @@ -1,10 +1,13 @@ """Base module for LLMs in the promptolution library.""" +import logging from abc import ABC, abstractmethod from typing import List import numpy as np +logger = logging.getLogger(__name__) + class BaseLLM(ABC): """Abstract base class for Language Models in the promptolution library. @@ -18,10 +21,62 @@ class BaseLLM(ABC): def __init__(self, *args, **kwargs): """Initialize the LLM.""" - pass + self.input_token_count = 0 + self.output_token_count = 0 + + def get_token_count(self): + """Get the current count of input and output tokens. 
+ + Returns: + dict: A dictionary containing the input and output token counts. + """ + return { + "input_tokens": self.input_token_count, + "output_tokens": self.output_token_count, + "total_tokens": self.input_token_count + self.output_token_count, + } + + def reset_token_count(self): + """Reset the token counters to zero.""" + self.input_token_count = 0 + self.output_token_count = 0 + + def update_token_count(self, inputs: List[str], outputs: List[str]): + """Update the token count based on the given inputs and outputs. + + Args: + inputs (List[str]): A list of input prompts. + outputs (List[str]): A list of generated responses. + """ + logger.warning("Token count is approximated using word count split by whitespace, not an actual tokenizer.") + input_tokens = sum([len(i.split()) for i in inputs]) + output_tokens = sum([len(o.split()) for o in outputs]) + self.input_token_count += input_tokens + self.output_token_count += output_tokens + + def get_response(self, prompts: str) -> str: + """Generate responses for the given prompts. + + This method calls the _get_response method to generate responses + for the given prompts. It also updates the token count for the + input and output tokens. + + Args: + prompts (str or List[str]): Input prompt(s). If a single string is provided, + it's converted to a list containing that string. + + Returns: + List[str]: A list of generated responses, one for each input prompt. + """ + if isinstance(prompts, str): + prompts = [prompts] + responses = self._get_response(prompts) + self.update_token_count(prompts, responses) + + return responses @abstractmethod - def get_response(self, prompts: List[str]) -> List[str]: + def _get_response(self, prompts: List[str]) -> List[str]: """Generate responses for the given prompts. This method should be implemented by subclasses to define how diff --git a/promptolution/llms/local_llm.py b/promptolution/llms/local_llm.py index 074bf01..577d4a0 100644 --- a/promptolution/llms/local_llm.py +++ b/promptolution/llms/local_llm.py @@ -35,6 +35,8 @@ def __init__(self, model_id: str, batch_size=8): This method sets up a text generation pipeline with bfloat16 precision, automatic device mapping, and specific generation parameters. """ + super().__init__() + self.pipeline = transformers.pipeline( "text-generation", model=model_id, @@ -48,7 +50,7 @@ def __init__(self, model_id: str, batch_size=8): self.pipeline.tokenizer.pad_token_id = self.pipeline.tokenizer.eos_token_id self.pipeline.tokenizer.padding_side = "left" - def get_response(self, prompts: list[str]): + def _get_response(self, prompts: list[str]): """Generate responses for a list of prompts using the local language model. Args: diff --git a/promptolution/llms/vllm.py b/promptolution/llms/vllm.py index d99c542..f558458 100644 --- a/promptolution/llms/vllm.py +++ b/promptolution/llms/vllm.py @@ -2,6 +2,7 @@ from logging import INFO, Logger +from typing import List try: import torch @@ -32,22 +33,24 @@ class VLLM(BaseLLM): Methods: get_response: Generate responses for a list of prompts. + update_token_count: Update the token count based on the given inputs and outputs. 
""" def __init__( self, model_id: str, - batch_size: int = 64, + batch_size: int | None = None, max_generated_tokens: int = 256, temperature: float = 0.1, top_p: float = 0.9, - model_storage_path: str = None, - token: str = None, + model_storage_path: str | None = None, + token: str | None = None, dtype: str = "auto", tensor_parallel_size: int = 1, gpu_memory_utilization: float = 0.95, max_model_len: int = 2048, trust_remote_code: bool = False, + **kwargs, ): """Initialize the VLLM with a specific model. @@ -64,11 +67,13 @@ def __init__( gpu_memory_utilization (float, optional): Fraction of GPU memory to use. Defaults to 0.95. max_model_len (int, optional): Maximum sequence length for the model. Defaults to 2048. trust_remote_code (bool, optional): Whether to trust remote code. Defaults to False. + **kwargs: Additional keyword arguments to pass to the LLM class initialization. Note: This method sets up a vLLM engine with specified parameters for efficient inference. """ - self.batch_size = batch_size + super().__init__() + self.dtype = dtype self.tensor_parallel_size = tensor_parallel_size self.gpu_memory_utilization = gpu_memory_utilization @@ -78,22 +83,33 @@ def __init__( # Configure sampling parameters self.sampling_params = SamplingParams(temperature=temperature, top_p=top_p, max_tokens=max_generated_tokens) - # Initialize the vLLM engine - self.llm = LLM( - model=model_id, - tokenizer=model_id, - dtype=self.dtype, - tensor_parallel_size=self.tensor_parallel_size, - gpu_memory_utilization=self.gpu_memory_utilization, - max_model_len=self.max_model_len, - download_dir=model_storage_path, - trust_remote_code=self.trust_remote_code, - ) + # Initialize the vLLM engine with both explicit parameters and any additional kwargs + llm_params = { + "model": model_id, + "tokenizer": model_id, + "dtype": self.dtype, + "tensor_parallel_size": self.tensor_parallel_size, + "gpu_memory_utilization": self.gpu_memory_utilization, + "max_model_len": self.max_model_len, + "download_dir": model_storage_path, + "trust_remote_code": self.trust_remote_code, + **kwargs, + } + + self.llm = LLM(**llm_params) + + if batch_size is None: + gpu_blocks = self.llm.llm_engine.model_executor.cache_config.num_gpu_blocks + block_size = self.llm.llm_engine.model_executor.cache_config.block_size + self.batch_size = int((gpu_blocks * block_size / self.max_model_len) * 0.95) + logger.info(f"Batch size set to {self.batch_size} based on GPU memory.") + else: + self.batch_size = batch_size # Initialize tokenizer separately for potential pre-processing self.tokenizer = AutoTokenizer.from_pretrained(model_id) - def get_response(self, inputs: list[str]): + def _get_response(self, inputs: list[str]): """Generate responses for a list of prompts using the vLLM engine. Args: @@ -104,6 +120,7 @@ def get_response(self, inputs: list[str]): Note: This method uses vLLM's batched generation capabilities for efficient inference. + It also counts input and output tokens. 
""" prompts = [ self.tokenizer.apply_chat_template( @@ -119,16 +136,37 @@ def get_response(self, inputs: list[str]): for input in inputs ] + # Count input tokens + for prompt in prompts: + input_tokens = self.tokenizer.encode(prompt) + self.input_token_count += len(input_tokens) + # generate responses for self.batch_size prompts at the same time all_responses = [] for i in range(0, len(prompts), self.batch_size): batch = prompts[i : i + self.batch_size] outputs = self.llm.generate(batch, self.sampling_params) responses = [output.outputs[0].text for output in outputs] + all_responses.extend(responses) return all_responses + def update_token_count(self, inputs: List[str], outputs: List[str]): + """Update the token count based on the given inputs and outputs. + + Uses the tokenizer to count the tokens. + + Args: + inputs (List[str]): A list of input prompts. + outputs (List[str]): A list of generated responses. + """ + for input in inputs: + self.input_token_count += len(self.tokenizer.encode(input)) + + for output in outputs: + self.output_token_count += len(self.tokenizer.encode(output)) + def __del__(self): """Cleanup method to delete the LLM instance and free up GPU memory.""" del self.llm diff --git a/promptolution/optimizers/__init__.py b/promptolution/optimizers/__init__.py index 7e386a2..09c57fa 100644 --- a/promptolution/optimizers/__init__.py +++ b/promptolution/optimizers/__init__.py @@ -6,6 +6,7 @@ EVOPROMPT_GA_TEMPLATE, EVOPROMPT_GA_TEMPLATE_TD, OPRO_TEMPLATE, + OPRO_TEMPLATE_TD, ) from .base_optimizer import DummyOptimizer @@ -15,7 +16,7 @@ def get_optimizer( - config=None, optimizer: str = None, include_task_desc: bool = None, meta_prompt: str = None, *args, **kwargs + config=None, optimizer: str = None, meta_prompt: str = None, task_description: str = None, *args, **kwargs ): """Factory function to create and return an optimizer instance based on the provided configuration. @@ -30,6 +31,7 @@ def get_optimizer( - Any other string for the specified optimizer class include_task_desc (bool): Flag to include task description in the prompt. meta_prompt (str): Meta prompt for the optimizer. + task_description (str): Task description for the optimizer. *args: Variable length argument list passed to the optimizer constructor. 
**kwargs: Arbitrary keyword arguments passed to the optimizer constructor @@ -42,8 +44,8 @@ def get_optimizer( if optimizer is None: optimizer = config.optimizer - if include_task_desc is None: - include_task_desc = config.include_task_desc + if task_description is None: + task_description = config.task_description if config is not None and meta_prompt is None: meta_prompt = config.meta_prompt @@ -51,9 +53,19 @@ def get_optimizer( if optimizer == "dummy": return DummyOptimizer(*args, **kwargs) if config.optimizer == "evopromptde": - return EvoPromptDE(donor_random=config.donor_random, *args, **kwargs) + if task_description is not None: + return EvoPromptDE( + prompt_template=EVOPROMPT_DE_TEMPLATE_TD.replace("", task_description), *args, **kwargs + ) + return EvoPromptDE(prompt_template=EVOPROMPT_DE_TEMPLATE, *args, **kwargs) if config.optimizer == "evopromptga": - return EvoPromptGA(selection_mode=config.selection_mode, *args, **kwargs) + if task_description is not None: + return EvoPromptGA( + prompt_template=EVOPROMPT_GA_TEMPLATE_TD.replace("", task_description), *args, **kwargs + ) + return EvoPromptGA(prompt_template=EVOPROMPT_GA_TEMPLATE, *args, **kwargs) if config.optimizer == "opro": - return Opro(*args, **kwargs) + if task_description is not None: + return Opro(prompt_template=OPRO_TEMPLATE_TD.replace("", task_description), *args, **kwargs) + return Opro(prompt_template=OPRO_TEMPLATE, *args, **kwargs) raise ValueError(f"Unknown optimizer: {config.optimizer}") diff --git a/promptolution/optimizers/base_optimizer.py b/promptolution/optimizers/base_optimizer.py index 2cac685..bfb828a 100644 --- a/promptolution/optimizers/base_optimizer.py +++ b/promptolution/optimizers/base_optimizer.py @@ -1,5 +1,6 @@ """Base class for prompt optimizers.""" +import time from abc import ABC, abstractmethod from typing import Callable, List @@ -61,13 +62,19 @@ def optimize(self, n_steps: int) -> List[str]: def _on_step_end(self): """Call all registered callbacks at the end of each optimization step.""" + continue_optimization = True for callback in self.callbacks: - callback.on_step_end(self) + continue_optimization &= callback.on_step_end(self) # if any callback returns False, end the optimization + + return continue_optimization def _on_epoch_end(self): """Call all registered callbacks at the end of each optimization epoch.""" + continue_optimization = True for callback in self.callbacks: - callback.on_epoch_end(self) + continue_optimization &= callback.on_epoch_end(self) # if any callback returns False, end the optimization + + return continue_optimization def _on_train_end(self): """Call all registered callbacks at the end of the entire optimization process.""" @@ -111,4 +118,5 @@ def optimize(self, n_steps) -> list[str]: self._on_step_end() self._on_epoch_end() self._on_train_end() + return self.prompts diff --git a/promptolution/optimizers/evoprompt_de.py b/promptolution/optimizers/evoprompt_de.py index 17d74b3..f44556e 100644 --- a/promptolution/optimizers/evoprompt_de.py +++ b/promptolution/optimizers/evoprompt_de.py @@ -89,7 +89,11 @@ def optimize(self, n_steps: int) -> List[str]: self.prompts[i] = child_prompts[i] self.scores[i] = child_scores[i] - self._on_step_end() + continue_optimization = self._on_step_end() + + if not continue_optimization: + break self._on_train_end() + return self.prompts diff --git a/promptolution/optimizers/evoprompt_ga.py b/promptolution/optimizers/evoprompt_ga.py index 2ec789b..f6efcb8 100644 --- a/promptolution/optimizers/evoprompt_ga.py +++ 
b/promptolution/optimizers/evoprompt_ga.py @@ -77,7 +77,10 @@ def optimize(self, n_steps: int) -> List[str]: self.prompts = [prompt for _, prompt in sorted(zip(scores, prompts), reverse=True)][: len(self.prompts)] self.scores = sorted(scores, reverse=True)[: len(self.prompts)] - self._on_step_end() + continue_optimization = self._on_step_end() + if not continue_optimization: + break + return self.prompts def _crossover(self, prompts, scores) -> str: diff --git a/promptolution/optimizers/opro.py b/promptolution/optimizers/opro.py index 3c71f4e..7ef3616 100644 --- a/promptolution/optimizers/opro.py +++ b/promptolution/optimizers/opro.py @@ -36,7 +36,6 @@ def __init__(self, meta_llm: BaseLLM, n_samples: int = 2, prompt_template: str = self.meta_prompt = prompt_template if prompt_template else OPRO_TEMPLATE super().__init__(**args) - self.meta_prompt = self.meta_prompt.replace("", self.task.description) self.scores = [ self.task.evaluate(p, self.predictor, subsample=True, n_samples=self.n_eval_samples)[0] @@ -89,7 +88,9 @@ def optimize(self, n_steps: int) -> List[str]: self.prompts.append(prompt) self.scores.append(score) - self._on_step_end() + continue_optimization = self._on_step_end() + if not continue_optimization: + break self._on_epoch_end() diff --git a/promptolution/predictors/__init__.py b/promptolution/predictors/__init__.py index d850759..e5aa69c 100644 --- a/promptolution/predictors/__init__.py +++ b/promptolution/predictors/__init__.py @@ -1,39 +1,38 @@ """Module for LLM predictors.""" -from promptolution.llms import get_llm +from typing import Literal from .base_predictor import DummyPredictor -from .classificator import Classificator +from .classificator import FirstOccurrenceClassificator, MarkerBasedClassificator -def get_predictor(name, *args, **kwargs): - """Factory function to create and return a predictor instance based on the provided name. +def get_predictor( + downstream_llm=None, type: Literal["first_occurence", "marker"] = "first_occurrence", *args, **kwargs +): + """Factory function to create and return a predictor instance. - This function supports two types of predictors: - 1. DummyPredictor: A mock predictor for testing purposes. - 2. Classificator: A real predictor using a language model for classification tasks. + This function supports three types of predictors: + 1. DummyPredictor: A mock predictor for testing purposes when no downstream_llm is provided. + 2. FirstOccurrenceClassificator: A predictor that classifies based on first occurrence of the label. + 3. MarkerBasedClassificator: A predictor that classifies based on a marker. Args: - name (str): Identifier for the predictor to use. Special case: - - "dummy" for DummyPredictor - - Any other string for Classificator with the specified LLM + downstream_llm: The language model to use for prediction. If None, returns a DummyPredictor. + type (Literal["first_occurrence", "marker"]): The type of predictor to create: + - "first_occurrence" (default) for FirstOccurrenceClassificator + - "marker" for MarkerBasedClassificator *args: Variable length argument list passed to the predictor constructor. **kwargs: Arbitrary keyword arguments passed to the predictor constructor. Returns: - An instance of DummyPredictor or Classificator based on the name. - - Notes: - - For non-dummy predictors, this function calls get_llm to obtain the language model. - - The batch_size for the language model is currently commented out and not used. 
- - Examples: - >>> dummy_pred = get_predictor("dummy", classes=["A", "B", "C"]) - >>> real_pred = get_predictor("gpt-3.5-turbo", classes=["positive", "negative"]) + An instance of DummyPredictor, FirstOccurrenceClassificator, or MarkerBasedClassificator. """ - if name == "dummy": + if downstream_llm is None: return DummyPredictor("", *args, **kwargs) - downstream_llm = get_llm(name) - - return Classificator(downstream_llm, *args, **kwargs) + if type == "first_occurrence": + return FirstOccurrenceClassificator(downstream_llm, *args, **kwargs) + elif type == "marker": + return MarkerBasedClassificator(downstream_llm, *args, **kwargs) + else: + raise ValueError(f"Invalid predictor type: '{type}'") diff --git a/promptolution/predictors/classificator.py b/promptolution/predictors/classificator.py index f33bfc6..89eb5d4 100644 --- a/promptolution/predictors/classificator.py +++ b/promptolution/predictors/classificator.py @@ -7,7 +7,7 @@ from promptolution.predictors.base_predictor import BasePredictor -class Classificator(BasePredictor): +class FirstOccurrenceClassificator(BasePredictor): """A predictor class for classification tasks using language models. This class takes a language model and a list of classes, and provides a method @@ -33,6 +33,10 @@ def __init__(self, llm, classes, *args, **kwargs): """ super().__init__(llm) self.classes = classes + self.extraction_description = ( + f"The task is to classify the texts into one of those classes: {', '.join(classes)}." + "The first occurrence of a valid class label in the prediction is used as the predicted class." + ) def _extract_preds(self, preds: List[str], shape: Tuple[int, int]) -> np.ndarray: """Extract class labels from the predictions, based on the list of valid class labels. @@ -44,7 +48,7 @@ def _extract_preds(self, preds: List[str], shape: Tuple[int, int]) -> np.ndarray response = [] for pred in preds: predicted_class = self.classes[0] # use first class as default pred - for word in pred.split(" "): + for word in pred.split(): word = "".join([c for c in word if c.isalnum()]) if word in self.classes: predicted_class = word @@ -54,3 +58,54 @@ def _extract_preds(self, preds: List[str], shape: Tuple[int, int]) -> np.ndarray response = np.array(response).reshape(*shape) return response + + +class MarkerBasedClassificator(BasePredictor): + """A predictor class for classification tasks using language models. + + This class takes a language model and a list of classes, and provides a method + to predict classes for given prompts and input data. The class labels are extracted. + + Attributes: + llm: The language model used for generating predictions. + classes (List[str]): The list of valid class labels. + marker (str): The marker to use for extracting the class label. + + Inherits from: + BasePredictor: The base class for predictors in the promptolution library. + """ + + def __init__(self, llm, classes, marker="", *args, **kwargs): + """Initialize the Classificator. + + Args: + llm: The language model to use for predictions. + classes (List[str]): The list of valid class labels. + marker (str): The marker to use for extracting the class label. + *args, **kwargs: Additional arguments for the BasePredictor. + """ + super().__init__(llm) + self.classes = classes + self.marker = marker + self.extraction_description = ( + f"The task is to classify the texts into one of those classes: {','.join(classes)}." + f"The class label is extracted from the text following the marker: {marker}." 
+ ) + + def _extract_preds(self, preds: List[str], shape: Tuple[int, int]) -> np.ndarray: + """Extract class labels from the predictions, by extracting the text following the marker. + + Args: + preds: The raw predictions from the language model. + shape: The shape of the output array: (n_prompts, n_samples). + """ + response = [] + for pred in preds: + predicted_class = pred.split(self.marker)[-1].strip() + if predicted_class not in self.classes: + predicted_class = self.classes[0] + + response.append(predicted_class) + + response = np.array(response).reshape(*shape) + return response diff --git a/promptolution/templates.py b/promptolution/templates.py index 05d7ae3..6cbc39e 100644 --- a/promptolution/templates.py +++ b/promptolution/templates.py @@ -86,8 +86,21 @@ 1.""" -OPRO_TEMPLATE = """Your task is to generate an instruction for the following task: - +OPRO_TEMPLATE = """Your task is to generate an instruction. + +Below are some previous instructions with their scores. The score ranges from 0 to 100. + + + +Here are some examples of the target dataset: + + +Generate a new instruction bracketed with and ending it with that is different from all the instructions above and has a higher score than all the instructions above. The instruction should be concise, effective, and generally applicable to the task described. + +Your new instruction:""" + +OPRO_TEMPLATE_TD = """Your task is to generate an instruction for the following task: + Below are some previous instructions with their scores. The score ranges from 0 to 100. @@ -114,3 +127,11 @@ The instruction was""" + +PROMPT_CREATION_TEMPLATE_TD = """You are asked to give the corresponding prompt that gives the following outputs given these inputs for the following task: . +Return it starting with and ending with tags. +Include the name of the output classes in the prompt. + + + +The instruction was""" diff --git a/promptolution/utils/prompt_creation.py b/promptolution/utils/prompt_creation.py index d85edd9..08e88dd 100644 --- a/promptolution/utils/prompt_creation.py +++ b/promptolution/utils/prompt_creation.py @@ -7,7 +7,7 @@ from promptolution.llms.base_llm import BaseLLM from promptolution.tasks.base_task import BaseTask from promptolution.tasks.classification_tasks import ClassificationTask -from promptolution.templates import PROMPT_CREATION_TEMPLATE, PROMPT_VARIATION_TEMPLATE +from promptolution.templates import PROMPT_CREATION_TEMPLATE, PROMPT_CREATION_TEMPLATE_TD, PROMPT_VARIATION_TEMPLATE def create_prompt_variation(prompt: Union[List[str], str], llm: BaseLLM, meta_prompt: str = None) -> List[str]: @@ -35,7 +35,14 @@ def create_prompt_variation(prompt: Union[List[str], str], llm: BaseLLM, meta_pr return varied_prompts -def create_prompts_from_samples(task: BaseTask, llm: BaseLLM, meta_prompt: str = None, n_samples: int = 3) -> List[str]: +def create_prompts_from_samples( + task: BaseTask, + llm: BaseLLM, + meta_prompt: str = None, + n_samples: int = 3, + task_description: str = None, + n_prompts: int = 1, +) -> List[str]: """Generate a set of prompts from dataset examples sampled from a given task. Idea taken from the paper Zhou et al. (2021) https://arxiv.org/pdf/2211.01910 @@ -50,37 +57,44 @@ def create_prompts_from_samples(task: BaseTask, llm: BaseLLM, meta_prompt: str = meta_prompt (str): The meta prompt to use for generating the prompts. If None, a default meta prompt is used. n_samples (int): The number of samples to use for generating prompts. + task_description (str): The description of the task to include in the prompt. 
+ n_prompts (int): The number of prompts to generate. Returns: List[str]: A list of generated prompts. """ - if isinstance(task, ClassificationTask): - # if classification task sample such that all classes are represented - unique_classes, counts = np.unique(task.ys, return_counts=True) - proportions = counts / len(task.ys) - samples_per_class = np.round(proportions * n_samples).astype(int) - samples_per_class = np.maximum(samples_per_class, 1) - - # sample - xs = [] - ys = [] - for cls, n_samples in zip(unique_classes, samples_per_class): - indices = np.where(task.ys == cls)[0] - indices = np.random.choice(indices, n_samples, replace=False) - xs.extend(task.xs[indices]) - ys.extend(task.ys[indices]) - - else: - # if not classification task, sample randomly - indices = np.random.choice(len(task.xs), n_samples, replace=False) - xs = task.xs[indices].tolist() - ys = task.ys[indices].tolist() - - meta_prompt = PROMPT_CREATION_TEMPLATE if meta_prompt is None else meta_prompt - examples = "\n\n".join([f"Input: {x}\nOutput: {y}" for x, y in zip(xs, ys)]) - meta_prompt = meta_prompt.replace("")[0].split("")[-1] - - return prompt + meta_prompts = [] + for _ in range(n_prompts): + if isinstance(task, ClassificationTask): + # if classification task sample such that all classes are represented + unique_labels, counts = np.unique(task.ys, return_counts=True) + proportions = counts / len(task.ys) + samples_per_class = np.round(proportions * n_samples).astype(int) + samples_per_class = np.maximum(samples_per_class, 1) + + # sample + xs = [] + ys = [] + for label, n_samples in zip(unique_labels, samples_per_class): + indices = np.where(task.ys == label)[0] + indices = np.random.choice(indices, n_samples, replace=False) + xs.extend(task.xs[indices]) + ys.extend(task.ys[indices]) + + else: + # if not classification task, sample randomly + indices = np.random.choice(len(task.xs), n_samples, replace=False) + xs = task.xs[indices].tolist() + ys = task.ys[indices].tolist() + + if meta_prompt is None: + meta_prompt = PROMPT_CREATION_TEMPLATE + if task_description is None: + meta_prompt = PROMPT_CREATION_TEMPLATE_TD.replace("", task_description) + examples = "\n\n".join([f"Input: {x}\nOutput: {y}" for x, y in zip(xs, ys)]) + meta_prompt = meta_prompt.replace("", examples) + meta_prompts.append(meta_prompt) + prompts = llm.get_response(meta_prompts) + prompts = [prompt.split("")[0].split("")[-1].strip() for prompt in prompts] + + return prompts diff --git a/pyproject.toml b/pyproject.toml index e4f5be3..bd5b6ad 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "promptolution" -version = "1.2.0" +version = "1.3.0" description = "" authors = ["Tom Zehle, Moritz Schlager, Timo Heiß"] readme = "README.md" diff --git a/scripts/opro_test_run.py b/scripts/opro_test_run.py deleted file mode 100644 index 474af3e..0000000 --- a/scripts/opro_test_run.py +++ /dev/null @@ -1,46 +0,0 @@ -"""Test run for the Opro optimizer.""" - -from logging import Logger - -from promptolution.callbacks import LoggerCallback -from promptolution.llms import get_llm -from promptolution.optimizers import Opro -from promptolution.predictors import get_predictor -from promptolution.tasks import get_task - -from promptolution.config import Config - -logger = Logger(__name__) - - -def main(): - """Run a test run for the Opro optimizer.""" - config = Config( - meta_llm="meta-llama/Meta-Llama-3-8B-Instruct", - ds_path="data_sets/agnews", - task_name="agnews", - n_steps=10, - optimizer="opro", - 
downstream_llm="meta-llama/Meta-Llama-3-8B-Instruct", - evaluation_llm="meta-llama/Meta-Llama-3-8B-Instruct", - - ) - task = get_task(config, split="dev") - predictor = get_predictor(config.evaluation_llm, classes=task.classes) - - llm = get_llm(config.meta_llm) - optimizer = Opro( - llm, - initial_prompts=task.initial_population, - task=task, - predictor=predictor, - callbacks=[LoggerCallback(logger)], - n_samples=5, - ) - prompts = optimizer.optimize(n_steps=10) - - logger.info(f"Optimized prompts: {prompts}") - - -if __name__ == "__main__": - main() diff --git a/scripts/optimizer_test_run.py b/scripts/optimizer_test_run.py new file mode 100644 index 0000000..d60efb9 --- /dev/null +++ b/scripts/optimizer_test_run.py @@ -0,0 +1,36 @@ +"""Test run for the Opro optimizer.""" +import argparse +from logging import Logger + +from promptolution.callbacks import LoggerCallback, CSVCallback +from promptolution.helpers import run_optimization + +from promptolution.config import Config + +logger = Logger(__name__) + +"""Run a test run for any of the implemented optimizers.""" +parser = argparse.ArgumentParser() +parser.add_argument("--model") +parser.add_argument("--model-storage-path", default="../models/") +parser.add_argument("--optimizer", default="evopromptde") +parser.add_argument("--n-steps", type=int, default=10) +parser.add_argument("--token", default=None) +args = parser.parse_args() + +config = Config( + meta_llm=args.model, + ds_path="data_sets/cls/agnews", + task_name="agnews", + predictor="FirstOccurenceClassificator", + n_steps=args.n_steps, + optimizer=args.optimizer, + downstream_llm=args.model, + evaluation_llm=args.model, + api_token=args.token, + model_storage_path=args.model_storage_path, +) + +prompts = run_optimization(config, callbacks=[LoggerCallback(logger), CSVCallback(f"results/{args.model}/")]) + +logger.info(f"Optimized prompts: {prompts}") diff --git a/scripts/prompt_creation_run.py b/scripts/prompt_creation_run.py index 4c17694..f7d54c3 100644 --- a/scripts/prompt_creation_run.py +++ b/scripts/prompt_creation_run.py @@ -21,7 +21,7 @@ def main(): llm = get_llm("meta-llama/Meta-Llama-3-8B-Instruct") task = get_task(config, split="dev") - predictor = get_predictor("meta-llama/Meta-Llama-3-8B-Instruct", classes=task.classes) + predictor = get_predictor(llm, classes=task.classes) init_prompts = create_prompts_from_samples(task, llm) logger.critical(f"Initial prompts: {init_prompts}") From 859831cb15ac7ade08405c616b0f6d82819957cd Mon Sep 17 00:00:00 2001 From: finitearth Date: Sun, 9 Mar 2025 22:52:30 +0100 Subject: [PATCH 32/41] fixed prompt creation with task description --- promptolution/utils/prompt_creation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/promptolution/utils/prompt_creation.py b/promptolution/utils/prompt_creation.py index 08e88dd..9667a48 100644 --- a/promptolution/utils/prompt_creation.py +++ b/promptolution/utils/prompt_creation.py @@ -89,7 +89,7 @@ def create_prompts_from_samples( if meta_prompt is None: meta_prompt = PROMPT_CREATION_TEMPLATE - if task_description is None: + if task_description is not None: meta_prompt = PROMPT_CREATION_TEMPLATE_TD.replace("", task_description) examples = "\n\n".join([f"Input: {x}\nOutput: {y}" for x, y in zip(xs, ys)]) meta_prompt = meta_prompt.replace("", examples) From f53e4d2efe1f614e33fa140e43a5e79fafad4785 Mon Sep 17 00:00:00 2001 From: finitearth Date: Mon, 10 Mar 2025 00:06:49 +0100 Subject: [PATCH 33/41] make classifaction task for prompt creation optional --- 
promptolution/utils/prompt_creation.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/promptolution/utils/prompt_creation.py b/promptolution/utils/prompt_creation.py index 9667a48..560b464 100644 --- a/promptolution/utils/prompt_creation.py +++ b/promptolution/utils/prompt_creation.py @@ -42,6 +42,7 @@ def create_prompts_from_samples( n_samples: int = 3, task_description: str = None, n_prompts: int = 1, + get_uniform_labels: bool = False, ) -> List[str]: """Generate a set of prompts from dataset examples sampled from a given task. @@ -59,13 +60,14 @@ def create_prompts_from_samples( n_samples (int): The number of samples to use for generating prompts. task_description (str): The description of the task to include in the prompt. n_prompts (int): The number of prompts to generate. + get_uniform_labels (bool): If True, samples are selected such that all classes are represented. Returns: List[str]: A list of generated prompts. """ meta_prompts = [] for _ in range(n_prompts): - if isinstance(task, ClassificationTask): + if isinstance(task, ClassificationTask) and get_uniform_labels: # if classification task sample such that all classes are represented unique_labels, counts = np.unique(task.ys, return_counts=True) proportions = counts / len(task.ys) From c0630393a36e9b4dcc749548d78b6c7f9fcc21ee Mon Sep 17 00:00:00 2001 From: finitearth Date: Mon, 10 Mar 2025 00:10:04 +0100 Subject: [PATCH 34/41] fix meta_prompt_template --- promptolution/utils/prompt_creation.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/promptolution/utils/prompt_creation.py b/promptolution/utils/prompt_creation.py index 560b464..718ad76 100644 --- a/promptolution/utils/prompt_creation.py +++ b/promptolution/utils/prompt_creation.py @@ -65,6 +65,10 @@ def create_prompts_from_samples( Returns: List[str]: A list of generated prompts. """ + if meta_prompt is None: + meta_prompt_template = PROMPT_CREATION_TEMPLATE + if task_description is not None: + meta_prompt_template = PROMPT_CREATION_TEMPLATE_TD.replace("", task_description) meta_prompts = [] for _ in range(n_prompts): if isinstance(task, ClassificationTask) and get_uniform_labels: @@ -89,13 +93,10 @@ def create_prompts_from_samples( xs = task.xs[indices].tolist() ys = task.ys[indices].tolist() - if meta_prompt is None: - meta_prompt = PROMPT_CREATION_TEMPLATE - if task_description is not None: - meta_prompt = PROMPT_CREATION_TEMPLATE_TD.replace("", task_description) examples = "\n\n".join([f"Input: {x}\nOutput: {y}" for x, y in zip(xs, ys)]) - meta_prompt = meta_prompt.replace("", examples) + meta_prompt = meta_prompt_template.replace("", examples) meta_prompts.append(meta_prompt) + prompts = llm.get_response(meta_prompts) prompts = [prompt.split("")[0].split("")[-1].strip() for prompt in prompts] From 5e0b8f7a2fcc493199dddf7cc09a2321e8872bde Mon Sep 17 00:00:00 2001 From: finitearth Date: Mon, 10 Mar 2025 00:24:25 +0100 Subject: [PATCH 35/41] enable not forcing class output for marker based classifactor --- promptolution/predictors/classificator.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/promptolution/predictors/classificator.py b/promptolution/predictors/classificator.py index 89eb5d4..bb05930 100644 --- a/promptolution/predictors/classificator.py +++ b/promptolution/predictors/classificator.py @@ -75,12 +75,12 @@ class MarkerBasedClassificator(BasePredictor): BasePredictor: The base class for predictors in the promptolution library. 
""" - def __init__(self, llm, classes, marker="", *args, **kwargs): + def __init__(self, llm, classes=None, marker="", *args, **kwargs): """Initialize the Classificator. Args: llm: The language model to use for predictions. - classes (List[str]): The list of valid class labels. + classes (List[str]): The list of valid class labels. If None, does not force any class. marker (str): The marker to use for extracting the class label. *args, **kwargs: Additional arguments for the BasePredictor. """ @@ -101,11 +101,11 @@ def _extract_preds(self, preds: List[str], shape: Tuple[int, int]) -> np.ndarray """ response = [] for pred in preds: - predicted_class = pred.split(self.marker)[-1].strip() - if predicted_class not in self.classes: - predicted_class = self.classes[0] + pred = pred.split(self.marker)[-1].strip() + if self.classes is not None and pred not in self.classes: + pred = self.classes[0] - response.append(predicted_class) + response.append(pred) response = np.array(response).reshape(*shape) return response From eeb6995a04727d19e402a8da49af1d6e565a9b51 Mon Sep 17 00:00:00 2001 From: finitearth Date: Tue, 11 Mar 2025 13:12:55 +0100 Subject: [PATCH 36/41] updated callbacks --- promptolution/callbacks.py | 21 ++++++++++++--------- promptolution/optimizers/evoprompt_ga.py | 1 + 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/promptolution/callbacks.py b/promptolution/callbacks.py index 48a9b3e..452aeae 100644 --- a/promptolution/callbacks.py +++ b/promptolution/callbacks.py @@ -1,7 +1,7 @@ """Callback classes for logging, saving, and tracking optimization progress.""" import os -import time +from datetime import datetime from typing import Literal import numpy as np @@ -64,7 +64,8 @@ def __init__(self, logger): def on_step_end(self, optimizer): """Log information about the current step.""" self.step += 1 - self.logger.critical(f"✨Step {self.step} ended✨") + time = datetime.now().strftime("%d-%m-%y %H:%M:%S:%f") + self.logger.critical(f"{time} - ✨Step {self.step} ended✨") for i, (prompt, score) in enumerate(zip(optimizer.prompts, optimizer.scores)): self.logger.critical(f"*** Prompt {i}: Score: {score}") self.logger.critical(f"{prompt}") @@ -78,10 +79,11 @@ def on_train_end(self, optimizer, logs=None): optimizer: The optimizer object that called the callback. logs: Additional information to log. """ + time = datetime.now().strftime("%d-%m-%y %H:%M:%S:%f") if logs is None: - self.logger.critical("Training ended") + self.logger.critical(f"{time} - Training ended") else: - self.logger.critical(f"Training ended - {logs}") + self.logger.critical(f"{time} - Training ended - {logs}") return True @@ -109,8 +111,8 @@ def __init__(self, dir): self.step = 0 self.input_tokens = 0 self.output_tokens = 0 - self.start_time = time.time() - self.step_time = time.time() + self.start_time = datetime.now() + self.step_time = datetime.now() def on_step_end(self, optimizer): """Save prompts and scores to csv. 
@@ -124,12 +126,12 @@ def on_step_end(self, optimizer): "step": [self.step] * len(optimizer.prompts), "input_tokens": [optimizer.meta_llm.input_token_count - self.input_tokens] * len(optimizer.prompts), "output_tokens": [optimizer.meta_llm.output_token_count - self.output_tokens] * len(optimizer.prompts), - "time_elapsed": [time.time() - self.step_time] * len(optimizer.prompts), + "time_elapsed": [(datetime.now() - self.step_time).total_seconds()] * len(optimizer.prompts), "score": optimizer.scores, "prompt": optimizer.prompts, } ) - self.step_time = time.time() + self.step_time = datetime.now() self.input_tokens = optimizer.meta_llm.input_token_count self.output_tokens = optimizer.meta_llm.output_token_count @@ -151,7 +153,8 @@ def on_train_end(self, optimizer): steps=self.step, input_tokens=optimizer.meta_llm.input_token_count, output_tokens=optimizer.meta_llm.output_token_count, - time_elapsed=time.time() - self.start_time, + time_elapsed=(datetime.now() - self.start_time).total_seconds(), + time=datetime.now(), score=np.array(optimizer.scores).mean(), best_prompts=str(optimizer.prompts), ), diff --git a/promptolution/optimizers/evoprompt_ga.py b/promptolution/optimizers/evoprompt_ga.py index f6efcb8..b26ff53 100644 --- a/promptolution/optimizers/evoprompt_ga.py +++ b/promptolution/optimizers/evoprompt_ga.py @@ -81,6 +81,7 @@ def optimize(self, n_steps: int) -> List[str]: if not continue_optimization: break + self._on_train_end() return self.prompts def _crossover(self, prompts, scores) -> str: From 147052e180550412be4ac388f0ae40b577d2167b Mon Sep 17 00:00:00 2001 From: finitearth Date: Tue, 11 Mar 2025 13:30:15 +0100 Subject: [PATCH 37/41] add seeding to vllm and sampling params --- promptolution/llms/vllm.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/promptolution/llms/vllm.py b/promptolution/llms/vllm.py index f558458..2021fea 100644 --- a/promptolution/llms/vllm.py +++ b/promptolution/llms/vllm.py @@ -44,12 +44,12 @@ def __init__( temperature: float = 0.1, top_p: float = 0.9, model_storage_path: str | None = None, - token: str | None = None, dtype: str = "auto", tensor_parallel_size: int = 1, gpu_memory_utilization: float = 0.95, max_model_len: int = 2048, trust_remote_code: bool = False, + seed: int = 42, **kwargs, ): """Initialize the VLLM with a specific model. @@ -67,6 +67,7 @@ def __init__( gpu_memory_utilization (float, optional): Fraction of GPU memory to use. Defaults to 0.95. max_model_len (int, optional): Maximum sequence length for the model. Defaults to 2048. trust_remote_code (bool, optional): Whether to trust remote code. Defaults to False. + seed (int, optional): Random seed for the model. Defaults to 42. **kwargs: Additional keyword arguments to pass to the LLM class initialization. 
Note: @@ -81,7 +82,9 @@ def __init__( self.trust_remote_code = trust_remote_code # Configure sampling parameters - self.sampling_params = SamplingParams(temperature=temperature, top_p=top_p, max_tokens=max_generated_tokens) + self.sampling_params = SamplingParams( + temperature=temperature, top_p=top_p, max_tokens=max_generated_tokens, seed=seed + ) # Initialize the vLLM engine with both explicit parameters and any additional kwargs llm_params = { @@ -93,6 +96,7 @@ def __init__( "max_model_len": self.max_model_len, "download_dir": model_storage_path, "trust_remote_code": self.trust_remote_code, + "seed": seed, **kwargs, } @@ -136,11 +140,6 @@ def _get_response(self, inputs: list[str]): for input in inputs ] - # Count input tokens - for prompt in prompts: - input_tokens = self.tokenizer.encode(prompt) - self.input_token_count += len(input_tokens) - # generate responses for self.batch_size prompts at the same time all_responses = [] for i in range(0, len(prompts), self.batch_size): From 984220b442e72414b9aef93ec91d68df2c34564c Mon Sep 17 00:00:00 2001 From: mo374z Date: Tue, 11 Mar 2025 18:06:45 +0100 Subject: [PATCH 38/41] add random seed to test script --- promptolution/helpers.py | 4 +++- scripts/optimizer_test_run.py | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/promptolution/helpers.py b/promptolution/helpers.py index 52472ea..28180ea 100644 --- a/promptolution/helpers.py +++ b/promptolution/helpers.py @@ -37,7 +37,9 @@ def run_optimization(config: Config, callbacks: List = None): List[str]: The optimized list of prompts. """ task = get_task(config) - llm = get_llm(config.meta_llm, token=config.api_token, model_storage_path=config.model_storage_path) + llm = get_llm( + config.meta_llm, token=config.api_token, model_storage_path=config.model_storage_path, seed=config.random_seed + ) if config.predictor == "MarkerBasedClassificator": predictor = MarkerBasedClassificator(llm, classes=task.classes) elif config.predictor == "FirstOccurenceClassificator": diff --git a/scripts/optimizer_test_run.py b/scripts/optimizer_test_run.py index d60efb9..808fc5a 100644 --- a/scripts/optimizer_test_run.py +++ b/scripts/optimizer_test_run.py @@ -16,6 +16,7 @@ parser.add_argument("--optimizer", default="evopromptde") parser.add_argument("--n-steps", type=int, default=10) parser.add_argument("--token", default=None) +parser.add_argument("--seed", type=int, default=187) args = parser.parse_args() config = Config( @@ -29,8 +30,9 @@ evaluation_llm=args.model, api_token=args.token, model_storage_path=args.model_storage_path, + random_seed=args.seed, ) -prompts = run_optimization(config, callbacks=[LoggerCallback(logger), CSVCallback(f"results/{args.model}/")]) +prompts = run_optimization(config, callbacks=[LoggerCallback(logger), CSVCallback(f"results/seedingtest/{args.model}/")]) logger.info(f"Optimized prompts: {prompts}") From aa26e5fa0051b1fe1a677b7af1286275782ae15c Mon Sep 17 00:00:00 2001 From: mo374z Date: Tue, 11 Mar 2025 18:27:32 +0100 Subject: [PATCH 39/41] align with token / no token --- promptolution/helpers.py | 9 +++++---- scripts/optimizer_test_run.py | 5 ++++- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/promptolution/helpers.py b/promptolution/helpers.py index 28180ea..da70be7 100644 --- a/promptolution/helpers.py +++ b/promptolution/helpers.py @@ -27,7 +27,7 @@ def run_experiment(config: Config): return df -def run_optimization(config: Config, callbacks: List = None): +def run_optimization(config: Config, callbacks: List = None, use_token: bool =
False): """Run the optimization phase of the experiment. Args: @@ -37,9 +37,10 @@ def run_optimization(config: Config, callbacks: List = None): List[str]: The optimized list of prompts. """ task = get_task(config) - llm = get_llm( - config.meta_llm, token=config.api_token, model_storage_path=config.model_storage_path, seed=config.random_seed - ) + if use_token: + llm = get_llm(config.meta_llm, token=config.api_token) + else: + llm = get_llm(config.meta_llm, model_storage_path=config.model_storage_path, seed=config.random_seed) if config.predictor == "MarkerBasedClassificator": predictor = MarkerBasedClassificator(llm, classes=task.classes) elif config.predictor == "FirstOccurenceClassificator": diff --git a/scripts/optimizer_test_run.py b/scripts/optimizer_test_run.py index 808fc5a..802208e 100644 --- a/scripts/optimizer_test_run.py +++ b/scripts/optimizer_test_run.py @@ -33,6 +33,9 @@ random_seed=args.seed, ) -prompts = run_optimization(config, callbacks=[LoggerCallback(logger), CSVCallback(f"results/seedingtest/{args.model}/")]) +if args.token is None: + prompts = run_optimization(config, callbacks=[LoggerCallback(logger), CSVCallback(f"results/seedingtest/{args.model}/")]) +else: + prompts = run_optimization(config, callbacks=[LoggerCallback(logger), CSVCallback(f"results/seedingtest/{args.model}/")], use_token=True) logger.info(f"Optimized prompts: {prompts}") From 5b483df7a3b293d751ae399eeae3dcc74dd3b47c Mon Sep 17 00:00:00 2001 From: mo374z Date: Tue, 11 Mar 2025 18:38:21 +0100 Subject: [PATCH 40/41] delete script --- scripts/llm_test_run.py | 98 ----------------------------------------- 1 file changed, 98 deletions(-) delete mode 100644 scripts/llm_test_run.py diff --git a/scripts/llm_test_run.py b/scripts/llm_test_run.py deleted file mode 100644 index 442475a..0000000 --- a/scripts/llm_test_run.py +++ /dev/null @@ -1,98 +0,0 @@ -"""Test script for measuring raw LLM inference performance on a dataset.""" -import argparse -import time -from logging import Logger - -import numpy as np -import pandas as pd -from promptolution.config import Config -from promptolution.llms import get_llm -from promptolution.predictors import FirstOccurrenceClassificator -from promptolution.tasks import get_task -from tqdm import tqdm - -logger = Logger(__name__) - -# TODO: Align this script with how we import datasets in capo - - -"""Run inference test on a dataset using a specified LLM.""" -parser = argparse.ArgumentParser() -parser.add_argument("--model") -parser.add_argument("--output") -parser.add_argument("--datasets", default=["subj"]) -parser.add_argument("--token", default=None) -parser.add_argument("--batch-size", default=None) -parser.add_argument("--revision", default="main") -parser.add_argument("--max-model-len", default=None) -parser.add_argument("--model-storage-path", default=None) -args = parser.parse_args() - -start_time = time.time() - -if args.max_model_len is not None: - max_model_len = int(args.max_model_len) - -if "vllm" in args.model: - llm = get_llm( - args.model, - batch_size=args.batch_size, - max_model_len=max_model_len, - model_storage_path=args.model_storage_path, - revision=args.revision, - ) -else: - llm = get_llm(args.model, args.token) - -results = pd.DataFrame() - -for dataset in args.datasets: - config = Config( - evaluation_llm=args.model, - ds_path=f"data_sets/cls/{dataset}/", - task_name=dataset, - api_token=args.token, - n_eval_samples=200, - ) - - task = get_task(config, split="dev") - predictor = FirstOccurrenceClassificator(llm, classes=task.classes) - - prompts 
= [task.initial_population[0]] - - xs = task.xs[: config.n_eval_samples] - ys = task.ys[: config.n_eval_samples] - - for prompt in tqdm(prompts): - preds, seqs = predictor.predict(prompt, xs, return_seq=True) - - scores = [] - for i in range(len(xs)): - scores.append(1 if preds[0][i] == ys[i] else 0) - - # clean up the sequences - seqs = [seq.replace("\n", "").strip() for seq in seqs] - - # if single prompts should be stored - # df = pd.DataFrame(dict(prompt=prompt, seq=seqs, score=scores)) - # df.to_csv(args.output + "_detailed", index=False) - - accuracy = np.array(scores).mean() - - results = pd.DataFrame( - dict( - model=args.model, - dataset=dataset, - prompt=prompt, - accuracy=accuracy, - n_samples=len(xs), - ), - index=[0], - ) - results.to_csv(args.output, mode="a", header=False, index=False) - -total_inference_time = time.time() - start_time -print( - f"Total inference took {total_inference_time:.2f} seconds and required {llm.get_token_count()} tokens." -) -print(f"Results saved to {args.output}") From 39c58e47340eacd63d510d00b74367ed068d049a Mon Sep 17 00:00:00 2001 From: mo374z Date: Wed, 12 Mar 2025 16:53:15 +0100 Subject: [PATCH 41/41] fix prompt creation if else --- promptolution/utils/prompt_creation.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/promptolution/utils/prompt_creation.py b/promptolution/utils/prompt_creation.py index 718ad76..85a613e 100644 --- a/promptolution/utils/prompt_creation.py +++ b/promptolution/utils/prompt_creation.py @@ -65,10 +65,15 @@ def create_prompts_from_samples( Returns: List[str]: A list of generated prompts. """ - if meta_prompt is None: + if meta_prompt is None and task_description is None: meta_prompt_template = PROMPT_CREATION_TEMPLATE - if task_description is not None: + elif meta_prompt is None and task_description is not None: meta_prompt_template = PROMPT_CREATION_TEMPLATE_TD.replace("", task_description) + elif meta_prompt is not None and task_description is None: + meta_prompt_template = meta_prompt + elif meta_prompt is not None and task_description is not None: + meta_prompt_template = meta_prompt.replace("", task_description) + meta_prompts = [] for _ in range(n_prompts): if isinstance(task, ClassificationTask) and get_uniform_labels:
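Usage sketch: the snippet below shows one way the pieces introduced in these patches could be combined — a Config as built in scripts/optimizer_test_run.py, the boolean-returning callback protocol from base_optimizer.py, and the meta LLM token counters added to the vLLM wrapper — to stop a run once a token budget is exceeded. It is a minimal, untested sketch: the model identifier, dataset path, token budget, and the TokenBudgetCallback class are illustrative assumptions, and it presumes any object exposing on_step_end/on_epoch_end/on_train_end can be passed as a callback.

# Minimal sketch: stop an optimization run on a token budget (placeholder values).
from logging import Logger

from promptolution.callbacks import LoggerCallback
from promptolution.config import Config
from promptolution.helpers import run_optimization

logger = Logger(__name__)


class TokenBudgetCallback:
    """Hypothetical callback: end the run once the meta LLM has used too many tokens."""

    def __init__(self, max_tokens):
        self.max_tokens = max_tokens

    def on_step_end(self, optimizer):
        # input_token_count / output_token_count are maintained by the vLLM wrapper (patch 01).
        used = optimizer.meta_llm.input_token_count + optimizer.meta_llm.output_token_count
        return used < self.max_tokens  # returning False ends the optimization early

    def on_epoch_end(self, optimizer):
        return True

    def on_train_end(self, optimizer):
        return True


config = Config(
    meta_llm="vllm-meta-llama/Meta-Llama-3-8B-Instruct",  # placeholder model id
    ds_path="data_sets/cls/agnews",
    task_name="agnews",
    predictor="MarkerBasedClassificator",
    optimizer="evopromptga",
    n_steps=10,
    downstream_llm="vllm-meta-llama/Meta-Llama-3-8B-Instruct",
    evaluation_llm="vllm-meta-llama/Meta-Llama-3-8B-Instruct",
    api_token=None,
    model_storage_path="../models/",
    random_seed=42,
)

# Budget of 500k tokens is an arbitrary example value.
prompts = run_optimization(config, callbacks=[LoggerCallback(logger), TokenBudgetCallback(500_000)])
logger.info(f"Optimized prompts: {prompts}")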