1 change: 1 addition & 0 deletions .gitignore
@@ -11,3 +11,4 @@ results/
poetry.lock
CLAUDE.md
**/CLAUDE.local.md
.mypy_cache/
10 changes: 10 additions & 0 deletions .pre-commit-config.yaml
@@ -18,6 +18,16 @@ repos:
rev: 5.12.0
hooks:
- id: isort
- repo: https://github.com/pre-commit/mirrors-mypy
rev: v1.8.0
hooks:
- id: mypy
files: ^promptolution/
additional_dependencies:
- types-requests
- pandas-stubs
- numpy
args: [--explicit-package-bases, --config-file=pyproject.toml]
- repo: https://github.com/pycqa/pydocstyle
rev: 6.3.0
hooks:
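With this hook in place, the new check can be exercised locally outside of a commit with `pre-commit run mypy --all-files`; the hook id, the `^promptolution/` file scope, and the stub dependencies come from the configuration above, while the invocation itself is the standard pre-commit workflow rather than something this PR specifies.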
4 changes: 2 additions & 2 deletions promptolution/exemplar_selectors/base_exemplar_selector.py
@@ -3,7 +3,7 @@

from abc import ABC, abstractmethod

from typing import TYPE_CHECKING
from typing import TYPE_CHECKING, Optional

if TYPE_CHECKING: # pragma: no cover
from promptolution.predictors.base_predictor import BasePredictor
@@ -18,7 +18,7 @@ class BaseExemplarSelector(ABC):
that all exemplar selectors should implement.
"""

def __init__(self, task: "BaseTask", predictor: "BasePredictor", config: "ExperimentConfig" = None):
def __init__(self, task: "BaseTask", predictor: "BasePredictor", config: Optional["ExperimentConfig"] = None):
"""Initialize the BaseExemplarSelector.

Args:
13 changes: 7 additions & 6 deletions promptolution/exemplar_selectors/random_search_selector.py
@@ -10,28 +10,29 @@ class RandomSearchSelector(BaseExemplarSelector):
evaluates their performance, and selects the best performing set.
"""

def select_exemplars(self, prompt, n_examples: int = 5, n_trials: int = 5):
def select_exemplars(self, prompt: str, n_trials: int = 5) -> str:
"""Select exemplars using a random search strategy.

This method generates multiple sets of random examples, evaluates their performance
when combined with the original prompt, and returns the best performing set.

Args:
prompt (str): The input prompt to base the exemplar selection on.
n_examples (int, optional): The number of exemplars to select in each trial. Defaults to 5.
n_trials (int, optional): The number of random trials to perform. Defaults to 5.

Returns:
str: The best performing prompt, which includes the original prompt and the selected exemplars.
"""
best_score = 0
best_score = 0.0
best_prompt = prompt

for _ in range(n_trials):
_, seq = self.task.evaluate(prompt, self.predictor, n_samples=n_examples, subsample=True, return_seq=True)
prompt_with_examples = "\n\n".join([prompt] + seq) + "\n\n"
_, seq = self.task.evaluate(
prompt, self.predictor, eval_strategy="subsample", return_seq=True, return_agg_scores=False
)
prompt_with_examples = "\n\n".join([prompt] + [seq[0][0]]) + "\n\n"
# evaluate prompts as few shot prompt
score = self.task.evaluate(prompt_with_examples, self.predictor, subsample=True)
score = self.task.evaluate(prompt_with_examples, self.predictor, eval_strategy="subsample")[0]
if score > best_score:
best_score = score
best_prompt = prompt_with_examples
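To make the new call pattern concrete, here is a minimal, self-contained sketch of the selection loop above. The `evaluate_seq`/`evaluate_agg` stubs and their return shapes are assumptions inferred from this diff (per-sample scores plus nested sequences when `return_agg_scores=False`, aggregated scores otherwise); they stand in for the real task API rather than reproduce it.

```python
import random
from typing import List, Tuple

def evaluate_seq(prompt: str) -> Tuple[List[float], List[List[str]]]:
    """Stand-in for task.evaluate(..., return_seq=True, return_agg_scores=False)."""
    return [random.random()], [["Input: great movie\nOutput: positive"]]

def evaluate_agg(prompt: str) -> List[float]:
    """Stand-in for task.evaluate(..., eval_strategy="subsample") with aggregated scores."""
    return [random.random()]

prompt = "Classify the sentiment of the text."
best_score, best_prompt = 0.0, prompt
for _ in range(5):  # n_trials
    _, seqs = evaluate_seq(prompt)
    candidate = "\n\n".join([prompt] + [seqs[0][0]]) + "\n\n"
    score = evaluate_agg(candidate)[0]  # mirrors the [0] indexing in the diff
    if score > best_score:
        best_score, best_prompt = score, candidate
print(best_prompt)
```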
26 changes: 17 additions & 9 deletions promptolution/exemplar_selectors/random_selector.py
@@ -1,6 +1,8 @@
"""Random exemplar selector."""

from typing import TYPE_CHECKING
import numpy as np

from typing import TYPE_CHECKING, List, Optional

from promptolution.exemplar_selectors.base_exemplar_selector import BaseExemplarSelector

@@ -18,8 +20,12 @@ class RandomSelector(BaseExemplarSelector):
"""

def __init__(
self, task: "BaseTask", predictor: "BasePredictor", desired_score: int = 1, config: "ExperimentConfig" = None
):
self,
task: "BaseTask",
predictor: "BasePredictor",
desired_score: int = 1,
config: Optional["ExperimentConfig"] = None,
) -> None:
"""Initialize the RandomSelector.

Args:
@@ -44,11 +50,13 @@ def select_exemplars(self, prompt: str, n_examples: int = 5) -> str:
Returns:
str: A new prompt that includes the original prompt and the selected exemplars.
"""
examples = []
examples: List[str] = []
while len(examples) < n_examples:
score, seq = self.task.evaluate(prompt, self.predictor, n_samples=1, return_seq=True)
scores, seqs = self.task.evaluate(
prompt, self.predictor, eval_strategy="subsample", return_seq=True, return_agg_scores=False
)
score = np.mean(scores)
seq = seqs[0][0]
if score == self.desired_score:
examples.append(seq[0])
prompt = "\n\n".join([prompt] + examples) + "\n\n"

return prompt
examples.append(seq)
return "\n\n".join([prompt] + examples) + "\n\n"
86 changes: 44 additions & 42 deletions promptolution/helpers.py
@@ -1,7 +1,7 @@
"""Helper functions for the usage of the libary."""


from typing import TYPE_CHECKING, Callable, List, Literal
from typing import TYPE_CHECKING, Callable, List, Literal, Optional

from promptolution.tasks.judge_tasks import JudgeTask
from promptolution.tasks.reward_tasks import RewardTask
@@ -45,7 +45,7 @@
logger = get_logger(__name__)


def run_experiment(df: pd.DataFrame, config: "ExperimentConfig"):
def run_experiment(df: pd.DataFrame, config: "ExperimentConfig") -> pd.DataFrame:
"""Run a full experiment based on the provided configuration.

Args:
@@ -79,7 +79,7 @@ def run_optimization(df: pd.DataFrame, config: "ExperimentConfig") -> List[str]:
llm = get_llm(config=config)
predictor = get_predictor(llm, config=config)

config.task_description = config.task_description + " " + predictor.extraction_description
config.task_description = (config.task_description or "") + " " + (predictor.extraction_description or "")
if config.optimizer == "capo" and (config.eval_strategy is None or "block" not in config.eval_strategy):
logger.warning("📌 CAPO requires block evaluation strategy. Setting it to 'sequential_block'.")
config.eval_strategy = "sequential_block"
@@ -126,7 +126,7 @@ def run_evaluation(df: pd.DataFrame, config: "ExperimentConfig", prompts: List[s
return df


def get_llm(model_id: str = None, config: "ExperimentConfig" = None) -> "BaseLLM":
def get_llm(model_id: Optional[str] = None, config: Optional["ExperimentConfig"] = None) -> "BaseLLM":
"""Factory function to create and return a language model instance based on the provided model_id.

This function supports three types of language models:
@@ -144,16 +144,18 @@ def get_llm(model_id: str = None, config: "ExperimentConfig" = None) -> "BaseLLM
Returns:
An instance of LocalLLM, VLLM, or APILLM, based on the model_id.
"""
if model_id is None:
model_id = config.model_id
if "local" in model_id:
model_id = "-".join(model_id.split("-")[1:])
return LocalLLM(model_id, config)
if "vllm" in model_id:
model_id = "-".join(model_id.split("-")[1:])
return VLLM(model_id, config=config)
final_model_id = model_id or (config.model_id if config else None)
if not final_model_id:
raise ValueError("model_id must be provided either directly or through config.")

return APILLM(model_id=model_id, config=config)
if "local" in final_model_id:
model_name = "-".join(final_model_id.split("-")[1:])
return LocalLLM(model_name, config=config)
if "vllm" in final_model_id:
model_name = "-".join(final_model_id.split("-")[1:])
return VLLM(model_name, config=config)

return APILLM(model_id=final_model_id, config=config)


def get_task(
@@ -174,16 +176,19 @@ def get_task(
Returns:
BaseTask: An instance of a task class based on the provided DataFrame and configuration.
"""
if task_type is None:
task_type = config.task_type
final_task_type = task_type or (config.task_type if config else None)

if task_type == "reward":
if final_task_type == "reward":
if reward_function is None:
reward_function = config.reward_function if config else None
assert reward_function is not None, "Reward function must be provided for reward tasks."
return RewardTask(
df=df,
reward_function=reward_function,
config=config,
)
elif task_type == "judge":
elif final_task_type == "judge":
assert judge_llm is not None, "Judge LLM must be provided for judge tasks."
return JudgeTask(df, judge_llm=judge_llm, config=config)

return ClassificationTask(df, config=config)
@@ -193,10 +198,9 @@ def get_optimizer(
predictor: "BasePredictor",
meta_llm: "BaseLLM",
task: "BaseTask",
optimizer: OptimizerType = None,
meta_prompt: str = None,
task_description: str = None,
config: "ExperimentConfig" = None,
optimizer: Optional[OptimizerType] = None,
task_description: Optional[str] = None,
config: Optional["ExperimentConfig"] = None,
) -> "BaseOptimizer":
"""Creates and returns an optimizer instance based on provided parameters.

@@ -215,22 +219,18 @@
Raises:
ValueError: If an unknown optimizer type is specified
"""
if optimizer is None:
optimizer = config.optimizer
if task_description is None:
task_description = config.task_description
if meta_prompt is None and hasattr(config, "meta_prompt"):
meta_prompt = config.meta_prompt

if config.optimizer == "capo":
final_optimizer = optimizer or (config.optimizer if config else None)
final_task_description = task_description or (config.task_description if config else None)

if final_optimizer == "capo":
crossover_template = (
CAPO_CROSSOVER_TEMPLATE.replace("<task_desc>", task_description)
if task_description
CAPO_CROSSOVER_TEMPLATE.replace("<task_desc>", final_task_description)
if final_task_description
else CAPO_CROSSOVER_TEMPLATE
)
mutation_template = (
CAPO_MUTATION_TEMPLATE.replace("<task_desc>", task_description)
if task_description
CAPO_MUTATION_TEMPLATE.replace("<task_desc>", final_task_description)
if final_task_description
else CAPO_MUTATION_TEMPLATE
)

@@ -243,27 +243,29 @@
config=config,
)

if config.optimizer == "evopromptde":
if final_optimizer == "evopromptde":
template = (
EVOPROMPT_DE_TEMPLATE_TD.replace("<task_desc>", task_description)
if task_description
EVOPROMPT_DE_TEMPLATE_TD.replace("<task_desc>", final_task_description)
if final_task_description
else EVOPROMPT_DE_TEMPLATE
)
return EvoPromptDE(predictor=predictor, meta_llm=meta_llm, task=task, prompt_template=template, config=config)

if config.optimizer == "evopromptga":
if final_optimizer == "evopromptga":
template = (
EVOPROMPT_GA_TEMPLATE_TD.replace("<task_desc>", task_description)
if task_description
EVOPROMPT_GA_TEMPLATE_TD.replace("<task_desc>", final_task_description)
if final_task_description
else EVOPROMPT_GA_TEMPLATE
)
return EvoPromptGA(predictor=predictor, meta_llm=meta_llm, task=task, prompt_template=template, config=config)

if config.optimizer == "opro":
template = OPRO_TEMPLATE_TD.replace("<task_desc>", task_description) if task_description else OPRO_TEMPLATE
if final_optimizer == "opro":
template = (
OPRO_TEMPLATE_TD.replace("<task_desc>", final_task_description) if final_task_description else OPRO_TEMPLATE
)
return OPRO(predictor=predictor, meta_llm=meta_llm, task=task, prompt_template=template, config=config)

raise ValueError(f"Unknown optimizer: {config.optimizer}")
raise ValueError(f"Unknown optimizer: {final_optimizer}")


def get_exemplar_selector(
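A standalone sketch of the model-id resolution that `get_llm` now performs (argument-over-config precedence, an explicit error when neither source provides an id, and prefix stripping for the `local`/`vllm` backends). The helper name and the example model strings are hypothetical; only the dispatch logic mirrors the diff.

```python
from typing import Optional, Tuple

def resolve_model_id(model_id: Optional[str], config_model_id: Optional[str]) -> Tuple[str, str]:
    # Mirrors the precedence and prefix handling in get_llm above.
    final_model_id = model_id or config_model_id
    if not final_model_id:
        raise ValueError("model_id must be provided either directly or through config.")
    if "local" in final_model_id:
        return "local", "-".join(final_model_id.split("-")[1:])
    if "vllm" in final_model_id:
        return "vllm", "-".join(final_model_id.split("-")[1:])
    return "api", final_model_id

print(resolve_model_id("vllm-meta-llama/Llama-3.1-8B", None))  # ('vllm', 'meta-llama/Llama-3.1-8B')
print(resolve_model_id(None, "gpt-4o-mini"))                   # config fallback -> ('api', 'gpt-4o-mini')
```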
55 changes: 40 additions & 15 deletions promptolution/llms/api_llm.py
@@ -1,17 +1,17 @@
"""Module to interface with various language models through their respective APIs."""


try:
import asyncio

from openai import AsyncOpenAI
from openai.types.chat import ChatCompletion, ChatCompletionMessageParam

import_successful = True
except ImportError:
import_successful = False


from typing import TYPE_CHECKING, List
from typing import TYPE_CHECKING, Dict, List, Optional

from promptolution.llms.base_llm import BaseLLM

@@ -23,9 +23,21 @@
logger = get_logger(__name__)


async def _invoke_model(prompt, system_prompt, max_tokens, model_id, client, semaphore, max_retries=20, retry_delay=5):
async def _invoke_model(
prompt: str,
system_prompt: str,
max_tokens: int,
model_id: str,
client: AsyncOpenAI,
semaphore: asyncio.Semaphore,
max_retries: int = 20,
retry_delay: float = 5,
) -> ChatCompletion:
async with semaphore:
messages = [{"role": "system", "content": system_prompt}, {"role": "user", "content": prompt}]
messages: List[ChatCompletionMessageParam] = [
{"role": "system", "content": system_prompt},
{"role": "user", "content": prompt},
]

for attempt in range(max_retries + 1): # +1 for the initial attempt
try:
@@ -46,7 +58,8 @@ async def _invoke_model(prompt, system_prompt, max_tokens, model_id, client, sem
else:
# Log the final failure and re-raise the exception
logger.error(f"❌ API call failed after {max_retries + 1} attempts: {str(e)}")
raise
raise # Re-raise the exception after all retries fail
raise RuntimeError("Failed to get response after multiple retries.")


class APILLM(BaseLLM):
@@ -65,13 +78,13 @@ class APILLM(BaseLLM):

def __init__(
self,
api_url: str = None,
model_id: str = None,
api_key: str = None,
max_concurrent_calls=50,
max_tokens=512,
config: "ExperimentConfig" = None,
):
api_url: Optional[str] = None,
model_id: Optional[str] = None,
api_key: Optional[str] = None,
max_concurrent_calls: int = 50,
max_tokens: int = 512,
config: Optional["ExperimentConfig"] = None,
) -> None:
"""Initialize the APILLM with a specific model and API configuration.

Args:
@@ -103,14 +116,26 @@ def __init__(

def _get_response(self, prompts: List[str], system_prompts: List[str]) -> List[str]:
# Setup for async execution in sync context
loop = asyncio.get_event_loop()
try:
loop = asyncio.get_running_loop()
except RuntimeError: # 'get_running_loop' raises a RuntimeError if there is no running loop
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)

responses = loop.run_until_complete(self._get_response_async(prompts, system_prompts))
return responses

async def _get_response_async(self, prompts: List[str], system_prompts: List[str]) -> List[str]:
assert self.model_id is not None, "model_id must be set"
tasks = [
_invoke_model(prompt, system_prompt, self.max_tokens, self.model_id, self.client, self.semaphore)
for prompt, system_prompt in zip(prompts, system_prompts)
]
responses = await asyncio.gather(*tasks)
return [response.choices[0].message.content for response in responses]
messages = await asyncio.gather(*tasks)
responses = []
for message in messages:
response = message.choices[0].message.content
if response is None:
raise ValueError("Received None response from the API.")
responses.append(response)
return responses
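The event-loop handling introduced in `_get_response` is the usual reuse-or-create fallback; below is a minimal standalone reproduction of the pattern (the coroutine and function names are illustrative, not part of the library).

```python
import asyncio

async def _dummy_call() -> str:
    await asyncio.sleep(0)  # stand-in for an async API request
    return "ok"

def run_sync() -> str:
    # Same fallback as APILLM._get_response: asyncio.get_running_loop() raises
    # RuntimeError when no loop is running in this thread, in which case a new
    # loop is created and registered before running the coroutine to completion.
    try:
        loop = asyncio.get_running_loop()
    except RuntimeError:
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
    return loop.run_until_complete(_dummy_call())

print(run_sync())  # "ok"
```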