diff --git a/.coverage b/.coverage index c760106..96f4053 100644 Binary files a/.coverage and b/.coverage differ diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 514cf01..774e10b 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -22,19 +22,19 @@ jobs: steps: - name: Checkout code uses: actions/checkout@v3 - + - name: Set up Python and Poetry uses: ./.github/actions/python-poetry with: groups: main,dev,test - + - name: Run pre-commit uses: pre-commit/action@v3.0.1 - + - name: Run tests with coverage run: | poetry run python -m pytest --junitxml=pytest.xml --cov-report=term-missing:skip-covered --cov=. tests/ | tee pytest-coverage.txt - + - name: Generate coverage report & comment on PR id: coverageComment uses: MishaKav/pytest-coverage-comment@main @@ -42,7 +42,7 @@ jobs: pytest-coverage-path: ./pytest-coverage.txt junitxml-path: ./pytest.xml hide-report: true - + - name: Update README with coverage badge run: | cat > update_readme.py << 'EOF' @@ -54,15 +54,15 @@ jobs: with open('README.md', 'r') as f: content = f.read() new_content = re.sub( - r'!\[Coverage\]\(https://img\.shields\.io/badge/Coverage-[^)]+\)', - badge_md, + r'!\[Coverage\]\(https://img\.shields\.io/badge/Coverage-[^)]+\)', + badge_md, content ) with open('README.md', 'w') as f: f.write(new_content) EOF python update_readme.py - + - name: Commit coverage updates to README uses: stefanzweifel/git-auto-commit-action@v4 with: @@ -77,15 +77,15 @@ jobs: steps: - name: Checkout code uses: actions/checkout@v3 - + - name: Set up Python and Poetry uses: ./.github/actions/python-poetry - + - name: Build wheel run: poetry build --format wheel - + - name: Upload wheel uses: actions/upload-artifact@v4 with: name: ${{ github.event.repository.name }} - path: dist/ \ No newline at end of file + path: dist/ diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 92415b8..bc3417e 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,5 +1,5 @@ fail_fast: true -exclude: '^(?!(promptolution/|tests/)).*$|^promptolution/templates.py' +exclude: '^.*\.md$' repos: - repo: https://github.com/gitleaks/gitleaks rev: v8.18.2 @@ -46,4 +46,4 @@ repos: hooks: - id: conventional-pre-commit stages: [commit-msg] - args: [] \ No newline at end of file + args: [] diff --git a/README.md b/README.md index 4b65f69..92bd4b7 100644 --- a/README.md +++ b/README.md @@ -1,20 +1,16 @@ ![promptolution](https://github.com/user-attachments/assets/84c050bd-61a1-4f2e-bc4e-874d9b4a69af) - -![Coverage](https://img.shields.io/badge/Coverage-90%25-brightgreen) +![Coverage](https://img.shields.io/badge/Coverage-87%25-green) [![CI](https://github.com/finitearth/promptolution/actions/workflows/ci.yml/badge.svg?branch=main)](https://github.com/finitearth/promptolution/actions/workflows/ci.yml) [![Docs](https://github.com/finitearth/promptolution/actions/workflows/docs.yml/badge.svg?branch=main)](https://github.com/finitearth/promptolution/actions/workflows/docs.yml) ![Code Style](https://img.shields.io/badge/Code%20Style-black-black) ![Python Versions](https://img.shields.io/badge/Python%20Versions-≥3.9-blue) -# Promptolution -Promptolution is a library that provides a modular and extensible framework for implementing prompt tuning experiments. It offers a user-friendly interface to assemble the core components for various prompt optimization tasks. 
-This project was developed by [Timo Heiß](https://www.linkedin.com/in/timo-heiss/), [Moritz Schlager](https://www.linkedin.com/in/moritz-schlager/) and [Tom Zehle](https://www.linkedin.com/in/tom-zehle/). +Promptolution is a library that provides a modular and extensible framework for implementing prompt tuning, from single tasks to larger experiments. It offers a user-friendly interface to assemble the core components for various prompt optimization tasks. -## Getting Started -Take a look at our getting started notebook: [getting_started.ipynb](https://github.com/finitearth/promptolution/blob/main/notebooks/getting_started.ipynb) +This project was developed by [Timo Heiß](https://www.linkedin.com/in/timo-heiss/), [Moritz Schlager](https://www.linkedin.com/in/moritz-schlager/) and [Tom Zehle](https://www.linkedin.com/in/tom-zehle/) as part of a study program at LMU Munich. ## Installation @@ -30,7 +26,6 @@ If you want to run your prompt optimization locally, either via transformers or pip install promptolution[vllm,transformers] ``` - Alternatively, clone the repository, run ``` @@ -39,34 +34,28 @@ poetry install to install the necessary dependencies. You might need to install [pipx](https://pipx.pypa.io/stable/installation/) and [poetry](https://python-poetry.org/docs/) first. -## Documentation - -A comprehensive documentation with API reference is availabe at https://finitearth.github.io/promptolution/. - ## Usage -Create API Keys for the models you want to use: -- OpenAI: store token in openaitoken.txt -- Anthropic: store token in anthropictoken.txt -- DeepInfra (for Llama): store token in deepinfratoken.txt +To get started right away, take a look at our [getting started notebook](https://github.com/finitearth/promptolution/blob/main/notebooks/getting_started.ipynb). +For more details, a comprehensive **documentation** with API reference is available at https://finitearth.github.io/promptolution/. -## Optimization Algorithms to choose from -| **Name** | **Paper** | **init prompts** | **Exploration** | **Costs** | **Convergence Speed** | **Parallelizable** | **Utilizes Fewshot Examples** | -|:--------:|:-------:|:--------------:|:---------------:|:---------:|:---------------------:|:------------------:|:---------------------:| -| `CAPO` | [Zehle et al.](https://arxiv.org/abs/2504.16005)| *required* | 👍 | 💲 | ⚡⚡ | ✅ | ✅ | -| `EvoPromptDE` | [Guo et al.](https://arxiv.org/abs/2309.08532) | *required* | 👍 | 💲💲 | ⚡⚡ | ✅ | ❌ | -| `EvoPromptGA` | [Guo et al.](https://arxiv.org/abs/2309.08532) | *required* | 👍 | 💲💲 | ⚡⚡ | ✅ | ❌ | -| `OPRO` | [Yang et al.](https://arxiv.org/abs/2309.03409)| *optional* | 👎 | 💲💲 | ⚡ | ❌ | ❌ | +### Featured Optimizers -## Core Components +| **Name** | **Paper** | **init prompts** | **Exploration** | **Costs** | **Parallelizable** | **Utilizes Fewshot Examples** | +| :-----------: | :----------------------------------------------: | :--------------: | :-------------: | :-------: | :-------------------: | :---------------------------: | +| `CAPO` | [Zehle et al.](https://arxiv.org/abs/2504.16005) | _required_ | 👍 | 💲 | ✅ | ✅ | +| `EvoPromptDE` | [Guo et al.](https://arxiv.org/abs/2309.08532) | _required_ | 👍 | 💲💲 | ✅ | ❌ | +| `EvoPromptGA` | [Guo et al.](https://arxiv.org/abs/2309.08532) | _required_ | 👍 | 💲💲 | ✅ | ❌ | +| `OPRO` | [Yang et al.](https://arxiv.org/abs/2309.03409) | _optional_ | 👎 | 💲💲 | ❌ | ❌ | -- Task: Encapsulates initial prompts, dataset features, targets, and evaluation methods.
-- Predictor: Implements the prediction logic, interfacing between the Task and LLM components. -- LLM: Unifies the process of obtaining responses from language models, whether locally hosted or accessed via API. -- Optimizer: Implements prompt optimization algorithms, utilizing the other components during the optimization process. -- Exemplar Selectors: Implements algorithms for the search of few shot examples that are added to the prompt. +### Core Components -## Key Features +- `Task`: Encapsulates initial prompts, dataset features, targets, and evaluation methods. +- `Predictor`: Implements the prediction logic, interfacing between the `Task` and `LLM` components. +- `LLM`: Unifies the process of obtaining responses from language models, whether locally hosted or accessed via API. +- `Optimizer`: Implements prompt optimization algorithms, utilizing the other components during the optimization process. + +### Key Features - Modular and object-oriented design - Extensible architecture @@ -75,6 +64,48 @@ Create API Keys for the models you want to use: - Integration with langchain for standardized LLM API calls - Detailed logging and callback system for optimization analysis +## Changelog + +Release notes for each version of the library can be found [here](https://finitearth.github.io/promptolution/release-notes/) + +## Contributing + +The first step to contributing is to open an issue describing the bug, feature, or enhancements. Ensure the issue is clearly described, assigned, and properly tagged. All work should be linked to an open issue. + +### Code Style and Linting + +We use Black for code formatting, Flake8 for linting, pydocstyle for docstring conventions (Google format), and isort to sort imports. All these checks are enforced via pre-commit hooks, which automatically run on every commit. Install the pre-commit hooks to ensure that all checks run automatically: + +``` +pre-commit install +``` +To run all checks manually: -This project was developed for the "AutoML in the age of large pre-trained models" seminar at LMU Munich. +``` +pre-commit run --all-files +``` + +### Branch Protection and Merging Guidelines + +- The main branch is protected. No direct commits are allowed for non-administrators. +- Rebase your branch on main before opening a pull request. +- All contributions must be made on dedicated branches linked to specific issues. +- Name the branch according to {prefix}/{description} with one of the prefixes fix, feature, chore, or refactor. +- A pull request must have at least one approval from a code owner before it can be merged into main. +- CI checks must pass before a pull request can be merged. +- New releases will only be created by code owners. + +### Testing + +We use pytest to run tests, and coverage to track code coverage. Tests automatically run on pull requests and pushes to the main branch, but please ensure they also pass locally before pushing! 
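+If you just want a quick signal while iterating, you can also run a subset of the tests first. A minimal sketch (the `-k` expression is only an illustrative placeholder; substitute any keyword that matches the test names you care about):
+
+```
+# run only the tests whose names match the given keyword expression
+poetry run python -m pytest tests/ -k "classification"
+```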
+To run the tests with coverage locally, use the following commands or your IDE's test runner: + +``` +poetry run python -m coverage run -m pytest +``` + +To see the coverage report run: +``` +poetry run python -m coverage report +``` diff --git a/docs/api/config.md b/docs/api/config.md deleted file mode 100644 index 8793eb5..0000000 --- a/docs/api/config.md +++ /dev/null @@ -1,7 +0,0 @@ -# Config - -::: promptolution.config.ExperimentConfig - options: - show_root_heading: true - show_root_full_path: false - show_if_no_docstring: true diff --git a/docs/api/exemplar_selectors.md b/docs/api/exemplar_selectors.md new file mode 100644 index 0000000..cbb6d20 --- /dev/null +++ b/docs/api/exemplar_selectors.md @@ -0,0 +1,5 @@ +# Exemplar Selectors + +::: promptolution.exemplar_selectors + options: + show_submodules: true diff --git a/docs/api/callbacks.md b/docs/api/helpers.md similarity index 51% rename from docs/api/callbacks.md rename to docs/api/helpers.md index af752e5..8350ce3 100644 --- a/docs/api/callbacks.md +++ b/docs/api/helpers.md @@ -1,5 +1,5 @@ -# Callbacks +# Helpers -::: promptolution.callbacks +::: promptolution.helpers options: show_submodules: true diff --git a/docs/api/utils.md b/docs/api/utils.md new file mode 100644 index 0000000..b666fbe --- /dev/null +++ b/docs/api/utils.md @@ -0,0 +1,5 @@ +# Utils + +::: promptolution.utils + options: + show_root_heading: true diff --git a/docs/examples/getting_started.md b/docs/examples/getting_started.md index 57c8f17..020699d 100644 --- a/docs/examples/getting_started.md +++ b/docs/examples/getting_started.md @@ -94,7 +94,7 @@ prompts = run_experiment(df, config) Cell In[48], line 1 ----> 1 prompts = run_experiment(df, config) - + File ~\Documents\programming\promptolution\promptolution\helpers.py:32, in run_experiment(df, config) 30 train_df = df.sample(frac=0.8, random_state=42) @@ -102,7 +102,7 @@ prompts = run_experiment(df, config) ---> 32 prompts = run_optimization(train_df, config) 33 df_prompt_scores = run_evaluation(test_df, config, prompts) 35 return df_prompt_scores - + File ~\Documents\programming\promptolution\promptolution\helpers.py:59, in run_optimization(df, config) 51 task = get_task(df, config) @@ -115,10 +115,10 @@ prompts = run_experiment(df, config) ---> 59 prompts = optimizer.optimize(n_steps=config.n_steps) 61 if config.prepend_exemplars: 62 selector = get_exemplar_selector(config.exemplar_selector, task, predictor) - + File :15, in optimize(self, n_steps) - + File ~\Documents\programming\promptolution\promptolution\optimizers\evoprompt_ga.py:69, in EvoPromptGA._pre_optimization_loop(self) 67 logger.warning(f"Initial sequences: {seq}") @@ -128,7 +128,7 @@ prompts = run_experiment(df, config) 71 ).tolist() 72 # sort prompts by score 73 self.prompts = [prompt for _, prompt in sorted(zip(self.scores, self.prompts), reverse=True)] - + File ~\Documents\programming\promptolution\promptolution\tasks\classification_tasks.py:101, in ClassificationTask.evaluate(self, prompts, predictor, system_prompts, n_samples, subsample, return_seq) 98 ys_subsample = self.ys[indices] @@ -136,7 +136,7 @@ prompts = run_experiment(df, config) --> 101 preds = predictor.predict(prompts, xs_subsample, system_prompts=system_prompts, return_seq=return_seq) 103 if return_seq: 104 preds, seqs = preds - + File ~\Documents\programming\promptolution\promptolution\predictors\base_predictor.py:57, in BasePredictor.predict(self, prompts, xs, system_prompts, return_seq) 54 if isinstance(prompts, str): @@ -146,7 +146,7 @@ prompts = run_experiment(df, 
config) 59 ) 60 preds = self._extract_preds(outputs) 62 shape = (len(prompts), len(xs)) - + File ~\Documents\programming\promptolution\promptolution\llms\base_llm.py:97, in BaseLLM.get_response(self, prompts, system_prompts) 95 if isinstance(system_prompts, str): @@ -154,7 +154,7 @@ prompts = run_experiment(df, config) ---> 97 responses = self._get_response(prompts, system_prompts) 98 self.update_token_count(prompts + system_prompts, responses) 100 return responses - + File ~\Documents\programming\promptolution\promptolution\llms\api_llm.py:82, in APILLM._get_response(self, prompts, system_prompts) 79 def _get_response(self, prompts: List[str], system_prompts: List[str]) -> List[str]: @@ -162,21 +162,21 @@ prompts = run_experiment(df, config) 81 loop = asyncio.get_event_loop() ---> 82 responses = loop.run_until_complete(self._get_response_async(prompts, system_prompts)) 83 return responses - + File c:\Users\tzehl\Documents\programming\promptolution\.venv\Lib\site-packages\nest_asyncio.py:98, in _patch_loop..run_until_complete(self, future) 95 if not f.done(): 96 raise RuntimeError( 97 'Event loop stopped before Future completed.') ---> 98 return f.result() - + File ~\AppData\Local\Programs\Python\Python312\Lib\asyncio\futures.py:203, in Future.result(self) 201 self.__log_traceback = False 202 if self._exception is not None: --> 203 raise self._exception.with_traceback(self._exception_tb) 204 return self._result - + File ~\AppData\Local\Programs\Python\Python312\Lib\asyncio\tasks.py:316, in Task.__step_run_and_handle_result(***failed resolving arguments***) 314 result = coro.send(None) @@ -185,7 +185,7 @@ prompts = run_experiment(df, config) 317 except StopIteration as exc: 318 if self._must_cancel: 319 # Task is cancelled right before coro stops. - + File ~\Documents\programming\promptolution\promptolution\llms\api_llm.py:90, in APILLM._get_response_async(self, prompts, system_prompts) 85 async def _get_response_async(self, prompts: List[str], system_prompts: List[str]) -> List[str]: @@ -195,7 +195,7 @@ prompts = run_experiment(df, config) 89 ] ---> 90 responses = await asyncio.gather(*tasks) 91 return [response.choices[0].message.content for response in responses] - + File ~\AppData\Local\Programs\Python\Python312\Lib\asyncio\tasks.py:385, in Task.__wakeup(self, future) 383 def __wakeup(self, future): @@ -204,7 +204,7 @@ prompts = run_experiment(df, config) 386 except BaseException as exc: 387 # This may also be a cancellation. 
388 self.__step(exc) - + File ~\AppData\Local\Programs\Python\Python312\Lib\asyncio\tasks.py:314, in Task.__step_run_and_handle_result(***failed resolving arguments***) 310 try: @@ -214,7 +214,7 @@ prompts = run_experiment(df, config) --> 314 result = coro.send(None) 315 else: 316 result = coro.throw(exc) - + File ~\Documents\programming\promptolution\promptolution\llms\api_llm.py:25, in _invoke_model(prompt, system_prompt, max_tokens, model_id, client, semaphore) 23 async with semaphore: @@ -225,7 +225,7 @@ prompts = run_experiment(df, config) 28 max_tokens=max_tokens, 29 ) 30 return response - + File c:\Users\tzehl\Documents\programming\promptolution\.venv\Lib\site-packages\openai\resources\chat\completions\completions.py:2032, in AsyncCompletions.create(self, messages, model, audio, frequency_penalty, function_call, functions, logit_bias, logprobs, max_completion_tokens, max_tokens, metadata, modalities, n, parallel_tool_calls, prediction, presence_penalty, reasoning_effort, response_format, seed, service_tier, stop, store, stream, stream_options, temperature, tool_choice, tools, top_logprobs, top_p, user, web_search_options, extra_headers, extra_query, extra_body, timeout) 1989 @required_args(["messages", "model"], ["messages", "model", "stream"]) @@ -281,7 +281,7 @@ prompts = run_experiment(df, config) 2076 stream=stream or False, 2077 stream_cls=AsyncStream[ChatCompletionChunk], 2078 ) - + File c:\Users\tzehl\Documents\programming\promptolution\.venv\Lib\site-packages\openai\_base_client.py:1805, in AsyncAPIClient.post(self, path, cast_to, body, files, options, stream, stream_cls) 1791 async def post( @@ -293,7 +293,7 @@ prompts = run_experiment(df, config) 1803 method="post", url=path, json_data=body, files=await async_to_httpx_files(files), **options 1804 ) -> 1805 return await self.request(cast_to, opts, stream=stream, stream_cls=stream_cls) - + File c:\Users\tzehl\Documents\programming\promptolution\.venv\Lib\site-packages\openai\_base_client.py:1495, in AsyncAPIClient.request(self, cast_to, options, stream, stream_cls, remaining_retries) 1492 else: @@ -305,7 +305,7 @@ prompts = run_experiment(df, config) 1499 stream_cls=stream_cls, 1500 retries_taken=retries_taken, 1501 ) - + File c:\Users\tzehl\Documents\programming\promptolution\.venv\Lib\site-packages\openai\_base_client.py:1585, in AsyncAPIClient._request(self, cast_to, options, stream, stream_cls, retries_taken) 1583 if remaining_retries > 0 and self._should_retry(err.response): @@ -321,7 +321,7 @@ prompts = run_experiment(df, config) 1594 # If the response is streamed then we need to explicitly read the response 1595 # to completion before attempting to access the response text. 
1596 if not err.response.is_closed: - + File c:\Users\tzehl\Documents\programming\promptolution\.venv\Lib\site-packages\openai\_base_client.py:1632, in AsyncAPIClient._retry_request(self, options, cast_to, retries_taken, response_headers, stream, stream_cls) 1628 log.info("Retrying request to %s in %f seconds", options.url, timeout) @@ -333,7 +333,7 @@ prompts = run_experiment(df, config) 1636 stream=stream, 1637 stream_cls=stream_cls, 1638 ) - + File c:\Users\tzehl\Documents\programming\promptolution\.venv\Lib\site-packages\openai\_base_client.py:1585, in AsyncAPIClient._request(self, cast_to, options, stream, stream_cls, retries_taken) 1583 if remaining_retries > 0 and self._should_retry(err.response): @@ -349,7 +349,7 @@ prompts = run_experiment(df, config) 1594 # If the response is streamed then we need to explicitly read the response 1595 # to completion before attempting to access the response text. 1596 if not err.response.is_closed: - + File c:\Users\tzehl\Documents\programming\promptolution\.venv\Lib\site-packages\openai\_base_client.py:1632, in AsyncAPIClient._retry_request(self, options, cast_to, retries_taken, response_headers, stream, stream_cls) 1628 log.info("Retrying request to %s in %f seconds", options.url, timeout) @@ -361,7 +361,7 @@ prompts = run_experiment(df, config) 1636 stream=stream, 1637 stream_cls=stream_cls, 1638 ) - + File c:\Users\tzehl\Documents\programming\promptolution\.venv\Lib\site-packages\openai\_base_client.py:1600, in AsyncAPIClient._request(self, cast_to, options, stream, stream_cls, retries_taken) 1597 await err.response.aread() @@ -372,7 +372,7 @@ prompts = run_experiment(df, config) 1604 options=options, (...) 1608 retries_taken=retries_taken, 1609 ) - + RateLimitError: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o-mini in organization org-3DmWJfR4tphuKTSzcsMB3vHF on requests per min (RPM): Limit 500, Used 500, Requested 1. Please try again in 120ms. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'requests', 'param': None, 'code': 'rate_limit_exceeded'}} diff --git a/docs/release-notes.md b/docs/release-notes.md index 77d26c7..e61bcc1 100644 --- a/docs/release-notes.md +++ b/docs/release-notes.md @@ -1,2 +1 @@ Welcome to the release notes of Promptolution! Please find the release notes for the corresponding versions of the library in the menu to your left. - diff --git a/docs/release-notes/v0.2.0.md b/docs/release-notes/v0.2.0.md index effd671..d79cbe2 100644 --- a/docs/release-notes/v0.2.0.md +++ b/docs/release-notes/v0.2.0.md @@ -1,7 +1,7 @@ ## Release v0.2.0 ### What's Changed -#### Added Features: +#### Added Features: * Prompt creation utility function * Prompt variation utility function * New optimizer: OPro (see [arXiv paper](https://arxiv.org/abs/2309.03409)) diff --git a/docs/release-notes/v1.1.1.md b/docs/release-notes/v1.1.1.md index 3c7e707..7f11ba6 100644 --- a/docs/release-notes/v1.1.1.md +++ b/docs/release-notes/v1.1.1.md @@ -5,4 +5,3 @@ - updated transformers dependency: bumped from 4.46.3 to 4.48.0 **Full Changelog**: [here](https://github.com/finitearth/promptolution/compare/v1.1.0...v1.1.1) - diff --git a/docs/release-notes/v2.0.0.md b/docs/release-notes/v2.0.0.md new file mode 100644 index 0000000..ea7529b --- /dev/null +++ b/docs/release-notes/v2.0.0.md @@ -0,0 +1,18 @@ +## Release v2.0.0 +### What's changed + +#### Added features +* We welcome CAPO to the family of our optimizers! 
CAPO is an optimizer, capable of utilizing few-shot examples to improve prompt performance. Additionally it implements multiple AutoML-approaches. Check out the paper by [Zehle et al. (2025)](https://arxiv.org/abs/2504.16005) for more details (yep it's us :)) +* Eval-Cache is now part of the ClassificationTask! This saves a lot of LLM-calls as we do not rerun already evaluated data points +* Similar to the Eval-Cache, we added a Sequence-Cache, allowing to extract reasoning chains for few-shot examples +* introduced evaluation strategies to the ClassificationTask, allowing for random subsampling, sequential blocking of the dataset or just retrieving scores of datapoints that were already evaluated on prompts + +#### Further changes +* rearanged imports and module memberships +* Classificators are now called Classifiers +* Fixed multiple docstrings and namings of variables. +* Simplified testing and extended the testcases to the new implementations +* Classification task can now also output a per-datapoint score +* Introduced statistical tests (specifically paired-t-test), for CAPO + +**Full Changelog**: [here](https://github.com/finitearth/promptolution/compare/v1.4.0...v2.0.0) diff --git a/docs/release-notes/vX.X.X.md b/docs/release-notes/vX.X.X.md index 526829d..bd4ed03 100644 --- a/docs/release-notes/vX.X.X.md +++ b/docs/release-notes/vX.X.X.md @@ -5,7 +5,6 @@ * #### Further changes: -* +* **Full Changelog**: [here](https://github.com/finitearth/promptolution/compare/W.W.W...vX.X.X) - diff --git a/mkdocs.yml b/mkdocs.yml index a26607d..b3b5428 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -45,7 +45,7 @@ plugins: nav: - Home: index.md - - Release Notes: + - Release Notes: - Overview: release-notes.md - v1.4.0: release-notes/v1.4.0.md - v1.3.2: release-notes/v1.3.2.md @@ -65,8 +65,9 @@ nav: - Optimizers: api/optimizers.md - Predictors: api/predictors.md - Tasks: api/tasks.md - - Callbacks: api/callbacks.md - - Config: api/config.md + - Helpers: api/helpers.md + - Utils: api/utils.md + - Exemplar Selectors: api/exemplar_selectors.md - Tutorials: - Getting Started: examples/getting_started.md diff --git a/notebooks/getting_started.ipynb b/notebooks/getting_started.ipynb index b0d4e07..54bf3f2 100644 --- a/notebooks/getting_started.ipynb +++ b/notebooks/getting_started.ipynb @@ -43,14 +43,16 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", - "from promptolution import ExperimentConfig, run_experiment\n", + "from promptolution.utils import ExperimentConfig\n", + "from promptolution.helpers import run_experiment\n", "import nest_asyncio\n", - "nest_asyncio.apply() # Required for notebook environments" + "\n", + "nest_asyncio.apply() # Required for notebook environments" ] }, { @@ -76,7 +78,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ @@ -84,9 +86,11 @@ "df = df.rename(columns={\"Sentence\": \"x\", \"Label\": \"y\"})\n", "df = df.replace({\"OBJ\": \"objective\", \"SUBJ\": \"subjective\"})\n", "\n", - "task_description = \"The dataset contains sentences labeled as either subjective or objective. \"\\\n", - " \"The task is to classify each sentence as either subjective or objective. \" \\\n", - " \"The class mentioned first in the response of the LLM will be the prediction.\"" + "task_description = (\n", + " \"The dataset contains sentences labeled as either subjective or objective. 
\"\n", + " \"The task is to classify each sentence as either subjective or objective. \"\n", + " \"The class mentioned in between the answer tags will be used as the prediction.\"\n", + ")" ] }, { @@ -105,23 +109,23 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "init_prompts = [\n", " 'Classify the given text as either an objective or subjective statement based on the tone and language used: e.g. the tone and language used should indicate whether the statement is a neutral, factual summary (objective) or an expression of opinion or emotional tone (subjective). Include the output classes \"objective\" or \"subjective\" in the prompt.',\n", - " 'What kind of statement is the following text: [Insert text here]? Is it or ?',\n", + " \"What kind of statement is the following text: [Insert text here]? Is it or ?\",\n", " 'Identify whether a sentence is objective or subjective by analyzing the tone, language, and underlying perspective. Consider the emotion, opinion, and bias present in the sentence. Are the authors presenting objective facts or expressing a personal point of view? The output will be either \"objective\" (output class: objective) or \"subjective\" (output class: subjective).',\n", - " 'Classify the following sentences as either objective or subjective, indicating the name of the output classes: [input sentence]. Output classes: objective, subjective',\n", + " \"Classify the following sentences as either objective or subjective, indicating the name of the output classes: [input sentence]. Output classes: objective, subjective\",\n", " '_query a text about legal or corporate-related issues, and predict whether the tone is objective or subjective, outputting the corresponding class \"objective\" for non-subjective language or \"subjective\" for subjective language_',\n", " 'Classify a statement as either \"subjective\" or \"objective\" based on whether it reflects a personal opinion or a verifiable fact. The output classes to include are \"objective\" and \"subjective\".',\n", - " 'Classify the text as objective or subjective based on its tone and language.',\n", - " 'Classify the text as objective or subjective based on the presence of opinions or facts. Output classes: objective, subjective.',\n", - " 'Classify the given text as objective or subjective based on its tone, focusing on its intention, purpose, and level of personal opinion or emotional appeal, with outputs including classes such as objective or subjective.',\n", + " \"Classify the text as objective or subjective based on its tone and language.\",\n", + " \"Classify the text as objective or subjective based on the presence of opinions or facts. 
Output classes: objective, subjective.\",\n", + " \"Classify the given text as objective or subjective based on its tone, focusing on its intention, purpose, and level of personal opinion or emotional appeal, with outputs including classes such as objective or subjective.\",\n", " \"Categorize the text as either objective or subjective, considering whether it presents neutral information or expresses a personal opinion/bias.\\n\\nObjective: The text has a neutral tone and presents factual information about the actions of Democrats in Congress and the union's negotiations.\\n\\nSubjective: The text has a evaluative tone and expresses a positive/negative opinion/evaluation about the past performance of the country.\",\n", " 'Given a sentence, classify it as either \"objective\" or \"subjective\" based on its tone and language, considering the presence of third-person pronouns, neutral language, and opinions. Classify the output as \"objective\" if the tone is neutral and detached, focusing on facts and data, or as \"subjective\" if the tone is evaluative, emotive, or biased.',\n", - " 'Identify whether the given sentence is subjective or objective, then correspondingly output \"objective\" or \"subjective\" in the form of \", (e.g. \"objective\"), without quotes. Please note that the subjective orientation typically describes a sentence where the writer expresses their own opinion or attitude, whereas an objective sentence presents facts or information without personal involvement or bias. '\n", + " 'Identify whether the given sentence is subjective or objective, then correspondingly output \"objective\" or \"subjective\" in the form of \", (e.g. \"objective\"), without quotes. Please note that the subjective orientation typically describes a sentence where the writer expresses their own opinion or attitude, whereas an objective sentence presents facts or information without personal involvement or bias. ',\n", "]" ] }, @@ -147,11 +151,11 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 15, "metadata": {}, "outputs": [], "source": [ - "token = \"YOUR_API_KEY\" # Replace with your Promptolution API key" + "api_key = \"YOUR_API_KEY\" # Replace with your Promptolution API key" ] }, { @@ -170,7 +174,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 16, "metadata": {}, "outputs": [], "source": [ @@ -180,8 +184,9 @@ " prompts=init_prompts,\n", " n_steps=10,\n", " api_url=\"https://api.deepinfra.com/v1/openai\",\n", - " llm=\"meta-llama/Meta-Llama-3-8B-Instruct\",\n", - " token=token,\n", + " model_id=\"meta-llama/Meta-Llama-3-8B-Instruct\",\n", + " api_key=api_key,\n", + " n_subsamples=30,\n", ")" ] }, @@ -201,9 +206,20 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "📌 CAPO requires block evaluation strategy. Setting it to 'sequential_block'.\n", + "⚠️ The LLM does not have a tokenizer. Using simple token count.\n", + "🔥 Starting optimization...\n", + "📊 Starting evaluation...\n" + ] + } + ], "source": [ "prompts = run_experiment(df, config)" ] @@ -222,7 +238,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 11, "metadata": {}, "outputs": [ { @@ -253,99 +269,99 @@ " \n", " \n", " 0\n", - " Classify each sentence according to its linguistic attributes, distilling whether it embodies a detached, factual narrative or a personal, opinionated viewpoint. 
Pay attention to the tone, lexical choices, and underlying perspective of each sentence, teasing apart verifiable facts from emotive convictions. The predicted classification will be the leading response, with the corresponding label nestled between <final_answer> and </final_answer> markers, limited to \"objective\" and \"subjective\". Sentence analysis will identify objective presentations as neutral, straightforward conveyors of verifiable information and subjective ones as vessels for personal opinions, biases, or emotional influences.\\n\\nInput:\n", - " 0.766667\n", + " Classify the text as objective or subjective based on the presence of opinions or facts. Output classes: objective, subjective.\\n\\nInput:\\nThe proposed agreement includes the best wage increases for rail workers in over forty years.\\n\\nOutput:\\nobjective\\n\\nInput:\\nThe principal reason, from the point of view of government, is that a universal income tax would be a powerful restraint upon the expansion of government.\\n\\nOutput:\\n<final_answer>objective</final_answer>\\n\\nInput:\n", + " 0.76\n", " \n", " \n", " 1\n", - " Classify each sentence in the dataset as either '<final_answer>objective</final_answer>' or '<final_answer>subjective</final_answer>' based on its linguistic tone, language used, and underlying authorial perspective. Identify whether the sentence presents factual, neutral information or expresses a personal opinion, bias, or emotional perspective. Consider the presence of emotion, opinion, and bias, as well as the tone, whether it's neutral, positive, or negative. Take into account the subtle nuances in language that distinguish objective from subjective text, including the author's voice, choice of words, and intention. Your classification should reflect the predominant perspective presented in each sentence, with the class mentioned first in your response being the prediction.\\n\\nInput:\\nIs this a warning of what’s to come?\\n\\nOutput:\\n<final_answer>objective</final_answer>\\n\\nInput:\\nShe said the hope had been to “change the whole region and finally push things forward in LGBTQ+ rights in the Balkans”.\\n\\nOutput:\\n<final_answer>objective</final_answer>\\n\\nInput:\n", - " 0.666667\n", + " Task: Linguistic Analysis for Sentence Classification\\n\\nClassify each sentence as either objective or subjective by applying linguistic insights to identify its tone, emotion, and degree of neutrality. Examine the sentences' language features, sentiment, and presence of verifiable facts or personal opinions. Determine whether each sentence presents impartial data or conveys the author's emotions, beliefs, or biases. Treat each sentence as a distinct entity, analyzing its contours, nuances, and purpose. Consider the distinction between factual reports like news articles and opinion-based writings like blog posts. Make a nuanced classification by scrutinizing the sentence's impact, intention, and emotional resonance.\\n\\nYour response should be comprised of two parts: the classification and the rationale. Enclose the first-mentioned class within the markers <final_answer> and </final_answer>. For instance, if the classification is 'objective', the output should be <final_answer>objective</final_answer>. 
Focus on the sentence's language, tone, and emotional appeal to make an informed decision about its categorization, prioritizing the sentence's intention and purpose.\\n\\nInput:\\nThe last may go very deep.\\n\\nOutput:\\n<final_answer>objective</final_answer>\\n\\nInput:\\n“This latest rule will open our borders even more, and the Court seems to relish making arbitrary decisions without thinking about consequences.\\n\\nOutput:\\n<final_answer>objective</final_answer>\\n\\nInput:\n", + " 0.72\n", " \n", " \n", " 2\n", - " Transform the given sentence into its corresponding objective or subjective classification. To achieve this, you will need to analyze the tone, purpose, and level of personal opinion or emotional appeal within each sentence. We're looking for a prediction that addresses the following inquiry: is the given sentence founded on verifiable facts, presenting information in a neutral or detached manner, or does it contain personal opinions, emotions, or biases that influence the message?\\n\\n Please indicate your response by specifying one of the two classes first: \"objective\" or \"subjective\". Use the following markers to guide your output: <final_answer> and </final_answer>.\\n\\nInput:\n", - " 0.633333\n", + " Classify each sentence as either objective or subjective by unpacking its linguistic nuances and emotional undertones. Analyze the sentence's language features, sentiment, and presence of verifiable facts or personal opinions to determine whether it presents impartial data or conveys the author's emotions, beliefs, or biases. Treat each sentence as a standalone entity, examining its contours, subtleties, and intended purpose. Consider the distinction between factual reporting, like news articles, and opinion-based writings, like blog posts. Make a refined classification by scrutinizing the sentence's impact, intention, and emotional resonance, prioritizing the sentence's intention and purpose. Your response should consist of two parts: the classification and the rationale. Enclose the primary classification within the markers <final_answer> and </final_answer>. Focus on the sentence's language, tone, and emotional appeal to make an informed decision about its categorization. Classify each sentence as either objective or subjective by examining its linguistic tone, underlying intent, and purpose. Determine whether the text presents a neutral, factual account or expresses a personal opinion or emotional bias. Evaluate whether the text provides a neutral, factual report or reveals an evaluative tone, offering a positive or negative appraisal. Outputs will include classifications like objective or subjective, with the initial response serving as the prediction.\\n\\nInput:\\nOver several decades, Prime Central London – or PCL – had become a repository for cash from wealthy foreigners, whether they actually wanted to live there or not.\\n\\nOutput:\\n<final_answer>objective</final_answer>\\n\\nInput:\n", + " 0.71\n", " \n", " \n", " 3\n", - " In the realm of corporate and legal discourse, we need to accurately categorize sentences as either objective or subjective. Objectivity is achieved when providing factual information or recounting events in a neutral tone, while subjectivity arises from evaluative or interpretative opinions. Review each sentence for its tone, purpose, and emotional appeal, analyzing whether it presents verifiable facts or conveys personal opinions and biases. 
Based on the Large Language Model's response, identify the primary class label and extract it from the text between '<final_answer>' and '</final_answer>'. Your prediction should be a standalone statement indicating the classified class, such as \"<final_answer>objective</final_answer>\" or \"<final_answer>subjective</final_answer>\".\\n\\nInput:\n", - " 0.633333\n", + " <promptгалтер/>\\n\\nClassify each sentence as either objective or subjective by examining its linguistic tone, underlying intent, and purpose. Consider whether the text presents a neutral, factual account or expresses a personal opinion or emotional bias. Evaluate whether the text is neutral and provides mere reportage, such as a factual report on congressional Democrats' actions and labor union negotiations, or if it reveals an evaluative tone, offering a positive or negative appraisal of a nation's past performance. Outputs will include classifications like objective or subjective. The class mentioned first in the response will serve as the prediction, with the class label extracted from the text between the markers <final_answer> and </final_answer>.\\n\\nInput:\\nOver several decades, Prime Central London – or PCL – had become a repository for cash from wealthy foreigners, whether they actually wanted to live there or not.\\n\\nOutput:\\n<final_answer>objective</final_answer>\\n\\nInput:\\nFaced with a tighter labor market, many districts are raising base salaries and offering signing and relocation bonuses — up to a whopping $25,000 in one New Mexico school district.\\n\\nOutput:\\n<final_answer>objective</final_answer>\\n\\nInput:\\nThat when liquidation of commodities and securities has gone too far it becomes the business of government to stop it, using public credit by such means as it may think fit.\\n\\nOutput:\\n<final_answer>subjective</final_answer>\\n\\nInput:\n", + " 0.67\n", " \n", " \n", " 4\n", - " Categorize the text as either objective or subjective, considering whether it presents neutral information or expresses a personal opinion/bias.\\n\\nObjective: The text has a neutral tone and presents factual information about the actions of Democrats in Congress and the union's negotiations.\\n\\nSubjective: The text has a evaluative tone and expresses a positive/negative opinion/evaluation about the past performance of the country.\\n\\nInput:\\nTecovirimat is still available for doctors to obtain for their patients through an “expanded access” protocol set up by the CDC.\\n\\nOutput:\\n<final_answer>objective</final_answer>\\n\\nInput:\\nFor now it appears that the extremists have got their way.\\n\\nOutput:\\n<final_answer>objective</final_answer>\\n\\nInput:\\nThe fiscal swing was more like 4% of gdp in the two years from 2008 to 2010.\\n\\nOutput:\\n<final_answer>objective</final_answer>\\n\\nInput:\\nUniversity of Washington epidemiologist Ali Mokdad predicted a rise in reported COVID-19 cases.\\n\\nOutput:\\n<final_answer>objective</final_answer>\\n\\nInput:\\nHow shall the cost of government be reduced?\\n\\nOutput:\\n<final_answer>objective</final_answer>\\n\\nInput:\n", - " 0.633333\n", + " Classify a given sentence as either \"objective\" or \"subjective\" based on its linguistic characteristics. Determine whether the sentence presents neutral information or expresses a personal opinion/bias. If the text maintains a detached tone, focusing on verifiable facts and data, assign the label \"objective\". 
Conversely, if the tone is evaluative, emotive, or reveals a bias, categorize it as \"subjective\". Compare the tone of a factual text discussing political events to a text expressing a clear opinion about a historical event to grasp the distinction between the two genres. The predicted class will be the first class mentioned in the language model's response, enclosed within the marks <final_answer> and </final_answer>.\\n\\nInput:\\n“This latest rule will open our borders even more, and the Court seems to relish making arbitrary decisions without thinking about consequences.\\n\\nOutput:\\n<final_answer>objective</final_answer>\\n\\nInput:\\nTransportation Secretary Pete Buttigieg confirmed to The Associated Press on Thursday that $104.6 million in federal funds coming from last year’s bipartisan infrastructure bill will go toward a plan to dismantle Interstate 375, a highway built to bisect Detroit’s Black Bottom neighborhood and its epicenter of Black business, Paradise Valley.\\n\\nOutput:\\n<final_answer>objective</final_answer>\\n\\nInput:\\nThe last may go very deep.\\n\\nOutput:\\n<final_answer>objective</final_answer>\\n\\nInput:\n", + " 0.67\n", " \n", " \n", " 5\n", - " Task: Sentence Classification - Distinguishing Between Objective and Subjective Text\\n\\nAnalyze the given sentences to categorize them into either objective or subjective presentations. Objective sentences are defined as providing neutral, factual information, whereas subjective sentences convey personal opinions, biases, or emotional perspectives. When processing each sentence, consider the linguistic tone, language used, and underlying authorial perspective. Determine whether the authors present objective facts or express their personal viewpoint. Consider the presence of emotion, opinion, and bias, as well as the tone, whether it's neutral, positive, or negative. The outcome will be either '<final_answer>objective</final_answer>' or '<final_answer>subjective</final_answer>', indicating whether the sentence presents factual information or expresses a personal opinion/bias.\\n\\nInput:\\nFor now it appears that the extremists have got their way.\\n\\nOutput:\\n<final_answer>objective</final_answer>\\n\\nInput:\\nUniversity of Washington epidemiologist Ali Mokdad predicted a rise in reported COVID-19 cases.\\n\\nOutput:\\n<final_answer>objective</final_answer>\\n\\nInput:\n", - " 0.633333\n", + " Given a sentence, classify it as either \"objective\" or \"subjective\" based on its tone and language, considering the presence of third-person pronouns, neutral language, and opinions. 
Classify the output as \"objective\" if the tone is neutral and detached, focusing on facts and data, or as \"subjective\" if the tone is evaluative, emotive, or biased.\\n\\nInput:\\nTransportation Secretary Pete Buttigieg confirmed to The Associated Press on Thursday that $104.6 million in federal funds coming from last year’s bipartisan infrastructure bill will go toward a plan to dismantle Interstate 375, a highway built to bisect Detroit’s Black Bottom neighborhood and its epicenter of Black business, Paradise Valley.\\n\\nOutput:\\n<final_answer>objective</final_answer>\\n\\nInput:\\n“This latest rule will open our borders even more, and the Court seems to relish making arbitrary decisions without thinking about consequences.\\n\\nOutput:\\n<final_answer>objective</final_answer>\\n\\nInput:\\nHe is fairly secure.\\n\\nOutput:\\n<final_answer>objective</final_answer>\\n\\nInput:\\nIn a recent report on the “new poor,” made by the Welfare Council of New York City, there is a reference to “the mental infection of dependency.” This was upon the investigation of unemployment relief.\\n\\nOutput:\\n<final_answer>objective</final_answer>\\n\\nInput:\n", + " 0.67\n", " \n", " \n", " 6\n", - " Classify the provided sentences into whether they are presented in a straightforward, unbiased manner (\"objective\") or reveal the author's personal feelings, opinions, or perspectives (\"subjective\"). To make this distinction, carefully scrutinize the linguistic characteristics, such as tone, sentence structure, and the conceptual framework employed in each sentence. Pay attention to the presence of emotional language, evaluative statements, and potential biases. In the context of official or business communications, precision is key when reporting factual information or detailing events, whereas subjective expressions of opinion or interpretation are expected in discussions that aim to persuade, analyze, or critique. The classification output will comprise a single word, the initial response, which will be automatically bracketed within the markers <final_answer> and </final_answer>, with the recognized class labels being limited to \"objective\" and \"subjective\".\\n\\nInput:\n", - " 0.600000\n", + " Classify each sentence as objective or subjective by recognizing its language characteristics. Identify whether each sentence presents neutral information or expresses a personal opinion. If the sentence provides factual information without taking a bias, classify it as objective. Conversely, if the sentence conveys the author's perspective, emotions, or beliefs, label it as subjective. As our language model expert, carefully analyze each sentence, extracting its tone, and determine whether it presents verifiable data or the author's biased thoughts. For instance, compare a factual news report on politics to a blog post about a historical event and highlight the differences between objective and subjective writing. Our output will be the predicted class enclosed within the markers <final_answer> and </final_answer>, with the first-mentioned class being the predicted label.\\n\\nInput:\\n“This latest rule will open our borders even more, and the Court seems to relish making arbitrary decisions without thinking about consequences.\\n\\nOutput:\\n<final_answer>objective</final_answer>\\n\\nInput:\n", + " 0.67\n", " \n", " \n", " 7\n", - " Transform a given sentence into either a subjective or objective classification by scrutinizing its linguistic tone, word choice, and underlying perspective. 
Determine whether the sentence conveys a personal opinion or presents verifiable facts by examining the presence of emotion, opinion, and potential bias. The predicted class will be the first mention in the response, with the class label extracted from the text enclosed within markers <final_answer> and </final_answer>. The recognized classes will be limited to \"objective\" and \"subjective\".\\n\\nInput:\n", - " 0.600000\n", + " Categorize the text as either objective or subjective, considering whether it presents neutral information or expresses a personal opinion/bias.\\n\\nObjective: The text has a neutral tone and presents factual information about the actions of Democrats in Congress and the union's negotiations.\\n\\nSubjective: The text has a evaluative tone and expresses a positive/negative opinion/evaluation about the past performance of the country.\\n\\nInput:\\nOver several decades, Prime Central London – or PCL – had become a repository for cash from wealthy foreigners, whether they actually wanted to live there or not.\\n\\nOutput:\\n<final_answer>objective</final_answer>\\n\\nInput:\\nThe last may go very deep.\\n\\nOutput:\\n<final_answer>objective</final_answer>\\n\\nInput:\\nThat when liquidation of commodities and securities has gone too far it becomes the business of government to stop it, using public credit by such means as it may think fit.\\n\\nOutput:\\n<final_answer>subjective</final_answer>\\n\\nInput:\\nThat is what it means to sell bonds.\\n\\nOutput:\\n<final_answer>objective</final_answer>\\n\\nInput:\n", + " 0.66\n", " \n", " \n", " 8\n", - " Task: Sentence Classification\\n\\nClassify each sentence in the given dataset as either objectively stating facts or subjectively expressing personal opinions. To make this determination, scrutinize the linguistic features, including tone, syntax, and underlying assumptions of each sentence. Identify telltale signs of objectivity, such as factual information presented in a neutral tone, absence of emotive language, and absence of authorial biases. Conversely, recognize indicators of subjectivity, like emotive language, biased wording, or the author's voice. Keep in mind the subtleties in language that distinguish objective from subjective text, including tone, which might be neutral, positive, or negative.\\n\\nWhen generating your prediction, only the first-mentioned class label will be used. This label will be extracted from the response text within the markers '<final_answer>' and '</final_answer>'. The recognized class labels are limited to '<final_answer>objective</final_answer>' and '<final_answer>subjective</final_answer>'. Be cautious of emotions, opinions, and biases that may influence the author's stance when providing your prediction.\\n\\nInput:\n", - " 0.566667\n", + " Classify a statement as either \"subjective\" or \"objective\" based on whether it reflects a personal opinion or a verifiable fact. The output classes to include are \"objective\" and \"subjective\".\\n\\nInput:\\nThe promotion of it for many is an avocation, for increasing numbers it is a profession, and for a very great number of more or less trained men and women it is employment and livelihood.\\n\\nOutput:\\n<final_answer>objective</final_answer>\\n\\nInput:\n", + " 0.65\n", " \n", " \n", " 9\n", - " Classify the given sentences into '<final_answer>objective</final_answer>' or '<final_answer>subjective</final_answer>' by analyzing the linguistic characteristics that define objectivity. 
Explore the tone, syntax, and underlying assumptions of each sentence to determine if it presents factual information or expresses a personal perspective. Specifically, look out for the presence of emotive language, biased wording, and the author's voice to decide whether the sentence is a neutral account of facts or a subjective expression of opinion. Pay attention to the subtleties in language, such as tone, that distinguish objective from subjective text, including whether it's neutral, positive, or negative. Remember that the predicted class will be the initial response, and the corresponding class label will be extracted from the response text within the markers <final_answer> and </final_answer>. The recognized class labels will be limited to '<final_answer>objective</final_answer>' and '<final_answer>subjective</final_answer>'.\\n\\nInput:\\nIs this a warning of what’s to come?\\n\\nOutput:\\n<final_answer>objective</final_answer>\\n\\nInput:\n", - " 0.566667\n", + " A labeling exercise necessitates scrutinizing provided text to classify them as either vastly personal ('subjective') or dispassionately factual ('objective') based on the presence of opinions, biases, or verifiable information. Your mission is to accurately determine whether the supplied sentence leans more towards subjective expression of personal thought or objective presentation of facts, then output the corresponding classification within the format \"<final_answer><output class>, <output class></final_answer>\" (e.g. \"<final_answer>objective</final_answer>\"). Recognize that subjective sentences usually embody the writer's own views or emotions, whereas objective sentences present data without personal investment or allegiance. The predicted outcome will be the one first mentioned in the response, and the extracted class label will be positioned between the markers <final_answer> and </final_answer>, which can only be one of the two categories: subjective or objective.\\n\\nInput:\\nThe last may go very deep.\\n\\nOutput:\\n<final_answer>objective</final_answer>\\n\\nInput:\n", + " 0.65\n", " \n", " \n", " 10\n", - " Use your linguistic expertise to objectively categorize given sentences into their respective subjective or objective class. For the purpose of this task, consider a sentence objective if it presents verifiable, fact-based information in a neutral manner. Conversely, classify a sentence as subjective if it conveys personal opinions, biases, or emotional perspectives. When evaluating each sentence, take note of the tone, language used, and underlying authorial perspective. Determine whether the sentence's linguistic attributes lean towards providing detached, factual information or expressing a person's viewpoint. Pay close attention to the presence of emotion, opinion, and bias, as well as the tone, whether it's neutral, positive, or negative. Classify each sentence according to its linguistic characteristics, distinguishing between factual, verifiable narratives and personal, opinion-based viewpoints. 
The first class mentioned in your response will be considered the prediction, and the class label will be extracted from the text between '<final_answer>' and '</final_answer>'.\\n\\nInput:\\nUniversity of Washington epidemiologist Ali Mokdad predicted a rise in reported COVID-19 cases.\\n\\nOutput:\\n<final_answer>objective</final_answer>\\n\\nInput:\n", - " 0.566667\n", + " Classify a collection of labeled sentences as either based on fact or reflecting personal opinion, using linguistic features to distinguish between objective statements presenting verifiable information and subjective expressions of opinion or attitude, with the objective class being denoted by <final_answer>objective</final_answer> and the subjective class by <final_answer>subjective</final_answer>, where the first-mentioned class in the response will serve as the predicted outcome.\\n\\nInput:\\nThe principal reason, from the point of view of government, is that a universal income tax would be a powerful restraint upon the expansion of government.\\n\\nOutput:\\n<final_answer>objective</final_answer>\\n\\nInput:\n", + " 0.64\n", " \n", " \n", " 11\n", - " Classify the provided sentences as either objective or subjective by examining their linguistic characteristics, including tone, language, and perspective. Be mindful of the presence of emotions, opinions, and biases that may impact the author's stance. When providing a prediction, identify and extract the relevant class label from the text between the <final_answer> and </final_answer> markers, which will be either \"objective\" or \"subjective\". The first mentioned class label in the response will serve as the predicted output.\\n\\nInput:\\n“There’ll be a rise in hospitalization and mortality, but not as high proportionally to the rise in cases simply because many of us – the majority of Americans right now – have been either infected or vaccinated or both, so we have some immunity,” he said.\\n\\nOutput:\\n<final_answer>objective</final_answer>\\n\\nInput:\\nIs this a warning of what’s to come?\\n\\nOutput:\\n<final_answer>objective</final_answer>\\n\\nInput:\n", - " 0.466667\n", + " Given a dataset of sentences, use linguistic analysis to categorize each sentence as either 'objective' or 'subjective', reflecting its tone and language usage. Examine the presence of neutral third-person pronouns, factual data, and opinions to determine whether a sentence presents information in a detached and neutral manner ('objective') or conveys a personal perspective or emotional appeal ('subjective'). Your primary consideration should be the sentence's intention, purpose, and emotional resonance, with the predicted classification appearing first in your response. 
The predicted classification will be extracted from the text situated between the '<final_answer>' and '</final_answer>' markers.\\n\\nInput:\\nCOVID is continually evolving to become more immune evasive, according to Ray, and Omicron is spawning exponentially.\\n\\nOutput:\\n<final_answer>objective</final_answer>\\n\\nInput:\\nThe last may go very deep.\\n\\nOutput:\\n<final_answer>objective</final_answer>\\n\\nInput:\\nOver several decades, Prime Central London – or PCL – had become a repository for cash from wealthy foreigners, whether they actually wanted to live there or not.\\n\\nOutput:\\n<final_answer>objective</final_answer>\\n\\nInput:\n", + " 0.59\n", " \n", " \n", "\n", "" ], "text/plain": [ - " prompt \\\n", - "0 Classify each sentence according to its linguistic attributes, distilling whether it embodies a detached, factual narrative or a personal, opinionated viewpoint. Pay attention to the tone, lexical choices, and underlying perspective of each sentence, teasing apart verifiable facts from emotive convictions. The predicted classification will be the leading response, with the corresponding label nestled between and markers, limited to \"objective\" and \"subjective\". Sentence analysis will identify objective presentations as neutral, straightforward conveyors of verifiable information and subjective ones as vessels for personal opinions, biases, or emotional influences.\\n\\nInput: \n", - "1 Classify each sentence in the dataset as either 'objective' or 'subjective' based on its linguistic tone, language used, and underlying authorial perspective. Identify whether the sentence presents factual, neutral information or expresses a personal opinion, bias, or emotional perspective. Consider the presence of emotion, opinion, and bias, as well as the tone, whether it's neutral, positive, or negative. Take into account the subtle nuances in language that distinguish objective from subjective text, including the author's voice, choice of words, and intention. Your classification should reflect the predominant perspective presented in each sentence, with the class mentioned first in your response being the prediction.\\n\\nInput:\\nIs this a warning of what’s to come?\\n\\nOutput:\\nobjective\\n\\nInput:\\nShe said the hope had been to “change the whole region and finally push things forward in LGBTQ+ rights in the Balkans”.\\n\\nOutput:\\nobjective\\n\\nInput: \n", - "2 Transform the given sentence into its corresponding objective or subjective classification. To achieve this, you will need to analyze the tone, purpose, and level of personal opinion or emotional appeal within each sentence. We're looking for a prediction that addresses the following inquiry: is the given sentence founded on verifiable facts, presenting information in a neutral or detached manner, or does it contain personal opinions, emotions, or biases that influence the message?\\n\\n Please indicate your response by specifying one of the two classes first: \"objective\" or \"subjective\". Use the following markers to guide your output: and .\\n\\nInput: \n", - "3 In the realm of corporate and legal discourse, we need to accurately categorize sentences as either objective or subjective. Objectivity is achieved when providing factual information or recounting events in a neutral tone, while subjectivity arises from evaluative or interpretative opinions. Review each sentence for its tone, purpose, and emotional appeal, analyzing whether it presents verifiable facts or conveys personal opinions and biases. 
Based on the Large Language Model's response, identify the primary class label and extract it from the text between '' and ''. Your prediction should be a standalone statement indicating the classified class, such as \"objective\" or \"subjective\".\\n\\nInput: \n", - "4 Categorize the text as either objective or subjective, considering whether it presents neutral information or expresses a personal opinion/bias.\\n\\nObjective: The text has a neutral tone and presents factual information about the actions of Democrats in Congress and the union's negotiations.\\n\\nSubjective: The text has a evaluative tone and expresses a positive/negative opinion/evaluation about the past performance of the country.\\n\\nInput:\\nTecovirimat is still available for doctors to obtain for their patients through an “expanded access” protocol set up by the CDC.\\n\\nOutput:\\nobjective\\n\\nInput:\\nFor now it appears that the extremists have got their way.\\n\\nOutput:\\nobjective\\n\\nInput:\\nThe fiscal swing was more like 4% of gdp in the two years from 2008 to 2010.\\n\\nOutput:\\nobjective\\n\\nInput:\\nUniversity of Washington epidemiologist Ali Mokdad predicted a rise in reported COVID-19 cases.\\n\\nOutput:\\nobjective\\n\\nInput:\\nHow shall the cost of government be reduced?\\n\\nOutput:\\nobjective\\n\\nInput: \n", - "5 Task: Sentence Classification - Distinguishing Between Objective and Subjective Text\\n\\nAnalyze the given sentences to categorize them into either objective or subjective presentations. Objective sentences are defined as providing neutral, factual information, whereas subjective sentences convey personal opinions, biases, or emotional perspectives. When processing each sentence, consider the linguistic tone, language used, and underlying authorial perspective. Determine whether the authors present objective facts or express their personal viewpoint. Consider the presence of emotion, opinion, and bias, as well as the tone, whether it's neutral, positive, or negative. The outcome will be either 'objective' or 'subjective', indicating whether the sentence presents factual information or expresses a personal opinion/bias.\\n\\nInput:\\nFor now it appears that the extremists have got their way.\\n\\nOutput:\\nobjective\\n\\nInput:\\nUniversity of Washington epidemiologist Ali Mokdad predicted a rise in reported COVID-19 cases.\\n\\nOutput:\\nobjective\\n\\nInput: \n", - "6 Classify the provided sentences into whether they are presented in a straightforward, unbiased manner (\"objective\") or reveal the author's personal feelings, opinions, or perspectives (\"subjective\"). To make this distinction, carefully scrutinize the linguistic characteristics, such as tone, sentence structure, and the conceptual framework employed in each sentence. Pay attention to the presence of emotional language, evaluative statements, and potential biases. In the context of official or business communications, precision is key when reporting factual information or detailing events, whereas subjective expressions of opinion or interpretation are expected in discussions that aim to persuade, analyze, or critique. 
The classification output will comprise a single word, the initial response, which will be automatically bracketed within the markers and , with the recognized class labels being limited to \"objective\" and \"subjective\".\\n\\nInput: \n", - "7 Transform a given sentence into either a subjective or objective classification by scrutinizing its linguistic tone, word choice, and underlying perspective. Determine whether the sentence conveys a personal opinion or presents verifiable facts by examining the presence of emotion, opinion, and potential bias. The predicted class will be the first mention in the response, with the class label extracted from the text enclosed within markers and . The recognized classes will be limited to \"objective\" and \"subjective\".\\n\\nInput: \n", - "8 Task: Sentence Classification\\n\\nClassify each sentence in the given dataset as either objectively stating facts or subjectively expressing personal opinions. To make this determination, scrutinize the linguistic features, including tone, syntax, and underlying assumptions of each sentence. Identify telltale signs of objectivity, such as factual information presented in a neutral tone, absence of emotive language, and absence of authorial biases. Conversely, recognize indicators of subjectivity, like emotive language, biased wording, or the author's voice. Keep in mind the subtleties in language that distinguish objective from subjective text, including tone, which might be neutral, positive, or negative.\\n\\nWhen generating your prediction, only the first-mentioned class label will be used. This label will be extracted from the response text within the markers '' and ''. The recognized class labels are limited to 'objective' and 'subjective'. Be cautious of emotions, opinions, and biases that may influence the author's stance when providing your prediction.\\n\\nInput: \n", - "9 Classify the given sentences into 'objective' or 'subjective' by analyzing the linguistic characteristics that define objectivity. Explore the tone, syntax, and underlying assumptions of each sentence to determine if it presents factual information or expresses a personal perspective. Specifically, look out for the presence of emotive language, biased wording, and the author's voice to decide whether the sentence is a neutral account of facts or a subjective expression of opinion. Pay attention to the subtleties in language, such as tone, that distinguish objective from subjective text, including whether it's neutral, positive, or negative. Remember that the predicted class will be the initial response, and the corresponding class label will be extracted from the response text within the markers and . The recognized class labels will be limited to 'objective' and 'subjective'.\\n\\nInput:\\nIs this a warning of what’s to come?\\n\\nOutput:\\nobjective\\n\\nInput: \n", - "10 Use your linguistic expertise to objectively categorize given sentences into their respective subjective or objective class. For the purpose of this task, consider a sentence objective if it presents verifiable, fact-based information in a neutral manner. Conversely, classify a sentence as subjective if it conveys personal opinions, biases, or emotional perspectives. When evaluating each sentence, take note of the tone, language used, and underlying authorial perspective. Determine whether the sentence's linguistic attributes lean towards providing detached, factual information or expressing a person's viewpoint. 
Pay close attention to the presence of emotion, opinion, and bias, as well as the tone, whether it's neutral, positive, or negative. Classify each sentence according to its linguistic characteristics, distinguishing between factual, verifiable narratives and personal, opinion-based viewpoints. The first class mentioned in your response will be considered the prediction, and the class label will be extracted from the text between '' and ''.\\n\\nInput:\\nUniversity of Washington epidemiologist Ali Mokdad predicted a rise in reported COVID-19 cases.\\n\\nOutput:\\nobjective\\n\\nInput: \n", - "11 Classify the provided sentences as either objective or subjective by examining their linguistic characteristics, including tone, language, and perspective. Be mindful of the presence of emotions, opinions, and biases that may impact the author's stance. When providing a prediction, identify and extract the relevant class label from the text between the and markers, which will be either \"objective\" or \"subjective\". The first mentioned class label in the response will serve as the predicted output.\\n\\nInput:\\n“There’ll be a rise in hospitalization and mortality, but not as high proportionally to the rise in cases simply because many of us – the majority of Americans right now – have been either infected or vaccinated or both, so we have some immunity,” he said.\\n\\nOutput:\\nobjective\\n\\nInput:\\nIs this a warning of what’s to come?\\n\\nOutput:\\nobjective\\n\\nInput: \n", + " prompt \\\n", + "0 Classify the text as objective or subjective based on the presence of opinions or facts. Output classes: objective, subjective.\\n\\nInput:\\nThe proposed agreement includes the best wage increases for rail workers in over forty years.\\n\\nOutput:\\nobjective\\n\\nInput:\\nThe principal reason, from the point of view of government, is that a universal income tax would be a powerful restraint upon the expansion of government.\\n\\nOutput:\\nobjective\\n\\nInput: \n", + "1 Task: Linguistic Analysis for Sentence Classification\\n\\nClassify each sentence as either objective or subjective by applying linguistic insights to identify its tone, emotion, and degree of neutrality. Examine the sentences' language features, sentiment, and presence of verifiable facts or personal opinions. Determine whether each sentence presents impartial data or conveys the author's emotions, beliefs, or biases. Treat each sentence as a distinct entity, analyzing its contours, nuances, and purpose. Consider the distinction between factual reports like news articles and opinion-based writings like blog posts. Make a nuanced classification by scrutinizing the sentence's impact, intention, and emotional resonance.\\n\\nYour response should be comprised of two parts: the classification and the rationale. Enclose the first-mentioned class within the markers and . For instance, if the classification is 'objective', the output should be objective. Focus on the sentence's language, tone, and emotional appeal to make an informed decision about its categorization, prioritizing the sentence's intention and purpose.\\n\\nInput:\\nThe last may go very deep.\\n\\nOutput:\\nobjective\\n\\nInput:\\n“This latest rule will open our borders even more, and the Court seems to relish making arbitrary decisions without thinking about consequences.\\n\\nOutput:\\nobjective\\n\\nInput: \n", + "2 Classify each sentence as either objective or subjective by unpacking its linguistic nuances and emotional undertones. 
Analyze the sentence's language features, sentiment, and presence of verifiable facts or personal opinions to determine whether it presents impartial data or conveys the author's emotions, beliefs, or biases. Treat each sentence as a standalone entity, examining its contours, subtleties, and intended purpose. Consider the distinction between factual reporting, like news articles, and opinion-based writings, like blog posts. Make a refined classification by scrutinizing the sentence's impact, intention, and emotional resonance, prioritizing the sentence's intention and purpose. Your response should consist of two parts: the classification and the rationale. Enclose the primary classification within the markers and . Focus on the sentence's language, tone, and emotional appeal to make an informed decision about its categorization. Classify each sentence as either objective or subjective by examining its linguistic tone, underlying intent, and purpose. Determine whether the text presents a neutral, factual account or expresses a personal opinion or emotional bias. Evaluate whether the text provides a neutral, factual report or reveals an evaluative tone, offering a positive or negative appraisal. Outputs will include classifications like objective or subjective, with the initial response serving as the prediction.\\n\\nInput:\\nOver several decades, Prime Central London – or PCL – had become a repository for cash from wealthy foreigners, whether they actually wanted to live there or not.\\n\\nOutput:\\nobjective\\n\\nInput: \n", + "3 \\n\\nClassify each sentence as either objective or subjective by examining its linguistic tone, underlying intent, and purpose. Consider whether the text presents a neutral, factual account or expresses a personal opinion or emotional bias. Evaluate whether the text is neutral and provides mere reportage, such as a factual report on congressional Democrats' actions and labor union negotiations, or if it reveals an evaluative tone, offering a positive or negative appraisal of a nation's past performance. Outputs will include classifications like objective or subjective. The class mentioned first in the response will serve as the prediction, with the class label extracted from the text between the markers and .\\n\\nInput:\\nOver several decades, Prime Central London – or PCL – had become a repository for cash from wealthy foreigners, whether they actually wanted to live there or not.\\n\\nOutput:\\nobjective\\n\\nInput:\\nFaced with a tighter labor market, many districts are raising base salaries and offering signing and relocation bonuses — up to a whopping $25,000 in one New Mexico school district.\\n\\nOutput:\\nobjective\\n\\nInput:\\nThat when liquidation of commodities and securities has gone too far it becomes the business of government to stop it, using public credit by such means as it may think fit.\\n\\nOutput:\\nsubjective\\n\\nInput: \n", + "4 Classify a given sentence as either \"objective\" or \"subjective\" based on its linguistic characteristics. Determine whether the sentence presents neutral information or expresses a personal opinion/bias. If the text maintains a detached tone, focusing on verifiable facts and data, assign the label \"objective\". Conversely, if the tone is evaluative, emotive, or reveals a bias, categorize it as \"subjective\". Compare the tone of a factual text discussing political events to a text expressing a clear opinion about a historical event to grasp the distinction between the two genres. 
The predicted class will be the first class mentioned in the language model's response, enclosed within the marks and .\\n\\nInput:\\n“This latest rule will open our borders even more, and the Court seems to relish making arbitrary decisions without thinking about consequences.\\n\\nOutput:\\nobjective\\n\\nInput:\\nTransportation Secretary Pete Buttigieg confirmed to The Associated Press on Thursday that $104.6 million in federal funds coming from last year’s bipartisan infrastructure bill will go toward a plan to dismantle Interstate 375, a highway built to bisect Detroit’s Black Bottom neighborhood and its epicenter of Black business, Paradise Valley.\\n\\nOutput:\\nobjective\\n\\nInput:\\nThe last may go very deep.\\n\\nOutput:\\nobjective\\n\\nInput: \n", + "5 Given a sentence, classify it as either \"objective\" or \"subjective\" based on its tone and language, considering the presence of third-person pronouns, neutral language, and opinions. Classify the output as \"objective\" if the tone is neutral and detached, focusing on facts and data, or as \"subjective\" if the tone is evaluative, emotive, or biased.\\n\\nInput:\\nTransportation Secretary Pete Buttigieg confirmed to The Associated Press on Thursday that $104.6 million in federal funds coming from last year’s bipartisan infrastructure bill will go toward a plan to dismantle Interstate 375, a highway built to bisect Detroit’s Black Bottom neighborhood and its epicenter of Black business, Paradise Valley.\\n\\nOutput:\\nobjective\\n\\nInput:\\n“This latest rule will open our borders even more, and the Court seems to relish making arbitrary decisions without thinking about consequences.\\n\\nOutput:\\nobjective\\n\\nInput:\\nHe is fairly secure.\\n\\nOutput:\\nobjective\\n\\nInput:\\nIn a recent report on the “new poor,” made by the Welfare Council of New York City, there is a reference to “the mental infection of dependency.” This was upon the investigation of unemployment relief.\\n\\nOutput:\\nobjective\\n\\nInput: \n", + "6 Classify each sentence as objective or subjective by recognizing its language characteristics. Identify whether each sentence presents neutral information or expresses a personal opinion. If the sentence provides factual information without taking a bias, classify it as objective. Conversely, if the sentence conveys the author's perspective, emotions, or beliefs, label it as subjective. As our language model expert, carefully analyze each sentence, extracting its tone, and determine whether it presents verifiable data or the author's biased thoughts. For instance, compare a factual news report on politics to a blog post about a historical event and highlight the differences between objective and subjective writing. 
Our output will be the predicted class enclosed within the markers and , with the first-mentioned class being the predicted label.\\n\\nInput:\\n“This latest rule will open our borders even more, and the Court seems to relish making arbitrary decisions without thinking about consequences.\\n\\nOutput:\\nobjective\\n\\nInput: \n", + "7 Categorize the text as either objective or subjective, considering whether it presents neutral information or expresses a personal opinion/bias.\\n\\nObjective: The text has a neutral tone and presents factual information about the actions of Democrats in Congress and the union's negotiations.\\n\\nSubjective: The text has a evaluative tone and expresses a positive/negative opinion/evaluation about the past performance of the country.\\n\\nInput:\\nOver several decades, Prime Central London – or PCL – had become a repository for cash from wealthy foreigners, whether they actually wanted to live there or not.\\n\\nOutput:\\nobjective\\n\\nInput:\\nThe last may go very deep.\\n\\nOutput:\\nobjective\\n\\nInput:\\nThat when liquidation of commodities and securities has gone too far it becomes the business of government to stop it, using public credit by such means as it may think fit.\\n\\nOutput:\\nsubjective\\n\\nInput:\\nThat is what it means to sell bonds.\\n\\nOutput:\\nobjective\\n\\nInput: \n", + "8 Classify a statement as either \"subjective\" or \"objective\" based on whether it reflects a personal opinion or a verifiable fact. The output classes to include are \"objective\" and \"subjective\".\\n\\nInput:\\nThe promotion of it for many is an avocation, for increasing numbers it is a profession, and for a very great number of more or less trained men and women it is employment and livelihood.\\n\\nOutput:\\nobjective\\n\\nInput: \n", + "9 A labeling exercise necessitates scrutinizing provided text to classify them as either vastly personal ('subjective') or dispassionately factual ('objective') based on the presence of opinions, biases, or verifiable information. Your mission is to accurately determine whether the supplied sentence leans more towards subjective expression of personal thought or objective presentation of facts, then output the corresponding classification within the format \", \" (e.g. \"objective\"). Recognize that subjective sentences usually embody the writer's own views or emotions, whereas objective sentences present data without personal investment or allegiance. 
The predicted outcome will be the one first mentioned in the response, and the extracted class label will be positioned between the markers and , which can only be one of the two categories: subjective or objective.\\n\\nInput:\\nThe last may go very deep.\\n\\nOutput:\\nobjective\\n\\nInput: \n", + "10 Classify a collection of labeled sentences as either based on fact or reflecting personal opinion, using linguistic features to distinguish between objective statements presenting verifiable information and subjective expressions of opinion or attitude, with the objective class being denoted by objective and the subjective class by subjective, where the first-mentioned class in the response will serve as the predicted outcome.\\n\\nInput:\\nThe principal reason, from the point of view of government, is that a universal income tax would be a powerful restraint upon the expansion of government.\\n\\nOutput:\\nobjective\\n\\nInput: \n", + "11 Given a dataset of sentences, use linguistic analysis to categorize each sentence as either 'objective' or 'subjective', reflecting its tone and language usage. Examine the presence of neutral third-person pronouns, factual data, and opinions to determine whether a sentence presents information in a detached and neutral manner ('objective') or conveys a personal perspective or emotional appeal ('subjective'). Your primary consideration should be the sentence's intention, purpose, and emotional resonance, with the predicted classification appearing first in your response. The predicted classification will be extracted from the text situated between the '' and '' markers.\\n\\nInput:\\nCOVID is continually evolving to become more immune evasive, according to Ray, and Omicron is spawning exponentially.\\n\\nOutput:\\nobjective\\n\\nInput:\\nThe last may go very deep.\\n\\nOutput:\\nobjective\\n\\nInput:\\nOver several decades, Prime Central London – or PCL – had become a repository for cash from wealthy foreigners, whether they actually wanted to live there or not.\\n\\nOutput:\\nobjective\\n\\nInput: \n", "\n", - " score \n", - "0 0.766667 \n", - "1 0.666667 \n", - "2 0.633333 \n", - "3 0.633333 \n", - "4 0.633333 \n", - "5 0.633333 \n", - "6 0.600000 \n", - "7 0.600000 \n", - "8 0.566667 \n", - "9 0.566667 \n", - "10 0.566667 \n", - "11 0.466667 " + " score \n", + "0 0.76 \n", + "1 0.72 \n", + "2 0.71 \n", + "3 0.67 \n", + "4 0.67 \n", + "5 0.67 \n", + "6 0.67 \n", + "7 0.66 \n", + "8 0.65 \n", + "9 0.65 \n", + "10 0.64 \n", + "11 0.59 " ] }, - "execution_count": 10, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } diff --git a/promptolution/__init__.py b/promptolution/__init__.py deleted file mode 100644 index 0d9eeb2..0000000 --- a/promptolution/__init__.py +++ /dev/null @@ -1,11 +0,0 @@ -"""Promptolution: A framework for prompt optimization and a zoo of prompt optimization algorithms.""" - -# Import main configuration classes -from .config import ExperimentConfig -from .helpers import run_evaluation, run_experiment, run_optimization - -# Import factory functions -from .llms import get_llm -from .optimizers import get_optimizer -from .predictors import get_predictor -from .tasks import get_task diff --git a/promptolution/exemplar_selectors/__init__.py b/promptolution/exemplar_selectors/__init__.py index f234373..62e6c9a 100644 --- a/promptolution/exemplar_selectors/__init__.py +++ b/promptolution/exemplar_selectors/__init__.py @@ -1,33 +1,4 @@ """Module for exemplar selectors.""" -from typing import Literal - from 
promptolution.exemplar_selectors.random_search_selector import RandomSearchSelector from promptolution.exemplar_selectors.random_selector import RandomSelector -from promptolution.predictors.base_predictor import BasePredictor -from promptolution.tasks.base_task import BaseTask - -SELECTOR_MAP = { - "random": RandomSelector, - "random_search": RandomSearchSelector, -} - - -def get_exemplar_selector(name: Literal["random", "random_search"], task: BaseTask, predictor: BasePredictor): - """Factory function to get an exemplar selector based on the given name. - - Args: - name (str): The name of the exemplar selector to instantiate. - task (BaseTask): The task object to be passed to the selector. - predictor (BasePredictor): The predictor object to be passed to the selector. - - Returns: - BaseExemplarSelector: An instance of the requested exemplar selector. - - Raises: - ValueError: If the requested selector name is not found. - """ - if name not in SELECTOR_MAP: - raise ValueError(f"Exemplar selector '{name}' not found. Available selectors: {list(SELECTOR_MAP.keys())}") - - return SELECTOR_MAP[name](task, predictor) diff --git a/promptolution/exemplar_selectors/base_exemplar_selector.py b/promptolution/exemplar_selectors/base_exemplar_selector.py index 5cc28c3..1f52ccb 100644 --- a/promptolution/exemplar_selectors/base_exemplar_selector.py +++ b/promptolution/exemplar_selectors/base_exemplar_selector.py @@ -1,10 +1,14 @@ """Base class for exemplar selectors.""" + from abc import ABC, abstractmethod -from promptolution.config import ExperimentConfig -from promptolution.predictors.base_predictor import BasePredictor -from promptolution.tasks.base_task import BaseTask +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from promptolution.predictors.base_predictor import BasePredictor + from promptolution.tasks.base_task import BaseTask + from promptolution.utils.config import ExperimentConfig class BaseExemplarSelector(ABC): @@ -14,13 +18,13 @@ class BaseExemplarSelector(ABC): that all exemplar selectors should implement. """ - def __init__(self, task: BaseTask, predictor: BasePredictor, config: ExperimentConfig = None): + def __init__(self, task: "BaseTask", predictor: "BasePredictor", config: "ExperimentConfig" = None): """Initialize the BaseExemplarSelector. Args: task (BaseTask): An object representing the task to be performed. predictor (BasePredictor): An object capable of making predictions based on prompts. 
- config (ExperimentConfig, optional): ExperimentConfig overwriting the defaults + config (ExperimentConfig, optional): "ExperimentConfig" overwriting the defaults """ self.task = task self.predictor = predictor diff --git a/promptolution/exemplar_selectors/random_selector.py b/promptolution/exemplar_selectors/random_selector.py index 2b79801..730e4d6 100644 --- a/promptolution/exemplar_selectors/random_selector.py +++ b/promptolution/exemplar_selectors/random_selector.py @@ -1,8 +1,13 @@ """Random exemplar selector.""" +from typing import TYPE_CHECKING + from promptolution.exemplar_selectors.base_exemplar_selector import BaseExemplarSelector -from promptolution.predictors.base_predictor import BasePredictor -from promptolution.tasks.base_task import BaseTask + +if TYPE_CHECKING: + from promptolution.predictors.base_predictor import BasePredictor + from promptolution.tasks.base_task import BaseTask + from promptolution.utils.config import ExperimentConfig class RandomSelector(BaseExemplarSelector): @@ -12,19 +17,21 @@ class RandomSelector(BaseExemplarSelector): those that are evaluated as correct until the desired number of exemplars is reached. """ - def __init__(self, task: BaseTask, predictor: BasePredictor, desired_score: int = 1, config=None): + def __init__( + self, task: "BaseTask", predictor: "BasePredictor", desired_score: int = 1, config: "ExperimentConfig" = None + ): """Initialize the RandomSelector. Args: task (BaseTask): An object representing the task to be performed. predictor (BasePredictor): An object capable of making predictions based on prompts. desired_score (int, optional): The desired score for the exemplars. Defaults to 1. - config: ExperimentConfig overriding the defaults + config (ExperimentConfig, optional): Configuration for the selector, overriding defaults. """ self.desired_score = desired_score super().__init__(task, predictor, config) - def select_exemplars(self, prompt, n_examples: int = 5): + def select_exemplars(self, prompt: str, n_examples: int = 5) -> str: """Select exemplars using a random selection strategy. 
This method generates random examples and selects those that are evaluated as correct diff --git a/promptolution/helpers.py b/promptolution/helpers.py index 117238e..ec603a4 100644 --- a/promptolution/helpers.py +++ b/promptolution/helpers.py @@ -1,22 +1,45 @@ """Helper functions for the usage of the libary.""" -from logging import getLogger -from typing import List + +from typing import TYPE_CHECKING, List, Literal + +if TYPE_CHECKING: + from promptolution.exemplar_selectors.base_exemplar_selector import BaseExemplarSelector + from promptolution.llms.base_llm import BaseLLM + from promptolution.optimizers.base_optimizer import BaseOptimizer + from promptolution.predictors.base_predictor import BasePredictor + from promptolution.tasks.base_task import BaseTask + from promptolution.utils.config import ExperimentConfig import pandas as pd -from promptolution.callbacks import LoggerCallback -from promptolution.config import ExperimentConfig -from promptolution.exemplar_selectors import get_exemplar_selector -from promptolution.llms import get_llm -from promptolution.optimizers import get_optimizer -from promptolution.predictors import get_predictor -from promptolution.tasks import get_task +from promptolution.exemplar_selectors.random_search_selector import RandomSearchSelector +from promptolution.exemplar_selectors.random_selector import RandomSelector +from promptolution.llms.api_llm import APILLM +from promptolution.llms.local_llm import LocalLLM +from promptolution.llms.vllm import VLLM +from promptolution.optimizers.capo import CAPO +from promptolution.optimizers.evoprompt_de import EvoPromptDE +from promptolution.optimizers.evoprompt_ga import EvoPromptGA +from promptolution.optimizers.opro import OPRO +from promptolution.optimizers.templates import ( + CAPO_CROSSOVER_TEMPLATE, + CAPO_MUTATION_TEMPLATE, + EVOPROMPT_DE_TEMPLATE, + EVOPROMPT_DE_TEMPLATE_TD, + EVOPROMPT_GA_TEMPLATE, + EVOPROMPT_GA_TEMPLATE_TD, + OPRO_TEMPLATE, + OPRO_TEMPLATE_TD, +) +from promptolution.predictors.classifier import FirstOccurrenceClassifier, MarkerBasedClassifier +from promptolution.tasks.classification_tasks import ClassificationTask +from promptolution.utils.logging import get_logger -logger = getLogger(__name__) +logger = get_logger(__name__) -def run_experiment(df: pd.DataFrame, config: ExperimentConfig): +def run_experiment(df: pd.DataFrame, config: "ExperimentConfig"): """Run a full experiment based on the provided configuration. Args: @@ -35,7 +58,7 @@ def run_experiment(df: pd.DataFrame, config: ExperimentConfig): return df_prompt_scores -def run_optimization(df, config: ExperimentConfig): +def run_optimization(df: pd.DataFrame, config: "ExperimentConfig") -> List[str]: """Run the optimization phase of the experiment. Args: @@ -48,9 +71,9 @@ def run_optimization(df, config: ExperimentConfig): predictor = get_predictor(llm, config=config) config.task_description = config.task_description + " " + predictor.extraction_description - if config.optimizer == "capo" and config.subsample_strategy is None: - logger.info("CAPO requires block evaluation strategy. Setting it to 'sequential_block'.") - config.subsample_strategy = "sequential_block" + if config.optimizer == "capo" and (config.eval_strategy is None or "block" not in config.eval_strategy): + logger.warning("📌 CAPO requires block evaluation strategy. 
Setting it to 'sequential_block'.") + config.eval_strategy = "sequential_block" task = get_task(df, config) optimizer = get_optimizer( @@ -59,7 +82,7 @@ def run_optimization(df, config: ExperimentConfig): task=task, config=config, ) - logger.info("Starting optimization...") + logger.warning("🔥 Starting optimization...") prompts = optimizer.optimize(n_steps=config.n_steps) if hasattr(config, "prepend_exemplars") and config.prepend_exemplars: @@ -69,7 +92,7 @@ def run_optimization(df, config: ExperimentConfig): return prompts -def run_evaluation(df: pd.DataFrame, config: ExperimentConfig, prompts: List[str]): +def run_evaluation(df: pd.DataFrame, config: "ExperimentConfig", prompts: List[str]) -> pd.DataFrame: """Run the evaluation phase of the experiment. Args: @@ -81,12 +104,185 @@ def run_evaluation(df: pd.DataFrame, config: ExperimentConfig, prompts: List[str pd.DataFrame: A DataFrame containing the prompts and their scores. """ task = get_task(df, config) - llm = get_llm(config=config) predictor = get_predictor(llm, config=config) - logger.info("Starting evaluation...") - scores = task.evaluate(prompts, predictor) + logger.warning("📊 Starting evaluation...") + scores = task.evaluate(prompts, predictor, eval_strategy="full") df = pd.DataFrame(dict(prompt=prompts, score=scores)) df = df.sort_values("score", ascending=False, ignore_index=True) return df + + +def get_llm(model_id: str = None, config: "ExperimentConfig" = None) -> "BaseLLM": + """Factory function to create and return a language model instance based on the provided model_id. + + This function supports three types of language models: + 1. LocalLLM: For running models locally. + 2. VLLM: For running models using the vLLM library. + 3. APILLM: For API-based models (default if not matching other types). + + Args: + model_id (str): Identifier for the model to use. Special cases: + - "local-{model_name}" for LocalLLM + - "vllm-{model_name}" for VLLM + - Any other string for APILLM + config (ExperimentConfig, optional): "ExperimentConfig" overwriting defaults. + + Returns: + An instance of LocalLLM, or APILLM based on the model_id. + """ + if model_id is None: + model_id = config.model_id + if "local" in model_id: + model_id = "-".join(model_id.split("-")[1:]) + return LocalLLM(model_id, config) + if "vllm" in model_id: + model_id = "-".join(model_id.split("-")[1:]) + return VLLM(model_id, config=config) + + return APILLM(model_id=model_id, config=config) + + +def get_task(df: pd.DataFrame, config: "ExperimentConfig") -> "BaseTask": + """Get the task based on the provided DataFrame and configuration. + + So far only ClassificationTask is supported. + + Args: + df (pd.DataFrame): Input DataFrame containing the data. + config (ExperimentConfig): Configuration for the experiment. + + Returns: + BaseTask: An instance of a task class based on the provided DataFrame and configuration. + """ + return ClassificationTask(df, config=config) + + +def get_optimizer( + predictor: "BasePredictor", + meta_llm: "BaseLLM", + task: "BaseTask", + optimizer: Literal["evopromptde", "evopromptga", "opro"] = None, + meta_prompt: str = None, + task_description: str = None, + config: "ExperimentConfig" = None, +) -> "BaseOptimizer": + """Creates and returns an optimizer instance based on provided parameters. 
+ + Args: + predictor: The predictor used for prompt evaluation + meta_llm: The language model used for generating meta-prompts + task: The task object used for evaluating prompts + optimizer: String identifying which optimizer to use + meta_prompt: Meta prompt text for the optimizer + task_description: Description of the task for the optimizer + config: Configuration object with default parameters + + Returns: + An optimizer instance + + Raises: + ValueError: If an unknown optimizer type is specified + """ + if optimizer is None: + optimizer = config.optimizer + if task_description is None: + task_description = config.task_description + if meta_prompt is None and hasattr(config, "meta_prompt"): + meta_prompt = config.meta_prompt + + if config.optimizer == "capo": + crossover_template = ( + CAPO_CROSSOVER_TEMPLATE.replace("", task_description) + if task_description + else CAPO_CROSSOVER_TEMPLATE + ) + mutation_template = ( + CAPO_MUTATION_TEMPLATE.replace("", task_description) + if task_description + else CAPO_MUTATION_TEMPLATE + ) + + return CAPO( + predictor=predictor, + meta_llm=meta_llm, + task=task, + crossover_template=crossover_template, + mutation_template=mutation_template, + config=config, + ) + + if config.optimizer == "evopromptde": + template = ( + EVOPROMPT_DE_TEMPLATE_TD.replace("", task_description) + if task_description + else EVOPROMPT_DE_TEMPLATE + ) + return EvoPromptDE(predictor=predictor, meta_llm=meta_llm, task=task, prompt_template=template, config=config) + + if config.optimizer == "evopromptga": + template = ( + EVOPROMPT_GA_TEMPLATE_TD.replace("", task_description) + if task_description + else EVOPROMPT_GA_TEMPLATE + ) + return EvoPromptGA(predictor=predictor, meta_llm=meta_llm, task=task, prompt_template=template, config=config) + + if config.optimizer == "opro": + template = OPRO_TEMPLATE_TD.replace("", task_description) if task_description else OPRO_TEMPLATE + return OPRO(predictor=predictor, meta_llm=meta_llm, task=task, prompt_template=template, config=config) + + raise ValueError(f"Unknown optimizer: {config.optimizer}") + + +def get_exemplar_selector( + name: Literal["random", "random_search"], task: "BaseTask", predictor: "BasePredictor" +) -> "BaseExemplarSelector": + """Factory function to get an exemplar selector based on the given name. + + Args: + name (str): The name of the exemplar selector to instantiate. + task (BaseTask): The task object to be passed to the selector. + predictor (BasePredictor): The predictor object to be passed to the selector. + + Returns: + BaseExemplarSelector: An instance of the requested exemplar selector. + + Raises: + ValueError: If the requested selector name is not found. + """ + if name == "random_search": + return RandomSearchSelector(task, predictor) + elif name == "random": + return RandomSelector(task, predictor) + else: + raise ValueError(f"Unknown exemplar selector: {name}") + + +def get_predictor( + downstream_llm=None, type: Literal["first_occurrence", "marker"] = "marker", *args, **kwargs +) -> "BasePredictor": + """Factory function to create and return a predictor instance. + + This function supports three types of predictors: + 1. FirstOccurrenceClassifier: A predictor that classifies based on first occurrence of the label. + 2. MarkerBasedClassifier: A predictor that classifies based on a marker. + + Args: + downstream_llm: The language model to use for prediction. 
+ type (Literal["first_occurrence", "marker"]): The type of predictor to create: + - "first_occurrence" for FirstOccurrenceClassifier + - "marker" (default) for MarkerBasedClassifier + *args: Variable length argument list passed to the predictor constructor. + **kwargs: Arbitrary keyword arguments passed to the predictor constructor. + + Returns: + An instance of FirstOccurrenceClassifier or MarkerBasedClassifier. + """ + if type == "first_occurrence": + return FirstOccurrenceClassifier(downstream_llm, *args, **kwargs) + elif type == "marker": + return MarkerBasedClassifier(downstream_llm, *args, **kwargs) + else: + raise ValueError(f"Invalid predictor type: '{type}'") diff --git a/promptolution/llms/__init__.py b/promptolution/llms/__init__.py index aa01a09..7fd7b97 100644 --- a/promptolution/llms/__init__.py +++ b/promptolution/llms/__init__.py @@ -1,42 +1,6 @@ """Module for Large Language Models.""" -from promptolution.config import ExperimentConfig -from .api_llm import APILLM -from .base_llm import DummyLLM -from .local_llm import LocalLLM -from .vllm import VLLM - - -def get_llm(model_id: str = None, config: ExperimentConfig = None): - """Factory function to create and return a language model instance based on the provided model_id. - - This function supports three types of language models: - 1. DummyLLM: A mock LLM for testing purposes. - 2. LocalLLM: For running models locally. - 3. VLLM: For running models using the vLLM library. - 4. APILLM: For API-based models (default if not matching other types). - - Args: - model_id (str): Identifier for the model to use. Special cases: - - "dummy" for DummyLLM - - "local-{model_name}" for LocalLLM - - "vllm-{model_name}" for VLLM - - Any other string for APILLM - config (ExperimentConfig, optional): ExperimentConfig overwriting defaults. - - Returns: - An instance of DummyLLM, LocalLLM, or APILLM based on the model_id. - """ - if model_id is None: - model_id = config.llm - if model_id == "dummy": - return DummyLLM(config) - if "local" in model_id: - model_id = "-".join(model_id.split("-")[1:]) - return LocalLLM(model_id, config) - if "vllm" in model_id: - model_id = "-".join(model_id.split("-")[1:]) - return VLLM(model_id, config=config) - - return APILLM(llm=model_id, config=config) +from promptolution.llms.api_llm import APILLM +from promptolution.llms.local_llm import LocalLLM +from promptolution.llms.vllm import VLLM diff --git a/promptolution/llms/api_llm.py b/promptolution/llms/api_llm.py index 00db034..330b3c8 100644 --- a/promptolution/llms/api_llm.py +++ b/promptolution/llms/api_llm.py @@ -10,13 +10,17 @@ except ImportError: import_successful = False -from logging import Logger -from typing import Any, List -from promptolution.config import ExperimentConfig +from typing import TYPE_CHECKING, List + from promptolution.llms.base_llm import BaseLLM -logger = Logger(__name__) +if TYPE_CHECKING: + from promptolution.utils.config import ExperimentConfig + +from promptolution.utils.logging import get_logger + +logger = get_logger(__name__) async def _invoke_model(prompt, system_prompt, max_tokens, model_id, client, semaphore, max_retries=20, retry_delay=5): @@ -35,13 +39,13 @@ async def _invoke_model(prompt, system_prompt, max_tokens, model_id, client, sem if attempt < max_retries: # Calculate exponential backoff with jitter logger.warning( - f"API call failed (attempt {attempt + 1} / {max_retries + 1}): {str(e)}. " + f"⚠️ API call failed (attempt {attempt + 1} / {max_retries + 1}): {str(e)}. " f"Retrying in {retry_delay:.2f} seconds..." 
) await asyncio.sleep(retry_delay) else: # Log the final failure and re-raise the exception - logger.error(f"API call failed after {max_retries + 1} attempts: {str(e)}") + logger.error(f"❌ API call failed after {max_retries + 1} attempts: {str(e)}") raise @@ -62,21 +66,21 @@ class APILLM(BaseLLM): def __init__( self, api_url: str = None, - llm: str = None, - token: str = None, + model_id: str = None, + api_key: str = None, max_concurrent_calls=50, max_tokens=512, - config: ExperimentConfig = None, + config: "ExperimentConfig" = None, ): """Initialize the APILLM with a specific model and API configuration. Args: api_url (str): The base URL for the API endpoint. - llm (str): Identifier for the model to use. - token (str, optional): API key for authentication. Defaults to None. + model_id (str): Identifier for the model to use. + api_key (str, optional): API key for authentication. Defaults to None. max_concurrent_calls (int, optional): Maximum number of concurrent API calls. Defaults to 50. max_tokens (int, optional): Maximum number of tokens in model responses. Defaults to 512. - config (ExperimentConfig, optional): ExperimentConfig overwriting defaults. + config (ExperimentConfig, optional): Configuration for the LLM, overriding defaults. Raises: ImportError: If required libraries are not installed. @@ -88,13 +92,13 @@ def __init__( ) self.api_url = api_url - self.llm = llm - self.token = token + self.model_id = model_id + self.api_key = api_key self.max_concurrent_calls = max_concurrent_calls self.max_tokens = max_tokens super().__init__(config=config) - self.client = AsyncOpenAI(base_url=self.api_url, api_key=self.token) + self.client = AsyncOpenAI(base_url=self.api_url, api_key=self.api_key) self.semaphore = asyncio.Semaphore(self.max_concurrent_calls) def _get_response(self, prompts: List[str], system_prompts: List[str]) -> List[str]: @@ -105,7 +109,7 @@ def _get_response(self, prompts: List[str], system_prompts: List[str]) -> List[s async def _get_response_async(self, prompts: List[str], system_prompts: List[str]) -> List[str]: tasks = [ - _invoke_model(prompt, system_prompt, self.max_tokens, self.llm, self.client, self.semaphore) + _invoke_model(prompt, system_prompt, self.max_tokens, self.model_id, self.client, self.semaphore) for prompt, system_prompt in zip(prompts, system_prompts) ] responses = await asyncio.gather(*tasks) diff --git a/promptolution/llms/base_llm.py b/promptolution/llms/base_llm.py index 592e248..704942b 100644 --- a/promptolution/llms/base_llm.py +++ b/promptolution/llms/base_llm.py @@ -1,16 +1,17 @@ """Base module for LLMs in the promptolution library.""" -import logging + from abc import ABC, abstractmethod -from dataclasses import asdict, dataclass, field -from typing import Any, Dict, List, Optional, Union -import numpy as np +from typing import TYPE_CHECKING, List + +if TYPE_CHECKING: + from promptolution.utils.config import ExperimentConfig -from promptolution.config import ExperimentConfig -from promptolution.templates import DEFAULT_SYS_PROMPT +from promptolution.optimizers.templates import DEFAULT_SYS_PROMPT +from promptolution.utils.logging import get_logger -logger = logging.getLogger(__name__) +logger = get_logger(__name__) class BaseLLM(ABC): @@ -25,14 +26,14 @@ class BaseLLM(ABC): output_token_count (int): Count of output tokens generated. """ - def __init__(self, config: ExperimentConfig = None): + def __init__(self, config: "ExperimentConfig" = None): """Initialize the LLM with a configuration or direct parameters. 
This constructor supports both config-based and direct parameter initialization for backward compatibility. Args: - config (Optional[Union[Dict[str, Any], LLMModelConfig]]): Configuration for the LLM. + config (ExperimentConfig, optional): Configuration for the LLM, overriding defaults. """ if config is not None: config.apply_to(self) @@ -81,7 +82,7 @@ def get_response(self, prompts: List[str], system_prompts: List[str] = None) -> Args: prompts (str or List[str]): Input prompt(s). If a single string is provided, it's converted to a list containing that string. - system_prompts (str or List[str]): System prompt(s) to provide context to the model. + system_prompts (Optional, str or List[str]): System prompt(s) to provide context to the model. Returns: List[str]: A list of generated responses, one for each input prompt. @@ -120,37 +121,3 @@ def _get_response(self, prompts: List[str], system_prompts: List[str]) -> List[s List[str]: A list of generated responses corresponding to the input prompts. """ raise NotImplementedError - - -class DummyLLM(BaseLLM): - """A dummy implementation of the BaseLLM for testing purposes. - - This class generates random responses for given prompts, simulating - the behavior of a language model without actually performing any - complex natural language processing. - """ - - def _get_response(self, prompts: list[str], system_prompts: list[str]) -> str: - """Generate random responses for the given prompts. - - This method creates silly, random responses enclosed in tags. - It's designed for testing and demonstration purposes. - - Args: - prompts (str or List[str]): Input prompt(s). If a single string is provided, - it's converted to a list containing that string. - - Returns: - List[str]: A list of randomly generated responses, one for each input prompt. - """ - results = [] - for p in prompts: - r = np.random.rand() - if r < 0.3: - results += [f"Joooo wazzuppp hier gehts los {r} {p}"] - elif 0.3 <= r < 0.6: - results += [f"was das hier? peter lustig{r} {p}"] - else: - results += [f"hier ist ein test{r} {p}"] - - return results diff --git a/promptolution/llms/local_llm.py b/promptolution/llms/local_llm.py index dabeb42..33f489e 100644 --- a/promptolution/llms/local_llm.py +++ b/promptolution/llms/local_llm.py @@ -7,6 +7,12 @@ except ImportError: imports_successful = False +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from promptolution.utils.config import ExperimentConfig + + from promptolution.llms.base_llm import BaseLLM @@ -23,13 +29,13 @@ class LocalLLM(BaseLLM): get_response: Generate responses for a list of prompts. """ - def __init__(self, model_id: str, batch_size=8, config=None): + def __init__(self, model_id: str, batch_size: int = 8, config: "ExperimentConfig" = None): """Initialize the LocalLLM with a specific model. Args: model_id (str): The identifier of the model to use (e.g., "gpt2", "facebook/opt-1.3b"). batch_size (int, optional): The batch size for text generation. Defaults to 8. - config (ExperimentConfig, optional): ExperimentConfig overwriting defaults. + config (ExperimentConfig, optional): "ExperimentConfig" overwriting defaults. Note: This method sets up a text generation pipeline with bfloat16 precision, @@ -59,6 +65,7 @@ def _get_response(self, prompts: list[str], system_prompts: list[str]) -> list[s Args: prompts (list[str]): A list of input prompts. + system_prompts (list[str]): A list of system prompts to guide the model's behavior. Returns: list[str]: A list of generated responses corresponding to the input prompts. 
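The hunks above relocate the `get_llm` factory into `promptolution/helpers.py` and rename the `APILLM` constructor arguments from `llm`/`token` to `model_id`/`api_key`. A minimal usage sketch of the new call sites follows; the model identifier, endpoint URL, and key are placeholder assumptions rather than values from this changeset, and the API path assumes an `OPENAI_API_KEY`-style credential (or an explicit `api_key`) is available.

```python
# Sketch only: the model names, endpoint, and key below are placeholder assumptions.
from promptolution.helpers import get_llm
from promptolution.llms.api_llm import APILLM

# Prefix routing implemented by helpers.get_llm:
#   "local-<model>" -> LocalLLM, "vllm-<model>" -> VLLM, anything else -> APILLM.
llm = get_llm("gpt-4o-mini")  # no recognised prefix, so this resolves to an APILLM
# llm = get_llm("vllm-meta-llama/Llama-3.1-8B-Instruct")  # would require the optional `vllm` extra

# Direct construction with the renamed keyword arguments:
client = APILLM(
    api_url="https://api.openai.com/v1",  # placeholder endpoint
    model_id="gpt-4o-mini",               # formerly `llm`
    api_key="sk-placeholder",             # formerly `token`
)
responses = client.get_response(["Classify: The sky is blue."])
```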
diff --git a/promptolution/llms/vllm.py b/promptolution/llms/vllm.py index 8feb753..1806f6b 100644 --- a/promptolution/llms/vllm.py +++ b/promptolution/llms/vllm.py @@ -1,13 +1,16 @@ """Module for running language models locally using the vLLM library.""" -from logging import Logger -from typing import List +from typing import TYPE_CHECKING, List + +if TYPE_CHECKING: + from promptolution.utils.config import ExperimentConfig + -from promptolution.config import ExperimentConfig from promptolution.llms.base_llm import BaseLLM +from promptolution.utils.logging import get_logger -logger = Logger(__name__) +logger = get_logger(__name__) try: from transformers import AutoTokenizer @@ -49,7 +52,7 @@ def __init__( trust_remote_code: bool = False, seed: int = 42, llm_kwargs: dict = None, - config: ExperimentConfig = None, + config: "ExperimentConfig" = None, ): """Initialize the VLLM with a specific model. @@ -67,7 +70,7 @@ def __init__( trust_remote_code (bool, optional): Whether to trust remote code. Defaults to False. seed (int, optional): Random seed for the model. Defaults to 42. llm_kwargs (dict, optional): Additional keyword arguments for the LLM. Defaults to None. - config (ExperimentConfig, optional): ExperimentConfig overwriting defaults. + config (ExperimentConfig, optional): Configuration for the LLM, overriding defaults. Note: This method sets up a vLLM engine with specified parameters for efficient inference. @@ -108,12 +111,9 @@ def __init__( self.llm = LLM(**llm_params) if batch_size is None: - gpu_blocks = self.llm.llm_engine.model_executor.cache_config.num_gpu_blocks - block_size = ( - self.llm.llm_engine.model_executor.cache_config.block_size - ) # TODO rename, block_size is misleading - self.batch_size = int((gpu_blocks * block_size / self.max_model_len) * 0.95) - logger.info(f"Batch size set to {self.batch_size} based on GPU memory.") + cache_config = self.llm.llm_engine.model_executor.cache_config + self.batch_size = int((cache_config.num_gpu_blocks * cache_config.block_size / self.max_model_len) * 0.95) + logger.info(f"🚀 Batch size set to {self.batch_size} based on GPU memory.") else: self.batch_size = batch_size @@ -127,6 +127,7 @@ def _get_response(self, prompts: list[str], system_prompts: list[str]) -> list[s Args: prompts (list[str]): A list of input prompts. + system_prompts (list[str]): A list of system prompts to guide the model's behavior. Returns: list[str]: A list of generated responses corresponding to the input prompts. 
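For reference, a small arithmetic sketch of the auto batch-size heuristic in the `VLLM.__init__` hunk above; the block count, block size, and context length are invented for illustration and would normally come from vLLM's cache profiling at engine start-up.

```python
# Illustrative numbers only; real values come from the engine's cache_config.
num_gpu_blocks = 4096   # KV-cache blocks that fit in GPU memory (assumed)
block_size = 16         # tokens per cache block (assumed)
max_model_len = 2048    # maximum context length configured for the engine (assumed)

# Same formula as in VLLM.__init__: fill ~95% of the KV cache with full-length sequences.
batch_size = int((num_gpu_blocks * block_size / max_model_len) * 0.95)
print(batch_size)  # 4096 * 16 / 2048 = 32 concurrent sequences -> 30 after the 5% safety margin
```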
diff --git a/promptolution/optimizers/__init__.py b/promptolution/optimizers/__init__.py index cc06d24..47f78a3 100644 --- a/promptolution/optimizers/__init__.py +++ b/promptolution/optimizers/__init__.py @@ -1,104 +1,23 @@ """Module for prompt optimizers.""" -from typing import Literal -from promptolution.config import ExperimentConfig -from promptolution.llms.base_llm import BaseLLM -from promptolution.predictors.base_predictor import BasePredictor -from promptolution.tasks.base_task import BaseTask -from promptolution.templates import ( +from promptolution.optimizers.capo import CAPO +from promptolution.optimizers.evoprompt_de import EvoPromptDE +from promptolution.optimizers.evoprompt_ga import EvoPromptGA +from promptolution.optimizers.opro import OPRO +from promptolution.optimizers.templates import ( CAPO_CROSSOVER_TEMPLATE, + CAPO_DOWNSTREAM_TEMPLATE, + CAPO_FEWSHOT_TEMPLATE, CAPO_MUTATION_TEMPLATE, + DEFAULT_SYS_PROMPT, EVOPROMPT_DE_TEMPLATE, EVOPROMPT_DE_TEMPLATE_TD, EVOPROMPT_GA_TEMPLATE, EVOPROMPT_GA_TEMPLATE_TD, OPRO_TEMPLATE, OPRO_TEMPLATE_TD, + PROMPT_CREATION_TEMPLATE, + PROMPT_CREATION_TEMPLATE_TD, + PROMPT_VARIATION_TEMPLATE, ) - -from .base_optimizer import DummyOptimizer -from .capo import CAPO -from .evoprompt_de import EvoPromptDE -from .evoprompt_ga import EvoPromptGA -from .opro import Opro - - -def get_optimizer( - predictor: BasePredictor, - meta_llm: BaseLLM, - task: BaseTask, - optimizer: Literal["evopromptde", "evopromptga", "opro"] = None, - meta_prompt: str = None, - task_description: str = None, - config: ExperimentConfig = None, -): - """Creates and returns an optimizer instance based on provided parameters. - - Args: - predictor: The predictor used for prompt evaluation - meta_llm: The language model used for generating meta-prompts - task: The task object used for evaluating prompts - optimizer: String identifying which optimizer to use - meta_prompt: Meta prompt text for the optimizer - task_description: Description of the task for the optimizer - config: Configuration object with default parameters - - Returns: - An optimizer instance - - Raises: - ValueError: If an unknown optimizer type is specified - """ - if optimizer is None: - optimizer = config.optimizer - if task_description is None: - task_description = config.task_description - if meta_prompt is None and hasattr(config, "meta_prompt"): - meta_prompt = config.meta_prompt - - if config.optimizer == "capo": - crossover_template = ( - CAPO_CROSSOVER_TEMPLATE.replace("", task_description) - if task_description - else CAPO_CROSSOVER_TEMPLATE - ) - mutation_template = ( - CAPO_MUTATION_TEMPLATE.replace("", task_description) - if task_description - else CAPO_MUTATION_TEMPLATE - ) - - return CAPO( - predictor=predictor, - meta_llm=meta_llm, - task=task, - crossover_template=crossover_template, - mutation_template=mutation_template, - config=config, - ) - - if config.optimizer == "evopromptde": - template = ( - EVOPROMPT_DE_TEMPLATE_TD.replace("", task_description) - if task_description - else EVOPROMPT_DE_TEMPLATE - ) - return EvoPromptDE(predictor=predictor, meta_llm=meta_llm, task=task, prompt_template=template, config=config) - - if config.optimizer == "evopromptga": - template = ( - EVOPROMPT_GA_TEMPLATE_TD.replace("", task_description) - if task_description - else EVOPROMPT_GA_TEMPLATE - ) - return EvoPromptGA(predictor=predictor, meta_llm=meta_llm, task=task, prompt_template=template, config=config) - - if config.optimizer == "opro": - template = OPRO_TEMPLATE_TD.replace("", task_description) if 
task_description else OPRO_TEMPLATE - return Opro(predictor=predictor, meta_llm=meta_llm, task=task, prompt_template=template, config=config) - - if optimizer == "dummy": - return DummyOptimizer(predictor=predictor, config=config) - - raise ValueError(f"Unknown optimizer: {config.optimizer}") diff --git a/promptolution/optimizers/base_optimizer.py b/promptolution/optimizers/base_optimizer.py index 6748258..710701a 100644 --- a/promptolution/optimizers/base_optimizer.py +++ b/promptolution/optimizers/base_optimizer.py @@ -1,13 +1,17 @@ """Base module for optimizers in the promptolution library.""" + from abc import ABC, abstractmethod -from logging import getLogger -from typing import Callable, List -from promptolution.config import ExperimentConfig -from promptolution.tasks.base_task import BaseTask +from typing import TYPE_CHECKING, Callable, List + +if TYPE_CHECKING: + from promptolution.tasks.base_task import BaseTask + from promptolution.utils.config import ExperimentConfig + +from promptolution.utils.logging import get_logger -logger = getLogger(__name__) +logger = get_logger(__name__) class BaseOptimizer(ABC): @@ -16,7 +20,7 @@ class BaseOptimizer(ABC): This class defines the basic structure and interface for prompt optimization algorithms. Attributes: - config (OptimizerConfig): Configuration for the optimizer. + config (ExperimentConfig, optional): Configuration for the optimizer, overriding defaults. prompts (List[str]): List of current prompts being optimized. task (BaseTask): The task object used for evaluating prompts. callbacks (List[Callable]): List of callback functions to be called during optimization. @@ -26,10 +30,10 @@ class BaseOptimizer(ABC): def __init__( self, predictor, - task: BaseTask, + task: "BaseTask", initial_prompts: List[str], callbacks: List[Callable] = None, - config: ExperimentConfig = None, + config: "ExperimentConfig" = None, ): """Initialize the optimizer with a configuration and/or direct parameters. @@ -38,7 +42,7 @@ def __init__( task: Task object for prompt evaluation. callbacks: List of callback functions. predictor: Predictor for prompt evaluation. - config: Configuration for the optimizer. + config (ExperimentConfig, optional): Configuration for the optimizer, overriding defaults. """ # Set up optimizer state self.prompts = initial_prompts @@ -58,7 +62,7 @@ def optimize(self, n_steps: int) -> List[str]: the specific optimization algorithm. Args: - n_steps: Number of optimization steps to perform. If None, uses the value from config. + n_steps (int): Number of optimization steps to perform. Returns: The optimized list of prompts after all steps. @@ -73,9 +77,9 @@ def optimize(self, n_steps: int) -> List[str]: self.prompts = self._step() except Exception as e: # exit training loop and gracefully fail - logger.error(f"Error during optimization step: {e}") - logger.error("Exiting optimization loop.") - break + logger.error(f"⛔ Error during optimization step: {e}") + logger.error("⚠️ Exiting optimization loop.") + continue_optimization = False # Callbacks at the end of each step continue_optimization = self._on_step_end() @@ -119,42 +123,3 @@ def _on_train_end(self): """Call all registered callbacks at the end of the entire optimization process.""" for callback in self.callbacks: callback.on_train_end(self) - - -class DummyOptimizer(BaseOptimizer): - """A dummy optimizer that doesn't perform any actual optimization. - - This optimizer simply returns the initial prompts without modification. - It's useful for testing or as a baseline comparison. 
- - Attributes: - prompts (List[str]): List of prompts (unchanged from initialization). - callbacks (List[Callable]): Empty list of callbacks. - - Args: - initial_prompts (List[str]): Initial set of prompts. - *args: Variable length argument list (unused). - **kwargs: Arbitrary keyword arguments (unused). - """ - - def __init__(self, initial_prompts): - """Initialize the DummyOptimizer.""" - self.callbacks = [] - self.prompts = initial_prompts - - def optimize(self, n_steps) -> list[str]: - """Simulate an optimization process without actually modifying the prompts. - - This method calls the callback methods to simulate a complete optimization - cycle, but returns the initial prompts unchanged. - - Args: - n_steps (int): Number of optimization steps (unused in this implementation). - - Returns: - List[str]: The original list of prompts, unchanged. - """ - self._on_step_end() - self._on_train_end() - - return self.prompts diff --git a/promptolution/optimizers/capo.py b/promptolution/optimizers/capo.py index 993c799..60f8a12 100644 --- a/promptolution/optimizers/capo.py +++ b/promptolution/optimizers/capo.py @@ -1,27 +1,32 @@ """Implementation of the CAPO (Cost-Aware Prompt Optimization) algorithm.""" + import random from itertools import compress -from logging import getLogger -from typing import Callable, List, Tuple import numpy as np import pandas as pd -from promptolution.config import ExperimentConfig -from promptolution.llms.base_llm import BaseLLM +from typing import TYPE_CHECKING, Callable, List, Tuple + +if TYPE_CHECKING: + from promptolution.llms.base_llm import BaseLLM + from promptolution.predictors.base_predictor import BasePredictor + from promptolution.tasks.base_task import BaseTask + from promptolution.utils.config import ExperimentConfig + from promptolution.utils.test_statistics import TestStatistics + from promptolution.optimizers.base_optimizer import BaseOptimizer -from promptolution.predictors.base_predictor import BasePredictor -from promptolution.tasks.base_task import BaseTask -from promptolution.templates import ( +from promptolution.optimizers.templates import ( CAPO_CROSSOVER_TEMPLATE, CAPO_DOWNSTREAM_TEMPLATE, CAPO_FEWSHOT_TEMPLATE, CAPO_MUTATION_TEMPLATE, ) -from promptolution.utils.test_statistics import TestStatistics, get_test_statistic_func +from promptolution.utils.logging import get_logger +from promptolution.utils.test_statistics import get_test_statistic_func from promptolution.utils.token_counter import get_token_counter -logger = getLogger(__name__) +logger = get_logger(__name__) class CAPOPrompt: @@ -68,43 +73,40 @@ class CAPO(BaseOptimizer): def __init__( self, - predictor: BasePredictor, - task: BaseTask, - meta_llm: BaseLLM, + predictor: "BasePredictor", + task: "BaseTask", + meta_llm: "BaseLLM", initial_prompts: List[str] = None, crossovers_per_iter: int = 4, upper_shots: int = 5, max_n_blocks_eval: int = 10, - test_statistic: TestStatistics = "paired_t_test", + test_statistic: "TestStatistics" = "paired_t_test", alpha: float = 0.2, length_penalty: float = 0.05, df_few_shots: pd.DataFrame = None, crossover_template: str = None, mutation_template: str = None, callbacks: List[Callable] = [], - config: ExperimentConfig = None, + config: "ExperimentConfig" = None, ): """Initializes the CAPOptimizer with various parameters for prompt evolution. Args: - initial_prompts (List[str]): Initial prompt instructions. + predictor (BasePredictor): The predictor for evaluating prompt performance. 
task (BaseTask): The task instance containing dataset and description. - df_few_shots (pd.DataFrame): DataFrame containing few-shot examples. If None, will pop 10% of datapoints from task. meta_llm (BaseLLM): The meta language model for crossover/mutation. - downstream_llm (BaseLLM): The downstream language model used for responses. - length_penalty (float): Penalty factor for prompt length. + initial_prompts (List[str]): Initial prompt instructions. crossovers_per_iter (int): Number of crossover operations per iteration. upper_shots (int): Maximum number of few-shot examples per prompt. p_few_shot_reasoning (float): Probability of generating llm-reasoning for few-shot examples, instead of simply using input-output pairs. - n_trials_generation_reasoning (int): Number of trials to generate reasoning for few-shot examples. max_n_blocks_eval (int): Maximum number of evaluation blocks. test_statistic (TestStatistics): Statistical test to compare prompt performance. Default is "paired_t_test". alpha (float): Significance level for the statistical test. + length_penalty (float): Penalty factor for prompt length. + df_few_shots (pd.DataFrame): DataFrame containing few-shot examples. If None, will pop 10% of datapoints from task. crossover_template (str, optional): Template for crossover instructions. mutation_template (str, optional): Template for mutation instructions. callbacks (List[Callable], optional): Callbacks for optimizer events. - predictor (BasePredictor, optional): Predictor to evaluate prompt - performance. config (ExperimentConfig, optional): Configuration for the optimizer. """ self.meta_llm = meta_llm @@ -127,7 +129,7 @@ def __init__( self.df_few_shots = df_few_shots if df_few_shots is not None else task.pop_datapoints(frac=0.1) if self.max_n_blocks_eval > self.task.n_blocks: logger.warning( - f"max_n_blocks_eval ({self.max_n_blocks_eval}) is larger than the number of blocks ({self.task.n_blocks})." + f"ℹ️ max_n_blocks_eval ({self.max_n_blocks_eval}) is larger than the number of blocks ({self.task.n_blocks})." f" Setting max_n_blocks_eval to {self.task.n_blocks}." 
) self.max_n_blocks_eval = self.task.n_blocks @@ -297,7 +299,7 @@ def _do_racing(self, candidates: List[CAPOPrompt], k: int) -> List[CAPOPrompt]: self.task.increment_block_idx() avg_scores = self.task.evaluate( - [c.construct_prompt() for c in candidates], self.predictor, strategy="evaluated" + [c.construct_prompt() for c in candidates], self.predictor, eval_strategy="evaluated" ) order = np.argsort(-avg_scores)[:k] candidates = [candidates[i] for i in order] diff --git a/promptolution/optimizers/evoprompt_de.py b/promptolution/optimizers/evoprompt_de.py index 6137488..426e973 100644 --- a/promptolution/optimizers/evoprompt_de.py +++ b/promptolution/optimizers/evoprompt_de.py @@ -1,15 +1,18 @@ """Module for EvoPromptDE optimizer.""" -from typing import List import numpy as np -from promptolution.callbacks import BaseCallback -from promptolution.config import ExperimentConfig -from promptolution.llms.base_llm import BaseLLM +from typing import TYPE_CHECKING, List + from promptolution.optimizers.base_optimizer import BaseOptimizer -from promptolution.predictors.base_predictor import BasePredictor -from promptolution.tasks.base_task import BaseTask + +if TYPE_CHECKING: + from promptolution.llms.base_llm import BaseLLM + from promptolution.predictors.base_predictor import BasePredictor + from promptolution.tasks.base_task import BaseTask + from promptolution.utils.callbacks import BaseCallback + from promptolution.utils.config import ExperimentConfig class EvoPromptDE(BaseOptimizer): @@ -31,25 +34,22 @@ class EvoPromptDE(BaseOptimizer): prompt_template (str): Template for meta-prompts. meta_llm: Language model for child prompt generation. donor_random (bool, optional): Whether to use a random donor. Defaults to False. - n_eval_samples (int, optional): Number of samples for evaluation. Defaults to 20. - config (ExperimentConfig, optional): Configuration for the experiment. + config (ExperimentConfig, optional): Configuration for the optimizer, overriding defaults. 
""" def __init__( self, - predictor: BasePredictor, - task: BaseTask, + predictor: "BasePredictor", + task: "BaseTask", prompt_template: str, - meta_llm: BaseLLM, + meta_llm: "BaseLLM", initial_prompts: List[str] = None, donor_random: bool = False, - n_eval_samples: int = 20, - callbacks: List[BaseCallback] = None, - config: ExperimentConfig = None, + callbacks: List["BaseCallback"] = None, + config: "ExperimentConfig" = None, ): """Initialize the EvoPromptDE optimizer.""" self.prompt_template = prompt_template - self.n_eval_samples = n_eval_samples self.donor_random = donor_random self.meta_llm = meta_llm super().__init__( diff --git a/promptolution/optimizers/evoprompt_ga.py b/promptolution/optimizers/evoprompt_ga.py index 190b654..6fc1215 100644 --- a/promptolution/optimizers/evoprompt_ga.py +++ b/promptolution/optimizers/evoprompt_ga.py @@ -1,18 +1,22 @@ """Module for EvoPromptGA optimizer.""" -from logging import Logger -from typing import List import numpy as np -from promptolution.callbacks import BaseCallback -from promptolution.config import ExperimentConfig -from promptolution.llms.base_llm import BaseLLM +from typing import TYPE_CHECKING, List + from promptolution.optimizers.base_optimizer import BaseOptimizer -from promptolution.predictors.base_predictor import BasePredictor -from promptolution.tasks.base_task import BaseTask -logger = Logger(__name__) +if TYPE_CHECKING: + from promptolution.llms.base_llm import BaseLLM + from promptolution.predictors.base_predictor import BasePredictor + from promptolution.tasks.base_task import BaseTask + from promptolution.utils.callbacks import BaseCallback + from promptolution.utils.config import ExperimentConfig + +from promptolution.utils.logging import get_logger + +logger = get_logger(__name__) class EvoPromptGA(BaseOptimizer): @@ -41,19 +45,17 @@ class EvoPromptGA(BaseOptimizer): def __init__( self, - predictor: BasePredictor, - task: BaseTask, + predictor: "BasePredictor", + task: "BaseTask", prompt_template: str, - meta_llm: BaseLLM, + meta_llm: "BaseLLM", initial_prompts: List[str] = None, selection_mode: str = "wheel", - n_eval_samples: int = 20, - callbacks: List[BaseCallback] = None, - config: ExperimentConfig = None, + callbacks: List["BaseCallback"] = None, + config: "ExperimentConfig" = None, ): """Initialize the EvoPromptGA optimizer.""" self.prompt_template = prompt_template - self.n_eval_samples = n_eval_samples self.meta_llm = meta_llm self.selection_mode = selection_mode super().__init__( diff --git a/promptolution/optimizers/opro.py b/promptolution/optimizers/opro.py index afee3f5..0e3f892 100644 --- a/promptolution/optimizers/opro.py +++ b/promptolution/optimizers/opro.py @@ -1,19 +1,22 @@ """Module implementing the OPRO (Optimization by PROmpting) algorithm.""" -from typing import List, Optional import numpy as np -from promptolution.callbacks import BaseCallback -from promptolution.config import ExperimentConfig -from promptolution.llms.base_llm import BaseLLM +from typing import TYPE_CHECKING, List, Optional + from promptolution.optimizers.base_optimizer import BaseOptimizer -from promptolution.predictors.base_predictor import BasePredictor -from promptolution.tasks.base_task import BaseTask -from promptolution.templates import OPRO_TEMPLATE +from promptolution.optimizers.templates import OPRO_TEMPLATE + +if TYPE_CHECKING: + from promptolution.llms.base_llm import BaseLLM + from promptolution.predictors.base_predictor import BasePredictor + from promptolution.tasks.base_task import BaseTask + from 
promptolution.utils.callbacks import BaseCallback + from promptolution.utils.config import ExperimentConfig -class Opro(BaseOptimizer): +class OPRO(BaseOptimizer): """OPRO: Optimization by PROmpting. Implementation of the technique proposed in "Large Language Models as Optimizers" @@ -25,16 +28,16 @@ class Opro(BaseOptimizer): def __init__( self, - predictor: BasePredictor, - task: BaseTask, + predictor: "BasePredictor", + task: "BaseTask", prompt_template: Optional[str], - meta_llm: BaseLLM, + meta_llm: "BaseLLM", initial_prompts: List[str] = None, max_num_instructions: int = 20, num_instructions_per_step: int = 8, num_few_shots: int = 3, - callbacks: List[BaseCallback] = None, - config: ExperimentConfig = None, + callbacks: List["BaseCallback"] = None, + config: "ExperimentConfig" = None, ) -> None: """Initialize the OPRO optimizer. @@ -48,7 +51,7 @@ def __init__( num_instructions_per_step: Number of prompts to generate in each step num_few_shots: Number of few-shot examples to include (0 for none) callbacks: List of callback functions - config: ExperimentConfig overwriting default parameters + config: "ExperimentConfig" overwriting default parameters """ self.meta_llm = meta_llm diff --git a/promptolution/templates.py b/promptolution/optimizers/templates.py similarity index 98% rename from promptolution/templates.py rename to promptolution/optimizers/templates.py index 539a66b..aaa1f63 100644 --- a/promptolution/templates.py +++ b/promptolution/optimizers/templates.py @@ -1,4 +1,7 @@ +"""Meta-prompt templates for different prompt optimization methods.""" + DEFAULT_SYS_PROMPT = "You are a helpful assistant." + EVOPROMPT_DE_TEMPLATE = """Please follow the instruction step-by-step to generate a better prompt. Identifying the different parts between Prompt 1 and Prompt 2: Prompt 1: Your task is to classify the comment as one of the following categories: terrible, bad, okay, good, great. @@ -160,4 +163,4 @@ Prompt: Return the new prompt in the following format: -new prompt""" \ No newline at end of file +new prompt""" diff --git a/promptolution/predictors/__init__.py b/promptolution/predictors/__init__.py index 85bc9ac..ddc9595 100644 --- a/promptolution/predictors/__init__.py +++ b/promptolution/predictors/__init__.py @@ -1,36 +1,4 @@ """Module for LLM predictors.""" -from typing import Literal -from .base_predictor import DummyPredictor -from .classificator import FirstOccurrenceClassificator, MarkerBasedClassificator - - -def get_predictor(downstream_llm=None, type: Literal["first_occurrence", "marker"] = "marker", *args, **kwargs): - """Factory function to create and return a predictor instance. - - This function supports three types of predictors: - 1. DummyPredictor: A mock predictor for testing purposes when no downstream_llm is provided. - 2. FirstOccurrenceClassificator: A predictor that classifies based on first occurrence of the label. - 3. MarkerBasedClassificator: A predictor that classifies based on a marker. - - Args: - downstream_llm: The language model to use for prediction. If None, returns a DummyPredictor. - type (Literal["first_occurrence", "marker"]): The type of predictor to create: - - "first_occurrence" (default) for FirstOccurrenceClassificator - - "marker" for MarkerBasedClassificator - *args: Variable length argument list passed to the predictor constructor. - **kwargs: Arbitrary keyword arguments passed to the predictor constructor. - - Returns: - An instance of DummyPredictor, FirstOccurrenceClassificator, or MarkerBasedClassificator. 
- """ - if downstream_llm is None: - return DummyPredictor("", *args, **kwargs) - - if type == "first_occurrence": - return FirstOccurrenceClassificator(downstream_llm, *args, **kwargs) - elif type == "marker": - return MarkerBasedClassificator(downstream_llm, *args, **kwargs) - else: - raise ValueError(f"Invalid predictor type: '{type}'") +from promptolution.predictors.classifier import FirstOccurrenceClassifier, MarkerBasedClassifier diff --git a/promptolution/predictors/base_predictor.py b/promptolution/predictors/base_predictor.py index 5c81098..ffcfc15 100644 --- a/promptolution/predictors/base_predictor.py +++ b/promptolution/predictors/base_predictor.py @@ -1,14 +1,17 @@ """Base module for predictors in the promptolution library.""" + from abc import ABC, abstractmethod -from dataclasses import asdict, dataclass, field -from typing import Any, Dict, List, Optional, Tuple, Union -import numpy as np +from typing import TYPE_CHECKING, List -from promptolution.config import ExperimentConfig from promptolution.llms.base_llm import BaseLLM +if TYPE_CHECKING: + from promptolution.utils.config import ExperimentConfig + +import numpy as np + class BasePredictor(ABC): """Abstract base class for predictors in the promptolution library. @@ -21,7 +24,7 @@ class BasePredictor(ABC): config (ExperimentConfig): Experiment configuration overwriting defaults """ - def __init__(self, llm: Optional[BaseLLM] = None, config: ExperimentConfig = None): + def __init__(self, llm: "BaseLLM", config: "ExperimentConfig" = None): """Initialize the predictor with a language model and configuration. Args: @@ -65,64 +68,13 @@ def predict( return preds if not return_seq else (preds, seqs) @abstractmethod - def _extract_preds(self, preds: List[str], shape: Tuple[int, int]) -> np.ndarray: + def _extract_preds(self, preds: List[str]) -> np.ndarray: """Extract class labels from the predictions, based on the list of valid class labels. Args: preds: The raw predictions from the language model. - shape: The shape of the output array: (n_prompts, n_samples). Returns: - np.ndarray: Extracted predictions with shape (n_prompts, n_samples). + np.ndarray: Extracted predictions. """ raise NotImplementedError - - -class DummyPredictor(BasePredictor): - """A dummy predictor implementation for testing purposes. - - This predictor generates random predictions from the list of possible classes. - - Attributes: - model_id (str): Always set to "dummy". - classes (List[str]): List of possible class labels. - """ - - def predict( - self, prompts: Union[str, List[str]], xs: np.ndarray, return_seq: bool = False - ) -> Union[np.ndarray, Tuple[np.ndarray, List[str]]]: - """Generate random predictions for the given prompts and input data. - - Args: - prompts: Prompt or list of prompts (ignored in this implementation). - xs: Array of input data (only the length is used). - return_seq: Whether to return sequences. - - Returns: - Array of random predictions, optionally with sequences. - """ - if isinstance(prompts, str): - prompts = [prompts] - - preds = np.array([np.random.choice(self.classes, len(xs)) for _ in prompts]) - - if return_seq: - # Generate fake sequences - seqs = [f"Input: {x}\nOutput: {np.random.choice(self.classes)}" for x in xs] - return preds, seqs - - return preds - - def _extract_preds(self, preds: List[str], shape: Tuple[int, int]) -> np.ndarray: - """Extract class labels from the predictions. - - This is a dummy implementation that returns random predictions. 
- - Args: - preds: The raw predictions from the language model (ignored). - shape: The shape of the output array: (n_prompts, n_samples). - - Returns: - np.ndarray: Random predictions. - """ - return np.array([np.random.choice(self.classes, shape[1]) for _ in range(shape[0])]) diff --git a/promptolution/predictors/classificator.py b/promptolution/predictors/classifier.py similarity index 86% rename from promptolution/predictors/classificator.py rename to promptolution/predictors/classifier.py index 2621a86..3fca57f 100644 --- a/promptolution/predictors/classificator.py +++ b/promptolution/predictors/classifier.py @@ -1,14 +1,17 @@ """Module for classification predictors.""" -from typing import List, Tuple import numpy as np -from promptolution.config import ExperimentConfig +from typing import TYPE_CHECKING, List + from promptolution.predictors.base_predictor import BasePredictor +if TYPE_CHECKING: + from promptolution.utils.config import ExperimentConfig + -class FirstOccurrenceClassificator(BasePredictor): +class FirstOccurrenceClassifier(BasePredictor): """A predictor class for classification tasks using language models. This class takes a language model and a list of classes, and provides a method @@ -20,19 +23,19 @@ class FirstOccurrenceClassificator(BasePredictor): Attributes: llm: The language model used for generating predictions. classes (List[str]): The list of valid class labels. - config (ExperimentConfig): Experiment configuration overwriting defaults. + config (ExperimentConfig, optional): Configuration for the classifier, overriding defaults. Inherits from: BasePredictor: The base class for predictors in the promptolution library. """ - def __init__(self, llm, classes, config: ExperimentConfig = None): - """Initialize the Classificator. + def __init__(self, llm, classes, config: "ExperimentConfig" = None): + """Initialize the FirstOccurrenceClassifier. Args: llm: The language model to use for predictions. classes (List[str]): The list of valid class labels. - config: Experiment configuration overwriting defaults. + config (ExperimentConfig, optional): Configuration for the classifier, overriding defaults. """ assert all([c.islower() for c in classes]), "Class labels should be lowercase." self.classes = classes @@ -65,7 +68,7 @@ def _extract_preds(self, preds: List[str]) -> np.ndarray: return response -class MarkerBasedClassificator(BasePredictor): +class MarkerBasedClassifier(BasePredictor): """A predictor class for classification tasks using language models. This class takes a language model and a list of classes, and provides a method @@ -86,16 +89,16 @@ def __init__( classes=None, begin_marker="", end_marker="", - config: ExperimentConfig = None, + config: "ExperimentConfig" = None, ): - """Initialize the Classificator. + """Initialize the MarkerBasedClassifier. Args: llm: The language model to use for predictions. classes (List[str]): The list of valid class labels. If None, does not force any class. begin_marker (str): The marker to use for extracting the class label. end_marker (str): The marker to use for extracting the class label. - config: Experiment configuration overwriting defaults. + config (ExperimentConfig, optional): Configuration for the classifier, overriding defaults. 
""" self.classes = classes self.begin_marker = begin_marker diff --git a/promptolution/tasks/__init__.py b/promptolution/tasks/__init__.py index b093f22..7222256 100644 --- a/promptolution/tasks/__init__.py +++ b/promptolution/tasks/__init__.py @@ -1,21 +1,3 @@ """Module for task-related functions and classes.""" -import pandas as pd -from promptolution.config import ExperimentConfig -from promptolution.tasks.base_task import BaseTask from promptolution.tasks.classification_tasks import ClassificationTask - - -def get_task(df: pd.DataFrame, config: ExperimentConfig) -> BaseTask: - """Get the task based on the provided DataFrame and configuration. - - So far only ClassificationTask is supported. - - Args: - df (pd.DataFrame): Input DataFrame containing the data. - config (ExperimentConfig): Configuration for the experiment. - - Returns: - BaseTask: An instance of a task class based on the provided DataFrame and configuration. - """ - return ClassificationTask(df, config=config) diff --git a/promptolution/tasks/base_task.py b/promptolution/tasks/base_task.py index 5b81cf6..418ab87 100644 --- a/promptolution/tasks/base_task.py +++ b/promptolution/tasks/base_task.py @@ -1,11 +1,14 @@ """Base module for tasks.""" + from abc import ABC, abstractmethod -from typing import List import numpy as np -from promptolution.config import ExperimentConfig +from typing import TYPE_CHECKING, List + +if TYPE_CHECKING: + from promptolution.utils.config import ExperimentConfig class BaseTask(ABC): @@ -18,7 +21,7 @@ class BaseTask(ABC): to evaluate prompts using a given predictor. """ - def __init__(self, config: ExperimentConfig = None): + def __init__(self, config: "ExperimentConfig" = None): """Initialize the BaseTask.""" if config is not None: config.apply_to(self) @@ -39,37 +42,3 @@ def evaluate(self, prompts: List[str], predictor, system_prompts: List[str] = No NotImplementedError: If not implemented by a subclass. """ raise NotImplementedError - - -class DummyTask(BaseTask): - """A dummy task implementation for testing purposes. - - This task generates random evaluation scores for given prompts. - - Attributes: - initial_population (List[str]): List of initial prompts. - description (str): Description of the dummy task. - xs (np.ndarray): Array of dummy input data. - ys (np.ndarray): Array of dummy labels. - classes (List[str]): List of possible class labels. - """ - - def __init__(self): - """Initialize the DummyTask.""" - self.initial_population = ["Some", "initial", "prompts", "that", "will", "do", "the", "trick"] - self.description = "This is a dummy task for testing purposes." - self.xs = np.array(["This is a test", "This is another test", "This is a third test"]) - self.ys = np.array(["positive", "negative", "positive"]) - self.classes = ["negative", "positive"] - - def evaluate(self, prompts: List[str], predictor, system_prompts=None) -> np.ndarray: - """Generate random evaluation scores for the given prompts. - - Args: - prompts (List[str]): List of prompts to evaluate. - predictor: The predictor to use for evaluation (ignored in this implementation). - - Returns: - np.ndarray: Array of random evaluation scores, one for each prompt. 
- """ - return np.array([np.random.rand()] * len(prompts)) diff --git a/promptolution/tasks/classification_tasks.py b/promptolution/tasks/classification_tasks.py index 8ccf38d..9ff156f 100644 --- a/promptolution/tasks/classification_tasks.py +++ b/promptolution/tasks/classification_tasks.py @@ -1,15 +1,18 @@ """Module for classification tasks.""" -from typing import Any, Callable, List, Literal, Tuple, Union import numpy as np import pandas as pd from sklearn.metrics import accuracy_score -from promptolution.config import ExperimentConfig -from promptolution.predictors.base_predictor import BasePredictor +from typing import TYPE_CHECKING, Any, Callable, List, Literal, Tuple, Union + from promptolution.tasks.base_task import BaseTask +if TYPE_CHECKING: + from promptolution.predictors.base_predictor import BasePredictor + from promptolution.utils.config import ExperimentConfig + class ClassificationTask(BaseTask): """A class representing a classification task in the promptolution library. @@ -25,11 +28,10 @@ def __init__( x_column: str = "x", y_column: str = "y", n_subsamples: int = 30, - block_size: int = 30, - subsample_strategy: Literal["full", "subsample", "sequential_block", "random_block"] = "full", + eval_strategy: Literal["full", "subsample", "sequential_block", "random_block"] = "full", seed: int = 42, metric: Callable = accuracy_score, - config: ExperimentConfig = None, + config: "ExperimentConfig" = None, ): """Initialize the ClassificationTask from a pandas DataFrame. @@ -39,41 +41,39 @@ def __init__( x_column (str, optional): Name of the column containing input texts. Defaults to "x". y_column (str, optional): Name of the column containing labels. Defaults to "y". n_subsamples (int, optional): Number of subsamples to use. No subsampling if None. Defaults to None. - block_size (int, optional): Block size for subsampling. Defaults to None. - subsample_strategy (str, optional): Subsampling strategy to use. Options: + eval_strategy (str, optional): Subsampling strategy to use. Options: - "full": Uses the entire dataset for evaluation. - "evaluated": Uses only previously evaluated datapoints from the cache. - "subsample": Randomly selects n_subsamples datapoints without replacement. - "sequential_block": Uses a block of block_size consecutive datapoints, advancing through blocks sequentially. - "random_block": Randomly selects a block of block_size consecutive datapoints. - Defaults to "full". # TODO in other pull request this is renamed to eval_strategy and we do not use block_size + Defaults to "full". seed (int, optional): Random seed for reproducibility. Defaults to 42. metric (Callable, optional): Metric to use for evaluation. Defaults to accuracy_score. - config (ExperimentConfig, optional): ExperimentConfig overwriting the defaults. + config (ExperimentConfig, optional): Configuration for the task, overriding defaults. 
""" self.description = description self.metric = metric self.x_column = x_column self.y_column = y_column + self.eval_strategy = eval_strategy + self.n_subsamples = n_subsamples + super().__init__(config) - self.xs = df[x_column].values - self.ys = df[y_column].str.lower().values - self.classes = df[y_column].unique() + self.xs = df[self.x_column].values + self.ys = df[self.y_column].str.lower().values + self.classes = np.unique(self.ys) - self.subsample_strategy = subsample_strategy - self.n_subsamples = n_subsamples - self.block_size = block_size self.block_idx = 0 - self.n_blocks = len(self.xs) // self.block_size + self.n_blocks = len(self.xs) // self.n_subsamples self.rng = np.random.default_rng(seed) - super().__init__(config) self.eval_cache = {} # (prompt, x, y): scores per datapoint self.seq_cache = {} # (prompt, x, y): generating sequence per datapoint def subsample( - self, strategy: Literal["full", "subsample", "sequential_block", "random_block"] = None + self, eval_strategy: Literal["full", "subsample", "sequential_block", "random_block"] = None ) -> Tuple[np.ndarray, np.ndarray]: """Subsample the dataset based on the specified parameters. @@ -83,37 +83,37 @@ def subsample( Returns: Tuple[np.ndarray, np.ndarray]: Subsampled input data and labels. """ - if strategy is None: - strategy = self.subsample_strategy + if eval_strategy is None: + eval_strategy = self.eval_strategy - if strategy in ["full", "evaluated"]: + if eval_strategy in ["full", "evaluated"]: return self.xs, self.ys - elif strategy == "subsample": + elif eval_strategy == "subsample": indices = self.rng.choice(len(self.xs), self.n_subsamples, replace=False) return self.xs[indices], self.ys[indices] - elif strategy == "random_block": - block_id = self.rng.integers(0, len(self.xs) // self.block_size) - indices = np.arange(block_id * self.block_size, (block_id + 1) * self.block_size) + elif eval_strategy == "random_block": + block_id = self.rng.integers(0, len(self.xs) // self.n_subsamples) + indices = np.arange(block_id * self.n_subsamples, (block_id + 1) * self.n_subsamples) return self.xs[indices], self.ys[indices] - elif strategy == "sequential_block": - indices = np.arange(self.block_idx * self.block_size, (self.block_idx + 1) * self.block_size) + elif eval_strategy == "sequential_block": + indices = np.arange(self.block_idx * self.n_subsamples, (self.block_idx + 1) * self.n_subsamples) return self.xs[indices], self.ys[indices] else: - raise ValueError(f"Unknown subsampling strategy: '{strategy}'") + raise ValueError(f"Unknown subsampling strategy: '{eval_strategy}") def _prepare_batch( - self, prompts: List[str], xs: np.ndarray, ys: np.ndarray, strategy: str + self, prompts: List[str], xs: np.ndarray, ys: np.ndarray, eval_strategy: str ) -> List[Tuple[str, str, str]]: """Generates (prompt, x, y) keys that require prediction. If strategy is "evaluated", returns an empty list. Otherwise, returns keys not found in eval_cache. """ - if strategy == "evaluated": + if eval_strategy == "evaluated": return [] keys_to_predict = [] @@ -150,21 +150,21 @@ def _collect_results_from_cache( def evaluate( self, prompts: Union[str, List[str]], - predictor: BasePredictor, + predictor: "BasePredictor", system_prompts: List[str] = None, return_agg_scores: bool = True, return_seq: bool = False, - strategy: str = None, + eval_strategy: str = None, ) -> Union[np.ndarray, Tuple[np.ndarray, Union[List[Any], np.ndarray]]]: """Evaluate a set of prompts using a given predictor. 
This method orchestrates subsampling, prediction, caching, and result collection. """ prompts = [prompts] if isinstance(prompts, str) else prompts - strategy = strategy or self.subsample_strategy + eval_strategy = eval_strategy or self.eval_strategy - xs, ys = self.subsample(strategy=strategy) - batches = self._prepare_batch(prompts, xs, ys, strategy) + xs, ys = self.subsample(eval_strategy=eval_strategy) + batches = self._prepare_batch(prompts, xs, ys, eval_strategy) prompts_to_evaluate, xs_to_evaluate, ys_to_evaluate = zip(*batches) if batches else ([], [], []) preds = predictor.predict( @@ -216,21 +216,29 @@ def pop_datapoints(self, n: int = None, frac: float = None) -> pd.DataFrame: self.xs = np.delete(self.xs, indices) self.ys = np.delete(self.ys, indices) - self.n_blocks = len(self.xs) // self.block_size + self.n_blocks = len(self.xs) // self.n_subsamples self.block_idx = min(self.block_idx, self.n_blocks - 1) return df def increment_block_idx(self) -> None: - """Increment the block index for subsampling.""" - if "block" not in self.subsample_strategy: + """Increment the block index for subsampling. + + Raises: + ValueError: If the eval_strategy does not contain "block". + """ + if "block" not in self.eval_strategy: raise ValueError("Block increment is only valid for block subsampling.") self.block_idx += 1 if self.block_idx >= self.n_blocks: self.block_idx = 0 def reset_block_idx(self) -> None: - """Reset the block index for subsampling.""" - if "block" not in self.subsample_strategy: + """Reset the block index for subsampling. + + Raises: + ValueError: If the eval_strategy does not contain "block". + """ + if "block" not in self.eval_strategy: raise ValueError("Block reset is only valid for block subsampling.") self.block_idx = 0 diff --git a/promptolution/utils/__init__.py b/promptolution/utils/__init__.py new file mode 100644 index 0000000..eba584c --- /dev/null +++ b/promptolution/utils/__init__.py @@ -0,0 +1,15 @@ +"""Module for utility functions and classes.""" + + +from promptolution.utils.callbacks import ( + BestPromptCallback, + FileOutputCallback, + LoggerCallback, + ProgressBarCallback, + TokenCountCallback, +) +from promptolution.utils.config import ExperimentConfig +from promptolution.utils.logging import get_logger, setup_logging +from promptolution.utils.prompt_creation import create_prompt_variation, create_prompts_from_samples +from promptolution.utils.test_statistics import TestStatistics, get_test_statistic_func, paired_t_test +from promptolution.utils.token_counter import get_token_counter diff --git a/promptolution/callbacks.py b/promptolution/utils/callbacks.py similarity index 92% rename from promptolution/callbacks.py rename to promptolution/utils/callbacks.py index 1895dd2..f5e17f6 100644 --- a/promptolution/callbacks.py +++ b/promptolution/utils/callbacks.py @@ -1,13 +1,15 @@ """Callback classes for logging, saving, and tracking optimization progress.""" + import os from abc import ABC from datetime import datetime -from typing import Literal import pandas as pd from tqdm import tqdm +from typing import Literal + class BaseCallback(ABC): """Base class for optimization callbacks. 
@@ -79,12 +81,10 @@ def on_step_end(self, optimizer): """Log information about the current step.""" self.step += 1 time = datetime.now().strftime("%d-%m-%y %H:%M:%S:%f") - self.logger.critical(f"{time} - ✨Step {self.step} ended✨") - time = datetime.now().strftime("%d-%m-%y %H:%M:%S:%f") - self.logger.critical(f"{time} - ✨Step {self.step} ended✨") + self.logger.critical(f"{time} - ✨ Step {self.step} ended ✨") for i, (prompt, score) in enumerate(zip(optimizer.prompts, optimizer.scores)): - self.logger.critical(f"*** Prompt {i}: Score: {score}") - self.logger.critical(f"{prompt}") + self.logger.critical(f"📝 Prompt {i}: Score: {score}") + self.logger.critical(f"💬 {prompt}") return True @@ -96,13 +96,10 @@ def on_train_end(self, optimizer, logs=None): logs: Additional information to log. """ time = datetime.now().strftime("%d-%m-%y %H:%M:%S:%f") - time = datetime.now().strftime("%d-%m-%y %H:%M:%S:%f") if logs is None: - self.logger.critical(f"{time} - Training ended") - self.logger.critical(f"{time} - Training ended") + self.logger.critical(f"{time} - 🏁 Training ended") else: - self.logger.critical(f"{time} - Training ended - {logs}") - self.logger.critical(f"{time} - Training ended - {logs}") + self.logger.critical(f"{time} - 🏁 Training ended - {logs}") return True diff --git a/promptolution/config.py b/promptolution/utils/config.py similarity index 93% rename from promptolution/config.py rename to promptolution/utils/config.py index 6a00eb5..ae27c2e 100644 --- a/promptolution/config.py +++ b/promptolution/utils/config.py @@ -1,8 +1,10 @@ """Configuration class for the promptolution library.""" -from logging import Logger + from typing import Set -logger = Logger(__name__) +from promptolution.utils.logging import get_logger + +logger = get_logger(__name__) class ExperimentConfig: @@ -68,4 +70,4 @@ def validate(self): all_attributes = {k for k in self.__dict__ if not k.startswith("_")} unused_attributes = all_attributes - self._used_attributes if unused_attributes: - logger.warning(f"Unused configuration attributes: {unused_attributes}") + logger.warning(f"⚠️ Unused configuration attributes: {unused_attributes}") diff --git a/promptolution/utils/logging.py b/promptolution/utils/logging.py new file mode 100644 index 0000000..ef30210 --- /dev/null +++ b/promptolution/utils/logging.py @@ -0,0 +1,41 @@ +"""Logging configuration for the promptolution library.""" + +import logging + +from typing import Optional + + +def get_logger(name: str, level: Optional[int] = None) -> logging.Logger: + """Get a logger with the specified name and level. + + This function provides a standardized way to get loggers throughout the library, + ensuring consistent formatting and behavior. + + Args: + name (str): Name of the logger, typically __name__ of the module. + level (int, optional): Logging level. Defaults to None, which uses the root logger's level. + + Returns: + logging.Logger: Configured logger instance. + """ + logger = logging.getLogger(name) + if level is not None: + logger.setLevel(level) + return logger + + +def setup_logging(level: int = logging.INFO) -> None: + """Set up logging for the promptolution library. + + This function configures the root logger for the library with appropriate + formatting and level. + + Args: + level (int, optional): Logging level. Defaults to logging.INFO. 
+ """ + # Configure the root logger + logging.basicConfig( + level=level, + format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", + ) diff --git a/promptolution/utils/prompt_creation.py b/promptolution/utils/prompt_creation.py index 85a613e..d4764df 100644 --- a/promptolution/utils/prompt_creation.py +++ b/promptolution/utils/prompt_creation.py @@ -1,16 +1,23 @@ """Utility functions for prompt creation.""" -from typing import List, Union import numpy as np -from promptolution.llms.base_llm import BaseLLM -from promptolution.tasks.base_task import BaseTask +from typing import TYPE_CHECKING, List, Union + +if TYPE_CHECKING: + from promptolution.llms.base_llm import BaseLLM + from promptolution.tasks.base_task import BaseTask + +from promptolution.optimizers.templates import ( + PROMPT_CREATION_TEMPLATE, + PROMPT_CREATION_TEMPLATE_TD, + PROMPT_VARIATION_TEMPLATE, +) from promptolution.tasks.classification_tasks import ClassificationTask -from promptolution.templates import PROMPT_CREATION_TEMPLATE, PROMPT_CREATION_TEMPLATE_TD, PROMPT_VARIATION_TEMPLATE -def create_prompt_variation(prompt: Union[List[str], str], llm: BaseLLM, meta_prompt: str = None) -> List[str]: +def create_prompt_variation(prompt: Union[List[str], str], llm: "BaseLLM", meta_prompt: str = None) -> List[str]: """Generate a variation of the given prompt(s) while keeping the semantic meaning. Idea taken from the paper Zhou et al. (2021) https://arxiv.org/pdf/2211.01910 @@ -36,8 +43,8 @@ def create_prompt_variation(prompt: Union[List[str], str], llm: BaseLLM, meta_pr def create_prompts_from_samples( - task: BaseTask, - llm: BaseLLM, + task: "BaseTask", + llm: "BaseLLM", meta_prompt: str = None, n_samples: int = 3, task_description: str = None, diff --git a/promptolution/utils/test_statistics.py b/promptolution/utils/test_statistics.py index df050ef..a776a75 100644 --- a/promptolution/utils/test_statistics.py +++ b/promptolution/utils/test_statistics.py @@ -2,11 +2,12 @@ Implementation of statistical significance tests used in the racing algorithm. Contains paired t-test functionality to compare prompt performance and determine statistical significance between candidates. """ -from typing import Literal import numpy as np from scipy.stats import ttest_rel +from typing import Literal + TestStatistics = Literal["paired_t_test"] diff --git a/promptolution/utils/token_counter.py b/promptolution/utils/token_counter.py index a1352ab..12507a8 100644 --- a/promptolution/utils/token_counter.py +++ b/promptolution/utils/token_counter.py @@ -2,9 +2,10 @@ This module provides a function to count the number of tokens in a given text. """ -from logging import Logger -logger = Logger(__name__) +from promptolution.utils import get_logger + +logger = get_logger(__name__) def get_token_counter(llm): @@ -23,7 +24,7 @@ def get_token_counter(llm): if hasattr(llm, "tokenizer"): token_counter = lambda x: len(llm.tokenizer(x)["input_ids"]) else: - logger.warning("The LLM does not have a tokenizer. Using simple token count.") + logger.warning("⚠️ The LLM does not have a tokenizer. 
Using simple token count.") token_counter = lambda x: len(x.split()) return token_counter diff --git a/pyproject.toml b/pyproject.toml index 2dbeb3a..60e30e3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -55,6 +55,10 @@ line-length = 120 [tool.isort] line_length = 120 profile = "black" +sections = ["FUTURE", "STDLIB", "THIRDPARTY", "TYPING", "FIRSTPARTY", "PROMPTOLUTION", "LOCALFOLDER"] +known_typing = ["typing"] +known_promptolution = ["promptolution"] +lines_between_sections = 1 [tool.pydocstyle] -convention = "google" \ No newline at end of file +convention = "google" diff --git a/pytest.ini b/pytest.ini index 38be796..4ecb1ad 100644 --- a/pytest.ini +++ b/pytest.ini @@ -2,4 +2,4 @@ testpaths = tests python_files = test_*.py python_classes = Test* -python_functions = test_* \ No newline at end of file +python_functions = test_* diff --git a/scripts/api_llm_demo.py b/scripts/api_llm_demo.py index 8734e48..13370ea 100644 --- a/scripts/api_llm_demo.py +++ b/scripts/api_llm_demo.py @@ -1,16 +1,16 @@ """Test run for the Opro optimizer.""" + import argparse from logging import Logger -from promptolution.callbacks import LoggerCallback -from promptolution.templates import EVOPROMPT_GA_TEMPLATE -from promptolution.tasks import ClassificationTask -from promptolution.predictors import MarkerBasedClassificator -from promptolution.optimizers import EvoPromptGA from datasets import load_dataset -from promptolution.llms.api_llm import APILLM +from promptolution.llms import APILLM +from promptolution.optimizers import CAPO +from promptolution.predictors import MarkerBasedClassifier +from promptolution.tasks import ClassificationTask +from promptolution.utils import LoggerCallback logger = Logger(__name__) @@ -45,22 +45,20 @@ "Conduct a thorough analysis of the provided news article and classify it as belonging to one of these four categories: World, Sports, Business, or Tech. 
Your answer should be presented within markers.", ] -llm = APILLM(api_url=args.base_url, llm=args.model, token=args.token) +llm = APILLM(api_url=args.base_url, model_id=args.model, api_key=args.token) downstream_llm = llm meta_llm = llm -predictor = MarkerBasedClassificator(downstream_llm, classes=task.classes) +predictor = MarkerBasedClassifier(downstream_llm, classes=task.classes) callbacks = [LoggerCallback(logger)] -optimizer = EvoPromptGA( +optimizer = CAPO( task=task, - prompt_template=EVOPROMPT_GA_TEMPLATE, predictor=predictor, meta_llm=meta_llm, initial_prompts=initial_prompts, callbacks=callbacks, - n_eval_samples=20, ) best_prompts = optimizer.optimize(n_steps=args.n_steps) diff --git a/scripts/capo_demo.py b/scripts/capo_demo.py index fe4fab5..a03ec78 100644 --- a/scripts/capo_demo.py +++ b/scripts/capo_demo.py @@ -1,27 +1,25 @@ """Test run for the Opro optimizer.""" + import argparse -import random from logging import Logger -from promptolution.callbacks import LoggerCallback, TokenCountCallback, FileOutputCallback -from promptolution.helpers import get_llm -from promptolution.tasks import ClassificationTask -from promptolution.predictors import MarkerBasedClassificator -from promptolution.optimizers import CAPO from datasets import load_dataset +from promptolution.llms import APILLM +from promptolution.optimizers import CAPO +from promptolution.predictors import MarkerBasedClassifier +from promptolution.tasks import ClassificationTask +from promptolution.utils import FileOutputCallback, LoggerCallback, TokenCountCallback + logger = Logger(__name__) parser = argparse.ArgumentParser() -parser.add_argument("--model") -parser.add_argument("--model-storage-path", default="../models/") -parser.add_argument("--output-dir", default="results/evoprompt_ga_test/") -parser.add_argument("--max-model-len", type=int, default=1024) +parser.add_argument("--output-dir", default="results/capo/") +parser.add_argument("--api_url", default="https://api.openai.com/v1") +parser.add_argument("--model_id", default="gpt-4-0613") parser.add_argument("--n-steps", type=int, default=2) -parser.add_argument("--n-eval-samples", type=int, default=20) -parser.add_argument("--token", default=None) -parser.add_argument("--seed", type=int, default=187) +parser.add_argument("--api_key", default=None) args = parser.parse_args() callbacks = [ @@ -30,7 +28,7 @@ TokenCountCallback(100000, "input_tokens"), ] -df = load_dataset("openai/gsm8k", name="main", split="train", revision="main").to_pandas().sample(300, random_state=args.seed) +df = load_dataset("openai/gsm8k", name="main", split="train", revision="main").to_pandas().sample(400) df["input"] = df["question"] df["target"] = df["answer"].str.extract(r"#### (.*)") @@ -40,6 +38,7 @@ description="The dataset consists of elementary school math word problems that require multi-step reasoning to solve. The task is to solve each word problem and provide the final answer.", x_column="input", y_column="target", + eval_strategy="sequential_block", ) initial_prompts = [ @@ -50,39 +49,14 @@ "These word problems require multi-step reasoning. Work through the problem methodically, then place your numerical answer between markers.", "Solve the problem. Answer format: answer", "You are a math tutor helping elementary students with word problems. Explain your reasoning clearly, then provide your answer in the format answer.", - "Examine this multi-step math problem. 
Calculate the solution and ensure you format your final answer within tags as instructed.", - "Find the solution to this word problem using logical reasoning. Your final response must be formatted as your calculated result.", - "Basic arithmetic word problem below. Solve it and clearly indicate your answer between and markers for easy evaluation.", - "Could you please solve this math word problem? I need the final answer wrapped in tags.", - "Analyze and solve the following elementary school math word problem that requires multiple steps of reasoning. Your answer should be provided in this exact format: your numerical answer.", - "Math problem. Solve. Put answer in .", - "Below is a grade school mathematics word problem that may require multiple steps to solve. Please work through it carefully and make sure to format your final numerical answer as answer.", - "Kindly solve this word problem by applying appropriate mathematical operations. Remember that your final answer must be enclosed within tags for proper evaluation.", - "This dataset contains elementary math word problems. Read carefully, solve step by step, and format your answer between tags.", - "I'm practicing math word problems that require multi-step reasoning. Help me solve this one and put the answer in answer format.", - "Solve the following arithmetic word problem. The answer should be a number placed between and tags. No explanations needed - just the formatted answer.", - "You're given a mathematical word problem from elementary school. Your task is to solve it using logical reasoning and mathematical operations. Present your final answer using this format: your answer.", - "Word problem ahead! Use your math skills to find the answer, then format it exactly like this: your numerical solution.", ] -# randomly sample 5 initial prompts -initial_prompts = random.sample(initial_prompts, 5) - -if "vllm" in args.model: - llm = get_llm( - args.model, - batch_size=None, - max_model_len=args.max_model_len, - model_storage_path=args.model_storage_path, - revision="main", - ) -else: - llm = get_llm(args.model, args.token) +llm = APILLM(model_id=args.model_id, api_key=args.api_key, api_url=args.api_url) downstream_llm = llm meta_llm = llm -predictor = MarkerBasedClassificator(downstream_llm, classes=None) +predictor = MarkerBasedClassifier(downstream_llm, classes=None) optimizer = CAPO( task=task, @@ -90,7 +64,6 @@ meta_llm=meta_llm, initial_prompts=initial_prompts, callbacks=callbacks, - n_eval_samples=args.n_eval_samples, ) best_prompts = optimizer.optimize(n_steps=args.n_steps) diff --git a/scripts/evoprompt_demo.py b/scripts/evoprompt_demo.py index 8654e24..4177eb2 100644 --- a/scripts/evoprompt_demo.py +++ b/scripts/evoprompt_demo.py @@ -1,29 +1,26 @@ """Test run for the Opro optimizer.""" + import argparse import random from logging import Logger -from promptolution.callbacks import LoggerCallback, FileOutputCallback, TokenCountCallback -from promptolution.templates import EVOPROMPT_GA_TEMPLATE -from promptolution.helpers import get_llm -from promptolution.tasks import ClassificationTask -from promptolution.predictors import MarkerBasedClassificator -from promptolution.optimizers import EvoPromptGA from datasets import load_dataset +from promptolution.llms import APILLM +from promptolution.optimizers import EVOPROMPT_GA_TEMPLATE, EvoPromptGA +from promptolution.predictors import MarkerBasedClassifier +from promptolution.tasks import ClassificationTask +from promptolution.utils import FileOutputCallback, LoggerCallback, 
TokenCountCallback + logger = Logger(__name__) -"""Run a test run for any of the implemented optimizers.""" parser = argparse.ArgumentParser() -parser.add_argument("--model") -parser.add_argument("--model-storage-path", default="../models/") +parser.add_argument("--model_id", default="meta-llama/Meta-Llama-3-8B-Instruct") parser.add_argument("--output-dir", default="results/evoprompt_ga_test/") -parser.add_argument("--max-model-len", type=int, default=1024) parser.add_argument("--n-steps", type=int, default=2) -parser.add_argument("--n-eval-samples", type=int, default=20) -parser.add_argument("--token", default=None) -parser.add_argument("--seed", type=int, default=187) +parser.add_argument("--base_url", default="https://api.deepinfra.com/v1/openai") +parser.add_argument("--api_url", default=None) args = parser.parse_args() callbacks = [ @@ -32,59 +29,38 @@ TokenCountCallback(100000, "input_tokens"), ] -df = load_dataset("SetFit/ag_news", split="train", revision="main").to_pandas().sample(300, random_state=args.seed) +df = load_dataset("SetFit/ag_news", split="train", revision="main").to_pandas().sample(300) -df["input"] = df["text"] -df["target"] = df["label_text"] task = ClassificationTask( df, description="The dataset contains news articles categorized into four classes: World, Sports, Business, and Tech. The task is to classify each news article into one of the four categories.", - x_column="input", - y_column="target", + x_column="text", + y_column="label_text", + eval_strategy="subsample", + n_subsamples=20, ) initial_prompts = [ - "Classify this news article as World, Sports, Business, or Tech. Provide your answer between and tags.", - "Read the following news article and determine which category it belongs to: World, Sports, Business, or Tech. Your classification must be placed between markers.", "What is the primary category of this news piece? Choose from World, Sports, Business, or Tech. Place your selected category between .", "Analyze this news article and categorize it as either World, Sports, Business, or Tech. Format your answer within tags.", "Your task is to identify whether this news article belongs to World, Sports, Business, or Tech news. Provide your classification between the markers .", "Please review the following news content and classify it into one of these categories: World, Sports, Business, or Tech. Your answer must be formatted with tags.", "Based on the content, determine if this news article falls under World, Sports, Business, or Tech category. Return only your classification within .", "Examine this news article and identify its primary category (World, Sports, Business, or Tech). Your final classification should be enclosed between markers.", - "In this task, you must categorize a news article into one of four classes: World, Sports, Business, or Tech. Remember to place your answer between tags for proper evaluation.", - "Read the provided news excerpt carefully and assign it to either World, Sports, Business, or Tech category. Ensure your answer appears between tags.", - "Considering the main subject matter, classify this news article as World, Sports, Business, or Tech. Format your response with .", - "Determine the appropriate category for this news article from the following options: World, Sports, Business, or Tech. Your selected category must be placed within markers.", - "After analyzing the given news article, assign it to the most suitable category: World, Sports, Business, or Tech. 
Your classification should be enclosed in tags.", - "Your objective is to classify the news article into one of the following categories: World, Sports, Business, or Tech based on its primary focus. Submit your answer between tags.", - "Which category best describes this news article: World, Sports, Business, or Tech? Provide your answer inside markers.", - "As a content classifier, determine if the following news article belongs to World, Sports, Business, or Tech news. Place your answer within tags.", - "Evaluate the following news article and indicate whether it primarily concerns World, Sports, Business, or Tech topics. Your classification must appear between .", - "Given a news article, your task is to determine its primary category from World, Sports, Business, or Tech. The final classification must be provided between tags.", - "Conduct a thorough analysis of the provided news article and classify it as belonging to one of these four categories: World, Sports, Business, or Tech. Your answer should be presented within markers.", - "Simply indicate whether this news article is about World, Sports, Business, or Tech. Include your answer between tags.", ] -# randomly sample 5 initial prompts -initial_prompts = random.sample(initial_prompts, 5) -if "vllm" in args.model: - llm = get_llm( - args.model, - batch_size=None, - max_model_len=args.max_model_len, - model_storage_path=args.model_storage_path, - revision="main", - ) -else: - llm = get_llm(args.model, args.token) +llm = APILLM( + api_url=args.base_url, + model_id=args.model_id, + api_key=args.api_key, +) downstream_llm = llm meta_llm = llm -predictor = MarkerBasedClassificator(downstream_llm, classes=task.classes) +predictor = MarkerBasedClassifier(downstream_llm, classes=task.classes) optimizer = EvoPromptGA( task=task, @@ -93,7 +69,6 @@ meta_llm=meta_llm, initial_prompts=initial_prompts, callbacks=callbacks, - n_eval_samples=args.n_eval_samples, ) best_prompts = optimizer.optimize(n_steps=args.n_steps) diff --git a/scripts/opro_demo.py b/scripts/opro_demo.py index 13cc1d3..eec818f 100644 --- a/scripts/opro_demo.py +++ b/scripts/opro_demo.py @@ -1,46 +1,42 @@ """Test run for the Opro optimizer.""" + import argparse -import random from logging import Logger -from promptolution.callbacks import LoggerCallback, CSVCallback, TokenCountCallback -from promptolution.templates import OPRO_TEMPLATE_TD -from promptolution.helpers import get_llm -from promptolution.tasks import ClassificationTask -from promptolution.predictors import MarkerBasedClassificator -from promptolution.optimizers import Opro from datasets import load_dataset +from promptolution.llms import VLLM +from promptolution.optimizers import OPRO, OPRO_TEMPLATE_TD +from promptolution.predictors import MarkerBasedClassifier +from promptolution.tasks import ClassificationTask +from promptolution.utils import FileOutputCallback, LoggerCallback, TokenCountCallback + logger = Logger(__name__) """Run a test run for any of the implemented optimizers.""" parser = argparse.ArgumentParser() -parser.add_argument("--model") +parser.add_argument("--model", default="meta-llama/Meta-Llama-3-8B-Instruct") parser.add_argument("--model-storage-path", default="../models/") parser.add_argument("--output-dir", default="results/opro_test/") parser.add_argument("--max-model-len", type=int, default=2048) parser.add_argument("--n-steps", type=int, default=999) -parser.add_argument("--token", default=None) parser.add_argument("--seed", type=int, default=187) args = parser.parse_args() callbacks = [ 
LoggerCallback(logger), - CSVCallback(args.output_dir), + FileOutputCallback(args.output_dir, file_type="csv"), TokenCountCallback(5000000, "input_tokens"), ] df = load_dataset("SetFit/ag_news", split="train", revision="main").to_pandas().sample(300, random_state=args.seed) -df["input"] = df["text"] -df["target"] = df["label_text"] - task = ClassificationTask( df, description="The dataset contains news articles categorized into four classes: World, Sports, Business, and Tech. The task is to classify each news article into one of the four categories.", - x_column="input", - y_column="target", + x_column="text", + y_column="label_text", ) initial_prompts = [ @@ -52,39 +48,19 @@ "Please review the following news content and classify it into one of these categories: World, Sports, Business, or Tech. Your answer must be formatted with tags.", "Based on the content, determine if this news article falls under World, Sports, Business, or Tech category. Return only your classification within .", "Examine this news article and identify its primary category (World, Sports, Business, or Tech). Your final classification should be enclosed between markers.", - "In this task, you must categorize a news article into one of four classes: World, Sports, Business, or Tech. Remember to place your answer between tags for proper evaluation.", - "Read the provided news excerpt carefully and assign it to either World, Sports, Business, or Tech category. Ensure your answer appears between tags.", - "Considering the main subject matter, classify this news article as World, Sports, Business, or Tech. Format your response with .", - "Determine the appropriate category for this news article from the following options: World, Sports, Business, or Tech. Your selected category must be placed within markers.", - "After analyzing the given news article, assign it to the most suitable category: World, Sports, Business, or Tech. Your classification should be enclosed in tags.", - "Your objective is to classify the news article into one of the following categories: World, Sports, Business, or Tech based on its primary focus. Submit your answer between tags.", - "Which category best describes this news article: World, Sports, Business, or Tech? Provide your answer inside markers.", - "As a content classifier, determine if the following news article belongs to World, Sports, Business, or Tech news. Place your answer within tags.", - "Evaluate the following news article and indicate whether it primarily concerns World, Sports, Business, or Tech topics. Your classification must appear between .", - "Given a news article, your task is to determine its primary category from World, Sports, Business, or Tech. The final classification must be provided between tags.", - "Conduct a thorough analysis of the provided news article and classify it as belonging to one of these four categories: World, Sports, Business, or Tech. Your answer should be presented within markers.", - "Simply indicate whether this news article is about World, Sports, Business, or Tech. 
Include your answer between tags.", ] -initial_prompts = random.sample(initial_prompts, 10) -if "vllm" in args.model: - llm = get_llm( - args.model, - batch_size=None, - max_model_len=args.max_model_len, - model_storage_path=args.model_storage_path, - revision="main", - ) -else: - llm = get_llm(args.model, args.token) +llm = VLLM( + model_id=args.model, model_storage_path=args.model_storage_path, max_model_len=args.max_model_len, seed=args.seed +) downstream_llm = llm meta_llm = llm -predictor = MarkerBasedClassificator(downstream_llm, classes=task.classes) +predictor = MarkerBasedClassifier(downstream_llm, classes=task.classes) -optimizer = Opro( +optimizer = OPRO( task=task, prompt_template=OPRO_TEMPLATE_TD.replace(" 0 - assert updated_count["output_tokens"] > 0 - - # Reset token count - llm.reset_token_count() - reset_count = llm.get_token_count() - assert reset_count["input_tokens"] == 0 - assert reset_count["output_tokens"] == 0 diff --git a/tests/llms/test_local_llm.py b/tests/llms/test_local_llm.py index 98c7ee3..1165d0a 100644 --- a/tests/llms/test_local_llm.py +++ b/tests/llms/test_local_llm.py @@ -2,7 +2,7 @@ import pytest -from promptolution.llms.local_llm import LocalLLM +from promptolution.llms import LocalLLM @pytest.fixture diff --git a/tests/llms/test_vllm.py b/tests/llms/test_vllm.py index 55d1551..d1f6fc8 100644 --- a/tests/llms/test_vllm.py +++ b/tests/llms/test_vllm.py @@ -2,7 +2,7 @@ import pytest -from promptolution.llms.vllm import VLLM +from promptolution.llms import VLLM @pytest.fixture diff --git a/tests/mocks/mock_optimizer.py b/tests/mocks/mock_optimizer.py index bc3d1f3..c33cd7d 100644 --- a/tests/mocks/mock_optimizer.py +++ b/tests/mocks/mock_optimizer.py @@ -1,9 +1,12 @@ """Mock Optimizer for Testing.""" + from unittest.mock import MagicMock +from promptolution.optimizers.base_optimizer import BaseOptimizer + -class MockOptimizer: +class MockOptimizer(BaseOptimizer): """Mock optimizer for testing callbacks and other components. This class simulates an optimizer without requiring actual optimization processes. 
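
A minimal end-to-end sketch of the flow the updated opro_demo.py above settles on, assuming the constructor signatures visible in this diff (VLLM, MarkerBasedClassifier, OPRO, FileOutputCallback, TokenCountCallback). The OPRO keyword set beyond task, predictor, initial_prompts, and prompt_template is inferred from the EvoPromptGA call and the OPRO tests; the description string, the single initial prompt, and n_steps are illustrative only.

from logging import Logger

from datasets import load_dataset

from promptolution.llms import VLLM
from promptolution.optimizers import OPRO, OPRO_TEMPLATE_TD
from promptolution.predictors import MarkerBasedClassifier
from promptolution.tasks import ClassificationTask
from promptolution.utils import FileOutputCallback, LoggerCallback, TokenCountCallback

logger = Logger(__name__)

# Small AG News sample, wrapped as a classification task over the raw dataset columns.
df = load_dataset("SetFit/ag_news", split="train", revision="main").to_pandas().sample(300, random_state=187)
task = ClassificationTask(
    df,
    description="News articles to be classified as World, Sports, Business, or Tech.",
    x_column="text",
    y_column="label_text",
)

# One local vLLM instance serves as both the downstream and the meta LLM.
llm = VLLM(
    model_id="meta-llama/Meta-Llama-3-8B-Instruct",
    model_storage_path="../models/",
    max_model_len=2048,
    seed=187,
)
predictor = MarkerBasedClassifier(llm, classes=task.classes)

callbacks = [
    LoggerCallback(logger),
    FileOutputCallback("results/opro_test/", file_type="csv"),  # replaces the former CSVCallback
    TokenCountCallback(5000000, "input_tokens"),                # budget on input tokens
]

optimizer = OPRO(
    predictor=predictor,
    task=task,
    meta_llm=llm,
    initial_prompts=["Classify this news article as World, Sports, Business, or Tech."],
    prompt_template=OPRO_TEMPLATE_TD,  # the demo script additionally substitutes the task description into this template
    callbacks=callbacks,
)
best_prompts = optimizer.optimize(n_steps=10)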
diff --git a/tests/mocks/mock_predictor.py b/tests/mocks/mock_predictor.py index 349111d..455deb0 100644 --- a/tests/mocks/mock_predictor.py +++ b/tests/mocks/mock_predictor.py @@ -1,8 +1,9 @@ """Mock predictor for testing purposes.""" -from typing import List, Optional, Tuple import numpy as np +from typing import List, Optional, Tuple + from promptolution.llms.base_llm import BaseLLM from promptolution.predictors.base_predictor import BasePredictor diff --git a/tests/mocks/mock_task.py b/tests/mocks/mock_task.py index 9946ff8..488c3d1 100644 --- a/tests/mocks/mock_task.py +++ b/tests/mocks/mock_task.py @@ -1,9 +1,11 @@ """Mock task for testing purposes.""" -from typing import List + from unittest.mock import MagicMock import numpy as np +from typing import List + from promptolution.tasks.base_task import BaseTask @@ -44,7 +46,7 @@ def evaluate( self, prompts: List[str], predictor, - strategy: str = "subsample", + eval_strategy: str = "subsample", system_prompts: List[str] = None, return_agg_scores: bool = False, return_seq: bool = False, @@ -68,7 +70,7 @@ def evaluate( "prompts": prompts, "predictor": predictor, "system_prompts": system_prompts, - "strategy": strategy, + "eval_strategy": eval_strategy, "return_agg_scores": return_agg_scores, "return_seq": return_seq, } diff --git a/tests/optimizers/test_capo.py b/tests/optimizers/test_capo.py index c76d85e..b0c9ae2 100644 --- a/tests/optimizers/test_capo.py +++ b/tests/optimizers/test_capo.py @@ -1,8 +1,6 @@ from unittest.mock import MagicMock, patch -import numpy as np import pandas as pd -import pytest from promptolution.optimizers.capo import CAPO, CAPOPrompt diff --git a/tests/optimizers/test_evoprompt_de.py b/tests/optimizers/test_evoprompt_de.py index 52fa4e7..8c01adb 100644 --- a/tests/optimizers/test_evoprompt_de.py +++ b/tests/optimizers/test_evoprompt_de.py @@ -3,7 +3,7 @@ import numpy as np import pytest -from promptolution.optimizers.evoprompt_de import EvoPromptDE +from promptolution.optimizers import EvoPromptDE def test_evoprompt_de_initialization(mock_meta_llm, initial_prompts, mock_task, mock_predictor): @@ -15,13 +15,11 @@ def test_evoprompt_de_initialization(mock_meta_llm, initial_prompts, mock_task, prompt_template="Create a new prompt from: , , , ", meta_llm=mock_meta_llm, donor_random=False, - n_eval_samples=15, ) # Only verify the essential properties assert optimizer.prompt_template == "Create a new prompt from: , , , " assert not optimizer.donor_random - assert optimizer.n_eval_samples == 15 assert optimizer.prompts == initial_prompts diff --git a/tests/optimizers/test_evoprompt_ga.py b/tests/optimizers/test_evoprompt_ga.py index 6ea8fc9..a1c3160 100644 --- a/tests/optimizers/test_evoprompt_ga.py +++ b/tests/optimizers/test_evoprompt_ga.py @@ -1,6 +1,6 @@ from unittest.mock import patch -from promptolution.optimizers.evoprompt_ga import EvoPromptGA +from promptolution.optimizers import EvoPromptGA def test_evoprompt_ga_initialization(mock_meta_llm, initial_prompts, mock_task, experiment_config, mock_predictor): diff --git a/tests/optimizers/test_opro.py b/tests/optimizers/test_opro.py index dc0c323..9910442 100644 --- a/tests/optimizers/test_opro.py +++ b/tests/optimizers/test_opro.py @@ -2,12 +2,12 @@ import numpy as np -from promptolution.optimizers.opro import Opro +from promptolution.optimizers import OPRO def test_opro_initialization(mock_meta_llm, initial_prompts, mock_task, mock_predictor): """Test that OPRO initializes correctly.""" - optimizer = Opro( + optimizer = OPRO( predictor=mock_predictor, 
task=mock_task, initial_prompts=initial_prompts, @@ -28,7 +28,7 @@ def test_opro_initialization(mock_meta_llm, initial_prompts, mock_task, mock_pre def test_opro_sample_examples(mock_meta_llm, initial_prompts, mock_task, mock_predictor): """Test the _sample_examples method.""" - optimizer = Opro( + optimizer = OPRO( predictor=mock_predictor, task=mock_task, initial_prompts=initial_prompts, @@ -53,7 +53,7 @@ def test_opro_sample_examples(mock_meta_llm, initial_prompts, mock_task, mock_pr def test_opro_format_instructions(mock_meta_llm, initial_prompts, mock_task, mock_predictor): """Test the _format_instructions method.""" - optimizer = Opro( + optimizer = OPRO( predictor=mock_predictor, task=mock_task, initial_prompts=initial_prompts, @@ -76,7 +76,7 @@ def test_opro_format_instructions(mock_meta_llm, initial_prompts, mock_task, moc def test_opro_pre_optimization_loop(mock_meta_llm, initial_prompts, mock_task, mock_predictor): """Test the _pre_optimization_loop method.""" - optimizer = Opro( + optimizer = OPRO( predictor=mock_predictor, task=mock_task, initial_prompts=initial_prompts, @@ -99,7 +99,7 @@ def test_opro_pre_optimization_loop(mock_meta_llm, initial_prompts, mock_task, m def test_opro_step(mock_meta_llm, initial_prompts, mock_task, mock_predictor): """Test the _step method.""" - optimizer = Opro( + optimizer = OPRO( predictor=mock_predictor, task=mock_task, initial_prompts=initial_prompts, diff --git a/tests/predictors/test_classificators.py b/tests/predictors/test_classifiers.py similarity index 72% rename from tests/predictors/test_classificators.py rename to tests/predictors/test_classifiers.py index c393f77..f561995 100644 --- a/tests/predictors/test_classificators.py +++ b/tests/predictors/test_classifiers.py @@ -1,13 +1,13 @@ import numpy as np import pytest -from promptolution.predictors.classificator import FirstOccurrenceClassificator, MarkerBasedClassificator +from promptolution.helpers import FirstOccurrenceClassifier, MarkerBasedClassifier -def test_first_occurrence_classificator(mock_downstream_llm, mock_df): - """Test the FirstOccurrenceClassificator.""" +def test_first_occurrence_classifier(mock_downstream_llm, mock_df): + """Test the FirstOccurrenceClassifier.""" # Create classifier - classifier = FirstOccurrenceClassificator(llm=mock_downstream_llm, classes=mock_df["y"].values) + classifier = FirstOccurrenceClassifier(llm=mock_downstream_llm, classes=mock_df["y"].values) # Test with multiple inputs xs = np.array(["I love this product!", "I hate this product!", "This product is okay.", "ja ne"]) @@ -24,10 +24,10 @@ def test_first_occurrence_classificator(mock_downstream_llm, mock_df): assert predictions[3] == "positive" -def test_marker_based_classificator(mock_downstream_llm, mock_df): - """Test the MarkerBasedClassificator.""" +def test_marker_based_classifier(mock_downstream_llm, mock_df): + """Test the MarkerBasedClassifier.""" # Create classifier - classifier = MarkerBasedClassificator( + classifier = MarkerBasedClassifier( llm=mock_downstream_llm, classes=mock_df["y"].values, begin_marker="", @@ -56,9 +56,9 @@ def test_marker_based_classificator(mock_downstream_llm, mock_df): def test_marker_based_without_classes(mock_downstream_llm): - """Test MarkerBasedClassificator without predefined classes.""" + """Test MarkerBasedClassifier without predefined classes.""" # Create classifier without classes - classifier = MarkerBasedClassificator( + classifier = MarkerBasedClassifier( llm=mock_downstream_llm, classes=None, # No class restrictions begin_marker="", @@ 
-80,10 +80,10 @@ def test_marker_based_without_classes(mock_downstream_llm): assert predictions[3] == "i dont know" -def test_multiple_prompts_with_classificators(mock_downstream_llm, mock_df): - """Test using multiple prompts with classificators.""" +def test_multiple_prompts_with_classifiers(mock_downstream_llm, mock_df): + """Test using multiple prompts with classifiers.""" # Create classifier - classifier = FirstOccurrenceClassificator(llm=mock_downstream_llm, classes=mock_df["y"].values) + classifier = FirstOccurrenceClassifier(llm=mock_downstream_llm, classes=mock_df["y"].values) # Test with multiple prompts prompts = ["Classify:", "Classify:", "Rate:", "Rate:"] @@ -100,10 +100,10 @@ def test_multiple_prompts_with_classificators(mock_downstream_llm, mock_df): assert predictions[3] == "positive" -def test_sequence_return_with_classificators(mock_downstream_llm, mock_df): - """Test return_seq parameter with classificators.""" +def test_sequence_return_with_classifiers(mock_downstream_llm, mock_df): + """Test return_seq parameter with classifiers.""" # Create classifier - classifier = MarkerBasedClassificator(llm=mock_downstream_llm, classes=mock_df["y"].values) + classifier = MarkerBasedClassifier(llm=mock_downstream_llm, classes=mock_df["y"].values) # Test with return_seq=True prompts = ["Classify:"] @@ -128,15 +128,15 @@ def test_invalid_class_labels(mock_downstream_llm): # Should raise an assertion error with pytest.raises(AssertionError): - FirstOccurrenceClassificator(llm=mock_downstream_llm, classes=invalid_classes) + FirstOccurrenceClassifier(llm=mock_downstream_llm, classes=invalid_classes) with pytest.raises(AssertionError): - MarkerBasedClassificator(llm=mock_downstream_llm, classes=invalid_classes) + MarkerBasedClassifier(llm=mock_downstream_llm, classes=invalid_classes) def test_marker_based_missing_markers(mock_downstream_llm): - """Test MarkerBasedClassificator behavior when markers are missing.""" - classifier = MarkerBasedClassificator(llm=mock_downstream_llm, classes=["will", "not", "be", "used"]) + """Test MarkerBasedClassifier behavior when markers are missing.""" + classifier = MarkerBasedClassifier(llm=mock_downstream_llm, classes=["will", "not", "be", "used"]) # When markers are missing, it should default to first class prompts = ["Classify:"] diff --git a/tests/tasks/test_classifications_tasks.py b/tests/tasks/test_classifications_tasks.py index 4d267f7..d72a7e6 100644 --- a/tests/tasks/test_classifications_tasks.py +++ b/tests/tasks/test_classifications_tasks.py @@ -1,10 +1,8 @@ import numpy as np -import pandas as pd import pytest from sklearn.metrics import accuracy_score -from promptolution.predictors.classificator import FirstOccurrenceClassificator -from promptolution.tasks.classification_tasks import ClassificationTask +from promptolution.tasks import ClassificationTask def test_classification_task_initialization(mock_df): @@ -112,7 +110,7 @@ def test_pop_datapoints(mock_df): task = ClassificationTask( df=mock_df, description="Sentiment classification task", - subsample_strategy="sequential_blocks", + eval_strategy="sequential_blocks", ) df = task.pop_datapoints(n=1) @@ -123,7 +121,7 @@ def test_pop_datapoints(mock_df): def test_blocks(mock_df): task = ClassificationTask( - df=mock_df, description="Sentiment classification task", subsample_strategy="sequential_blocks", block_size=1 + df=mock_df, description="Sentiment classification task", eval_strategy="sequential_blocks", n_subsamples=1 ) # Increment blocks
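
The renames above converge on a single evaluation vocabulary: strategy and subsample_strategy become eval_strategy, while n_eval_samples and block_size become n_subsamples, configured on the task rather than on each optimizer. Below is a minimal sketch of that configuration, assuming APILLM is importable from promptolution.llms (only its call site appears in this diff) and that ClassificationTask.evaluate mirrors the mocked signature above; the tiny DataFrame, description, and prompt are illustrative only.

import pandas as pd

from promptolution.helpers import MarkerBasedClassifier
from promptolution.llms import APILLM  # import path assumed; only the APILLM(...) call is shown in this diff
from promptolution.tasks import ClassificationTask

df = pd.DataFrame(
    {
        "text": ["Stocks rallied after the earnings report.", "The home team won the final 2-1."],
        "label_text": ["Business", "Sports"],
    }
)

# Evaluation behaviour now lives on the task: eval_strategy picks how examples
# are drawn ("subsample", "sequential_blocks", ...), n_subsamples how many per step.
task = ClassificationTask(
    df,
    description="Classify each news snippet as World, Sports, Business, or Tech.",
    x_column="text",
    y_column="label_text",
    eval_strategy="subsample",
    n_subsamples=20,
)

llm = APILLM(
    api_url="https://api.deepinfra.com/v1/openai",
    model_id="meta-llama/Meta-Llama-3-8B-Instruct",
    api_key="YOUR_API_KEY",
)
predictor = MarkerBasedClassifier(llm, classes=task.classes)

# Call shape mirrored from the mock task's evaluate; the real task is expected to match.
scores = task.evaluate(["Classify this news snippet."], predictor, return_agg_scores=True)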