diff --git a/optillm/__init__.py b/optillm/__init__.py
index ca7b60e..50b9850 100644
--- a/optillm/__init__.py
+++ b/optillm/__init__.py
@@ -1,5 +1,5 @@
 # Version information
-__version__ = "0.3.4"
+__version__ = "0.3.5"
 
 # Import from server module
 from .server import (
diff --git a/optillm/plugins/proxy_plugin.py b/optillm/plugins/proxy_plugin.py
index 9e8a6c6..edcdc47 100644
--- a/optillm/plugins/proxy_plugin.py
+++ b/optillm/plugins/proxy_plugin.py
@@ -120,12 +120,17 @@ def run(system_prompt: str, initial_query: str, client, model: str,
 
         if not config.get('providers'):
             logger.warning("No providers configured, falling back to original client")
+            # Strip stream parameter to force complete response
+            api_config = dict(request_config or {})
+            api_config.pop('stream', None)
+
             response = client.chat.completions.create(
                 model=model,
                 messages=[
                     {"role": "system", "content": system_prompt},
                     {"role": "user", "content": initial_query}
-                ]
+                ],
+                **api_config
             )
             # Return full response dict to preserve all usage information
             response_dict = response.model_dump() if hasattr(response, 'model_dump') else response
@@ -204,12 +209,17 @@ def run(system_prompt: str, initial_query: str, client, model: str,
 
         if not supports_system_messages:
             logger.info(f"Using fallback message formatting for {model} (no system message support)")
+        # Strip stream parameter to force complete response
+        # server.py will handle converting to SSE streaming format if needed
+        api_config = dict(request_config or {})
+        api_config.pop('stream', None)
+
         response = proxy_client.chat.completions.create(
             model=model,
             messages=messages,
-            **(request_config or {})
+            **api_config
         )
-        
+
         # Return full response dict to preserve all usage information
         response_dict = response.model_dump() if hasattr(response, 'model_dump') else response
         return response_dict, 0
@@ -218,12 +228,17 @@
         logger.error(f"Proxy plugin error: {e}", exc_info=True)
         # Fallback to original client
         logger.info("Falling back to original client")
+        # Strip stream parameter to force complete response
+        api_config = dict(request_config or {})
+        api_config.pop('stream', None)
+
         response = client.chat.completions.create(
             model=model,
             messages=[
                 {"role": "system", "content": system_prompt},
                 {"role": "user", "content": initial_query}
-            ]
+            ],
+            **api_config
         )
         # Return full response dict to preserve all usage information
         response_dict = response.model_dump() if hasattr(response, 'model_dump') else response
diff --git a/pyproject.toml b/pyproject.toml
index e8e3fd3..5a7fd95 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "optillm"
-version = "0.3.4"
+version = "0.3.5"
 description = "An optimizing inference proxy for LLMs."
 readme = "README.md"
 license = "Apache-2.0"