2 changes: 1 addition & 1 deletion optillm/__init__.py
@@ -1,5 +1,5 @@
# Version information
__version__ = "0.3.4"
__version__ = "0.3.5"

# Import from server module
from .server import (
23 changes: 19 additions & 4 deletions optillm/plugins/proxy_plugin.py
@@ -120,12 +120,17 @@ def run(system_prompt: str, initial_query: str, client, model: str,

if not config.get('providers'):
logger.warning("No providers configured, falling back to original client")
# Strip stream parameter to force complete response
api_config = dict(request_config or {})
api_config.pop('stream', None)

response = client.chat.completions.create(
model=model,
messages=[
{"role": "system", "content": system_prompt},
{"role": "user", "content": initial_query}
]
],
**api_config
)
# Return full response dict to preserve all usage information
response_dict = response.model_dump() if hasattr(response, 'model_dump') else response
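
The hunks in this file all apply the same pattern: copy the per-request config, drop 'stream' so the provider returns one complete response, and forward the remaining options as keyword arguments. A minimal sketch of that pattern in isolation, assuming an OpenAI-compatible client; the function name is illustrative, not part of proxy_plugin.py:

# Illustrative sketch of the fallback call above; call_without_streaming is a
# hypothetical helper name, not a function defined in the plugin.
def call_without_streaming(client, model, system_prompt, user_query, request_config=None):
    # Copy the per-request config and drop 'stream' so the provider returns
    # one complete response instead of an SSE stream.
    api_config = dict(request_config or {})
    api_config.pop('stream', None)
    return client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_query},
        ],
        **api_config,
    )
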
@@ -204,12 +209,17 @@ def run(system_prompt: str, initial_query: str, client, model: str,
if not supports_system_messages:
logger.info(f"Using fallback message formatting for {model} (no system message support)")

# Strip stream parameter to force complete response
# server.py will handle converting to SSE streaming format if needed
api_config = dict(request_config or {})
api_config.pop('stream', None)

response = proxy_client.chat.completions.create(
model=model,
messages=messages,
**(request_config or {})
**api_config
)

# Return full response dict to preserve all usage information
response_dict = response.model_dump() if hasattr(response, 'model_dump') else response
return response_dict, 0
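
Per the comment in this hunk, the plugin always requests a complete response and leaves any streaming to the caller. A rough sketch, not the actual server.py code, of how a complete chat-completion dict could be re-wrapped as server-sent events when the original request asked for stream=True:

import json

# Rough sketch (not the actual server.py implementation) of re-wrapping a
# complete chat completion dict as a single SSE chunk plus the [DONE] marker.
def to_sse_events(response_dict):
    choice = response_dict["choices"][0]
    chunk = {
        "id": response_dict.get("id"),
        "object": "chat.completion.chunk",
        "model": response_dict.get("model"),
        "choices": [{
            "index": 0,
            "delta": {"content": choice["message"]["content"]},
            "finish_reason": choice.get("finish_reason"),
        }],
    }
    yield f"data: {json.dumps(chunk)}\n\n"
    yield "data: [DONE]\n\n"
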
@@ -218,12 +228,17 @@ def run(system_prompt: str, initial_query: str, client, model: str,
logger.error(f"Proxy plugin error: {e}", exc_info=True)
# Fallback to original client
logger.info("Falling back to original client")
# Strip stream parameter to force complete response
api_config = dict(request_config or {})
api_config.pop('stream', None)

response = client.chat.completions.create(
model=model,
messages=[
{"role": "system", "content": system_prompt},
{"role": "user", "content": initial_query}
]
],
**api_config
)
# Return full response dict to preserve all usage information
response_dict = response.model_dump() if hasattr(response, 'model_dump') else response
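
All three return paths normalize the response the same way so token usage and other fields survive. Sketched in isolation, assuming openai>=1.x where responses are Pydantic models exposing model_dump(), and that anything else is already dict-like:

def to_response_dict(response):
    # Pydantic response objects (openai>=1.x) expose model_dump();
    # plain dicts from other clients pass through unchanged.
    if hasattr(response, "model_dump"):
        return response.model_dump()
    return response
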
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

[project]
name = "optillm"
version = "0.3.4"
version = "0.3.5"
description = "An optimizing inference proxy for LLMs."
readme = "README.md"
license = "Apache-2.0"