From c7d00161b3e6af233cc26c29e11f4c9b7c560735 Mon Sep 17 00:00:00 2001 From: Asankhaya Sharma Date: Sun, 19 Jan 2025 09:38:33 +0800 Subject: [PATCH 01/11] Update eval_arena_hard_auto_rtc.py - update max tokens --- scripts/eval_arena_hard_auto_rtc.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/eval_arena_hard_auto_rtc.py b/scripts/eval_arena_hard_auto_rtc.py index 890c8019..76ab4835 100644 --- a/scripts/eval_arena_hard_auto_rtc.py +++ b/scripts/eval_arena_hard_auto_rtc.py @@ -17,7 +17,8 @@ logger = logging.getLogger(__name__) # Initialize OpenAI client (only used for chat completions now) -client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY")) +client = OpenAI(base_url="http://localhost:8000/v1", api_key=os.environ.get("OPENAI_API_KEY")) +# client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY")) @dataclass class RTCConfig: @@ -58,8 +59,7 @@ def get_llm_response(messages: List[Dict], model: str) -> Optional[str]: response = client.chat.completions.create( model=model, messages=messages, - temperature=0.7, - max_tokens=1000 + max_tokens=4096 ) return response.choices[0].message.content.strip() except Exception as e: From ac66567e355b838574d2f083eab35feefd0a95be Mon Sep 17 00:00:00 2001 From: Asankhaya Sharma Date: Sun, 19 Jan 2025 09:39:45 +0800 Subject: [PATCH 02/11] Update coc_plugin.py --- optillm/plugins/coc_plugin.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/optillm/plugins/coc_plugin.py b/optillm/plugins/coc_plugin.py index 62ef5ce6..b8f421ec 100644 --- a/optillm/plugins/coc_plugin.py +++ b/optillm/plugins/coc_plugin.py @@ -108,12 +108,10 @@ def sanitize_code(code: str) -> str: # Add safety wrapper wrapper = f""" {imports} - def safe_execute(): import numpy as np # Always allow numpy - {safe_code.replace('\n', '\n ')} + {safe_code.replace('\\n', '\\n ')} return answer if 'answer' in locals() else None - result = safe_execute() answer = result """ From 576c7d530b962778782b51ebf364cb239fe29e0d Mon Sep 17 00:00:00 2001 From: Asankhaya Sharma Date: Sun, 19 Jan 2025 09:47:29 +0800 Subject: [PATCH 03/11] Update coc_plugin.py fix bug --- optillm/plugins/coc_plugin.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/optillm/plugins/coc_plugin.py b/optillm/plugins/coc_plugin.py index b8f421ec..34d18776 100644 --- a/optillm/plugins/coc_plugin.py +++ b/optillm/plugins/coc_plugin.py @@ -104,14 +104,17 @@ def sanitize_code(code: str) -> str: safe_lines.append(line) safe_code = '\n'.join(safe_lines) + safe_code = safe_code.replace('\n', '\n ') # Add safety wrapper wrapper = f""" {imports} + def safe_execute(): import numpy as np # Always allow numpy - {safe_code.replace('\\n', '\\n ')} + {safe_code} return answer if 'answer' in locals() else None + result = safe_execute() answer = result """ From 917141f48163b802b782bcd86df8f1729ccc8665 Mon Sep 17 00:00:00 2001 From: Asankhaya Sharma Date: Sun, 19 Jan 2025 10:03:48 +0800 Subject: [PATCH 04/11] fix dependenices on GPU for local inference add protobuf --- requirements.txt | 3 ++- setup.py | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index af328f48..3eb1bdde 100644 --- a/requirements.txt +++ b/requirements.txt @@ -21,4 +21,5 @@ ipython ipykernel peft bitsandbytes -gradio \ No newline at end of file +gradio +protobuf==3.20.3 \ No newline at end of file diff --git a/setup.py b/setup.py index 25068c22..a19c71a3 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,7 @@ setup( name="optillm", - version="0.0.24", + 
version="0.0.25", packages=find_packages(), py_modules=['optillm'], package_data={ @@ -34,6 +34,7 @@ "peft", "bitsandbytes", "gradio", + "protobuf" ], entry_points={ 'console_scripts': [ From 8e0adfcb839a361fc30359a4aa63532722ad4656 Mon Sep 17 00:00:00 2001 From: Asankhaya Sharma Date: Mon, 20 Jan 2025 09:33:40 +0800 Subject: [PATCH 05/11] update dependencies --- requirements.txt | 3 +-- scripts/requirements.txt | 1 - setup.py | 3 +-- 3 files changed, 2 insertions(+), 5 deletions(-) diff --git a/requirements.txt b/requirements.txt index 3eb1bdde..af328f48 100644 --- a/requirements.txt +++ b/requirements.txt @@ -21,5 +21,4 @@ ipython ipykernel peft bitsandbytes -gradio -protobuf==3.20.3 \ No newline at end of file +gradio \ No newline at end of file diff --git a/scripts/requirements.txt b/scripts/requirements.txt index 8a130edf..dd662e3e 100644 --- a/scripts/requirements.txt +++ b/scripts/requirements.txt @@ -1,4 +1,3 @@ datasets accelerate huggingface_hub -git+https://github.com/huggingface/transformers.git \ No newline at end of file diff --git a/setup.py b/setup.py index a19c71a3..610493c8 100644 --- a/setup.py +++ b/setup.py @@ -33,8 +33,7 @@ "ipykernel", "peft", "bitsandbytes", - "gradio", - "protobuf" + "gradio" ], entry_points={ 'console_scripts': [ From c0f0893c5b502c935181284f0f2b2e46395fae74 Mon Sep 17 00:00:00 2001 From: Asankhaya Sharma Date: Mon, 20 Jan 2025 10:23:02 +0800 Subject: [PATCH 06/11] Update inference.py fix inference on amd gpu --- optillm/inference.py | 87 ++++++++++++++++++++++++++++++++++++++------ 1 file changed, 76 insertions(+), 11 deletions(-) diff --git a/optillm/inference.py b/optillm/inference.py index 3c8a4a4e..3c9901de 100644 --- a/optillm/inference.py +++ b/optillm/inference.py @@ -402,42 +402,107 @@ def __init__(self): self.device_stats = {device: {'memory_used': 0, 'active_models': 0} for device in self.available_devices} def _detect_devices(self) -> List[str]: + """Detect available compute devices including AMD GPUs via ROCm""" devices = ['cpu'] + + # Check for CUDA (NVIDIA) GPUs if torch.cuda.is_available(): - devices.extend([f'cuda:{i}' for i in range(torch.cuda.device_count())]) + backend = torch.cuda.get_device_properties(0).platform + if backend == 'ROCm': + # AMD GPUs via ROCm + devices.extend([f'cuda:{i}' for i in range(torch.cuda.device_count())]) + logging.info("Detected AMD GPU(s) using ROCm backend") + else: + # NVIDIA GPUs + devices.extend([f'cuda:{i}' for i in range(torch.cuda.device_count())]) + logging.info("Detected NVIDIA GPU(s)") + + # Check for Apple M-series GPU if torch.backends.mps.is_available(): devices.append('mps') + logging.info("Detected Apple M-series GPU") + return devices def get_optimal_device(self, model_size: int = 0) -> str: + """Select the optimal device considering AMD GPU support""" if not self.available_devices: return 'cpu' - # Prefer CUDA devices if available + # Get CUDA devices (both NVIDIA and AMD via ROCm) cuda_devices = [d for d in self.available_devices if 'cuda' in d] + if cuda_devices: - # Find CUDA device with most free memory + # Find device with most free memory max_free_memory = 0 optimal_device = cuda_devices[0] - for device in cuda_devices: - idx = int(device.split(':')[1]) - free_memory = torch.cuda.get_device_properties(idx).total_memory - torch.cuda.memory_allocated(idx) - if free_memory > max_free_memory: - max_free_memory = free_memory - optimal_device = device - - return optimal_device + try: + for device in cuda_devices: + idx = int(device.split(':')[1]) + # Get memory info safely handling 
both NVIDIA and AMD + try: + total_memory = torch.cuda.get_device_properties(idx).total_memory + used_memory = torch.cuda.memory_allocated(idx) + free_memory = total_memory - used_memory + except Exception as e: + logging.warning(f"Error getting memory info for device {device}: {e}") + continue + + if free_memory > max_free_memory: + max_free_memory = free_memory + optimal_device = device + + logging.info(f"Selected optimal CUDA device: {optimal_device} with {max_free_memory/1e9:.2f}GB free memory") + return optimal_device + + except Exception as e: + logging.error(f"Error selecting optimal CUDA device: {e}") + # Fall back to first CUDA device if memory query fails + return cuda_devices[0] # Fall back to MPS if available if 'mps' in self.available_devices: return 'mps' + # Final fallback to CPU + logging.info("No GPU detected, using CPU") return 'cpu' def track_device_usage(self, device: str, memory_delta: int): + """Track memory usage for the device""" if device in self.device_stats: self.device_stats[device]['memory_used'] += memory_delta + + def get_device_info(self, device: str) -> Dict[str, Any]: + """Get detailed information about a device""" + info = { + 'type': 'cpu', + 'memory_total': None, + 'memory_used': None, + 'memory_free': None + } + + if 'cuda' in device: + try: + idx = int(device.split(':')[1]) + props = torch.cuda.get_device_properties(idx) + info.update({ + 'type': 'gpu', + 'name': props.name, + 'backend': 'ROCm' if hasattr(props, 'platform') and props.platform == 'ROCm' else 'CUDA', + 'compute_capability': f"{props.major}.{props.minor}", + 'memory_total': props.total_memory, + 'memory_used': torch.cuda.memory_allocated(idx), + 'memory_free': props.total_memory - torch.cuda.memory_allocated(idx) + }) + except Exception as e: + logging.warning(f"Error getting device info for {device}: {e}") + + elif device == 'mps': + info['type'] = 'mps' + + return info class ModelManager: def __init__(self, cache_manager: CacheManager, device_manager: DeviceManager): From 75a5d21e0b31ada37c48d9b04b205fd4dc6e5cd2 Mon Sep 17 00:00:00 2001 From: Asankhaya Sharma Date: Mon, 20 Jan 2025 12:10:25 +0800 Subject: [PATCH 07/11] Revert "Update inference.py" This reverts commit c0f0893c5b502c935181284f0f2b2e46395fae74. 
--- optillm/inference.py | 87 ++++++-------------------------------------- 1 file changed, 11 insertions(+), 76 deletions(-) diff --git a/optillm/inference.py b/optillm/inference.py index 3c9901de..3c8a4a4e 100644 --- a/optillm/inference.py +++ b/optillm/inference.py @@ -402,107 +402,42 @@ def __init__(self): self.device_stats = {device: {'memory_used': 0, 'active_models': 0} for device in self.available_devices} def _detect_devices(self) -> List[str]: - """Detect available compute devices including AMD GPUs via ROCm""" devices = ['cpu'] - - # Check for CUDA (NVIDIA) GPUs if torch.cuda.is_available(): - backend = torch.cuda.get_device_properties(0).platform - if backend == 'ROCm': - # AMD GPUs via ROCm - devices.extend([f'cuda:{i}' for i in range(torch.cuda.device_count())]) - logging.info("Detected AMD GPU(s) using ROCm backend") - else: - # NVIDIA GPUs - devices.extend([f'cuda:{i}' for i in range(torch.cuda.device_count())]) - logging.info("Detected NVIDIA GPU(s)") - - # Check for Apple M-series GPU + devices.extend([f'cuda:{i}' for i in range(torch.cuda.device_count())]) if torch.backends.mps.is_available(): devices.append('mps') - logging.info("Detected Apple M-series GPU") - return devices def get_optimal_device(self, model_size: int = 0) -> str: - """Select the optimal device considering AMD GPU support""" if not self.available_devices: return 'cpu' - # Get CUDA devices (both NVIDIA and AMD via ROCm) + # Prefer CUDA devices if available cuda_devices = [d for d in self.available_devices if 'cuda' in d] - if cuda_devices: - # Find device with most free memory + # Find CUDA device with most free memory max_free_memory = 0 optimal_device = cuda_devices[0] - try: - for device in cuda_devices: - idx = int(device.split(':')[1]) - # Get memory info safely handling both NVIDIA and AMD - try: - total_memory = torch.cuda.get_device_properties(idx).total_memory - used_memory = torch.cuda.memory_allocated(idx) - free_memory = total_memory - used_memory - except Exception as e: - logging.warning(f"Error getting memory info for device {device}: {e}") - continue - - if free_memory > max_free_memory: - max_free_memory = free_memory - optimal_device = device - - logging.info(f"Selected optimal CUDA device: {optimal_device} with {max_free_memory/1e9:.2f}GB free memory") - return optimal_device - - except Exception as e: - logging.error(f"Error selecting optimal CUDA device: {e}") - # Fall back to first CUDA device if memory query fails - return cuda_devices[0] + for device in cuda_devices: + idx = int(device.split(':')[1]) + free_memory = torch.cuda.get_device_properties(idx).total_memory - torch.cuda.memory_allocated(idx) + if free_memory > max_free_memory: + max_free_memory = free_memory + optimal_device = device + + return optimal_device # Fall back to MPS if available if 'mps' in self.available_devices: return 'mps' - # Final fallback to CPU - logging.info("No GPU detected, using CPU") return 'cpu' def track_device_usage(self, device: str, memory_delta: int): - """Track memory usage for the device""" if device in self.device_stats: self.device_stats[device]['memory_used'] += memory_delta - - def get_device_info(self, device: str) -> Dict[str, Any]: - """Get detailed information about a device""" - info = { - 'type': 'cpu', - 'memory_total': None, - 'memory_used': None, - 'memory_free': None - } - - if 'cuda' in device: - try: - idx = int(device.split(':')[1]) - props = torch.cuda.get_device_properties(idx) - info.update({ - 'type': 'gpu', - 'name': props.name, - 'backend': 'ROCm' if hasattr(props, 
'platform') and props.platform == 'ROCm' else 'CUDA', - 'compute_capability': f"{props.major}.{props.minor}", - 'memory_total': props.total_memory, - 'memory_used': torch.cuda.memory_allocated(idx), - 'memory_free': props.total_memory - torch.cuda.memory_allocated(idx) - }) - except Exception as e: - logging.warning(f"Error getting device info for {device}: {e}") - - elif device == 'mps': - info['type'] = 'mps' - - return info class ModelManager: def __init__(self, cache_manager: CacheManager, device_manager: DeviceManager): From e45cb7f0f76f2457b6e9821eb11386f3a2727ebd Mon Sep 17 00:00:00 2001 From: Asankhaya Sharma Date: Mon, 20 Jan 2025 20:46:59 +0800 Subject: [PATCH 08/11] Update publish.yml --- .github/workflows/publish.yml | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index 7d748f4d..3772a2af 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -71,3 +71,29 @@ jobs: labels: ${{ steps.meta.outputs.labels }} cache-from: type=gha cache-to: type=gha,mode=max + + # Extract metadata for slim image + - name: Extract metadata for proxy_only Docker + id: meta-proxy + uses: docker/metadata-action@v5 + with: + images: ghcr.io/${{ github.repository }} + flavor: | + suffix=-slim + tags: | + type=semver,pattern={{version}} + type=semver,pattern={{major}}.{{minor}} + latest + + # Build and push slim image + - name: Build and push proxy_only Docker image + uses: docker/build-push-action@v5 + with: + context: . + file: Dockerfile.proxy_only + push: true + platforms: linux/amd64,linux/arm64 + tags: ${{ steps.meta-proxy.outputs.tags }} + labels: ${{ steps.meta-proxy.outputs.labels }} + cache-from: type=gha,scope=proxy + cache-to: type=gha,scope=proxy,mode=max From d6c11518c2f29c18b58c0daba928e3630a4fc64f Mon Sep 17 00:00:00 2001 From: Asankhaya Sharma Date: Mon, 20 Jan 2025 20:47:20 +0800 Subject: [PATCH 09/11] add plugindir flag and env var --- Dockerfile.proxy_only | 55 ++++++++++++++++++++++++++++++++++ optillm.py | 9 +++--- requirements_proxy_only.txt | 19 ++++++++++++ scripts/eval_aime_benchmark.py | 21 ++++++++----- 4 files changed, 92 insertions(+), 12 deletions(-) create mode 100644 Dockerfile.proxy_only create mode 100644 requirements_proxy_only.txt diff --git a/Dockerfile.proxy_only b/Dockerfile.proxy_only new file mode 100644 index 00000000..bc4cc90b --- /dev/null +++ b/Dockerfile.proxy_only @@ -0,0 +1,55 @@ +# Build stage +FROM python:3.12-slim AS builder + +# Define build argument with default value +ARG PORT=8000 +# Make it available as env variable at runtime +ENV OPTILLM_PORT=$PORT + +# Set working directory +WORKDIR /app + +# Install system dependencies +RUN apt-get update && apt-get install -y --no-install-recommends \ + build-essential \ + python3-dev \ + gcc \ + g++ \ + && rm -rf /var/lib/apt/lists/* + +# Copy only the requirements file first to leverage Docker cache +COPY requirements_proxy_only.txt . 
+ +# Install Python dependencies +RUN pip install --no-cache-dir -r requirements_proxy_only.txt + +# Final stage +FROM python:3.12-slim + +# Install curl for the healthcheck +RUN apt-get update && apt-get install -y --no-install-recommends \ + curl && \ + apt-get clean && rm -rf /var/lib/apt/lists/* + +# Set working directory +WORKDIR /app + +# Copy installed dependencies from builder stage +COPY --from=builder /usr/local/lib/python3.12/site-packages /usr/local/lib/python3.12/site-packages +COPY --from=builder /usr/local/bin /usr/local/bin + +# Copy application code +COPY . . + +# Create a non-root user and switch to it +RUN useradd -m appuser +USER appuser + +# Set environment variables +ENV PYTHONUNBUFFERED=1 + +# Use the ARG in EXPOSE +EXPOSE ${PORT} + +# Run the application +ENTRYPOINT ["python", "optillm.py"] diff --git a/optillm.py b/optillm.py index b5046262..cdd52587 100644 --- a/optillm.py +++ b/optillm.py @@ -158,7 +158,7 @@ def load_plugins(): package_plugin_dir = os.path.join(os.path.dirname(optillm.__file__), 'plugins') # Get local project plugins directory - current_dir = os.getcwd() + current_dir = os.getcwd() if server_config.get("plugins_dir", "") == "" else server_config["plugins_dir"] local_plugin_dir = os.path.join(current_dir, 'optillm', 'plugins') plugin_dirs = [] @@ -664,7 +664,8 @@ def parse_args(): ("--return-full-response", "OPTILLM_RETURN_FULL_RESPONSE", bool, False, "Return the full response including the CoT with tags"), ("--port", "OPTILLM_PORT", int, 8000, "Specify the port to run the proxy"), ("--log", "OPTILLM_LOG", str, "info", "Specify the logging level", list(logging_levels.keys())), - ("--launch-gui", "OPTILLM_LAUNCH_GUI", bool, False, "Launch a Gradio chat interface") + ("--launch-gui", "OPTILLM_LAUNCH_GUI", bool, False, "Launch a Gradio chat interface"), + ("--plugins-dir", "OPTILLM_PLUGINS_DIR", str, "", "Path to the plugins directory"), ] for arg, env, type_, default, help_text, *extra in args_env: @@ -704,11 +705,11 @@ def main(): global server_config # Call this function at the start of main() args = parse_args() - load_plugins() - # Update server_config with all argument values server_config.update(vars(args)) + load_plugins() + port = server_config['port'] # Set logging level from user request diff --git a/requirements_proxy_only.txt b/requirements_proxy_only.txt new file mode 100644 index 00000000..84e02764 --- /dev/null +++ b/requirements_proxy_only.txt @@ -0,0 +1,19 @@ +numpy +networkx +openai +z3-solver +aiohttp +flask +azure.identity +scikit-learn +litellm +requests +beautifulsoup4 +lxml +presidio_analyzer +presidio_anonymizer +nbformat +nbconvert +ipython +ipykernel +gradio \ No newline at end of file diff --git a/scripts/eval_aime_benchmark.py b/scripts/eval_aime_benchmark.py index c834a2fc..2e51ff0f 100644 --- a/scripts/eval_aime_benchmark.py +++ b/scripts/eval_aime_benchmark.py @@ -15,7 +15,7 @@ logger = logging.getLogger(__name__) # Initialize OpenAI client -client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"), base_url="http://localhost:8000/v1") +client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"), base_url="https://ot7nh9nqf4l7b43s.us-east-1.aws.endpoints.huggingface.cloud/v1/") SYSTEM_PROMPT = '''You are solving AIME (American Invitational Mathematics Examination) problems. 
@@ -104,10 +104,11 @@ def get_llm_response(problem: str, model: str) -> Union[str, List[Dict]]: try: response = client.with_options(timeout=1000.0).chat.completions.create( model=model, + temperature=0.2, messages=[ {"role": "user", "content": SYSTEM_PROMPT + problem} ], - max_tokens=8192, + max_tokens=40000, ) # If there's more than one choice, format as attempts @@ -241,18 +242,21 @@ def analyze_results(results: List[Dict], n: int): print("---") def main(model: str, n_attempts: int): - """Main evaluation function.""" + """Main evaluation function that handles gaps in processed indexes.""" os.makedirs("results", exist_ok=True) - # Include n_attempts in filename to keep separate results for different n values results_file = f"evaluation_results_{model.replace('/', '_')}_pass_at_{n_attempts}.json" dataset = load_2024_dataset() existing_results = load_existing_results(results_file) - last_processed_index = get_last_processed_index(existing_results) - for idx, item in enumerate(tqdm(dataset, desc="Evaluating problems")): - if idx <= last_processed_index: + # Create a set of already processed indexes for efficient lookup + processed_indexes = {result['index'] for result in existing_results} + + for _, item in enumerate(tqdm(dataset, desc="Evaluating problems")): + id = int(item['id']) + # Skip if this index has already been processed + if id in processed_indexes: continue problem_text = item['problem'] @@ -263,7 +267,7 @@ def main(model: str, n_attempts: int): is_correct, first_correct = evaluate_pass_at_n(attempts, correct_answer) result = { - "index": idx, + "index": id, "problem": problem_text, "attempts": attempts, "correct_answer": correct_answer, @@ -275,6 +279,7 @@ def main(model: str, n_attempts: int): final_results = load_existing_results(results_file) analyze_results(final_results, n_attempts) + if __name__ == "__main__": parser = argparse.ArgumentParser(description="Evaluate LLM performance on AIME 2024 problems") parser.add_argument("--model", type=str, required=True, help="OpenAI model to use (e.g., gpt-4, gpt-3.5-turbo)") From e14521b657e1153908372f272660215ade3e23b0 Mon Sep 17 00:00:00 2001 From: Asankhaya Sharma Date: Mon, 20 Jan 2025 22:10:16 +0800 Subject: [PATCH 10/11] Update publish.yml --- .github/workflows/publish.yml | 42 +++++++++++++++++------------------ 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index 3772a2af..ae03a5a5 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -50,50 +50,50 @@ jobs: registry: ghcr.io username: ${{ github.actor }} password: ${{ secrets.GITHUB_TOKEN }} - - - name: Extract metadata for Docker - id: meta + + # Extract metadata for proxy_only image + - name: Extract metadata for proxy_only Docker + id: meta-proxy uses: docker/metadata-action@v5 with: images: ghcr.io/${{ github.repository }} + flavor: | + suffix=-slim tags: | type=semver,pattern={{version}} type=semver,pattern={{major}}.{{minor}} latest - - name: Build and push Docker image + # Build and push proxy image + - name: Build and push proxy_only Docker image uses: docker/build-push-action@v5 with: context: . 
+ file: Dockerfile.proxy_only push: true platforms: linux/amd64,linux/arm64 - tags: ${{ steps.meta.outputs.tags }} - labels: ${{ steps.meta.outputs.labels }} - cache-from: type=gha - cache-to: type=gha,mode=max - - # Extract metadata for slim image - - name: Extract metadata for proxy_only Docker - id: meta-proxy + tags: ${{ steps.meta-proxy.outputs.tags }} + labels: ${{ steps.meta-proxy.outputs.labels }} + cache-from: type=gha,scope=proxy + cache-to: type=gha,scope=proxy,mode=max + + - name: Extract metadata for Docker + id: meta uses: docker/metadata-action@v5 with: images: ghcr.io/${{ github.repository }} - flavor: | - suffix=-slim tags: | type=semver,pattern={{version}} type=semver,pattern={{major}}.{{minor}} latest - # Build and push slim image - - name: Build and push proxy_only Docker image + - name: Build and push Docker image uses: docker/build-push-action@v5 with: context: . - file: Dockerfile.proxy_only push: true platforms: linux/amd64,linux/arm64 - tags: ${{ steps.meta-proxy.outputs.tags }} - labels: ${{ steps.meta-proxy.outputs.labels }} - cache-from: type=gha,scope=proxy - cache-to: type=gha,scope=proxy,mode=max + tags: ${{ steps.meta.outputs.tags }} + labels: ${{ steps.meta.outputs.labels }} + cache-from: type=gha + cache-to: type=gha,mode=max From 910d3b3d4fa4ad4b7a2d17a4b0e90749e21a1dc1 Mon Sep 17 00:00:00 2001 From: Asankhaya Sharma Date: Mon, 20 Jan 2025 22:34:36 +0800 Subject: [PATCH 11/11] Update eval_aime_benchmark.py --- scripts/eval_aime_benchmark.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/scripts/eval_aime_benchmark.py b/scripts/eval_aime_benchmark.py index 2e51ff0f..5fc72576 100644 --- a/scripts/eval_aime_benchmark.py +++ b/scripts/eval_aime_benchmark.py @@ -104,11 +104,10 @@ def get_llm_response(problem: str, model: str) -> Union[str, List[Dict]]: try: response = client.with_options(timeout=1000.0).chat.completions.create( model=model, - temperature=0.2, messages=[ {"role": "user", "content": SYSTEM_PROMPT + problem} ], - max_tokens=40000, + max_tokens=8192, ) # If there's more than one choice, format as attempts
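
Note on PATCH 09: the new --plugins-dir flag (and the matching OPTILLM_PLUGINS_DIR environment variable) does not point directly at the folder holding the plugin files; load_plugins() joins the configured directory with optillm/plugins. A minimal sketch of the resolution logic as patched — the example path is hypothetical, and the default of the current working directory is assumed unchanged:

    import os

    # Mirrors the lookup added to load_plugins() in PATCH 09; the path below is only an example.
    server_config = {"plugins_dir": os.environ.get("OPTILLM_PLUGINS_DIR", "/opt/my_plugins")}

    current_dir = os.getcwd() if server_config.get("plugins_dir", "") == "" else server_config["plugins_dir"]
    local_plugin_dir = os.path.join(current_dir, 'optillm', 'plugins')

    # Custom plugins are therefore expected under <plugins-dir>/optillm/plugins/*.py
    print(local_plugin_dir)  # e.g. /opt/my_plugins/optillm/plugins

PATCH 09 also moves the load_plugins() call so that it runs after server_config.update(vars(args)); that reordering is what makes the flag and the environment variable visible at the time the plugin directory is resolved.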