75 changes: 61 additions & 14 deletions optillm.py
@@ -306,6 +306,52 @@ async def run_approach(approach):
responses, tokens = zip(*results)
return list(responses), sum(tokens)

def execute_n_times(n: int, approaches, operation: str, system_prompt: str, initial_query: str, client: Any, model: str) -> Tuple[Union[str, List[str]], int]:
"""
Execute the pipeline n times and return the collected responses.

Args:
n (int): Number of times to run the pipeline
approaches (list): List of approaches to execute
operation (str): Operation type ('SINGLE', 'AND', or 'OR')
system_prompt (str): System prompt
initial_query (str): Initial query
client: OpenAI client instance
model (str): Model identifier

Returns:
Tuple[Union[str, List[str]], int]: A single response when n == 1, otherwise a list of responses, plus the total token count
"""
responses = []
total_tokens = 0

for _ in range(n):
if operation == 'SINGLE':
response, tokens = execute_single_approach(approaches[0], system_prompt, initial_query, client, model)
elif operation == 'AND':
response, tokens = execute_combined_approaches(approaches, system_prompt, initial_query, client, model)
elif operation == 'OR':
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
response, tokens = loop.run_until_complete(execute_parallel_approaches(approaches, system_prompt, initial_query, client, model))
loop.close()
else:
raise ValueError(f"Unknown operation: {operation}")

# If response is already a list (from OR operation), extend responses
# Otherwise append the single response
if isinstance(response, list):
responses.extend(response)
else:
responses.append(response)
total_tokens += tokens

# If n=1 and we got a single response, return it as is
# Otherwise return the list of responses
if n == 1 and len(responses) == 1:
return responses[0], total_tokens
return responses, total_tokens

def generate_streaming_response(final_response, model):
# Yield the final response
if isinstance(final_response, list):
@@ -393,11 +439,12 @@ def proxy():
stream = data.get('stream', False)
messages = data.get('messages', [])
model = data.get('model', server_config['model'])
n = data.get('n', server_config['n']) # Get n value from request or config

optillm_approach = data.get('optillm_approach', server_config['approach'])
logger.debug(data)
server_config['mcts_depth'] = data.get('mcts_depth', server_config['mcts_depth'])
server_config['mcts_exploration' ] = data.get('mcts_exploration', server_config['mcts_exploration'])
server_config['mcts_exploration'] = data.get('mcts_exploration', server_config['mcts_exploration'])
server_config['mcts_simulations'] = data.get('mcts_simulations', server_config['mcts_simulations'])

system_prompt, initial_query, message_optillm_approach = parse_conversation(messages)
@@ -428,26 +475,26 @@ def proxy():
contains_none = any(approach == 'none' for approach in approaches)

if operation == 'SINGLE' and approaches[0] == 'none':
# For none approach, return the response directly
result, _ = execute_single_approach(approaches[0], system_prompt, initial_query, client, model)
# For none approach with n>1, make n separate calls
if n > 1:
responses = []
completion_tokens = 0
for _ in range(n):
result, tokens = execute_single_approach(approaches[0], system_prompt, initial_query, client, model)
responses.append(result)
completion_tokens += tokens
result = responses
else:
result, completion_tokens = execute_single_approach(approaches[0], system_prompt, initial_query, client, model)
logger.debug(f'Direct proxy response: {result}')
return jsonify(result), 200

elif operation == 'AND' or operation == 'OR':
if contains_none:
raise ValueError("'none' approach cannot be combined with other approaches")

# Handle non-none approaches
if operation == 'SINGLE':
response, completion_tokens = execute_single_approach(approaches[0], system_prompt, initial_query, client, model)
elif operation == 'AND':
response, completion_tokens = execute_combined_approaches(approaches, system_prompt, initial_query, client, model)
elif operation == 'OR':
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
response, completion_tokens = loop.run_until_complete(execute_parallel_approaches(approaches, system_prompt, initial_query, client, model))
else:
raise ValueError(f"Unknown operation: {operation}")
# Handle non-none approaches with n attempts
response, completion_tokens = execute_n_times(n, approaches, operation, system_prompt, initial_query, client, model)

except Exception as e:
logger.error(f"Error processing request: {str(e)}")
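For reference, a minimal sketch of how a client could exercise the new n parameter once the proxy is running. It assumes the proxy is reachable at http://localhost:8888/v1 (the base URL used by the evaluation script below); the model name and the optillm_approach value are illustrative placeholders, and n is the field that proxy() reads via data.get('n', ...) and forwards to execute_n_times.

import os
from openai import OpenAI

# Point the standard OpenAI client at the local optillm proxy (port assumed).
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"), base_url="http://localhost:8888/v1")

response = client.chat.completions.create(
    model="gpt-4o-mini",  # placeholder model identifier
    messages=[{"role": "user", "content": "What is 2 + 2?"}],
    n=4,  # number of pipeline runs requested from the proxy
    extra_body={"optillm_approach": "moa"},  # illustrative approach name, read from the request body
)

How the n completions are packaged into the HTTP response is handled further down in proxy() and is not shown in this diff.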
141 changes: 75 additions & 66 deletions scripts/eval_aime_benchmark.py
@@ -4,10 +4,8 @@
import logging
import re
import time

from typing import List, Dict, Tuple, Optional
from datetime import datetime

from openai import OpenAI
from datasets import load_dataset
from tqdm import tqdm
@@ -17,7 +15,7 @@
logger = logging.getLogger(__name__)

# Initialize OpenAI client
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"), base_url="http://localhost:8000/v1")
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"), base_url="http://localhost:8888/v1")

SYSTEM_PROMPT = '''You are solving AIME (American Invitational Mathematics Examination) problems.

@@ -48,50 +46,30 @@ def extract_answer(response: str) -> Optional[int]:
"""
Extract the numerical answer from a math solution response.
Handles various formats of boxed answers and falls back to last number if needed.

Args:
response (str): The complete response text from the model

Returns:
Optional[int]: The extracted answer as an integer, or None if no valid answer is found
"""
if not response:
return None

# Clean the response: normalize whitespace and handle potential Unicode
# Clean the response
response = ' '.join(response.split())

# List of regex patterns to try, in order of preference
patterns = [
# $n=\boxed{X}$ format
r'\$n=\\boxed{(\d+)}\$',

# LaTeX display style answer: \[\boxed{X}\] or \[\boxed{X}.\]
r'\\\[\\boxed{(\d+)}\\\]',
r'\\\[\\boxed{(\d+)}\.\\\]',

# Inline LaTeX \boxed{X}
r'\\boxed{(\d+)}',

# Common variations
r'\$\\boxed{(\d+)}\$',
r'boxed{(\d+)}',

# Less strict patterns
r'\\boxed\s*{\s*(\d+)\s*}',
r'\bboxed\s*{\s*(\d+)\s*}',

# Plain text answer indicators
r'final answer is[^\d]*(\d+)',
r'answer is[^\d]*(\d+)',
r'answer:[^\d]*(\d+)',
r'= ?(\d+)$'
]

# Try each pattern in order
for pattern in patterns:
matches = re.finditer(pattern, response, re.IGNORECASE)
# Get the last match for this pattern (in case there are multiple)
last_match = None
for match in matches:
last_match = match
@@ -102,47 +80,70 @@ def extract_answer(response: str) -> Optional[int]:
except (ValueError, IndexError):
continue

# Fallback: Extract all numbers and take the last one
# This is our last resort, assuming the answer typically comes last
numbers = re.findall(r'(\d+)', response)
if numbers:
try:
# Convert to int and return the last number found
return int(numbers[-1])
except ValueError:
pass

# If all methods fail, return None
return None

def get_llm_response(problem: str, model: str) -> str:
"""
Get response from the LLM for a given problem.
"""
try:
response = client.chat.completions.create(
response = client.with_options(timeout=1000.0).chat.completions.create(
model=model,
messages=[
# {"role": "system", "content": SYSTEM_PROMPT},
{"role": "user", "content": SYSTEM_PROMPT + problem}
],
max_tokens=8192,
# extra_body={
# "decoding": "entropy_decoding",
# }
)
return response.choices[0].message.content.strip()
except Exception as e:
logger.error(f"Error getting LLM response: {e}")
return ""

def evaluate_response(predicted_answer: Optional[int], correct_answer: int) -> bool:
def make_n_attempts(problem: str, model: str, n: int) -> List[Dict]:
"""
Make n attempts to solve a problem and return all responses and predictions.

Args:
problem (str): The problem text
model (str): The model identifier
n (int): Number of attempts to make

Returns:
List[Dict]: List of dictionaries containing response and predicted answer for each attempt
"""
attempts = []
for i in range(n):
response = get_llm_response(problem, model)
predicted_answer = extract_answer(response)
attempts.append({
"attempt_number": i + 1,
"response": response,
"predicted_answer": predicted_answer
})
return attempts

def evaluate_pass_at_n(attempts: List[Dict], correct_answer: int) -> Tuple[bool, Optional[int]]:
"""
Evaluate if the predicted answer matches the correct answer.
Evaluate if any of the n attempts got the correct answer.

Args:
attempts (List[Dict]): List of attempt results
correct_answer (int): The correct answer

Returns:
Tuple[bool, Optional[int]]: (whether any attempt was correct, first correct attempt number)
"""
if predicted_answer is None:
return False
return predicted_answer == correct_answer
for attempt in attempts:
if attempt["predicted_answer"] == correct_answer:
return True, attempt["attempt_number"]
return False, None

def load_existing_results(filename: str) -> List[Dict]:
"""Load existing results from file if it exists."""
Expand All @@ -165,76 +166,84 @@ def get_last_processed_index(results: List[Dict]) -> int:
return -1
return max(int(r.get('index', -1)) for r in results)

def analyze_results(results: List[Dict]):
"""Analyze and print summary statistics of the results."""
def analyze_results(results: List[Dict], n: int):
"""
Analyze and print summary statistics of the results.

Args:
results (List[Dict]): List of evaluation results
n (int): Number of attempts per problem
"""
total = len(results)
correct = sum(1 for r in results if r['is_correct'])
accuracy = correct / total if total > 0 else 0

print("\n=== Results Summary ===")
print(f"Evaluation mode: pass@{n}")
print(f"Total problems: {total}")
print(f"Correct answers: {correct}")
print(f"Accuracy: {accuracy:.2%}")

# Print incorrect problems for analysis
print("\n=== Incorrect Answers ===")
# Calculate attempt statistics
successful_attempts = [r['first_correct_attempt'] for r in results if r['is_correct']]
if successful_attempts:
avg_attempts = sum(successful_attempts) / len(successful_attempts)
print(f"\nFor correct solutions:")
print(f"Average attempts needed: {avg_attempts:.2f}")
print(f"Attempt distribution:")
for i in range(1, n + 1):
count = sum(1 for x in successful_attempts if x == i)
print(f" Attempt {i}: {count} problems")

print("\n=== Incorrect Problems ===")
for r in results:
if not r['is_correct']:
print(f"Problem {r['index']}:")
print(f"Expected: {r['correct_answer']}")
print(f"Predicted: {r['predicted_answer']}")
print("Predicted answers across attempts:", [
attempt['predicted_answer'] for attempt in r['attempts']
])
print("---")

def main(model: str):
def main(model: str, n_attempts: int):
"""Main evaluation function."""
# Create results directory if it doesn't exist
os.makedirs("results", exist_ok=True)

# Setup results file
results_file = f"evaluation_results_{model.replace('/', '_')}.json"
# Include n_attempts in filename to keep separate results for different n values
results_file = f"evaluation_results_{model.replace('/', '_')}_pass_at_{n_attempts}.json"

# Load dataset
dataset = load_2024_dataset()

# Load existing results
existing_results = load_existing_results(results_file)
last_processed_index = get_last_processed_index(existing_results)

# Process problems
for idx, item in enumerate(tqdm(dataset, desc="Evaluating problems")):
if idx <= last_processed_index:
continue

problem_text = item['problem']
correct_answer = int(item['answer'])

# Get model response
response = get_llm_response(problem_text, model)
logger.debug(f"Response: {response}")
predicted_answer = extract_answer(response)
is_correct = evaluate_response(predicted_answer, correct_answer)
# Make n attempts for each problem
attempts = make_n_attempts(problem_text, model, n_attempts)
is_correct, first_correct = evaluate_pass_at_n(attempts, correct_answer)

# Save result
result = {
"index": idx,
"problem": problem_text,
"model_response": response,
"predicted_answer": predicted_answer,
"attempts": attempts,
"correct_answer": correct_answer,
"is_correct": is_correct
"is_correct": is_correct,
"first_correct_attempt": first_correct
}
save_result(results_file, result)

# Optional: Add delay between requests if needed
time.sleep(300)

# Analyze results
final_results = load_existing_results(results_file)
analyze_results(final_results)
analyze_results(final_results, n_attempts)

if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Evaluate LLM performance on AIME 2024 problems")
parser.add_argument("--model", type=str, required=True, help="OpenAI model to use (e.g., gpt-4, gpt-3.5-turbo)")
parser.add_argument("--n", type=int, default=1, help="Number of attempts per problem (for pass@n evaluation)")
args = parser.parse_args()

main(args.model)
main(args.model, args.n)
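After a run such as python scripts/eval_aime_benchmark.py --model gpt-4o-mini --n 4 (the model name is illustrative; only the --model and --n flags are defined above), the per-problem results land in the pass_at_n-specific JSON file built in main(). A minimal sketch for inspecting that file offline, assuming save_result() stores the results as a JSON list of the per-problem dicts shown above:

import json

# Filename pattern from main(); the model name and n used here are placeholders.
results_file = "evaluation_results_gpt-4o-mini_pass_at_4.json"

with open(results_file) as f:
    results = json.load(f)  # assumed to be a list of per-problem result dicts

solved = [r for r in results if r["is_correct"]]
print(f"pass@4 accuracy: {len(solved)}/{len(results)}")
# first_correct_attempt records which attempt first matched the reference answer
print("first correct attempts:", [r["first_correct_attempt"] for r in solved])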