Merged as #248

26 changes: 21 additions & 5 deletions optillm/server.py
@@ -887,14 +887,30 @@ def proxy_models():
     try:
         if server_config['base_url']:
             client = OpenAI(api_key=API_KEY, base_url=server_config['base_url'])
+            # For external API, fetch models using the OpenAI client
+            models_response = client.models.list()
+            # Convert to dict format
+            models_data = {
+                "object": "list",
+                "data": [model.dict() for model in models_response.data]
+            }
         else:
             client = default_client
-
-        # Fetch models using the OpenAI client and return the raw response
-        models_response = client.models.list().json()
+            # For local inference, create a models response manually
+            current_model = server_config.get('model', 'gpt-3.5-turbo')
+            models_data = {
+                "object": "list",
+                "data": [
+                    {
+                        "id": current_model,
+                        "object": "model",
+                        "created": 1677610602,
+                        "owned_by": "optillm"
+                    }
+                ]
+            }
 
         logger.debug('Models retrieved successfully')
-        return models_response, 200
+        return jsonify(models_data), 200
     except Exception as e:
         logger.error(f"Error fetching models: {str(e)}")
         return jsonify({"error": f"Error fetching models: {str(e)}"}), 500
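With this change, /v1/models returns a well-formed OpenAI-style model list in both modes: the upstream models when a base_url is configured, and a single manually built entry for local inference. A minimal sketch of how a client could exercise the patched endpoint, assuming optillm is serving on localhost:8000 (the eval script's DEFAULT_BASE_URL below); the host, port, and api_key are assumptions, not part of the diff:

    from openai import OpenAI

    # Point the client at the optillm proxy; the api_key value is a
    # placeholder when running against a local server.
    client = OpenAI(base_url="http://localhost:8000/v1", api_key="optillm")

    # Either branch of the patched route yields {"object": "list", "data": [...]},
    # so the standard models API works against both external and local backends.
    for model in client.models.list().data:
        print(model.id, model.owned_by)  # e.g. "gpt-3.5-turbo optillm" locally
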
62 changes: 47 additions & 15 deletions scripts/eval_simpleqa_benchmark.py
Expand Up @@ -38,6 +38,7 @@

# Constants
SIMPLEQA_CSV_URL = "https://openaipublic.blob.core.windows.net/simple-evals/simple_qa_test_set.csv"
SIMPLEQA_VERIFIED_CSV_URL = "https://huggingface.co/datasets/codelion/SimpleQA-Verified/raw/main/simpleqa_verified.csv"
DEFAULT_TIMEOUT = 600 # 10 minutes for potentially long research operations
DEFAULT_GRADER_MODEL = "gpt-4o"
DEFAULT_BASE_URL = "http://localhost:8000/v1"
@@ -90,12 +91,14 @@ def __init__(self,
                  grader_model: str = DEFAULT_GRADER_MODEL,
                  timeout: int = DEFAULT_TIMEOUT,
                  cache_dir: str = "cache",
-                 output_dir: str = "results"):
+                 output_dir: str = "results",
+                 use_verified: bool = False):
         self.model = model
         self.approach = approach
         self.base_url = base_url
         self.grader_model = grader_model
         self.timeout = timeout
+        self.use_verified = use_verified
         self.cache_dir = Path(cache_dir)
         self.output_dir = Path(output_dir)
 
@@ -137,16 +140,23 @@ def __init__(self,
 
     def download_dataset(self) -> str:
         """Download SimpleQA dataset if not cached"""
-        cache_file = self.cache_dir / "simple_qa_test_set.csv"
+        if self.use_verified:
+            cache_file = self.cache_dir / "simpleqa_verified.csv"
+            url = SIMPLEQA_VERIFIED_CSV_URL
+            dataset_name = "SimpleQA-Verified"
+        else:
+            cache_file = self.cache_dir / "simple_qa_test_set.csv"
+            url = SIMPLEQA_CSV_URL
+            dataset_name = "SimpleQA"
 
         if cache_file.exists():
-            logger.info(f"Using cached dataset: {cache_file}")
+            logger.info(f"Using cached {dataset_name} dataset: {cache_file}")
             return str(cache_file)
 
-        logger.info(f"Downloading SimpleQA dataset from {SIMPLEQA_CSV_URL}")
+        logger.info(f"Downloading {dataset_name} dataset from {url}")
 
         try:
-            response = requests.get(SIMPLEQA_CSV_URL, timeout=30)
+            response = requests.get(url, timeout=30)
             response.raise_for_status()
 
             with open(cache_file, 'wb') as f:
@@ -176,21 +186,35 @@ def load_dataset(self, num_samples: Optional[int] = None, start_index: int = 0)
                     if num_samples and len(questions) >= num_samples:
                         break
 
-                    # Parse metadata if it's JSON string
-                    try:
-                        metadata = json.loads(row['metadata']) if row['metadata'] else {}
-                    except:
-                        metadata = {}
+                    if self.use_verified:
+                        # SimpleQA-Verified dataset has different fields
+                        metadata = {
+                            'original_index': row.get('original_index', i),
+                            'topic': row.get('topic', ''),
+                            'answer_type': row.get('answer_type', ''),
+                            'multi_step': row.get('multi_step', ''),
+                            'requires_reasoning': row.get('requires_reasoning', ''),
+                            'urls': row.get('urls', '')
+                        }
+                        question_id = row.get('original_index', i)
+                    else:
+                        # Original SimpleQA dataset
+                        try:
+                            metadata = json.loads(row['metadata']) if row.get('metadata') else {}
+                        except:
+                            metadata = {}
+                        question_id = i
 
                     question_data = {
-                        'id': i,
+                        'id': question_id,
                         'metadata': metadata,
                         'question': row['problem'],
                         'gold_answer': row['answer']
                     }
                     questions.append(question_data)
 
-            logger.info(f"Loaded {len(questions)} questions from dataset")
+            dataset_type = "SimpleQA-Verified" if self.use_verified else "SimpleQA"
+            logger.info(f"Loaded {len(questions)} questions from {dataset_type} dataset")
             return questions
 
         except Exception as e:
@@ -377,7 +401,8 @@ def calculate_metrics(self) -> Dict:
     def save_results(self, timestamp: str) -> Tuple[str, str, str]:
         """Save evaluation results to files"""
         # Create output directory for this run
-        run_dir = self.output_dir / f"simpleqa_{self.model}_{self.approach}"
+        dataset_suffix = "_verified" if self.use_verified else ""
+        run_dir = self.output_dir / f"simpleqa{dataset_suffix}_{self.model}_{self.approach}"
         run_dir.mkdir(parents=True, exist_ok=True)
 
         # File paths
@@ -416,9 +441,11 @@ def run_evaluation(self,
         """Run the complete evaluation"""
         timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
 
-        logger.info(f"Starting SimpleQA evaluation")
+        dataset_type = "SimpleQA-Verified" if self.use_verified else "SimpleQA"
+        logger.info(f"Starting {dataset_type} evaluation")
         logger.info(f"Model: {self.model}")
         logger.info(f"Approach: {self.approach}")
+        logger.info(f"Dataset: {dataset_type} ({'1k verified questions' if self.use_verified else '4.3k questions'})")
        logger.info(f"Base URL: {self.base_url}")
         logger.info(f"Timeout: {self.timeout}s")
 
@@ -502,6 +529,10 @@ def parse_args():
     parser.add_argument("--output-dir", type=str, default="results",
                         help="Directory for saving results (default: results)")
 
+    # Dataset selection
+    parser.add_argument("--verified", action="store_true",
+                        help="Use SimpleQA-Verified dataset (1k verified questions) instead of original SimpleQA")
+
     # Debugging
     parser.add_argument("--verbose", action="store_true",
                         help="Enable verbose logging")
@@ -524,7 +555,8 @@ def main():
         grader_model=args.grader_model,
         timeout=args.timeout,
         cache_dir=args.cache_dir,
-        output_dir=args.output_dir
+        output_dir=args.output_dir,
+        use_verified=args.verified
     )
 
     try:
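End to end, the new --verified flag selects the SimpleQA-Verified CSV, switches metadata parsing to the verified schema, and adds a _verified suffix to the results directory. A minimal usage sketch, with the caveat that the evaluator class name (SimpleQAEvaluator) and the model/approach placeholder values are assumptions; only the constructor parameters themselves are visible in this diff:

    # Equivalent CLI form (flags other than --verified are defined outside this diff):
    #   python scripts/eval_simpleqa_benchmark.py --verified

    evaluator = SimpleQAEvaluator(      # class name assumed for illustration
        model="gpt-4o-mini",            # placeholder model
        approach="none",                # placeholder approach
        base_url=DEFAULT_BASE_URL,      # "http://localhost:8000/v1"
        grader_model=DEFAULT_GRADER_MODEL,
        timeout=DEFAULT_TIMEOUT,
        cache_dir="cache",
        output_dir="results",
        use_verified=True,              # downloads simpleqa_verified.csv (1k questions)
    )

    # Results land under results/simpleqa_verified_<model>_<approach>/
    # per the save_results() change above.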