In [None]:
from pathlib import Path

import typer
import pandas as pd
import verifiers as vf
from dotenv import load_dotenv
from openai import OpenAI


# Load environment variables from a .env file before anything else runs
# (presumably the credentials / endpoint settings used by the API client — confirm).
# An explicit check is used instead of `assert`: assert statements are removed
# when Python runs with -O, which would silently skip the load_dotenv() call itself.
if not load_dotenv():
    raise RuntimeError("Failed to load .env file")

In [None]:
# --- Evaluation settings -----------------------------------------------------
# Model identifier handed to the evaluation call.
model = "Qwen/Qwen2.5-7B-Instruct"
# Dataset selector forwarded to the environment loader
# (looks like "<dataset>,<subset>,<split>" — TODO confirm the expected format).
datasets_str = "bdsaglam/musique-mini,answerable,validation"
# Forwarded to the environment loader as `noise_rate`.
noise_rate = 1.0
# Forwarded to the environment loader as `retriever_name`.
retriever = "hybrid"
# Sampling parameters used for generation.
temperature = 0.1
max_new_tokens = 1024

# --- Run summary -------------------------------------------------------------
# Emit the whole banner with a single print call, one entry per output line.
print(
    "üîÆ Starting MuSiQue evaluation",
    "=" * 50,
    f"üìù Model: {model}",
    f"üìä Dataset: {datasets_str} (noise rate: {noise_rate})",
    f"üîç Retriever: {retriever}",
    f"üå°Ô∏è Temperature: {temperature}",
    f"üéØ Max tokens: {max_new_tokens}",
    "=" * 50,
    sep="\n",
)

# Build the MuSiQue environment from the settings chosen earlier in the notebook.
print("üåç Loading MuSiQue environment...")

# Environment-specific options, gathered in one place and unpacked into the loader.
env_options = {
    "datasets_str": datasets_str,
    "noise_rate": noise_rate,
    "retriever_name": retriever,
}
vf_env = vf.load_environment(env_id="vf-musique", **env_options)

# Report how many examples the loaded environment's dataset holds.
print(f"‚úÖ Environment loaded with {len(vf_env.dataset)} examples")

# Create the API client used for generation (OpenAI-compatible, e.g. a vLLM server).
# OpenAI() is called without arguments here; presumably it picks up its
# credentials / endpoint from environment variables — confirm.
print("ü§ñ Using OpenAI-compatible API client...")
client = OpenAI()

# Evaluate the model on the environment: one rollout per example, using the
# sampling parameters configured earlier in the notebook.
print("üîÑ Running evaluation...")
sampling_args = dict(temperature=temperature, max_tokens=max_new_tokens)
results = vf_env.evaluate(
    client,
    model,
    rollouts_per_example=1,
    sampling_args=sampling_args,
)


In [None]:
# Sanity check: show the length of every list-like field on `results` as one tuple,
# in the order prompt, completion, answer, state, info, task, reward.
tuple(
    len(getattr(results, field_name))
    for field_name in ("prompt", "completion", "answer", "state", "info", "task", "reward")
)

In [None]:
from verifiers.types import GenerateOutputs

# Walk over every declared field of GenerateOutputs and print its name followed by
# a sample of the corresponding value on `results`.
# NOTE(review): assumes GenerateOutputs is a Pydantic model — confirm.
# Pydantic v2 exposes the field mapping as `model_fields` and deprecates `__fields__`;
# fall back to `__fields__` so the cell still runs on Pydantic v1.
field_map = getattr(GenerateOutputs, "model_fields", None)
if field_map is None:
    field_map = GenerateOutputs.__fields__

for field in field_map:
    print(field)

    # Fetch the attribute once instead of re-reading it for every use.
    value = getattr(results, field)
    if isinstance(value, list):
        # Show the first element as a sample; an empty list has no element 0,
        # so print a placeholder rather than raising IndexError.
        print(value[0] if value else "[] (empty list)")
    else:
        # Non-list fields: only report the type.
        print(type(value))

In [None]:
# Display the first entry of `results.completion` for a quick manual inspection.
results.completion[0]

In [None]:
# Convert the evaluation results into a dataset object via the environment's
# make_dataset() helper, then show it as the cell output.
ds = vf_env.make_dataset(results)
ds


In [None]:
# Optional: uncomment the line below to save the results dataset as a JSON Lines file.
# ds.to_json("musique-mini-eval-results.jsonl", orient="records", lines=True)