In [2]:
# Install the notebook's third-party dependencies with uv's pip interface.
# NOTE(review): no versions are pinned and camel-ai is installed straight from
# the repository's default branch, so results may drift between runs — consider pinning.
!uv pip install torchvision
!uv pip install datasets transformers
!uv pip install "camel-ai[all] @ git+https://github.com/camel-ai/camel.git"

[2mUsing Python 3.10.10 environment at: /home/zeus/miniconda3/envs/cloudspace[0m
[2mAudited [1m1 package[0m [2min 11ms[0m[0m


[2mUsing Python 3.10.10 environment at: /home/zeus/miniconda3/envs/cloudspace[0m
[2mAudited [1m2 packages[0m [2min 13ms[0m[0m
[2mUsing Python 3.10.10 environment at: /home/zeus/miniconda3/envs/cloudspace[0m
[2K[2mResolved [1m401 packages[0m [2min 197ms[0m[0m                                       [0m
[2mAudited [1m401 packages[0m [2min 0.16ms[0m[0m


In [3]:
import asyncio
import os
import json
from tqdm import tqdm
import pandas as pd
from typing import List, Dict
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import asyncio.exceptions

from camel.extractors import BaseExtractor, BoxedStrategy
from camel.verifiers import MathVerifier, VerificationOutcome
from camel.logger import get_logger, set_log_level

# Set up logger: one notebook-wide logger (used by the cells below) with the
# camel log level set to INFO.
logger = get_logger(__name__)
set_log_level('INFO')

In [4]:
# Initialize extractor
logger.info("Setting up extractor...")
extractor = BaseExtractor([[BoxedStrategy()]])
await extractor.setup()

test_string = "The answer is $\\boxed{\\dfrac{9}{7}}$ vertical asymptotes."
logger.info(f"Testing extractor with: {test_string}")
test_result = await extractor.extract(test_string)
logger.info(f"Extractor test result: {test_result} (type: {type(test_result)})")

2025-04-11 13:48:40,769 - camel.__main__ - INFO - Setting up extractor...
2025-04-11 13:48:41,771 - camel.camel.extractors.base - INFO - BaseExtractor initialized successfully
2025-04-11 13:48:41,772 - camel.__main__ - INFO - Testing extractor with: The answer is $\boxed{\dfrac{9}{7}}$ vertical asymptotes.
2025-04-11 13:48:41,773 - camel.__main__ - INFO - Extractor test result: \dfrac{9}{7} (type: <class 'str'>)


In [5]:
# Initialize math verifier with extractor
logger.info("Setting up math verifier...")
math_verifier = MathVerifier(
    extractor=extractor,
    float_rounding=6,
    numeric_precision=15,
    enable_wrapping=True
)
await math_verifier.setup()

2025-04-11 13:48:42,130 - camel.__main__ - INFO - Setting up math verifier...
2025-04-11 13:48:44,134 - camel.camel.verifiers.base - INFO - MathVerifier initialized with batch_size=10, max_parallel=1


In [6]:
await math_verifier.verify("the solution is \\boxed{2}", "4/2")

VerificationResult(status=<VerificationOutcome.SUCCESS: 'success'>, result='$$ 2 $$', duration=0.3955972194671631, timestamp=datetime.datetime(2025, 4, 11, 13, 48, 44, 905306), metadata={'attempt': 1}, error_message=None)

In [7]:
# Initialize Qwen model
logger.info("Initializing Qwen model...")
model_name = "Qwen/Qwen2.5-0.5B-Instruct"

# `is_available` has to be *called*: the bare function object is always truthy,
# which previously selected "cuda" even on machines without a GPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Half precision is only used on the GPU; the CPU fallback runs in float32.
model_dtype = torch.float16 if device == "cuda" else torch.float32

# NOTE(review): trust_remote_code=True allows repository-supplied code to run —
# confirm it is actually required for this checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    trust_remote_code=True,
    torch_dtype=model_dtype
).to(device)


# Define the prompt template with clearer formatting instructions
SYSTEM_PROMPT = """You are an agent designed to answer mathematical questions with clarity and precision. Your task is to provide a step-by-step explanation for
any mathematical problem posed by the user, ensuring the response is easy to follow. Adhere to these guidelines:
Analyze the mathematical question carefully and break down the solution process into clear, logical steps.
Use natural language to explain each step, incorporating LaTeX notation (e.g., $x + 2$)
for mathematical expressions when helpful. Conclude your response with the final answer enclosed
in a LaTeX \\boxed{} environment (e.g., \\boxed{5}).
Place this at the end of your explanation as a standalone statement."""

2025-04-11 13:48:46,460 - camel.__main__ - INFO - Initializing Qwen model...


Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


In [8]:
# Fetch the algebra test split and keep at most its first five problems.
logger.info("Loading dataset (5 examples)...")
algebra_test_split = load_dataset("EleutherAI/hendrycks_math", "algebra", split="test")
subset_size = min(5, len(algebra_test_split))
dataset = algebra_test_split.select(range(subset_size))

# Accumulator for the per-example evaluation records
all_results = list()

2025-04-11 13:48:48,148 - camel.__main__ - INFO - Loading dataset (5 examples)...


In [9]:
# Process each example
for idx, example in enumerate(dataset):
  print("TEST")
  question = example['problem']
  ground_truth = example['solution']
  gt_extracted = await extractor.extract(ground_truth)

  if not gt_extracted:
    raise RuntimeError()

  logger.info(f"\n=== Processing Example {idx+1}/{len(dataset)} ===")
  logger.info(f"Question: {question}")
  logger.info(f"Ground Truth: {ground_truth}")
  logger.info(f"Extracted Ground Truth: {gt_extracted}")

  # Prepare input for model
  prompt = f"{SYSTEM_PROMPT}\n\nQuestion: {question}\n\nAnswer:"
  inputs = tokenizer(prompt, return_tensors="pt").to(device)

  # Generate response
  logger.info("Generating model response...")
  with torch.no_grad():
      output_ids = model.generate(
          **inputs,
          max_new_tokens=512,
          do_sample=True,
          pad_token_id=tokenizer.pad_token_id
      )

  prompt_length = inputs["input_ids"].shape[-1]

  full_output = output_ids[0]

  # Only the generated tokens (excluding prompt)
  generated_tokens = full_output[prompt_length:]

  response = tokenizer.decode(generated_tokens, skip_special_tokens=True)

  logger.info(f"Model Response: {response}")
  
  verification_result = await math_verifier.verify(response, gt_extracted)
  is_correct = verification_result.status == VerificationOutcome.SUCCESS
  error_message = verification_result.error_message if hasattr(verification_result, 'error_message') else None

  # Save result
  result = {
      'question': question,
      'ground_truth': ground_truth,
      'ground_truth_extracted': gt_extracted,
      'model_response': response,
      'verification_status': str(verification_result.status) if verification_result else "VERIFICATION_FAILED",
      'is_correct': is_correct,
      'error_message': error_message
  }
  all_results.append(result)

  # Log verification result
  logger.info(f"Verification result: {'✓' if is_correct else '✗'}")
  if error_message:
      logger.info(f"Error message: {error_message}")

  logger.info("=" * 50)

TEST
2025-04-11 13:48:49,663 - camel.__main__ - INFO - 
=== Processing Example 1/5 ===
2025-04-11 13:48:49,664 - camel.__main__ - INFO - Question: How many vertical asymptotes does the graph of $y=\frac{2}{x^2+x-6}$ have?
2025-04-11 13:48:49,665 - camel.__main__ - INFO - Ground Truth: The denominator of the rational function factors into $x^2+x-6=(x-2)(x+3)$. Since the numerator is always nonzero, there is a vertical asymptote whenever the denominator is $0$, which occurs for $x = 2$ and $x = -3$.  Therefore, the graph has $\boxed{2}$ vertical asymptotes.
2025-04-11 13:48:49,665 - camel.__main__ - INFO - Extracted Ground Truth: 2
cuda
2025-04-11 13:48:49,668 - camel.__main__ - INFO - Generating model response...
2025-04-11 13:48:59,182 - camel.__main__ - INFO - Model Response:  The function has two vertical asymptotes.

Step-by-step analysis:

1) First, factor the denominator: 
   $$y = \frac{2}{(x+3)(x-2)}$$

2) Identify any potential vertical asymptotes by setting the denominator equ

In [10]:
# Save results: write the per-example records plus a small summary to JSON.
output_dir = "test_results"
os.makedirs(output_dir, exist_ok=True)
results_path = f"{output_dir}/test_run_results.json"

total_count = len(all_results)
correct_count = sum(1 for r in all_results if r['is_correct'])
# Guard against an empty run so the summary can still be written
# instead of failing with ZeroDivisionError.
accuracy = correct_count / total_count if total_count else 0.0

with open(results_path, "w") as f:
    json.dump({
        'summary': {
            'total': total_count,
            'correct': correct_count,
            'accuracy': accuracy
        },
        'results': all_results
    }, f, indent=2)

logger.info("\n=== Test Run Summary ===")
logger.info(f"Total examples processed: {total_count}")
logger.info(f"Results saved to {results_path}")


2025-04-11 13:49:37,377 - camel.__main__ - INFO - 
=== Test Run Summary ===
2025-04-11 13:49:37,377 - camel.__main__ - INFO - Total examples processed: 5
2025-04-11 13:49:37,378 - camel.__main__ - INFO - Results saved to test_results/test_run_results.json
