# Working with Data Generation

You can also check out this cookbook in Google Colab [here](https://colab.research.google.com/github/camel-ai/loong/blob/main/cookbooks/collect_synthetic_data.ipynb).

<div class="align-center">
  <a href="https://www.camel-ai.org/"><img src="https://i.postimg.cc/KzQ5rfBC/button.png" width="150"></a>
  <a href="https://discord.camel-ai.org"><img src="https://i.postimg.cc/L4wPdG9N/join-2.png" width="150"></a>
  
⭐ <i>Star us on [*GitHub*](https://github.com/camel-ai/camel), join our [*Discord*](https://discord.camel-ai.org), or follow us on [*X*](https://x.com/camelaiorg).</i>
</div>

In [None]:
# Optional: Install camel if you don't have it
!pip install "git+https://github.com/camel-ai/camel.git#egg=camel-ai[all]"

## Import necessary libraries

In [None]:
import asyncio
import json
import os
from typing import List, Dict, Any

from camel.datasets.static_dataset import StaticDataset
from camel.datasets.few_shot_generator import FewShotGenerator
from camel.models import ModelFactory
from camel.types import ModelPlatformType, ModelType
from camel.configs import ChatGPTConfig
from camel.agents import ChatAgent
from camel.extractors import BaseExtractor, BoxedStrategy
from camel.verifiers import MathVerifier, PythonVerifier
from camel.environments import SingleStepEnv, Action
from camel.logger import get_logger, set_log_level

## Set up logger

In [None]:
# Logger used by every cell below, obtained through CAMEL's logging helper.
logger = get_logger(__name__)
# NOTE(review): presumably sets CAMEL's logging threshold so that the
# logger.info(...) progress messages in this notebook are emitted - confirm.
set_log_level('INFO')

### Enable DeepSeek reasoning content

In [None]:
# Set the flag in the process environment before any model object is created.
# NOTE(review): presumably read by CAMEL's DeepSeek backend so that the
# reasoning trace is returned inside <think>...</think> tags, which the
# generation loop splits on - confirm against the CAMEL documentation.
os.environ["GET_REASONING_CONTENT"] = "true"

### Define output files

In [None]:
# JSON file holding the list of generated entries; it is loaded on start-up
# (if present) and rewritten in full after every new entry.
OUTPUT_FILE = "math_dataset.json"
# Plain-text log that every question/response pair is appended to.
ALL_RESPONSES_FILE = "all_responses.txt"

In [None]:
# Resume from an earlier run when its JSON output is on disk; otherwise start
# with an empty list of entries.
if not os.path.exists(OUTPUT_FILE):
    dataset = []
    logger.info("Starting new dataset")
else:
    with open(OUTPUT_FILE, 'r') as saved_file:
        dataset = json.load(saved_file)
    logger.info(f"Loaded existing dataset with {len(dataset)} examples")

In [None]:
from datasets import load_dataset

# Seed problems for the few-shot generator: the "graph_discrete_math" split of
# camel-ai/loong. The raw split gets its own name so that it does not overwrite
# `dataset`, the list of generated entries that the generation loop appends to
# and writes to OUTPUT_FILE.
seed_data = load_dataset("camel-ai/loong", split="graph_discrete_math")

seed_dataset = StaticDataset(seed_data)

In [None]:
# Progress marker logged before the model objects are constructed.
logger.info("Initializing models...")

In [None]:
# Get API keys
# Both keys are requested interactively with getpass, which does not echo the
# typed value.
import os
from getpass import getpass

# The OpenAI key is exported through the environment; no api_key argument is
# passed when the OpenAI model is created.
openai_api_key = getpass('Enter your OpenAI API key: ')
os.environ["OPENAI_API_KEY"] = openai_api_key

# The DeepSeek key stays in a variable and is passed explicitly as api_key=
# when the DeepSeek model is created.
deepseek_api_key = getpass('Enter your DeepSeek API key: ')

In [None]:
# Model handed to the few-shot generator: OpenAI GPT-4o mini configured with
# the default ChatGPTConfig values.
default_chat_config = ChatGPTConfig().as_dict()
model_4o = ModelFactory.create(
    model_platform=ModelPlatformType.OPENAI,
    model_type=ModelType.GPT_4O_MINI,
    model_config_dict=default_chat_config,
    timeout=60,
)

# Model behind the answering agent: DeepSeek reasoner, authenticated with the
# key collected in `deepseek_api_key`.
model_deepseek = ModelFactory.create(
    model_platform=ModelPlatformType.DEEPSEEK,
    model_type=ModelType.DEEPSEEK_REASONER,
    api_key=deepseek_api_key,
)
logger.info("Models initialized successfully")

## Set up extractors and verifiers

In [None]:
logger.info("Setting up extractors and verifiers...")
extractor = BaseExtractor([[BoxedStrategy()]])
await extractor.setup()

In [None]:
python_verifier = PythonVerifier(required_packages=["sympy"])
await python_verifier.setup(uv=False)

math_verifier = MathVerifier(
    extractor=extractor,
    float_rounding=6,
    numeric_precision=15,
    enable_wrapping=True
)
await math_verifier.setup()
logger.info("Extractors and verifiers setup complete")

## Initialize generator and environment

In [None]:
logger.info("Initializing generator and environment...")
generator = FewShotGenerator(
    buffer=10,
    seed_dataset=seed_dataset,
    verifier=python_verifier,
    model=model_4o
)

env = SingleStepEnv(generator, math_verifier)
await env.setup()
logger.info("Generator and environment initialized")

## Initialize agent for CoT generation

In [None]:
# Agent backed by the DeepSeek reasoner model; the generation loop calls
# agent.step(...) on it once per question and resets it after each iteration.
agent = ChatAgent(model=model_deepseek)

In [None]:
# Instruction text prepended to every question sent to the agent.
# This must be a raw string: in a normal string literal the "\b" of "\boxed"
# is the backspace escape, so the model would receive "\x08oxed{}" instead of
# the LaTeX command it is asked to use.
USER_PROMPT = r"""You are an agent designed to answer mathematical questions with clarity and precision. Your task is to provide a step-by-step explanation for
any mathematical problem posed by the user, ensuring the response is easy to follow. Adhere to these guidelines:
Analyze the mathematical question carefully and break down the solution process into clear, logical steps.
Use natural language to explain each step, incorporating LaTeX notation (e.g., $x + 2$)
for mathematical expressions when helpful. Conclude your response with the final answer enclosed
in a LaTeX \boxed{} environment (e.g., \boxed{5}).
Place this at the end of your explanation as a standalone statement.
It should be a Python expression, for example "[1, 2, 3]" for a list.

The question you should answer is: """

In [None]:
# Generation Loop
# Count of responses that fail verification (reward <= 0); incremented by the
# generation loop, so it covers the current session only.
num_rejected = 0
# The loop keeps running until `dataset` holds this many entries; unverified
# entries count towards the target as well.
target_size = 100

logger.info("Starting generation and verification loop...")

In [None]:
while len(dataset) < target_size:
    logger.info(f"Current dataset size: {len(dataset)}/{target_size}")
    
    obs = await env.reset()
    deepseek_response = agent.step(USER_PROMPT + obs.question).msgs[0].content
    
    # Split the response into reasoning and answer parts
    reasoning_part = ""
    answer_part = deepseek_response
    
    if "<think>" in deepseek_response and "</think>" in deepseek_response:
        parts = deepseek_response.split("</think>")
        if len(parts) > 1:
            reasoning_part = parts[0].replace("<think>", "").strip()
            answer_part = parts[1].strip()
    
    # Verify through environment
    next_obs, reward, done, info = await env.step(Action(index=0, llm_response=deepseek_response))
    
    # Save all responses
    with open(ALL_RESPONSES_FILE, "a") as f:
        f.write(f"Question: {obs.question}\n")
        f.write(f"Response: {answer_part}\n")
        f.write(f"Long CoT: {reasoning_part}\n")
        f.write(f"Info: {info}\n")
        f.write(f"Verified: {reward > 0}\n")
        f.write("-" * 80 + "\n")
    
    # Create data entry
    data_entry = {
        "question": obs.question,
        "answer": info['state'].final_answer if 'state' in info else '',
        "response": answer_part,
        "long_cot": reasoning_part,
        "verified": reward > 0,
    }

    # Save entry and update dataset
    dataset.append(data_entry)
    with open(OUTPUT_FILE, 'w') as f:
        json.dump(dataset, f, indent=2)
    
    if reward > 0:
        logger.info(f"Verification successful - Added verified entry (reward: {reward})")
    else:
        num_rejected += 1
        logger.warning(f"Verification failed - Added unverified entry (reward: {reward})")
    
    agent.reset()

## Final Statistics

In [None]:
# Summarise the run. The rejected count is derived from the dataset itself
# rather than from the per-session counter, so the three numbers always add up
# even when the dataset was resumed from an existing output file.
total_entries = len(dataset)
verified_entries = sum(1 for entry in dataset if entry["verified"])
rejected_entries = total_entries - verified_entries
logger.info(f"Generation complete. Total entries: {total_entries}")
logger.info(f"Verified entries: {verified_entries}")
logger.info(f"Rejected entries: {rejected_entries}")