# O1DataGene with CAMEL
## Experimental version

In [16]:
import os
import logging
from datetime import datetime
from collections import defaultdict
from dotenv import load_dotenv
import json

### Configure logging: create a log file handler and a console handler

In [17]:
# Configure logging
def setup_logger():
    """Configure the root logger to write to a timestamped file and to the console.

    A ``logs`` directory is created in the current working directory if it does
    not exist, and a new ``logs/omega_prm_<YYYYmmdd_HHMMSS>.log`` file is opened.
    The root logger level is set to INFO.

    The function is safe to call repeatedly (for example when a notebook cell is
    re-run): handlers installed by an earlier call are removed and closed first,
    so each record is emitted once per destination instead of once per call.
    Handlers installed by other code are left untouched.

    Returns:
        logging.Logger: the root logger.
    """
    # exist_ok avoids the check-then-create race of a separate os.path.exists test.
    os.makedirs('logs', exist_ok=True)
    log_filename = f'logs/omega_prm_{datetime.now().strftime("%Y%m%d_%H%M%S")}.log'
    formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')

    logger = logging.getLogger()
    logger.setLevel(logging.INFO)

    # Drop only the handlers this function added previously; otherwise every
    # re-run stacks another pair and each message is printed multiple times.
    for stale_handler in list(logger.handlers):
        if getattr(stale_handler, '_omega_prm_handler', False):
            logger.removeHandler(stale_handler)
            stale_handler.close()

    file_handler = logging.FileHandler(log_filename, encoding='utf-8')
    console_handler = logging.StreamHandler()
    for handler in (file_handler, console_handler):
        handler.setFormatter(formatter)
        handler._omega_prm_handler = True  # marker used by the cleanup loop above
        logger.addHandler(handler)
    return logger

In [18]:
# Module-level logger: the O1DataGene class and the cells below log through this name.
logger = setup_logger()

In [19]:
# Load environment variables
# load_dotenv (python-dotenv) is called with its defaults; the INFO line below only
# confirms the call returned, not that any particular variable was found.
load_dotenv()
logger.info("Environment variables loaded")

2024-12-05 20:48:34,200 - INFO - Environment variables loaded
2024-12-05 20:48:34,200 - INFO - Environment variables loaded


### First we will set the OPENAI_API_KEY that will be used to generate the data.

In [22]:

from getpass import getpass

# Reuse a key that is already in the environment (for example one loaded from .env by
# load_dotenv above) and only prompt when none is set. Prompting unconditionally would
# overwrite that key and block an unattended "Restart & Run All".
openai_api_key = os.environ.get("OPENAI_API_KEY") or getpass('Enter your OpenAI API key: ')
os.environ["OPENAI_API_KEY"] = openai_api_key

### Create a system message to define the agent's default role and behavior.

In [20]:

# System prompt; passed to ChatAgent as `system_message` in the "Set ChatAgent" cell.
sys_msg = 'You are a genius at slow-thinking data and code'

### Use ModelFactory to set up the backend model for the agent; `model_config_dict` accepts more detailed model settings

In [21]:
from camel.models import ModelFactory
from camel.types import ModelPlatformType, ModelType
from camel.configs import ChatGPTConfig

# Define the model, here in this case we use gpt-4o-mini
# NOTE(review): the next code cell rebinds `model` to an OpenAI-compatible backend, so
# whichever of the two cells ran last decides which model ChatAgent receives.
model = ModelFactory.create(
    model_platform=ModelPlatformType.OPENAI,
    model_type=ModelType.GPT_4O_MINI,
    model_config_dict=ChatGPTConfig().as_dict(), # [Optional] the config for model
)
   

In [15]:
# Initialize AI model by OPENAI_COMPATIBLE_MODEL

from camel.models import ModelFactory
from camel.types import ModelPlatformType, ModelType


# Same value as the earlier `sys_msg` cell; rebinding it here does not change the prompt.
sys_msg = 'You are a genius at slow-thinking data and code'
# Rebinds `model` to "deepseek-chat" on the OPENAI_COMPATIBLE_MODEL platform.
# The key and base URL are read from the environment; os.environ.get returns None when a
# variable is unset, so a missing variable is passed through as None rather than failing here.
# NOTE(review): both variable names are spelled "COMPATIBILIY" — confirm this matches your .env.
model = ModelFactory.create(
    model_platform=ModelPlatformType.OPENAI_COMPATIBLE_MODEL,
    model_type="deepseek-chat",
    api_key=os.environ.get("OPENAI_COMPATIBILIY_API_KEY"),
    url=os.environ.get("OPENAI_COMPATIBILIY_API_BASE_URL"),
    model_config_dict={"temperature": 0.4, "max_tokens": 4096},
)


### Set ChatAgent

In [14]:
from camel.agents import ChatAgent
# Build the agent from the system prompt and whichever `model` binding is current.
# This single agent is later handed to O1DataGene, which uses it for generation,
# verification and scoring prompts alike.
# NOTE(review): message_window_size=10 presumably caps the retained history — confirm
# against the CAMEL ChatAgent documentation.
chat_agent = ChatAgent(
    system_message=sys_msg,
    model=model,
    message_window_size=10,
)

### Define the class that solves the problem: it uses Monte Carlo tree search and binary search, and records the solution process and results


In [7]:
class O1DataGene:
    """Generate, verify and record step-by-step solutions using a chat agent.

    The same ``chat_agent`` is used for three kinds of prompts: producing a
    solution, judging whether a solution matches the golden answer, and scoring
    how similar a solution is to the golden answer.

    Attributes:
        chat_agent: object exposing ``step(prompt)`` whose result has
            ``msgs[0].content`` (the reply text).
        golden_answers: dict mapping each question to its reference answer.
        search_limit: maximum number of generate/verify rounds per question.
        sentence_delimiter: string used to cut a solution into "sentences" when
            locating the first error. Defaults to the CJK full stop.
        solution_tree: dict mapping each solved question to its result record.
    """

    def __init__(self, chat_agent, golden_answers=None, search_limit=100,
                 sentence_delimiter='。'):
        """Store the collaborators and search settings.

        Args:
            chat_agent: agent used for every prompt (see class docstring).
            golden_answers: optional ``{question: answer}`` dict; an empty or
                missing value starts with a fresh empty dict.
            search_limit: maximum number of sampling rounds in the search.
            sentence_delimiter: non-empty separator used to split solutions;
                pass e.g. ``"\\n"`` for answers that do not contain '。'.

        Raises:
            ValueError: if ``sentence_delimiter`` is empty.
        """
        if not sentence_delimiter:
            # str.split('') raises; fail early with a clearer message.
            raise ValueError("sentence_delimiter must be a non-empty string")
        self.chat_agent = chat_agent
        self.golden_answers = golden_answers if golden_answers else {}
        self.search_limit = search_limit
        self.sentence_delimiter = sentence_delimiter
        self.solution_tree = defaultdict(dict)  # Store correct solution steps
        logger.info("O1DataGene initialized with search_limit=%d", search_limit)

    def _ask(self, prompt):
        """Send ``prompt`` to the chat agent and return the first reply's text."""
        return self.chat_agent.step(prompt).msgs[0].content

    @staticmethod
    def _parse_verdict(reply):
        """Return True when ``reply`` is "true", ignoring case and decoration.

        Surrounding whitespace, quotes, backticks, asterisks and a trailing
        period or exclamation mark are tolerated, so replies such as ``True.``
        or ``"True"`` are not misread as a negative verdict.
        """
        return reply.strip().strip('"\'`*.!').strip().lower() == "true"

    def _join_sentences(self, sentences):
        """Re-assemble ``sentences`` with the delimiter, each one terminated.

        An empty list yields an empty string (not a lone delimiter).
        """
        if not sentences:
            return ""
        return self.sentence_delimiter.join(sentences) + self.sentence_delimiter

    def get_answer(self, question, context=""):
        """
        Get the AI's thought process and answer

        Args:
            question: the problem statement.
            context: already-accepted partial solution to continue from.

        Returns:
            The reply text produced by the chat agent.
        """
        prompt = f"""Please think step by step and solve this problem: {question}
        Existing content: {context}
        Requirements:
        1. Analyze the problem requirements
        2. List the steps to solve the problem
        3. Execute the solution process
        4. Provide the final answer
        Please explain the thought process of each step in detail.
        """
        answer = self._ask(prompt)
        logger.info("AI thought process:\n%s", answer)
        return answer

    def verify_answer(self, question, answer):
        """
        Verify if the answer is correct

        The agent is asked whether ``answer`` and the golden answer for
        ``question`` express the same meaning.

        Returns:
            bool: True when the agent's reply parses as "true".

        Raises:
            KeyError: if no golden answer is registered for ``question``.
        """
        if question not in self.golden_answers:
            raise KeyError(f"No golden answer registered for question: {question!r}")
        prompt = f"""Please determine if the following two answers express the same meaning:
        Question: {question}
        Answer 1: {answer}
        Answer 2: {self.golden_answers[question]}
        Just answer "True" or "False".
        """
        is_correct = self._parse_verdict(self._ask(prompt))
        logger.info("Answer verification result: %s", is_correct)
        return is_correct

    def monte_carlo_tree_search(self, question, partial_solution=""):
        """
        Generate and verify answers using Monte Carlo Tree Search

        Despite the name, the loop resamples complete answers up to
        ``search_limit`` times; no tree is built. The first answer that passes
        verification is returned immediately. Otherwise the answer with the
        highest parsed similarity score is returned.

        Returns:
            tuple: ``(solution, is_correct)``. ``solution`` is None only when
            no answer was generated at all (``search_limit`` <= 0). If no score
            could be parsed, the most recently generated answer is returned so
            the caller always has something to refine.
        """
        logger.info("Starting Monte Carlo Tree Search")
        best_solution = None
        best_score = 0
        fallback_solution = None
        for i in range(self.search_limit):
            # Generate new answer
            current_solution = self.get_answer(question, partial_solution)
            fallback_solution = current_solution
            # Verify answer
            is_correct = self.verify_answer(question, current_solution)
            if is_correct:
                logger.info("Correct answer found! Stopping search")
                return current_solution, True
            # Analyze error, get similarity score
            prompt = f"""Analyze the similarity of this answer to the correct answer (between 0-1):
            Question: {question}
            Generated answer: {current_solution}
            Correct answer: {self.golden_answers[question]}
            Just return a number between 0-1.
            """
            reply = self._ask(prompt)
            try:
                score = float(reply.strip())
            except ValueError:
                logger.warning("Could not parse similarity score from reply: %r", reply)
                continue
            if score != score:
                # NaN never compares greater; treat it like an unparsable score.
                logger.warning("Ignoring NaN similarity score")
                continue
            # `best_solution is None` lets a first score of 0 still be recorded.
            if best_solution is None or score > best_score:
                best_score = score
                best_solution = current_solution
            logger.info("Current search progress: %d/%d, best score: %.2f", i+1, self.search_limit, best_score)
        if best_solution is None:
            best_solution = fallback_solution
        return best_solution, False

    def binary_search_error(self, question, solution):
        """
        Use binary search to locate the first error

        The solution is split on ``sentence_delimiter`` and growing prefixes are
        verified. The search assumes that once a prefix fails verification every
        longer prefix fails too.

        Returns:
            int: index of the first sentence whose inclusion makes the prefix
            fail, i.e. the number of leading sentences accepted as correct
            (``len(sentences)`` when every prefix passes, 0 for an empty
            solution).
        """
        logger.info("Starting binary search for error location")
        if not solution:
            return 0
        sentences = solution.split(self.sentence_delimiter)
        left, right = 0, len(sentences)
        while left < right:
            mid = (left + right) // 2
            # The prefix must include sentence `mid`: advancing `left` past `mid`
            # is only justified if that sentence itself was verified.
            partial_solution = self._join_sentences(sentences[:mid + 1])
            logger.info("Checking solution fragment:\n%s", partial_solution)
            # Verify if the current part is correct
            is_correct = self.verify_answer(question, partial_solution)
            if is_correct:
                left = mid + 1
            else:
                right = mid
        error_position = left
        logger.info("First error position found: sentence %d", error_position)
        return error_position

    def solve(self, question):
        """
        Main process to solve the problem

        Runs the search; if no verified answer is found, locates the first
        erroneous sentence of the best candidate and asks the agent to continue
        from the accepted prefix. The outcome is stored in ``solution_tree``.

        Returns:
            The verified solution, or the regenerated solution when
            verification never succeeded.
        """
        logger.info("\n=== Starting to solve the problem: %s ===", question)
        # 1. Use Monte Carlo Tree Search to generate answer
        solution, is_correct = self.monte_carlo_tree_search(question)
        if is_correct:
            logger.info("Problem solved!")
            self.solution_tree[question] = {
                "solution": solution,
                "is_correct": True,
                "timestamp": datetime.now().isoformat()
            }
            return solution
        if solution is None:
            # Nothing was generated (search_limit <= 0): start from scratch.
            logger.warning("Search produced no candidate; generating without context")
            error_pos = 0
            correct_part = ""
        else:
            # 2. If the answer is not completely correct, use binary search to locate the error
            error_pos = self.binary_search_error(question, solution)
            # 3. Store the correct part
            sentences = solution.split(self.sentence_delimiter)
            correct_part = self._join_sentences(sentences[:error_pos])
        final_solution = self.get_answer(question, correct_part)
        self.solution_tree[question] = {
            "solution": final_solution,
            "partial_correct": correct_part,
            "error_position": error_pos,
            "is_correct": False,
            "timestamp": datetime.now().isoformat()
        }
        logger.info("Final answer:\n%s", final_solution)
        return final_solution

    def import_qa_from_json(self, json_file_path):
        """
        Import question and answer data from JSON file
        JSON format should be: {"question1": "answer1", "question2": "answer2", ...}

        Returns:
            bool: True on success; False (with the error logged) when the file
            cannot be read, is not valid JSON, or is not a JSON object.
        """
        try:
            with open(json_file_path, 'r', encoding='utf-8') as f:
                qa_data = json.load(f)
            if not isinstance(qa_data, dict):
                raise ValueError("top-level JSON value must be an object")
            # Update golden_answers
            self.golden_answers.update(qa_data)
            logger.info("Successfully imported %d QA pairs from %s", len(qa_data), json_file_path)
            return True
        except (OSError, ValueError, TypeError) as e:
            logger.error("Error importing JSON data: %s", e)
            return False

    def export_solutions(self, filepath='solutions.json'):
        """
        Export the solution process and results to a JSON file

        Returns:
            bool: True when the file was written, False when writing or
            serialization failed (the error is logged).
        """
        export_data = {
            "solutions": self.solution_tree,
            "golden_answers": self.golden_answers,
            "export_time": datetime.now().isoformat()
        }
        try:
            with open(filepath, 'w', encoding='utf-8') as f:
                json.dump(export_data, f, ensure_ascii=False, indent=2)
            logger.info("Solutions exported successfully to %s", filepath)
            return True
        except (OSError, ValueError, TypeError) as e:
            logger.error("Error exporting solutions: %s", e)
            return False

### Load Q&A data from a JSON file

In [8]:
def load_qa_data(file_path):
    """Read the UTF-8 JSON file at ``file_path`` and return the decoded object."""
    with open(file_path, 'r', encoding='utf-8') as handle:
        raw_text = handle.read()
    return json.loads(raw_text)

In [9]:
# Load JSON data
# qa_data is handed to O1DataGene as golden_answers below, and its keys are the
# questions iterated by the test loop; the path is relative to the working directory.
qa_data = load_qa_data('qa_data.json')

### Create an instance of O1DataGene

In [10]:
# Create an instance of O1DataGene
# search_limit is not passed, so the constructor default (100) applies.
omega = O1DataGene(chat_agent, golden_answers=qa_data)

2024-12-05 20:23:35,888 - INFO - O1DataGene initialized with search_limit=100


In [11]:
# Record generated answers
# Maps question -> answer text; filled by the test loop and written out by the export cell.
generated_answers = {}

### Test Q&A

In [12]:
# Test Q&A: ask every imported question once, keep the answer, and check it
# against the golden answer.
for question in qa_data:
    print(f"\nQuestion: {question}")

    # Generate the step-by-step answer and remember it for the export cell.
    answer = omega.get_answer(question)
    generated_answers[question] = answer
    print(f"AI's thought process and answer:\n{answer}")

    # Let the agent judge the answer against the reference.
    is_correct = omega.verify_answer(question, answer)
    if is_correct:
        print("Answer verification result: Correct")
    else:
        print("Answer verification result: Incorrect")
    print("-" * 50)

2024-12-05 20:23:36,088 - INFO - HTTP Request: POST https://api.deepseek.com/chat/completions "HTTP/1.1 200 OK"



Question: What is the coefficient of $x^2y^6$ in the expansion of $\left(\frac{3}{5}x-\frac{y}{2}\right)^8$?  Express your answer as a common fraction


2024-12-05 20:24:30,175 - INFO - AI thought process:
### Step 1: Analyze the Problem Requirements

The problem requires us to find the coefficient of \(x^2y^6\) in the expansion of \(\left(\frac{3}{5}x - \frac{y}{2}\right)^8\). This is a binomial expansion problem where we need to identify the specific term that contains \(x^2y^6\).

### Step 2: List the Steps to Solve the Problem

1. **Understand the Binomial Theorem**: The binomial theorem states that \((a + b)^n\) expands to:
   \[
   \sum_{k=0}^{n} \binom{n}{k} a^{n-k} b^k
   \]
   Here, \(a = \frac{3}{5}x\), \(b = -\frac{y}{2}\), and \(n = 8\).

2. **Identify the General Term**: The general term in the expansion is:
   \[
   \binom{8}{k} \left(\frac{3}{5}x\right)^{8-k} \left(-\frac{y}{2}\right)^k
   \]

3. **Set Up the Condition for \(x^2y^6\)**: We need the term where the exponents of \(x\) and \(y\) match \(2\) and \(6\) respectively. This gives us:
   \[
   (8-k) = 2 \quad \text{and} \quad k = 6
   \]

4. **Solve for \(k\)**: F

AI's thought process and answer:
### Step 1: Analyze the Problem Requirements

The problem requires us to find the coefficient of \(x^2y^6\) in the expansion of \(\left(\frac{3}{5}x - \frac{y}{2}\right)^8\). This is a binomial expansion problem where we need to identify the specific term that contains \(x^2y^6\).

### Step 2: List the Steps to Solve the Problem

1. **Understand the Binomial Theorem**: The binomial theorem states that \((a + b)^n\) expands to:
   \[
   \sum_{k=0}^{n} \binom{n}{k} a^{n-k} b^k
   \]
   Here, \(a = \frac{3}{5}x\), \(b = -\frac{y}{2}\), and \(n = 8\).

2. **Identify the General Term**: The general term in the expansion is:
   \[
   \binom{8}{k} \left(\frac{3}{5}x\right)^{8-k} \left(-\frac{y}{2}\right)^k
   \]

3. **Set Up the Condition for \(x^2y^6\)**: We need the term where the exponents of \(x\) and \(y\) match \(2\) and \(6\) respectively. This gives us:
   \[
   (8-k) = 2 \quad \text{and} \quad k = 6
   \]

4. **Solve for \(k\)**: From the condition \(

2024-12-05 20:24:30,898 - INFO - Answer verification result: True


Answer verification result: Correct
--------------------------------------------------

Question: how many r in strawberry?


2024-12-05 20:24:31,294 - INFO - HTTP Request: POST https://api.deepseek.com/chat/completions "HTTP/1.1 200 OK"
2024-12-05 20:24:51,969 - INFO - AI thought process:
### Step 1: Analyze the Problem Requirements

The problem requires us to count the number of times the letter "r" appears in the word "strawberry". This is a straightforward counting problem where we need to identify and tally the occurrences of the letter "r" in the given word.

### Step 2: List the Steps to Solve the Problem

1. **Identify the Word**: The word given is "strawberry".
2. **Count the Occurrences of "r"**: Go through each letter in the word and count how many times "r" appears.
3. **Summarize the Count**: Provide the total count of "r" in the word.

### Step 3: Execute the Solution Process

1. **Identify the Word**: The word is "strawberry".
2. **Count the Occurrences of "r"**:
   - The first letter is "s".
   - The second letter is "t".
   - The third letter is "r".
   - The fourth letter is "a".
   - The fi

AI's thought process and answer:
### Step 1: Analyze the Problem Requirements

The problem requires us to count the number of times the letter "r" appears in the word "strawberry". This is a straightforward counting problem where we need to identify and tally the occurrences of the letter "r" in the given word.

### Step 2: List the Steps to Solve the Problem

1. **Identify the Word**: The word given is "strawberry".
2. **Count the Occurrences of "r"**: Go through each letter in the word and count how many times "r" appears.
3. **Summarize the Count**: Provide the total count of "r" in the word.

### Step 3: Execute the Solution Process

1. **Identify the Word**: The word is "strawberry".
2. **Count the Occurrences of "r"**:
   - The first letter is "s".
   - The second letter is "t".
   - The third letter is "r".
   - The fourth letter is "a".
   - The fifth letter is "w".
   - The sixth letter is "b".
   - The seventh letter is "e".
   - The eighth letter is "r".
   - The ninth lette

2024-12-05 20:24:52,637 - INFO - Answer verification result: True


Answer verification result: Correct
--------------------------------------------------


### Export the generated answers to a JSON file


In [13]:
# Bundle the collected answers with the export time.
simplified_output = dict(
    timestamp=datetime.now().isoformat(),
    qa_pairs=generated_answers,
)
# The file name carries a second-resolution timestamp so repeated exports do not collide.
simplified_file = 'generated_answers_' + datetime.now().strftime("%Y%m%d_%H%M%S") + '.json'
with open(simplified_file, 'w', encoding='utf-8') as f:
    f.write(json.dumps(simplified_output, ensure_ascii=False, indent=2))
print(f"The generated answers have been exported to: {simplified_file}")

The generated answers have been exported to: generated_answers_20241205_202452.json
