## Load data

In [2]:
from data import MathDataset
from torch.utils.data import DataLoader

# Build the dataset from the JSONL file (passing data_num=8) and wrap it in an
# unshuffled DataLoader with a batch size of 4.
data_path = "data/test.jsonl"
dataset = MathDataset(data_path, data_num=8)
dataLoader = DataLoader(dataset, batch_size=4, shuffle=False)

# Sanity check: print the question/answer pairs of the first batch only.
for batch in dataLoader:
    questions, answers = batch['question'], batch['answer']
    for qn, ans in zip(questions, answers):
        # One call emits the question, the answer and a separator line,
        # each on its own line.
        print(qn, ans, "----" * 20, sep="\n")
    break  # only the first batch is previewed

  from .autonotebook import tqdm as notebook_tqdm


Janet’s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market? Please answer step by step and give the final answer(only the value of the result) after '#### '
Janet sells 16 - 3 - 4 = <<16-3-4=9>>9 duck eggs a day.
She makes 9 * 2 = $<<9*2=18>>18 every day at the farmer’s market.
#### 18
--------------------------------------------------------------------------------
A robe takes 2 bolts of blue fiber and half that much white fiber.  How many bolts in total does it take? Please answer step by step and give the final answer(only the value of the result) after '#### '
It takes 2/2=<<2/2=1>>1 bolt of white fiber
So the total amount of fabric is 2+1=<<2+1=3>>3 bolts of fabric
#### 3
--------------------------------------------------------------------------------
Josh decides

## Load agents

In [3]:
from autogen_agentchat.agents import AssistantAgent
from autogen_agentchat.messages import StructuredMessage
from autogen_agentchat.ui import Console
from autogen_ext.models.openai import OpenAIChatCompletionClient

import os

# Define a model client. Any other client that implements the
# `ChatCompletionClient` interface can be used instead.
#
# The API key is read from the DASHSCOPE_API_KEY environment variable so that
# no credential is ever stored in the notebook. Export the variable before
# starting the kernel, or load it from a .env file (pip install python-dotenv).
api_key = os.environ.get("DASHSCOPE_API_KEY")
if not api_key:
    raise RuntimeError(
        "DASHSCOPE_API_KEY is not set; export it before running this cell."
    )

model_client = OpenAIChatCompletionClient(
    model='qwen2.5-7b-instruct',
    base_url="https://dashscope.aliyuncs.com/compatible-mode/v1",
    api_key=api_key,
    # Capabilities declared for this model, since it is not served under an
    # OpenAI model name.
    model_info={
        "vision": False,
        "function_calling": True,
        "json_output": True,
        "family": 'Qwen2.5',
    }
)


# Define an AssistantAgent with the model client and a system message.
# The system message instructs the agent via natural language.
agent = AssistantAgent(
    name="math_assistant",
    model_client=model_client,
    system_message="You are a helpful assistant for solving math problems.",
    model_client_stream=True,  # Enable streaming tokens from the model client.
)


# # Run the agent and stream the messages to the console.
# result = await agent.run(task="What is 2 + 4 ?")
# print(result.messages[1].content)


  validate_model_info(self._model_info)


## Inference and evaluate

In [4]:
import re
import time

# Matches the final-answer marker "#### <number>". An optional leading "$" is
# tolerated (and not captured) so that answers such as "#### $18" are accepted.
ANS_RE = re.compile(r"#### \$?(\-?[0-9\.\,]+)")
# Sentinel returned when no final answer can be found in a completion.
INVALID_ANS = "[invalid]"

def extract_answer(completion):
    """Extract the normalized final numeric answer from a completion.

    The answer is the number following the first "#### " marker. It is
    normalized so that equivalent spellings compare equal as strings:
    thousands separators are removed, a trailing sentence period is dropped,
    and insignificant trailing zeros of a decimal fraction are stripped
    ("18.00" -> "18", "18.50" -> "18.5"). Digits inside the number are never
    removed ("1.005" stays "1.005").

    Args:
        completion: Text that may contain a "#### <number>" marker.

    Returns:
        The normalized number as a string, or INVALID_ANS when no marker
        followed by at least one digit is present.
    """
    match = ANS_RE.search(completion)
    if match is None:
        return INVALID_ANS

    number = match.group(1).strip().replace(",", "")
    # The character class also captures punctuation, so a capture such as
    # "..." may contain no digit at all; that is not an answer.
    if not any(ch.isdigit() for ch in number):
        return INVALID_ANS

    # A trailing "." is sentence punctuation ("#### 18."), not part of the value.
    number = number.rstrip(".")
    if "." in number:
        # Only strip zeros after a decimal point; integers like "100" are
        # left untouched because they contain no ".".
        number = number.rstrip("0").rstrip(".")
        if number in ("", "-"):
            # Inputs like ".00" collapse to nothing; their value is zero.
            number = "0"
    return number

def is_correct(model_completion, gt_example):
    """Return True when the model's extracted answer equals the reference's.

    Both texts are passed through `extract_answer`; the reference text must
    yield a valid answer, otherwise an AssertionError is raised.
    """
    expected = extract_answer(gt_example)
    assert expected != INVALID_ANS
    predicted = extract_answer(model_completion)
    return predicted == expected

scores = []
st_time = time.time()
for batch in dataLoader:
    questions = batch['question']
    answers = batch['answer']
    for qn, ans in zip(questions, answers):
        print(f"[Quastion]: {qn}")
        print(f"[Answer]: {ans}")
        # Run the agent and stream the messages to the console.
        result = await agent.run(task=qn)
        model_res = result.messages[1].content
        model_ans = extract_answer(model_res)
        ans_is_correct = is_correct(model_res, ans)
        print(f"[Model Response]: {model_res}")
        print(f"[Model Answer]: {model_ans}")
        print(f"[Is correct]: {ans_is_correct}")
        scores.append(1) if ans_is_correct else scores.append(0)
        print("----"*20)
    break
et_time = time.time()

acc = sum(scores) / len(scores)
duration = et_time - st_time
print(f"Accuracy: {acc}")
print(f"Duration: {duration}, Average time per question: {duration / len(scores)}")

[Quastion]: Janet’s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market? Please answer step by step and give the final answer(only the value of the result) after '#### '
[Answer]: Janet sells 16 - 3 - 4 = <<16-3-4=9>>9 duck eggs a day.
She makes 9 * 2 = $<<9*2=18>>18 every day at the farmer’s market.
#### 18
[Model Response]: Let's break down the problem step by step.

1. **Determine the total number of eggs laid per day:**
   Janet's ducks lay 16 eggs per day.

2. **Calculate the number of eggs used by Janet:**
   - Janet eats 3 eggs for breakfast every morning.
   - She uses 4 eggs to bake muffins for her friends every day.
   Therefore, the total number of eggs used by Janet is:
   \[
   3 + 4 = 7 \text{ eggs}
   \]

3. **Calculate the number of eggs remaining after 