In [29]:
#imports
import json
import re
from sympy import simplify
from collections import defaultdict
import numpy as np
from utils import load_test_puzzles
from data.data import GameOf24Data
import os
import pandas as pd
import textwrap
from colorama import Fore, Style, init

In [127]:
value_reflects = ['Left numbers after Step 1: 8 10 14\n\n8 10 14: impossible', 'Left numbers after Step 1: 8 10 13  \nLabel: impossible\n\nLeft numbers after Step 2: 13 18  \nLabel: impossible']

# Step 2: Normalize and clean
summarized_labels = [
    entry.strip() for entry in value_reflects
    if re.match(r'^-?\d+(?:[\s,]+-?\d+)*:\s*(sure|impossible)$', entry.strip())
]
text_reflections = [entry for entry in value_reflects if entry not in summarized_labels]

for i, reflection in enumerate(text_reflections):
    if 'Label' in reflection:
        text_reflections[i] = reflection.replace("Label", "").replace("\n", "")

print("text before ", text_reflections)
# Step 3: Normalize commas to avoid broken matches (e.g., '1, 5, 6: sure')
text = " ".join(text_reflections)
text = re.sub(r'(?<=\d),\s*(?=\d)', ' ', text).replace("{", "").replace("}", "")
print("text after ", repr(text))

# Step 4: Define patterns
patterns = [
    r'\(left:\s*([\d\s]+)\)\s*[:\-–]?\s*(sure|impossible)',  # Old format
    r'(-?\d+(?:\s+-?\d+)*)\s*:\s*(sure|impossible)'              # New format
]


# Step 5: Extract and deduplicate
new_labels = []
for pattern in patterns:
    matches = re.findall(pattern, text)
    for nums, label in matches:
        formatted = f"{nums.strip()}: {label}"
        if formatted not in summarized_labels and formatted not in new_labels:
            new_labels.append(formatted)
            
print("new labels: ", new_labels)
# Step 6: Combine all labels
value_reflects = summarized_labels + new_labels

for i, reflection in enumerate(value_reflects):

    if '\n' in reflection:
        value_reflects[i] = reflection.split('\n')[-1]
print(value_reflects)

text before  ['Left numbers after Step 1: 8 10 14\n\n8 10 14: impossible', 'Left numbers after Step 1: 8 10 13  : impossibleLeft numbers after Step 2: 13 18  : impossible']
text after  'Left numbers after Step 1: 8 10 14\n\n8 10 14: impossible Left numbers after Step 1: 8 10 13  : impossibleLeft numbers after Step 2: 13 18  : impossible'
new labels:  ['8 10 14\n\n8 10 14: impossible', '8 10 13: impossible', '13 18: impossible']
['8 10 14: impossible', '8 10 13: impossible', '13 18: impossible']


In [130]:
def shorten_value_reflects(value_reflects):
    # Step 2: Normalize and clean
    summarized_labels = [
        entry.strip() for entry in value_reflects
        if re.match(r'^-?\d+(?:[\s,]+-?\d+)*:\s*(sure|impossible)$', entry.strip())
    ]
    text_reflections = [entry for entry in value_reflects if entry not in summarized_labels]
    
    for i, reflection in enumerate(text_reflections):
        if 'Label' in reflection:
            text_reflections[i] = reflection.replace("Label", "").replace("\n", "")
        
    # Step 3: Normalize commas to avoid broken matches (e.g., '1, 5, 6: sure')
    text = " ".join(text_reflections)
    text = re.sub(r'(?<=\d),\s*(?=\d)', ' ', text).replace("{", "").replace("}", "")
    
    # Step 4: Define patterns
    patterns = [
        r'\(left:\s*([\d\s]+)\)\s*[:\-–]?\s*(sure|impossible)',  # Old format
        r'(-?\d+(?:\s+-?\d+)*)\s*:\s*(sure|impossible)'              # New format
    ]
    
    
    # Step 5: Extract and deduplicate
    new_labels = []
    for pattern in patterns:
        matches = re.findall(pattern, text)
        for nums, label in matches:
            formatted = f"{nums.strip()}: {label}"
            if formatted not in summarized_labels and formatted not in new_labels:
                new_labels.append(formatted)
                
    
    # Step 6: Combine all labels
    value_reflects = summarized_labels + new_labels
    
    for i, reflection in enumerate(value_reflects):

        if '\n' in reflection:
            value_reflects[i] = reflection.split('\n')[-1]
    print(value_reflects)
    return value_reflects
    

# Let's assume value_reflects is this at the start:

newlines = ['Left numbers after Step 1: 12\n\n12: impossible']
colon = ['1 5 6: sure', '4 6: sure', '24: sure', "The given attempt answer is: \n['Step 1: 1 + 4 = 5 (left: 1 5 6)']\n\nThe formula with left numbers from each step is: \n1 + 4 = 5 (left: 1 5 6): impossible \n\nThis is labeled 'impossible' because the feedback states that Step 4 (which is not shown in this attempt) is illegal. However, this implies that the given step is not leading to a correct solution. Additionally, the given step itself seems correct in terms of arithmetic operation, but considering it's part of an ultimately incorrect solution and given the constraints of the problem, it cannot lead to 24 with the remaining numbers 1, 5, and 6 in a way that follows the rules."]
commas = ['2 4 6: impossible', '1 1 4 6: sure', '1 5 6: sure', '6 6: impossible', "To evaluate the given step and provide labels, let's follow the instructions and the step provided:\n\n1. The initial numbers are: 1, 1, 4, 6\n2. The attempt answer's first step is: 1 + 4 = 5, resulting in the left numbers: 1, 5, 6\n\nGiven this, we evaluate the left numbers from each step as follows:\n\n1, 1, 4, 6: sure (since we are starting with all numbers and haven't applied any operations yet)\n1, 5, 6: sure (because the operation to get to this point is valid, and we still have to explore if these numbers can lead to 24)\n\nHowever, since the feedback mentions that Step 2 is illegal, we need to understand that without knowing what the attempted Step 2 was, we can only assume the mistake occurred after the provided step. Thus, our evaluation for the step provided is based on its own merit and the feedback that the subsequent step was incorrect.\n\nGiven the information and focusing on the step provided (1 + 4 = 5), the numbers 1, 5, 6 are left, and without the exact nature of the illegal Step 2, we cannot conclusively label the outcome of the sequence as 'impossible' based solely on the provided step. The step itself (1 + 4 = 5) is a valid operation, leaving us with 1, 5, 6, which could potentially lead to 24 with the right subsequent operations. \n\nTherefore, based on the information given:\n1, 1, 4, 6: sure\n1, 5, 6: sure\n\nKeep in mind, the 'sure' label here indicates that the step provided is valid and that these numbers could potentially lead to a solution, not that they definitively will lead to 24 without further correct operations."]
brackets = ['Let\'s analyze the steps and the left numbers after each step.\n\n**Initial numbers:**\n1 1 11 11\n\n**Step 1 (given):**\n11 + 1 = 12 (left: 1 1 12)\n\n**Feedback indicates this is illegal** because the operation uses a number (11 + 1) that does not exist in the previous set (which was 1 1 11 11). You cannot combine or reuse numbers outside the current set, nor can you drop numbers arbitrarily.\n\n---\n\nNow, let\'s consider the left numbers after each step:\n\n1. After initial: **1 1 11 11** — initial set\n2. After "11 + 1 = 12": left would be **1 1 12**\n\n**Label for initial set:**\n- {1 1 11 11}: **sure** (initial set, no operations yet)\n\n**Label for the set after step 1:**\n- {1 1 12}: **impossible** (since the step is illegal as per feedback)\n\n---\n\n**Final answer:**\n\n{1 1 11 11}: sure  \n{1 1 12}: impossible']
minus = ['Left numbers from each step:\n- After Step 1: 4 5 11\n- After Step 2: 4 6\n- After Step 3: -2\n\nLabels:\n4 5 11: sure  \n4 6: sure  \n-2: impossible', 'Left numbers from each step:\n- Step 1: 24\n- Step 2: 12, 1, 5, 11\n\nLabels:\n24: sure\n12, 1, 5, 11: impossible', 'Left numbers after Step 1: 1, 4, 5, 12\n\nSince the answer provided is 4 * 6 = 24, but 6 is not one of the original numbers (1, 4, 5, 12), and the operation does not use only the original numbers, this attempt is invalid.\n\nTherefore, the label for the original numbers is:\n\n1, 4, 5, 12: impossible']
label = ['Left numbers after each step:\n\n1. 12 22\n\nLabel: impossible']
label_2 = ["Left numbers after Step 1: 5 6 24  \nLeft numbers after Step 2: 4 5\n\nStep 1: 4 * 6 = 24 (left: 5 6 24)  \nLabel: impossible (since the left numbers are 5 6 24, and using 5 and 6, it's possible to reach 24, but the feedback states that the step is correct in leading to 24, so the focus is on whether the left numbers can lead to 24. Since the left numbers are 5 6 24, and 5 and 6 can be combined to make 24 (e.g., 6 * 4.166...), but that's not valid with the given numbers, so the key is that the step's left numbers after the operation are 5 6 24, which includes 24, and the remaining numbers are 5 and 6. Given that, the step's left numbers are 5 6 24, and the question is whether these can lead to 24. Since 24 is already present, the label is 'sure' because the step is correct and can lead to 24, but the previous feedback says Step 2 is impossible, so the overall conclusion is that the left numbers after Step 1 are 'sure' because they include 24 which is the goal, so:\n\n5 6 24: sure\n\nStep 2: 24 / 6 = 4 (left: 4 5)  \nLabel: impossible (as per feedback, this step cannot lead to 24 with the remaining numbers). The left numbers after this step are 4 5, which cannot produce 24 with the allowed operations.\n\nFinal answer:\n4 5: impossible"]
final_boss = ['Left numbers after Step 1: 8 10 14\n\n8 10 14: impossible', 'Left numbers after Step 1: 8 10 13  \nLabel: impossible\n\nLeft numbers after Step 2: 13 18  \nLabel: impossible', 'Left numbers from each step:\n- After Step 1: 8, 3\n- After Step 2: 5\n\nLabels:\n8, 3: sure  \n5: impossible', 'Left numbers from each step:\n- Step 1: 8, 10, 13\n- Step 2: 24, 10, 13\n\nLabels:\n8, 10, 13: impossible\n24, 10, 13: impossible', 'Left numbers from each step:\nStep 1: 8 * 3 = 24 (left: 24)\nStep 2: answer: (13 - 10) * (13 / 1) = 24\n\nLabels:\n24: impossible  \n(13 - 10) * (13 / 1): impossible']

all_data = {
    "newlines": newlines,
    "colon": colon,
    "commas": commas,
    "brackets": brackets,
    "minus": minus,
    "label": label,
    "label_2": label_2,
    "final_boss": final_boss
}

expected_results = {
    "newlines": ['12: impossible'],
    "colon": ['1 5 6: sure', '4 6: sure', '24: sure', '1 5 6: impossible'],
    "commas": ['2 4 6: impossible', '1 1 4 6: sure', '1 5 6: sure', '6 6: impossible'],
    "brackets": ['1 1 11 11: sure', '1 1 12: impossible'],
    "minus": ['4 5 11: sure', '4 6: sure', '-2: impossible', '24: sure', '12 1 5 11: impossible', '1 4 5 12: impossible'],
    "label": ['12 22: impossible'],
    "label_2": ['5 6 24: impossible', '4 5: impossible', '5 6 24: sure'],
    "final_boss": ['8 10 14: impossible', '8 10 13: impossible', '13 18: impossible', '8 3: sure', '5: impossible',  '24 10 13: impossible', '24: impossible']
}

results = {}
for name, lst in all_data.items():
    results[name] = shorten_value_reflects(lst)
    print(f'expected below {name}')
    print(expected_results[name])
    assert results[name] == expected_results[name], f"{name} failed ❌"
    print(f"\n=== Test result for '{name}'✅ ===")
    print(results[name])


['12: impossible']
expected below newlines
['12: impossible']

=== Test result for 'newlines'✅ ===
['12: impossible']
['1 5 6: sure', '4 6: sure', '24: sure', '1 5 6: impossible']
expected below colon
['1 5 6: sure', '4 6: sure', '24: sure', '1 5 6: impossible']

=== Test result for 'colon'✅ ===
['1 5 6: sure', '4 6: sure', '24: sure', '1 5 6: impossible']
['2 4 6: impossible', '1 1 4 6: sure', '1 5 6: sure', '6 6: impossible']
expected below commas
['2 4 6: impossible', '1 1 4 6: sure', '1 5 6: sure', '6 6: impossible']

=== Test result for 'commas'✅ ===
['2 4 6: impossible', '1 1 4 6: sure', '1 5 6: sure', '6 6: impossible']
['1 1 11 11: sure', '1 1 12: impossible']
expected below brackets
['1 1 11 11: sure', '1 1 12: impossible']

=== Test result for 'brackets'✅ ===
['1 1 11 11: sure', '1 1 12: impossible']
['4 5 11: sure', '4 6: sure', '-2: impossible', '24: sure', '12 1 5 11: impossible', '1 4 5 12: impossible']
expected below minus
['4 5 11: sure', '4 6: sure', '-2: impossible', 

In [32]:
datadir = f'24_tot.csv'
DATA_PATH = 'data'
path = os.path.join(DATA_PATH, 'datasets', datadir)
data = list(pd.read_csv(path)['Puzzles'])
arr = np.arange(len(data))
uniform_test_indices = np.linspace(0, len(data) - 1, 60, dtype=int)
remaining_indices = [i for i in arr if i not in set(uniform_test_indices)]
uniform_val_indices = np.array([remaining_indices[i] for i in np.linspace(0, len(remaining_indices) - 1, 30, dtype=int)])
uniform_indices = uniform_val_indices
index = 0
puzzle = data[uniform_indices[index]]
for i, index in enumerate(uniform_val_indices):
    puzzle = data[uniform_indices[i]]
    print(f'index: {uniform_indices[i]}\npuzzle: {puzzle}')

index: 1
puzzle: 1 1 11 11
index: 47
puzzle: 11 11 11 12
index: 94
puzzle: 2 7 7 8
index: 141
puzzle: 1 4 5 12
index: 188
puzzle: 1 1 3 4
index: 235
puzzle: 1 8 10 13
index: 282
puzzle: 4 5 6 6
index: 329
puzzle: 5 6 7 7
index: 375
puzzle: 2 8 12 12
index: 422
puzzle: 3 7 12 13
index: 469
puzzle: 3 9 10 11
index: 516
puzzle: 2 10 10 11
index: 563
puzzle: 2 6 8 11
index: 610
puzzle: 3 4 5 5
index: 657
puzzle: 2 7 9 10
index: 703
puzzle: 3 3 8 12
index: 750
puzzle: 2 8 9 11
index: 797
puzzle: 3 6 12 13
index: 844
puzzle: 2 5 6 10
index: 891
puzzle: 2 6 12 12
index: 938
puzzle: 2 7 12 13
index: 985
puzzle: 5 9 12 12
index: 1031
puzzle: 4 8 8 10
index: 1078
puzzle: 1 4 7 9
index: 1125
puzzle: 2 4 9 10
index: 1172
puzzle: 1 3 12 13
index: 1219
puzzle: 2 3 11 11
index: 1266
puzzle: 2 6 9 9
index: 1313
puzzle: 5 7 9 12
index: 1360
puzzle: 1 3 4 6


In [16]:
def check_valid_move(idx, last_step, cur_step):
    if idx == 1:
        original_nums = [float(num) for num in last_step.split(" ")]
    else:
        original_nums = [float(num) for num in last_step.split('left:')[-1].strip("()").split(" ") if
                        num != '']
    formula = [op for op in cur_step.split('left:')[0].strip("()").split(" ") if op != '']
    new_nums = [float(num) for num in cur_step.split('left:')[-1].strip().strip("()").split(" ") if num != ''] #Added a strip() to remove potential whitespace behind "(left: x)  "
    print("idx: ", idx)
    print("last step: ", last_step)
    print("cur step: ", cur_step) 
    print("original nums: ", original_nums)
    print("formula: ", formula)
    print("new nums: ", new_nums)
    
    
    try:
        #print(original_nums, new_nums, formula)
        original_nums.remove(float(eval(formula[0])))
        print("original nums after removing formular[0] ", original_nums)
        original_nums.remove(float(eval(formula[2])))
        print("original nums after removing formular[0] ", original_nums)
        for num in original_nums:
            new_nums.remove(num)
            print(f"new nums after removing new_nums[{num}]", new_nums)
            
        new_nums.remove(float(formula[4]))
        print(f"new nums after removing float(formula[4])", new_nums)
        
        assert len(new_nums) == 0
    except ValueError:
        return False, "You use value that does not exists in last step or you use them repeatedly; or you drop numbers from the last step."
    except AssertionError:
        return False, "You have more numbers left than expected."

    return True, "The move the valid and correct."

check_valid_move(1, "1 1 11 11", "11 - 1 = 10 (left: 1 10 11)")

idx:  1
last step:  1 1 11 11
cur step:  11 - 1 = 10 (left: 1 10 11)
original nums:  [1.0, 1.0, 11.0, 11.0]
formula:  ['11', '-', '1', '=', '10']
new nums:  [1.0, 10.0, 11.0]
original nums after removing formular[0]  [1.0, 1.0, 11.0]
original nums after removing formular[0]  [1.0, 11.0]
new nums after removing new_nums[1.0] [10.0, 11.0]
new nums after removing new_nums[11.0] [10.0]
new nums after removing float(formula[4]) []


(True, 'The move the valid and correct.')

In [13]:
from collections import Counter

def check_valid_move_new(idx, last_step, cur_step):
    if idx == 1:
        original_nums = [float(num) for num in last_step.split(" ")]
    else:
        original_nums = [float(num) for num in last_step.split('left:')[-1].strip("()").split(" ") if num != '']
    formula = [op for op in cur_step.split('left:')[0].strip("()").split(" ") if op != '']
    new_nums = [float(num) for num in cur_step.split('left:')[-1].strip().strip("()").split(" ") if num != '']

    print("idx: ", idx)
    print("last step: ", last_step)
    print("cur step: ", cur_step) 
    print("original nums: ", original_nums)
    print("formula: ", formula)
    print("new nums: ", new_nums)
    
    try:
        original_counter = Counter(original_nums)
        used1 = float(eval(formula[0]))
        used2 = float(eval(formula[2]))
        result = float(formula[4])

        original_counter[used1] -= 1
        original_counter[used2] -= 1
        if original_counter[used1] < 0 or original_counter[used2] < 0:
            raise ValueError()

        new_counter = Counter(new_nums)
        for num, count in original_counter.items():
            new_counter[num] -= count
            if new_counter[num] < 0:
                raise ValueError()

        new_counter[result] -= 1
        print(new_counter)
        if new_counter[result] != 0 or any(v != 0 for v in new_counter.values()):
            raise AssertionError()
        
    except ValueError:
        return False, "You use value that does not exists in last step or you use them repeatedly; or you drop numbers from the last step."
    except AssertionError:
        return False, "You have more numbers left than expected."

    return True, "The move the valid and correct."

In [14]:
check_valid_move_new(1, "1 1 11 11", "11 - 1 = 10 (left: 1 1 10)")

idx:  1
last step:  1 1 11 11
cur step:  11 - 1 = 10 (left: 1 1 10)
original nums:  [1.0, 1.0, 11.0, 11.0]
formula:  ['11', '-', '1', '=', '10']
new nums:  [1.0, 1.0, 10.0]


(False,
 'You use value that does not exists in last step or you use them repeatedly; or you drop numbers from the last step.')

In [6]:
import re
from collections import defaultdict
from sympy import simplify

def verify(puzzle: str, action: str) -> dict:
    """
    Verifies a Game of 24 action string against a puzzle.

    Returns a dictionary of specific error types (each with count 1 if it occurred).
    """
    errors = defaultdict(int)

    puzzle_numbers = re.findall(r'\d+', puzzle)
    puzzle_count = {num: puzzle_numbers.count(num) for num in puzzle_numbers}

    steps = [line.strip() for line in action.strip().split('\n') if line.strip()]
    if not steps:
        errors["No final state"] += 1
        return errors

    # --- Final state check (based on last "left:" step) ---
    final_state_line = None
    for line in reversed(steps):
        match = re.search(r'left:\s*(.*)', line)
        if match:
            final_state_line = match.group(1)
            break

    if not final_state_line:
        errors["No final state"] += 1
    else:
        final_numbers = final_state_line.split()
        if len(final_numbers) > 1:
            errors["More than one final number"] += 1
        elif final_numbers[0] != "24":
            errors["Final number not equal 24"] += 1

    # --- Check for notes in steps ---
    for step in steps:
        if "(Note:" in step or "note:" in step.lower():
            errors["Gives note"] += 1
            break

    # --- Check for multiple answer lines ---
    answer_lines = [line for line in steps if line.lower().startswith("answer:")]
    if len(answer_lines) > 1:
        errors["Writes more than one answer"] += 1

    # --- Extract expression from answer or fallback equation ---
    expr = None
    if answer_lines:
        expr = answer_lines[-1].split(":", 1)[1].split("=")[0].strip()
    else:
        # fallback to last equation
        for line in reversed(steps):
            if "=" in line:
                expr = line.split("=")[0].strip()
                break

    # --- Expression validation ---
    if expr:
        expr_numbers = re.findall(r'\d+', expr)
        expr_count = {num: expr_numbers.count(num) for num in expr_numbers}

        # Uses number not in puzzle
        for num in expr_count:
            if num not in puzzle_count:
                errors["Uses number not in puzzle"] += 1
            elif expr_count[num] > puzzle_count[num]:
                errors["Same number twice"] += 1

        # Does not use all input numbers
        if sorted(expr_numbers) != sorted(puzzle_numbers):
            errors["Does not use all steps or input numbers"] += 1

        # Try evaluating expression
        try:
            if simplify(expr) != 24:
                errors["Expression does not evaluate to 24"] += 1
        except Exception:
            errors["Invalid expression"] += 1
    else:
        errors["Invalid expression"] += 1

    return dict(errors)


In [4]:
#Load log file
filename = r'logs\recent\gameof24\RAFA\game24\gpt-4.1-nano-2025-04-14_0.7_single_10_value_1_greedy_1_16PuzzlesNoSelectionState_convertErrorNotFixed_time1748461091.json'
with open(filename) as f:
    data = json.load(f)

In [7]:

error_counts = defaultdict(int)
total_actions = 0

for entry in data:
    puzzle = entry['agent_info'][0]['steps'][0]['x']
    for env in entry['env_info']:
        total_actions += 1
        errors = verify(puzzle, env['action'])
        for error, count in errors.items():
            error_counts[error] += count

# Format results with x/N style
final_results = {
    error: f"{count}/{total_actions}" for error, count in error_counts.items()
}

print(final_results)


{'Final number not equal 24': '127/336', 'Uses number not in puzzle': '191/336', 'Does not use all steps or input numbers': '221/336', 'Expression does not evaluate to 24': '159/336', 'More than one final number': '95/336', 'No final state': '114/336', 'Invalid expression': '116/336', 'Same number twice': '5/336'}


In [49]:
true_total = sum(len(entry.get("env_info", [])) for entry in data)
print("True number of actions:", true_total)


True number of actions: 336


In [51]:
j = 0
for entry in data:
    puzzle = entry['agent_info'][0]['steps'][0]['x']  # starting numbers
    for i, env in enumerate(entry['env_info']):
        j+=1
        action = env['action']
        result = verify(puzzle, action)
        print(f"Action {i}: {result}")
print(j)

Action 0: {'No final state': 1}
Action 1: {'No final state': 1}
Action 2: {'No final state': 1}
Action 3: {'No final state': 1}
Action 4: {'Final number not equal 24': 1, 'Does not use all input numbers': 1}
Action 5: {'Final number not equal 24': 1, 'Does not use all input numbers': 1}
Action 6: {'Final number not equal 24': 1, 'Does not use all input numbers': 1, 'Invalid expression': 1}
Action 7: {'Final number not equal 24': 1, 'Does not use all input numbers': 1, 'Invalid expression': 1}
Action 8: {'Final number not equal 24': 1, 'Does not use all input numbers': 1, 'Invalid expression': 1}
Action 9: {'No final state': 1}
Action 10: {'No final state': 1}
Action 11: {'Final number not equal 24': 1, 'Does not use all input numbers': 1, 'Invalid expression': 1}
Action 12: {'Final number not equal 24': 1, 'Does not use all input numbers': 1}
Action 13: {'No final state': 1}
Action 14: {'Final number not equal 24': 1, 'Does not use all input numbers': 1}
Action 15: {'Final number not e

In [None]:


for entry in data:
    env_infos = entry.get("env_info", [])
    for i, env in enumerate(env_infos):
        action = env.get("action", "")
        
        print(f"Action {i}:\n{action}\n{'-'*40}")

'6 * 4 = 24 (left: 1 1 24)\n1 + 1 = 2 (left: 2 24)\n24 / 2 = 12 (left: 12)\n12 * 2 = 24 (left: 24)\n'
Action 0:
6 * 4 = 24 (left: 1 1 24)
1 + 1 = 2 (left: 2 24)
24 / 2 = 12 (left: 12)
12 * 2 = 24 (left: 24)

----------------------------------------
'1 + 1 = 2 (left: 2 24)\n2 * 24 = 48 is not possible since we want to reach 24, 24 / 2 = 12 (left: 12)\nNo possible next steps, as there is only one number.\n'
Action 1:
1 + 1 = 2 (left: 2 24)
2 * 24 = 48 is not possible since we want to reach 24, 24 / 2 = 12 (left: 12)
No possible next steps, as there is only one number.

----------------------------------------
'1 + 1 = 2 (left: 2 24)\n24 / 2 = 12 (left: 12)\nNo possible next steps can be generated with a single number, as we need at least two numbers to perform basic arithmetic operations.\n'
Action 2:
1 + 1 = 2 (left: 2 24)
24 / 2 = 12 (left: 12)
No possible next steps can be generated with a single number, as we need at least two numbers to perform basic arithmetic operations.

--------

In [19]:
from dataclasses import dataclass

@dataclass
class GameOf24State:
    puzzle: str
    steps: list[str]
    current_state: str


In [22]:
def verify2(state: GameOf24State)-> dict:
            """
            Verifies the output of a given task
                1. Checks if the numbers used are the same as the ones provided.
                2. Checks if the operations performed result to 24.

            States 
                {"r": 0} : Not finished.
                {"r": 1} : Finished and correct.
                {"r": -1} : Finished and incorrect.
            """
            current_states = state.current_state.split(" ")
            if len(current_states) !=1 or len(state.steps)<=3:
                # More than one number left
                return {'r':0}
            elif current_states[0] != "24":
                # One number left and it is not 24
                return {'r':-1}
            else:
                # One number left and it is 24
                expression = state.steps[-1].lower().replace('answer: ', '').split('=')[0]
                numbers = re.findall(r'\d+', expression)
                problem_numbers = re.findall(r'\d+', state.puzzle)
                if sorted(numbers) != sorted(problem_numbers):
                    # Numbers used are not the same as the ones provided
                    return {'r': -1}
                try:
                    if sympy.simplify(expression) == 24:
                        return {'r': 1}
                    else:
                        # Operations performed do not result to 24
                        return {'r': -1}
                except Exception as e:
                    print(e)
                    return {'r': -1}


In [23]:
for entry in data:
    x = entry['agent_info'][0]['steps'][0]['x']  # assume all agent_info use the same puzzle
    for i, env in enumerate(entry['env_info']):
        action = env['action']
        lines = [line.strip() for line in action.strip().split('\n') if line.strip()]
        if not lines:
            continue
        last_line = lines[-1]
        # Extract the final numbers left, e.g. "left: 24" or "left: 1 1"
        match = re.search(r'left:\s*(.*)', last_line)
        current_state = match.group(1) if match else ""

        state = GameOf24State(
            puzzle=x,
            steps=lines,
            current_state=current_state
        )

        result = verify2(state)
        print(f"Action {i} result: {result} | Current State: {current_state}")

Action 0 result: {'r': -1} | Current State: 24)
Action 1 result: {'r': 0} | Current State: 
Action 2 result: {'r': 0} | Current State: 
Action 3 result: {'r': 0} | Current State: 


In [57]:
import json
import re
from collections import defaultdict
from sympy import simplify

# === Verification logic ===
def verify(puzzle: str, action: str) -> dict:
    errors = defaultdict(int)
    steps = [line.strip() for line in action.strip().split('\n') if line.strip()]

    if not steps:
        errors["More than one final number / final state not 24"] += 1
        return errors

    final_line = steps[-1]
    match = re.search(r'left:\s*(.*)', final_line)
    if not match:
        errors["No final state"] += 1
        return errors

    final_numbers = match.group(1).split()
    if len(final_numbers) != 1:
        errors["More than one final number"] += 1
    if final_numbers[0] != "24":
        errors["Final number not equal 24"] += 1

    # Try to find the last valid equation to evaluate
    for line in reversed(steps):
        if "=" in line:
            try:
                expr = line.split("=")[0].strip()
                numbers_in_expr = re.findall(r'\d+', expr)
                numbers_in_puzzle = re.findall(r'\d+', puzzle)

                if sorted(numbers_in_expr) != sorted(numbers_in_puzzle):
                    errors["Does not use all input numbers"] += 1
                try:
                    if simplify(expr) != 24:
                        errors["Expression does not evaluate to 24"] += 1
                except Exception:
                    errors["Invalid expression"] += 1
                break  # only consider the last expression
            except Exception:
                errors["Invalid expression"] += 1
                break
    else:
        errors["No valid expression found"] += 1

    return errors


# === Aggregation ===
error_counts = defaultdict(int)
total_actions = 0
skipped = 0

for entry_idx, entry in enumerate(data):
    try:
        puzzle = entry['agent_info'][0]['steps'][0]['x']
    except (KeyError, IndexError):
        print(f"Missing puzzle at entry {entry_idx}")
        continue

    for env_idx, env in enumerate(entry.get("env_info", [])):
        total_actions += 1  # ✅ Count all envs

        action = env.get('action')
        if not action:
            skipped += 1
            print(f"Missing or empty action at entry {entry_idx}, env {env_idx}")
            continue

        errors = verify(puzzle, action)
        for err_type, count in errors.items():
            error_counts[err_type] += count

# === Format output ===
final_results = {
    error: f"{count}/{total_actions}" for error, count in sorted(error_counts.items())
}

print("\nErrors per type:")
for key, value in final_results.items():
    print(f"- {key}: {value}")

print(f"\nTotal actions processed: {total_actions}")
print(f"Total skipped envs (missing/empty action): {skipped}")



Errors per type:
- Does not use all input numbers: 108/336
- Expression does not evaluate to 24: 66/336
- Final number not equal 24: 107/336
- More than one final number: 33/336
- No final state: 228/336

Total actions processed: 336
Total skipped envs (missing/empty action): 0


In [97]:
true_total = sum(len(entry.get("env_info", [])) for entry in data)
print(f"True number of env_info entries (actions): {true_total}")

AttributeError: 'str' object has no attribute 'get'