In [4]:
#imports
import json
import re
from sympy import simplify
from collections import defaultdict
import numpy as np
from utils import load_test_puzzles
from data.data import GameOf24Data
import os
import pandas as pd

In [18]:
datadir = f'24_tot.csv'
DATA_PATH = 'data'
path = os.path.join(DATA_PATH, 'datasets', datadir)
data = list(pd.read_csv(path)['Puzzles'])
arr = np.arange(len(data))
uniform_test_indices = np.linspace(0, len(data) - 1, 60, dtype=int)
remaining_indices = [i for i in arr if i not in set(uniform_test_indices)]
uniform_val_indices = np.array([remaining_indices[i] for i in np.linspace(0, len(remaining_indices) - 1, 30, dtype=int)])
uniform_indices = uniform_val_indices
index = 0
puzzle = data[uniform_indices[index]]
for i, index in enumerate(uniform_val_indices):
    puzzle = data[uniform_indices[i]]
    print(f'index: {uniform_indices[i]}\npuzzle: {puzzle}')

index: 1
puzzle: 1 1 11 11
index: 47
puzzle: 11 11 11 12
index: 94
puzzle: 2 7 7 8
index: 141
puzzle: 1 4 5 12
index: 188
puzzle: 1 1 3 4
index: 235
puzzle: 1 8 10 13
index: 282
puzzle: 4 5 6 6
index: 329
puzzle: 5 6 7 7
index: 375
puzzle: 2 8 12 12
index: 422
puzzle: 3 7 12 13
index: 469
puzzle: 3 9 10 11
index: 516
puzzle: 2 10 10 11
index: 563
puzzle: 2 6 8 11
index: 610
puzzle: 3 4 5 5
index: 657
puzzle: 2 7 9 10
index: 703
puzzle: 3 3 8 12
index: 750
puzzle: 2 8 9 11
index: 797
puzzle: 3 6 12 13
index: 844
puzzle: 2 5 6 10
index: 891
puzzle: 2 6 12 12
index: 938
puzzle: 2 7 12 13
index: 985
puzzle: 5 9 12 12
index: 1031
puzzle: 4 8 8 10
index: 1078
puzzle: 1 4 7 9
index: 1125
puzzle: 2 4 9 10
index: 1172
puzzle: 1 3 12 13
index: 1219
puzzle: 2 3 11 11
index: 1266
puzzle: 2 6 9 9
index: 1313
puzzle: 5 7 9 12
index: 1360
puzzle: 1 3 4 6


In [6]:
import re
from collections import defaultdict
from sympy import simplify

def verify(puzzle: str, action: str) -> dict:
    """
    Verifies a Game of 24 action string against a puzzle.

    Returns a dictionary of specific error types (each with count 1 if it occurred).
    """
    errors = defaultdict(int)

    puzzle_numbers = re.findall(r'\d+', puzzle)
    puzzle_count = {num: puzzle_numbers.count(num) for num in puzzle_numbers}

    steps = [line.strip() for line in action.strip().split('\n') if line.strip()]
    if not steps:
        errors["No final state"] += 1
        return errors

    # --- Final state check (based on last "left:" step) ---
    final_state_line = None
    for line in reversed(steps):
        match = re.search(r'left:\s*(.*)', line)
        if match:
            final_state_line = match.group(1)
            break

    if not final_state_line:
        errors["No final state"] += 1
    else:
        final_numbers = final_state_line.split()
        if len(final_numbers) > 1:
            errors["More than one final number"] += 1
        elif final_numbers[0] != "24":
            errors["Final number not equal 24"] += 1

    # --- Check for notes in steps ---
    for step in steps:
        if "(Note:" in step or "note:" in step.lower():
            errors["Gives note"] += 1
            break

    # --- Check for multiple answer lines ---
    answer_lines = [line for line in steps if line.lower().startswith("answer:")]
    if len(answer_lines) > 1:
        errors["Writes more than one answer"] += 1

    # --- Extract expression from answer or fallback equation ---
    expr = None
    if answer_lines:
        expr = answer_lines[-1].split(":", 1)[1].split("=")[0].strip()
    else:
        # fallback to last equation
        for line in reversed(steps):
            if "=" in line:
                expr = line.split("=")[0].strip()
                break

    # --- Expression validation ---
    if expr:
        expr_numbers = re.findall(r'\d+', expr)
        expr_count = {num: expr_numbers.count(num) for num in expr_numbers}

        # Uses number not in puzzle
        for num in expr_count:
            if num not in puzzle_count:
                errors["Uses number not in puzzle"] += 1
            elif expr_count[num] > puzzle_count[num]:
                errors["Same number twice"] += 1

        # Does not use all input numbers
        if sorted(expr_numbers) != sorted(puzzle_numbers):
            errors["Does not use all steps or input numbers"] += 1

        # Try evaluating expression
        try:
            if simplify(expr) != 24:
                errors["Expression does not evaluate to 24"] += 1
        except Exception:
            errors["Invalid expression"] += 1
    else:
        errors["Invalid expression"] += 1

    return dict(errors)


In [4]:
#Load log file
filename = r'logs\recent\gameof24\RAFA\game24\gpt-4.1-nano-2025-04-14_0.7_single_10_value_1_greedy_1_16PuzzlesNoSelectionState_convertErrorNotFixed_time1748461091.json'
with open(filename) as f:
    data = json.load(f)

In [7]:

error_counts = defaultdict(int)
total_actions = 0

for entry in data:
    puzzle = entry['agent_info'][0]['steps'][0]['x']
    for env in entry['env_info']:
        total_actions += 1
        errors = verify(puzzle, env['action'])
        for error, count in errors.items():
            error_counts[error] += count

# Format results with x/N style
final_results = {
    error: f"{count}/{total_actions}" for error, count in error_counts.items()
}

print(final_results)


{'Final number not equal 24': '127/336', 'Uses number not in puzzle': '191/336', 'Does not use all steps or input numbers': '221/336', 'Expression does not evaluate to 24': '159/336', 'More than one final number': '95/336', 'No final state': '114/336', 'Invalid expression': '116/336', 'Same number twice': '5/336'}


In [49]:
true_total = sum(len(entry.get("env_info", [])) for entry in data)
print("True number of actions:", true_total)


True number of actions: 336


In [51]:
j = 0
for entry in data:
    puzzle = entry['agent_info'][0]['steps'][0]['x']  # starting numbers
    for i, env in enumerate(entry['env_info']):
        j+=1
        action = env['action']
        result = verify(puzzle, action)
        print(f"Action {i}: {result}")
print(j)

Action 0: {'No final state': 1}
Action 1: {'No final state': 1}
Action 2: {'No final state': 1}
Action 3: {'No final state': 1}
Action 4: {'Final number not equal 24': 1, 'Does not use all input numbers': 1}
Action 5: {'Final number not equal 24': 1, 'Does not use all input numbers': 1}
Action 6: {'Final number not equal 24': 1, 'Does not use all input numbers': 1, 'Invalid expression': 1}
Action 7: {'Final number not equal 24': 1, 'Does not use all input numbers': 1, 'Invalid expression': 1}
Action 8: {'Final number not equal 24': 1, 'Does not use all input numbers': 1, 'Invalid expression': 1}
Action 9: {'No final state': 1}
Action 10: {'No final state': 1}
Action 11: {'Final number not equal 24': 1, 'Does not use all input numbers': 1, 'Invalid expression': 1}
Action 12: {'Final number not equal 24': 1, 'Does not use all input numbers': 1}
Action 13: {'No final state': 1}
Action 14: {'Final number not equal 24': 1, 'Does not use all input numbers': 1}
Action 15: {'Final number not e

In [None]:


for entry in data:
    env_infos = entry.get("env_info", [])
    for i, env in enumerate(env_infos):
        action = env.get("action", "")
        
        print(f"Action {i}:\n{action}\n{'-'*40}")

'6 * 4 = 24 (left: 1 1 24)\n1 + 1 = 2 (left: 2 24)\n24 / 2 = 12 (left: 12)\n12 * 2 = 24 (left: 24)\n'
Action 0:
6 * 4 = 24 (left: 1 1 24)
1 + 1 = 2 (left: 2 24)
24 / 2 = 12 (left: 12)
12 * 2 = 24 (left: 24)

----------------------------------------
'1 + 1 = 2 (left: 2 24)\n2 * 24 = 48 is not possible since we want to reach 24, 24 / 2 = 12 (left: 12)\nNo possible next steps, as there is only one number.\n'
Action 1:
1 + 1 = 2 (left: 2 24)
2 * 24 = 48 is not possible since we want to reach 24, 24 / 2 = 12 (left: 12)
No possible next steps, as there is only one number.

----------------------------------------
'1 + 1 = 2 (left: 2 24)\n24 / 2 = 12 (left: 12)\nNo possible next steps can be generated with a single number, as we need at least two numbers to perform basic arithmetic operations.\n'
Action 2:
1 + 1 = 2 (left: 2 24)
24 / 2 = 12 (left: 12)
No possible next steps can be generated with a single number, as we need at least two numbers to perform basic arithmetic operations.

--------

In [19]:
from dataclasses import dataclass

@dataclass
class GameOf24State:
    puzzle: str
    steps: list[str]
    current_state: str


In [22]:
def verify2(state: GameOf24State)-> dict:
            """
            Verifies the output of a given task
                1. Checks if the numbers used are the same as the ones provided.
                2. Checks if the operations performed result to 24.

            States 
                {"r": 0} : Not finished.
                {"r": 1} : Finished and correct.
                {"r": -1} : Finished and incorrect.
            """
            current_states = state.current_state.split(" ")
            if len(current_states) !=1 or len(state.steps)<=3:
                # More than one number left
                return {'r':0}
            elif current_states[0] != "24":
                # One number left and it is not 24
                return {'r':-1}
            else:
                # One number left and it is 24
                expression = state.steps[-1].lower().replace('answer: ', '').split('=')[0]
                numbers = re.findall(r'\d+', expression)
                problem_numbers = re.findall(r'\d+', state.puzzle)
                if sorted(numbers) != sorted(problem_numbers):
                    # Numbers used are not the same as the ones provided
                    return {'r': -1}
                try:
                    if sympy.simplify(expression) == 24:
                        return {'r': 1}
                    else:
                        # Operations performed do not result to 24
                        return {'r': -1}
                except Exception as e:
                    print(e)
                    return {'r': -1}


In [23]:
for entry in data:
    x = entry['agent_info'][0]['steps'][0]['x']  # assume all agent_info use the same puzzle
    for i, env in enumerate(entry['env_info']):
        action = env['action']
        lines = [line.strip() for line in action.strip().split('\n') if line.strip()]
        if not lines:
            continue
        last_line = lines[-1]
        # Extract the final numbers left, e.g. "left: 24" or "left: 1 1"
        match = re.search(r'left:\s*(.*)', last_line)
        current_state = match.group(1) if match else ""

        state = GameOf24State(
            puzzle=x,
            steps=lines,
            current_state=current_state
        )

        result = verify2(state)
        print(f"Action {i} result: {result} | Current State: {current_state}")

Action 0 result: {'r': -1} | Current State: 24)
Action 1 result: {'r': 0} | Current State: 
Action 2 result: {'r': 0} | Current State: 
Action 3 result: {'r': 0} | Current State: 


In [57]:
import json
import re
from collections import defaultdict
from sympy import simplify

# === Verification logic ===
def verify(puzzle: str, action: str) -> dict:
    errors = defaultdict(int)
    steps = [line.strip() for line in action.strip().split('\n') if line.strip()]

    if not steps:
        errors["More than one final number / final state not 24"] += 1
        return errors

    final_line = steps[-1]
    match = re.search(r'left:\s*(.*)', final_line)
    if not match:
        errors["No final state"] += 1
        return errors

    final_numbers = match.group(1).split()
    if len(final_numbers) != 1:
        errors["More than one final number"] += 1
    if final_numbers[0] != "24":
        errors["Final number not equal 24"] += 1

    # Try to find the last valid equation to evaluate
    for line in reversed(steps):
        if "=" in line:
            try:
                expr = line.split("=")[0].strip()
                numbers_in_expr = re.findall(r'\d+', expr)
                numbers_in_puzzle = re.findall(r'\d+', puzzle)

                if sorted(numbers_in_expr) != sorted(numbers_in_puzzle):
                    errors["Does not use all input numbers"] += 1
                try:
                    if simplify(expr) != 24:
                        errors["Expression does not evaluate to 24"] += 1
                except Exception:
                    errors["Invalid expression"] += 1
                break  # only consider the last expression
            except Exception:
                errors["Invalid expression"] += 1
                break
    else:
        errors["No valid expression found"] += 1

    return errors


# === Aggregation ===
error_counts = defaultdict(int)
total_actions = 0
skipped = 0

for entry_idx, entry in enumerate(data):
    try:
        puzzle = entry['agent_info'][0]['steps'][0]['x']
    except (KeyError, IndexError):
        print(f"Missing puzzle at entry {entry_idx}")
        continue

    for env_idx, env in enumerate(entry.get("env_info", [])):
        total_actions += 1  # ✅ Count all envs

        action = env.get('action')
        if not action:
            skipped += 1
            print(f"Missing or empty action at entry {entry_idx}, env {env_idx}")
            continue

        errors = verify(puzzle, action)
        for err_type, count in errors.items():
            error_counts[err_type] += count

# === Format output ===
final_results = {
    error: f"{count}/{total_actions}" for error, count in sorted(error_counts.items())
}

print("\nErrors per type:")
for key, value in final_results.items():
    print(f"- {key}: {value}")

print(f"\nTotal actions processed: {total_actions}")
print(f"Total skipped envs (missing/empty action): {skipped}")



Errors per type:
- Does not use all input numbers: 108/336
- Expression does not evaluate to 24: 66/336
- Final number not equal 24: 107/336
- More than one final number: 33/336
- No final state: 228/336

Total actions processed: 336
Total skipped envs (missing/empty action): 0


In [58]:
true_total = sum(len(entry.get("env_info", [])) for entry in data)
print(f"True number of env_info entries (actions): {true_total}")

True number of env_info entries (actions): 336
