In [1]:
# Imports
from lazykey import AsyncKeyHandler
from groq import AsyncGroq
from dataclasses import dataclass
from typing import List
import pickle

import random

import asyncio
import re
import math
import random
import numpy as np
from sympy import simplify
import os
# importing necessary functions from dotenv library
import os
from dotenv import load_dotenv

random.seed(0)

from async_engine.batched_api import BatchingAPI
from async_engine.api import API

from src.prompts.adapt import gameof24 as llama_prompts
from utils import parse_suggestions, create_box

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
#Setup lazykey
# Initialization
load_dotenv() 
api_keys = os.getenv("api_keys")
step_api_config = eval_api_config = {
    "max_tokens": 1000,
    "temperature": 0,
    "top_p": 1,
    "request_timeout": 120,
    "top_k": 50
}

# eligible providers ["TogehterAI", "OpenAI", "Groq"]
model = "llama-3.3-70b-versatile"
provider = "Groq"
models = {
    "step": {"model_name":model, "provider":provider},
    "eval": {"model_name":model, "provider":provider},
}
client = AsyncKeyHandler(api_keys, AsyncGroq)

api = API(eval_api_config, client=client, models=models.values(), resources=2, verbose=False)

KeyboardInterrupt: 

In [None]:
from lazykey import AsyncKeyHandler
from groq import AsyncGroq

step_api_config = eval_api_config = {
    "max_tokens": 1000,
    "temperature": 0,
    "top_p": 1,
    "request_timeout": 120,
    "top_k": 50
}

model = "llama-3.3-70b-versatile"
provider = "LazyKey"
models = {
    "step": {"model_name":model, "provider":provider},
    "eval": {"model_name":model, "provider":provider},
}

api_keys = []
client = AsyncKeyHandler(api_keys, AsyncGroq)

api = API(eval_api_config, models=models.values(), resources=2, verbose=False)

step_batcher = BatchingAPI(api, batch_size=1, timeout=2, model=models["step"]["model_name"], tab="step")

In [5]:
# State class

@dataclass(frozen=True)
class GameOf24State:
    # game of 24 puzzle, for example 1 1 4 6
    puzzle: str

    # initialized to the same value as puzzle, but is updated as the game progresses
    current_state: str

    steps: List[str]

    #Randomness used for resampling (random seed)
    randomness: int

    def __hash__(self):
        return hash((self.puzzle, self.current_state, " -> ".join(self.steps)))
    
    def items(self):
        return self.puzzle, self.current_state, self.steps, self.randomness
    
    def duplicate(self, randomness=None):
        return GameOf24State(
            puzzle=self.puzzle,
            current_state=self.current_state,
            steps=self.steps,
            randomness=randomness if randomness is not None else self.randomness)

In [6]:
#Reflexion agent

class GameOf24Agent:

    @staticmethod
    async def step(state: GameOf24State, api, namespace, reflexion: list)-> GameOf24State:
        """
        Given a state, returns the next state one.
        """

        # set up the prompt, based on the current state

        # ToT uses bfs_prompt to generate next steps but then uses
        # the cot_prompt to get the final expression. 
        # For example, input : 1 1 4 6
        # Step 0 : '1 - 1 = 0 (left: 0 4 6)'          BFS prompt
        # Step 1 : '0 + 4 = 4 (left: 4 6)'            BFS prompt
        # Step 2 : '4 * 6 = 24 (left: 24)'            BFS prompt
        # Step 3 : Answer : ((1 - 1) + 4) * 6 = 24    CoT prompt


        # set up the prompt, based on the current state

        current_state = state.current_state
        
        if current_state.strip() == "24":
            # CoT prompt
            steps = "\n".join(state.steps) + "\n"
            

            prompt = llama_prompts.cot_prompt.format(input=state.puzzle) + "Steps:\n" + steps + "Answer: "

            # Set up CoT prompt
            # if any(author in api.model for author in ["meta", "google", "mistral", "gpt-4o"]):
            #     prompt = llama_prompts.cot_prompt.format(input=state.puzzle) + "Steps:\n" + steps + "Answer: "
            # else:
            #     prompt = totor_prompts.cot_prompt.format(input=state.puzzle) + "Steps:\n" + steps

            # Get the final expression
            suggestions = await api.buffered_request(prompt, key=hash(state), namespace=namespace)

            # State does not change, only the steps
            selected_suggestion = suggestions
            selected_state = state.current_state
            


        else:
            if len(reflexion) == 0:
                prompt = llama_prompts.bfs_prompt.format(input=current_state) 
            else:
                prompt = llama_prompts.bfs_reflexion_prompt.format(input=current_state, puzzle = "1 1 4 6", reflexion=reflexion[0]) 
                print(prompt)
                
            # Set up BFS prompt
            # if any(author in api.model for author in ["meta", "google", "mistral", "gpt-4o"]):
            #     prompt = llama_prompts.bfs_prompt.format(input=current_state) + "Keep in mind the following critique from the last step: \n" + reflexion_suggestions
            # else:
            #     prompt = totor_prompts.bfs_prompt.format(input=current_state) + "Keep in mind the following critique from the last step: \n" + reflexion_suggestions

            # Get the next state
            # suggestions = await api.buffered_request(prompt, key=hash(state), namespace=namespace)

            suggestions = await api.buffered_request(prompt, key=hash(state), namespace=namespace)

            # parse suggestions, based on the current state
            parsed_suggestions = parse_suggestions(suggestions)
            if parsed_suggestions == []:
                print(f"No suggestions were parsed from state: {state}")
                print(f"\nPrompt: {prompt}\nSuggestions: {suggestions}\nParsed suggestions: {' | '.join(parsed_suggestions)}\n")
                assert False, "No suggestions found."
            
            suggestions = parsed_suggestions
            
            random.seed(state.randomness)
            selected_suggestion = random.choice(suggestions)
            selected_state = GameOf24Agent.parse_next_state(selected_suggestion)

        # set up new state object
        next_state = GameOf24State(
            puzzle=state.puzzle,
            current_state=selected_state,
            steps=state.steps + [selected_suggestion],
            randomness=random.randint(0, 1000)
        )
        return next_state
    
    @staticmethod
    def parse_next_state(suggestion: str) -> str:
        return suggestion.split('left: ')[-1].split(')')[0]
    
    @staticmethod
    def verify(state: GameOf24State)-> dict:
            """
            Verifies the output of a given task
                1. Checks if the numbers used are the same as the ones provided.
                2. Checks if the operations performed result to 24.

            States 
                {"r": 0} : Not finished.
                {"r": 1} : Finished and correct.
                {"r": -1} : Finished and incorrect.
            """
            current_states = state.current_state.split(" ")
            if len(current_states) !=1 or len(state.steps)<=3:
                # More than one number left
                return {'r':0}
            elif current_states[0] != "24":
                # One number left and it is not 24
                return {'r':-1}
            else:
                # One number left and it is 24
                expression = state.steps[-1].lower().replace('answer: ', '').split('=')[0]
                numbers = re.findall(r'\d+', expression)
                problem_numbers = re.findall(r'\d+', state.puzzle)
                if sorted(numbers) != sorted(problem_numbers):
                    # Numbers used are not the same as the ones provided
                    return {'r': -1}
                try:
                    if simplify(expression) == 24:
                        return {'r': 1}
                    else:
                        # Operations performed do not result to 24
                        return {'r': -1}
                except Exception as e:
                    print(e)
                    return {'r': -1}

    @staticmethod
    async def generate_reflexion(puzzle: str, steps: object, state: GameOf24State, api: API, namespace: tuple) -> str:
        steps_str = "\n".join(steps)
        prompt = llama_prompts.reflexion_prompt.format(puzzle=puzzle, steps=steps_str)
        reflexion = await api.buffered_request(prompt, key=hash(state), namespace=namespace)
        return reflexion
    
    @staticmethod
    async def evaluate_step(puzzle: str, steps: object, state: GameOf24State, api: API, namespace: tuple)-> str:
        prompt = llama_prompts.evaluate_prompt.format(puzzle=puzzle, steps=steps)
        evalution = await api.buffered_request(prompt, key=hash(state), namespace=namespace)
        return evalution


# Solve 1 1 4 6 puzzle:

**Time of reflexion:** Trial wise

**Type of reflexion:** List

**Type of verifier:** LLM

In [7]:
puzzle = "1 1 4 6"
num_steps = 4
num_agents = 2

In [15]:
# Attempting to solve the puzzle (without reflexion)

#Create initial state/environment
states = []
for _ in range(num_agents):
    states.append(GameOf24State(puzzle=puzzle, current_state=puzzle, steps=[], randomness=random.randint(0, 1000)))
step_batcher = BatchingAPI(api, batch_size=1, timeout=2, model=models["step"]["model_name"], tab="step")

finished_states = []

#Stepping
for step in range(num_steps):
    
    print(f"Step {step} : Stepping")
    agent_tasks = [
        asyncio.create_task(
        GameOf24Agent.step(state, step_batcher, namespace=(0, f"Agent: {agent_id}", f"Step : {step}"), reflexion="")
        )
        for agent_id, state in enumerate(states)
    ]
    states = await asyncio.gather(*agent_tasks)
    for agent_id, state in enumerate(states):
        print(f"Current step for agent {agent_id}: {state.steps[-1]} \n")

    # Evaluate whether a puzzle has been solved
    i = 0
    while i < len(states):
        if GameOf24Agent.verify(states[i]) == {"r": 1}:
            print(f"Puzzle finished: {states[i].puzzle}")
            finished_states.append(states.pop(i))
        else:
            i += 1
    # If all puzzles have been solved, break
    if len(states) == 0:
        break

Step 0 : Stepping
Current step for agent 0: 1 * 4 = 4 (left: 1 4 6) 

Current step for agent 1: 4 - 6 = -2 (left: -2 1 1) 

Step 1 : Stepping
Current step for agent 0: 1 * 6 = 6 (left: 4 6) 

Current step for agent 1: 1 + 1 = 2 (left: -2 2) 

Step 2 : Stepping
Current step for agent 0: 4 * 6 = 24 (left: 24) 

Current step for agent 1: -2 * 2 = -4 (left: -4) 

Step 3 : Stepping
Current step for agent 0: (1 * 4) * (1 * 6) = 24 

Current step for agent 1: None 

Puzzle finished: 1 1 4 6


In [None]:
# Generate evaluations / verify steps using LLM

agent_evaluations = [
    asyncio.create_task(
    GameOf24Agent.evaluate_step(puzzle=puzzle, steps=state.steps, state=state, api=step_batcher, namespace=(0, f"Agent: {agent_id}", f"Step : {step}"))
    )
    for agent_id, state in enumerate(states)
]
evaluation = await asyncio.gather(*agent_evaluations)

for agent_id, agent_evaluation in enumerate(evaluation):
    print(f"reflexion {agent_id}: {agent_evaluation} \n")

reflexion 0: To evaluate the given solution attempt for the game of 24 with the numbers 1, 1, 4, and 6, let's break it down step by step.

1. **Step 0**: The initial numbers are 1, 1, 4, and 6.
   - The operation is 6 + 4 = 10.
   - Evaluation: This step is valid as it uses basic arithmetic operations and the numbers are still available. The result is correct.
   - Left numbers: 1, 1, 10.
   - Can these numbers reach 24? Given the numbers 1, 1, and 10, let's evaluate:
     - 1 + 1 + 10 = 12, which is far from 24.
     - (10 - 1) * 1 = 9 * 1 = 9, which is also not 24.
     - The numbers are within a reasonable range but seem challenging to reach 24 directly.
     - Evaluation: likely

2. **Step 1**: The numbers are 1, 1, and 10.
   - The operation is 1 * 10 = 10.
   - Evaluation: This step is mathematically correct, but let's assess if these numbers can reach 24:
     - After this step, we have 1 and 10.
     - 1 + 10 = 11, which is not 24.
     - The operation itself is valid, but reac

  evaluation = await asyncio.gather(*agent_evaluations)
  for agent_id, agent_evaluation in enumerate(evaluation):


In [8]:
#Load validation set
with open('evaluate_prompt_list.pkl', 'rb') as file:
    loaded_list = pickle.load(file)
print(len(loaded_list))

10


In [9]:
answers = {
    "Test 0": 0,
    "Test 1": 1,
    "Test 2": 2,
    "Test 3": 0,
    "Test 4": 2,
    "Test 5": 0,
    "Test 6": 1,
    "Test 7": 0,
    "Test 8": 0,
    "Test 9": 0
}

In [13]:
agent_evaluations_test = [
    asyncio.create_task(
    GameOf24Agent.evaluate_step(puzzle=puzzle, steps=state[0].steps, state=state[0], api=step_batcher, namespace=(0, f"Agent: {agent_id}", f"Step : {step}"))
    )
    for agent_id, state in enumerate(loaded_list)
]
test_evaluations = await asyncio.gather(*agent_evaluations_test)

for agent_id, eval in enumerate(test_evaluations):
    print(f"reflexion {agent_id}: {eval} \n")

reflexion 0: To evaluate the given solution attempt for the game of 24 with the numbers 1, 1, 4, and 6, let's break it down step by step.

1. **Step 0**: The initial numbers are 1, 1, 4, and 6.
   - The operation is 1 + 1 = 2.
   - Evaluation: The arithmetic is correct. The numbers used are available. The result is computed correctly and used properly in the next step.
   - Reach 24: After this step, we have 2, 4, and 6 left. Given these numbers, it's not immediately clear if we can reach 24, but the numbers are within a reasonable range, so it's likely.

2. **Step 1**: The numbers left are 2, 4, and 6.
   - The operation is 6 / 4 = 1.5.
   - Evaluation: The arithmetic is correct. The numbers used are available. The result is computed correctly.
   - Reach 24: After this step, we have 1.5 and 2 left. Given these numbers, reaching 24 seems impossible because the numbers are too small and the operations that can be performed with them are limited.

3. **Step 2**: The numbers left are 1.5

In [14]:
correct = 0
for id in range(10):
    llm_answer = test_evaluations[id][-1]
    correct_answer = answers.get(f"Test {id}")
    if int(llm_answer) == correct_answer:
        correct += 1

print(correct / 10 * 100, "% accuracy")

30.0 % accuracy


In [None]:
# Generate reflexions if the puzzle is not solved

agent_reflexions = [
    asyncio.create_task(
    GameOf24Agent.generate_reflexion(puzzle=puzzle, steps=state.steps, state=state, api=step_batcher, namespace=(0, f"Agent: {agent_id}", f"Step : {step}"))
    )
    for agent_id, state in enumerate(states)
]
reflexion = await asyncio.gather(*agent_reflexions)

for agent_id, agent_reflexion in enumerate(reflexion):
    print(f"reflexion {agent_id}: {agent_reflexion}")

reflexion 0: The previous attempt at solving the puzzle contains a mistake. The error occurs when the player multiplies 1 by 9, resulting in 9, which does not bring them closer to the goal of reaching 24. This mistake can be avoided by carefully evaluating the remaining numbers and available operations to ensure each step progresses towards the target.

A general reflection is that players should prioritize operations that increase the value or create more favorable combinations, rather than repeating or stagnating the current value. This requires a strategic approach, considering the potential outcomes of each operation and the remaining numbers to be used.
reflexion 1: Reflection:
The mistake in the previous attempt is the operation sequence, which led to a dead end with no possible way to reach 24. A similar mistake can be avoided by considering the properties of the numbers and the operations. Specifically, using division with a small divisor (1 in this case) may limit subsequent o

In [None]:
# Reattempting to solve the puzzle (with reflexion)

#Resetting
states = []
for _ in range(num_agents):
    states.append(GameOf24State(puzzle=puzzle, current_state=puzzle, steps=[], randomness=random.randint(0, 1000)))
finished_states = []



#Stepping
for step in range(num_steps):
    
    print(f"Step {step} : Stepping")
    agent_tasks = [
        asyncio.create_task(
        GameOf24Agent.step(state, step_batcher, namespace=(0, f"Agent: {agent_id}", f"Step : {step}"), reflexion=reflexion[agent_id])
        )
        for agent_id, state in enumerate(states)
    ]

    states = await asyncio.gather(*agent_tasks)

    for agent_id, state in enumerate(states):
        print(f"Current step for agent {agent_id}: {state.steps[-1]} \n")

    # Evaluate whether a puzzle has been solved
    i = 0
    while i < len(states):
        print("i: ", i)
        if GameOf24Agent.verify(states[i]) == {"r": 1}:
            print(f"Puzzle finished: {states[i].puzzle}")
            finished_states.append(states.pop(i))
        else:
            i += 1

    # If all puzzles have been solved, break
    if len(states) == 0:
        break

Step 0 : Stepping
Current step for agent 0: 4 * 6 = 24 (left: 1 24) 

Current step for agent 1: 1 / 4 = 0.25 (left: 0.25 1 6) 

i:  0
i:  1
Step 1 : Stepping
Current step for agent 0: 1 * 24 = 24 (left: 24) 

Current step for agent 1: 6 / 1 = 6 (left: 0.25 6) 

i:  0
i:  1
Step 2 : Stepping
Current step for agent 0: 24 / 2 = 12 (left: 12)
24 * 2 = 48 (not possible since 2 is not in the input)
24 + 2 = 26 (not possible since 2 is not in the input)
24 - 2 = 22 (not possible since 2 is not in the input)
24 / 1 = 24 (left: 24)
24 * 1 = 24 (left: 24)
24 + 1 = 25 (not possible since 1 is not in the input)
24 - 1 = 23 (not possible since 1 is not in the input)
24 + 24 = 48 (left: 48)
24 * 24 = 576 (left: 576)
24 - 24 = 0 (left: 0)
24 / 24 = 1 (left: 1)
24 / 3 = 8 (not possible since 3 is not in the input)
24 / 4 = 6 (not possible since 4 is not in the input)
24 / 6 = 4 (not possible since 6 is not in the input)
24 / 8 = 3 (not possible since 8 is not in the input)
24 / 12 = 2 (not possible si

# Solve 1 1 4 6 puzzle:

**Time of reflexion:** Step wise

**Type of reflexion:** List

**Type of verifier:** LLM

['', '', '', '']


In [9]:
#Create initial state/environment
states = []
for _ in range(num_agents):
    states.append(GameOf24State(puzzle=puzzle, current_state=puzzle, steps=[], randomness=random.randint(0, 1000)))
step_batcher = BatchingAPI(api, batch_size=1, timeout=2, model=models["step"]["model_name"], tab="step")

finished_states = []
reflexion = [""]*num_agents
#Stepping
for step in range(num_steps):
    
    print(f"Step {step} : Stepping")
    agent_tasks = [
        asyncio.create_task(
        GameOf24Agent.step(state, step_batcher, namespace=(0, f"Agent: {agent_id}", f"Step : {step}"), reflexion=reflexion[agent_id])
        )
        for agent_id, state in enumerate(states)
    ]
    states = await asyncio.gather(*agent_tasks)
    for agent_id, state in enumerate(states):
        print(f"Current step for agent {agent_id}: {state.steps[-1]} \n")

    # Evaluate whether a puzzle has been solved
    i = 0
    while i < len(states):
        if GameOf24Agent.verify(states[i]) == {"r": 1}:
            print(f"Puzzle finished: {states[i].puzzle}")
            finished_states.append(states.pop(i))
        else:
            i += 1
    
    agent_reflexions = [
    asyncio.create_task(
    GameOf24Agent.generate_reflexion(puzzle=puzzle, steps=state.steps, state=state, api=step_batcher, namespace=(0, f"Agent: {agent_id}", f"Step : {step}"))
    )
    for agent_id, state in enumerate(states)
    ]
    reflexion = await asyncio.gather(*agent_reflexions)
    # If all puzzles have been solved, break
    if len(states) == 0:
        break

Step 0 : Stepping
Current step for agent 0: 4 + 6 = 10 (left: 1 1 10) 

Current step for agent 1: 4 - 1 = 3 (left: 1 3 6) 

Step 1 : Stepping
Use numbers and basic arithmetic operations (+ - * /). Each step, you are only allowed to choose two of the remaining numbers to obtain a new number. Do not explain simply list possible next steps as well as all the remaining numbers and nothing else.

Example: 2 8 8 14
Possible next steps:
2 + 8 = 10 (left: 8 10 14)
8 / 2 = 4 (left: 4 8 14)
14 + 2 = 16 (left: 8 8 16)
2 * 8 = 16 (left: 8 14 16)
8 - 2 = 6 (left: 6 8 14)

Example: 1 3
Possible next steps:
1 + 3 = 4 (left: 4)
1 * 3 = 3 (left: 3)
3 - 1 = 2 (left: 2)
3 / 1 = 3 (left: 3)
1 - 3 = -2 (left: -2)



Based on previous attempts to solve the puzzle, here is some advice on how to proceed:
T

Input: 1 1 10
Possible next steps:
Use numbers and basic arithmetic operations (+ - * /). Each step, you are only allowed to choose two of the remaining numbers to obtain a new number. Do not explain simpl

In [None]:
a = []

print(a[0])

IndexError: list index out of range