# Wordle Agent


WORDLE GAME:


![WordleGame](https://media.phillyvoice.com/media/images/011322_Wordle.2e16d0ba.fill-735x490.png)


### Game Rules: - You have **6 tries** to guess a secret **5-letter** word. 
- Each guess must be a valid **5-letter English word**. 
- After each guess, you will receive feedback indicating how close your guess was. 

### Feedback Format: Each letter in your guess will receive one of three vlaues: 
- G (green): correct letter in the correct position
- Y (yellow): letter exists in the word but in the wrong position
- X (wrong): letter is not in the word


In [1]:
from datasets import load_dataset
import pandas as pd
import dspy
from dotenv import load_dotenv
from pprint import pprint
from typing import Literal, Union
load_dotenv()

gpt4o_mini = dspy.LM('gpt-4o-mini')


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
class WordleGuess(dspy.Signature):

    feedback: str = dspy.InputField(
        desc="""Feedback from the previous guess, if any. Feedback Format: Each letter in your guess will receive one of three values: 
- G (green): correct letter in the correct position
- Y (yellow): letter exists in the word but in the wrong position
- X (wrong): letter is not in the word. First guess has no feedback."""
    )

    guessed_word: str = dspy.OutputField(
        desc="The guessed word based on the feedback from the previous guess. only alphabetic characters are allowed, and it must be a valid 5-letter English word.",
    )

## Test the signature

with dspy.context(lm=gpt4o_mini):
    wordle_guess = dspy.ChainOfThought(WordleGuess)

    # Example usage
    response = wordle_guess(feedback="First Guess, no feedback")
    pprint(response)

Prediction(
    reasoning='Since this is the first guess and there is no feedback, I will choose a common 5-letter word to start the guessing process.',
    guessed_word='crate'
)


In [3]:
class GuessedWordFeedback(dspy.Signature):

    guessed_word: str = dspy.InputField(desc="The predicted word by the player")
    actual_word: str = dspy.InputField(desc="The actual word to be guessed")
    feedback: str = dspy.OutputField(
        desc="""Feedback Format: Each letter in your guess will receive one of three values: 
- G (green): correct letter in the correct position
- Y (yellow): letter exists in the word but in the wrong position
- X (wrong): letter is not in the word"""
    )

## Test the signature
with dspy.context(lm=gpt4o_mini):
    feedback_player = dspy.ChainOfThought(GuessedWordFeedback)
    response = feedback_player(guessed_word="apple", actual_word="plane")
    pprint(response)

Prediction(
    reasoning='The guessed word "apple" has some letters that match with the actual word "plane". The letter \'p\' is in the same position in both words, so it is marked as green. The letter \'a\' is present in "plane" but in a different position, so it is marked as yellow. The letters \'l\' and \'e\' in "apple" do not appear in "plane", so they are marked as wrong.',
    feedback='GXYXG'
)


In [4]:
class Wordle(dspy.Module):
    """The game of Wordle.
    Game Rules: - You have **6 tries** to guess a secret **5-letter** word.
    - Each guess must be a valid **5-letter English word**.
    - After each guess, you will receive feedback indicating how close your guess was.

    ### Feedback Format: Each letter in your guess will receive one of three values:
    - G (green): correct letter in the correct position
    - Y (yellow): letter exists in the word but in the wrong position
    - X (wrong): letter is not in the word."""

    def __init__(
        self, player1_lm, player2_lm, max_attempts=6, is_training=False, verbose=False
    ):
        super().__init__()
        self.player1 = dspy.ChainOfThought(WordleGuess)
        self.player1.set_lm(player1_lm)
        self.player2 = dspy.ChainOfThought(GuessedWordFeedback)
        self.player2.set_lm(player2_lm)
        self.max_attempts = max_attempts
        self.training_mode = is_training
        self.verbose = verbose
        
        
    def forward(self, word):

        # first attempt has no feedback
        feedback = "Guess the word!"

        for attempt in range(self.max_attempts):

            guess = self.player1(feedback=feedback)
            if guess.guessed_word == word:
                # print(f"Predicted Word: {guess.guessed_word}, Feedback: Correct!")
                return (
                    {"answer": guess.guessed_word}
                    if self.training_mode
                    else "Player 1 Wins after {} attempts!".format(attempt + 1)
                )

            feedback = self.player2(guessed_word=guess.guessed_word, actual_word=word)
            if self.verbose:
                pprint(
                    f"Attempt {attempt + 1}: Predicted Word: {guess.guessed_word}, Feedback: {feedback.feedback}"
                )
                
        
        return {"answer": guess.guessed_word} if self.training_mode else "Sorry, you've run out of attempts."

In [5]:
wordle_game = Wordle(player1_lm=dspy.LM('gpt-4o-mini'), player2_lm=dspy.LM('gpt-4o-mini'), max_attempts=6, is_training=False, verbose=True)

In [7]:
wordle_game(word="piano")

'Attempt 1: Predicted Word: crate, Feedback: XXXXX'
'Attempt 2: Predicted Word: plumb, Feedback: GXXXX'
'Attempt 3: Predicted Word: pasty, Feedback: GXYXX'
'Attempt 4: Predicted Word: plant, Feedback: GXXYY'


'Player 1 Wins after 5 attempts!'

In [7]:
wordle_game(word="plane")

'Attempt 1: Predicted Word: crate, Feedback: XYXYX'
'Attempt 2: Predicted Word: leaky, Feedback: X X Y X X'
'Attempt 3: Predicted Word: brand, Feedback: X X Y X X'


'Player 1 Wins after 4 attempts!'

In [8]:
wordle_game(word="stick")


'Attempt 1: Predicted Word: crate, Feedback: XXXXX'
'Attempt 2: Predicted Word: plumb, Feedback: XXXXX'
'Attempt 3: Predicted Word: crane, Feedback: XXXXX'
'Attempt 4: Predicted Word: plumb, Feedback: XXXXX'
'Attempt 5: Predicted Word: crane, Feedback: XXXXX'
'Attempt 6: Predicted Word: plumb, Feedback: XXXXX'


"Sorry, you've run out of attempts."

# Optimize the Prompt Automatically with MiPROv2

Multiprompt Instruction PRoposal Optimizer Version 2

## Define our Dataset

In [8]:
pd.read_parquet("../data/wordle_grpo.parquet").columns

Index(['prompt', 'completion', 'answer', 'reward', 'task'], dtype='object')

In [9]:

ds = pd.read_parquet("../data/wordle_grpo.parquet")
ds.drop(columns=['prompt', 'completion','reward', 'task'], inplace=True)
ds.rename(columns={'answer': 'word'}, inplace=True)
ds['answer'] = ds['word']
ds = ds.to_dict(orient='records')[:500]
ds = [dspy.Example(**item).with_inputs('word') for item in ds]

train_size = int(0.8 * len(ds))
trainset = ds[:train_size]
devset = ds[train_size:]

# wordle_dataset = WordleDataset()
pprint(devset[:3])

[Example({'word': 'smash', 'answer': 'smash'}) (input_keys={'word'}),
 Example({'word': 'heart', 'answer': 'heart'}) (input_keys={'word'}),
 Example({'word': 'shirt', 'answer': 'shirt'}) (input_keys={'word'})]


In [10]:
# define the evaluator

def exact_match(pred, gt, trace=None):
    return pred["answer"] == gt["answer"]

evaluator = dspy.Evaluate(
    metric=exact_match,
    devset=devset,
    display_table=True,
    display_progress=True,
    num_threads=24,
    provide_traceback=True,
)

wordle_game = Wordle(player1_lm=dspy.LM('gpt-4o-mini'), player2_lm=dspy.LM('gpt-4o-mini'), max_attempts=6, is_training=True, verbose=False)


In [11]:

original_score = evaluator(wordle_game)
print(f"Original score: {original_score}")

Average Metric: 27.00 / 100 (27.0%): 100%|██████████| 100/100 [00:38<00:00,  2.58it/s]

2025/06/20 10:32:20 INFO dspy.evaluate.evaluate: Average Metric: 27 / 100 (27.0%)





Unnamed: 0,word,example_answer,pred_answer,exact_match
0,smash,smash,smash,✔️ [True]
1,heart,heart,heart,✔️ [True]
2,shirt,shirt,shirt,✔️ [True]
3,guide,guide,blame,
4,quick,quick,brave,
...,...,...,...,...
95,chalk,chalk,clash,
96,brake,brake,brash,
97,blade,blade,blaze,
98,heart,heart,heart,✔️ [True]


Original score: 27.0


# Use optimizer

In [None]:
with dspy.context(lm=dspy.LM('gpt-4o-mini')):


    teleprompter = dspy.MIPROv2(
        metric=exact_match,
        # auto="light",
        num_threads=16,
        prompt_model=wordle_game.player1.get_lm(),
        num_candidates=3,
    )

    optimized_wordle_game = teleprompter.compile(
        wordle_game,
        trainset=trainset[:100],
        valset=devset,
        requires_permission_to_run=False,
        num_trials=25
    )

    optimized_score = evaluator(optimized_wordle_game)
    print(f"Optimized score: {optimized_score}")

2025/06/19 18:24:34 INFO dspy.teleprompt.mipro_optimizer_v2: 
RUNNING WITH THE FOLLOWING LIGHT AUTO RUN SETTINGS:
num_trials: 20
minibatch: True
num_fewshot_candidates: 6
num_instruct_candidates: 3
valset size: 100

2025/06/19 18:24:34 INFO dspy.teleprompt.mipro_optimizer_v2: 
==> STEP 1: BOOTSTRAP FEWSHOT EXAMPLES <==
2025/06/19 18:24:34 INFO dspy.teleprompt.mipro_optimizer_v2: These will be used as few-shot example candidates for our program and for creating instructions.

2025/06/19 18:24:34 INFO dspy.teleprompt.mipro_optimizer_v2: Bootstrapping N=6 sets of demonstrations...


Bootstrapping set 1/6
Bootstrapping set 2/6
Bootstrapping set 3/6


  0%|          | 0/100 [00:00<?, ?it/s]
2025/06/19 18:24:34 INFO dspy.teleprompt.mipro_optimizer_v2: Error generating few-shot examples: Can't pickle StringSignature(guessed_word, actual_word -> reasoning, feedback
    instructions='Given the fields `guessed_word`, `actual_word`, produce the fields `feedback`.'
    guessed_word = Field(annotation=str required=True json_schema_extra={'desc': 'The predicted word by the player', '__dspy_field_type': 'input', 'prefix': 'Guessed Word:'})
    actual_word = Field(annotation=str required=True json_schema_extra={'desc': 'The actual word to be guessed', '__dspy_field_type': 'input', 'prefix': 'Actual Word:'})
    reasoning = Field(annotation=str required=True json_schema_extra={'prefix': "Reasoning: Let's think step by step in order to", 'desc': '${reasoning}', '__dspy_field_type': 'output'})
    feedback = Field(annotation=str required=True json_schema_extra={'desc': 'Feedback Format: Each letter in your guess will receive one of three values: 

Average Metric: 40.00 / 100 (40.0%): 100%|██████████| 100/100 [01:00<00:00,  1.65it/s]

2025/06/19 18:26:22 INFO dspy.evaluate.evaluate: Average Metric: 40 / 100 (40.0%)
2025/06/19 18:26:22 INFO dspy.teleprompt.mipro_optimizer_v2: Default program score: 40.0






2025/06/19 18:26:22 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 2 / 25 - Minibatch ==


Average Metric: 11.00 / 35 (31.4%): 100%|██████████| 35/35 [01:06<00:00,  1.90s/it]

2025/06/19 18:27:29 INFO dspy.evaluate.evaluate: Average Metric: 11 / 35 (31.4%)
2025/06/19 18:27:29 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 31.43 on minibatch of size 35 with parameters ['Predictor 0: Instruction 1', 'Predictor 1: Instruction 2'].
2025/06/19 18:27:29 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [31.43]
2025/06/19 18:27:29 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [40.0]
2025/06/19 18:27:29 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 40.0


2025/06/19 18:27:29 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 3 / 25 - Minibatch ==



Average Metric: 14.00 / 35 (40.0%): 100%|██████████| 35/35 [00:50<00:00,  1.43s/it]

2025/06/19 18:28:19 INFO dspy.evaluate.evaluate: Average Metric: 14 / 35 (40.0%)





2025/06/19 18:28:20 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 40.0 on minibatch of size 35 with parameters ['Predictor 0: Instruction 0', 'Predictor 1: Instruction 2'].
2025/06/19 18:28:20 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [31.43, 40.0]
2025/06/19 18:28:20 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [40.0]
2025/06/19 18:28:20 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 40.0


2025/06/19 18:28:20 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 4 / 25 - Minibatch ==


Average Metric: 1.00 / 6 (16.7%):  14%|█▍        | 5/35 [00:16<01:26,  2.88s/it]

[W 2025-06-19 18:28:40,487] Trial 3 failed with parameters: {'0_predictor_instruction': 0, '1_predictor_instruction': 1} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "/gaueko0/users/asalem/anaconda3/envs/py310_transformers4dot49/lib/python3.10/site-packages/optuna/study/_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
  File "/gaueko0/users/asalem/anaconda3/envs/py310_transformers4dot49/lib/python3.10/site-packages/dspy/teleprompt/mipro_optimizer_v2.py", line 563, in objective
    score = eval_candidate_program(batch_size, valset, candidate_program, evaluate, self.rng)
  File "/gaueko0/users/asalem/anaconda3/envs/py310_transformers4dot49/lib/python3.10/site-packages/dspy/teleprompt/utils.py", line 55, in eval_candidate_program
    return evaluate(
  File "/gaueko0/users/asalem/anaconda3/envs/py310_transformers4dot49/lib/python3.10/site-packages/dspy/utils/callback.py", line 326, in sync_wrapper
    return fn(inst

KeyboardInterrupt: 

## After 25 Iterations
![35-iterations](../assets/35_iterations_in.png)