In [47]:
from predibase import Predibase, DeploymentConfig

import os
from dotenv import load_dotenv

from openai import OpenAI
from transformers import AutoTokenizer
import re
from tabulate import tabulate

import numpy as np
import torch

from typing import List
from enum import Enum
from dataclasses import dataclass

import warnings
warnings.filterwarnings('ignore') # i think this is because the Predibase API sometimes gives warnings for version-related stuff (it does it in the course videos too)

# Lesson four: Reward functions for Wordle

In [14]:
# Hugging Face id of the base model; used below to load the matching tokenizer.
base_model_id = "Qwen/Qwen2.5-7B-Instruct"
# Name passed to create_deployment() further down.
private_deployment_name = 'qwen2-5-7b-instruct-dlai'

_ = load_dotenv(override=True) # populate env from .env file, reload of file - 'override' - ok here

# render_prompt() uses this tokenizer to apply the model's chat template.
tokenizer = AutoTokenizer.from_pretrained(base_model_id)

## Set up stuff from the previous lectures

I could just load from utils.py, but I'd like the functions here so I can see them w/o one extra trip.

In [9]:
# System message used by get_messages(): states the game rules, the three feedback
# symbols (the same symbols as the LetterFeedback values), and the <think>/<guess>
# response format that extract_guess() parses.
SYSTEM_PROMPT = """
You are playing Wordle, a word-guessing game.

### Game Rules:
- You have **6 tries** to guess a secret **5-letter** word.
- Each guess must be a valid **5-letter English word**.
- After each guess, you will receive feedback indicating how close your guess was.

### Feedback Format:
Each letter in your guess will receive one of three symbols:
1. ✓ : The letter is in the word and in the CORRECT position.
2. - : The letter is in the word but in the WRONG position.
3. x : The letter is NOT in the word.

### Example:
Secret Word: BRISK

Guess 1: STORM → Feedback: S(-) T(x) O(x) R(-) M(x)
Guess 2: BRAVE → Feedback: B(✓) R(✓) A(x) V(x) E(x)
Guess 3: BRISK → Feedback: B(✓) R(✓) I(✓) S(✓) K(✓)

### Response Format:
Think through the problem and feedback step by step. Make sure to first add your step by step thought process within <think> </think> tags. Then, return your guessed word in the following format: <guess> guessed-word </guess>.
"""

In [18]:
class LetterFeedback(Enum):
    """Feedback for a single guessed letter; each value is the symbol rendered in prompts."""
    CORRECT = "✓"  # same letter at the same position in the secret word
    WRONG_POS = "-"  # letter occurs somewhere in the secret word, but not at this position
    WRONG_LETTER = "x"  # letter does not occur in the secret word

def get_feedback(guess: str, secret_word: str) -> List[LetterFeedback]:
    """Score ``guess`` against ``secret_word`` one position at a time.

    Args:
        guess: The guessed word.
        secret_word: The word being guessed.

    Returns:
        One ``LetterFeedback`` per compared position. Positions are paired with
        ``zip``, so if the two words differ in length only the first
        ``min(len(guess), len(secret_word))`` positions are scored.

    Note:
        "Wrong position" is decided by plain membership in the secret's letter
        set, so a letter repeated in the guess is marked at every occurrence
        regardless of how many times it appears in the secret. Comparison is
        case-sensitive.
    """
    letters_in_secret = set(secret_word)

    def score(guessed: str, actual: str) -> LetterFeedback:
        # Exact positional match wins over mere presence in the word.
        if guessed == actual:
            return LetterFeedback.CORRECT
        if guessed in letters_in_secret:
            return LetterFeedback.WRONG_POS
        return LetterFeedback.WRONG_LETTER

    return [score(g, s) for g, s in zip(guess, secret_word)]

@dataclass
class GuessWithFeedback:
    """A guessed word together with the feedback for each of its letters."""

    guess: str
    feedback: List[LetterFeedback]

    def __repr__(self) -> str:
        # Same layout as the example lines in SYSTEM_PROMPT: "WORD → Feedback: W(x) O(✓) ...".
        scored_letters = [
            f"{char}({mark.value})" for char, mark in zip(self.guess, self.feedback)
        ]
        return f"{self.guess} → Feedback: " + " ".join(scored_letters)

    @staticmethod
    def from_secret(guess: str, secret: str) -> "GuessWithFeedback":
        """Build an instance by scoring ``guess`` against ``secret`` with ``get_feedback``."""
        return GuessWithFeedback(guess=guess, feedback=get_feedback(guess, secret))

In [20]:
def render_user_prompt(past_guesses: List[GuessWithFeedback]) -> str:
    """Build the user message asking for the next guess.

    Args:
        past_guesses: Earlier guesses with their feedback, oldest first. May be empty.

    Returns:
        The request sentence, followed (only when there are past guesses) by a
        "previous feedback" section with one numbered line per guess.
    """
    pieces = ["Make a new 5-letter word guess."]
    if past_guesses:
        pieces.append("\n\nHere is some previous feedback:")
        # Guess numbering is 1-based, matching the example in the system prompt.
        pieces.extend(
            f"\nGuess {number}: {entry}"
            for number, entry in enumerate(past_guesses, start=1)
        )
    return "".join(pieces)

def get_messages(past_guesses: List[GuessWithFeedback]):
    """Assemble the three-message chat for one turn.

    Args:
        past_guesses: Earlier guesses with feedback, forwarded to ``render_user_prompt``.

    Returns:
        A list of role/content dicts: the system prompt, the rendered user
        prompt, and a pre-filled assistant message.
    """
    system_turn = {"role": "system", "content": SYSTEM_PROMPT}
    user_turn = {"role": "user", "content": render_user_prompt(past_guesses)}
    # The assistant turn is pre-filled with an opening <think> tag so the
    # completion continues from inside the reasoning section.
    assistant_prefill = {
        "role": "assistant",
        "content": "Let me solve this step by step.\n<think>",
    }
    return [system_turn, user_turn, assistant_prefill]

def render_prompt(past_guesses: List[GuessWithFeedback]):
    """Render the chat for ``past_guesses`` through the tokenizer's chat template.

    Returns whatever ``tokenizer.apply_chat_template`` produces with
    ``tokenize=False`` and ``continue_final_message=True`` for the messages
    built by ``get_messages``.
    """
    chat = get_messages(past_guesses)
    template_options = {"tokenize": False, "continue_final_message": True}
    return tokenizer.apply_chat_template(chat, **template_options)

def extract_guess(completion: str) -> str:
    """Pull the guessed word out of a model completion.

    Args:
        completion: Raw completion text.

    Returns:
        The text inside the first ``<guess>...</guess>`` pair, stripped of
        surrounding whitespace and upper-cased; ``""`` when no such pair exists.
    """
    found = re.search(r"<guess>\s*([\s\S]*?)\s*<\/guess>", completion, re.DOTALL)
    return found.group(1).strip().upper() if found else ""

def next_turn(past_guesses: List[GuessWithFeedback], secret_word: str, adapter_id = ""):
    """Play one turn: generate a guess, score it, record it, and print the game so far.

    Args:
        past_guesses: Guess history; the new guess is appended to this list in place.
        secret_word: The word being guessed.
        adapter_id: Accepted but not used anywhere in this function.

    NOTE(review): ``generate_stream`` is not defined in the visible part of this
    notebook (it presumably lives in utils.py), so confirm it is in scope before
    calling this function.
    """
    completion = generate_stream(render_prompt(past_guesses))
    guess = extract_guess(completion)

    # extract_guess returns "" when no <guess> tag is found; get_feedback then
    # yields an empty feedback list for it.
    past_guesses.append(GuessWithFeedback(guess, get_feedback(guess, secret_word)))

    print("\n\n")
    print("-" * 100 + "\n")
    for earlier in past_guesses:
        print(earlier)

    solved = guess == secret_word
    out_of_turns = len(past_guesses) >= 6
    if solved:
        print("🎉 SUCCESS 🎉")
    elif out_of_turns:
        print("❌ better luck next time... ❌")

In [16]:
# OpenAI client pointed at the URL in PREDIBASE_QWEN_MODEL_URL and authenticated with
# PREDIBASE_API_TOKEN; os.environ[...] raises KeyError if either variable is missing.
best_of_client = OpenAI(
    base_url=os.environ["PREDIBASE_QWEN_MODEL_URL"],
    api_key=os.environ["PREDIBASE_API_TOKEN"],
)

def generate(
    messages: List[dict],
    adapter_id: str = "",
    num_guesses: int = 1,
    temperature: float = 0.7,
    max_tokens: int = 1024,
) -> List[str]:
    """Request ``num_guesses`` chat completions and return their text.

    Args:
        messages: Chat messages (role/content dicts) to send.
        adapter_id: Passed as the ``model`` argument of the request.
        num_guesses: How many completions to return.
        temperature: Sampling temperature.
        max_tokens: Per-completion token limit.

    Returns:
        A list with the message content of each completion.
    """
    request = {
        "model": adapter_id,
        "messages": messages,
        "temperature": temperature,
        "max_tokens": max_tokens,
    }

    if temperature > 0.0:
        # Sampling: ask for all completions in a single request.
        batch = best_of_client.chat.completions.create(n=num_guesses, **request)
        return [choice.message.content for choice in batch.choices]

    # Non-positive temperature: issue one single-completion request per guess.
    texts = []
    for _ in range(num_guesses):
        single = best_of_client.chat.completions.create(n=1, **request)
        texts.append(single.choices[0].message.content)
    return texts

## Create private deployment, if it doesn't exist already

Create a custom deployed instance of the Qwen 2.5 7B instruct model, using Predibase's API, which we'll start using for inference here and then fine-tune - using Predibase's API - below.

In [5]:
def create_deployment(name: str):
    """Create the private Predibase deployment ``name`` (best-effort).

    Adapted from the course's utils.py so the notebook can run locally against
    its own deployment.

    Args:
        name: Deployment name to create.

    Any exception raised during creation is caught so the notebook keeps
    running, but the underlying error is printed: not every failure means the
    deployment already exists (it could equally be an auth or network problem).
    """
    pb = Predibase(api_token=os.environ["PREDIBASE_API_TOKEN"])
    try:
        pb.deployments.create(
            name=name,
            config=DeploymentConfig(
                accelerator='a10_24gb_100', # cheapest ($2.60/hr); set explicitly so a pricier option isn't picked by default
                base_model="qwen2-5-7b-instruct",
                min_replicas=0,
                max_replicas=1,
                cooldown_time=1200,
            )
        )
    except Exception as exc:
        # Deliberately non-fatal, but report what actually went wrong instead of
        # always claiming the deployment exists.
        print(f"Deployment {name} was not created (it may already exist): {exc}")

# Try to create the private deployment; create_deployment catches creation errors itself.
create_deployment(private_deployment_name)

Deployment qwen2-5-7b-instruct-dlai already exists


## Start with a simple reward function

First, I'll try a simple 0/1 reward function.

In [32]:
def wordle_reward(guess: str, secret_word: str) -> int:
    """Binary reward: 1 when ``guess`` equals ``secret_word`` ignoring case, else 0."""
    is_exact_match = guess.upper() == secret_word.upper()
    return int(is_exact_match)

In [19]:
# Secret word for the examples below; render_guess_table() also reads this global.
secret_word = "POUND"

# Three earlier guesses, each scored against the secret, used to build the prompt.
past_guesses = [
    GuessWithFeedback.from_secret(guess="CRANE", secret=secret_word),
    GuessWithFeedback.from_secret(guess="BLOND", secret=secret_word),
    GuessWithFeedback.from_secret(guess="FOUND", secret=secret_word),
]
past_guesses

[CRANE → Feedback: C(x) R(x) A(x) N(✓) E(x),
 BLOND → Feedback: B(x) L(x) O(-) N(✓) D(✓),
 FOUND → Feedback: F(x) O(✓) U(✓) N(✓) D(✓)]

Build a prompt including the feedback from the previous guesses and then send it to the LLM to have it think and generate a response.

NOTE: As of the beginning of the lecture, this isn't using the custom deployment, I'm pretty sure - it's using the shared Qwen model, due to the way 'best_of_client' is created using the chat.completions API.

In [24]:
# System + user + pre-filled assistant messages for the current guess history.
llm_messages = get_messages(past_guesses)
llm_messages

[{'role': 'system',
  'content': '\nYou are playing Wordle, a word-guessing game.\n\n### Game Rules:\n- You have **6 tries** to guess a secret **5-letter** word.\n- Each guess must be a valid **5-letter English word**.\n- After each guess, you will receive feedback indicating how close your guess was.\n\n### Feedback Format:\nEach letter in your guess will receive one of three symbols:\n1. ✓ : The letter is in the word and in the CORRECT position.\n2. - : The letter is in the word but in the WRONG position.\n3. x : The letter is NOT in the word.\n\n### Example:\nSecret Word: BRISK\n\nGuess 1: STORM → Feedback: S(-) T(x) O(x) R(-) M(x)\nGuess 2: BRAVE → Feedback: B(✓) R(✓) A(x) V(x) E(x)\nGuess 3: BRISK → Feedback: B(✓) R(✓) I(✓) S(✓) K(✓)\n\n### Response Format:\nThink through the problem and feedback step by step. Make sure to first add your step by step thought process within <think> </think> tags. Then, return your guessed word in the following format: <guess> guessed-word </guess>.

In [27]:
# generate() defaults: a single completion at temperature 0.7, returned as a one-item list.
llm_response_full = generate(llm_messages)
llm_response_full

["From the feedback:\n- Guess 1: CRANE -> C(x), R(x), A(x), N(✓), E(x)\n  - This means N is in the 4th position and all other letters are incorrect.\n- Guess 2: BLOND -> B(x), L(x), O(-), N(✓), D(✓)\n  - This confirms N is in the 4th position and D is in the 5th position.\n- Guess 3: FOUND -> F(x), O(✓), U(✓), N(✓), D(✓)\n  - This again confirms N is in the 4th position and D is in the 5th position, and O is in the 2nd position.\n\nSo, the word must contain the letters O, N, and D, and the 2nd, 4th, and 5th positions are fixed with these letters. The remaining 1st and 3rd positions are unknown but should be different from the letters already used (C, R, A, B, L, F, U).\n\nPossible words that fit these criteria and are 5-letter English words are:\n- FOUND\n- ONEDS\n- ONEDR\n\nAmong these, let's choose ONEDS as it's a valid English word and fits the criteria.</think>\n<guess> ONEDS </guess>"]

In [30]:
# Extract the upper-cased contents of the <guess> tag from the only completion.
guessed_word = extract_guess(llm_response_full[0])
guessed_word

'ONEDS'

In [33]:
# Score the extracted guess with the binary reward (1 only on an exact match).
reward = wordle_reward(guessed_word, secret_word)
print(f'Guessed word: {guessed_word} -> Reward: {reward}')

Guessed word: ONEDS -> Reward: 0


The agent ultimately wants to maximize the reward it receives over time. For learning to happen, we need diverse responses, and those responses need to earn different rewards: if the reward function doesn't discriminate between responses, there is no signal to learn from. To capture this we use the 'advantage', which is just each reward standardized within its group of responses: advantage = (reward - mean(rewards)) / std(rewards).

In [41]:
def compute_advantages(rewards: list):
    """Standardize a group of rewards: ``(reward - mean) / std``.

    Args:
        rewards: Numeric rewards for one group of responses.

    Returns:
        A list of floats, one advantage per reward. When the rewards have
        (numerically) no spread, every advantage is ``0.0``; an empty input
        returns ``[]``.
    """
    values = np.asarray(rewards, dtype=float)
    if values.size == 0:
        return []

    mean_reward = values.mean()
    std_reward = values.std()

    # Identical rewards do not always give an exactly-zero std: for
    # [0.1, 0.1, 0.1] the mean rounds to 0.10000000000000002, leaving a std of
    # ~1e-17, and dividing by it would turn pure rounding noise into advantages
    # of -1. Treat any such negligible spread as "no spread" (np.isclose uses
    # an absolute tolerance of 1e-8 here).
    if np.isclose(std_reward, 0.0):
        return [0.0] * len(values)

    return ((values - mean_reward) / std_reward).tolist()

In [42]:
# Example: eight distinct rewards give non-zero advantages centred on zero.
rewards = [0.0, 0.2, 0.4, 0.5, 0.6, 0.7, 0.8, 1.0]
compute_advantages(rewards)

[-1.732050807568877,
 -1.072221928495019,
 -0.4123930494211609,
 -0.08247860988423196,
 0.24743582965269698,
 0.5773502691896258,
 0.9072647087265552,
 1.567093587800413]

In [45]:
def print_guesses_table(extracted_guesses, rewards):
    """Print a grid table with index, guess, reward and advantage for each guess.

    Args:
        extracted_guesses: Guessed words, one per response.
        rewards: Reward for each guess, in the same order; advantages are
            derived from these with ``compute_advantages``.
    """
    advantages = compute_advantages(rewards)
    row_numbers = range(len(extracted_guesses))
    rows = list(zip(row_numbers, extracted_guesses, rewards, advantages))

    column_names = ['Index', 'Guess', 'Reward', 'Advantage']
    rendered = tabulate(rows, headers=column_names, tablefmt='grid')
    # Printing the whole string emits the same lines as printing it row by row.
    print(rendered)

def render_guess_table(response, reward_fn, secret=None):
    """Extract the guess from each completion, score it, and print the results table.

    Args:
        response: Iterable of completion texts (e.g. the list returned by ``generate``).
        reward_fn: Callable ``(guess, secret_word) -> reward``.
        secret: Word to score against. Defaults to the notebook-level
            ``secret_word`` so existing two-argument calls behave as before;
            pass it explicitly to avoid depending on that global.
    """
    target = secret_word if secret is None else secret
    guesses = [extract_guess(completion) for completion in response]
    rewards = [reward_fn(guess, target) for guess in guesses]
    print_guesses_table(guesses, rewards)

In [51]:
# Sample 8 completions at generate()'s default temperature (0.7) and score them
# with the binary reward.
print(f'Secret: {secret_word}')
response = generate(get_messages(past_guesses), num_guesses=8)
render_guess_table(response, wordle_reward)

Secret: POUND
+---------+---------+----------+-------------+
|   Index | Guess   |   Reward |   Advantage |
|       0 | FOUNI   |        0 |           0 |
+---------+---------+----------+-------------+
|       1 | MOONED  |        0 |           0 |
+---------+---------+----------+-------------+
|       2 | DONUT   |        0 |           0 |
+---------+---------+----------+-------------+
|       3 | OUND    |        0 |           0 |
+---------+---------+----------+-------------+
|       4 | INDO    |        0 |           0 |
+---------+---------+----------+-------------+
|       5 | BROKE   |        0 |           0 |
+---------+---------+----------+-------------+
|       6 | ONED    |        0 |           0 |
+---------+---------+----------+-------------+
|       7 | FOOLN   |        0 |           0 |
+---------+---------+----------+-------------+


So we get diversity in the responses, but no diversity in the rewards/advantages (every guess scores 0), so no learning is possible.

## Updated reward function to give partial credit

We can be more granular than the binary reward function, which'll indicate when responses are at least marginally better than others, even if they're not the secret word. 

In [54]:
def worldle_reward_partial_credit(guess: str, secret_word: str) -> float:
    """Partial-credit reward in [0.0, 1.0] for a Wordle guess.

    Each letter in the right position earns 0.2; each other letter that occurs
    anywhere in the secret earns 0.1. A guess whose length differs from the
    secret's earns 0.0.

    Args:
        guess: The guessed word.
        secret_word: The word being guessed.

    Returns:
        The reward as a float. Credit is counted in integer tenths and divided
        once, so results are the nearest floats to 0.0, 0.1, ..., 1.0 rather
        than accumulated sums such as 0.30000000000000004.

    Note:
        Presence is checked against the secret's letter set, so a letter
        repeated in the guess earns credit at every occurrence.
    """
    # Compare case-insensitively, consistent with wordle_reward.
    guess, secret_word = guess.upper(), secret_word.upper()

    if len(guess) != len(secret_word):
        return 0.0 # no reward for guessing the wrong number of letters

    valid_letters = set(secret_word)
    tenths = 0
    for letter, secret_letter in zip(guess, secret_word):
        if letter == secret_letter:
            tenths += 2  # right letter, right spot: +0.2
        elif letter in valid_letters:
            tenths += 1  # right letter, wrong spot: +0.1

    return tenths / 10

First, what happens if we always let the model pick the highest-probability response (temperature 0)? We'll always get the same guess, which means there's no diversity in the responses, every reward is identical, and every advantage is zero. Not good.

In [56]:
# temperature=0 takes generate()'s non-sampling branch: 8 separate single-completion requests.
print(f'Secret: {secret_word}')
response = generate(get_messages(past_guesses), num_guesses=8, temperature=0)
render_guess_table(response, worldle_reward_partial_credit)

Secret: POUND
+---------+---------+----------+-------------+
|   Index | Guess   |   Reward |   Advantage |
|       0 | DOWNY   |      0.5 |           0 |
+---------+---------+----------+-------------+
|       1 | DOWNY   |      0.5 |           0 |
+---------+---------+----------+-------------+
|       2 | DOWNY   |      0.5 |           0 |
+---------+---------+----------+-------------+
|       3 | DOWNY   |      0.5 |           0 |
+---------+---------+----------+-------------+
|       4 | DOWNY   |      0.5 |           0 |
+---------+---------+----------+-------------+
|       5 | DOWNY   |      0.5 |           0 |
+---------+---------+----------+-------------+
|       6 | DOWNY   |      0.5 |           0 |
+---------+---------+----------+-------------+
|       7 | DOWNY   |      0.5 |           0 |
+---------+---------+----------+-------------+


Now with a relatively high temperature (lots of variation), we'll get some diversity, but also some poor guesses because of the randomness. (Since we're not running multiple training iterations and adjusting the model based on the outputs, I don't think we see any improvement yet - i.e., these responses and the ones in the next cell with temperature 0.7 aren't necessarily better or worse than each other.)

In [60]:
# Same experiment with a high sampling temperature (1.3), scored with partial credit.
print(f'Secret: {secret_word}')
response = generate(get_messages(past_guesses), num_guesses=8, temperature=1.3)
render_guess_table(response, worldle_reward_partial_credit)

Secret: POUND
+---------+---------+----------+-------------+
|   Index | Guess   |   Reward |   Advantage |
|       0 | LANDS   |      0.2 |   -0.46525  |
+---------+---------+----------+-------------+
|       1 | BOUND   |      0.8 |    1.56493  |
+---------+---------+----------+-------------+
|       2 | BRICK   |      0   |   -1.14198  |
+---------+---------+----------+-------------+
|       3 | FOUND   |      0.8 |    1.56493  |
+---------+---------+----------+-------------+
|       4 | BOND    |      0   |   -1.14198  |
+---------+---------+----------+-------------+
|       5 | LOOKS   |      0.3 |   -0.126886 |
+---------+---------+----------+-------------+
|       6 | BLOWN   |      0.2 |   -0.46525  |
+---------+---------+----------+-------------+
|       7 | FUNNY   |      0.4 |    0.211477 |
+---------+---------+----------+-------------+


And with a more moderate temp.

In [59]:
# Same experiment with a moderate sampling temperature (0.7), scored with partial credit.
print(f'Secret: {secret_word}')
response = generate(get_messages(past_guesses), num_guesses=8, temperature=0.7)
render_guess_table(response, worldle_reward_partial_credit)

Secret: POUND
+---------+---------+----------+-------------+
|   Index | Guess   |   Reward |   Advantage |
|       0 | STAND   |      0.4 |   0.641689  |
+---------+---------+----------+-------------+
|       1 | HOUSE   |      0.4 |   0.641689  |
+---------+---------+----------+-------------+
|       2 | RINDS   |      0.2 |  -0.825029  |
+---------+---------+----------+-------------+
|       3 | TOWNS   |      0.4 |   0.641689  |
+---------+---------+----------+-------------+
|       4 | BOLD    |      0   |  -2.29175   |
+---------+---------+----------+-------------+
|       5 | GRIND   |      0.4 |   0.641689  |
+---------+---------+----------+-------------+
|       6 | CURED   |      0.3 |  -0.0916698 |
+---------+---------+----------+-------------+
|       7 | WORDN   |      0.4 |   0.641689  |
+---------+---------+----------+-------------+
