# Reward functions

Start by importing dependencies:

In [1]:
from utils import *
import numpy as np

import warnings
warnings.filterwarnings("ignore")

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

In [2]:
import torch

Create a Predibase deployment that you'll use to call the models:

In [3]:
# Uncomment the line below if running in your own environment - the deployment is already set up for you here
# create_deployment()

Specify the model to use:

In [4]:
# Identifier of the model to use.
# NOTE(review): no cell visible in this notebook reads `model_id` — confirm where it is consumed.
model_id = "Qwen/Qwen2.5-7B-Instruct"

## Define a simple reward function

In [5]:
def wordle_reward(guess: str, secret_word: str) -> int:
    """All-or-nothing reward for a Wordle guess.

    Args:
        guess: The guessed word.
        secret_word: The word to be guessed.

    Returns:
        1 when the guess equals the secret word (compared case-insensitively),
        otherwise 0.
    """
    is_match = guess.upper() == secret_word.upper()
    return 1 if is_match else 0

Define a secret word and get feedback on past guesses, then score the guesses using the reward function above:

In [6]:
secret_word = "POUND"

# Earlier attempts, in the order they were played; each one is paired with the
# feedback it earns against the secret word.
past_guesses = [
    GuessWithFeedback.from_secret(guess=word, secret=secret_word)
    for word in ("CRANE", "BLOND", "FOUND")
]
past_guesses

[CRANE → Feedback: C(x) R(x) A(x) N(✓) E(x),
 BLOND → Feedback: B(x) L(x) O(-) N(✓) D(✓),
 FOUND → Feedback: F(x) O(✓) U(✓) N(✓) D(✓)]

In [7]:
# Build the prompt from the earlier guesses and keep the first element of what generate() returns
response = generate(get_messages(past_guesses))[0]
# Pull the guessed word out of the model response
guess = extract_guess(response)
# Score it with the all-or-nothing reward: 1 only for an exact match
reward = wordle_reward(guess, secret_word)

print(f"Guessed Word: {guess} -> Reward: {reward}")

Guessed Word: GOWN -> Reward: 0


## Using rewards to calculate advantages

In [8]:
def compute_advantages(rewards: list, eps: float = 1e-8):
    """Standardize a group of rewards into advantages (z-scores).

    Each advantage is ``(reward - mean) / std``, using the population standard
    deviation (``np.std`` with its default ``ddof=0``). The result therefore has
    zero mean and unit standard deviation.

    Args:
        rewards: Numeric rewards for one group of sampled responses.
        eps: A standard deviation at or below this threshold is treated as zero.
            An exact ``== 0`` check is not enough: identical non-zero rewards such
            as ``[0.1, 0.1, 0.1]`` produce a std of ~1e-17 from rounding, which
            would otherwise turn pure rounding noise into advantages of +/-1.

    Returns:
        A list of floats, one advantage per reward, in input order. All zeros
        when the rewards are (numerically) identical; an empty list when
        ``rewards`` is empty.
    """
    rewards = np.asarray(rewards, dtype=float)

    # Nothing to normalize; also avoids NumPy's mean-of-empty-slice warning
    if rewards.size == 0:
        return []

    # Compute the mean and standard deviation of the rewards
    mean_reward = np.mean(rewards)
    std_reward = np.std(rewards)

    # Avoid division by (near-)zero when all rewards are equal (typically happens when all rewards are 0)
    # Note: In the GRPO implementation, we add 1e-4 to the std_reward to avoid division by zero
    if std_reward <= eps:
        return [0.0] * len(rewards)

    # Center on the mean and divide by the stddev so advantages have zero mean and unit variance
    advantages = (rewards - mean_reward) / std_reward
    return advantages.tolist()

In [9]:
# Example group of eight rewards whose mean is 0.5
rewards = [0.0, 0.2, 0.4, 0.5, 0.5, 0.6, 0.8, 1.0]
# Last expression of the cell: displays the standardized advantages
compute_advantages(rewards)

[-1.6903085094570331,
 -1.01418510567422,
 -0.33806170189140655,
 0.0,
 0.0,
 0.33806170189140655,
 1.0141851056742202,
 1.6903085094570331]

In [10]:
def render_guess_table(response, reward_fn):
    """Extract a guess from each response, score it, and print the results table.

    Args:
        response: Iterable of model responses; `extract_guess` is applied to each.
        reward_fn: Callable invoked as ``reward_fn(guess, secret_word)``.

    Note:
        Scores against the notebook-level global `secret_word`, so that variable
        must be defined before this function is called.
    """
    # First pass: pull the guess out of every response
    guesses = list(map(extract_guess, response))

    # Second pass: score every extracted guess against the global secret
    rewards = []
    for candidate in guesses:
        rewards.append(reward_fn(candidate, secret_word))

    print_guesses_table(guesses, rewards)

In [11]:
print(f"Secret: {secret_word}")
# Request 8 guesses for the same prompt, then tabulate them under the all-or-nothing reward
response = generate(get_messages(past_guesses), num_guesses=8)
render_guess_table(response, wordle_reward)

Secret: POUND
+---------+---------+----------+-------------+
|   Index | Guess   |   Reward |   Advantage |
|       0 | RINDS   |        0 |           0 |
+---------+---------+----------+-------------+
|       1 | WORD    |        0 |           0 |
+---------+---------+----------+-------------+
|       2 | GUIDE   |        0 |           0 |
+---------+---------+----------+-------------+
|       3 | ROUND   |        0 |           0 |
+---------+---------+----------+-------------+
|       4 | WORDY   |        0 |           0 |
+---------+---------+----------+-------------+
|       5 | SKIN    |        0 |           0 |
+---------+---------+----------+-------------+
|       6 | STONE   |        0 |           0 |
+---------+---------+----------+-------------+
|       7 | NOUSE   |        0 |           0 |
+---------+---------+----------+-------------+


## Update the reward function to give partial credit

In [12]:
def wordle_reward_partial_credit(guess: str, secret_word: str) -> float:
    """Score a Wordle guess with partial credit for each letter.

    Every position earns 0.2 when the letter matches the secret at that position,
    0.1 when the letter occurs elsewhere in the secret, and nothing otherwise.
    A fully correct five-letter guess therefore scores 1.0.

    The comparison is case-insensitive, consistent with `wordle_reward`.

    Args:
        guess: The guessed word.
        secret_word: The word to be guessed.

    Returns:
        The summed per-letter credit, or 0.0 when the guess and the secret have
        different lengths.

    Note:
        Repeated letters are not discounted: each occurrence of a letter that is
        present in the secret earns credit, even if the secret contains it only
        once (e.g. "SOOND" scores 0.7 against "POUND").
    """
    # Normalize case so a lowercase guess is not scored as all-wrong
    guess = guess.upper()
    secret_word = secret_word.upper()

    if len(guess) != len(secret_word):
        # no reward for having the wrong number of letters
        return 0.0

    valid_letters = set(secret_word)
    reward = 0.0
    for letter, secret_letter in zip(guess, secret_word):
        if letter == secret_letter:
            # right letter, right location
            reward += 0.2
        elif letter in valid_letters:
            # right letter, wrong location
            reward += 0.1
        # any other letter earns nothing
    return reward

Try scoring a set of responses using the updated reward function. Start by setting <b>temperature = 0</b>:

In [13]:
print(f"Secret: {secret_word}")
# temperature=0: in the recorded run all eight samples are the same guess,
# so the rewards are identical and every advantage is 0
response = generate(get_messages(past_guesses), num_guesses=8, temperature=0)
render_guess_table(response, wordle_reward_partial_credit)

Secret: POUND
+---------+---------+----------+-------------+
|   Index | Guess   |   Reward |   Advantage |
|       0 | DOWNY   |      0.5 |           0 |
+---------+---------+----------+-------------+
|       1 | DOWNY   |      0.5 |           0 |
+---------+---------+----------+-------------+
|       2 | DOWNY   |      0.5 |           0 |
+---------+---------+----------+-------------+
|       3 | DOWNY   |      0.5 |           0 |
+---------+---------+----------+-------------+
|       4 | DOWNY   |      0.5 |           0 |
+---------+---------+----------+-------------+
|       5 | DOWNY   |      0.5 |           0 |
+---------+---------+----------+-------------+
|       6 | DOWNY   |      0.5 |           0 |
+---------+---------+----------+-------------+
|       7 | DOWNY   |      0.5 |           0 |
+---------+---------+----------+-------------+


Now set temperature to a high value:

In [14]:
print(f"Secret: {secret_word}")
# temperature=1.3: the recorded run shows varied guesses, including several that
# are not five letters long and therefore score 0
response = generate(get_messages(past_guesses), num_guesses=8, temperature=1.3)
render_guess_table(response, wordle_reward_partial_credit)

Secret: POUND
+---------+---------+----------+-------------+
|   Index | Guess   |   Reward |   Advantage |
|       0 | RINSE   |      0.1 |  -0.524     |
+---------+---------+----------+-------------+
|       1 | THORN   |      0.2 |  -0.0582223 |
+---------+---------+----------+-------------+
|       2 | WORD    |      0   |  -0.989778  |
+---------+---------+----------+-------------+
|       3 | TESTED  |      0   |  -0.989778  |
+---------+---------+----------+-------------+
|       4 | INNED   |      0.4 |   0.873334  |
+---------+---------+----------+-------------+
|       5 | FOUNI   |      0.6 |   1.80489   |
+---------+---------+----------+-------------+
|       6 | SKUNK   |      0.4 |   0.873334  |
+---------+---------+----------+-------------+
|       7 | ROOT    |      0   |  -0.989778  |
+---------+---------+----------+-------------+


Lastly, set temperature to a moderate value of 0.7:

In [15]:
print(f"Secret: {secret_word}")
# temperature=0.7: the recorded run still shows a spread of rewards (0 to 0.8),
# which gives non-zero advantages to learn from
response = generate(get_messages(past_guesses), num_guesses=8, temperature=0.7)
render_guess_table(response, wordle_reward_partial_credit)

Secret: POUND
+---------+---------+----------+-------------+
|   Index | Guess   |   Reward |   Advantage |
|       0 | FOUND   |      0.8 |    1.5      |
+---------+---------+----------+-------------+
|       1 | SUNNY   |      0.4 |    0.166667 |
+---------+---------+----------+-------------+
|       2 | WORD    |      0   |   -1.16667  |
+---------+---------+----------+-------------+
|       3 | JUNK    |      0   |   -1.16667  |
+---------+---------+----------+-------------+
|       4 | OUNDN   |      0.5 |    0.5      |
+---------+---------+----------+-------------+
|       5 | BRIND   |      0.4 |    0.166667 |
+---------+---------+----------+-------------+
|       6 | GROUND  |      0   |   -1.16667  |
+---------+---------+----------+-------------+
|       7 | SOOND   |      0.7 |    1.16667  |
+---------+---------+----------+-------------+
