In [7]:
import numpy as np

def robo_buddy_learns_np(*, n_trials: int = 50, learning_rate: float = 0.1, seed=None) -> None:
    """Teach Robo-Buddy which of his three actions earns the biggest reward.

    On every trial Robo-Buddy picks one action uniformly at random (pure
    exploration), receives that action's fixed reward, and nudges his
    estimate for the action toward the reward::

        value += learning_rate * (reward - value)

    After the last trial the learned values (rounded to two decimals) and
    the action with the highest learned value are printed.

    Args:
        n_trials: How many random trials to run. ``0`` (or a negative
            number) runs no trials, so every learned value stays at 0.
        learning_rate: Step size of each update. ``0.1`` moves the estimate
            10% of the way toward the observed reward; ``1.0`` replaces the
            estimate with the observed reward.
        seed: Optional integer seed. When given, a private
            ``np.random.RandomState(seed)`` drives the action choices so the
            run can be repeated exactly. When ``None`` (the default) the
            global NumPy random state is used, as before.

    Returns:
        None. The results are written to standard output.
    """
    # These are the things Robo-Buddy can do (actions)
    actions = ["say 'hello'", "sing a song", "do a dance"]
    # These are the secret true rewards the "environment" gives, one per action
    true_rewards = (5, 10, 2)
    # This is where Robo-Buddy stores his "belief" about how good each action is (Q-values)
    action_values = np.zeros(len(actions))

    # Both the np.random module and a RandomState instance expose randint(),
    # so the loop below does not care which one it was given.
    rng = np.random if seed is None else np.random.RandomState(seed)

    # Robo-Buddy tries things many times to learn better
    for _ in range(n_trials):
        chosen_idx = rng.randint(len(actions))  # pick an action randomly (exploring)
        reward = true_rewards[chosen_idx]  # the reward comes from the secret list

        # A tiny step of learning: move the belief toward the reward just seen
        action_values[chosen_idx] += learning_rate * (reward - action_values[chosen_idx])

    # After trying many times, Robo-Buddy finds the action he thinks is best.
    # argmax returns the first index on ties, so with no trials it is action 0.
    best_action_idx = np.argmax(action_values)
    print(f"Robo-Buddy's learned values for actions: {action_values.round(2)}")
    print(f"Based on learning, Robo-Buddy thinks his best action is: '{actions[best_action_idx]}'")


# Run the demo once; it prints the learned values and the action Robo-Buddy thinks is best.
robo_buddy_learns_np()


Robo-Buddy's learned values for actions: [3.43 8.78 1.73]
Based on learning, Robo-Buddy thinks his best action is: 'sing a song'
