In [15]:
import numpy as np

def optimal_action(qvalue, eps):
    """
    Select an action with an epsilon-greedy rule.

    Parameters
    ----------
    qvalue : array-like
        Current estimate of the expected reward of each action.
    eps : float
        Probability of ignoring the estimates and drawing an action
        uniformly at random. Values <= 0 never explore; values >= 1
        always explore.

    Returns
    -------
    int
        Index of the selected action.

    Notes
    -----
    The exploratory draw is uniform over *all* actions, so it may still
    return the greedy one. Ties in the greedy branch resolve to the
    lowest index.
    """
    if np.random.rand() < eps:
        return np.random.randint(len(qvalue))  # random action
    # np.argmax (instead of the ndarray method) also accepts plain
    # sequences such as lists and tuples.
    return np.argmax(qvalue)  # greedy action

def reward_update(action, reward, qvalue_old, alpha):
    """
    Return updated action-value estimates after observing one reward.

    Only the entry of the action that was taken is moved, by a fraction
    ``alpha`` of the gap between the observed reward and the previous
    estimate::

        Q[action] <- Q[action] + alpha * (reward - Q[action])

    Parameters
    ----------
    action : int
        Index of the action that was taken.
    reward : float
        Reward observed for that action.
    qvalue_old : array-like
        Estimates before the update. It is not modified.
    alpha : float
        Step size of the update.

    Returns
    -------
    numpy.ndarray
        A new array holding the updated estimates.
    """
    qvalue_new = np.array(qvalue_old)  # np.array copies, so the input stays intact
    # An integer (or boolean) array would silently truncate the fractional
    # update on assignment, so promote it to float first.
    if not np.issubdtype(qvalue_new.dtype, np.inexact):
        qvalue_new = qvalue_new.astype(float)
    qvalue_new[action] = qvalue_new[action] + alpha * (reward - qvalue_new[action])
    return qvalue_new

# Experiment parameters
NK = 2              # number of arms of the bandit
ITEMAX = 1          # number of steps taken in each episode
EPSILON_M = [0.75]  # exploration probability: 75% of the steps draw a random action
NEPISODES = 1       # number of episodes

# Set random seed for reproducibility
np.random.seed(1234)

# Draw the true mean reward of each action
NMEANS = np.random.normal(loc=0.0, scale=1.0, size=NK)
print("True action values:", NMEANS)

# Initialize Q-values and action counts
qvalue = np.zeros(NK)
nchoices = np.zeros(NK)
alpha = 0.25  # constant step size used in the Q-value update

# Run for the specified episodes and steps.
# NOTE: qvalue and nchoices are created once above, so they carry over
# from one episode to the next when NEPISODES > 1.
for run in range(NEPISODES):
    for tt in range(ITEMAX):
        # Select action with the epsilon-greedy policy
        action = optimal_action(qvalue, EPSILON_M[0])

        # Generate a noisy reward centred on the true value of that action
        reward = np.random.normal(loc=NMEANS[action], scale=2.0)

        # Update action count and Q-value. Passing 1 / nchoices[action]
        # instead of alpha would turn this into a sample-average update.
        nchoices[action] += 1
        qvalue = reward_update(action, reward, qvalue, alpha)

# Report the estimates together with the number of updates actually made,
# so the message stays correct when ITEMAX or NEPISODES change.
n_updates = int(nchoices.sum())
print(f"Estimated Q values after {n_updates} step(s):", qvalue)


True action values: [ 0.47143516 -1.19097569]
Estimated Q values after three steps: [0.37351662 0.        ]
