In [None]:
import numpy as np

def epsilon_greedy(num_arms, num_plays, epsilon, true_reward_probs, *, rng=None):
  """Simulate an epsilon-greedy agent on a Bernoulli multi-armed bandit.

  On every play the agent explores (picks a uniformly random arm) with
  probability ``epsilon`` and otherwise exploits the arm with the highest
  current estimate. Ties in the estimates go to the lowest arm index,
  because ``np.argmax`` returns the first maximum.

  Args:
    num_arms: Number of arms available to the agent.
    num_plays: Number of pulls to simulate.
    epsilon: Probability of exploring on any given play.
    true_reward_probs: Indexable of per-arm success probabilities; the
      entry for an arm is read each time that arm is pulled.
    rng: Optional random source exposing ``random()`` and ``choice(n)``
      (e.g. ``np.random.default_rng(seed)``). Defaults to NumPy's global
      random state, so ``np.random.seed`` keeps working as before.

  Returns:
    A tuple ``(Q, N, rewards)`` where ``Q`` is the array of estimated
    mean rewards per arm, ``N`` is the array of pull counts per arm, and
    ``rewards`` is the list of rewards (``0.0`` or ``1.0``) in play order.
  """
  if rng is None:
    # The np.random module itself offers random()/choice(), drawing from
    # the same global stream the legacy np.random.rand() uses.
    rng = np.random

  Q = np.zeros(num_arms)  # estimated reward for each arm
  N = np.zeros(num_arms)  # number of times each arm has been pulled
  rewards = []            # reward obtained at each step

  for _ in range(num_plays):
    if rng.random() < epsilon:  # exploration
      arm = rng.choice(num_arms)
    else:  # exploitation
      arm = np.argmax(Q)

    # Store the Bernoulli outcome as a float: NumPy booleans cannot be
    # subtracted from one another and "add" as logical OR, which would
    # silently corrupt any downstream arithmetic on the reward history.
    reward = float(rng.random() < true_reward_probs[arm])
    rewards.append(reward)

    # Incremental sample-average update: Q_n = Q_{n-1} + (r - Q_{n-1}) / n
    N[arm] += 1
    Q[arm] += (reward - Q[arm]) / N[arm]

  return Q, N, rewards

# Experiment configuration: a 3-armed Bernoulli bandit played with a fixed
# exploration rate, repeated many times to average out sampling noise.
true_reward_probs = [0.2, 0.5, 0.7]
num_arms = 3
num_plays = 1000
epsilon = 0.1
num_simulations = 1000

# Per-simulation results: one row of estimates / pull counts per run, plus
# the total reward collected in that run.
all_rewards = []
all_Q = np.zeros((num_simulations, num_arms))
all_N = np.zeros((num_simulations, num_arms))

for i in range(num_simulations):
  Q, N, rewards = epsilon_greedy(num_arms, num_plays, epsilon, true_reward_probs)
  all_Q[i, :] = Q
  all_N[i, :] = N
  all_rewards.append(sum(rewards))

# Aggregate across simulations (column-wise means for the per-arm arrays).
avg_Q = all_Q.mean(axis=0)
avg_N = all_N.mean(axis=0)
avg_cumulative_reward = np.mean(all_rewards)

# Regret is measured against always pulling the arm with the highest true
# success probability for every play.
best_arm_reward = max(true_reward_probs) * num_plays
avg_regret = best_arm_reward - avg_cumulative_reward

# Report the averaged results.
estimation_error = np.abs(np.asarray(true_reward_probs) - avg_Q)
print("Average Estimated values:", avg_Q)
print("Average Counts:", avg_N)
print("Average Cumulative Reward:", avg_cumulative_reward)
print("Average Regret:", avg_regret)
print("Difference between true and average estimated values:", estimation_error)

Average Estimated values: [0.19757649 0.49300118 0.69902319]
Average Counts: [ 56.027  81.501 862.472]
Average Cumulative Reward: 655.766
Average Regret: 44.23400000000004
Difference between true and average estimated values: [0.00242351 0.00699882 0.00097681]
