In [None]:
import numpy as np

#### Vamos a escribir una función que nos permita escoger uno de los cinco brazos, y nos devuelva una recompensa, dependiendo de la probabilidad de dicho brazo.

In [None]:
def pull_bandit_arm(bandits, bandit_number):
    """Pull the arm at position bandit_number and return the reward (0 or 1).

    Args:
        bandits: Sequence with the success probability of each arm.
        bandit_number: Index of the arm to pull.

    Returns:
        int: 1 if the pull succeeds, 0 otherwise.

    Raises:
        IndexError: If bandit_number is out of range (when bandits is a list
            or array); the lookup error is not caught here.
    """
    draw = np.random.uniform()
    # np.random.uniform() samples from the half-open interval [0, 1), so the
    # strict comparison makes P(reward == 1) equal to the arm's probability:
    # an arm with probability 0 never pays (a draw of exactly 0.0 would pass
    # `<=`) and an arm with probability 1 always pays.
    return int(draw < bandits[bandit_number])

#### Ahora, crearemos la función que decide qué acción debe tomar el agente. Con probabilidad epsilon tomará una acción aleatoria; y si no, tomará la acción con mejor media de recompensas.

In [None]:
def take_epsilon_greedy_action(epsilon, average_rewards):
    """Choose an arm index following the epsilon-greedy rule.

    A uniform sample is drawn first. If it falls below epsilon, a uniformly
    random arm index is returned (exploration); otherwise the index of the
    highest average reward is returned (exploitation). np.argmax returns the
    first index when several arms share the maximum.

    Args:
        epsilon: Probability of taking a random action.
        average_rewards: Sequence with the current average reward per arm.

    Returns:
        Index of the selected arm.
    """
    explore = np.random.uniform() < epsilon
    if not explore:
        return np.argmax(average_rewards)  # Greedy action.
    return np.random.randint(0, len(average_rewards))  # Random action.

#### Definamos las probabilidades de los brazos, el parámetro epsilon y la cantidad de iteraciones o acciones que vamos a tomar. Definamos también tres listas donde guardaremos información sobre las acciones ejecutadas hasta este momento y las recompensas conseguidas para cada brazo. Después ejecutamos el bucle de simulación, que imprime la media de recompensas de cada brazo cada 100 iteraciones.

In [None]:
# Probability of success of each bandit.
bandits = [0.1, 0.3, 0.05, 0.55, 0.4]

num_iterations = 1000
epsilon = 0.1

# Per-arm statistics used to know which one is the best action at each moment:
# accumulated reward, number of pulls, and running average reward.
total_rewards = [0] * len(bandits)
total_attempts = [0] * len(bandits)
avg_rewards = [0.0] * len(bandits)

# Iterations are numbered 1..num_iterations so that exactly num_iterations
# pulls are performed (range(num_iterations + 1) would perform one extra pull)
# while progress is still reported at 100, 200, ..., num_iterations.
for iteration in range(1, num_iterations + 1):
    action = take_epsilon_greedy_action(epsilon, avg_rewards)
    reward = pull_bandit_arm(bandits, action)
    # Store result and refresh the running average of the pulled arm only.
    total_rewards[action] += reward
    total_attempts[action] += 1
    avg_rewards[action] = total_rewards[action] / float(total_attempts[action])

    if iteration % 100 == 0:
        print('Average reward for bandits in iteration {} is {}'.format(iteration,
                              ['{:.2f}'.format(elem) for elem in avg_rewards]))

#### Imprimimos al mejor bandido y la recompensa total recolectada en los 1000 episodios.

In [None]:
# Print results.
# The best bandit is the arm with the highest observed average reward
# (np.argmax returns the first such arm on ties).
best_bandit = np.argmax(avg_rewards)
print('\nBest bandit is {} with an average observed reward of {:.3f}'
      .format(best_bandit, avg_rewards[best_bandit]))
# Report the number of pulls actually recorded in total_attempts rather than
# the configured num_iterations, so the episode count in the message always
# matches the rewards being summed.
print('Total observed reward in the {} episodes has been {}'
      .format(sum(total_attempts), sum(total_rewards)))