# Multi-armed bandits

[Source](https://stackabuse.com/introduction-to-reinforcement-learning-with-python/)

In [None]:
import numpy as np

In [None]:
# True probability of winning for each bandit
p_bandits = [0.45, 0.40, 0.80]

# Number of bandits, derived from the probability table so the two values
# cannot drift apart when a bandit is added or removed
k = len(p_bandits)

# Epsilon value for exploration: the probability of trying a random bandit
# instead of the one that currently looks best
eps = 0.1


In [None]:
# Q[i] is the goodness value of bandit i, measured as its mean reward.
# Every estimate starts at zero, one entry per bandit.
Q = [0] * k
Q

[0, 0, 0]

In [None]:
# N[i] counts how many times bandit i has been tried so far
N = [0] * k
N

[0, 0, 0]

In [None]:

def pull(a, probs=None):
    """Pull the arm of bandit number a and report the outcome.

    Args:
        a: Index of the bandit to play.
        probs: Optional sequence of win probabilities, one per bandit.
            When omitted, the module-level ``p_bandits`` list is used,
            which is the original behaviour.

    Returns:
        1 if the pull is a win, else 0.
    """
    if probs is None:
        # Resolved at call time so later changes to p_bandits are honoured
        probs = p_bandits
    # np.random.rand() draws uniformly from [0, 1), so this comparison is
    # true with probability probs[a]
    return int(np.random.rand() < probs[a])

while True:  # No exit condition: press the stop button after a few seconds
    # Epsilon-greedy choice: exploit the bandit with the highest estimate
    # most of the time, otherwise (with probability eps) pick one at random
    a = np.argmax(Q) if np.random.rand() > eps else np.random.randint(0, k)

    # Play the chosen bandit and observe the outcome (1 = win, 0 = loss)
    reward = pull(a)

    # Fold the new reward into the running mean of that bandit:
    # new_mean = old_mean + (reward - old_mean) / times_tried
    N[a] = N[a] + 1
    Q[a] = Q[a] + 1 / N[a] * (reward - Q[a])

KeyboardInterrupt: 

In [None]:
Q

[0.45013061030633933, 0.40073420569601265, 0.8004163998795629]

In [None]:
N

[16844, 16889, 471662]