In [1]:
#Init arms
class BernoulliArm():
    """Bandit arm that pays 1.0 with probability ``p`` and 0.0 otherwise.

    Attributes:
        p: Success probability of the arm (expected to lie in [0, 1]).
    """

    def __init__(self, p):
        """Store the success probability ``p`` of this arm."""
        self.p = p

    def draw(self):
        """Sample a single reward from the arm.

        Returns:
            float: 1.0 on success, 0.0 on failure.
        """
        # np.random.random() is uniform on the half-open interval [0, 1), so
        # the strict comparison means p == 0 never pays out and p == 1 always
        # does. Both outcomes are floats so logged rewards share one type.
        return 1.0 if np.random.random() < self.p else 0.0

In [2]:
import numpy as np

In [3]:
# Arm parameters
means = [0.45, 0.55, 0, 0]  # Success probability of each Bernoulli arm
n = len(means)  # Number of arms, derived from `means` so the two cannot drift apart
arms = [BernoulliArm(mean) for mean in means]  # One arm object per configured mean

In [7]:
# Simulation state
times = 500  # Number of turns to play
counts = np.zeros(n, dtype=int)  # Per-arm pull counters
values = np.zeros(n)  # Per-arm average reward obtained so far
probabilities = np.ones(n) / n  # Per-arm selection probabilities, uniform at the start
pi = np.zeros(n)  # Per-arm preferences
chosen_arms = [0.0] * times  # Per-turn log of the arm that was played
rewards = [0.0] * times  # Per-turn log of the reward that was received
cumulative_rewards = 0.0  # Running total of all rewards

In [8]:
def weighted_choice(weights):
    """Draw a random index, each index being picked in proportion to its weight.

    Args:
        weights: Non-empty sequence of weights (presumably non-negative;
            they do not need to sum to one).

    Returns:
        The position, found with ``np.searchsorted``, at which a uniform
        sample scaled by the total weight falls in the cumulative weights.
    """
    cumulative = np.cumsum(weights)
    # The last cumulative entry is the total weight; scaling the sample by it
    # removes the need for the weights to be normalised.
    sample = np.random.rand() * cumulative[-1]
    return cumulative.searchsorted(sample)

In [12]:
def boltzmann(pi):
    """Turn preferences into a Boltzmann (softmax) probability distribution.

    Args:
        pi: Array-like of numeric preferences.

    Returns:
        numpy.ndarray: ``exp(pi) / sum(exp(pi))``, the same shape as ``pi``.
        An empty input yields an empty array.
    """
    prefs = np.asarray(pi)
    if prefs.size == 0:
        # Nothing to normalise, and np.max would raise on an empty array.
        return np.exp(prefs)
    # Subtracting the maximum leaves every ratio unchanged but keeps np.exp
    # from overflowing to inf (and the division from producing nan) when the
    # preferences are large.
    e = np.exp(prefs - np.max(prefs))
    dist = e / np.sum(e)
    return dist

# Pursuit algorithm: sample an arm, then pull the probabilities toward the greedy arm
def select_arm(probabilities, values, t, beta=0.05):
    """Choose the arm to play and update the selection probabilities in place.

    Args:
        probabilities: Mutable, indexable sequence of per-arm selection
            probabilities. It is modified in place on every turn except
            turn 0.
        values: Per-arm estimated rewards; the arm with the largest value
            (first one on ties, per ``np.argmax``) is the one pursued.
        t: Turn index. On turn 0 an arm is drawn uniformly at random and the
            probabilities are left untouched.
        beta: Step size of the pursuit update. Defaults to 0.05.

    Returns:
        Index of the arm to play.
    """
    if t == 0:
        return np.random.randint(len(values))  # First turn: uniform pick

    # Sample with the probabilities as they stand before this turn's update.
    chosen = weighted_choice(probabilities)

    # `values` does not change during the update, so the greedy arm is
    # computed once instead of once per arm.
    greedy = np.argmax(values)
    for ind, probability in enumerate(probabilities):
        # Move the greedy arm's probability toward 1 and every other toward 0.
        target = 1.0 if ind == greedy else 0.0
        probabilities[ind] = probability + (beta * (target - probability))

    return chosen

In [10]:
for t in range(times):
    # Let the pursuit policy pick an arm, then pull it
    chosen_arm = select_arm(probabilities, values, t)
    reward = arms[chosen_arm].draw()

    # Keep a per-turn record of the arm played and the reward received
    chosen_arms[t], rewards[t] = chosen_arm, reward

    # Fold the new reward into the empirical mean of the arm that was played
    counts[chosen_arm] += 1
    _n = counts[chosen_arm]
    value = values[chosen_arm]
    new_value = ((_n - 1) / float(_n)) * value + (1 / float(_n)) * reward
    values[chosen_arm] = new_value

    cumulative_rewards += reward

# Summary: number of turns, arms played, rewards received, total reward
for summary in (times, chosen_arms, rewards, cumulative_rewards):
    print(summary)

500
[1, 2, 2, 1, 0, 3, 3, 3, 1, 2, 0, 0, 1, 2, 0, 2, 1, 3, 2, 2, 2, 2, 2, 2, 1, 1, 0, 0, 3, 0, 2, 0, 1, 1, 0, 0, 1, 0, 2, 0, 0, 0, 0, 0, 0, 2, 0, 1, 1, 0, 1, 0, 0, 1, 2, 0, 1, 0, 0, 1, 0, 2, 0, 0, 0, 0, 2, 0, 1, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,