In [1]:
from numpy.random import random
from random import randint

In [2]:
EPSILON = 0.1
BANDITS = 3
EPISODES = 10000

In [3]:
class Bandit:

    def __init__(self, probability):
        # Q_k(a) stores mean rewards
        self.q = 0
        # k = times action a chosen in past
        self.k = 0
        # probability distribution
        self.probability = probability

    def get_reward(self):
        # Reward = 1 or 0
        if random() < self.probability:
            return 1
        else:
            return 0

In [5]:
class NArmedBandit:

    def __init__(self):
        self.bandits = []
        self.bandits.append(Bandit(0.5))
        self.bandits.append(Bandit(0.6))
        self.bandits.append(Bandit(0.4))

    def run(self):
        for i in range(EPISODES):
            bandit = self.bandits[self.select_bandit()]
            reward = bandit.get_reward()
            self.update(bandit, reward)
            print('Iteration %s, bandit %s with Q value %s' % (i, bandit.probability, bandit.q))

    def select_bandit(self):
        if random() < EPSILON:
            bandit_index = randint(0, BANDITS-1)
        else:
            bandit_index = self.get_bandit_max_q()

        return bandit_index

    def update(self, bandit, reward):
        bandit.k = bandit.k + 1
        bandit.q = bandit.q + (1 / (1 + bandit.k)) * (reward - bandit.q)

    def get_bandit_max_q(self):
        max_q_bandit_index = 0
        max_q = self.bandits[max_q_bandit_index].q

        for i in range(1, BANDITS):
            if self.bandits[i].q > max_q:
                max_q = self.bandits[i].q
                max_q_bandit_index = i

        return max_q_bandit_index

    def show_statistics(self):
        for i in range(BANDITS):
            print('Bandit %s with k: %s' % (i, self.bandits[i].k))

In [6]:
if __name__ == '__main__':
    bandit_problem = NArmedBandit()
    bandit_problem.run()
    bandit_problem.show_statistics()

Iteration 0, bandit 0.5 with Q value 0.0
Iteration 1, bandit 0.5 with Q value 0.3333333333333333
Iteration 2, bandit 0.5 with Q value 0.25
Iteration 3, bandit 0.5 with Q value 0.4
Iteration 4, bandit 0.5 with Q value 0.33333333333333337
Iteration 5, bandit 0.5 with Q value 0.28571428571428575
Iteration 6, bandit 0.5 with Q value 0.25000000000000006
Iteration 7, bandit 0.5 with Q value 0.22222222222222227
Iteration 8, bandit 0.5 with Q value 0.30000000000000004
Iteration 9, bandit 0.5 with Q value 0.27272727272727276
Iteration 10, bandit 0.5 with Q value 0.33333333333333337
Iteration 11, bandit 0.5 with Q value 0.3076923076923077
Iteration 12, bandit 0.5 with Q value 0.35714285714285715
Iteration 13, bandit 0.5 with Q value 0.4
Iteration 14, bandit 0.5 with Q value 0.375
Iteration 15, bandit 0.5 with Q value 0.35294117647058826
Iteration 16, bandit 0.5 with Q value 0.33333333333333337
Iteration 17, bandit 0.5 with Q value 0.31578947368421056
Iteration 18, bandit 0.5 with Q value 0.35000