In [1]:
%matplotlib notebook

import numpy as np
from matplotlib import pyplot as plt

In [72]:
# Experiment configuration shared by the environment, policies and stats.
N_ARMS = 10        # number of bandit arms
N_RUNS = 2_000     # independent runs that results are averaged over
N_STEPS = 10_000   # time steps per run

# TODO: remove -- smaller settings for quick debugging runs
#N_RUNS = 100
#N_STEPS = 3000

class KBanditEnv:
    """k-armed bandit environment whose true action values can drift.

    Instance fields:
      n_arms        -- number of arms
      q_start_init  -- scale applied when (re)initialising the true values
      q_stdev       -- standard deviation of the reward noise
      q_stars       -- array of length n_arms with the true action values
    """

    def __init__(self, n_arms=N_ARMS, q_start_init=1., q_stdev=1.):
        """Store the configuration and initialise the true action values."""
        # Use the constructor argument. The module constant N_ARMS was stored
        # here before, which silently ignored any n_arms passed by the caller.
        self.n_arms = n_arms
        self.q_start_init = q_start_init
        self.q_stdev = q_stdev
        self.reset()

    def reset(self):
        """Re-initialise the true action values q_stars for a new run."""
        # Equal starting value for every arm ...
        self.q_stars = np.ones(self.n_arms)*self.q_start_init
        # TODO: remove
        # ... which is currently overwritten by a temporary override: values
        # drawn uniformly from [0, 1) and scaled by q_start_init.
        self.q_stars = np.random.rand(self.n_arms)*self.q_start_init

    def draw_reward(self, a):
        """Return a noisy reward for arm `a`: Normal(q_stars[a], q_stdev)."""
        qa_mu = self.q_stars[a]
        return np.random.normal(qa_mu, self.q_stdev)

    def get_optimal_a(self):
        """Return the index of the arm with the highest true value.

        np.argmax returns the first index when several arms tie.
        """
        return np.argmax(self.q_stars)

    def apply_random_walk(self, mu=0, stdev=0.01):
        """Add an independent Normal(mu, stdev) increment to every true value.

        q_stars is modified in place.
        """
        self.q_stars += np.random.normal(mu, stdev, self.q_stars.shape)
  
class EpsilonGreedyPolicy:
    """Base class for epsilon-greedy action selection over estimates Qs.

    Subclasses must implement update_reward() to define how the per-arm
    value estimates are learned from observed rewards.
    """

    def __init__(self, n_arms=N_ARMS, epsilon=0.1):
        """Store the arm count and exploration rate, then zero the estimates."""
        self.n_arms = n_arms
        self.epsilon = epsilon
        self.reset()

    def reset(self):
        """Reset all value estimates to zero."""
        # Size from the instance, not the module constant, so that the
        # estimates always match the range of indices sample_action() draws.
        self.Qs = np.zeros(self.n_arms)

    def sample_action(self):
        """Pick an arm: greedy with probability 1 - epsilon, else uniform random."""
        rand = np.random.random()
        if rand >= self.epsilon:  # exploit: greedy w.r.t. current estimates
            # np.argmax resolves ties in favour of the lowest index.
            a = np.argmax(self.Qs)
        else:  # explore: uniformly random arm
            a = np.random.randint(0, self.n_arms)
        return a

    def update_reward(self, a, r):
        """Update the estimate for arm `a` after observing reward `r`.

        Raises:
            NotImplementedError: always; subclasses provide the update rule.
        """
        raise NotImplementedError()
       
class SampleAveragePolicy(EpsilonGreedyPolicy):
    """Epsilon-greedy policy whose estimates are running sample averages."""

    def __init__(self, n_arms=N_ARMS, epsilon=0.1):
        """Forward the configuration to the base class (which calls reset())."""
        super().__init__(n_arms, epsilon)

    def reset(self):
        """Reset the value estimates and the per-arm pull counters."""
        super().reset()
        # Size from the instance rather than the global N_ARMS so the counters
        # cover every arm index this policy can select.
        self.Ns = np.zeros(self.n_arms)

    def update_reward(self, a, r):
        """Incrementally update the mean reward of arm `a` with reward `r`.

        Uses Q <- Q + (1/N) * (r - Q), where N is the number of times the arm
        has been updated, including this one.
        """
        self.Ns[a] += 1
        self.Qs[a] = self.Qs[a] + 1./(self.Ns[a])*(r-self.Qs[a])

class FixedPolicy(EpsilonGreedyPolicy):
    """Epsilon-greedy policy that learns with a constant step size `alpha`."""

    def __init__(self, n_arms=N_ARMS, epsilon=0.1, alpha=0.1):
        """Set up the base policy and remember the step size."""
        super().__init__(n_arms, epsilon)
        self.alpha = alpha

    def update_reward(self, a, r):
        """Move the estimate of arm `a` a fraction `alpha` toward reward `r`."""
        error = r - self.Qs[a]
        self.Qs[a] += self.alpha * error

class Stats:
    """Per-run, per-step record of rewards and optimal-action indicators."""

    def __init__(self, n_runs=N_RUNS, n_steps=N_STEPS):
        """Remember the table dimensions and allocate zeroed buffers."""
        self.n_runs, self.n_steps = n_runs, n_steps
        self.reset()

    def reset(self):
        """Replace both tables with fresh zero arrays of shape (n_runs, n_steps)."""
        shape = (self.n_runs, self.n_steps)
        self.rewards = np.zeros(shape)
        self.optimal_action = np.zeros(shape)

    def update(self, run, step, r, is_a_opt):
        """Record reward `r` and the optimal-action flag for (run, step)."""
        cell = (run, step)
        self.rewards[cell] = r
        # The flag is stored in a float array, so True/False become 1.0/0.0.
        self.optimal_action[cell] = is_a_opt

    def get_avg_rewards(self):
        """Return the reward at each step averaged over runs (length n_steps)."""
        return self.rewards.mean(axis=0)

    def get_avg_optimacy(self):
        """Return, per step, the fraction of runs that chose the optimal arm."""
        return self.optimal_action.mean(axis=0)

In [73]:
# One shared environment; each policy gets its own statistics table.
env = KBanditEnv()
pis = [SampleAveragePolicy(), FixedPolicy()]
stats = [Stats(), Stats()]

for run in range(N_RUNS):
    if run % 20 == 0:
        print('run %d/%d' % (run+1, N_RUNS))
    # Every run starts from a fresh problem and fresh estimates.
    env.reset()
    for pi in pis:
        pi.reset()

    for step in range(N_STEPS):
        # Optimal arm under the current true values; all policies are scored
        # against the same arm on this step.
        a_star = env.get_optimal_a()

        for i, pi in enumerate(pis):
            a = pi.sample_action()
            r = env.draw_reward(a)
            pi.update_reward(a, r)

            stats[i].update(run, step, r, (a == a_star))

        # Drift the true action values after every step. This call used to sit
        # after the step loop, where env.reset() at the start of the next run
        # discarded its effect, so the problem never actually drifted.
        env.apply_random_walk()
print('done')

run 1/2000
run 21/2000
run 41/2000
run 61/2000
run 81/2000
run 101/2000
run 121/2000
run 141/2000
run 161/2000
run 181/2000
run 201/2000
run 221/2000
run 241/2000
run 261/2000
run 281/2000
run 301/2000
run 321/2000
run 341/2000
run 361/2000
run 381/2000
run 401/2000
run 421/2000
run 441/2000
run 461/2000
run 481/2000
run 501/2000
run 521/2000
run 541/2000
run 561/2000
run 581/2000
run 601/2000
run 621/2000
run 641/2000
run 661/2000
run 681/2000
run 701/2000
run 721/2000
run 741/2000
run 761/2000
run 781/2000
run 801/2000
run 821/2000
run 841/2000
run 861/2000
run 881/2000
run 901/2000
run 921/2000
run 941/2000
run 961/2000
run 981/2000
run 1001/2000
run 1021/2000
run 1041/2000
run 1061/2000
run 1081/2000
run 1101/2000
run 1121/2000
run 1141/2000
run 1161/2000
run 1181/2000
run 1201/2000
run 1221/2000
run 1241/2000
run 1261/2000
run 1281/2000
run 1301/2000
run 1321/2000
run 1341/2000
run 1361/2000
run 1381/2000
run 1401/2000
run 1421/2000
run 1441/2000
run 1461/2000
run 1481/2000
run 15

In [74]:
# Per-step curves averaged over runs: index 0 is the sample-average policy,
# index 1 the fixed-alpha policy.
sample_avg_stats, fixed_stats = stats[0], stats[1]
avgr_sample_avg = sample_avg_stats.get_avg_rewards()
avgr_fixed = fixed_stats.get_avg_rewards()
opta_sample_avg = sample_avg_stats.get_avg_optimacy()
opta_fixed = fixed_stats.get_avg_optimacy()
steps = np.arange(N_STEPS)

# One figure per metric: (y-axis label, sample-average curve, fixed-alpha curve,
# legend location code).
panels = [
    ('Average reward', avgr_sample_avg, avgr_fixed, 3),
    ('% Optimal action', opta_sample_avg*100, opta_fixed*100, 4),
]

for ylabel, sample_avg_curve, fixed_curve, legend_loc in panels:
    fig = plt.figure()
    ax = fig.gca()
    ax.plot(steps, sample_avg_curve, 'b-', label='Sample average')
    ax.plot(steps, fixed_curve, 'g--', label='Fixed alpha')
    ax.set_xlabel('Steps')
    ax.set_ylabel(ylabel)
    # Keep the autoscaled limits but make sure the y-axis reaches down to 0.
    xmin, xmax = ax.get_xlim()
    ymin, ymax = ax.get_ylim()
    ax.axis([xmin, xmax, min(ymin, 0.), ymax])
    ax.legend(loc=legend_loc)
    plt.show()

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>