In [1]:
%matplotlib notebook

import numpy as np
from matplotlib import pyplot as plt

In [2]:
class KArmedBanditEnv:
    """Bandit environment whose arms pay normally distributed rewards.

    Attributes:
        n_arms: number of arms.
        q_start_init: value assigned to every arm's mean on a default ``reset()``.
        q_stdev: standard deviation of the noise used by ``draw_reward``.
        q_stars: array of length ``n_arms`` with the current mean reward per arm.
    """

    def __init__(self, n_arms, q_start_init=1., q_stdev=1.):
        """Store the configuration and initialise the arm means via ``reset()``."""
        self.n_arms, self.q_start_init, self.q_stdev = n_arms, q_start_init, q_stdev
        self.reset()

    def reset(self, from_q_init=True):
        """Re-initialise the arm means.

        Args:
            from_q_init: when true, every mean is set to ``q_start_init``;
                otherwise each mean is drawn from a standard normal.
        """
        if not from_q_init:
            self.q_stars = np.random.normal(loc=0., scale=1., size=(self.n_arms,))
            return
        self.q_stars = self.q_start_init * np.ones(self.n_arms)

    def draw_reward(self, a):
        """Return one noisy reward for arm ``a``: normal(q_stars[a], q_stdev)."""
        return np.random.normal(loc=self.q_stars[a], scale=self.q_stdev)

    def get_optimal_a(self):
        """Return the index of the arm with the largest current mean."""
        return np.argmax(self.q_stars)

    def apply_random_walk(self, mu=0, stdev=0.01):
        """Perturb every arm mean in place with independent normal(mu, stdev) noise."""
        drift = np.random.normal(loc=mu, scale=stdev, size=self.q_stars.shape)
        # In-place add, so the existing q_stars array object is mutated.
        self.q_stars += drift
  
class EpsilonGreedyPolicy:
    """Base class for epsilon-greedy action selection over ``n_arms`` arms.

    With probability ``1 - epsilon`` the arm with the highest current estimate
    in ``Qs`` is chosen; otherwise an arm is chosen uniformly at random.
    Subclasses must implement ``update_reward`` to maintain ``Qs``.

    Attributes:
        n_arms: number of arms.
        epsilon: exploration probability.
        Qs: array of length ``n_arms`` with the current value estimate per arm.
    """

    def __init__(self, n_arms, epsilon=0.1):
        """Store the configuration and initialise the estimates via ``reset()``."""
        self.n_arms = n_arms
        self.epsilon = epsilon
        self.reset()

    def reset(self):
        """Set every value estimate back to zero."""
        self.Qs = np.zeros(self.n_arms)

    def sample_action(self):
        """Pick an arm index according to the epsilon-greedy rule.

        Returns:
            Index of the selected arm.
        """
        if np.random.rand() >= self.epsilon:
            # Exploit: choose the arm with the highest current estimate.
            # Ties are broken uniformly at random; plain argmax would always
            # favour the lowest index (e.g. arm 0 while all estimates are
            # still zero). The extra random draw only happens when a tie
            # exists, so a unique maximum behaves exactly like argmax.
            best = np.flatnonzero(self.Qs == np.max(self.Qs))
            if best.size > 1:
                return np.random.choice(best)
            return np.argmax(self.Qs)
        # Explore: choose any arm uniformly at random.
        return np.random.randint(0, self.n_arms)

    def update_reward(self, a, r):
        """Incorporate reward ``r`` observed for arm ``a``.

        Raises:
            NotImplementedError: always; subclasses provide the update rule.
        """
        raise NotImplementedError()
       
class SampleAveragePolicy(EpsilonGreedyPolicy):
    """Epsilon-greedy policy whose estimates are incremental sample averages.

    Attributes:
        Ns: array of length ``n_arms`` counting how often each arm was updated.
    """

    def __init__(self, n_arms, epsilon=0.1):
        """Forward the configuration unchanged to the parent constructor."""
        super().__init__(n_arms=n_arms, epsilon=epsilon)

    def reset(self):
        """Reset the parent's state, then zero the per-arm update counts."""
        super().reset()
        self.Ns = np.zeros(self.n_arms)

    def update_reward(self, a, r):
        """Fold reward ``r`` into the running mean for arm ``a``.

        Uses the incremental form Q <- Q + (1/N) * (r - Q), where N is the
        number of updates for arm ``a`` including this one.
        """
        self.Ns[a] += 1
        step_size = 1. / self.Ns[a]
        error = r - self.Qs[a]
        self.Qs[a] = self.Qs[a] + step_size * error

class FixedAlphaPolicy(EpsilonGreedyPolicy):
    """Epsilon-greedy policy that updates estimates with a constant step size.

    Attributes:
        alpha: step size applied to every update.
    """

    def __init__(self, n_arms, epsilon=0.1, alpha=0.1):
        """Initialise the parent, then record the step size ``alpha``."""
        super().__init__(n_arms=n_arms, epsilon=epsilon)
        self.alpha = alpha

    def update_reward(self, a, r):
        """Move the estimate for arm ``a`` toward ``r``: Q <- Q + alpha * (r - Q)."""
        error = r - self.Qs[a]
        self.Qs[a] = self.Qs[a] + self.alpha * error

class Stats:
    """Per-run, per-step record of rewards and optimal-action hits.

    Attributes:
        n_runs: number of rows in the record arrays.
        n_steps: number of columns in the record arrays.
        rewards: ``(n_runs, n_steps)`` array of recorded rewards.
        optimal_action: ``(n_runs, n_steps)`` array holding the result of
            ``a_chosen == a_opt`` for each recorded step.
    """

    def __init__(self, n_runs, n_steps):
        """Store the dimensions and allocate zeroed record arrays."""
        self.n_runs, self.n_steps = n_runs, n_steps
        self.reset()

    def reset(self):
        """Replace both record arrays with fresh all-zero arrays."""
        shape = (self.n_runs, self.n_steps)
        self.rewards = np.zeros(shape)
        self.optimal_action = np.zeros(shape)

    def update(self, run, step, r, a_chosen, a_opt):
        """Record reward ``r`` and whether ``a_chosen`` equals ``a_opt`` at (run, step)."""
        hit = a_chosen == a_opt
        self.rewards[run, step] = r
        self.optimal_action[run, step] = hit

    def get_avg_rewards(self):
        """Return the mean reward at each step, averaged over runs (axis 0)."""
        return self.rewards.mean(axis=0)

    def get_avg_optimacy(self):
        """Return, per step, the mean of the optimal-action record over runs (axis 0)."""
        return self.optimal_action.mean(axis=0)

In [1]:
# TODO: implement UCB, gradient bandit, and greedy with optimistic initialization classes
# TODO: implement sample loop while sweeping over all params
# TODO: compute plot sequences: (param value) vs (average reward over first N steps)
# TODO: plot