# Play GaussianMABEnv-v0

In [1]:
import sys
import logging

import numpy as np
np.random.seed(0)
import scipy.stats as stats
import gym
import gym.spaces as spaces
import gym.utils.seeding as seeding

logging.basicConfig(level=logging.DEBUG,
        format='%(asctime)s [%(levelname)s] %(message)s',
        stream=sys.stdout, datefmt='%H:%M:%S')

### Environment

In [2]:
class BernoulliMABEnv(gym.Env):
    """Multi-Armed Bandit (MAB) environment with Gaussian rewards.

    NOTE: despite the class name, rewards are drawn from a normal
    distribution with unit standard deviation centred on the chosen arm's
    mean (see ``step``). The name is kept so existing references to the
    class keep working.

    Args:
        n: number of arms (size of the discrete action space).
        means: optional sequence/array with one mean reward per arm. When
            omitted (or empty), ``n`` means are drawn from a standard
            normal distribution using the environment's own RNG.
    """

    def __init__(self, n=10, means=None):
        super(BernoulliMABEnv, self).__init__()
        # The bandit has no state: observations are empty vectors.
        self.observation_space = spaces.Box(low=0, high=0, shape=(0,), dtype=float)
        self.action_space = spaces.Discrete(n)
        self.seed(0)
        # `means or ...` would raise ValueError for a multi-element NumPy
        # array (ambiguous truth value), so test for "not provided" explicitly.
        if means is None or np.size(means) == 0:
            means = self.np_random.randn(n)
        # Always store an array so callers can rely on e.g. `means.max()`.
        self.means = np.asarray(means, dtype=float)

    def reset(self, *, seed=None, return_info=False, options=None):
        """Start a new episode and return the (empty) observation."""
        super().reset(seed=seed)
        return np.empty(0, dtype=float)

    def step(self, action):
        """Pull arm ``action``.

        Returns:
            A tuple ``(observation, reward, done, info)`` where the
            observation is an empty array, the reward is sampled from a
            normal distribution with the arm's mean and standard deviation 1,
            ``done`` is always ``True`` (every episode is a single pull) and
            ``info`` is an empty dict.
        """
        mean = self.means[action]
        reward = self.np_random.normal(mean, 1)
        observation = np.empty(0, dtype=float)
        return observation, reward, True, {}


from gym.envs.registration import register

# Make the environment available to gym.make() under the id 'GaussianMABEnv-v0'.
register(id='GaussianMABEnv-v0', entry_point=BernoulliMABEnv)

In [3]:
env = gym.make('GaussianMABEnv-v0')
env.seed(0)

# Log every attribute of the environment except the observation space,
# followed by every attribute of its registration spec.
for key, value in vars(env).items():
    if key != "observation_space":
        logging.info('%s: %s', key, value)
for key, value in vars(env.spec).items():
    logging.info('%s: %s', key, value)

00:00:00 [INFO] action_space: Discrete(10)
00:00:00 [INFO] np_random: RandomState(MT19937)
00:00:00 [INFO] means: [-1.41414702  0.89361907  0.30147067 -0.69240736  1.61374064 -1.02064936
  0.04337526 -0.70744904  2.20136056 -0.62931658]
00:00:00 [INFO] spec: EnvSpec(GaussianMABEnv-v0)
00:00:00 [INFO] id: GaussianMABEnv-v0
00:00:00 [INFO] entry_point: <class '__main__.BernoulliMABEnv'>
00:00:00 [INFO] reward_threshold: None
00:00:00 [INFO] nondeterministic: False
00:00:00 [INFO] max_episode_steps: None
00:00:00 [INFO] _kwargs: {}
00:00:00 [INFO] _env_name: GaussianMABEnv


### Agent

$\epsilon$-greedy Agent

In [4]:
class EpsilonGreedyAgent:
    """Epsilon-greedy bandit agent with incremental sample-mean estimates.

    Args:
        env: environment; only ``env.action_space.n`` (number of arms) is read.
        epsilon: probability of picking a uniformly random arm instead of
            the arm with the highest current estimate. Defaults to 0.1.

    Attributes:
        counts: number of recorded pulls per arm.
        qs: running average reward per arm.
    """

    def __init__(self, env, epsilon=0.1):
        self.epsilon = epsilon
        self.action_n = env.action_space.n
        self.counts = np.zeros(self.action_n, dtype=float)
        self.qs = np.zeros(self.action_n, dtype=float)
        # Defined up front so step()/close() are safe before the first reset().
        self.mode = None

    def reset(self, mode=None):
        """Begin an episode; estimates are only updated when mode == 'train'."""
        self.mode = mode

    def step(self, observation, reward, done):
        """Choose an arm and, in training mode, remember what to learn from.

        The call with ``done == False`` records the chosen action; the call
        with ``done == True`` records the reward received for it.

        Returns:
            The index of the selected arm.
        """
        if np.random.rand() < self.epsilon:
            action = np.random.randint(self.action_n)  # explore
        else:
            action = self.qs.argmax()  # exploit
        if self.mode == 'train':
            if done:
                self.reward = reward  # save reward
            else:
                self.action = action  # save action
        return action

    def close(self):
        """In training mode, fold the saved reward into the saved arm's mean."""
        if self.mode == 'train':
            self.counts[self.action] += 1
            # Incremental mean: q <- q + (r - q) / count
            self.qs[self.action] += (self.reward - self.qs[self.action]) / self.counts[self.action]


# Default agent instance; the trial loop further down rebinds `agent` per trial.
agent = EpsilonGreedyAgent(env)

UCB1 Agent

In [5]:
class UCB1Agent:
    """Bandit agent that picks the arm with the largest upper confidence bound.

    The bound for each arm is its running mean reward plus
    ``sqrt(2 * ln(total pulls) / pulls of that arm)``; per-arm pull counts are
    floored at 0.01 so arms that were never pulled get a large finite bonus.
    """

    def __init__(self, env):
        n_arms = env.action_space.n
        self.action_n = n_arms
        self.counts = np.zeros(n_arms, dtype=float)
        self.qs = np.zeros(n_arms, dtype=float)

    def reset(self, mode=None):
        """Begin an episode; estimates are only updated when mode == 'train'."""
        self.mode = mode

    def step(self, observation, reward, done):
        """Return the arm with the highest UCB, recording data when training."""
        # Keep the total at least 1 so the logarithm is never negative/-inf.
        total_count = max(self.counts.sum(), 1)
        floored_counts = np.clip(self.counts, 0.01, None)
        bonuses = np.sqrt(2 * np.log(total_count) / floored_counts)
        action = (self.qs + bonuses).argmax()
        if self.mode == 'train':
            if not done:
                self.action = action  # arm chosen for this episode
            else:
                self.reward = reward  # reward obtained for the saved arm
        return action

    def close(self):
        """In training mode, update the pulled arm's count and running mean."""
        if self.mode != 'train':
            return
        arm = self.action
        self.counts[arm] += 1
        self.qs[arm] += (self.reward - self.qs[arm]) / self.counts[arm]

Bayesian UCB Agent

(Uses a Gaussian distribution to model each arm's mean reward.)

In [6]:
class BayesianUCBAgent:
    """Bayesian UCB agent with a Gaussian belief over each arm's mean reward.

    Every arm starts with the belief N(0, 1). The agent pulls the arm whose
    belief has the largest ``mean + 3 * std`` and, in training mode, performs
    a conjugate Gaussian update in which each observed reward contributes one
    unit of precision.

    Attributes:
        means: current belief mean per arm.
        stds: current belief standard deviation per arm.
    """

    def __init__(self, env):
        self.action_n = env.action_space.n
        self.means = np.zeros(self.action_n, dtype=float)
        self.stds = np.ones(self.action_n, dtype=float)
        # Defined up front so step()/close() are safe before the first reset().
        self.mode = None

    def reset(self, mode=None):
        """Begin an episode; beliefs are only updated when mode == 'train'."""
        self.mode = mode

    def step(self, observation, reward, done):
        """Return the arm with the highest upper bound, recording data when training.

        The call with ``done == False`` records the chosen action; the call
        with ``done == True`` records the reward received for it.
        """
        ucbs = self.means + 3 * self.stds
        action = ucbs.argmax()
        if self.mode == 'train':
            if done:
                self.reward = reward  # save reward
            else:
                self.action = action  # save action
        return action

    def close(self):
        """In training mode, update the pulled arm's belief with the saved reward."""
        if self.mode == 'train':
            old_var_recip = self.stds[self.action] ** -2  # prior precision
            old_natural_param_0 = self.means[self.action] * old_var_recip
            new_var_recip = old_var_recip + 1.  # posterior precision
            # Posterior mean = (precision-weighted prior mean + reward) divided
            # by the posterior precision. Dividing by the natural parameter
            # instead (as was done previously) gives wrong estimates and can
            # divide by zero.
            self.means[self.action] = (old_natural_param_0 + self.reward) / \
                    new_var_recip
            self.stds[self.action] = 1. / np.sqrt(new_var_recip)

Thompson Sampling Agent

(Uses a Gaussian distribution to model each arm's mean reward.)

In [7]:
class ThompsonSamplingAgent:
    """Thompson-sampling agent with a Gaussian belief over each arm's mean reward.

    Every arm starts with the belief N(0, 1). At each step one value is
    sampled from every arm's belief and the arm with the largest sample is
    pulled. In training mode the pulled arm's belief receives a conjugate
    Gaussian update in which each observed reward contributes one unit of
    precision.

    Attributes:
        means: current belief mean per arm.
        stds: current belief standard deviation per arm.
    """

    def __init__(self, env):
        self.action_n = env.action_space.n
        self.means = np.zeros(self.action_n, dtype=float)
        self.stds = np.ones(self.action_n, dtype=float)
        # Defined up front so step()/close() are safe before the first reset().
        self.mode = None

    def reset(self, mode=None):
        """Begin an episode; beliefs are only updated when mode == 'train'."""
        self.mode = mode

    def step(self, observation, reward, done):
        """Sample every arm's belief and return the arm with the largest sample.

        The call with ``done == False`` records the chosen action; the call
        with ``done == True`` records the reward received for it.
        """
        # One draw per arm, using the arm's own mean and standard deviation.
        samples = np.random.normal(self.means, self.stds)
        action = np.argmax(samples)
        if self.mode == 'train':
            if done:
                self.reward = reward  # save reward
            else:
                self.action = action  # save action
        return action

    def close(self):
        """In training mode, update the pulled arm's belief with the saved reward."""
        if self.mode == 'train':
            old_var_recip = self.stds[self.action] ** -2  # prior precision
            old_natural_param_0 = self.means[self.action] * old_var_recip
            new_var_recip = old_var_recip + 1.  # posterior precision
            # Posterior mean = (precision-weighted prior mean + reward) divided
            # by the posterior precision. Dividing by the natural parameter
            # instead (as was done previously) gives wrong estimates and can
            # divide by zero.
            self.means[self.action] = (old_natural_param_0 + self.reward) / \
                    new_var_recip
            self.stds[self.action] = 1. / np.sqrt(new_var_recip)

### Online Interaction

In [8]:
def play_episode(env, agent, max_episode_steps=None, mode=None, render=False):
    """Run one episode of ``agent`` acting in ``env``.

    The agent is asked for an action before every environment step and once
    more after the environment reports ``done``, so that it sees the final
    reward. ``agent.close()`` is always called at the end.

    Args:
        env: environment providing ``reset()``, ``step(action)`` and,
            when ``render`` is set, ``render()``.
        agent: object providing ``reset(mode=...)``,
            ``step(observation, reward, done)`` and ``close()``.
        max_episode_steps: optional cap on environment steps; falsy values
            mean no cap.
        mode: passed through to ``agent.reset``.
        render: whether to call ``env.render()`` after each agent decision.

    Returns:
        A tuple ``(episode_reward, elapsed_steps)``.
    """
    observation = env.reset()
    reward = 0.
    done = False
    agent.reset(mode=mode)

    total_reward = 0.
    step_count = 0
    finished = False
    while not finished:
        action = agent.step(observation, reward, done)
        if render:
            env.render()
        if done:
            # The agent has now seen the terminal reward; stop here.
            finished = True
        else:
            observation, reward, done, _ = env.step(action)
            total_reward += reward
            step_count += 1
            if max_episode_steps and step_count >= max_episode_steps:
                finished = True
    agent.close()
    return total_reward, step_count


trial_regrets = []
for trial in range(100):
    # Every trial starts from an untrained agent; choose the algorithm here.
    # agent = EpsilonGreedyAgent(env)
    agent = UCB1Agent(env)
    # agent = BayesianUCBAgent(env)
    # agent = ThompsonSamplingAgent(env)

    # Training phase: 1000 single-pull episodes in which the agent learns.
    episode_rewards = []
    for episode in range(1000):
        episode_reward, elapsed_steps = play_episode(
            env.unwrapped, agent, mode='train',
            max_episode_steps=env.spec.max_episode_steps)
        episode_rewards.append(episode_reward)
    # Regret per episode: best arm's mean minus the reward actually received.
    best_mean = env.means.max()
    regrets = best_mean - np.array(episode_rewards)
    trial_regret = regrets.sum()
    trial_regrets.append(trial_regret)

    # Evaluation phase: 100 episodes without updating the agent.
    episode_rewards = []
    for episode in range(100):
        episode_reward, elapsed_steps = play_episode(env, agent)
        episode_rewards.append(episode_reward)
    reward_mean = np.mean(episode_rewards)
    reward_std = np.std(episode_rewards)
    logging.info('trial %d: average episode reward = %.2f ± %.2f, regret = %.2f',
                 trial, reward_mean, reward_std, trial_regret)

regret_mean = np.mean(trial_regrets)
regret_std = np.std(trial_regrets)
logging.info('average regret = %.2f ± %.2f', regret_mean, regret_std)

00:00:00 [INFO] trial 0: average episode reward = 2.34 ± 1.12, regret = 126.96
00:00:00 [INFO] trial 1: average episode reward = 2.15 ± 0.83, regret = 47.71
00:00:00 [INFO] trial 2: average episode reward = 2.21 ± 0.96, regret = 78.57
00:00:00 [INFO] trial 3: average episode reward = 2.14 ± 0.97, regret = 37.62
00:00:01 [INFO] trial 4: average episode reward = 2.30 ± 1.02, regret = 62.78
00:00:01 [INFO] trial 5: average episode reward = 2.42 ± 1.04, regret = 87.73
00:00:01 [INFO] trial 6: average episode reward = 2.14 ± 1.04, regret = 9.11
00:00:01 [INFO] trial 7: average episode reward = 2.32 ± 1.08, regret = 56.97
00:00:01 [INFO] trial 8: average episode reward = 2.27 ± 0.87, regret = 81.09
00:00:01 [INFO] trial 9: average episode reward = 2.24 ± 0.95, regret = 86.85
00:00:02 [INFO] trial 10: average episode reward = 2.02 ± 1.10, regret = 88.45
00:00:02 [INFO] trial 11: average episode reward = 2.15 ± 1.07, regret = 66.57
00:00:02 [INFO] trial 12: average episode reward = 2.20 ± 0.94

In [9]:
# Close the environment once all trials have finished.
env.close()