# Use Closed-Form Policy to Play Tiger-v0

In [1]:
import sys
import logging
import itertools

import numpy as np
np.random.seed(0)
import gym
import gym.spaces as spaces
import gym.utils.seeding as seeding
from gym.envs.registration import register

# Emit INFO-and-above records to stdout (so they appear in the notebook
# output) with a short HH:MM:SS timestamp prefix.
logging.basicConfig(
        stream=sys.stdout,
        level=logging.INFO,
        format='%(asctime)s [%(levelname)s] %(message)s',
        datefmt='%H:%M:%S',
        )

In [2]:
class Observation:
    """Integer codes for the observations the Tiger environment emits."""

    LEFT = 0
    RIGHT = 1
    START = 2  # placeholder observation returned by reset()

class Action:
    """Integer codes for the actions accepted by the Tiger environment."""

    LEFT = 0
    RIGHT = 1
    LISTEN = 2  # gather a noisy observation instead of choosing a side


class TigerEnv(gym.Env):
    """Tiger problem: a partially observable task with a hidden binary state.

    The hidden ``state`` (0 or 1) is drawn uniformly on every reset and is
    the index of the action that earns the +10 reward.

    * ``Action.LISTEN`` costs 1 and returns an observation that equals the
      state with probability 0.85 and the opposite value otherwise.
    * ``Action.LEFT`` / ``Action.RIGHT`` yields +10 when the action equals
      the state and -100 otherwise. In episodic mode this ends the episode;
      otherwise the state is redrawn and the interaction continues.

    Args:
        episodic: if true, choosing LEFT or RIGHT terminates the episode.

    NOTE: random draws use the global ``np.random`` state (seeded at the top
    of the file); the ``seed`` passed to ``reset`` is only forwarded to
    ``gym.Env.reset`` and does not influence these draws.
    """

    def __init__(self, episodic=True):
        self.action_space = spaces.Discrete(3)
        # Three observation codes exist (LEFT, RIGHT, START): reset() emits
        # Observation.START == 2, so Discrete(2) would not contain it.
        self.observation_space = spaces.Discrete(3)
        self.episodic = episodic

    def reset(self, *, seed=None, options=None):
        """Redraw the hidden state and return ``(Observation.START, {})``."""
        super().reset(seed=seed)
        self.state = np.random.choice(2)
        return Observation.START, {}  # placebo observation

    def step(self, action):
        """Apply ``action``.

        Returns:
            ``(observation, reward, termination, truncation, info)`` where
            truncation is always ``False`` and info is an empty dict.
        """
        if action == Action.LISTEN:
            # The listening result is wrong 15% of the time.
            if np.random.rand() > 0.85:
                observation = 1 - self.state
            else:
                observation = self.state
            reward = -1.  # float, consistent with the other rewards
            termination = False
        else:
            observation = self.state
            if action == self.state:
                reward = 10.
            else:
                reward = -100.
            if self.episodic:
                termination = True
            else:
                termination = False
                # reset() returns (observation, info); unpack it so that the
                # observation handed back is the START code, not a tuple.
                observation, _ = self.reset()
        return observation, reward, termination, False, {}


# Episodic variant: the episode ends as soon as LEFT or RIGHT is chosen.
register(id="Tiger-v0", entry_point=TigerEnv, kwargs=dict(episodic=True))

# Continuing variant: choosing a side redraws the hidden state instead of
# terminating; the registration caps each episode at 200 steps.
register(
        id="Tiger200-v0",
        entry_point=TigerEnv,
        max_episode_steps=200,
        kwargs=dict(episodic=False),
        )

env = gym.make('Tiger-v0')

In [3]:
class Agent:
    """Closed-form Tiger policy driven by a running vote count.

    ``count`` is the number of LEFT observations minus the number of RIGHT
    observations since the last START observation. The agent keeps
    listening until the count leaves the range [-2, 2], then picks the side
    the observations favour.
    """

    def __init__(self, env=None):
        # `env` is accepted for interface compatibility and is not used.
        # Initialise the counter here so that step() is safe to call even
        # if reset() has not been called yet.
        self.count = 0

    def reset(self, mode=None):
        """Clear the vote count at the start of an episode (`mode` unused)."""
        self.count = 0

    def step(self, observation, reward, termination):
        """Update the vote count from `observation` and return an action.

        `reward` and `termination` are accepted but not used by this policy.
        """
        if observation == Observation.LEFT:
            self.count += 1
        elif observation == Observation.RIGHT:
            self.count -= 1
        else:  # observation == Observation.START
            self.count = 0

        if self.count > 2:
            action = Action.LEFT
        elif self.count < -2:
            action = Action.RIGHT
        else:
            action = Action.LISTEN
        return action

    def close(self):
        """Nothing to release."""
        pass


agent = Agent(env)

In [4]:
def play_episode(env, agent, seed=None, mode=None, render=False):
    """Run one episode of `agent` in `env`.

    The agent is also shown the final observation (one extra `agent.step`
    call after the episode has ended) before `agent.close()` is called.

    Returns:
        (total reward collected, number of environment steps taken)
    """
    obs, _ = env.reset(seed=seed)
    agent.reset(mode=mode)
    last_reward = 0.
    terminated = truncated = False
    total_reward = 0.
    step_count = 0
    done = False
    while not done:
        chosen = agent.step(obs, last_reward, terminated)
        if render:
            env.render()
        done = terminated or truncated
        if not done:
            obs, last_reward, terminated, truncated, _ = env.step(chosen)
            total_reward += last_reward
            step_count += 1
    agent.close()
    return total_reward, step_count


# Evaluate the closed-form agent on 100 episodes and report the mean and
# standard deviation of the episode rewards.
logging.info('==== test ====')
episode_rewards = []
for episode in range(100):
    episode_reward, elapsed_steps = play_episode(env, agent)
    logging.info('test episode %d: reward = %.2f, steps = %d',
            episode, episode_reward, elapsed_steps)
    episode_rewards.append(episode_reward)
mean_reward = np.mean(episode_rewards)
std_reward = np.std(episode_rewards)
logging.info('average episode reward = %.2f ± %.2f', mean_reward, std_reward)

00:00:00 [INFO] ==== test ====
00:00:00 [INFO] test episode 0: reward = 5.00, steps = 6
00:00:00 [INFO] test episode 1: reward = 3.00, steps = 8
00:00:00 [INFO] test episode 2: reward = 7.00, steps = 4
00:00:00 [INFO] test episode 3: reward = 3.00, steps = 8
00:00:00 [INFO] test episode 4: reward = 7.00, steps = 4
00:00:00 [INFO] test episode 5: reward = 7.00, steps = 4
00:00:00 [INFO] test episode 6: reward = 7.00, steps = 4
00:00:00 [INFO] test episode 7: reward = 7.00, steps = 4
00:00:00 [INFO] test episode 8: reward = 1.00, steps = 10
00:00:00 [INFO] test episode 9: reward = 7.00, steps = 4
00:00:00 [INFO] test episode 10: reward = 3.00, steps = 8
00:00:00 [INFO] test episode 11: reward = 7.00, steps = 4
00:00:00 [INFO] test episode 12: reward = 5.00, steps = 6
00:00:00 [INFO] test episode 13: reward = 3.00, steps = 8
00:00:00 [INFO] test episode 14: reward = 5.00, steps = 6
00:00:00 [INFO] test episode 15: reward = 7.00, steps = 4
00:00:00 [INFO] test episode 16: reward = 1.00, steps = 10

In [5]:
# Switch to the continuing variant, which is registered with a 200-step cap.
env = gym.make('Tiger200-v0')


# Evaluate the same agent on 100 episodes and report the mean and standard
# deviation of the episode rewards.
logging.info('==== test ====')
episode_rewards = []
for episode in range(100):
    episode_reward, elapsed_steps = play_episode(env, agent)
    logging.info('test episode %d: reward = %.2f, steps = %d',
            episode, episode_reward, elapsed_steps)
    episode_rewards.append(episode_reward)
mean_reward = np.mean(episode_rewards)
std_reward = np.std(episode_rewards)
logging.info('average episode reward = %.2f ± %.2f', mean_reward, std_reward)

00:00:00 [INFO] ==== test ====
00:00:00 [INFO] test episode 0: reward = 185.00, steps = 200
00:00:00 [INFO] test episode 1: reward = 53.00, steps = 200
00:00:00 [INFO] test episode 2: reward = 229.00, steps = 200
00:00:00 [INFO] test episode 3: reward = 207.00, steps = 200
00:00:00 [INFO] test episode 4: reward = 207.00, steps = 200
00:00:00 [INFO] test episode 5: reward = 207.00, steps = 200
00:00:00 [INFO] test episode 6: reward = 251.00, steps = 200
00:00:00 [INFO] test episode 7: reward = 196.00, steps = 200
00:00:00 [INFO] test episode 8: reward = 251.00, steps = 200
00:00:00 [INFO] test episode 9: reward = 229.00, steps = 200
00:00:00 [INFO] test episode 10: reward = 141.00, steps = 200
00:00:00 [INFO] test episode 11: reward = 196.00, steps = 200
00:00:00 [INFO] test episode 12: reward = 229.00, steps = 200
00:00:00 [INFO] test episode 13: reward = 207.00, steps = 200
00:00:00 [INFO] test episode 14: reward = 240.00, steps = 200
00:00:00 [INFO] test episode 15: reward = 174.00, steps = 200