# Use Closed-Form Policy to Play Acrobot-v1

In [1]:
import imp
import importlib
import itertools
import logging
import sys

import numpy as np
np.random.seed(0)
import gym

# Reload the logging module before configuring it, so that basicConfig()
# below is applied to a freshly created root logger (basicConfig does nothing
# when the root logger already has handlers).
# importlib.reload replaces imp.reload: the imp module is deprecated and was
# removed in Python 3.12.
importlib.reload(logging)
logging.basicConfig(level=logging.DEBUG,
        format='%(asctime)s [%(levelname)s] %(message)s',
        stream=sys.stdout, datefmt='%H:%M:%S')

In [2]:
# Create the Acrobot-v1 environment with a fixed seed, then log every
# attribute of the environment object followed by every attribute of its spec.
env = gym.make('Acrobot-v1')
env.seed(0)
for target in (env, env.spec):
    for key, value in vars(target).items():
        logging.info('%s: %s', key, value)

00:00:00 [INFO] env: <AcrobotEnv<Acrobot-v1>>
00:00:00 [INFO] action_space: Discrete(3)
00:00:00 [INFO] observation_space: Box(-28.274333953857422, 28.274333953857422, (6,), float32)
00:00:00 [INFO] reward_range: (-inf, inf)
00:00:00 [INFO] metadata: {'render.modes': ['human', 'rgb_array'], 'video.frames_per_second': 15}
00:00:00 [INFO] _max_episode_steps: 500
00:00:00 [INFO] _elapsed_steps: None
00:00:00 [INFO] id: Acrobot-v1
00:00:00 [INFO] entry_point: gym.envs.classic_control:AcrobotEnv
00:00:00 [INFO] reward_threshold: -100.0
00:00:00 [INFO] nondeterministic: False
00:00:00 [INFO] max_episode_steps: 500
00:00:00 [INFO] _kwargs: {}
00:00:00 [INFO] _env_name: Acrobot


In [3]:
class ClosedFormAgent:
    """Agent that chooses actions from a fixed, hand-written rule.

    The agent keeps no state: the constructor argument is ignored and
    ``reset`` / ``close`` are no-ops.
    """

    def __init__(self, _):
        """Accept (and ignore) a single positional argument."""

    def reset(self, mode=None):
        """Do nothing; ``mode`` is accepted but unused."""

    def step(self, observation, _reward, _done):
        """Return the action (0 or 2) for the given observation.

        Args:
            observation: iterable of exactly six numbers.
                NOTE(review): presumably the Acrobot-v1 layout
                (cos/sin of joint 1, cos/sin of joint 2, then the two
                angular velocities) -- confirm against the env docs.
            _reward: unused.
            _done: unused.

        Returns:
            0 when the last observation component is below -0.3, 2 when it
            is above 0.3; otherwise 0 if the combined term computed from
            the first four components is strictly positive, else 2.
        """
        first_x, first_y, second_x, second_y, _first_v, second_v = observation

        # Outside the +/-0.3 band the last component alone decides.
        if second_v < -0.3:
            return 0
        if second_v > 0.3:
            return 2

        # Inside the band, fall back to the sign of this combined term.
        combined = second_y + first_x * second_y + second_x * first_y
        return 0 if combined > 0. else 2

    def close(self):
        """Do nothing; there are no resources to release."""


# ClosedFormAgent.__init__ ignores its argument, so env is not stored or used.
agent = ClosedFormAgent(env)

In [4]:
def play_episode(env, agent, max_episode_steps=None, mode=None, render=False):
    """Run one episode of ``agent`` acting in ``env``.

    Args:
        env: object with ``reset()``, ``step(action)`` returning a 4-tuple
            ``(observation, reward, done, info)``, and ``render()``.
        agent: object with ``reset(mode=...)``, ``step(observation, reward,
            done)`` and ``close()``.
        max_episode_steps: optional cap on environment steps; a falsy value
            (``None`` or 0) means no cap.
        mode: forwarded unchanged to ``agent.reset``.
        render: when true, ``env.render()`` is called once per loop pass.

    Returns:
        Tuple ``(episode_reward, elapsed_steps)``: the summed rewards and
        the number of ``env.step`` calls made.
    """
    observation = env.reset()
    reward, done = 0., False
    agent.reset(mode=mode)

    total_reward = 0.
    step_count = 0
    limit_reached = False
    while not limit_reached:
        # The agent is consulted before the termination check, so it also
        # receives the final (done=True) transition.
        action = agent.step(observation, reward, done)
        if render:
            env.render()
        if done:
            break
        observation, reward, done, _info = env.step(action)
        total_reward += reward
        step_count += 1
        limit_reached = (bool(max_episode_steps)
                         and step_count >= max_episode_steps)

    agent.close()
    return total_reward, step_count


# Evaluate the agent over 100 episodes, logging each episode's result at
# DEBUG level and the mean/std of the episode rewards at INFO level.
logging.info('==== test ====')
episode_rewards = []
for episode in range(100):
    episode_reward, elapsed_steps = play_episode(env, agent)
    episode_rewards.append(episode_reward)
    logging.debug(
        'test episode %d: reward = %.2f, steps = %d',
        episode, episode_reward, elapsed_steps,
    )
mean_reward = np.mean(episode_rewards)
std_reward = np.std(episode_rewards)
logging.info('average episode reward = %.2f ± %.2f', mean_reward, std_reward)

00:00:00 [INFO] ==== test ====
00:00:00 [DEBUG] test episode 0: reward = -70.00, steps = 71
00:00:00 [DEBUG] test episode 1: reward = -194.00, steps = 195
00:00:00 [DEBUG] test episode 2: reward = -71.00, steps = 72
00:00:00 [DEBUG] test episode 3: reward = -78.00, steps = 79
00:00:00 [DEBUG] test episode 4: reward = -64.00, steps = 65
00:00:00 [DEBUG] test episode 5: reward = -157.00, steps = 158
00:00:00 [DEBUG] test episode 6: reward = -71.00, steps = 72
00:00:00 [DEBUG] test episode 7: reward = -74.00, steps = 75
00:00:00 [DEBUG] test episode 8: reward = -91.00, steps = 92
00:00:00 [DEBUG] test episode 9: reward = -138.00, steps = 139
00:00:00 [DEBUG] test episode 10: reward = -92.00, steps = 93
00:00:00 [DEBUG] test episode 11: reward = -90.00, steps = 91
00:00:00 [DEBUG] test episode 12: reward = -107.00, steps = 108
00:00:00 [DEBUG] test episode 13: reward = -85.00, steps = 86
00:00:00 [DEBUG] test episode 14: reward = -78.00, steps = 79
00:00:00 [DEBUG] test episode 15: reward 

In [5]:
# Close the environment now that the evaluation episodes are finished.
env.close()