# Use Closed-Form Policy to Play Pendulum-v1

In [1]:
import sys
import logging
import itertools

import numpy as np
np.random.seed(0)
import gym

# Configure the root logger once for the whole notebook: records at INFO and
# above are written to stdout (rather than the default stderr), each prefixed
# with an HH:MM:SS timestamp and the level name.
logging.basicConfig(level=logging.INFO,
        format='%(asctime)s [%(levelname)s] %(message)s',
        stream=sys.stdout, datefmt='%H:%M:%S')

In [2]:
# Build the environment, then dump every attribute of its spec followed by
# every attribute of the unwrapped environment, one log record per attribute.
env = gym.make('Pendulum-v1', new_step_api=True)
for attributes in (vars(env.spec), vars(env.unwrapped)):
    for key, value in attributes.items():
        logging.info('%s: %s', key, value)

00:00:00 [INFO] id: Pendulum-v1
00:00:00 [INFO] entry_point: gym.envs.classic_control:PendulumEnv
00:00:00 [INFO] reward_threshold: None
00:00:00 [INFO] nondeterministic: False
00:00:00 [INFO] max_episode_steps: 200
00:00:00 [INFO] order_enforce: True
00:00:00 [INFO] _kwargs: {}
00:00:00 [INFO] _env_name: Pendulum
00:00:00 [INFO] max_speed: 8
00:00:00 [INFO] max_torque: 2.0
00:00:00 [INFO] dt: 0.05
00:00:00 [INFO] g: 10.0
00:00:00 [INFO] m: 1.0
00:00:00 [INFO] l: 1.0
00:00:00 [INFO] viewer: None
00:00:00 [INFO] action_space: Box([-2.], [2.], (1,), float32)
00:00:00 [INFO] observation_space: Box([-1. -1. -8.], [1. 1. 8.], (3,), float32)
00:00:00 [INFO] np_random: RandomState(MT19937)
00:00:00 [INFO] spec: EnvSpec(Pendulum-v1)


In [3]:
class ClosedFormAgent:
    """Stateless hand-written policy that always outputs a torque of +2 or -2.

    The decision is made in the (angle, angular velocity) plane. The
    observation is first mirrored so that its second component is
    non-negative, the angle is recovered in [0, pi], and the torque sign is
    chosen from fixed regions of that half-plane. If the observation was
    mirrored, the resulting torque is mirrored back.
    """

    def __init__(self, _):
        """Accept and discard one argument; the policy has nothing to set up."""

    def reset(self, mode=None):
        """Do nothing: there is no per-episode state. ``mode`` is ignored."""

    def step(self, observation, reward, termination):
        """Return the action for ``observation`` as a 1-element array.

        Args:
            observation: iterable of three numbers ``(x, y, angular
                velocity)``; ``y`` is passed to ``arcsin`` and the sign of
                ``x`` selects the quadrant, i.e. they are treated as the
                cosine and sine of the angle.
            reward: unused.
            termination: unused.

        Returns:
            ``np.array([2.])`` or ``np.array([-2.])``.
        """
        cos_theta, sin_theta, omega = observation

        # Work in the half-plane sin >= 0; remember to undo the mirroring.
        mirrored = sin_theta < 0.
        if mirrored:
            sin_theta *= -1.
            omega *= -1.

        # arcsin only covers [0, pi/2] here; a negative cosine means the
        # angle actually lies in (pi/2, pi].
        theta = np.arcsin(sin_theta)
        if cos_theta < 0.:
            theta = np.pi - theta

        # Regions of the (theta, omega) plane where positive torque is used:
        # below a straight line, or in the band between two parabolas.
        below_line = theta < -0.3 * omega
        above_lower_parabola = theta > 0.03 * (omega - 2.5) ** 2. + 1.
        below_upper_parabola = theta < 0.15 * (omega + 3.) ** 2. + 2.
        in_band = above_lower_parabola and below_upper_parabola

        torque = 2. if (below_line or in_band) else -2.
        if mirrored:
            torque *= -1.
        return np.array([torque,])

    def close(self):
        """Do nothing: there are no resources to release."""


# ClosedFormAgent.__init__ discards its argument, so the agent keeps no
# reference to env.
agent = ClosedFormAgent(env)

In [4]:
def play_episode(env, agent, seed=None, mode=None, render=False):
    """Run one episode of ``agent`` in ``env`` and report the outcome.

    Args:
        env: environment whose ``reset(seed=...)`` returns an observation and
            whose ``step(action)`` returns the 5-tuple ``(observation,
            reward, termination, truncation, info)``.
        agent: object providing ``reset(mode=...)``, ``step(observation,
            reward, termination)`` and ``close()``.
        seed: forwarded to ``env.reset``.
        mode: forwarded to ``agent.reset``.
        render: when true, ``env.render()`` is called once per agent step.

    Returns:
        ``(episode_reward, elapsed_steps)``: the sum of rewards and the
        number of ``env.step`` calls made.
    """
    observation = env.reset(seed=seed)
    agent.reset(mode=mode)

    reward = 0.
    termination = truncation = False
    episode_reward = 0.

    # The counter equals the number of env.step calls completed so far.
    for elapsed_steps in itertools.count():
        # The agent is consulted before the end-of-episode check, so it also
        # sees the final observation, reward and termination flag.
        action = agent.step(observation, reward, termination)
        if render:
            env.render()
        if termination or truncation:
            break
        observation, reward, termination, truncation, _ = env.step(action)
        episode_reward += reward

    agent.close()
    return episode_reward, elapsed_steps


# Evaluate the agent over 100 episodes, logging each episode's return and
# length, then the mean and standard deviation of the returns.
logging.info('==== test ====')
episode_rewards = []
for episode in range(100):
    episode_reward, elapsed_steps = play_episode(env, agent)
    logging.info(
        'test episode %d: reward = %.2f, steps = %d',
        episode,
        episode_reward,
        elapsed_steps,
    )
    episode_rewards.append(episode_reward)
logging.info(
    'average episode reward = %.2f ± %.2f',
    *(statistic(episode_rewards) for statistic in (np.mean, np.std)),
)

00:00:00 [INFO] ==== test ====
00:00:00 [INFO] test episode 0: reward = -267.55, steps = 200
00:00:00 [INFO] test episode 1: reward = -126.17, steps = 200
00:00:00 [INFO] test episode 2: reward = -235.76, steps = 200
00:00:00 [INFO] test episode 3: reward = -127.66, steps = 200
00:00:00 [INFO] test episode 4: reward = -231.72, steps = 200
00:00:00 [INFO] test episode 5: reward = -283.73, steps = 200
00:00:01 [INFO] test episode 6: reward = -2.03, steps = 200
00:00:01 [INFO] test episode 7: reward = -225.50, steps = 200
00:00:01 [INFO] test episode 8: reward = -2.29, steps = 200
00:00:01 [INFO] test episode 9: reward = -283.62, steps = 200
00:00:01 [INFO] test episode 10: reward = -120.88, steps = 200
00:00:01 [INFO] test episode 11: reward = -3.70, steps = 200
00:00:01 [INFO] test episode 12: reward = -238.70, steps = 200
00:00:01 [INFO] test episode 13: reward = -119.87, steps = 200
00:00:01 [INFO] test episode 14: reward = -1.69, steps = 200
00:00:01 [INFO] test episode 15: reward = 

In [5]:
# Close the environment created at the top of the notebook now that the
# evaluation run is over.
env.close()