# Use a Closed-Form Policy to Play BreakoutNoFrameskip-v4

In [1]:
import sys
import logging
import itertools

import numpy as np
np.random.seed(0)
import gym

# Route INFO-and-above log records to stdout, each prefixed with an
# HH:MM:SS timestamp and the level name.
logging.basicConfig(
    stream=sys.stdout,
    level=logging.INFO,
    datefmt='%H:%M:%S',
    format='%(asctime)s [%(levelname)s] %(message)s',
)

In [2]:
# Create the Breakout environment. NOTE(review): new_step_api=True is
# presumably what makes env.step() return the five-tuple unpacked in
# play_episode (observation, reward, termination, truncation, info) --
# confirm against the installed gym version.
env = gym.make('BreakoutNoFrameskip-v4', new_step_api=True)

# Log every instance attribute of the environment object, then of its spec.
for key, value in vars(env).items():
    logging.info('%s: %s', key, value)
for key, value in vars(env.spec).items():
    logging.info('%s: %s', key, value)

00:03:37 [INFO] env: <AtariEnv<BreakoutNoFrameskip-v4>>
00:03:37 [INFO] action_space: Discrete(4)
00:03:37 [INFO] observation_space: Box(0, 255, (210, 160, 3), uint8)
00:03:37 [INFO] reward_range: (-inf, inf)
00:03:37 [INFO] metadata: {'render.modes': ['human', 'rgb_array']}
00:03:37 [INFO] _max_episode_steps: 400000
00:03:37 [INFO] _elapsed_steps: None
00:03:37 [INFO] id: BreakoutNoFrameskip-v4
00:03:37 [INFO] entry_point: gym.envs.atari:AtariEnv
00:03:37 [INFO] reward_threshold: None
00:03:37 [INFO] nondeterministic: False
00:03:37 [INFO] max_episode_steps: 400000
00:03:37 [INFO] _kwargs: {'game': 'breakout', 'obs_type': 'image', 'frameskip': 1}
00:03:37 [INFO] _env_name: BreakoutNoFrameskip


In [3]:
def calc_mean(locs, value=float('nan')):
    """Return the mean index of the nonzero entries of ``locs``.

    Indices are taken along the first axis (``locs.nonzero()[0]``); the
    callers in this file pass 1-D boolean masks.  When ``locs`` has no
    nonzero entry there is nothing to average, so ``value`` (NaN by
    default) is returned unchanged.
    """
    hits = locs.nonzero()[0]
    return np.nanmean(hits) if len(hits) else value


class ClosedFormAgent:
    """Hand-coded Breakout policy that moves the paddle under the ball.

    Each step locates the paddle and the ball in the frame, extrapolates the
    ball's straight-line path down to the paddle row, and moves the paddle
    toward that landing point.  The agent has no learnable parameters; the
    only state it keeps is the paddle/ball position seen on the previous
    frame, which is used to estimate velocities.
    """

    def __init__(self, _):
        # The constructor argument is accepted for call-site compatibility
        # and is not used.  Initialise the tracking state here as well, so
        # that step() works even if the caller has not called reset() yet.
        self.reset()

    def reset(self, mode=None):
        """Reset the remembered positions at the start of an episode.

        ``mode`` is accepted and ignored.  72 is the horizontal centre of
        the 144-pixel-wide crop used in ``step``; 95 is the height of that
        crop.
        """
        self.pad_x = 72.
        self.ball_x = 72.
        self.ball_y = 95.

    def step(self, observation, reward, termination):
        """Choose an action for the current frame.

        Args:
            observation: image array indexed as ``[row, column, channel]``;
                rows 95-189 and columns 8-151 of channel 0 are inspected.
            reward: unused.
            termination: unused.

        Returns:
            ``2`` (right), ``3`` (left) or ``1``.  Index 1 is FIRE in the
            ALE Breakout action set (NOOP, FIRE, RIGHT, LEFT): it does not
            move the paddle.
        """
        # Mask of pixels whose channel-0 value is exactly 200, flipped
        # vertically so that row 0 is the bottom row of the crop and the row
        # index grows upward.
        # NOTE(review): presumably 200 is the red value shared by the paddle
        # and the ball, and the bottom row is the paddle row -- confirm.
        pixels = np.flipud(observation[95:190, 8:152, 0]) == 200
        pad_x = calc_mean(pixels[0])
        ball_x = calc_mean(pixels[1:].any(axis=0))
        # +1 because the ball mask skips row 0.
        ball_y = calc_mean(pixels[1:].any(axis=1)) + 1.

        # Displacements since the previous frame.
        pad_xv = pad_x - self.pad_x
        ball_xv = ball_x - self.ball_x
        ball_yv = ball_y - self.ball_y

        # Extrapolate the ball linearly to y == 0; abs() mirrors a landing
        # point that falls left of column 0 back into the field.  When the
        # ball is missing or did not move vertically, the division yields
        # NaN or +/-inf; errstate only silences NumPy's RuntimeWarning, the
        # computed value is unchanged.
        with np.errstate(divide='ignore', invalid='ignore'):
            target_x = abs(ball_x - ball_xv / ball_yv * ball_y)

        # Paddle position, nudged by half its last displacement, plus
        # Gaussian noise with standard deviation 1/3.
        pred_x = pad_x + pad_xv / 2. + np.random.randn() / 3.

        # Comparisons involving NaN are False, so a missing paddle or ball
        # falls through to the final branch.
        if pred_x < target_x - 1 and pred_x + 5. < pixels.shape[1]:
            action = 2 # right
        elif pred_x > target_x + 1 and pred_x - 5. >= 0:
            action = 3 # left
        else:
            action = 1 # FIRE: paddle stays where it is
        self.pad_x = pad_x
        self.ball_x = ball_x
        self.ball_y = ball_y
        return action

    def close(self):
        """No resources to release."""
        pass


# The constructor requires one positional argument but does not use it.
agent = ClosedFormAgent(env)

In [4]:
def play_episode(env, agent, seed=None, mode=None, render=False):
    """Run one episode of ``agent`` in ``env`` and report the outcome.

    Args:
        env: environment whose ``reset(seed=...)`` returns an observation
            and whose ``step(action)`` returns the five-tuple
            ``(observation, reward, termination, truncation, info)``.
        agent: object providing ``reset(mode=...)``,
            ``step(observation, reward, termination)`` and ``close()``.
        seed: forwarded to ``env.reset``.
        mode: forwarded to ``agent.reset``.
        render: when true, ``env.render()`` is called once per loop pass.

    Returns:
        ``(episode_reward, elapsed_steps)``: the sum of all step rewards and
        the number of ``env.step`` calls made.
    """
    obs = env.reset(seed=seed)
    last_reward = 0.
    terminated = truncated = False
    agent.reset(mode=mode)
    total_reward = 0.
    # n_steps counts completed env.step() calls: it is 0 on the first pass
    # and grows by one each time the loop body runs to the end.
    for n_steps in itertools.count():
        # The agent is also shown the final transition (and its chosen
        # action discarded) before the loop exits.
        action = agent.step(obs, last_reward, terminated)
        if render:
            env.render()
        if terminated or truncated:
            break
        obs, last_reward, terminated, truncated, _ = env.step(action)
        total_reward += last_reward
    agent.close()
    return total_reward, n_steps


# Evaluate the agent over 100 episodes: log each episode's total reward and
# step count, then the mean and standard deviation of the episode rewards.
logging.info('==== test ====')
episode_rewards = []
for episode in range(100):
    # seed, mode and render are left at play_episode's defaults.
    episode_reward, elapsed_steps = play_episode(env, agent)
    episode_rewards.append(episode_reward)
    logging.info('test episode %d: reward = %.2f, steps = %d',
            episode, episode_reward, elapsed_steps)
logging.info('average episode reward = %.2f ± %.2f',
        np.mean(episode_rewards), np.std(episode_rewards))

00:03:37 [INFO] ==== test ====
00:15:32 [INFO] test episode 0: reward = 397.00, steps = 400000
00:16:12 [INFO] test episode 1: reward = 860.00, steps = 22312
00:27:02 [INFO] test episode 2: reward = 864.00, steps = 400000
00:27:51 [INFO] test episode 3: reward = 864.00, steps = 27045
00:28:42 [INFO] test episode 4: reward = 864.00, steps = 28393
00:30:24 [INFO] test episode 5: reward = 801.00, steps = 57369
00:41:15 [INFO] test episode 6: reward = 428.00, steps = 400000
00:42:26 [INFO] test episode 7: reward = 864.00, steps = 39369
00:53:12 [INFO] test episode 8: reward = 864.00, steps = 400000
00:54:33 [INFO] test episode 9: reward = 848.00, steps = 46194
00:55:14 [INFO] test episode 10: reward = 824.00, steps = 23012
00:55:44 [INFO] test episode 11: reward = 669.00, steps = 16171
00:56:15 [INFO] test episode 12: reward = 745.00, steps = 17462
00:57:22 [INFO] test episode 13: reward = 629.00, steps = 37797
00:58:10 [INFO] test episode 14: reward = 864.00, steps = 27505
00:59:23 [INFO]

In [5]:
# Close the environment now that evaluation is finished.
env.close()