### 1. Import Dependencies

In [2]:
import os
import time

import gym
from stable_baselines3 import A2C
from stable_baselines3.common.env_util import make_atari_env	# create the atari environment
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.evaluation import evaluate_policy	# evaluate the model
from stable_baselines3.common.vec_env import VecFrameStack	# parallel environments

### 2. Test Environment

In [3]:
# Single Breakout environment for a quick manual test; render_mode='human' is
# requested so the game can be watched while it runs.
# (The line below is an unfinished alternative environment id, kept commented out.)
# env = gym.make("ALE/Breakout-v")
env = gym.make("Breakout-v4", render_mode='human')

A.L.E: Arcade Learning Environment (version 0.8.1+53f58b7)
[Powered by Stella]


In [4]:
# Reset once, then report the observation and action spaces of the test env.
env.reset()
for label, space in (
    ("Observation space:", env.observation_space),
    ("Action space:", env.action_space),
):
    print(label, space)

Observation space: Box(0, 255, (210, 160, 3), uint8)
Action space: Discrete(4)


In [5]:
# Play a few episodes with uniformly random actions to sanity-check the environment.
episodes = 3

for episode in range(1, episodes + 1):
	# With the 5-value step API used below, reset() returns (observation, info).
	obs, info = env.reset()
	terminated = False
	truncated = False
	score = 0

	# An episode is over when the game ends (terminated) OR when it is cut
	# short, e.g. by a step limit (truncated); both flags must stop the loop.
	while not (terminated or truncated):
		# No explicit env.render() here: the env was created with
		# render_mode='human', so frames are shown as part of step().
		action = env.action_space.sample()
		obs, reward, terminated, truncated, info = env.step(action)
		score += reward
	print(f'Episode: {episode}, Score: {score}')
env.close()

  logger.warn(
  if not isinstance(terminated, (bool, np.bool8)):


Episode: 1, Score: 0.0
Episode: 2, Score: 1.0
Episode: 3, Score: 2.0


In [6]:
# The previous cell already ends with env.close(); this repeats that call on
# the same test environment.
env.close()

### 3. Vectorize environment and train the model

In [11]:
# Build 4 seeded Breakout environments wrapped as one vectorized env.
env = make_atari_env('Breakout-v4', n_envs=4, seed=0)
# Stack 4 consecutive frames per environment (frame stacking, not environment stacking).
env = VecFrameStack(env, n_stack=4)

In [12]:
# Reset the vectorized env, then render it. render() is the cell's last
# expression, so the uint8 image array it returns is shown as the cell output.
env.reset()
env.render()

array([[[0, 0, 0],
        [0, 0, 0],
        [0, 0, 0],
        ...,
        [0, 0, 0],
        [0, 0, 0],
        [0, 0, 0]],

       [[0, 0, 0],
        [0, 0, 0],
        [0, 0, 0],
        ...,
        [0, 0, 0],
        [0, 0, 0],
        [0, 0, 0]],

       [[0, 0, 0],
        [0, 0, 0],
        [0, 0, 0],
        ...,
        [0, 0, 0],
        [0, 0, 0],
        [0, 0, 0]],

       ...,

       [[0, 0, 0],
        [0, 0, 0],
        [0, 0, 0],
        ...,
        [0, 0, 0],
        [0, 0, 0],
        [0, 0, 0]],

       [[0, 0, 0],
        [0, 0, 0],
        [0, 0, 0],
        ...,
        [0, 0, 0],
        [0, 0, 0],
        [0, 0, 0]],

       [[0, 0, 0],
        [0, 0, 0],
        [0, 0, 0],
        ...,
        [0, 0, 0],
        [0, 0, 0],
        [0, 0, 0]]], dtype=uint8)

In [13]:
# NOTE(review): read top to bottom, this closes the vectorized env that the
# cells below still pass to A2C(...) and A2C.load(...). The execution counts
# around here are out of order, so confirm the env is usable after close()
# or remove this cell before relying on Restart & Run All.
env.close()

In [8]:
# TensorBoard logs go under training/logs; the A2C agent uses the CNN policy
# on the vectorized env built above.
log_path = os.path.join('training', 'logs')
model = A2C(
    policy="CnnPolicy",
    env=env,
    tensorboard_log=log_path,
    verbose=1,
)


Using cpu device
Wrapping the env in a VecTransposeImage.


In [20]:
# Train for three million timesteps (long-running cell).
model.learn(total_timesteps=3_000_000)

Logging to training/logs/A2C_5
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 597      |
|    ep_rew_mean        | 9        |
| time/                 |          |
|    fps                | 110      |
|    iterations         | 100      |
|    time_elapsed       | 4        |
|    total_timesteps    | 500      |
| train/                |          |
|    entropy_loss       | -0.221   |
|    explained_variance | -227     |
|    learning_rate      | 0.0007   |
|    n_updates          | 40099    |
|    policy_loss        | -0.00906 |
|    value_loss         | 0.03     |
------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 593       |
|    ep_rew_mean        | 8.57      |
| time/                 |           |
|    fps                | 107       |
|    iterations         | 200       |
|    time_elapsed       | 9         |
|    total_timesteps    | 1000      

<stable_baselines3.a2c.a2c.A2C at 0x7f38d8620160>

### 4. Save and reload Model

In [21]:
# Destination path for the saved model.
# NOTE(review): the file name says "200k", but the training cell above asks
# for 3 million timesteps; confirm which is intended and rename if needed.
a2c_path = os.path.join('training', 'saved_models', 'a2c_breakout_200k')

In [22]:
# Write the trained model to a2c_path.
model.save(a2c_path)

In [15]:
# Discard the in-memory model so that the following cell has to restore it
# from the saved file.
del model

In [23]:
# Restore the agent from disk and attach the current env to it.
model = A2C.load(a2c_path, env=env)

Wrapping the env in a VecTransposeImage.


### 5. Evaluate and test

In [17]:
# Single-environment Breakout setup for evaluation, with 4 stacked frames
# to match the n_stack used for training.
env = VecFrameStack(
    make_atari_env('Breakout-v4', n_envs=1, seed=0),
    n_stack=4,
)

In [25]:
# Watch the loaded agent play for a fixed number of steps.
# The env stepped here is the one attached to the model (model.get_env()),
# not the `env` variable created in the previous cell.
vec_env = model.get_env()
obs = vec_env.reset()
for _ in range(1000):
    # deterministic=True: request the policy's deterministic action rather than a sampled one
    action, _states = model.predict(obs, deterministic=True)
    obs, rewards, dones, info = vec_env.step(action)
    vec_env.render("human")
    time.sleep(0.1)  # pause 100 ms between steps so playback is watchable
# `env` is intentionally NOT closed here: the next cell still passes it to
# evaluate_policy, so closing it at this point would hand over a closed env.

KeyboardInterrupt: 

: 

In [24]:
# Evaluate the agent over 10 rendered episodes; the cell displays the
# returned 2-tuple (see the evaluate_policy docs for its exact meaning).
evaluate_policy(
    model=model,
    env=env,
    n_eval_episodes=10,
    render=True,
)

(14.8, 5.706137047074842)