**1. Import Dependecies**

In [1]:
import os
import gymnasium as gym
from stable_baselines3 import A2C
from stable_baselines3.common.vec_env import VecFrameStack
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_atari_env

**2. Test Environment**

In [2]:
!python -m atari_py.import_roms /Roms/ROMS

/opt/anaconda3/envs/ml/bin/python: Error while finding module specification for 'atari_py.import_roms' (ModuleNotFoundError: No module named 'atari_py')


In [2]:
environment_name = 'Breakout-v4'
env = gym.make(environment_name, render_mode='rgb_array')

A.L.E: Arcade Learning Environment (version 0.8.1+53f58b7)
[Powered by Stella]


In [None]:
env.reset()

In [4]:
env.action_space

Discrete(4)

In [5]:
env.observation_space

Box(0, 255, (210, 160, 3), uint8)

In [8]:
eposides = 5
for episode in range(eposides):
    obs = env.reset()
    done = False
    score = 0

    while not done:
        env.render()
        action = env.action_space.sample()
        n_state, reward, done, trunc, info = env.step(action)
        score += reward
    print('Episode: {} Score: {}'.format(episode, score))
env.close()

Episode: 0 Score: 0.0
Episode: 1 Score: 1.0
Episode: 2 Score: 3.0
Episode: 3 Score: 1.0
Episode: 4 Score: 3.0


**3. Vectorize Environment and Train Model**

In [9]:
env = make_atari_env(environment_name, n_envs=4, seed=0)
env = VecFrameStack(env, n_stack=4)

In [13]:
env.reset()

array([[[[0, 0, 0, 0],
         [0, 0, 0, 0],
         [0, 0, 0, 0],
         ...,
         [0, 0, 0, 0],
         [0, 0, 0, 0],
         [0, 0, 0, 0]],

        [[0, 0, 0, 0],
         [0, 0, 0, 0],
         [0, 0, 0, 0],
         ...,
         [0, 0, 0, 0],
         [0, 0, 0, 0],
         [0, 0, 0, 0]],

        [[0, 0, 0, 0],
         [0, 0, 0, 0],
         [0, 0, 0, 0],
         ...,
         [0, 0, 0, 0],
         [0, 0, 0, 0],
         [0, 0, 0, 0]],

        ...,

        [[0, 0, 0, 0],
         [0, 0, 0, 0],
         [0, 0, 0, 0],
         ...,
         [0, 0, 0, 0],
         [0, 0, 0, 0],
         [0, 0, 0, 0]],

        [[0, 0, 0, 0],
         [0, 0, 0, 0],
         [0, 0, 0, 0],
         ...,
         [0, 0, 0, 0],
         [0, 0, 0, 0],
         [0, 0, 0, 0]],

        [[0, 0, 0, 0],
         [0, 0, 0, 0],
         [0, 0, 0, 0],
         ...,
         [0, 0, 0, 0],
         [0, 0, 0, 0],
         [0, 0, 0, 0]]],


       [[[0, 0, 0, 0],
         [0, 0, 0, 0],
         [0, 0

In [14]:
env.render()

array([[[0, 0, 0],
        [0, 0, 0],
        [0, 0, 0],
        ...,
        [0, 0, 0],
        [0, 0, 0],
        [0, 0, 0]],

       [[0, 0, 0],
        [0, 0, 0],
        [0, 0, 0],
        ...,
        [0, 0, 0],
        [0, 0, 0],
        [0, 0, 0]],

       [[0, 0, 0],
        [0, 0, 0],
        [0, 0, 0],
        ...,
        [0, 0, 0],
        [0, 0, 0],
        [0, 0, 0]],

       ...,

       [[0, 0, 0],
        [0, 0, 0],
        [0, 0, 0],
        ...,
        [0, 0, 0],
        [0, 0, 0],
        [0, 0, 0]],

       [[0, 0, 0],
        [0, 0, 0],
        [0, 0, 0],
        ...,
        [0, 0, 0],
        [0, 0, 0],
        [0, 0, 0]],

       [[0, 0, 0],
        [0, 0, 0],
        [0, 0, 0],
        ...,
        [0, 0, 0],
        [0, 0, 0],
        [0, 0, 0]]], dtype=uint8)

In [15]:
env.close()

In [17]:
log_path = os.path.join('Training', 'Logs')
model = A2C('CnnPolicy', env, verbose=1, tensorboard_log=log_path)

Using cpu device
Wrapping the env in a VecTransposeImage.


In [18]:
model.learn(total_timesteps=10000)

Logging to Training/Logs/A2C_1
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 300      |
|    ep_rew_mean        | 1.76     |
| time/                 |          |
|    fps                | 56       |
|    iterations         | 100      |
|    time_elapsed       | 35       |
|    total_timesteps    | 2000     |
| train/                |          |
|    entropy_loss       | -1.23    |
|    explained_variance | -0.273   |
|    learning_rate      | 0.0007   |
|    n_updates          | 99       |
|    policy_loss        | -0.134   |
|    value_loss         | 0.0344   |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 289      |
|    ep_rew_mean        | 1.6      |
| time/                 |          |
|    fps                | 55       |
|    iterations         | 200      |
|    time_elapsed       | 72       |
|    total_timesteps    | 4000     |
| train

<stable_baselines3.a2c.a2c.A2C at 0x30c107820>

**4. Save and Reload Model**

In [19]:
a2c_path = os.path.join('Training', 'Saved Models', 'A2C_Breakout_Model')
model.save(a2c_path)

In [20]:
del model

In [21]:
model = A2C.load(a2c_path, env)

Wrapping the env in a VecTransposeImage.


**5. Evaluate and Test**

In [24]:
# Evaluation can only be done on one environment
env = make_atari_env(environment_name, n_envs=1, seed=0)
env = VecFrameStack(env, n_stack=4)

In [25]:
evaluate_policy(model, env, n_eval_episodes=10, render=True)

  logger.warn(


(1.5, 1.3601470508735443)