# 1. Import Dependencies

In [1]:
import os
import gym
from stable_baselines3 import A2C
from stable_baselines3.common.vec_env import VecFrameStack
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_atari_env

# 2. Test Environment

### Download the Atari Game ROMs
[http://www.atarimania.com/roms/Roms.rar](http://www.atarimania.com/roms/Roms.rar)

In [4]:
# Run atari_py's ROM importer on the extracted archive in ./Roms/ROMS so the
# game ROMs are copied into the atari_py package (see the "copying ..." output).
!python -m atari_py.import_roms ./Roms/ROMS

copying bank_heist.bin from ./Roms/ROMS/Bank Heist (Bonnie & Clyde, Cops 'n' Robbers, Hold-Up, Roaring 20's) (1983) (20th Century Fox Video Games, Bill Aspromonte) (11012) ~.bin to /home/brianadit24/Downloads/AI_Research/rlcourse_env/lib/python3.8/site-packages/atari_py/atari_roms/bank_heist.bin
copying kangaroo.bin from ./Roms/ROMS/Kangaroo (1983) (Atari - GCC, Kevin Osborn) (CX2689) ~.bin to /home/brianadit24/Downloads/AI_Research/rlcourse_env/lib/python3.8/site-packages/atari_py/atari_roms/kangaroo.bin
copying tutankham.bin from ./Roms/ROMS/Tutankham (1983) (Parker Brothers, Dave Engman, Dawn Stockbridge) (PB5340) ~.bin to /home/brianadit24/Downloads/AI_Research/rlcourse_env/lib/python3.8/site-packages/atari_py/atari_roms/tutankham.bin
copying video_pinball.bin from ./Roms/ROMS/Video Pinball - Arcade Pinball (1981) (Atari, Bob Smith - Sears) (CX2648 - 49-75161) ~.bin to /home/brianadit24/Downloads/AI_Research/rlcourse_env/lib/python3.8/site-packages/atari_py/atari_roms/video_pinball

copying pitfall.bin from ./Roms/ROMS/Pitfall! - Pitfall Harry's Jungle Adventure (Jungle Runner) (1982) (Activision, David Crane) (AX-018, AX-018-04) ~.bin to /home/brianadit24/Downloads/AI_Research/rlcourse_env/lib/python3.8/site-packages/atari_py/atari_roms/pitfall.bin
copying pooyan.bin from ./Roms/ROMS/Pooyan (1983) (Konami) (RC 100-X 02) ~.bin to /home/brianadit24/Downloads/AI_Research/rlcourse_env/lib/python3.8/site-packages/atari_py/atari_roms/pooyan.bin
copying private_eye.bin from ./Roms/ROMS/Private Eye (1984) (Activision, Bob Whitehead) (AG-034-04) ~.bin to /home/brianadit24/Downloads/AI_Research/rlcourse_env/lib/python3.8/site-packages/atari_py/atari_roms/private_eye.bin
copying space_invaders.bin from ./Roms/ROMS/Space Invaders (1980) (Atari, Richard Maurer - Sears) (CX2632 - 49-75153) ~.bin to /home/brianadit24/Downloads/AI_Research/rlcourse_env/lib/python3.8/site-packages/atari_py/atari_roms/space_invaders.bin
copying kung_fu_master.bin from ./Roms/ROMS/Kung-Fu Master (1

In [2]:
# Gym environment id; reused later when the vectorized training and
# evaluation environments are built.
ENV_NAME = "Breakout-v0"

# Plain single-instance environment, used for the random-play sanity check.
env = gym.make(ENV_NAME)

In [4]:
# Display the environment's action space (the set of actions an agent may take).
env.action_space

Discrete(4)

In [5]:
# Sanity-check the environment by playing a few episodes with uniformly random
# actions and printing the total reward collected in each one.
episodes = 5
try:
    for episode in range(1, episodes + 1):
        obs = env.reset()
        done = False
        score = 0

        # Step until the environment reports the episode is over.
        while not done:
            env.render()
            action = env.action_space.sample()  # random action, no policy yet
            obs, reward, done, info = env.step(action)
            score += reward
        print(f"Episode: {episode} Score: {score}")
finally:
    # Close the environment even if a step raises or the cell is interrupted,
    # so the emulator and render window are not left open.
    env.close()

Episode: 1 Score: 0.0
Episode: 2 Score: 0.0
Episode: 3 Score: 0.0
Episode: 4 Score: 1.0
Episode: 5 Score: 1.0


# 3. Vectorize Environment and Train Model

In [6]:
# Build 4 copies of the Atari environment (n_envs=4) so training collects
# experience from several games at once, then stack the last 4 frames of each
# copy (n_stack=4) into a single observation.
# NOTE(review): make_atari_env applies Atari preprocessing that the
# Stable-Baselines3 docs demonstrate with "NoFrameskip-v4" ids; confirm that
# ENV_NAME ('Breakout-v0') is the intended id here.
env = VecFrameStack(
    make_atari_env(ENV_NAME, n_envs=4, seed=0),
    n_stack=4,
)

In [12]:
# Directory handed to the model as its TensorBoard log location.
log_path = os.path.join('Training', 'Logs')

# A2C agent with the 'CnnPolicy' policy, bound to the stacked-frame training env.
model = A2C(
    policy='CnnPolicy',
    env=env,
    verbose=1,
    tensorboard_log=log_path,
)

Using cpu device
Wrapping the env in a VecTransposeImage.


In [13]:
# Train for 100k environment timesteps; progress tables are printed because the
# model was created with verbose=1.
model.learn(total_timesteps=100_000)

Logging to Training/Logs/A2C_1
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 277      |
|    ep_rew_mean        | 1.46     |
| time/                 |          |
|    fps                | 55       |
|    iterations         | 100      |
|    time_elapsed       | 35       |
|    total_timesteps    | 2000     |
| train/                |          |
|    entropy_loss       | -1.38    |
|    explained_variance | 0.00752  |
|    learning_rate      | 0.0007   |
|    n_updates          | 99       |
|    policy_loss        | -0.164   |
|    value_loss         | 0.0315   |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 282      |
|    ep_rew_mean        | 1.58     |
| time/                 |          |
|    fps                | 57       |
|    iterations         | 200      |
|    time_elapsed       | 69       |
|    total_timesteps    | 4000     |
| train

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 346      |
|    ep_rew_mean        | 3.25     |
| time/                 |          |
|    fps                | 63       |
|    iterations         | 1400     |
|    time_elapsed       | 442      |
|    total_timesteps    | 28000    |
| train/                |          |
|    entropy_loss       | -0.12    |
|    explained_variance | 0.863    |
|    learning_rate      | 0.0007   |
|    n_updates          | 1399     |
|    policy_loss        | 0.0118   |
|    value_loss         | 0.166    |
------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 345       |
|    ep_rew_mean        | 3.25      |
| time/                 |           |
|    fps                | 63        |
|    iterations         | 1500      |
|    time_elapsed       | 472       |
|    total_timesteps    | 30000     |
| train/                |    

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 439      |
|    ep_rew_mean        | 4.96     |
| time/                 |          |
|    fps                | 62       |
|    iterations         | 2700     |
|    time_elapsed       | 859      |
|    total_timesteps    | 54000    |
| train/                |          |
|    entropy_loss       | -0.236   |
|    explained_variance | 0.941    |
|    learning_rate      | 0.0007   |
|    n_updates          | 2699     |
|    policy_loss        | 0.02     |
|    value_loss         | 0.0282   |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 442      |
|    ep_rew_mean        | 5.02     |
| time/                 |          |
|    fps                | 62       |
|    iterations         | 2800     |
|    time_elapsed       | 893      |
|    total_timesteps    | 56000    |
| train/                |          |
|

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 437      |
|    ep_rew_mean        | 4.95     |
| time/                 |          |
|    fps                | 62       |
|    iterations         | 4000     |
|    time_elapsed       | 1289     |
|    total_timesteps    | 80000    |
| train/                |          |
|    entropy_loss       | -0.227   |
|    explained_variance | 0.652    |
|    learning_rate      | 0.0007   |
|    n_updates          | 3999     |
|    policy_loss        | 0.01     |
|    value_loss         | 0.0625   |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 448      |
|    ep_rew_mean        | 5.16     |
| time/                 |          |
|    fps                | 62       |
|    iterations         | 4100     |
|    time_elapsed       | 1320     |
|    total_timesteps    | 82000    |
| train/                |          |
|

<stable_baselines3.a2c.a2c.A2C at 0x7fbed0033a30>

In [14]:
# Location of the saved checkpoint, relative to the notebook's working directory.
a2c_path = os.path.join('Training', 'SavedModels', 'A2C_Breakout_Model')

# Persist the trained agent so it can be reloaded without retraining.
model.save(path=a2c_path)

In [15]:
# Remove the trained model from the namespace; it was saved to disk in the
# previous cell and is reloaded from that file in the next one.
del model

In [16]:
# Restore the agent from the saved checkpoint and attach the current env to it.
model = A2C.load(a2c_path, env=env)

Wrapping the env in a VecTransposeImage.


# 4. Evaluate and Test the Model

In [17]:
# Evaluation uses a single environment copy (n_envs=1), with the same 4-frame
# stacking that was used for training.
# NOTE(review): rebinding `env` here leaves the 4-copy training environment
# created earlier unclosed; confirm whether it should be closed first.
env = VecFrameStack(
    make_atari_env(ENV_NAME, n_envs=1, seed=0),
    n_stack=4,
)

In [18]:
# Play 10 rendered evaluation episodes; the displayed tuple is
# (mean episode reward, standard deviation of episode reward).
evaluate_policy(
    model=model,
    env=env,
    n_eval_episodes=10,
    render=True,
)

(5.7, 2.968164415931166)

In [19]:
# Release the evaluation environment now that the rendered episodes are done.
env.close()

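<b>Conclusion:</b> After 100,000 training timesteps the agent averages only about 5.7 points per evaluation episode (standard deviation ≈ 3.0), so it most likely needs a larger <code>total_timesteps</code> budget during training.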