# 1. Import Dependencies


In [1]:
import os
import gym # OpenAI gym
from stable_baselines3 import A2C # we choose this algorithm among the available ones (PPO, ACER, DQN, ...)
from stable_baselines3.common.vec_env import VecFrameStack
# Earlier in the course we did not vectorize the environment: we trained on one environment at a time.
# For Breakout we train on four environments at the same time in order to speed up training.
from stable_baselines3.common.evaluation import evaluate_policy # makes it easier to test our model
from stable_baselines3.common.env_util import make_atari_env


# 2. Test Environment

In [2]:
# Copy the Atari ROM files found under ./ROMS into the installed atari_py package.
!python -m atari_py.import_roms ./ROMS

copying tutankham.bin from ./ROMS/Tutankham (1983) (Parker Brothers, Dave Engman, Dawn Stockbridge) (PB5340) ~.bin to /home/uruk380/anaconda3/envs/Reinforcement_Learning_YT_course/lib/python3.8/site-packages/atari_py/atari_roms/tutankham.bin
copying phoenix.bin from ./ROMS/Phoenix (1983) (Atari - GCC, Mike Feinstein, John Mracek) (CX2673) ~.bin to /home/uruk380/anaconda3/envs/Reinforcement_Learning_YT_course/lib/python3.8/site-packages/atari_py/atari_roms/phoenix.bin
copying robotank.bin from ./ROMS/Robot Tank (Robotank) (1983) (Activision, Alan Miller) (AZ-028, AG-028-04) ~.bin to /home/uruk380/anaconda3/envs/Reinforcement_Learning_YT_course/lib/python3.8/site-packages/atari_py/atari_roms/robotank.bin
copying qbert.bin from ./ROMS/Q-bert (1987) (Atari) (CX26150).bin to /home/uruk380/anaconda3/envs/Reinforcement_Learning_YT_course/lib/python3.8/site-packages/atari_py/atari_roms/qbert.bin
copying adventure.bin from ./ROMS/Adventure (1980) (Atari, Warren Robinett) (CX2613, CX2613P) (

In [3]:
# Gym id of the game used for the single-environment smoke test in this section.
environment_name = 'Breakout-v0'
# Plain (non-vectorized) Gym environment.
env = gym.make(environment_name)

In [4]:
# Reset the environment; the displayed value is the initial observation (a uint8 image array).
env.reset()

array([[[0, 0, 0],
        [0, 0, 0],
        [0, 0, 0],
        ...,
        [0, 0, 0],
        [0, 0, 0],
        [0, 0, 0]],

       [[0, 0, 0],
        [0, 0, 0],
        [0, 0, 0],
        ...,
        [0, 0, 0],
        [0, 0, 0],
        [0, 0, 0]],

       [[0, 0, 0],
        [0, 0, 0],
        [0, 0, 0],
        ...,
        [0, 0, 0],
        [0, 0, 0],
        [0, 0, 0]],

       ...,

       [[0, 0, 0],
        [0, 0, 0],
        [0, 0, 0],
        ...,
        [0, 0, 0],
        [0, 0, 0],
        [0, 0, 0]],

       [[0, 0, 0],
        [0, 0, 0],
        [0, 0, 0],
        ...,
        [0, 0, 0],
        [0, 0, 0],
        [0, 0, 0]],

       [[0, 0, 0],
        [0, 0, 0],
        [0, 0, 0],
        ...,
        [0, 0, 0],
        [0, 0, 0],
        [0, 0, 0]]], dtype=uint8)

In [5]:
# Inspect the action space: Breakout exposes 4 discrete actions.
env.action_space

Discrete(4)

In [6]:
env.observation_space
# Box(0, 255, (210, 160, 3), uint8)
# lower bound = 0, upper bound = 255 (pixel intensities)
# 210 -> height
# 160 -> width
# 3 -> colour channels of the image

Box(0, 255, (210, 160, 3), uint8)

In [7]:
episodes = 5  # number of full Breakout games to play with random actions
# An episode is one full game within the environment.
# Some environments have a fixed episode length
#     (e.g. CartPole: 200 frames per episode);
#     others run until a terminal condition is reached
#         (Breakout: play until you run out of lives).
try:
    for episode in range(1, episodes + 1):
        obs = env.reset()  # initial observation for this episode
        done = False  # becomes True once the episode is over
        score = 0  # running total of the rewards collected in this episode

        while not done:
            # Show the graphical representation of the environment.
            env.render()
            # Baseline behaviour: pick an action uniformly at random.
            action = env.action_space.sample()
            # step() returns:
            #   obs    -> the next observation
            #   reward -> the reward obtained by this step
            #   done   -> whether the episode is finished (ends the while loop)
            #   info   -> a dict of extra diagnostic information
            obs, reward, done, info = env.step(action)
            # Accumulate the total reward of the episode.
            score += reward
        print('Episode: {} Score: {}'.format(episode, score))
finally:
    # Always release the render window, even if the loop is interrupted
    # (e.g. kernel interrupt) or a step raises.
    env.close()

# The scores are very low: the actions are purely random.

Episode: 1 Score: 0.0
Episode: 2 Score: 0.0
Episode: 3 Score: 2.0
Episode: 4 Score: 0.0
Episode: 5 Score: 0.0


# 3. Vectorise Environment and Train Model

In [8]:
# Build four Breakout environments at once (seed 0) with the Stable Baselines
# Atari helper, then wrap the result in a frame-stacking wrapper.
env = VecFrameStack(
    make_atari_env('Breakout-v0', n_envs=4, seed=0),
    n_stack=4,
)

# Helper functions
#     make_atari_env is a Stable Baselines helper that creates wrapped Atari environments
#     VecFrameStack stacks consecutive frames (here n_stack=4) of the vectorized environment

In [9]:
env.reset() # do not forget to reset before using the environment!

array([[[[0, 0, 0, 0],
         [0, 0, 0, 0],
         [0, 0, 0, 0],
         ...,
         [0, 0, 0, 0],
         [0, 0, 0, 0],
         [0, 0, 0, 0]],

        [[0, 0, 0, 0],
         [0, 0, 0, 0],
         [0, 0, 0, 0],
         ...,
         [0, 0, 0, 0],
         [0, 0, 0, 0],
         [0, 0, 0, 0]],

        [[0, 0, 0, 0],
         [0, 0, 0, 0],
         [0, 0, 0, 0],
         ...,
         [0, 0, 0, 0],
         [0, 0, 0, 0],
         [0, 0, 0, 0]],

        ...,

        [[0, 0, 0, 0],
         [0, 0, 0, 0],
         [0, 0, 0, 0],
         ...,
         [0, 0, 0, 0],
         [0, 0, 0, 0],
         [0, 0, 0, 0]],

        [[0, 0, 0, 0],
         [0, 0, 0, 0],
         [0, 0, 0, 0],
         ...,
         [0, 0, 0, 0],
         [0, 0, 0, 0],
         [0, 0, 0, 0]],

        [[0, 0, 0, 0],
         [0, 0, 0, 0],
         [0, 0, 0, 0],
         ...,
         [0, 0, 0, 0],
         [0, 0, 0, 0],
         [0, 0, 0, 0]]],


       [[[0, 0, 0, 0],
         [0, 0, 0, 0],
         [0, 0

In [12]:
import time
# Short pause before drawing the current frame.
time.sleep(0.1)
env.render()
# Be careful: render does not work for me on Ubuntu :(
# I only get a black window.

In [13]:
# Close the environment (and its render window).
# NOTE(review): in file order this closes the same 4-env `env` that the next
# code cell passes to A2C for training; the execution counts (In [10] for the
# model cell vs In [12]/[13] here) suggest these cells were run out of order.
# Confirm training still works on Restart & Run All, or move this close to
# after training.
env.close()

Vectorizing
    Vectorizing the environment, particularly with multiple environments,
    allows you to train the agent faster by training in parallel

In [10]:
# TensorBoard logs are written under Training_Atari_Breakout/Logs.
log_path = os.path.join("Training_Atari_Breakout", "Logs")
# A2C agent with a CNN policy (image observations), trained on `env`.
model = A2C(
    policy='CnnPolicy',
    env=env,
    verbose=1,
    tensorboard_log=log_path,
)

Using cpu device
Wrapping the env in a VecTransposeImage.


In [22]:
# Train the agent for 100,000 environment timesteps.
model.learn(total_timesteps=100000)
# The more timesteps we train for, the better the result generally gets
# (at the cost of a longer training time).

Logging to Training_Atari_Breakout/Logs/A2C_2
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 553      |
|    ep_rew_mean        | 7.33     |
| time/                 |          |
|    fps                | 293      |
|    iterations         | 100      |
|    time_elapsed       | 6        |
|    total_timesteps    | 2000     |
| train/                |          |
|    entropy_loss       | -0.336   |
|    explained_variance | 0.985    |
|    learning_rate      | 0.0007   |
|    n_updates          | 5099     |
|    policy_loss        | -0.0388  |
|    value_loss         | 0.0317   |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 538      |
|    ep_rew_mean        | 7.06     |
| time/                 |          |
|    fps                | 296      |
|    iterations         | 200      |
|    time_elapsed       | 13       |
|    total_timesteps    | 400

<stable_baselines3.a2c.a2c.A2C at 0x7fa79a420c70>

# 4. Save and Reload Model

In [12]:
# Destination of the saved model: Training_Atari_Breakout/Saved Models/A2C_Breakout_Model
a2c_path = os.path.join('Training_Atari_Breakout','Saved Models', 'A2C_Breakout_Model')
# Persist the trained model to disk.
model.save(a2c_path)

In [17]:
 del model  # drop the in-memory model so the next cell really reloads it from disk (NOTE(review): stray leading space on this line; harmless in IPython, but remove it before exporting to a .py script)


In [18]:
# Reload the saved model from disk and attach it to `env`.
model = A2C.load(a2c_path,env)

Wrapping the env in a VecTransposeImage.


# 5. Evaluate and Test

Be careful: we can evaluate our policy with a vectorized environment; here it is rebuilt with a single environment (`n_envs=1`).


In [19]:
# Evaluation environment: a single Breakout environment (seed 0), wrapped with
# the same 4-frame stacking that was used for training.
env = VecFrameStack(
    make_atari_env('Breakout-v0', n_envs=1, seed=0),
    n_stack=4,
)

In [20]:
# Play 10 evaluation episodes with rendering enabled;
# the displayed result is the pair (mean reward, standard deviation of the reward).
evaluate_policy(model,env,n_eval_episodes=10,render=True)

(6.7, 2.968164415931166)

In [21]:
# Release the evaluation environment (and its render window).
env.close()
