# BreakOut Reinforcement Learning

## 01. Import dependencies

In [1]:
import os
import gym
import ale_py
from stable_baselines3 import A2C
from stable_baselines3.common.vec_env import VecFrameStack
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_atari_env

  from .autonotebook import tqdm as notebook_tqdm


## 02. Load and Understand Environment

In [8]:
env = gym.make('MsPacman-v0')

In [4]:
env.action_space

Discrete(9)

In [5]:
env.observation_space

Box([[[0 0 0]
  [0 0 0]
  [0 0 0]
  ...
  [0 0 0]
  [0 0 0]
  [0 0 0]]

 [[0 0 0]
  [0 0 0]
  [0 0 0]
  ...
  [0 0 0]
  [0 0 0]
  [0 0 0]]

 [[0 0 0]
  [0 0 0]
  [0 0 0]
  ...
  [0 0 0]
  [0 0 0]
  [0 0 0]]

 ...

 [[0 0 0]
  [0 0 0]
  [0 0 0]
  ...
  [0 0 0]
  [0 0 0]
  [0 0 0]]

 [[0 0 0]
  [0 0 0]
  [0 0 0]
  ...
  [0 0 0]
  [0 0 0]
  [0 0 0]]

 [[0 0 0]
  [0 0 0]
  [0 0 0]
  ...
  [0 0 0]
  [0 0 0]
  [0 0 0]]], [[[255 255 255]
  [255 255 255]
  [255 255 255]
  ...
  [255 255 255]
  [255 255 255]
  [255 255 255]]

 [[255 255 255]
  [255 255 255]
  [255 255 255]
  ...
  [255 255 255]
  [255 255 255]
  [255 255 255]]

 [[255 255 255]
  [255 255 255]
  [255 255 255]
  ...
  [255 255 255]
  [255 255 255]
  [255 255 255]]

 ...

 [[255 255 255]
  [255 255 255]
  [255 255 255]
  ...
  [255 255 255]
  [255 255 255]
  [255 255 255]]

 [[255 255 255]
  [255 255 255]
  [255 255 255]
  ...
  [255 255 255]
  [255 255 255]
  [255 255 255]]

 [[255 255 255]
  [255 255 255]
  [255 255 255]
  ...
 

In [9]:
eps = 10
for episode in range(1, eps+1):
    state = env.reset() # Obtain initial Observations
    done = False
    score = 0
    
    while not done:
        env.render()   # Visualize environment
        action = env.action_space.sample() # random action
        n_state, reward, done, info = env.step(action)  # apply an action
        score += reward
    print('Episode: {} Score: {}'.format(episode, score))
env.close() # close render frame

Episode: 1 Score: 200.0
Episode: 2 Score: 240.0
Episode: 3 Score: 200.0
Episode: 4 Score: 240.0
Episode: 5 Score: 230.0
Episode: 6 Score: 230.0
Episode: 7 Score: 150.0
Episode: 8 Score: 810.0
Episode: 9 Score: 270.0
Episode: 10 Score: 270.0


In [9]:
env.close()

## 03. Vectorizing Environment

In [11]:
env = make_atari_env('MsPacman-v0', n_envs=4, seed=1)
env = VecFrameStack(env, n_stack=4)

In [8]:
env.render()

## 04. Train Model

In [12]:
log_path = os.path.join('Training', 'Logs')
model = A2C('CnnPolicy', env, verbose=1, tensorboard_log=log_path)

Using cpu device
Wrapping the env in a VecTransposeImage.


In [14]:
model.learn(total_timesteps=500000)

Logging to Training\Logs\A2C_4
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 747      |
|    ep_rew_mean        | 702      |
| time/                 |          |
|    fps                | 19       |
|    iterations         | 100      |
|    time_elapsed       | 100      |
|    total_timesteps    | 2000     |
| train/                |          |
|    entropy_loss       | -0.763   |
|    explained_variance | 0.567    |
|    learning_rate      | 0.0007   |
|    n_updates          | 25105    |
|    policy_loss        | 0.703    |
|    value_loss         | 3.7      |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 766      |
|    ep_rew_mean        | 696      |
| time/                 |          |
|    fps                | 20       |
|    iterations         | 200      |
|    time_elapsed       | 199      |
|    total_timesteps    | 4000     |
| train

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 839      |
|    ep_rew_mean        | 857      |
| time/                 |          |
|    fps                | 19       |
|    iterations         | 1400     |
|    time_elapsed       | 1412     |
|    total_timesteps    | 28000    |
| train/                |          |
|    entropy_loss       | -0.755   |
|    explained_variance | 0.848    |
|    learning_rate      | 0.0007   |
|    n_updates          | 26405    |
|    policy_loss        | 0.0296   |
|    value_loss         | 4.14     |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 850      |
|    ep_rew_mean        | 842      |
| time/                 |          |
|    fps                | 19       |
|    iterations         | 1500     |
|    time_elapsed       | 1510     |
|    total_timesteps    | 30000    |
| train/                |          |
|

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 871      |
|    ep_rew_mean        | 889      |
| time/                 |          |
|    fps                | 20       |
|    iterations         | 2800     |
|    time_elapsed       | 2792     |
|    total_timesteps    | 56000    |
| train/                |          |
|    entropy_loss       | -0.73    |
|    explained_variance | -0.206   |
|    learning_rate      | 0.0007   |
|    n_updates          | 27805    |
|    policy_loss        | 1.61     |
|    value_loss         | 4.57     |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 868      |
|    ep_rew_mean        | 895      |
| time/                 |          |
|    fps                | 20       |
|    iterations         | 2900     |
|    time_elapsed       | 2890     |
|    total_timesteps    | 58000    |
| train/                |          |
|

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 797      |
|    ep_rew_mean        | 730      |
| time/                 |          |
|    fps                | 20       |
|    iterations         | 4200     |
|    time_elapsed       | 4183     |
|    total_timesteps    | 84000    |
| train/                |          |
|    entropy_loss       | -0.427   |
|    explained_variance | 0.801    |
|    learning_rate      | 0.0007   |
|    n_updates          | 29205    |
|    policy_loss        | 0.163    |
|    value_loss         | 6.37     |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 807      |
|    ep_rew_mean        | 746      |
| time/                 |          |
|    fps                | 20       |
|    iterations         | 4300     |
|    time_elapsed       | 4281     |
|    total_timesteps    | 86000    |
| train/                |          |
|

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 841      |
|    ep_rew_mean        | 792      |
| time/                 |          |
|    fps                | 20       |
|    iterations         | 5600     |
|    time_elapsed       | 5561     |
|    total_timesteps    | 112000   |
| train/                |          |
|    entropy_loss       | -0.582   |
|    explained_variance | 0.874    |
|    learning_rate      | 0.0007   |
|    n_updates          | 30605    |
|    policy_loss        | 0.782    |
|    value_loss         | 4.42     |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 874      |
|    ep_rew_mean        | 838      |
| time/                 |          |
|    fps                | 20       |
|    iterations         | 5700     |
|    time_elapsed       | 5659     |
|    total_timesteps    | 114000   |
| train/                |          |
|

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 913      |
|    ep_rew_mean        | 952      |
| time/                 |          |
|    fps                | 20       |
|    iterations         | 7000     |
|    time_elapsed       | 6935     |
|    total_timesteps    | 140000   |
| train/                |          |
|    entropy_loss       | -0.445   |
|    explained_variance | 0.777    |
|    learning_rate      | 0.0007   |
|    n_updates          | 32005    |
|    policy_loss        | -0.506   |
|    value_loss         | 11.9     |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 923      |
|    ep_rew_mean        | 952      |
| time/                 |          |
|    fps                | 20       |
|    iterations         | 7100     |
|    time_elapsed       | 7034     |
|    total_timesteps    | 142000   |
| train/                |          |
|

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 975      |
|    ep_rew_mean        | 1.09e+03 |
| time/                 |          |
|    fps                | 20       |
|    iterations         | 8400     |
|    time_elapsed       | 8299     |
|    total_timesteps    | 168000   |
| train/                |          |
|    entropy_loss       | -0.589   |
|    explained_variance | 0.947    |
|    learning_rate      | 0.0007   |
|    n_updates          | 33405    |
|    policy_loss        | 1.3      |
|    value_loss         | 1.81     |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 974      |
|    ep_rew_mean        | 1.09e+03 |
| time/                 |          |
|    fps                | 20       |
|    iterations         | 8500     |
|    time_elapsed       | 8395     |
|    total_timesteps    | 170000   |
| train/                |          |
|

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 962      |
|    ep_rew_mean        | 1.03e+03 |
| time/                 |          |
|    fps                | 20       |
|    iterations         | 9800     |
|    time_elapsed       | 9646     |
|    total_timesteps    | 196000   |
| train/                |          |
|    entropy_loss       | -0.248   |
|    explained_variance | 0.984    |
|    learning_rate      | 0.0007   |
|    n_updates          | 34805    |
|    policy_loss        | 0.044    |
|    value_loss         | 0.89     |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 962      |
|    ep_rew_mean        | 1.03e+03 |
| time/                 |          |
|    fps                | 20       |
|    iterations         | 9900     |
|    time_elapsed       | 9744     |
|    total_timesteps    | 198000   |
| train/                |          |
|

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 938      |
|    ep_rew_mean        | 1.06e+03 |
| time/                 |          |
|    fps                | 20       |
|    iterations         | 11200    |
|    time_elapsed       | 11019    |
|    total_timesteps    | 224000   |
| train/                |          |
|    entropy_loss       | -0.653   |
|    explained_variance | 0.967    |
|    learning_rate      | 0.0007   |
|    n_updates          | 36205    |
|    policy_loss        | 0.0774   |
|    value_loss         | 1.89     |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 936      |
|    ep_rew_mean        | 1.04e+03 |
| time/                 |          |
|    fps                | 20       |
|    iterations         | 11300    |
|    time_elapsed       | 11115    |
|    total_timesteps    | 226000   |
| train/                |          |
|

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 940      |
|    ep_rew_mean        | 1.04e+03 |
| time/                 |          |
|    fps                | 20       |
|    iterations         | 12600    |
|    time_elapsed       | 12362    |
|    total_timesteps    | 252000   |
| train/                |          |
|    entropy_loss       | -0.234   |
|    explained_variance | 0.519    |
|    learning_rate      | 0.0007   |
|    n_updates          | 37605    |
|    policy_loss        | 0.0896   |
|    value_loss         | 55.1     |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 934      |
|    ep_rew_mean        | 1.02e+03 |
| time/                 |          |
|    fps                | 20       |
|    iterations         | 12700    |
|    time_elapsed       | 12457    |
|    total_timesteps    | 254000   |
| train/                |          |
|

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 898      |
|    ep_rew_mean        | 930      |
| time/                 |          |
|    fps                | 20       |
|    iterations         | 14000    |
|    time_elapsed       | 13712    |
|    total_timesteps    | 280000   |
| train/                |          |
|    entropy_loss       | -0.431   |
|    explained_variance | 0.982    |
|    learning_rate      | 0.0007   |
|    n_updates          | 39005    |
|    policy_loss        | 0.0794   |
|    value_loss         | 1.69     |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 913      |
|    ep_rew_mean        | 946      |
| time/                 |          |
|    fps                | 20       |
|    iterations         | 14100    |
|    time_elapsed       | 13808    |
|    total_timesteps    | 282000   |
| train/                |          |
|

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 879      |
|    ep_rew_mean        | 935      |
| time/                 |          |
|    fps                | 20       |
|    iterations         | 15400    |
|    time_elapsed       | 15091    |
|    total_timesteps    | 308000   |
| train/                |          |
|    entropy_loss       | -0.241   |
|    explained_variance | 0.988    |
|    learning_rate      | 0.0007   |
|    n_updates          | 40405    |
|    policy_loss        | -0.177   |
|    value_loss         | 1.15     |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 883      |
|    ep_rew_mean        | 941      |
| time/                 |          |
|    fps                | 20       |
|    iterations         | 15500    |
|    time_elapsed       | 15191    |
|    total_timesteps    | 310000   |
| train/                |          |
|

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 866      |
|    ep_rew_mean        | 879      |
| time/                 |          |
|    fps                | 20       |
|    iterations         | 16800    |
|    time_elapsed       | 16480    |
|    total_timesteps    | 336000   |
| train/                |          |
|    entropy_loss       | -0.681   |
|    explained_variance | 0.897    |
|    learning_rate      | 0.0007   |
|    n_updates          | 41805    |
|    policy_loss        | -1.61    |
|    value_loss         | 7.4      |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 879      |
|    ep_rew_mean        | 894      |
| time/                 |          |
|    fps                | 20       |
|    iterations         | 16900    |
|    time_elapsed       | 16577    |
|    total_timesteps    | 338000   |
| train/                |          |
|

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 909      |
|    ep_rew_mean        | 994      |
| time/                 |          |
|    fps                | 20       |
|    iterations         | 18200    |
|    time_elapsed       | 17866    |
|    total_timesteps    | 364000   |
| train/                |          |
|    entropy_loss       | -0.163   |
|    explained_variance | 0.994    |
|    learning_rate      | 0.0007   |
|    n_updates          | 43205    |
|    policy_loss        | -0.156   |
|    value_loss         | 0.64     |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 893      |
|    ep_rew_mean        | 978      |
| time/                 |          |
|    fps                | 20       |
|    iterations         | 18300    |
|    time_elapsed       | 17965    |
|    total_timesteps    | 366000   |
| train/                |          |
|

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 884      |
|    ep_rew_mean        | 926      |
| time/                 |          |
|    fps                | 20       |
|    iterations         | 19600    |
|    time_elapsed       | 19249    |
|    total_timesteps    | 392000   |
| train/                |          |
|    entropy_loss       | -0.457   |
|    explained_variance | 0.941    |
|    learning_rate      | 0.0007   |
|    n_updates          | 44605    |
|    policy_loss        | 0.0234   |
|    value_loss         | 4.73     |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 895      |
|    ep_rew_mean        | 928      |
| time/                 |          |
|    fps                | 20       |
|    iterations         | 19700    |
|    time_elapsed       | 19383    |
|    total_timesteps    | 394000   |
| train/                |          |
|

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 948      |
|    ep_rew_mean        | 1.04e+03 |
| time/                 |          |
|    fps                | 20       |
|    iterations         | 21000    |
|    time_elapsed       | 20677    |
|    total_timesteps    | 420000   |
| train/                |          |
|    entropy_loss       | -0.403   |
|    explained_variance | 0.799    |
|    learning_rate      | 0.0007   |
|    n_updates          | 46005    |
|    policy_loss        | -0.335   |
|    value_loss         | 8.91     |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 971      |
|    ep_rew_mean        | 1.09e+03 |
| time/                 |          |
|    fps                | 20       |
|    iterations         | 21100    |
|    time_elapsed       | 20776    |
|    total_timesteps    | 422000   |
| train/                |          |
|

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 902      |
|    ep_rew_mean        | 1e+03    |
| time/                 |          |
|    fps                | 20       |
|    iterations         | 22400    |
|    time_elapsed       | 22197    |
|    total_timesteps    | 448000   |
| train/                |          |
|    entropy_loss       | -0.673   |
|    explained_variance | 0.994    |
|    learning_rate      | 0.0007   |
|    n_updates          | 47405    |
|    policy_loss        | -0.232   |
|    value_loss         | 0.392    |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 911      |
|    ep_rew_mean        | 1.02e+03 |
| time/                 |          |
|    fps                | 20       |
|    iterations         | 22500    |
|    time_elapsed       | 22310    |
|    total_timesteps    | 450000   |
| train/                |          |
|

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 931      |
|    ep_rew_mean        | 1.06e+03 |
| time/                 |          |
|    fps                | 20       |
|    iterations         | 23800    |
|    time_elapsed       | 23640    |
|    total_timesteps    | 476000   |
| train/                |          |
|    entropy_loss       | -0.511   |
|    explained_variance | 0.917    |
|    learning_rate      | 0.0007   |
|    n_updates          | 48805    |
|    policy_loss        | -0.585   |
|    value_loss         | 4.53     |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 933      |
|    ep_rew_mean        | 1.06e+03 |
| time/                 |          |
|    fps                | 20       |
|    iterations         | 23900    |
|    time_elapsed       | 23740    |
|    total_timesteps    | 478000   |
| train/                |          |
|

<stable_baselines3.a2c.a2c.A2C at 0x1ed64022c50>

## 05. Save and Reload Model

In [13]:
# Save model
A2C_Path = os.path.join('Training', 'Saved Models', 'A2C_Model_MsPacman')
#model.save(A2C_Path)

In [14]:
# Load Model
#del model
model = A2C.load(A2C_Path, env=env)

Wrapping the env in a VecTransposeImage.


## 06. Evaluate Model

In [17]:
env = make_atari_env('MsPacman-v0', n_envs=1, seed=0)
env = VecFrameStack(env, n_stack=4)
mean_score, std = evaluate_policy(model, env, n_eval_episodes=10, render=True)
print('Rata-rata Score:', mean_score)

Rata-rata Score: 882.0


In [7]:
import numpy as np
import time

env = make_atari_env('MsPacman-v0', n_envs=1, seed=0)
env = VecFrameStack(env, n_stack=4)
eps = 10
for episode in range(1, eps+1):
    obs = env.reset() 
    done = False
    score = 0
    
    while not done:
        env.render()   # Visualize environment
        time.sleep(0.1)
        action, _ = model.predict(obs) # Take action using model
        obs, reward, done, info = env.step(action)  # apply an action
        score += reward
    print('Episode: {} Score: {}'.format(episode, np.max(score)))
env.close()

  "We strongly suggest supplying `render_mode` when "


Episode: 1 Score: 20.0
Episode: 2 Score: 20.0
Episode: 3 Score: 16.0
Episode: 4 Score: 11.0
Episode: 5 Score: 55.0
Episode: 6 Score: 15.0
Episode: 7 Score: 14.0
Episode: 8 Score: 56.0
Episode: 9 Score: 14.0
Episode: 10 Score: 14.0


In [18]:
env.close()