# BreakOut Reinforcement Learning

## 01. Import dependencies

In [2]:
import os
import gym
import ale_py
from stable_baselines3 import A2C
from stable_baselines3.common.vec_env import VecFrameStack
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_atari_env

## 02. Load and Understand Environment

In [3]:
env = gym.make('MsPacman-v0')

In [4]:
env.action_space

Discrete(9)

In [5]:
env.observation_space

Box([[[0 0 0]
  [0 0 0]
  [0 0 0]
  ...
  [0 0 0]
  [0 0 0]
  [0 0 0]]

 [[0 0 0]
  [0 0 0]
  [0 0 0]
  ...
  [0 0 0]
  [0 0 0]
  [0 0 0]]

 [[0 0 0]
  [0 0 0]
  [0 0 0]
  ...
  [0 0 0]
  [0 0 0]
  [0 0 0]]

 ...

 [[0 0 0]
  [0 0 0]
  [0 0 0]
  ...
  [0 0 0]
  [0 0 0]
  [0 0 0]]

 [[0 0 0]
  [0 0 0]
  [0 0 0]
  ...
  [0 0 0]
  [0 0 0]
  [0 0 0]]

 [[0 0 0]
  [0 0 0]
  [0 0 0]
  ...
  [0 0 0]
  [0 0 0]
  [0 0 0]]], [[[255 255 255]
  [255 255 255]
  [255 255 255]
  ...
  [255 255 255]
  [255 255 255]
  [255 255 255]]

 [[255 255 255]
  [255 255 255]
  [255 255 255]
  ...
  [255 255 255]
  [255 255 255]
  [255 255 255]]

 [[255 255 255]
  [255 255 255]
  [255 255 255]
  ...
  [255 255 255]
  [255 255 255]
  [255 255 255]]

 ...

 [[255 255 255]
  [255 255 255]
  [255 255 255]
  ...
  [255 255 255]
  [255 255 255]
  [255 255 255]]

 [[255 255 255]
  [255 255 255]
  [255 255 255]
  ...
  [255 255 255]
  [255 255 255]
  [255 255 255]]

 [[255 255 255]
  [255 255 255]
  [255 255 255]
  ...
 

In [7]:
eps = 10
for episode in range(1, eps+1):
    state = env.reset() # Obtain initial Observations
    done = False
    score = 0
    
    while not done:
        env.render()   # Visualize environment
        action = env.action_space.sample() # random action
        n_state, reward, done, info = env.step(action)  # apply an action
        score += reward
    print('Episode: {} Score: {}'.format(episode, score))
env.close() # close render frame

Episode: 1 Score: 210.0
Episode: 2 Score: 150.0
Episode: 3 Score: 300.0
Episode: 4 Score: 240.0
Episode: 5 Score: 170.0
Episode: 6 Score: 200.0
Episode: 7 Score: 190.0
Episode: 8 Score: 240.0
Episode: 9 Score: 130.0
Episode: 10 Score: 270.0


In [9]:
env.close()

## 03. Vectorizing Environment

In [10]:
env = make_atari_env('MsPacman-v0', n_envs=4, seed=1)
env = VecFrameStack(env, n_stack=4)

## 04. Train Model

In [11]:
log_path = os.path.join('Training', 'Logs')
model = A2C('CnnPolicy', env, verbose=1, tensorboard_log=log_path)

Using cpu device
Wrapping the env in a VecTransposeImage.


In [None]:
model.learn(total_timesteps=2000000)

Logging to Training\Logs\A2C_2
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 763      |
|    ep_rew_mean        | 334      |
| time/                 |          |
|    fps                | 25       |
|    iterations         | 100      |
|    time_elapsed       | 79       |
|    total_timesteps    | 2000     |
| train/                |          |
|    entropy_loss       | -1.14    |
|    explained_variance | 0.0634   |
|    learning_rate      | 0.0007   |
|    n_updates          | 99       |
|    policy_loss        | 0.308    |
|    value_loss         | 0.42     |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 780      |
|    ep_rew_mean        | 359      |
| time/                 |          |
|    fps                | 25       |
|    iterations         | 200      |
|    time_elapsed       | 157      |
|    total_timesteps    | 4000     |
| train

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 762      |
|    ep_rew_mean        | 533      |
| time/                 |          |
|    fps                | 25       |
|    iterations         | 1400     |
|    time_elapsed       | 1093     |
|    total_timesteps    | 28000    |
| train/                |          |
|    entropy_loss       | -0.701   |
|    explained_variance | 0.989    |
|    learning_rate      | 0.0007   |
|    n_updates          | 1399     |
|    policy_loss        | -0.0837  |
|    value_loss         | 0.598    |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 769      |
|    ep_rew_mean        | 570      |
| time/                 |          |
|    fps                | 25       |
|    iterations         | 1500     |
|    time_elapsed       | 1170     |
|    total_timesteps    | 30000    |
| train/                |          |
|

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 833      |
|    ep_rew_mean        | 630      |
| time/                 |          |
|    fps                | 25       |
|    iterations         | 2800     |
|    time_elapsed       | 2217     |
|    total_timesteps    | 56000    |
| train/                |          |
|    entropy_loss       | -0.746   |
|    explained_variance | 0.942    |
|    learning_rate      | 0.0007   |
|    n_updates          | 2799     |
|    policy_loss        | 0.00877  |
|    value_loss         | 1.36     |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 837      |
|    ep_rew_mean        | 646      |
| time/                 |          |
|    fps                | 25       |
|    iterations         | 2900     |
|    time_elapsed       | 2296     |
|    total_timesteps    | 58000    |
| train/                |          |
|

## 05. Save and Reload Model

In [13]:
# Save model
A2C_Path = os.path.join('Training', 'Saved Models', 'A2C_Model_Breakout')
model.save(A2C_Path)

In [47]:
# Load Model
del model
model = A2C.load(A2C_Path, env=env)

Wrapping the env in a VecTransposeImage.


## 06. Evaluate Model

In [49]:
env = make_atari_env('Breakout-v0', n_envs=1, seed=0)
env = VecFrameStack(env, n_stack=4)
mean_score, std = evaluate_policy(model, env, n_eval_episodes=10, render=True)
print('Rata-rata Score:', mean_score)

Rata-rata Score: 8.5


In [50]:
env.close()