## Instructions
Read the README for instructions on creating the conda environment and
registering an ipykernel so that the environment can be used in a Jupyter notebook.

##### Start off with all of our imports

In [1]:
# Raise TensorFlow's log threshold to ERROR so that lower-severity messages
# (e.g. warnings) do not clutter the notebook output.
import tensorflow as tf
tf.logging.set_verbosity(tf.logging.ERROR)

  return f(*args, **kwds)


In [2]:
# Imports for the whole notebook
import gym
import numpy as np

from rl.agents import DQNAgent
from rl.memory import SequentialMemory
from rl.policy import EpsGreedyQPolicy, GreedyQPolicy

from keras.models import Sequential
from keras.layers import Dense, Flatten
from keras.optimizers import Adam

Using TensorFlow backend.


##### In this section, we're going to create both train and play environments and set some "global" (to this notebook) variables

In [3]:
# Gym environment id shared by the training and the play environment.
ATARI_ENV = 'Breakout-v0'
# Single source of truth for every seed used in this cell (was a repeated
# literal 42).
SEED = 42

# Environment the agent is trained on.
train_env = gym.make(ATARI_ENV)
np.random.seed(SEED)
train_env.seed(SEED)

# Separate environment instance used later for playing with the trained
# weights. NumPy's global RNG is reset to the same seed again before the
# environment itself is seeded.
play_env = gym.make(ATARI_ENV)
np.random.seed(SEED)
# Last expression of the cell: the list of seeds returned by `seed` is what
# the notebook displays as this cell's output.
play_env.seed(SEED)

[42, 742738649]

In [4]:
# Read `action_space.n` from each environment. These values are used below
# as the width of the networks' final Dense layer and as `nb_actions` for
# the DQN agents.
nb_train_actions = train_env.action_space.n
nb_play_actions = play_env.action_space.n

In [5]:
# I've been playing with these values a lot during testing, so I wanted to
# separate them for easy access.
#
# nb_steps_fit    -- total number of steps passed to `fit`.
# nb_steps_warmup -- passed to the agents as `nb_steps_warmup`.
# update          -- passed to the agents as `target_model_update`.
#
# Floor division keeps the derived step counts integral; true division (`/`)
# would produce 250.0 and 83.33... for values that are counts of steps.
nb_steps_fit = 750
nb_steps_warmup = nb_steps_fit // 3
update = nb_steps_warmup // 3

In [6]:
# Fully connected Q-network used for training. The input is a stack of 3
# observations (the leading 3 mirrors `window_length=3` of the replay memory
# created in the next cell -- presumably the two must stay in sync), which is
# flattened and passed through three ReLU layers. The linear output layer has
# one unit per action.
train_model = Sequential([
    Flatten(input_shape=(3,) + train_env.observation_space.shape),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(16, activation='relu'),
    Dense(nb_train_actions, activation='linear'),
])

In [7]:
# Epsilon-greedy action selection (eps=0.1) and a replay memory holding at
# most 1000 entries with a window of 3 observations.
policy = EpsGreedyQPolicy(eps=0.1)
memory = SequentialMemory(limit=1000, window_length=3)

# Assemble the training agent around the network defined above, using the
# warm-up and target-update settings from the hyperparameter cell.
train_dqn = DQNAgent(
    model=train_model,
    policy=policy,
    memory=memory,
    nb_actions=nb_train_actions,
    nb_steps_warmup=nb_steps_warmup,
    target_model_update=update,
)
train_dqn.compile(optimizer=Adam(lr=1e-4), metrics=['mae'])

In [8]:
# Train the agent on `train_env` for `nb_steps_fit` steps without rendering.
# With verbose=2 one summary line is logged per finished episode (see the
# output below). The returned History object is displayed as the cell output.
train_dqn.fit(train_env, nb_steps=nb_steps_fit, visualize=False, verbose=2)

Training for 750 steps ...
 444/750: episode: 1, duration: 57.234s, episode steps: 444, steps per second: 8, episode reward: 2.000, mean reward: 0.005 [0.000, 1.000], mean action: 0.385 [0.000, 3.000], mean observation: 40.593 [0.000, 200.000], loss: 3538.403526, mean_absolute_error: 82.915953, mean_q: 51.997073
 713/750: episode: 2, duration: 73.242s, episode steps: 269, steps per second: 4, episode reward: 2.000, mean reward: 0.007 [0.000, 1.000], mean action: 0.922 [0.000, 3.000], mean observation: 40.459 [0.000, 200.000], loss: 51.082680, mean_absolute_error: 2.968939, mean_q: 4.790206
done, took 141.053 seconds


<keras.callbacks.History at 0x7f0faa60eda0>

In [9]:
# Persist the trained weights to policy.h5 in the working directory,
# replacing any existing file (overwrite=True).
train_dqn.save_weights('policy.h5', overwrite=True)

## Play Environment

In [10]:
# Network for the play agent. It uses the same layer layout as the training
# network (stack of 3 observations flattened, three ReLU layers, linear
# output with one unit per action), so the weights saved after training can
# be loaded into it.
play_model = Sequential([
    Flatten(input_shape=(3,) + play_env.observation_space.shape),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(16, activation='relu'),
    Dense(nb_play_actions, activation='linear'),
])

In [11]:
# Greedy action selection for play, plus a fresh replay memory with the same
# limit and window length as the one used for training. Note that `policy`
# and `memory` are rebound here and no longer refer to the training objects.
policy = GreedyQPolicy()
memory = SequentialMemory(limit=1000, window_length=3)

# Assemble the play agent around the play network with the same warm-up and
# target-update settings as the training agent.
play_dqn = DQNAgent(
    model=play_model,
    policy=policy,
    memory=memory,
    nb_actions=nb_play_actions,
    nb_steps_warmup=nb_steps_warmup,
    target_model_update=update,
)
play_dqn.compile(optimizer=Adam(lr=1e-4), metrics=['mae'])

In [12]:
# Load the weights written by `train_dqn.save_weights` ('./policy.h5' is the
# same file, addressed relative to the working directory).
play_dqn.load_weights('./policy.h5')

In [13]:
# Run 5 evaluation episodes on `play_env` with rendering enabled, capping
# each episode at 500 steps. The returned History object is displayed as the
# cell output.
play_dqn.test(play_env, nb_episodes=5, visualize=True,
              nb_max_episode_steps=500)

Testing for 5 episodes ...
Episode 1: reward: 0.000, steps: 162
Episode 2: reward: 0.000, steps: 165
Episode 3: reward: 0.000, steps: 167
Episode 4: reward: 0.000, steps: 161
Episode 5: reward: 0.000, steps: 160


<keras.callbacks.History at 0x7f0f98363048>