## Instructions
Read the README for instructions to create the conda environment and
create an ipykernel to use that environment in a Jupyter notebook.

In [1]:
# Raise TensorFlow's logging threshold to ERROR so that warning-level
# messages do not clutter the notebook output.
# NOTE(review): `tf.logging` is the TensorFlow 1.x location of this API --
# confirm it matches the TensorFlow version pinned in the conda environment.
import tensorflow as tf
tf.logging.set_verbosity(tf.logging.ERROR)

  return f(*args, **kwds)


In [2]:
#imports
from rl.agents import DQNAgent
from rl.memory import SequentialMemory
from rl.policy import EpsGreedyQPolicy

Using TensorFlow backend.


In [3]:
# Import gym to get the Atari Environment
import gym

In [4]:
# Create the Breakout-v0 Atari environment from gym. This single `env`
# instance is passed to keras-rl for both training (`dqn.fit`) and
# evaluation (`dqn.test`) in the cells below.
env = gym.make('Breakout-v0')

In [5]:
# Show the shape of the environment's observation space.
print(env.observation_space.shape)

(210, 160, 3)


In [6]:
# Name the first three observation dimensions in one unpacking step.
# NOTE(review): these names are not referenced by any later cell visible here.
height, width, channels = env.observation_space.shape[:3]

In [7]:
# Show the number of actions in the environment's action space; the same
# value sizes the model's output layer and the agent's `nb_actions`.
print(env.action_space.n)

4


In [8]:
# Import from Keras to build our fully connected Q-network
from keras.models import Sequential
from keras.layers import Dense, Flatten
from keras.optimizers import Adam

In [9]:

# Fully connected Q-network: flatten a stack of 3 observations, pass it
# through three 16-unit ReLU layers, and emit one linear output per action.
# The leading 3 in `input_shape` must match the `window_length` given to
# SequentialMemory when the agent is assembled.
model = Sequential([
    Flatten(input_shape=(3,) + env.observation_space.shape),
    Dense(16, activation='relu'),
    Dense(16, activation='relu'),
    Dense(16, activation='relu'),
    Dense(env.action_space.n, activation='linear'),
])

In [10]:
# Replay memory capped at 1000 entries; window_length=3 matches the leading
# dimension of the model's input shape.
memory = SequentialMemory(window_length=3, limit=1000)

# Epsilon-greedy action selection with eps=0.1.
policy = EpsGreedyQPolicy(eps=0.1)

# Assemble the DQN agent from the model, memory and policy defined above.
dqn = DQNAgent(
    model=model,
    nb_actions=env.action_space.n,
    memory=memory,
    policy=policy,
    nb_steps_warmup=100,
    target_model_update=1e-2,
)

# Compile with Adam (learning rate 1e-3) and track mean absolute error.
dqn.compile(Adam(lr=1e-3), metrics=['mae'])

In [11]:
# Train the agent for 750 environment steps without rendering; verbose=2
# logs one summary line per finished episode. The returned Keras History is
# kept in `train_history` so the recorded metrics can be inspected later,
# instead of leaving only the object's repr as the cell output.
train_history = dqn.fit(env, nb_steps=750, visualize=False, verbose=2)

Training for 750 steps ...
 270/750: episode: 1, duration: 26.000s, episode steps: 270, steps per second: 10, episode reward: 2.000, mean reward: 0.007 [0.000, 1.000], mean action: 1.000 [0.000, 3.000], mean observation: 40.629 [0.000, 200.000], loss: 345.321385, mean_absolute_error: 4.003123, mean_q: 3.825445
 436/750: episode: 2, duration: 25.474s, episode steps: 166, steps per second: 7, episode reward: 0.000, mean reward: 0.000 [0.000, 0.000], mean action: 1.036 [0.000, 3.000], mean observation: 40.762 [0.000, 200.000], loss: 55.647289, mean_absolute_error: 1.805040, mean_q: 0.178495
 598/750: episode: 3, duration: 22.178s, episode steps: 162, steps per second: 7, episode reward: 0.000, mean reward: 0.000 [0.000, 0.000], mean action: 1.049 [0.000, 3.000], mean observation: 40.762 [0.000, 200.000], loss: 0.002755, mean_absolute_error: 0.009431, mean_q: 0.034117
done, took 88.470 seconds


<keras.callbacks.History at 0x7f051862c550>

In [12]:
# Save the trained agent's weights to 'policy.h5' (a path relative to the
# notebook's working directory); overwrite=True replaces an existing file.
dqn.save_weights('policy.h5', overwrite=True)

In [13]:
# Evaluate the trained agent for one rendered episode, capped at 500 steps.
# The History returned by `test` is kept in `test_history` rather than being
# shown only as a repr. The environment is closed in `finally` so that the
# render window opened by visualize=True is released even if the episode is
# interrupted.
# NOTE(review): re-run the `gym.make` cell before reusing `env` if the
# installed gym version does not allow a closed environment to be reset.
try:
    test_history = dqn.test(env, nb_episodes=1, visualize=True, nb_max_episode_steps=500, verbose=2)
finally:
    env.close()

Testing for 1 episodes ...
Episode 1: reward: 0.000, steps: 169


<keras.callbacks.History at 0x7f05042c6cc0>