In [1]:
import argparse
import sys
from PIL import Image
import numpy as np
import gym
from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten, Convolution2D, Permute
from keras.optimizers import Adam
import keras.backend as K

from rl.agents.dqn import DQNAgent
from rl.policy import LinearAnnealedPolicy, BoltzmannQPolicy, EpsGreedyQPolicy
from rl.memory import SequentialMemory
from rl.core import Processor
from rl.callbacks import FileLogger, ModelIntervalCheckpoint

Using TensorFlow backend.


In [2]:
# Spatial size of one preprocessed frame; forms the trailing dimensions of the
# model's input shape.
INPUT_SHAPE = (84, 84)
# Leading dimension of the model's input shape; also passed as `window_length`
# to the replay memory.
WINDOW_LENGTH = 4

In [3]:
class AtariProcessor(Processor):
    """Processor hooks that adapt raw frames, state batches and rewards for the agent."""

    def process_observation(self, observation):
        """Turn one raw frame into an 84x84 single-channel `uint8` array.

        The frame must be 3-dimensional (height, width, channel). The first 20
        and last 10 rows are cropped away before the image is resized and
        converted to grayscale.
        """
        assert observation.ndim == 3  # (height, width, channel)
        cropped = observation[20:-10, :, :]
        # Resize first, then convert to grayscale ('L' mode).
        frame = Image.fromarray(cropped).resize((84, 84)).convert('L')
        gray = np.array(frame)
        assert gray.shape == (84, 84)
        # uint8 keeps the experience memory small.
        return gray.astype('uint8')

    def process_state_batch(self, batch):
        """Return `batch` as `float32`, divided by 255.

        The scaling is done here rather than in `process_observation` so that
        stored observations can stay `uint8`; a `float32` array would be 4x more
        memory intensive, which matters when 1M observations are stored.
        """
        return batch.astype('float32') / 255.

    def process_reward(self, reward):
        """Clip `reward` to the interval [-1, 1]."""
        return np.clip(reward, a_min=-1., a_max=1.)

In [4]:
# Run configuration, expressed as command-line style options.
parser = argparse.ArgumentParser()
parser.add_argument('--mode', choices=['train', 'test'], default='train')
parser.add_argument('--env-name', type=str, default='SpaceInvaders-v0')
parser.add_argument('--weights', type=str, default=None)
# Parse an explicit empty argument list: inside a notebook the process command
# line belongs to the kernel, so argparse must not read it. This yields the
# defaults declared above without overwriting the process-wide sys.argv.
args = parser.parse_args([])

# Get the environment and extract the number of actions.
env = gym.make(args.env_name)
# Seed both NumPy and the environment with the same fixed value.
np.random.seed(123)
env.seed(123)
nb_actions = env.action_space.n

In [5]:
# Next, we build our model. We use the same model that was described by Mnih et al. (2015).
# A state is a stack of WINDOW_LENGTH frames, each of size INPUT_SHAPE.
input_shape = (WINDOW_LENGTH,) + INPUT_SHAPE
model = Sequential()
# The first layer rearranges the axes to match the backend's image layout.
if K.image_dim_ordering() == 'tf':
    # (width, height, channels): move the frame-window axis to the end.
    model.add(Permute((2, 3, 1), input_shape=input_shape))
elif K.image_dim_ordering() == 'th':
    # (channels, width, height): the axes are already in this order.
    model.add(Permute((1, 2, 3), input_shape=input_shape))
else:
    raise RuntimeError('Unknown image_dim_ordering.')
# Three convolutional layers, each followed by a ReLU.
model.add(Convolution2D(32, (8, 8), strides=(4, 4)))
model.add(Activation('relu'))
model.add(Convolution2D(64, (4, 4), strides=(2, 2)))
model.add(Activation('relu'))
model.add(Convolution2D(64, (3, 3), strides=(1, 1)))
model.add(Activation('relu'))
# Fully connected head producing one linear output per action.
model.add(Flatten())
model.add(Dense(512))
model.add(Activation('relu'))
model.add(Dense(nb_actions))
model.add(Activation('linear'))
# summary() prints the layer table itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line.
model.summary()

# The replay memory and the processor are created in the agent-configuration
# cell, right next to the agent that consumes them, so they are not built here.

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
permute_1 (Permute)          (None, 84, 84, 4)         0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 20, 20, 32)        8224      
_________________________________________________________________
activation_1 (Activation)    (None, 20, 20, 32)        0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 9, 9, 64)          32832     
_________________________________________________________________
activation_2 (Activation)    (None, 9, 9, 64)          0         
_________________________________________________________________
conv2d_3 (Conv2D)            (None, 7, 7, 64)          36928     
_________________________________________________________________
activation_3 (Activation)    (None, 7, 7, 64)          0         
__________

In [7]:
# Display which mode the parsed arguments selected.
args.mode

'train'

In [9]:
# Configure and compile the agent. Any built-in Keras optimizer can be used here,
# and so can the Keras metrics.
memory = SequentialMemory(window_length=WINDOW_LENGTH, limit=1000000)
processor = AtariProcessor()

# Action selection is eps-greedy: with probability eps a random action is taken.
# eps is annealed linearly from 1.0 down to 0.1 over 1M steps, so the agent explores
# a lot at the beginning (high eps) and increasingly relies on what it has learned
# (low eps). A separate value of 0.05 is used during testing so that the agent still
# takes an occasional random action and therefore cannot get stuck.
policy = LinearAnnealedPolicy(
    EpsGreedyQPolicy(),
    attr='eps',
    value_max=1.,
    value_min=.1,
    value_test=.05,
    nb_steps=1000000,
)

# Balancing exploration against exploitation is hard and still an open research
# question. The parameters above are worth experimenting with, as is a different
# policy altogether; Boltzmann-style exploration is another popular choice:
# policy = BoltzmannQPolicy(tau=1.)

dqn = DQNAgent(
    model=model,
    nb_actions=nb_actions,
    policy=policy,
    memory=memory,
    processor=processor,
    nb_steps_warmup=50000,
    gamma=.99,
    target_model_update=10000,
    train_interval=4,
    delta_clip=1.,
)
dqn.compile(Adam(lr=.00025), metrics=['mae'])

# Default location of the final weights; used by both the training branch (to
# save) and the testing branch (to load, unless --weights overrides it).
weights_filename = 'dqn_{}_weights.h5f'.format(args.env_name)

if args.mode == 'train':
    # Okay, now it's time to learn something! We capture the interrupt exception so that training
    # can be prematurely aborted. Notice that now you can use the built-in Keras callbacks!
    # The '{step}' placeholder is deliberately left unformatted here; presumably the
    # checkpoint callback fills it in when it writes a file.
    checkpoint_weights_filename = 'dqn_' + args.env_name + '_weights_{step}.h5f'
    log_filename = 'dqn_{}_log.json'.format(args.env_name)
    callbacks = [
        ModelIntervalCheckpoint(checkpoint_weights_filename, interval=250000),
        FileLogger(log_filename, interval=100),
    ]
    dqn.fit(env, callbacks=callbacks, nb_steps=1750000, log_interval=10000)

    # After training is done, we save the final weights one more time.
    dqn.save_weights(weights_filename, overwrite=True)

    # Finally, evaluate our algorithm for 10 episodes.
    dqn.test(env, nb_episodes=10, visualize=False)
elif args.mode == 'test':
    # An explicitly supplied weights file takes precedence over the default name.
    if args.weights:
        weights_filename = args.weights
    dqn.load_weights(weights_filename)
    # The rendered evaluation belongs to test mode only. Left at top level it would
    # also run after training, evaluating the agent a second time.
    dqn.test(env, nb_episodes=10, visualize=True)

Training for 1750000 steps ...
Interval 1 (0 steps performed)
16 episodes - episode_reward: 7.625 [5.000, 14.000] - ale.lives: 2.161

Interval 2 (10000 steps performed)
14 episodes - episode_reward: 9.214 [3.000, 20.000] - ale.lives: 2.166

Interval 3 (20000 steps performed)
14 episodes - episode_reward: 9.571 [2.000, 20.000] - ale.lives: 2.112

Interval 4 (30000 steps performed)
16 episodes - episode_reward: 8.188 [4.000, 11.000] - ale.lives: 2.174

Interval 5 (40000 steps performed)
14 episodes - episode_reward: 9.429 [5.000, 16.000] - ale.lives: 2.073

Interval 6 (50000 steps performed)
13 episodes - episode_reward: 10.846 [5.000, 16.000] - loss: 0.016 - mean_absolute_error: 2.339 - mean_q: 2.823 - mean_eps: 0.951 - ale.lives: 2.093

Interval 7 (60000 steps performed)
17 episodes - episode_reward: 7.765 [5.000, 17.000] - loss: 0.014 - mean_absolute_error: 2.339 - mean_q: 2.823 - mean_eps: 0.942 - ale.lives: 2.091

Interval 8 (70000 steps performed)
15 episodes - episode_reward: 8.53

14 episodes - episode_reward: 11.500 [6.000, 19.000] - loss: 0.013 - mean_absolute_error: 2.218 - mean_q: 2.677 - mean_eps: 0.708 - ale.lives: 2.038

Interval 34 (330000 steps performed)
14 episodes - episode_reward: 12.500 [5.000, 31.000] - loss: 0.013 - mean_absolute_error: 2.228 - mean_q: 2.688 - mean_eps: 0.699 - ale.lives: 2.070

Interval 35 (340000 steps performed)
15 episodes - episode_reward: 10.133 [5.000, 16.000] - loss: 0.013 - mean_absolute_error: 2.240 - mean_q: 2.704 - mean_eps: 0.690 - ale.lives: 2.059

Interval 36 (350000 steps performed)
15 episodes - episode_reward: 10.733 [4.000, 19.000] - loss: 0.013 - mean_absolute_error: 2.237 - mean_q: 2.699 - mean_eps: 0.681 - ale.lives: 2.045

Interval 37 (360000 steps performed)
14 episodes - episode_reward: 11.357 [5.000, 24.000] - loss: 0.013 - mean_absolute_error: 2.256 - mean_q: 2.723 - mean_eps: 0.672 - ale.lives: 1.936

Interval 38 (370000 steps performed)
16 episodes - episode_reward: 10.625 [5.000, 26.000] - loss: 0.01

13 episodes - episode_reward: 13.538 [1.000, 23.000] - loss: 0.014 - mean_absolute_error: 2.245 - mean_q: 2.709 - mean_eps: 0.429 - ale.lives: 2.141

Interval 65 (640000 steps performed)
12 episodes - episode_reward: 16.083 [3.000, 30.000] - loss: 0.014 - mean_absolute_error: 2.264 - mean_q: 2.730 - mean_eps: 0.420 - ale.lives: 2.130

Interval 66 (650000 steps performed)
12 episodes - episode_reward: 15.833 [5.000, 24.000] - loss: 0.014 - mean_absolute_error: 2.267 - mean_q: 2.735 - mean_eps: 0.411 - ale.lives: 2.259

Interval 67 (660000 steps performed)
12 episodes - episode_reward: 14.583 [6.000, 21.000] - loss: 0.015 - mean_absolute_error: 2.256 - mean_q: 2.722 - mean_eps: 0.402 - ale.lives: 2.028

Interval 68 (670000 steps performed)
15 episodes - episode_reward: 12.600 [4.000, 19.000] - loss: 0.014 - mean_absolute_error: 2.251 - mean_q: 2.716 - mean_eps: 0.393 - ale.lives: 2.117

Interval 69 (680000 steps performed)
14 episodes - episode_reward: 11.714 [3.000, 20.000] - loss: 0.01

12 episodes - episode_reward: 17.417 [5.000, 34.000] - loss: 0.013 - mean_absolute_error: 2.185 - mean_q: 2.636 - mean_eps: 0.150 - ale.lives: 2.046

Interval 96 (950000 steps performed)
11 episodes - episode_reward: 17.000 [8.000, 33.000] - loss: 0.014 - mean_absolute_error: 2.191 - mean_q: 2.642 - mean_eps: 0.141 - ale.lives: 2.205

Interval 97 (960000 steps performed)
13 episodes - episode_reward: 15.231 [4.000, 31.000] - loss: 0.014 - mean_absolute_error: 2.184 - mean_q: 2.633 - mean_eps: 0.132 - ale.lives: 2.209

Interval 98 (970000 steps performed)
10 episodes - episode_reward: 17.400 [5.000, 27.000] - loss: 0.014 - mean_absolute_error: 2.204 - mean_q: 2.657 - mean_eps: 0.123 - ale.lives: 2.110

Interval 99 (980000 steps performed)
15 episodes - episode_reward: 11.800 [4.000, 22.000] - loss: 0.014 - mean_absolute_error: 2.219 - mean_q: 2.675 - mean_eps: 0.114 - ale.lives: 2.012

Interval 100 (990000 steps performed)
13 episodes - episode_reward: 12.154 [4.000, 18.000] - loss: 0.0

11 episodes - episode_reward: 17.545 [7.000, 27.000] - loss: 0.014 - mean_absolute_error: 2.225 - mean_q: 2.682 - mean_eps: 0.100 - ale.lives: 1.998

Interval 127 (1260000 steps performed)
10 episodes - episode_reward: 19.000 [12.000, 31.000] - loss: 0.013 - mean_absolute_error: 2.221 - mean_q: 2.676 - mean_eps: 0.100 - ale.lives: 1.898

Interval 128 (1270000 steps performed)
12 episodes - episode_reward: 19.417 [5.000, 34.000] - loss: 0.014 - mean_absolute_error: 2.240 - mean_q: 2.699 - mean_eps: 0.100 - ale.lives: 2.169

Interval 129 (1280000 steps performed)
13 episodes - episode_reward: 14.462 [2.000, 23.000] - loss: 0.014 - mean_absolute_error: 2.213 - mean_q: 2.666 - mean_eps: 0.100 - ale.lives: 2.146

Interval 130 (1290000 steps performed)
12 episodes - episode_reward: 16.417 [6.000, 36.000] - loss: 0.014 - mean_absolute_error: 2.216 - mean_q: 2.671 - mean_eps: 0.100 - ale.lives: 2.107

Interval 131 (1300000 steps performed)
11 episodes - episode_reward: 20.182 [13.000, 28.000] 

11 episodes - episode_reward: 19.364 [10.000, 36.000] - loss: 0.015 - mean_absolute_error: 2.204 - mean_q: 2.657 - mean_eps: 0.100 - ale.lives: 1.998

Interval 158 (1570000 steps performed)
14 episodes - episode_reward: 15.143 [7.000, 25.000] - loss: 0.015 - mean_absolute_error: 2.216 - mean_q: 2.672 - mean_eps: 0.100 - ale.lives: 1.988

Interval 159 (1580000 steps performed)
15 episodes - episode_reward: 12.800 [2.000, 29.000] - loss: 0.015 - mean_absolute_error: 2.222 - mean_q: 2.680 - mean_eps: 0.100 - ale.lives: 2.057

Interval 160 (1590000 steps performed)
13 episodes - episode_reward: 16.308 [6.000, 30.000] - loss: 0.015 - mean_absolute_error: 2.223 - mean_q: 2.681 - mean_eps: 0.100 - ale.lives: 2.073

Interval 161 (1600000 steps performed)
14 episodes - episode_reward: 15.429 [8.000, 26.000] - loss: 0.015 - mean_absolute_error: 2.224 - mean_q: 2.682 - mean_eps: 0.100 - ale.lives: 2.224

Interval 162 (1610000 steps performed)
11 episodes - episode_reward: 19.545 [6.000, 27.000] -

<keras.callbacks.History at 0x157e6f2fac8>

In [26]:
# NOTE(review): `img` is not assigned at notebook scope in any visible cell (the
# only `img` is a local inside AtariProcessor.process_observation), so this cell
# presumably relied on leftover kernel state and may raise NameError on a fresh
# kernel - confirm where `img` came from, or remove the cell.
img.ndim

3

In [28]:
# Display the environment object to check which environment is loaded.
env

<TimeLimit<AtariEnv<SpaceInvaders-v0>>>

In [12]:
# Evaluate a saved agent: use the weights file given via --weights if there is one,
# otherwise fall back to the default file name derived from the environment name.
weights_filename = args.weights or 'dqn_{}_weights.h5f'.format(args.env_name)
dqn.load_weights(weights_filename)
# Run 10 rendered evaluation episodes.
dqn.test(env, visualize=True, nb_episodes=10)

Testing for 10 episodes ...
Episode 1: reward: 27.000, steps: 980
Episode 2: reward: 26.000, steps: 1016
Episode 3: reward: 22.000, steps: 912
Episode 4: reward: 16.000, steps: 807
Episode 5: reward: 17.000, steps: 717
Episode 6: reward: 12.000, steps: 645
Episode 7: reward: 24.000, steps: 857
Episode 8: reward: 18.000, steps: 829
Episode 9: reward: 17.000, steps: 839
Episode 10: reward: 20.000, steps: 816


<keras.callbacks.History at 0x1584e11bd30>