In [1]:
import numpy as np
import gym

from keras.models import Sequential, Model
from keras.layers import Dense, Activation, Flatten, Input, Concatenate
from keras.optimizers import Adam

from rl.agents import DDPGAgent
from rl.memory import SequentialMemory
from rl.random import OrnsteinUhlenbeckProcess

Using TensorFlow backend.


In [2]:
# Gym environment id; handed to gym.make() when the environment is created.
ENV_NAME = "MountainCarContinuous-v0"

In [3]:
# Create the environment and seed both NumPy's global RNG and the environment
# with the same value so runs are repeatable.
SEED = 123
env = gym.make(ENV_NAME)
np.random.seed(SEED)
env.seed(SEED)

# The networks below assume a flat (1-D) action vector; nb_actions is its length.
action_shape = env.action_space.shape
assert len(action_shape) == 1
nb_actions = action_shape[0]

In [4]:
# Actor network: maps an observation window to one output per action dimension.
# The input shape is (1,) + observation shape; the leading 1 presumably matches
# the window_length=1 used by the replay memory -- confirm if that is changed.
actor = Sequential()
actor.add(Flatten(input_shape=(1,) + env.observation_space.shape))
# Three identical hidden blocks: Dense(16) followed by a ReLU activation.
for hidden_idx in range(3):
    actor.add(Dense(16))
    actor.add(Activation('relu'))
# Linear output head, one unit per action dimension.
actor.add(Dense(nb_actions))
actor.add(Activation('linear'))
# summary() prints the table itself and returns None, so it must not be
# wrapped in print() (that would emit an extra "None" line).
actor.summary()

# Critic network: takes an action and an observation window and produces a
# single scalar output. `action_input` is kept as a named tensor because the
# agent needs it (it is passed as critic_action_input).
action_input = Input(shape=(nb_actions,), name='action_input')
observation_input = Input(shape=(1,) + env.observation_space.shape, name='observation_input')
flattened_observation = Flatten()(observation_input)
x = Concatenate()([action_input, flattened_observation])
# Three identical hidden blocks: Dense(32) followed by a ReLU activation.
for hidden_idx in range(3):
    x = Dense(32)(x)
    x = Activation('relu')(x)
# Linear scalar head.
x = Dense(1)(x)
x = Activation('linear')(x)
critic = Model(inputs=[action_input, observation_input], outputs=x)
# summary() prints the table itself and returns None, so it must not be
# wrapped in print() (that would emit an extra "None" line).
critic.summary()

W0625 00:26:16.236955 140276171405120 deprecation_wrapper.py:119] From /home/oxygen/anaconda3/envs/py36/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0625 00:26:16.254884 140276171405120 deprecation_wrapper.py:119] From /home/oxygen/anaconda3/envs/py36/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0625 00:26:16.267619 140276171405120 deprecation_wrapper.py:119] From /home/oxygen/anaconda3/envs/py36/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_1 (Flatten)          (None, 2)                 0         
____________________

In [5]:
# Replay memory holding up to 100000 entries, with a window of one observation.
memory = SequentialMemory(limit=100000, window_length=1)

# Ornstein-Uhlenbeck process, sized to the action vector, passed to the agent
# as its random process.
random_process = OrnsteinUhlenbeckProcess(
    size=nb_actions,
    theta=0.15,
    mu=0.0,
    sigma=0.3,
)

# Wire the actor, critic, memory and random process into a DDPG agent.
agent = DDPGAgent(
    nb_actions=nb_actions,
    actor=actor,
    critic=critic,
    critic_action_input=action_input,
    memory=memory,
    nb_steps_warmup_critic=100,
    nb_steps_warmup_actor=100,
    random_process=random_process,
    gamma=0.99,
    target_model_update=1e-3,
)

# Compile with Adam (gradient norm clipped to 1.0) and track mean absolute error.
optimizer = Adam(lr=0.001, clipnorm=1.0)
agent.compile(optimizer, metrics=['mae'])

W0625 00:26:30.332848 140276171405120 deprecation_wrapper.py:119] From /home/oxygen/anaconda3/envs/py36/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:174: The name tf.get_default_session is deprecated. Please use tf.compat.v1.get_default_session instead.

W0625 00:26:30.333665 140276171405120 deprecation_wrapper.py:119] From /home/oxygen/anaconda3/envs/py36/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:181: The name tf.ConfigProto is deprecated. Please use tf.compat.v1.ConfigProto instead.

W0625 00:26:32.751868 140276171405120 deprecation_wrapper.py:119] From /home/oxygen/anaconda3/envs/py36/lib/python3.6/site-packages/keras/optimizers.py:790: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimizer instead.



In [6]:
# Train the agent. Episodes are cut off after 200 steps. The returned History
# object is bound to a name so the per-episode metrics remain available for
# later inspection, and so the cell does not echo a bare History repr.
history = agent.fit(
    env,
    nb_steps=100000,
    visualize=False,
    verbose=1,
    nb_max_episode_steps=200,
)

Training for 50000 steps ...
Interval 1 (0 steps performed)
50 episodes - episode_reward: -1.377 [-5.155, -0.110] - loss: 0.000 - mean_absolute_error: 0.001 - mean_q: 0.017

Interval 2 (10000 steps performed)
50 episodes - episode_reward: -1.484 [-9.105, -0.097] - loss: 0.000 - mean_absolute_error: 0.001 - mean_q: 0.016

Interval 3 (20000 steps performed)
50 episodes - episode_reward: -1.316 [-11.323, -0.079] - loss: 0.000 - mean_absolute_error: 0.001 - mean_q: 0.013

Interval 4 (30000 steps performed)
50 episodes - episode_reward: -1.386 [-5.536, -0.124] - loss: 0.000 - mean_absolute_error: 0.000 - mean_q: 0.011

Interval 5 (40000 steps performed)
done, took 189.835 seconds


<keras.callbacks.History at 0x7f9403d132e8>