In [9]:
import numpy as np
import gym

from gym import envs

from keras.models import Sequential, Model
from keras.layers import Dense, Activation, Flatten, Input, Concatenate
from keras.optimizers import Adam

from rl.agents import NAFAgent
from rl.memory import SequentialMemory
from rl.random import OrnsteinUhlenbeckProcess
from rl.core import Processor

Implementation of Pendulum using a NAF Agent

In [10]:
class PendulumProcessor(Processor):
    """Reward-scaling processor for the Pendulum task."""

    def process_reward(self, reward):
        """Return ``reward`` shrunk by two orders of magnitude.

        The size of the reward signal can matter for training. Each step
        yields a comparatively large reward, so it is divided by 100 here.
        """
        scaled_reward = reward / 100.
        return scaled_reward

In [11]:
# Gym environment id; used to create the environment and to name the weights file.
ENV_NAME = "Pendulum-v0"

In [12]:
# Build the Gym environment, fix the random seeds, and read off how many
# action dimensions it has.
env = gym.make(ENV_NAME)
np.random.seed(123)
env.seed(123)

action_shape = env.action_space.shape
# Only a flat (rank-1) action space is supported by the code below.
assert len(action_shape) == 1
nb_actions = action_shape[0]

  result = entry_point.load(False)


Build all necessary models

In [13]:
# Create value model (handed to the agent as ``V_model``).
# Topology: Flatten -> 3 x (Dense(16) + ReLU) -> Dense(1) + linear.
V_model = Sequential([
    # Input is (1,) + observation shape; the leading 1 matches the
    # window_length=1 used for the replay memory.
    Flatten(input_shape=(1,) + env.observation_space.shape),
    Dense(16),
    Activation('relu'),
    Dense(16),
    Activation('relu'),
    Dense(16),
    Activation('relu'),
    # Single linear output unit.
    Dense(1),
    Activation('linear'),
])
# summary() prints the table itself and returns None; wrapping it in print()
# would append a stray "None" line to the cell output.
V_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_4 (Flatten)          (None, 3)                 0         
_________________________________________________________________
dense_13 (Dense)             (None, 16)                64        
_________________________________________________________________
activation_13 (Activation)   (None, 16)                0         
_________________________________________________________________
dense_14 (Dense)             (None, 16)                272       
_________________________________________________________________
activation_14 (Activation)   (None, 16)                0         
_________________________________________________________________
dense_15 (Dense)             (None, 16)                272       
_________________________________________________________________
activation_15 (Activation)   (None, 16)                0         
__________

In [14]:
# Create action (mu) model (handed to the agent as ``mu_model``).
# It emits one linear output per action dimension -- presumably the action the
# NAF agent treats as greedy for a given observation (see keras-rl NAFAgent).
# Topology: Flatten -> 3 x (Dense(16) + ReLU) -> Dense(nb_actions) + linear.
mu_model = Sequential([
    # Input is (1,) + observation shape; the leading 1 matches the
    # window_length=1 used for the replay memory.
    Flatten(input_shape=(1,) + env.observation_space.shape),
    Dense(16),
    Activation('relu'),
    Dense(16),
    Activation('relu'),
    Dense(16),
    Activation('relu'),
    # One unbounded (linear) output per action dimension.
    Dense(nb_actions),
    Activation('linear'),
])
# summary() prints the table itself and returns None; wrapping it in print()
# would append a stray "None" line to the cell output.
mu_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_5 (Flatten)          (None, 3)                 0         
_________________________________________________________________
dense_17 (Dense)             (None, 16)                64        
_________________________________________________________________
activation_17 (Activation)   (None, 16)                0         
_________________________________________________________________
dense_18 (Dense)             (None, 16)                272       
_________________________________________________________________
activation_18 (Activation)   (None, 16)                0         
_________________________________________________________________
dense_19 (Dense)             (None, 16)                272       
_________________________________________________________________
activation_19 (Activation)   (None, 16)                0         
__________

In [15]:
# Create L model (handed to the agent as ``L_model``).
# This is a network, not a loss function: it takes the action and the
# observation and emits (n*n + n) / 2 values for n = nb_actions, i.e. as many
# numbers as a lower-triangular n x n matrix has entries.
# NOTE(review): in NAF these are presumably the entries of the matrix L that
# parameterizes the quadratic advantage term -- confirm against keras-rl NAFAgent.
action_input = Input(shape=(nb_actions,), name='action_input')
observation_input = Input(shape=(1,) + env.observation_space.shape, name='observation_input')

# Flatten the (1, obs_dim) observation window and join it with the action vector.
flat_observation = Flatten()(observation_input)
x = Concatenate()([action_input, flat_observation])

# Three hidden Dense(32) + ReLU stages.
x = Dense(32)(x)
x = Activation('relu')(x)
x = Dense(32)(x)
x = Activation('relu')(x)
x = Dense(32)(x)
x = Activation('relu')(x)

# Linear head sized to the number of lower-triangular entries.
nb_l_outputs = (nb_actions * nb_actions + nb_actions) // 2
x = Dense(nb_l_outputs)(x)
x = Activation('linear')(x)

L_model = Model(inputs=[action_input, observation_input], outputs=x)
# summary() prints the table itself and returns None; wrapping it in print()
# would append a stray "None" line to the cell output.
L_model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
observation_input (InputLayer)  (None, 1, 3)         0                                            
__________________________________________________________________________________________________
action_input (InputLayer)       (None, 1)            0                                            
__________________________________________________________________________________________________
flatten_6 (Flatten)             (None, 3)            0           observation_input[0][0]          
__________________________________________________________________________________________________
concatenate_2 (Concatenate)     (None, 4)            0           action_input[0][0]               
                                                                 flatten_6[0][0]                  
__________

In [16]:
# Assemble the NAF agent: reward processor, replay memory, exploration noise,
# the three networks built above, and finally the optimizer.
processor = PendulumProcessor()
memory = SequentialMemory(limit=100000, window_length=1)
random_process = OrnsteinUhlenbeckProcess(
    theta=.15,
    mu=0.,
    sigma=.3,
    size=nb_actions,
)

agent = NAFAgent(
    nb_actions=nb_actions,
    V_model=V_model,
    L_model=L_model,
    mu_model=mu_model,
    memory=memory,
    nb_steps_warmup=100,
    random_process=random_process,
    gamma=.99,
    target_model_update=1e-3,
    processor=processor,
)

# Adam with gradient-norm clipping; mean absolute error is tracked as a metric.
optimizer = Adam(lr=.001, clipnorm=1.)
agent.compile(optimizer, metrics=['mae'])

In [None]:
# Train the agent for 140000 steps without rendering; episodes are capped at
# 200 steps each.
agent.fit(
    env,
    nb_steps=140000,
    visualize=False,
    verbose=1,
    nb_max_episode_steps=200,
)

Training for 140000 steps ...
Interval 1 (0 steps performed)
50 episodes - episode_reward: -1.867 [-4.111, -0.019] - loss: 0.000 - mean_absolute_error: 0.007 - mean_q: -0.073

Interval 2 (10000 steps performed)
50 episodes - episode_reward: -2.096 [-5.379, -0.222] - loss: 0.000 - mean_absolute_error: 0.008 - mean_q: -0.090

Interval 3 (20000 steps performed)
50 episodes - episode_reward: -1.954 [-7.582, -0.135] - loss: 0.000 - mean_absolute_error: 0.008 - mean_q: -0.108

Interval 4 (30000 steps performed)
50 episodes - episode_reward: -1.674 [-4.018, -0.028] - loss: 0.000 - mean_absolute_error: 0.008 - mean_q: -0.124

Interval 5 (40000 steps performed)
50 episodes - episode_reward: -1.866 [-14.944, -0.156] - loss: 0.000 - mean_absolute_error: 0.009 - mean_q: -0.141

Interval 6 (50000 steps performed)
50 episodes - episode_reward: -2.345 [-15.014, -0.181] - loss: 0.000 - mean_absolute_error: 0.009 - mean_q: -0.160

Interval 7 (60000 steps performed)
50 episodes - episode_reward: -4.094 

In [24]:
# Once training has finished, persist the final weights (replacing any
# existing file of the same name).
weights_path = 'cdqn_{}_weights.h5f'.format(ENV_NAME)
agent.save_weights(weights_path, overwrite=True)

In [None]:
# Finally, evaluate our algorithm for 100 episodes (at most 600 steps each),
# with visualization enabled.
agent.test(env, nb_episodes=100, visualize=True, nb_max_episode_steps=600)