In [1]:
import numpy as np
import gym

from gym import envs

from keras.models import Sequential, Model
from keras.layers import Dense, Activation, Flatten, Input, Concatenate
from keras.optimizers import Adam

from rl.agents import NAFAgent
from rl.memory import SequentialMemory
from rl.random import OrnsteinUhlenbeckProcess
from rl.core import Processor

Using TensorFlow backend.


Implementation of Pendulum using a NAF Agent

In [2]:
class PendulumProcessor(Processor):
    """Reward-scaling processor that is handed to the agent."""

    def process_reward(self, reward):
        """Return ``reward`` divided by 100.

        The magnitude of the reward can be important for training, so every
        per-step reward is shrunk by two orders of magnitude.
        """
        scaled_reward = reward / 100.
        return scaled_reward

In [3]:
# Gym environment id; passed to gym.make() and embedded in the saved weights filename.
ENV_NAME = 'Pendulum-v0'

In [4]:
# Get the environment and extract the number of actions.
env = gym.make(ENV_NAME)
# Seed both NumPy's global RNG and the environment with the same fixed value.
np.random.seed(123)
env.seed(123)
# The action space must be one-dimensional; its length is the number of actions.
assert len(env.action_space.shape) == 1
nb_actions = env.action_space.shape[0]

  result = entry_point.load(False)


Build all necessary models

In [5]:
# Create value model: flattened observation -> three 16-unit ReLU layers -> one linear output.
V_model = Sequential()
# The input has a leading axis of length 1 in front of the observation shape; flatten it away.
V_model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
V_model.add(Dense(16))
V_model.add(Activation('relu'))
V_model.add(Dense(16))
V_model.add(Activation('relu'))
V_model.add(Dense(16))
V_model.add(Activation('relu'))
V_model.add(Dense(1))
V_model.add(Activation('linear'))
# summary() prints the table itself and returns None, so wrapping it in print()
# would emit a stray "None" line after the table.
V_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_1 (Flatten)          (None, 3)                 0         
_________________________________________________________________
dense_1 (Dense)              (None, 16)                64        
_________________________________________________________________
activation_1 (Activation)    (None, 16)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 16)                272       
_________________________________________________________________
activation_2 (Activation)    (None, 16)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 16)                272       
_________________________________________________________________
activation_3 (Activation)    (None, 16)                0         
__________

In [6]:
# Create mu model: flattened observation -> three 16-unit ReLU layers -> nb_actions linear outputs.
# NOTE(review): in NAF, mu is presumably the action the agent treats as greedy for a
# given observation (not an action-value estimate) -- confirm against the keras-rl docs.
mu_model = Sequential()
# The input has a leading axis of length 1 in front of the observation shape; flatten it away.
mu_model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
mu_model.add(Dense(16))
mu_model.add(Activation('relu'))
mu_model.add(Dense(16))
mu_model.add(Activation('relu'))
mu_model.add(Dense(16))
mu_model.add(Activation('relu'))
mu_model.add(Dense(nb_actions))
mu_model.add(Activation('linear'))
# summary() prints the table itself and returns None, so wrapping it in print()
# would emit a stray "None" line after the table.
mu_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_2 (Flatten)          (None, 3)                 0         
_________________________________________________________________
dense_5 (Dense)              (None, 16)                64        
_________________________________________________________________
activation_5 (Activation)    (None, 16)                0         
_________________________________________________________________
dense_6 (Dense)              (None, 16)                272       
_________________________________________________________________
activation_6 (Activation)    (None, 16)                0         
_________________________________________________________________
dense_7 (Dense)              (None, 16)                272       
_________________________________________________________________
activation_7 (Activation)    (None, 16)                0         
__________

In [7]:
# Create L model: takes the action and the observation as two separate inputs,
# concatenates them, and passes them through three 32-unit ReLU layers.
# The output width (n*n + n) // 2 is the number of entries in a lower-triangular
# n x n matrix, with n = nb_actions.
# NOTE(review): presumably these are the entries of the lower-triangular matrix L
# used in NAF's advantage term (not a loss function) -- confirm against the keras-rl docs.
action_input = Input(shape=(nb_actions,), name='action_input')
observation_input = Input(shape=(1,) + env.observation_space.shape, name='observation_input')
x = Concatenate()([action_input, Flatten()(observation_input)])
x = Dense(32)(x)
x = Activation('relu')(x)
x = Dense(32)(x)
x = Activation('relu')(x)
x = Dense(32)(x)
x = Activation('relu')(x)
x = Dense(((nb_actions * nb_actions + nb_actions) // 2))(x)
x = Activation('linear')(x)
L_model = Model(inputs=[action_input, observation_input], outputs=x)
# summary() prints the table itself and returns None, so wrapping it in print()
# would emit a stray "None" line after the table.
L_model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
observation_input (InputLayer)  (None, 1, 3)         0                                            
__________________________________________________________________________________________________
action_input (InputLayer)       (None, 1)            0                                            
__________________________________________________________________________________________________
flatten_3 (Flatten)             (None, 3)            0           observation_input[0][0]          
__________________________________________________________________________________________________
concatenate_1 (Concatenate)     (None, 4)            0           action_input[0][0]               
                                                                 flatten_3[0][0]                  
__________

In [8]:
# Configure agent: reward processor, memory, random process, then the NAF agent itself.
processor = PendulumProcessor()
memory = SequentialMemory(limit=100000, window_length=1)

# Ornstein-Uhlenbeck process with one component per action dimension.
random_process = OrnsteinUhlenbeckProcess(
    theta=0.15,
    mu=0.0,
    sigma=0.3,
    size=nb_actions,
)

# Wire the three networks and the helpers above into the agent.
agent = NAFAgent(
    nb_actions=nb_actions,
    V_model=V_model,
    L_model=L_model,
    mu_model=mu_model,
    memory=memory,
    nb_steps_warmup=100,
    random_process=random_process,
    gamma=0.99,
    target_model_update=1e-3,
    processor=processor,
)

# Compile with Adam (gradient norm clipped at 1) and report mean absolute error.
agent.compile(Adam(lr=0.001, clipnorm=1.0), metrics=['mae'])

In [9]:
# Train the agent for 300000 steps without rendering; episodes are capped at 200 steps.
agent.fit(
    env,
    nb_steps=300000,
    visualize=False,
    verbose=1,
    nb_max_episode_steps=200,
)

Training for 300000 steps ...
Interval 1 (0 steps performed)
50 episodes - episode_reward: -13.525 [-17.954, -8.896] - loss: 0.000 - mean_absolute_error: 0.009 - mean_q: -0.239

Interval 2 (10000 steps performed)
50 episodes - episode_reward: -11.515 [-15.053, -6.601] - loss: 0.002 - mean_absolute_error: 0.018 - mean_q: -0.770

Interval 3 (20000 steps performed)
50 episodes - episode_reward: -5.570 [-14.958, -0.090] - loss: 0.003 - mean_absolute_error: 0.025 - mean_q: -1.069

Interval 4 (30000 steps performed)
50 episodes - episode_reward: -3.292 [-11.055, -0.035] - loss: 0.004 - mean_absolute_error: 0.027 - mean_q: -1.069

Interval 5 (40000 steps performed)
50 episodes - episode_reward: -4.786 [-14.961, -0.068] - loss: 0.004 - mean_absolute_error: 0.029 - mean_q: -0.998

Interval 6 (50000 steps performed)
50 episodes - episode_reward: -2.722 [-14.968, -0.052] - loss: 0.004 - mean_absolute_error: 0.029 - mean_q: -0.913

Interval 7 (60000 steps performed)
50 episodes - episode_reward: -

<keras.callbacks.History at 0x12208b7b8>

In [10]:
# After training is done, we save the final weights.
# The filename embeds ENV_NAME; overwrite=True replaces any existing file of that name.
agent.save_weights('cdqn_{}_weights.h5f'.format(ENV_NAME), overwrite=True)

In [15]:
# Finally, evaluate our algorithm for 100 episodes with rendering enabled.
# nb_max_episode_steps=6000 is the per-episode step cap requested here.
agent.test(env, nb_episodes=100, visualize=True, nb_max_episode_steps=6000)

Testing for 100 episodes ...
Episode 1: reward: -1.254, steps: 200
Episode 2: reward: -1.275, steps: 200
Episode 3: reward: -2.557, steps: 200
Episode 4: reward: -3.811, steps: 200
Episode 5: reward: -1.318, steps: 200
Episode 6: reward: -2.529, steps: 200
Episode 7: reward: -2.317, steps: 200
Episode 8: reward: -2.263, steps: 200
Episode 9: reward: -1.274, steps: 200
Episode 10: reward: -1.341, steps: 200
Episode 11: reward: -3.871, steps: 200
Episode 12: reward: -0.010, steps: 200
Episode 13: reward: -2.442, steps: 200
Episode 14: reward: -1.334, steps: 200
Episode 15: reward: -1.333, steps: 200
Episode 16: reward: -3.894, steps: 200
Episode 17: reward: -1.330, steps: 200
Episode 18: reward: -2.383, steps: 200
Episode 19: reward: -3.426, steps: 200
Episode 20: reward: -1.273, steps: 200
Episode 21: reward: -1.263, steps: 200
Episode 22: reward: -1.311, steps: 200
Episode 23: reward: -1.306, steps: 200
Episode 24: reward: -1.292, steps: 200
Episode 25: reward: -1.353, steps: 200
Episo

KeyboardInterrupt: 