In [1]:
import tensorflow as tf
import tflearn
import numpy as np
import gym
from ActorPair import ActorPair
from CriticPair import CriticPair
from ReplayBuffer import ReplayBuffer

In [2]:

# ==========================
#   Training Parameters
# ==========================
# Maximum number of training episodes (outer loop bound in train())
MAX_EPISODES = 50000
# Maximum number of environment steps per episode (inner loop bound in train())
MAX_EP_STEPS = 200
# Base learning rate for the Actor network
ACTOR_LEARNING_RATE = 0.003
# Base learning rate for the Critic Network
CRITIC_LEARNING_RATE = 0.001
# Discount factor applied to the target network's Q-value when building
# the critic's training targets
GAMMA = 0.99
# Soft target update param, handed to both the actor and the critic pair
TAU = 0.001

# ===========================
#   Utility Parameters
# ===========================
# Render gym env on every step during training
RENDER_ENV = True
# Use Gym Monitor
# NOTE(review): not referenced by any cell visible here -- confirm it is still needed
GYM_MONITOR_EN = True
# Gym environment id
ENV_NAME = 'Pendulum-v0'
# Directory for storing gym results
# NOTE(review): not referenced by any cell visible here
MONITOR_DIR = './results/gym_ddpg'
# Directory for storing tensorboard summary results
# NOTE(review): not referenced by any cell visible here
SUMMARY_DIR = './results/tf_ddpg'
# Seed intended for the random number generators
RANDOM_SEED = 1337
# Size of replay buffer (argument passed to ReplayBuffer)
BUFFER_SIZE = 10000
# Number of transitions sampled per training update; training only starts
# once the replay buffer holds more than this many transitions
MINIBATCH_SIZE = 64

In [3]:
def train(sess, env, actor, critic):
    """Run the DDPG training loop, printing one summary line per episode.

    Args:
        sess: TensorFlow session used to run the variable initializer.
        env: Gym-style environment. Must provide ``reset()``, ``render()``
            and ``step(action)`` returning ``(state, reward, done, info)``.
        actor: Actor/target-actor pair. Uses ``predict``, ``predict_target``,
            ``train``, ``update_target_network``, ``s_dim`` and ``a_dim``.
        critic: Critic/target-critic pair. Uses ``predict_target``, ``train``,
            ``action_gradients`` and ``update_target_network``.

    Reads the module-level constants BUFFER_SIZE, MAX_EPISODES, MAX_EP_STEPS,
    RENDER_ENV, MINIBATCH_SIZE and GAMMA.

    Returns:
        None. Progress is reported on stdout only.
    """
    # Initialize our Tensorflow variables.
    # NOTE(review): later TensorFlow releases deprecate this call in favour of
    # tf.global_variables_initializer(); kept to match the installed version.
    sess.run(tf.initialize_all_variables())

    # Initialize target network weights
    actor.update_target_network()
    critic.update_target_network()

    # Initialize replay memory
    replay_buffer = ReplayBuffer(BUFFER_SIZE)

    for i in range(MAX_EPISODES):
        s = env.reset()
        ep_reward = 0
        ep_ave_max_q = 0

        for j in range(MAX_EP_STEPS):
            if RENDER_ENV:
                env.render()

            # Act with the current policy. No exploration noise is applied:
            # the action is exactly the actor's prediction.
            a = actor.predict(np.reshape(s, (1, -1)))

            # Get new state and reward
            s2, r, is_done, _ = env.step(a[0])

            # Add the transition to the replay buffer
            replay_buffer.add(np.reshape(s, (actor.s_dim,)),
                              np.reshape(a, (actor.a_dim,)), r,
                              is_done, np.reshape(s2, (actor.s_dim,)))

            # Keep collecting transitions until there are enough for a
            # training update
            if replay_buffer.size() > MINIBATCH_SIZE:
                s_batch, a_batch, r_batch, is_done_batch, s2_batch = \
                    replay_buffer.sample_batch(MINIBATCH_SIZE)

                # Bootstrapped targets from the target networks
                target_qs = critic.predict_target(
                    s2_batch, actor.predict_target(s2_batch))

                # For terminal transitions the bootstrapped term is masked
                # out, leaving only the immediate reward.
                not_done = 1 - is_done_batch.astype(float)
                hindsight_q_vec = r_batch + not_done * GAMMA * np.reshape(
                    target_qs, (MINIBATCH_SIZE,))

                # Critic training
                predicted_q_value, _ = critic.train(
                    s_batch, a_batch,
                    np.reshape(hindsight_q_vec, (MINIBATCH_SIZE, 1)))
                ep_ave_max_q += np.amax(predicted_q_value)

                # Actor training: follow the critic's gradient w.r.t. the
                # actions the current policy would take on the batch.
                actions = actor.predict(s_batch)
                dQda_list = critic.action_gradients(s_batch, actions)
                actor.train(s_batch, dQda_list[0])

                # Update target networks
                actor.update_target_network()
                critic.update_target_network()

            s = s2
            ep_reward += r

            if is_done or j == MAX_EP_STEPS - 1:
                # j is a zero-based index, so the episode ran j + 1 steps.
                # Dividing by j would under-count by one and would raise
                # ZeroDivisionError when an episode ends on its first step.
                steps_taken = j + 1
                # A single pre-formatted string prints identically under the
                # Python 2 print statement and the Python 3 print function.
                print('| Reward: %.2i  | Episode %d | Qmax: %.4f'
                      % (int(ep_reward), i,
                         ep_ave_max_q / float(steps_taken)))
                break
        
                
            

In [None]:
# Driver cell: build the session, environment and actor/critic pairs, then
# start training.
sess = tf.Session()

# Use the configured environment id rather than a second hard-coded copy.
env = gym.make(ENV_NAME)

# Apply the configured seed before any network is constructed so that
# repeated runs start from the same random state.
np.random.seed(RANDOM_SEED)
tf.set_random_seed(RANDOM_SEED)
env.seed(RANDOM_SEED)

state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]

# make sure action bound is symmetric (can change in future,
# but need to remember to scale actor output appropriately).
# np.array_equal reduces the elementwise comparison to one bool; asserting
# on the raw array raises "truth value is ambiguous" for action spaces with
# more than one dimension.
assert np.array_equal(env.action_space.high, -env.action_space.low), \
    'action bounds must be symmetric'

action_bound = env.action_space.high

# start up actor and critic pair

actor = ActorPair(sess, state_dim, action_dim, action_bound,
                  ACTOR_LEARNING_RATE, TAU)

critic = CriticPair(sess, state_dim, action_dim,
                    CRITIC_LEARNING_RATE, TAU, actor.get_num_trainable_vars())

train(sess, env, actor, critic)

[2016-11-09 21:43:24,354] Making new env: Pendulum-v0


| Reward: -1431  | Episode 0 | Qmax: -2.3196
| Reward: -1572  | Episode 1 | Qmax: -0.4209
| Reward: -1449  | Episode 2 | Qmax: -1.1744
| Reward: -1637  | Episode 3 | Qmax: -1.3928
| Reward: -1248  | Episode 4 | Qmax: -0.9626
| Reward: -1104  | Episode 5 | Qmax: -0.5138
| Reward: -1537  | Episode 6 | Qmax: -0.6419
| Reward: -1490  | Episode 7 | Qmax: -0.7667
| Reward: -1462  | Episode 8 | Qmax: -0.9650
| Reward: -1255  | Episode 9 | Qmax: -0.5602
